Diffstat (limited to 'kernel')
72 files changed, 3355 insertions, 1926 deletions
diff --git a/kernel/Kconfig.locks b/kernel/Kconfig.locks
index 35536d9c0964..76768ee812b2 100644
--- a/kernel/Kconfig.locks
+++ b/kernel/Kconfig.locks
@@ -220,9 +220,16 @@ config INLINE_WRITE_UNLOCK_IRQRESTORE
 
 endif
 
+config ARCH_SUPPORTS_ATOMIC_RMW
+	bool
+
 config MUTEX_SPIN_ON_OWNER
 	def_bool y
-	depends on SMP && !DEBUG_MUTEXES
+	depends on SMP && !DEBUG_MUTEXES && ARCH_SUPPORTS_ATOMIC_RMW
+
+config RWSEM_SPIN_ON_OWNER
+	def_bool y
+	depends on SMP && RWSEM_XCHGADD_ALGORITHM && ARCH_SUPPORTS_ATOMIC_RMW
 
 config ARCH_USE_QUEUE_RWLOCK
 	bool
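The Kconfig change above introduces ARCH_SUPPORTS_ATOMIC_RMW so that mutex and rwsem optimistic spinning ("spin on owner") is only enabled on architectures that opt in to having suitable atomic read-modify-write operations. As a rough illustration of the spin-on-owner idea, here is a minimal userspace sketch using C11 atomics; it is an analogy only, not the kernel's mutex or rwsem code, and the names spin_mutex and owner_is_running are invented for this example (the kernel's osq queueing, need_resched() handling and sleeping slow path are all omitted).

/*
 * Minimal "spin on owner" sketch using C11 atomics.  A contending thread
 * keeps polling while the current owner appears to be running instead of
 * going to sleep immediately.  Illustration only -- not the kernel code.
 */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>
#include <sched.h>

struct spin_mutex {
	atomic_uintptr_t owner;		/* 0 when unlocked, else an owner id */
};

static bool owner_is_running(uintptr_t owner)
{
	/* The kernel checks owner->on_cpu here; assume "yes" in this sketch. */
	return owner != 0;
}

static void spin_mutex_lock(struct spin_mutex *m, uintptr_t self)
{
	for (;;) {
		uintptr_t expected = 0;

		/* Try to take the lock with a single atomic RMW (cmpxchg). */
		if (atomic_compare_exchange_weak(&m->owner, &expected, self))
			return;

		/*
		 * Optimistic spin: while the owner looks like it is running,
		 * poll rather than block.  A real implementation falls back
		 * to a sleeping slow path when this stops making sense.
		 */
		while (owner_is_running(atomic_load(&m->owner)))
			sched_yield();
	}
}

static void spin_mutex_unlock(struct spin_mutex *m)
{
	atomic_store(&m->owner, 0);
}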
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 70776aec2562..7dc8788cfd52 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -149,12 +149,14 @@ struct cgroup_root cgrp_dfl_root; | |||
149 | */ | 149 | */ |
150 | static bool cgrp_dfl_root_visible; | 150 | static bool cgrp_dfl_root_visible; |
151 | 151 | ||
152 | /* | ||
153 | * Set by the boot param of the same name and makes subsystems with NULL | ||
154 | * ->dfl_files to use ->legacy_files on the default hierarchy. | ||
155 | */ | ||
156 | static bool cgroup_legacy_files_on_dfl; | ||
157 | |||
152 | /* some controllers are not supported in the default hierarchy */ | 158 | /* some controllers are not supported in the default hierarchy */ |
153 | static const unsigned int cgrp_dfl_root_inhibit_ss_mask = 0 | 159 | static unsigned int cgrp_dfl_root_inhibit_ss_mask; |
154 | #ifdef CONFIG_CGROUP_DEBUG | ||
155 | | (1 << debug_cgrp_id) | ||
156 | #endif | ||
157 | ; | ||
158 | 160 | ||
159 | /* The list of hierarchy roots */ | 161 | /* The list of hierarchy roots */ |
160 | 162 | ||
@@ -180,13 +182,15 @@ static u64 css_serial_nr_next = 1; | |||
180 | */ | 182 | */ |
181 | static int need_forkexit_callback __read_mostly; | 183 | static int need_forkexit_callback __read_mostly; |
182 | 184 | ||
183 | static struct cftype cgroup_base_files[]; | 185 | static struct cftype cgroup_dfl_base_files[]; |
186 | static struct cftype cgroup_legacy_base_files[]; | ||
184 | 187 | ||
185 | static void cgroup_put(struct cgroup *cgrp); | 188 | static void cgroup_put(struct cgroup *cgrp); |
186 | static int rebind_subsystems(struct cgroup_root *dst_root, | 189 | static int rebind_subsystems(struct cgroup_root *dst_root, |
187 | unsigned int ss_mask); | 190 | unsigned int ss_mask); |
188 | static int cgroup_destroy_locked(struct cgroup *cgrp); | 191 | static int cgroup_destroy_locked(struct cgroup *cgrp); |
189 | static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss); | 192 | static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss, |
193 | bool visible); | ||
190 | static void css_release(struct percpu_ref *ref); | 194 | static void css_release(struct percpu_ref *ref); |
191 | static void kill_css(struct cgroup_subsys_state *css); | 195 | static void kill_css(struct cgroup_subsys_state *css); |
192 | static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[], | 196 | static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[], |
@@ -1037,6 +1041,58 @@ static void cgroup_put(struct cgroup *cgrp) | |||
1037 | } | 1041 | } |
1038 | 1042 | ||
1039 | /** | 1043 | /** |
1044 | * cgroup_refresh_child_subsys_mask - update child_subsys_mask | ||
1045 | * @cgrp: the target cgroup | ||
1046 | * | ||
1047 | * On the default hierarchy, a subsystem may request other subsystems to be | ||
1048 | * enabled together through its ->depends_on mask. In such cases, more | ||
1049 | * subsystems than specified in "cgroup.subtree_control" may be enabled. | ||
1050 | * | ||
1051 | * This function determines which subsystems need to be enabled given the | ||
1052 | * current @cgrp->subtree_control and records it in | ||
1053 | * @cgrp->child_subsys_mask. The resulting mask is always a superset of | ||
1054 | * @cgrp->subtree_control and follows the usual hierarchy rules. | ||
1055 | */ | ||
1056 | static void cgroup_refresh_child_subsys_mask(struct cgroup *cgrp) | ||
1057 | { | ||
1058 | struct cgroup *parent = cgroup_parent(cgrp); | ||
1059 | unsigned int cur_ss_mask = cgrp->subtree_control; | ||
1060 | struct cgroup_subsys *ss; | ||
1061 | int ssid; | ||
1062 | |||
1063 | lockdep_assert_held(&cgroup_mutex); | ||
1064 | |||
1065 | if (!cgroup_on_dfl(cgrp)) { | ||
1066 | cgrp->child_subsys_mask = cur_ss_mask; | ||
1067 | return; | ||
1068 | } | ||
1069 | |||
1070 | while (true) { | ||
1071 | unsigned int new_ss_mask = cur_ss_mask; | ||
1072 | |||
1073 | for_each_subsys(ss, ssid) | ||
1074 | if (cur_ss_mask & (1 << ssid)) | ||
1075 | new_ss_mask |= ss->depends_on; | ||
1076 | |||
1077 | /* | ||
1078 | * Mask out subsystems which aren't available. This can | ||
1079 | * happen only if some depended-upon subsystems were bound | ||
1080 | * to non-default hierarchies. | ||
1081 | */ | ||
1082 | if (parent) | ||
1083 | new_ss_mask &= parent->child_subsys_mask; | ||
1084 | else | ||
1085 | new_ss_mask &= cgrp->root->subsys_mask; | ||
1086 | |||
1087 | if (new_ss_mask == cur_ss_mask) | ||
1088 | break; | ||
1089 | cur_ss_mask = new_ss_mask; | ||
1090 | } | ||
1091 | |||
1092 | cgrp->child_subsys_mask = cur_ss_mask; | ||
1093 | } | ||
1094 | |||
1095 | /** | ||
1040 | * cgroup_kn_unlock - unlocking helper for cgroup kernfs methods | 1096 | * cgroup_kn_unlock - unlocking helper for cgroup kernfs methods |
1041 | * @kn: the kernfs_node being serviced | 1097 | * @kn: the kernfs_node being serviced |
1042 | * | 1098 | * |
@@ -1208,12 +1264,15 @@ static int rebind_subsystems(struct cgroup_root *dst_root, unsigned int ss_mask) | |||
1208 | up_write(&css_set_rwsem); | 1264 | up_write(&css_set_rwsem); |
1209 | 1265 | ||
1210 | src_root->subsys_mask &= ~(1 << ssid); | 1266 | src_root->subsys_mask &= ~(1 << ssid); |
1211 | src_root->cgrp.child_subsys_mask &= ~(1 << ssid); | 1267 | src_root->cgrp.subtree_control &= ~(1 << ssid); |
1268 | cgroup_refresh_child_subsys_mask(&src_root->cgrp); | ||
1212 | 1269 | ||
1213 | /* default hierarchy doesn't enable controllers by default */ | 1270 | /* default hierarchy doesn't enable controllers by default */ |
1214 | dst_root->subsys_mask |= 1 << ssid; | 1271 | dst_root->subsys_mask |= 1 << ssid; |
1215 | if (dst_root != &cgrp_dfl_root) | 1272 | if (dst_root != &cgrp_dfl_root) { |
1216 | dst_root->cgrp.child_subsys_mask |= 1 << ssid; | 1273 | dst_root->cgrp.subtree_control |= 1 << ssid; |
1274 | cgroup_refresh_child_subsys_mask(&dst_root->cgrp); | ||
1275 | } | ||
1217 | 1276 | ||
1218 | if (ss->bind) | 1277 | if (ss->bind) |
1219 | ss->bind(css); | 1278 | ss->bind(css); |
@@ -1233,8 +1292,6 @@ static int cgroup_show_options(struct seq_file *seq, | |||
1233 | for_each_subsys(ss, ssid) | 1292 | for_each_subsys(ss, ssid) |
1234 | if (root->subsys_mask & (1 << ssid)) | 1293 | if (root->subsys_mask & (1 << ssid)) |
1235 | seq_printf(seq, ",%s", ss->name); | 1294 | seq_printf(seq, ",%s", ss->name); |
1236 | if (root->flags & CGRP_ROOT_SANE_BEHAVIOR) | ||
1237 | seq_puts(seq, ",sane_behavior"); | ||
1238 | if (root->flags & CGRP_ROOT_NOPREFIX) | 1295 | if (root->flags & CGRP_ROOT_NOPREFIX) |
1239 | seq_puts(seq, ",noprefix"); | 1296 | seq_puts(seq, ",noprefix"); |
1240 | if (root->flags & CGRP_ROOT_XATTR) | 1297 | if (root->flags & CGRP_ROOT_XATTR) |
@@ -1268,6 +1325,7 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) | |||
1268 | bool all_ss = false, one_ss = false; | 1325 | bool all_ss = false, one_ss = false; |
1269 | unsigned int mask = -1U; | 1326 | unsigned int mask = -1U; |
1270 | struct cgroup_subsys *ss; | 1327 | struct cgroup_subsys *ss; |
1328 | int nr_opts = 0; | ||
1271 | int i; | 1329 | int i; |
1272 | 1330 | ||
1273 | #ifdef CONFIG_CPUSETS | 1331 | #ifdef CONFIG_CPUSETS |
@@ -1277,6 +1335,8 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) | |||
1277 | memset(opts, 0, sizeof(*opts)); | 1335 | memset(opts, 0, sizeof(*opts)); |
1278 | 1336 | ||
1279 | while ((token = strsep(&o, ",")) != NULL) { | 1337 | while ((token = strsep(&o, ",")) != NULL) { |
1338 | nr_opts++; | ||
1339 | |||
1280 | if (!*token) | 1340 | if (!*token) |
1281 | return -EINVAL; | 1341 | return -EINVAL; |
1282 | if (!strcmp(token, "none")) { | 1342 | if (!strcmp(token, "none")) { |
@@ -1361,37 +1421,33 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) | |||
1361 | return -ENOENT; | 1421 | return -ENOENT; |
1362 | } | 1422 | } |
1363 | 1423 | ||
1364 | /* Consistency checks */ | ||
1365 | |||
1366 | if (opts->flags & CGRP_ROOT_SANE_BEHAVIOR) { | 1424 | if (opts->flags & CGRP_ROOT_SANE_BEHAVIOR) { |
1367 | pr_warn("sane_behavior: this is still under development and its behaviors will change, proceed at your own risk\n"); | 1425 | pr_warn("sane_behavior: this is still under development and its behaviors will change, proceed at your own risk\n"); |
1368 | 1426 | if (nr_opts != 1) { | |
1369 | if ((opts->flags & (CGRP_ROOT_NOPREFIX | CGRP_ROOT_XATTR)) || | 1427 | pr_err("sane_behavior: no other mount options allowed\n"); |
1370 | opts->cpuset_clone_children || opts->release_agent || | ||
1371 | opts->name) { | ||
1372 | pr_err("sane_behavior: noprefix, xattr, clone_children, release_agent and name are not allowed\n"); | ||
1373 | return -EINVAL; | 1428 | return -EINVAL; |
1374 | } | 1429 | } |
1375 | } else { | 1430 | return 0; |
1376 | /* | ||
1377 | * If the 'all' option was specified select all the | ||
1378 | * subsystems, otherwise if 'none', 'name=' and a subsystem | ||
1379 | * name options were not specified, let's default to 'all' | ||
1380 | */ | ||
1381 | if (all_ss || (!one_ss && !opts->none && !opts->name)) | ||
1382 | for_each_subsys(ss, i) | ||
1383 | if (!ss->disabled) | ||
1384 | opts->subsys_mask |= (1 << i); | ||
1385 | |||
1386 | /* | ||
1387 | * We either have to specify by name or by subsystems. (So | ||
1388 | * all empty hierarchies must have a name). | ||
1389 | */ | ||
1390 | if (!opts->subsys_mask && !opts->name) | ||
1391 | return -EINVAL; | ||
1392 | } | 1431 | } |
1393 | 1432 | ||
1394 | /* | 1433 | /* |
1434 | * If the 'all' option was specified select all the subsystems, | ||
1435 | * otherwise if 'none', 'name=' and a subsystem name options were | ||
1436 | * not specified, let's default to 'all' | ||
1437 | */ | ||
1438 | if (all_ss || (!one_ss && !opts->none && !opts->name)) | ||
1439 | for_each_subsys(ss, i) | ||
1440 | if (!ss->disabled) | ||
1441 | opts->subsys_mask |= (1 << i); | ||
1442 | |||
1443 | /* | ||
1444 | * We either have to specify by name or by subsystems. (So all | ||
1445 | * empty hierarchies must have a name). | ||
1446 | */ | ||
1447 | if (!opts->subsys_mask && !opts->name) | ||
1448 | return -EINVAL; | ||
1449 | |||
1450 | /* | ||
1395 | * Option noprefix was introduced just for backward compatibility | 1451 | * Option noprefix was introduced just for backward compatibility |
1396 | * with the old cpuset, so we allow noprefix only if mounting just | 1452 | * with the old cpuset, so we allow noprefix only if mounting just |
1397 | * the cpuset subsystem. | 1453 | * the cpuset subsystem. |
@@ -1399,7 +1455,6 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) | |||
1399 | if ((opts->flags & CGRP_ROOT_NOPREFIX) && (opts->subsys_mask & mask)) | 1455 | if ((opts->flags & CGRP_ROOT_NOPREFIX) && (opts->subsys_mask & mask)) |
1400 | return -EINVAL; | 1456 | return -EINVAL; |
1401 | 1457 | ||
1402 | |||
1403 | /* Can't specify "none" and some subsystems */ | 1458 | /* Can't specify "none" and some subsystems */ |
1404 | if (opts->subsys_mask && opts->none) | 1459 | if (opts->subsys_mask && opts->none) |
1405 | return -EINVAL; | 1460 | return -EINVAL; |
@@ -1414,8 +1469,8 @@ static int cgroup_remount(struct kernfs_root *kf_root, int *flags, char *data) | |||
1414 | struct cgroup_sb_opts opts; | 1469 | struct cgroup_sb_opts opts; |
1415 | unsigned int added_mask, removed_mask; | 1470 | unsigned int added_mask, removed_mask; |
1416 | 1471 | ||
1417 | if (root->flags & CGRP_ROOT_SANE_BEHAVIOR) { | 1472 | if (root == &cgrp_dfl_root) { |
1418 | pr_err("sane_behavior: remount is not allowed\n"); | 1473 | pr_err("remount is not allowed\n"); |
1419 | return -EINVAL; | 1474 | return -EINVAL; |
1420 | } | 1475 | } |
1421 | 1476 | ||
@@ -1434,11 +1489,10 @@ static int cgroup_remount(struct kernfs_root *kf_root, int *flags, char *data) | |||
1434 | removed_mask = root->subsys_mask & ~opts.subsys_mask; | 1489 | removed_mask = root->subsys_mask & ~opts.subsys_mask; |
1435 | 1490 | ||
1436 | /* Don't allow flags or name to change at remount */ | 1491 | /* Don't allow flags or name to change at remount */ |
1437 | if (((opts.flags ^ root->flags) & CGRP_ROOT_OPTION_MASK) || | 1492 | if ((opts.flags ^ root->flags) || |
1438 | (opts.name && strcmp(opts.name, root->name))) { | 1493 | (opts.name && strcmp(opts.name, root->name))) { |
1439 | pr_err("option or name mismatch, new: 0x%x \"%s\", old: 0x%x \"%s\"\n", | 1494 | pr_err("option or name mismatch, new: 0x%x \"%s\", old: 0x%x \"%s\"\n", |
1440 | opts.flags & CGRP_ROOT_OPTION_MASK, opts.name ?: "", | 1495 | opts.flags, opts.name ?: "", root->flags, root->name); |
1441 | root->flags & CGRP_ROOT_OPTION_MASK, root->name); | ||
1442 | ret = -EINVAL; | 1496 | ret = -EINVAL; |
1443 | goto out_unlock; | 1497 | goto out_unlock; |
1444 | } | 1498 | } |
@@ -1563,6 +1617,7 @@ static int cgroup_setup_root(struct cgroup_root *root, unsigned int ss_mask) | |||
1563 | { | 1617 | { |
1564 | LIST_HEAD(tmp_links); | 1618 | LIST_HEAD(tmp_links); |
1565 | struct cgroup *root_cgrp = &root->cgrp; | 1619 | struct cgroup *root_cgrp = &root->cgrp; |
1620 | struct cftype *base_files; | ||
1566 | struct css_set *cset; | 1621 | struct css_set *cset; |
1567 | int i, ret; | 1622 | int i, ret; |
1568 | 1623 | ||
@@ -1600,7 +1655,12 @@ static int cgroup_setup_root(struct cgroup_root *root, unsigned int ss_mask) | |||
1600 | } | 1655 | } |
1601 | root_cgrp->kn = root->kf_root->kn; | 1656 | root_cgrp->kn = root->kf_root->kn; |
1602 | 1657 | ||
1603 | ret = cgroup_addrm_files(root_cgrp, cgroup_base_files, true); | 1658 | if (root == &cgrp_dfl_root) |
1659 | base_files = cgroup_dfl_base_files; | ||
1660 | else | ||
1661 | base_files = cgroup_legacy_base_files; | ||
1662 | |||
1663 | ret = cgroup_addrm_files(root_cgrp, base_files, true); | ||
1604 | if (ret) | 1664 | if (ret) |
1605 | goto destroy_root; | 1665 | goto destroy_root; |
1606 | 1666 | ||
@@ -1638,7 +1698,7 @@ destroy_root: | |||
1638 | exit_root_id: | 1698 | exit_root_id: |
1639 | cgroup_exit_root_id(root); | 1699 | cgroup_exit_root_id(root); |
1640 | cancel_ref: | 1700 | cancel_ref: |
1641 | percpu_ref_cancel_init(&root_cgrp->self.refcnt); | 1701 | percpu_ref_exit(&root_cgrp->self.refcnt); |
1642 | out: | 1702 | out: |
1643 | free_cgrp_cset_links(&tmp_links); | 1703 | free_cgrp_cset_links(&tmp_links); |
1644 | return ret; | 1704 | return ret; |
@@ -1672,7 +1732,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, | |||
1672 | goto out_unlock; | 1732 | goto out_unlock; |
1673 | 1733 | ||
1674 | /* look for a matching existing root */ | 1734 | /* look for a matching existing root */ |
1675 | if (!opts.subsys_mask && !opts.none && !opts.name) { | 1735 | if (opts.flags & CGRP_ROOT_SANE_BEHAVIOR) { |
1676 | cgrp_dfl_root_visible = true; | 1736 | cgrp_dfl_root_visible = true; |
1677 | root = &cgrp_dfl_root; | 1737 | root = &cgrp_dfl_root; |
1678 | cgroup_get(&root->cgrp); | 1738 | cgroup_get(&root->cgrp); |
@@ -1730,15 +1790,8 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, | |||
1730 | goto out_unlock; | 1790 | goto out_unlock; |
1731 | } | 1791 | } |
1732 | 1792 | ||
1733 | if ((root->flags ^ opts.flags) & CGRP_ROOT_OPTION_MASK) { | 1793 | if (root->flags ^ opts.flags) |
1734 | if ((root->flags | opts.flags) & CGRP_ROOT_SANE_BEHAVIOR) { | 1794 | pr_warn("new mount options do not match the existing superblock, will be ignored\n"); |
1735 | pr_err("sane_behavior: new mount options should match the existing superblock\n"); | ||
1736 | ret = -EINVAL; | ||
1737 | goto out_unlock; | ||
1738 | } else { | ||
1739 | pr_warn("new mount options do not match the existing superblock, will be ignored\n"); | ||
1740 | } | ||
1741 | } | ||
1742 | 1795 | ||
1743 | /* | 1796 | /* |
1744 | * We want to reuse @root whose lifetime is governed by its | 1797 | * We want to reuse @root whose lifetime is governed by its |
@@ -2457,9 +2510,7 @@ static int cgroup_release_agent_show(struct seq_file *seq, void *v) | |||
2457 | 2510 | ||
2458 | static int cgroup_sane_behavior_show(struct seq_file *seq, void *v) | 2511 | static int cgroup_sane_behavior_show(struct seq_file *seq, void *v) |
2459 | { | 2512 | { |
2460 | struct cgroup *cgrp = seq_css(seq)->cgroup; | 2513 | seq_puts(seq, "0\n"); |
2461 | |||
2462 | seq_printf(seq, "%d\n", cgroup_sane_behavior(cgrp)); | ||
2463 | return 0; | 2514 | return 0; |
2464 | } | 2515 | } |
2465 | 2516 | ||
@@ -2496,7 +2547,7 @@ static int cgroup_controllers_show(struct seq_file *seq, void *v) | |||
2496 | { | 2547 | { |
2497 | struct cgroup *cgrp = seq_css(seq)->cgroup; | 2548 | struct cgroup *cgrp = seq_css(seq)->cgroup; |
2498 | 2549 | ||
2499 | cgroup_print_ss_mask(seq, cgroup_parent(cgrp)->child_subsys_mask); | 2550 | cgroup_print_ss_mask(seq, cgroup_parent(cgrp)->subtree_control); |
2500 | return 0; | 2551 | return 0; |
2501 | } | 2552 | } |
2502 | 2553 | ||
@@ -2505,7 +2556,7 @@ static int cgroup_subtree_control_show(struct seq_file *seq, void *v) | |||
2505 | { | 2556 | { |
2506 | struct cgroup *cgrp = seq_css(seq)->cgroup; | 2557 | struct cgroup *cgrp = seq_css(seq)->cgroup; |
2507 | 2558 | ||
2508 | cgroup_print_ss_mask(seq, cgrp->child_subsys_mask); | 2559 | cgroup_print_ss_mask(seq, cgrp->subtree_control); |
2509 | return 0; | 2560 | return 0; |
2510 | } | 2561 | } |
2511 | 2562 | ||
@@ -2611,6 +2662,7 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of, | |||
2611 | loff_t off) | 2662 | loff_t off) |
2612 | { | 2663 | { |
2613 | unsigned int enable = 0, disable = 0; | 2664 | unsigned int enable = 0, disable = 0; |
2665 | unsigned int css_enable, css_disable, old_ctrl, new_ctrl; | ||
2614 | struct cgroup *cgrp, *child; | 2666 | struct cgroup *cgrp, *child; |
2615 | struct cgroup_subsys *ss; | 2667 | struct cgroup_subsys *ss; |
2616 | char *tok; | 2668 | char *tok; |
@@ -2650,11 +2702,26 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of, | |||
2650 | 2702 | ||
2651 | for_each_subsys(ss, ssid) { | 2703 | for_each_subsys(ss, ssid) { |
2652 | if (enable & (1 << ssid)) { | 2704 | if (enable & (1 << ssid)) { |
2653 | if (cgrp->child_subsys_mask & (1 << ssid)) { | 2705 | if (cgrp->subtree_control & (1 << ssid)) { |
2654 | enable &= ~(1 << ssid); | 2706 | enable &= ~(1 << ssid); |
2655 | continue; | 2707 | continue; |
2656 | } | 2708 | } |
2657 | 2709 | ||
2710 | /* unavailable or not enabled on the parent? */ | ||
2711 | if (!(cgrp_dfl_root.subsys_mask & (1 << ssid)) || | ||
2712 | (cgroup_parent(cgrp) && | ||
2713 | !(cgroup_parent(cgrp)->subtree_control & (1 << ssid)))) { | ||
2714 | ret = -ENOENT; | ||
2715 | goto out_unlock; | ||
2716 | } | ||
2717 | |||
2718 | /* | ||
2719 | * @ss is already enabled through dependency and | ||
2720 | * we'll just make it visible. Skip draining. | ||
2721 | */ | ||
2722 | if (cgrp->child_subsys_mask & (1 << ssid)) | ||
2723 | continue; | ||
2724 | |||
2658 | /* | 2725 | /* |
2659 | * Because css offlining is asynchronous, userland | 2726 | * Because css offlining is asynchronous, userland |
2660 | * might try to re-enable the same controller while | 2727 | * might try to re-enable the same controller while |
@@ -2677,23 +2744,15 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of, | |||
2677 | 2744 | ||
2678 | return restart_syscall(); | 2745 | return restart_syscall(); |
2679 | } | 2746 | } |
2680 | |||
2681 | /* unavailable or not enabled on the parent? */ | ||
2682 | if (!(cgrp_dfl_root.subsys_mask & (1 << ssid)) || | ||
2683 | (cgroup_parent(cgrp) && | ||
2684 | !(cgroup_parent(cgrp)->child_subsys_mask & (1 << ssid)))) { | ||
2685 | ret = -ENOENT; | ||
2686 | goto out_unlock; | ||
2687 | } | ||
2688 | } else if (disable & (1 << ssid)) { | 2747 | } else if (disable & (1 << ssid)) { |
2689 | if (!(cgrp->child_subsys_mask & (1 << ssid))) { | 2748 | if (!(cgrp->subtree_control & (1 << ssid))) { |
2690 | disable &= ~(1 << ssid); | 2749 | disable &= ~(1 << ssid); |
2691 | continue; | 2750 | continue; |
2692 | } | 2751 | } |
2693 | 2752 | ||
2694 | /* a child has it enabled? */ | 2753 | /* a child has it enabled? */ |
2695 | cgroup_for_each_live_child(child, cgrp) { | 2754 | cgroup_for_each_live_child(child, cgrp) { |
2696 | if (child->child_subsys_mask & (1 << ssid)) { | 2755 | if (child->subtree_control & (1 << ssid)) { |
2697 | ret = -EBUSY; | 2756 | ret = -EBUSY; |
2698 | goto out_unlock; | 2757 | goto out_unlock; |
2699 | } | 2758 | } |
@@ -2707,7 +2766,7 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of, | |||
2707 | } | 2766 | } |
2708 | 2767 | ||
2709 | /* | 2768 | /* |
2710 | * Except for the root, child_subsys_mask must be zero for a cgroup | 2769 | * Except for the root, subtree_control must be zero for a cgroup |
2711 | * with tasks so that child cgroups don't compete against tasks. | 2770 | * with tasks so that child cgroups don't compete against tasks. |
2712 | */ | 2771 | */ |
2713 | if (enable && cgroup_parent(cgrp) && !list_empty(&cgrp->cset_links)) { | 2772 | if (enable && cgroup_parent(cgrp) && !list_empty(&cgrp->cset_links)) { |
@@ -2716,36 +2775,75 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of, | |||
2716 | } | 2775 | } |
2717 | 2776 | ||
2718 | /* | 2777 | /* |
2719 | * Create csses for enables and update child_subsys_mask. This | 2778 | * Update subsys masks and calculate what needs to be done. More |
2720 | * changes cgroup_e_css() results which in turn makes the | 2779 | * subsystems than specified may need to be enabled or disabled |
2721 | * subsequent cgroup_update_dfl_csses() associate all tasks in the | 2780 | * depending on subsystem dependencies. |
2722 | * subtree to the updated csses. | 2781 | */ |
2782 | cgrp->subtree_control |= enable; | ||
2783 | cgrp->subtree_control &= ~disable; | ||
2784 | |||
2785 | old_ctrl = cgrp->child_subsys_mask; | ||
2786 | cgroup_refresh_child_subsys_mask(cgrp); | ||
2787 | new_ctrl = cgrp->child_subsys_mask; | ||
2788 | |||
2789 | css_enable = ~old_ctrl & new_ctrl; | ||
2790 | css_disable = old_ctrl & ~new_ctrl; | ||
2791 | enable |= css_enable; | ||
2792 | disable |= css_disable; | ||
2793 | |||
2794 | /* | ||
2795 | * Create new csses or make the existing ones visible. A css is | ||
2796 | * created invisible if it's being implicitly enabled through | ||
2797 | * dependency. An invisible css is made visible when the userland | ||
2798 | * explicitly enables it. | ||
2723 | */ | 2799 | */ |
2724 | for_each_subsys(ss, ssid) { | 2800 | for_each_subsys(ss, ssid) { |
2725 | if (!(enable & (1 << ssid))) | 2801 | if (!(enable & (1 << ssid))) |
2726 | continue; | 2802 | continue; |
2727 | 2803 | ||
2728 | cgroup_for_each_live_child(child, cgrp) { | 2804 | cgroup_for_each_live_child(child, cgrp) { |
2729 | ret = create_css(child, ss); | 2805 | if (css_enable & (1 << ssid)) |
2806 | ret = create_css(child, ss, | ||
2807 | cgrp->subtree_control & (1 << ssid)); | ||
2808 | else | ||
2809 | ret = cgroup_populate_dir(child, 1 << ssid); | ||
2730 | if (ret) | 2810 | if (ret) |
2731 | goto err_undo_css; | 2811 | goto err_undo_css; |
2732 | } | 2812 | } |
2733 | } | 2813 | } |
2734 | 2814 | ||
2735 | cgrp->child_subsys_mask |= enable; | 2815 | /* |
2736 | cgrp->child_subsys_mask &= ~disable; | 2816 | * At this point, cgroup_e_css() results reflect the new csses |
2737 | 2817 | * making the following cgroup_update_dfl_csses() properly update | |
2818 | * css associations of all tasks in the subtree. | ||
2819 | */ | ||
2738 | ret = cgroup_update_dfl_csses(cgrp); | 2820 | ret = cgroup_update_dfl_csses(cgrp); |
2739 | if (ret) | 2821 | if (ret) |
2740 | goto err_undo_css; | 2822 | goto err_undo_css; |
2741 | 2823 | ||
2742 | /* all tasks are now migrated away from the old csses, kill them */ | 2824 | /* |
2825 | * All tasks are migrated out of disabled csses. Kill or hide | ||
2826 | * them. A css is hidden when the userland requests it to be | ||
2827 | * disabled while other subsystems are still depending on it. The | ||
2828 | * css must not actively control resources and be in the vanilla | ||
2829 | * state if it's made visible again later. Controllers which may | ||
2830 | * be depended upon should provide ->css_reset() for this purpose. | ||
2831 | */ | ||
2743 | for_each_subsys(ss, ssid) { | 2832 | for_each_subsys(ss, ssid) { |
2744 | if (!(disable & (1 << ssid))) | 2833 | if (!(disable & (1 << ssid))) |
2745 | continue; | 2834 | continue; |
2746 | 2835 | ||
2747 | cgroup_for_each_live_child(child, cgrp) | 2836 | cgroup_for_each_live_child(child, cgrp) { |
2748 | kill_css(cgroup_css(child, ss)); | 2837 | struct cgroup_subsys_state *css = cgroup_css(child, ss); |
2838 | |||
2839 | if (css_disable & (1 << ssid)) { | ||
2840 | kill_css(css); | ||
2841 | } else { | ||
2842 | cgroup_clear_dir(child, 1 << ssid); | ||
2843 | if (ss->css_reset) | ||
2844 | ss->css_reset(css); | ||
2845 | } | ||
2846 | } | ||
2749 | } | 2847 | } |
2750 | 2848 | ||
2751 | kernfs_activate(cgrp->kn); | 2849 | kernfs_activate(cgrp->kn); |
@@ -2755,8 +2853,9 @@ out_unlock: | |||
2755 | return ret ?: nbytes; | 2853 | return ret ?: nbytes; |
2756 | 2854 | ||
2757 | err_undo_css: | 2855 | err_undo_css: |
2758 | cgrp->child_subsys_mask &= ~enable; | 2856 | cgrp->subtree_control &= ~enable; |
2759 | cgrp->child_subsys_mask |= disable; | 2857 | cgrp->subtree_control |= disable; |
2858 | cgroup_refresh_child_subsys_mask(cgrp); | ||
2760 | 2859 | ||
2761 | for_each_subsys(ss, ssid) { | 2860 | for_each_subsys(ss, ssid) { |
2762 | if (!(enable & (1 << ssid))) | 2861 | if (!(enable & (1 << ssid))) |
@@ -2764,8 +2863,14 @@ err_undo_css: | |||
2764 | 2863 | ||
2765 | cgroup_for_each_live_child(child, cgrp) { | 2864 | cgroup_for_each_live_child(child, cgrp) { |
2766 | struct cgroup_subsys_state *css = cgroup_css(child, ss); | 2865 | struct cgroup_subsys_state *css = cgroup_css(child, ss); |
2767 | if (css) | 2866 | |
2867 | if (!css) | ||
2868 | continue; | ||
2869 | |||
2870 | if (css_enable & (1 << ssid)) | ||
2768 | kill_css(css); | 2871 | kill_css(css); |
2872 | else | ||
2873 | cgroup_clear_dir(child, 1 << ssid); | ||
2769 | } | 2874 | } |
2770 | } | 2875 | } |
2771 | goto out_unlock; | 2876 | goto out_unlock; |
@@ -2878,9 +2983,9 @@ static int cgroup_rename(struct kernfs_node *kn, struct kernfs_node *new_parent, | |||
2878 | 2983 | ||
2879 | /* | 2984 | /* |
2880 | * This isn't a proper migration and its usefulness is very | 2985 | * This isn't a proper migration and its usefulness is very |
2881 | * limited. Disallow if sane_behavior. | 2986 | * limited. Disallow on the default hierarchy. |
2882 | */ | 2987 | */ |
2883 | if (cgroup_sane_behavior(cgrp)) | 2988 | if (cgroup_on_dfl(cgrp)) |
2884 | return -EPERM; | 2989 | return -EPERM; |
2885 | 2990 | ||
2886 | /* | 2991 | /* |
@@ -2964,9 +3069,9 @@ static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[], | |||
2964 | 3069 | ||
2965 | for (cft = cfts; cft->name[0] != '\0'; cft++) { | 3070 | for (cft = cfts; cft->name[0] != '\0'; cft++) { |
2966 | /* does cft->flags tell us to skip this file on @cgrp? */ | 3071 | /* does cft->flags tell us to skip this file on @cgrp? */ |
2967 | if ((cft->flags & CFTYPE_ONLY_ON_DFL) && !cgroup_on_dfl(cgrp)) | 3072 | if ((cft->flags & __CFTYPE_ONLY_ON_DFL) && !cgroup_on_dfl(cgrp)) |
2968 | continue; | 3073 | continue; |
2969 | if ((cft->flags & CFTYPE_INSANE) && cgroup_sane_behavior(cgrp)) | 3074 | if ((cft->flags & __CFTYPE_NOT_ON_DFL) && cgroup_on_dfl(cgrp)) |
2970 | continue; | 3075 | continue; |
2971 | if ((cft->flags & CFTYPE_NOT_ON_ROOT) && !cgroup_parent(cgrp)) | 3076 | if ((cft->flags & CFTYPE_NOT_ON_ROOT) && !cgroup_parent(cgrp)) |
2972 | continue; | 3077 | continue; |
@@ -3024,6 +3129,9 @@ static void cgroup_exit_cftypes(struct cftype *cfts) | |||
3024 | kfree(cft->kf_ops); | 3129 | kfree(cft->kf_ops); |
3025 | cft->kf_ops = NULL; | 3130 | cft->kf_ops = NULL; |
3026 | cft->ss = NULL; | 3131 | cft->ss = NULL; |
3132 | |||
3133 | /* revert flags set by cgroup core while adding @cfts */ | ||
3134 | cft->flags &= ~(__CFTYPE_ONLY_ON_DFL | __CFTYPE_NOT_ON_DFL); | ||
3027 | } | 3135 | } |
3028 | } | 3136 | } |
3029 | 3137 | ||
@@ -3109,7 +3217,7 @@ int cgroup_rm_cftypes(struct cftype *cfts) | |||
3109 | * function currently returns 0 as long as @cfts registration is successful | 3217 | * function currently returns 0 as long as @cfts registration is successful |
3110 | * even if some file creation attempts on existing cgroups fail. | 3218 | * even if some file creation attempts on existing cgroups fail. |
3111 | */ | 3219 | */ |
3112 | int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts) | 3220 | static int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts) |
3113 | { | 3221 | { |
3114 | int ret; | 3222 | int ret; |
3115 | 3223 | ||
@@ -3135,6 +3243,40 @@ int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts) | |||
3135 | } | 3243 | } |
3136 | 3244 | ||
3137 | /** | 3245 | /** |
3246 | * cgroup_add_dfl_cftypes - add an array of cftypes for default hierarchy | ||
3247 | * @ss: target cgroup subsystem | ||
3248 | * @cfts: zero-length name terminated array of cftypes | ||
3249 | * | ||
3250 | * Similar to cgroup_add_cftypes() but the added files are only used for | ||
3251 | * the default hierarchy. | ||
3252 | */ | ||
3253 | int cgroup_add_dfl_cftypes(struct cgroup_subsys *ss, struct cftype *cfts) | ||
3254 | { | ||
3255 | struct cftype *cft; | ||
3256 | |||
3257 | for (cft = cfts; cft && cft->name[0] != '\0'; cft++) | ||
3258 | cft->flags |= __CFTYPE_ONLY_ON_DFL; | ||
3259 | return cgroup_add_cftypes(ss, cfts); | ||
3260 | } | ||
3261 | |||
3262 | /** | ||
3263 | * cgroup_add_legacy_cftypes - add an array of cftypes for legacy hierarchies | ||
3264 | * @ss: target cgroup subsystem | ||
3265 | * @cfts: zero-length name terminated array of cftypes | ||
3266 | * | ||
3267 | * Similar to cgroup_add_cftypes() but the added files are only used for | ||
3268 | * the legacy hierarchies. | ||
3269 | */ | ||
3270 | int cgroup_add_legacy_cftypes(struct cgroup_subsys *ss, struct cftype *cfts) | ||
3271 | { | ||
3272 | struct cftype *cft; | ||
3273 | |||
3274 | for (cft = cfts; cft && cft->name[0] != '\0'; cft++) | ||
3275 | cft->flags |= __CFTYPE_NOT_ON_DFL; | ||
3276 | return cgroup_add_cftypes(ss, cfts); | ||
3277 | } | ||
3278 | |||
3279 | /** | ||
3138 | * cgroup_task_count - count the number of tasks in a cgroup. | 3280 | * cgroup_task_count - count the number of tasks in a cgroup. |
3139 | * @cgrp: the cgroup in question | 3281 | * @cgrp: the cgroup in question |
3140 | * | 3282 | * |
@@ -3699,8 +3841,9 @@ after: | |||
3699 | * | 3841 | * |
3700 | * All this extra complexity was caused by the original implementation | 3842 | * All this extra complexity was caused by the original implementation |
3701 | * committing to an entirely unnecessary property. In the long term, we | 3843 | * committing to an entirely unnecessary property. In the long term, we |
3702 | * want to do away with it. Explicitly scramble sort order if | 3844 | * want to do away with it. Explicitly scramble sort order if on the |
3703 | * sane_behavior so that no such expectation exists in the new interface. | 3845 | * default hierarchy so that no such expectation exists in the new |
3846 | * interface. | ||
3704 | * | 3847 | * |
3705 | * Scrambling is done by swapping every two consecutive bits, which is | 3848 | * Scrambling is done by swapping every two consecutive bits, which is |
3706 | * non-identity one-to-one mapping which disturbs sort order sufficiently. | 3849 | * non-identity one-to-one mapping which disturbs sort order sufficiently. |
@@ -3715,7 +3858,7 @@ static pid_t pid_fry(pid_t pid) | |||
3715 | 3858 | ||
3716 | static pid_t cgroup_pid_fry(struct cgroup *cgrp, pid_t pid) | 3859 | static pid_t cgroup_pid_fry(struct cgroup *cgrp, pid_t pid) |
3717 | { | 3860 | { |
3718 | if (cgroup_sane_behavior(cgrp)) | 3861 | if (cgroup_on_dfl(cgrp)) |
3719 | return pid_fry(pid); | 3862 | return pid_fry(pid); |
3720 | else | 3863 | else |
3721 | return pid; | 3864 | return pid; |
@@ -3818,7 +3961,7 @@ static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type, | |||
3818 | css_task_iter_end(&it); | 3961 | css_task_iter_end(&it); |
3819 | length = n; | 3962 | length = n; |
3820 | /* now sort & (if procs) strip out duplicates */ | 3963 | /* now sort & (if procs) strip out duplicates */ |
3821 | if (cgroup_sane_behavior(cgrp)) | 3964 | if (cgroup_on_dfl(cgrp)) |
3822 | sort(array, length, sizeof(pid_t), fried_cmppid, NULL); | 3965 | sort(array, length, sizeof(pid_t), fried_cmppid, NULL); |
3823 | else | 3966 | else |
3824 | sort(array, length, sizeof(pid_t), cmppid, NULL); | 3967 | sort(array, length, sizeof(pid_t), cmppid, NULL); |
@@ -4040,7 +4183,8 @@ static int cgroup_clone_children_write(struct cgroup_subsys_state *css, | |||
4040 | return 0; | 4183 | return 0; |
4041 | } | 4184 | } |
4042 | 4185 | ||
4043 | static struct cftype cgroup_base_files[] = { | 4186 | /* cgroup core interface files for the default hierarchy */ |
4187 | static struct cftype cgroup_dfl_base_files[] = { | ||
4044 | { | 4188 | { |
4045 | .name = "cgroup.procs", | 4189 | .name = "cgroup.procs", |
4046 | .seq_start = cgroup_pidlist_start, | 4190 | .seq_start = cgroup_pidlist_start, |
@@ -4052,46 +4196,52 @@ static struct cftype cgroup_base_files[] = { | |||
4052 | .mode = S_IRUGO | S_IWUSR, | 4196 | .mode = S_IRUGO | S_IWUSR, |
4053 | }, | 4197 | }, |
4054 | { | 4198 | { |
4055 | .name = "cgroup.clone_children", | ||
4056 | .flags = CFTYPE_INSANE, | ||
4057 | .read_u64 = cgroup_clone_children_read, | ||
4058 | .write_u64 = cgroup_clone_children_write, | ||
4059 | }, | ||
4060 | { | ||
4061 | .name = "cgroup.sane_behavior", | ||
4062 | .flags = CFTYPE_ONLY_ON_ROOT, | ||
4063 | .seq_show = cgroup_sane_behavior_show, | ||
4064 | }, | ||
4065 | { | ||
4066 | .name = "cgroup.controllers", | 4199 | .name = "cgroup.controllers", |
4067 | .flags = CFTYPE_ONLY_ON_DFL | CFTYPE_ONLY_ON_ROOT, | 4200 | .flags = CFTYPE_ONLY_ON_ROOT, |
4068 | .seq_show = cgroup_root_controllers_show, | 4201 | .seq_show = cgroup_root_controllers_show, |
4069 | }, | 4202 | }, |
4070 | { | 4203 | { |
4071 | .name = "cgroup.controllers", | 4204 | .name = "cgroup.controllers", |
4072 | .flags = CFTYPE_ONLY_ON_DFL | CFTYPE_NOT_ON_ROOT, | 4205 | .flags = CFTYPE_NOT_ON_ROOT, |
4073 | .seq_show = cgroup_controllers_show, | 4206 | .seq_show = cgroup_controllers_show, |
4074 | }, | 4207 | }, |
4075 | { | 4208 | { |
4076 | .name = "cgroup.subtree_control", | 4209 | .name = "cgroup.subtree_control", |
4077 | .flags = CFTYPE_ONLY_ON_DFL, | ||
4078 | .seq_show = cgroup_subtree_control_show, | 4210 | .seq_show = cgroup_subtree_control_show, |
4079 | .write = cgroup_subtree_control_write, | 4211 | .write = cgroup_subtree_control_write, |
4080 | }, | 4212 | }, |
4081 | { | 4213 | { |
4082 | .name = "cgroup.populated", | 4214 | .name = "cgroup.populated", |
4083 | .flags = CFTYPE_ONLY_ON_DFL | CFTYPE_NOT_ON_ROOT, | 4215 | .flags = CFTYPE_NOT_ON_ROOT, |
4084 | .seq_show = cgroup_populated_show, | 4216 | .seq_show = cgroup_populated_show, |
4085 | }, | 4217 | }, |
4218 | { } /* terminate */ | ||
4219 | }; | ||
4086 | 4220 | ||
4087 | /* | 4221 | /* cgroup core interface files for the legacy hierarchies */ |
4088 | * Historical crazy stuff. These don't have "cgroup." prefix and | 4222 | static struct cftype cgroup_legacy_base_files[] = { |
4089 | * don't exist if sane_behavior. If you're depending on these, be | 4223 | { |
4090 | * prepared to be burned. | 4224 | .name = "cgroup.procs", |
4091 | */ | 4225 | .seq_start = cgroup_pidlist_start, |
4226 | .seq_next = cgroup_pidlist_next, | ||
4227 | .seq_stop = cgroup_pidlist_stop, | ||
4228 | .seq_show = cgroup_pidlist_show, | ||
4229 | .private = CGROUP_FILE_PROCS, | ||
4230 | .write = cgroup_procs_write, | ||
4231 | .mode = S_IRUGO | S_IWUSR, | ||
4232 | }, | ||
4233 | { | ||
4234 | .name = "cgroup.clone_children", | ||
4235 | .read_u64 = cgroup_clone_children_read, | ||
4236 | .write_u64 = cgroup_clone_children_write, | ||
4237 | }, | ||
4238 | { | ||
4239 | .name = "cgroup.sane_behavior", | ||
4240 | .flags = CFTYPE_ONLY_ON_ROOT, | ||
4241 | .seq_show = cgroup_sane_behavior_show, | ||
4242 | }, | ||
4092 | { | 4243 | { |
4093 | .name = "tasks", | 4244 | .name = "tasks", |
4094 | .flags = CFTYPE_INSANE, /* use "procs" instead */ | ||
4095 | .seq_start = cgroup_pidlist_start, | 4245 | .seq_start = cgroup_pidlist_start, |
4096 | .seq_next = cgroup_pidlist_next, | 4246 | .seq_next = cgroup_pidlist_next, |
4097 | .seq_stop = cgroup_pidlist_stop, | 4247 | .seq_stop = cgroup_pidlist_stop, |
@@ -4102,13 +4252,12 @@ static struct cftype cgroup_base_files[] = { | |||
4102 | }, | 4252 | }, |
4103 | { | 4253 | { |
4104 | .name = "notify_on_release", | 4254 | .name = "notify_on_release", |
4105 | .flags = CFTYPE_INSANE, | ||
4106 | .read_u64 = cgroup_read_notify_on_release, | 4255 | .read_u64 = cgroup_read_notify_on_release, |
4107 | .write_u64 = cgroup_write_notify_on_release, | 4256 | .write_u64 = cgroup_write_notify_on_release, |
4108 | }, | 4257 | }, |
4109 | { | 4258 | { |
4110 | .name = "release_agent", | 4259 | .name = "release_agent", |
4111 | .flags = CFTYPE_INSANE | CFTYPE_ONLY_ON_ROOT, | 4260 | .flags = CFTYPE_ONLY_ON_ROOT, |
4112 | .seq_show = cgroup_release_agent_show, | 4261 | .seq_show = cgroup_release_agent_show, |
4113 | .write = cgroup_release_agent_write, | 4262 | .write = cgroup_release_agent_write, |
4114 | .max_write_len = PATH_MAX - 1, | 4263 | .max_write_len = PATH_MAX - 1, |
@@ -4175,6 +4324,8 @@ static void css_free_work_fn(struct work_struct *work) | |||
4175 | container_of(work, struct cgroup_subsys_state, destroy_work); | 4324 | container_of(work, struct cgroup_subsys_state, destroy_work); |
4176 | struct cgroup *cgrp = css->cgroup; | 4325 | struct cgroup *cgrp = css->cgroup; |
4177 | 4326 | ||
4327 | percpu_ref_exit(&css->refcnt); | ||
4328 | |||
4178 | if (css->ss) { | 4329 | if (css->ss) { |
4179 | /* css free path */ | 4330 | /* css free path */ |
4180 | if (css->parent) | 4331 | if (css->parent) |
@@ -4314,12 +4465,14 @@ static void offline_css(struct cgroup_subsys_state *css) | |||
4314 | * create_css - create a cgroup_subsys_state | 4465 | * create_css - create a cgroup_subsys_state |
4315 | * @cgrp: the cgroup new css will be associated with | 4466 | * @cgrp: the cgroup new css will be associated with |
4316 | * @ss: the subsys of new css | 4467 | * @ss: the subsys of new css |
4468 | * @visible: whether to create control knobs for the new css or not | ||
4317 | * | 4469 | * |
4318 | * Create a new css associated with @cgrp - @ss pair. On success, the new | 4470 | * Create a new css associated with @cgrp - @ss pair. On success, the new |
4319 | * css is online and installed in @cgrp with all interface files created. | 4471 | * css is online and installed in @cgrp with all interface files created if |
4320 | * Returns 0 on success, -errno on failure. | 4472 | * @visible. Returns 0 on success, -errno on failure. |
4321 | */ | 4473 | */ |
4322 | static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss) | 4474 | static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss, |
4475 | bool visible) | ||
4323 | { | 4476 | { |
4324 | struct cgroup *parent = cgroup_parent(cgrp); | 4477 | struct cgroup *parent = cgroup_parent(cgrp); |
4325 | struct cgroup_subsys_state *parent_css = cgroup_css(parent, ss); | 4478 | struct cgroup_subsys_state *parent_css = cgroup_css(parent, ss); |
@@ -4343,9 +4496,11 @@ static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss) | |||
4343 | goto err_free_percpu_ref; | 4496 | goto err_free_percpu_ref; |
4344 | css->id = err; | 4497 | css->id = err; |
4345 | 4498 | ||
4346 | err = cgroup_populate_dir(cgrp, 1 << ss->id); | 4499 | if (visible) { |
4347 | if (err) | 4500 | err = cgroup_populate_dir(cgrp, 1 << ss->id); |
4348 | goto err_free_id; | 4501 | if (err) |
4502 | goto err_free_id; | ||
4503 | } | ||
4349 | 4504 | ||
4350 | /* @css is ready to be brought online now, make it visible */ | 4505 | /* @css is ready to be brought online now, make it visible */ |
4351 | list_add_tail_rcu(&css->sibling, &parent_css->children); | 4506 | list_add_tail_rcu(&css->sibling, &parent_css->children); |
@@ -4372,7 +4527,7 @@ err_list_del: | |||
4372 | err_free_id: | 4527 | err_free_id: |
4373 | cgroup_idr_remove(&ss->css_idr, css->id); | 4528 | cgroup_idr_remove(&ss->css_idr, css->id); |
4374 | err_free_percpu_ref: | 4529 | err_free_percpu_ref: |
4375 | percpu_ref_cancel_init(&css->refcnt); | 4530 | percpu_ref_exit(&css->refcnt); |
4376 | err_free_css: | 4531 | err_free_css: |
4377 | call_rcu(&css->rcu_head, css_free_rcu_fn); | 4532 | call_rcu(&css->rcu_head, css_free_rcu_fn); |
4378 | return err; | 4533 | return err; |
@@ -4385,6 +4540,7 @@ static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, | |||
4385 | struct cgroup_root *root; | 4540 | struct cgroup_root *root; |
4386 | struct cgroup_subsys *ss; | 4541 | struct cgroup_subsys *ss; |
4387 | struct kernfs_node *kn; | 4542 | struct kernfs_node *kn; |
4543 | struct cftype *base_files; | ||
4388 | int ssid, ret; | 4544 | int ssid, ret; |
4389 | 4545 | ||
4390 | parent = cgroup_kn_lock_live(parent_kn); | 4546 | parent = cgroup_kn_lock_live(parent_kn); |
@@ -4455,14 +4611,20 @@ static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, | |||
4455 | if (ret) | 4611 | if (ret) |
4456 | goto out_destroy; | 4612 | goto out_destroy; |
4457 | 4613 | ||
4458 | ret = cgroup_addrm_files(cgrp, cgroup_base_files, true); | 4614 | if (cgroup_on_dfl(cgrp)) |
4615 | base_files = cgroup_dfl_base_files; | ||
4616 | else | ||
4617 | base_files = cgroup_legacy_base_files; | ||
4618 | |||
4619 | ret = cgroup_addrm_files(cgrp, base_files, true); | ||
4459 | if (ret) | 4620 | if (ret) |
4460 | goto out_destroy; | 4621 | goto out_destroy; |
4461 | 4622 | ||
4462 | /* let's create and online css's */ | 4623 | /* let's create and online css's */ |
4463 | for_each_subsys(ss, ssid) { | 4624 | for_each_subsys(ss, ssid) { |
4464 | if (parent->child_subsys_mask & (1 << ssid)) { | 4625 | if (parent->child_subsys_mask & (1 << ssid)) { |
4465 | ret = create_css(cgrp, ss); | 4626 | ret = create_css(cgrp, ss, |
4627 | parent->subtree_control & (1 << ssid)); | ||
4466 | if (ret) | 4628 | if (ret) |
4467 | goto out_destroy; | 4629 | goto out_destroy; |
4468 | } | 4630 | } |
@@ -4470,10 +4632,12 @@ static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, | |||
4470 | 4632 | ||
4471 | /* | 4633 | /* |
4472 | * On the default hierarchy, a child doesn't automatically inherit | 4634 | * On the default hierarchy, a child doesn't automatically inherit |
4473 | * child_subsys_mask from the parent. Each is configured manually. | 4635 | * subtree_control from the parent. Each is configured manually. |
4474 | */ | 4636 | */ |
4475 | if (!cgroup_on_dfl(cgrp)) | 4637 | if (!cgroup_on_dfl(cgrp)) { |
4476 | cgrp->child_subsys_mask = parent->child_subsys_mask; | 4638 | cgrp->subtree_control = parent->subtree_control; |
4639 | cgroup_refresh_child_subsys_mask(cgrp); | ||
4640 | } | ||
4477 | 4641 | ||
4478 | kernfs_activate(kn); | 4642 | kernfs_activate(kn); |
4479 | 4643 | ||
@@ -4483,7 +4647,7 @@ static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, | |||
4483 | out_free_id: | 4647 | out_free_id: |
4484 | cgroup_idr_remove(&root->cgroup_idr, cgrp->id); | 4648 | cgroup_idr_remove(&root->cgroup_idr, cgrp->id); |
4485 | out_cancel_ref: | 4649 | out_cancel_ref: |
4486 | percpu_ref_cancel_init(&cgrp->self.refcnt); | 4650 | percpu_ref_exit(&cgrp->self.refcnt); |
4487 | out_free_cgrp: | 4651 | out_free_cgrp: |
4488 | kfree(cgrp); | 4652 | kfree(cgrp); |
4489 | out_unlock: | 4653 | out_unlock: |
@@ -4736,8 +4900,7 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss, bool early) | |||
4736 | */ | 4900 | */ |
4737 | int __init cgroup_init_early(void) | 4901 | int __init cgroup_init_early(void) |
4738 | { | 4902 | { |
4739 | static struct cgroup_sb_opts __initdata opts = | 4903 | static struct cgroup_sb_opts __initdata opts; |
4740 | { .flags = CGRP_ROOT_SANE_BEHAVIOR }; | ||
4741 | struct cgroup_subsys *ss; | 4904 | struct cgroup_subsys *ss; |
4742 | int i; | 4905 | int i; |
4743 | 4906 | ||
@@ -4775,7 +4938,8 @@ int __init cgroup_init(void) | |||
4775 | unsigned long key; | 4938 | unsigned long key; |
4776 | int ssid, err; | 4939 | int ssid, err; |
4777 | 4940 | ||
4778 | BUG_ON(cgroup_init_cftypes(NULL, cgroup_base_files)); | 4941 | BUG_ON(cgroup_init_cftypes(NULL, cgroup_dfl_base_files)); |
4942 | BUG_ON(cgroup_init_cftypes(NULL, cgroup_legacy_base_files)); | ||
4779 | 4943 | ||
4780 | mutex_lock(&cgroup_mutex); | 4944 | mutex_lock(&cgroup_mutex); |
4781 | 4945 | ||
@@ -4807,9 +4971,22 @@ int __init cgroup_init(void) | |||
4807 | * disabled flag and cftype registration needs kmalloc, | 4971 | * disabled flag and cftype registration needs kmalloc, |
4808 | * both of which aren't available during early_init. | 4972 | * both of which aren't available during early_init. |
4809 | */ | 4973 | */ |
4810 | if (!ss->disabled) { | 4974 | if (ss->disabled) |
4811 | cgrp_dfl_root.subsys_mask |= 1 << ss->id; | 4975 | continue; |
4812 | WARN_ON(cgroup_add_cftypes(ss, ss->base_cftypes)); | 4976 | |
4977 | cgrp_dfl_root.subsys_mask |= 1 << ss->id; | ||
4978 | |||
4979 | if (cgroup_legacy_files_on_dfl && !ss->dfl_cftypes) | ||
4980 | ss->dfl_cftypes = ss->legacy_cftypes; | ||
4981 | |||
4982 | if (!ss->dfl_cftypes) | ||
4983 | cgrp_dfl_root_inhibit_ss_mask |= 1 << ss->id; | ||
4984 | |||
4985 | if (ss->dfl_cftypes == ss->legacy_cftypes) { | ||
4986 | WARN_ON(cgroup_add_cftypes(ss, ss->dfl_cftypes)); | ||
4987 | } else { | ||
4988 | WARN_ON(cgroup_add_dfl_cftypes(ss, ss->dfl_cftypes)); | ||
4989 | WARN_ON(cgroup_add_legacy_cftypes(ss, ss->legacy_cftypes)); | ||
4813 | } | 4990 | } |
4814 | } | 4991 | } |
4815 | 4992 | ||
@@ -5205,6 +5382,14 @@ static int __init cgroup_disable(char *str) | |||
5205 | } | 5382 | } |
5206 | __setup("cgroup_disable=", cgroup_disable); | 5383 | __setup("cgroup_disable=", cgroup_disable); |
5207 | 5384 | ||
5385 | static int __init cgroup_set_legacy_files_on_dfl(char *str) | ||
5386 | { | ||
5387 | printk("cgroup: using legacy files on the default hierarchy\n"); | ||
5388 | cgroup_legacy_files_on_dfl = true; | ||
5389 | return 0; | ||
5390 | } | ||
5391 | __setup("cgroup__DEVEL__legacy_files_on_dfl", cgroup_set_legacy_files_on_dfl); | ||
5392 | |||
5208 | /** | 5393 | /** |
5209 | * css_tryget_online_from_dir - get corresponding css from a cgroup dentry | 5394 | * css_tryget_online_from_dir - get corresponding css from a cgroup dentry |
5210 | * @dentry: directory dentry of interest | 5395 | * @dentry: directory dentry of interest |
@@ -5399,6 +5584,6 @@ static struct cftype debug_files[] = { | |||
5399 | struct cgroup_subsys debug_cgrp_subsys = { | 5584 | struct cgroup_subsys debug_cgrp_subsys = { |
5400 | .css_alloc = debug_css_alloc, | 5585 | .css_alloc = debug_css_alloc, |
5401 | .css_free = debug_css_free, | 5586 | .css_free = debug_css_free, |
5402 | .base_cftypes = debug_files, | 5587 | .legacy_cftypes = debug_files, |
5403 | }; | 5588 | }; |
5404 | #endif /* CONFIG_CGROUP_DEBUG */ | 5589 | #endif /* CONFIG_CGROUP_DEBUG */ |
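The core of the kernel/cgroup.c changes above is that each cgroup now tracks two masks: subtree_control, which holds exactly the controllers written to cgroup.subtree_control, and child_subsys_mask, which additionally includes controllers pulled in implicitly through each subsystem's ->depends_on and is recomputed by cgroup_refresh_child_subsys_mask(). The standalone C sketch below mirrors that fixed-point loop; the subsystem IDs and the dependency table are invented for illustration and do not reflect the real controllers' depends_on settings.

/*
 * Standalone sketch of the fixed-point computation performed by
 * cgroup_refresh_child_subsys_mask(): start from the controllers written
 * to cgroup.subtree_control, repeatedly pull in each enabled controller's
 * ->depends_on mask, clamp to what the parent actually provides, and stop
 * when the set no longer grows.  Subsystem IDs and dependencies are made
 * up for this example.
 */
#include <stdio.h>

enum { SS_CPU, SS_CPUACCT, SS_MEMORY, SS_HUGETLB, NR_SUBSYS };

/* depends_on[i] is a bitmask of subsystems that subsystem i needs enabled. */
static const unsigned int depends_on[NR_SUBSYS] = {
	[SS_CPU]     = 1u << SS_CPUACCT,	/* hypothetical dependency */
	[SS_CPUACCT] = 0,
	[SS_MEMORY]  = 0,
	[SS_HUGETLB] = 0,
};

static unsigned int refresh_child_subsys_mask(unsigned int subtree_control,
					       unsigned int parent_mask)
{
	unsigned int cur = subtree_control;

	for (;;) {
		unsigned int next = cur;
		int i;

		for (i = 0; i < NR_SUBSYS; i++)
			if (cur & (1u << i))
				next |= depends_on[i];

		/* Depended-upon subsystems bound elsewhere can't be honoured. */
		next &= parent_mask;

		if (next == cur)
			return cur;	/* reached the fixed point */
		cur = next;
	}
}

int main(void)
{
	unsigned int subtree_control = 1u << SS_CPU;
	unsigned int parent_mask = (1u << NR_SUBSYS) - 1;

	printf("child_subsys_mask = %#x\n",
	       refresh_child_subsys_mask(subtree_control, parent_mask));
	return 0;
}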
diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c
index a79e40f9d700..92b98cc0ee76 100644
--- a/kernel/cgroup_freezer.c
+++ b/kernel/cgroup_freezer.c
@@ -480,5 +480,5 @@ struct cgroup_subsys freezer_cgrp_subsys = {
 	.css_free	= freezer_css_free,
 	.attach		= freezer_attach,
 	.fork		= freezer_fork,
-	.base_cftypes	= files,
+	.legacy_cftypes	= files,
 };
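Subsystems now register their interface files as ->legacy_cftypes (and optionally ->dfl_cftypes) instead of a single ->base_cftypes, and cgroup core tags the entries with __CFTYPE_NOT_ON_DFL or __CFTYPE_ONLY_ON_DFL so that cgroup_addrm_files() only creates them on the matching hierarchy. The toy C program below shows that flag-filtering pattern in isolation; the structure, flag values and file names are simplified stand-ins invented for the example, not the kernel's definitions.

/*
 * Toy illustration of per-hierarchy interface files: one table of
 * cftype-like entries is filtered by flags when populating a directory,
 * the way cgroup_addrm_files() skips __CFTYPE_ONLY_ON_DFL /
 * __CFTYPE_NOT_ON_DFL entries.  Names and flag values are made up.
 */
#include <stdio.h>
#include <stdbool.h>

#define ONLY_ON_DFL	(1u << 0)	/* default hierarchy only */
#define NOT_ON_DFL	(1u << 1)	/* legacy hierarchies only */

struct file_entry {
	const char *name;
	unsigned int flags;
};

static const struct file_entry example_files[] = {
	{ "example.common",      0 },
	{ "example.max",         ONLY_ON_DFL },
	{ "example.legacy_knob", NOT_ON_DFL },
	{ NULL, 0 },
};

static void populate_dir(const struct file_entry *f, bool on_dfl)
{
	for (; f->name; f++) {
		if ((f->flags & ONLY_ON_DFL) && !on_dfl)
			continue;
		if ((f->flags & NOT_ON_DFL) && on_dfl)
			continue;
		printf("  creating %s\n", f->name);
	}
}

int main(void)
{
	printf("default hierarchy:\n");
	populate_dir(example_files, true);
	printf("legacy hierarchy:\n");
	populate_dir(example_files, false);
	return 0;
}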
diff --git a/kernel/cpu.c b/kernel/cpu.c
index a343bde710b1..81e2a388a0f6 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -274,21 +274,28 @@ void clear_tasks_mm_cpumask(int cpu)
 	rcu_read_unlock();
 }
 
-static inline void check_for_tasks(int cpu)
+static inline void check_for_tasks(int dead_cpu)
 {
-	struct task_struct *p;
-	cputime_t utime, stime;
+	struct task_struct *g, *p;
 
-	write_lock_irq(&tasklist_lock);
-	for_each_process(p) {
-		task_cputime(p, &utime, &stime);
-		if (task_cpu(p) == cpu && p->state == TASK_RUNNING &&
-		    (utime || stime))
-			pr_warn("Task %s (pid = %d) is on cpu %d (state = %ld, flags = %x)\n",
-				p->comm, task_pid_nr(p), cpu,
-				p->state, p->flags);
-	}
-	write_unlock_irq(&tasklist_lock);
+	read_lock_irq(&tasklist_lock);
+	do_each_thread(g, p) {
+		if (!p->on_rq)
+			continue;
+		/*
+		 * We do the check with unlocked task_rq(p)->lock.
+		 * Order the reading to do not warn about a task,
+		 * which was running on this cpu in the past, and
+		 * it's just been woken on another cpu.
+		 */
+		rmb();
+		if (task_cpu(p) != dead_cpu)
+			continue;
+
+		pr_warn("Task %s (pid=%d) is on cpu %d (state=%ld, flags=%x)\n",
+			p->comm, task_pid_nr(p), dead_cpu, p->state, p->flags);
+	} while_each_thread(g, p);
+	read_unlock_irq(&tasklist_lock);
 }
 
 struct take_cpu_down_param {
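The rewritten check_for_tasks() above reads p->on_rq and then task_cpu(p) without taking the runqueue lock, using rmb() so that a task which just woke up on another CPU is not misreported as still being on the dead CPU. The userspace sketch below illustrates that read-read ordering with C11 fences; it is an analogy for the barrier pairing only, not the scheduler's actual wakeup protocol, and the variable names are invented for the example.

/*
 * Writer "migrates" a task to CPU 1 and then publishes that it is
 * runnable; the reader checks runnable first and only then reads the CPU,
 * with an acquire fence standing in for the kernel's rmb().
 */
#include <stdatomic.h>
#include <stdio.h>
#include <pthread.h>

static atomic_int task_cpu_id = 0;	/* which CPU the task runs on */
static atomic_int task_queued = 0;	/* is the task queued to run? */

static void *waker(void *arg)
{
	atomic_store_explicit(&task_cpu_id, 1, memory_order_relaxed);
	atomic_thread_fence(memory_order_release);	/* pairs with reader fence */
	atomic_store_explicit(&task_queued, 1, memory_order_relaxed);
	return NULL;
}

int main(void)
{
	pthread_t t;

	pthread_create(&t, NULL, waker, NULL);

	if (atomic_load_explicit(&task_queued, memory_order_relaxed)) {
		/* order the "queued" read before the "cpu" read */
		atomic_thread_fence(memory_order_acquire);
		if (atomic_load_explicit(&task_cpu_id, memory_order_relaxed) == 0)
			printf("task still appears to be on the dead CPU 0\n");
	}

	pthread_join(t, NULL);
	return 0;
}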
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 116a4164720a..22874d7cf2c0 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -76,8 +76,34 @@ struct cpuset {
 	struct cgroup_subsys_state css;
 
 	unsigned long flags;		/* "unsigned long" so bitops work */
-	cpumask_var_t cpus_allowed;	/* CPUs allowed to tasks in cpuset */
-	nodemask_t mems_allowed;	/* Memory Nodes allowed to tasks */
+
+	/*
+	 * On default hierarchy:
+	 *
+	 * The user-configured masks can only be changed by writing to
+	 * cpuset.cpus and cpuset.mems, and won't be limited by the
+	 * parent masks.
+	 *
+	 * The effective masks is the real masks that apply to the tasks
+	 * in the cpuset. They may be changed if the configured masks are
+	 * changed or hotplug happens.
+	 *
+	 * effective_mask == configured_mask & parent's effective_mask,
+	 * and if it ends up empty, it will inherit the parent's mask.
+	 *
+	 *
+	 * On legacy hierachy:
+	 *
+	 * The user-configured masks are always the same with effective masks.
+	 */
+
+	/* user-configured CPUs and Memory Nodes allow to tasks */
+	cpumask_var_t cpus_allowed;
+	nodemask_t mems_allowed;
+
+	/* effective CPUs and Memory Nodes allow to tasks */
+	cpumask_var_t effective_cpus;
+	nodemask_t effective_mems;
 
 	/*
 	 * This is old Memory Nodes tasks took on.
@@ -307,9 +333,9 @@ static struct file_system_type cpuset_fs_type = {
  */
 static void guarantee_online_cpus(struct cpuset *cs, struct cpumask *pmask)
 {
-	while (!cpumask_intersects(cs->cpus_allowed, cpu_online_mask))
+	while (!cpumask_intersects(cs->effective_cpus, cpu_online_mask))
 		cs = parent_cs(cs);
-	cpumask_and(pmask, cs->cpus_allowed, cpu_online_mask);
+	cpumask_and(pmask, cs->effective_cpus, cpu_online_mask);
 }
 
 /*
@@ -325,9 +351,9 @@ static void guarantee_online_cpus(struct cpuset *cs, struct cpumask *pmask)
  */
 static void guarantee_online_mems(struct cpuset *cs, nodemask_t *pmask)
 {
-	while (!nodes_intersects(cs->mems_allowed, node_states[N_MEMORY]))
+	while (!nodes_intersects(cs->effective_mems, node_states[N_MEMORY]))
 		cs = parent_cs(cs);
-	nodes_and(*pmask, cs->mems_allowed, node_states[N_MEMORY]);
+	nodes_and(*pmask, cs->effective_mems, node_states[N_MEMORY]);
 }
 
 /*
@@ -376,13 +402,20 @@ static struct cpuset *alloc_trial_cpuset(struct cpuset *cs)
 	if (!trial)
 		return NULL;
 
-	if (!alloc_cpumask_var(&trial->cpus_allowed, GFP_KERNEL)) {
-		kfree(trial);
-		return NULL;
-	}
-	cpumask_copy(trial->cpus_allowed, cs->cpus_allowed);
+	if (!alloc_cpumask_var(&trial->cpus_allowed, GFP_KERNEL))
+		goto free_cs;
+	if (!alloc_cpumask_var(&trial->effective_cpus, GFP_KERNEL))
+		goto free_cpus;
 
+	cpumask_copy(trial->cpus_allowed, cs->cpus_allowed);
+	cpumask_copy(trial->effective_cpus, cs->effective_cpus);
 	return trial;
+
+free_cpus:
+	free_cpumask_var(trial->cpus_allowed);
+free_cs:
+	kfree(trial);
+	return NULL;
 }
 
 /**
@@ -391,6 +424,7 @@ static struct cpuset *alloc_trial_cpuset(struct cpuset *cs)
  */
 static void free_trial_cpuset(struct cpuset *trial)
 {
+	free_cpumask_var(trial->effective_cpus);
 	free_cpumask_var(trial->cpus_allowed);
 	kfree(trial);
 }
@@ -436,9 +470,9 @@ static int validate_change(struct cpuset *cur, struct cpuset *trial)
 
 	par = parent_cs(cur);
 
-	/* We must be a subset of our parent cpuset */
+	/* On legacy hiearchy, we must be a subset of our parent cpuset. */
 	ret = -EACCES;
-	if (!is_cpuset_subset(trial, par))
+	if (!cgroup_on_dfl(cur->css.cgroup) && !is_cpuset_subset(trial, par))
 		goto out;
 
 	/*
@@ -480,11 +514,11 @@ out:
 #ifdef CONFIG_SMP
 /*
  * Helper routine for generate_sched_domains().
- * Do cpusets a, b have overlapping cpus_allowed masks?
+ * Do cpusets a, b have overlapping effective cpus_allowed masks?
  */
 static int cpusets_overlap(struct cpuset *a, struct cpuset *b)
 {
-	return cpumask_intersects(a->cpus_allowed, b->cpus_allowed);
+	return cpumask_intersects(a->effective_cpus, b->effective_cpus);
 }
 
 static void
@@ -601,7 +635,7 @@ static int generate_sched_domains(cpumask_var_t **domains,
 			*dattr = SD_ATTR_INIT;
 			update_domain_attr_tree(dattr, &top_cpuset);
 		}
-		cpumask_copy(doms[0], top_cpuset.cpus_allowed);
+		cpumask_copy(doms[0], top_cpuset.effective_cpus);
 
 		goto done;
 	}
@@ -705,7 +739,7 @@ restart:
 			struct cpuset *b = csa[j];
 
 			if (apn == b->pn) {
-				cpumask_or(dp, dp, b->cpus_allowed);
+				cpumask_or(dp, dp, b->effective_cpus);
 				if (dattr)
 					update_domain_attr_tree(dattr + nslot, b);
 
@@ -757,7 +791,7 @@ static void rebuild_sched_domains_locked(void)
 	 * passing doms with offlined cpu to partition_sched_domains().
 	 * Anyways, hotplug work item will rebuild sched domains.
 	 */
-	if (!cpumask_equal(top_cpuset.cpus_allowed, cpu_active_mask))
+	if (!cpumask_equal(top_cpuset.effective_cpus, cpu_active_mask))
 		goto out;
 
 	/* Generate domain masks and attrs */
@@ -781,45 +815,6 @@ void rebuild_sched_domains(void)
 	mutex_unlock(&cpuset_mutex);
 }
 
-/*
- * effective_cpumask_cpuset - return nearest ancestor with non-empty cpus
- * @cs: the cpuset in interest
- *
- * A cpuset's effective cpumask is the cpumask of the nearest ancestor
- * with non-empty cpus. We use effective cpumask whenever:
- * - we update tasks' cpus_allowed. (they take on the ancestor's cpumask
- *   if the cpuset they reside in has no cpus)
- * - we want to retrieve task_cs(tsk)'s cpus_allowed.
- *
- * Called with cpuset_mutex held. cpuset_cpus_allowed_fallback() is an
- * exception. See comments there.
- */
-static struct cpuset *effective_cpumask_cpuset(struct cpuset *cs)
-{
-	while (cpumask_empty(cs->cpus_allowed))
-		cs = parent_cs(cs);
-	return cs;
-}
-
-/*
- * effective_nodemask_cpuset - return nearest ancestor with non-empty mems
- * @cs: the cpuset in interest
- *
- * A cpuset's effective nodemask is the nodemask of the nearest ancestor
- * with non-empty memss. We use effective nodemask whenever:
- * - we update tasks' mems_allowed. (they take on the ancestor's nodemask
- *   if the cpuset they reside in has no mems)
- * - we want to retrieve task_cs(tsk)'s mems_allowed.
- *
- * Called with cpuset_mutex held.
- */
-static struct cpuset *effective_nodemask_cpuset(struct cpuset *cs)
-{
-	while (nodes_empty(cs->mems_allowed))
-		cs = parent_cs(cs);
-	return cs;
-}
-
 /**
  * update_tasks_cpumask - Update the cpumasks of tasks in the cpuset.
  * @cs: the cpuset in which each task's cpus_allowed mask needs to be changed
@@ -830,53 +825,80 @@ static struct cpuset *effective_nodemask_cpuset(struct cpuset *cs)
  */
 static void update_tasks_cpumask(struct cpuset *cs)
 {
-	struct cpuset *cpus_cs = effective_cpumask_cpuset(cs);
 	struct css_task_iter it;
 	struct task_struct *task;
 
837 | css_task_iter_start(&cs->css, &it); | 831 | css_task_iter_start(&cs->css, &it); |
838 | while ((task = css_task_iter_next(&it))) | 832 | while ((task = css_task_iter_next(&it))) |
839 | set_cpus_allowed_ptr(task, cpus_cs->cpus_allowed); | 833 | set_cpus_allowed_ptr(task, cs->effective_cpus); |
840 | css_task_iter_end(&it); | 834 | css_task_iter_end(&it); |
841 | } | 835 | } |
842 | 836 | ||
843 | /* | 837 | /* |
844 | * update_tasks_cpumask_hier - Update the cpumasks of tasks in the hierarchy. | 838 | * update_cpumasks_hier - Update effective cpumasks and tasks in the subtree |
845 | * @root_cs: the root cpuset of the hierarchy | 839 | * @cs: the cpuset to consider |
846 | * @update_root: update root cpuset or not? | 840 | * @new_cpus: temp variable for calculating new effective_cpus |
841 | * | ||
842 | * When configured cpumask is changed, the effective cpumasks of this cpuset | ||
843 | * and all its descendants need to be updated. | ||
847 | * | 844 | * |
848 | * This will update cpumasks of tasks in @root_cs and all other empty cpusets | 845 | * On legacy hierarchy, effective_cpus will be the same as cpus_allowed. |
849 | * which take on cpumask of @root_cs. | ||
850 | * | 846 | * |
851 | * Called with cpuset_mutex held | 847 | * Called with cpuset_mutex held |
852 | */ | 848 | */ |
853 | static void update_tasks_cpumask_hier(struct cpuset *root_cs, bool update_root) | 849 | static void update_cpumasks_hier(struct cpuset *cs, struct cpumask *new_cpus) |
854 | { | 850 | { |
855 | struct cpuset *cp; | 851 | struct cpuset *cp; |
856 | struct cgroup_subsys_state *pos_css; | 852 | struct cgroup_subsys_state *pos_css; |
853 | bool need_rebuild_sched_domains = false; | ||
857 | 854 | ||
858 | rcu_read_lock(); | 855 | rcu_read_lock(); |
859 | cpuset_for_each_descendant_pre(cp, pos_css, root_cs) { | 856 | cpuset_for_each_descendant_pre(cp, pos_css, cs) { |
860 | if (cp == root_cs) { | 857 | struct cpuset *parent = parent_cs(cp); |
861 | if (!update_root) | 858 | |
862 | continue; | 859 | cpumask_and(new_cpus, cp->cpus_allowed, parent->effective_cpus); |
863 | } else { | 860 | |
864 | /* skip the whole subtree if @cp have some CPU */ | 861 | /* |
865 | if (!cpumask_empty(cp->cpus_allowed)) { | 862 | * If it becomes empty, inherit the effective mask of the |
866 | pos_css = css_rightmost_descendant(pos_css); | 863 | * parent, which is guaranteed to have some CPUs. |
867 | continue; | 864 | */ |
868 | } | 865 | if (cpumask_empty(new_cpus)) |
866 | cpumask_copy(new_cpus, parent->effective_cpus); | ||
867 | |||
868 | /* Skip the whole subtree if the cpumask remains the same. */ | ||
869 | if (cpumask_equal(new_cpus, cp->effective_cpus)) { | ||
870 | pos_css = css_rightmost_descendant(pos_css); | ||
871 | continue; | ||
869 | } | 872 | } |
873 | |||
870 | if (!css_tryget_online(&cp->css)) | 874 | if (!css_tryget_online(&cp->css)) |
871 | continue; | 875 | continue; |
872 | rcu_read_unlock(); | 876 | rcu_read_unlock(); |
873 | 877 | ||
878 | mutex_lock(&callback_mutex); | ||
879 | cpumask_copy(cp->effective_cpus, new_cpus); | ||
880 | mutex_unlock(&callback_mutex); | ||
881 | |||
882 | WARN_ON(!cgroup_on_dfl(cp->css.cgroup) && | ||
883 | !cpumask_equal(cp->cpus_allowed, cp->effective_cpus)); | ||
884 | |||
874 | update_tasks_cpumask(cp); | 885 | update_tasks_cpumask(cp); |
875 | 886 | ||
887 | /* | ||
888 | * If the effective cpumask of any non-empty cpuset is changed, | ||
889 | * we need to rebuild sched domains. | ||
890 | */ | ||
891 | if (!cpumask_empty(cp->cpus_allowed) && | ||
892 | is_sched_load_balance(cp)) | ||
893 | need_rebuild_sched_domains = true; | ||
894 | |||
876 | rcu_read_lock(); | 895 | rcu_read_lock(); |
877 | css_put(&cp->css); | 896 | css_put(&cp->css); |
878 | } | 897 | } |
879 | rcu_read_unlock(); | 898 | rcu_read_unlock(); |
899 | |||
900 | if (need_rebuild_sched_domains) | ||
901 | rebuild_sched_domains_locked(); | ||
880 | } | 902 | } |
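update_cpumasks_hier() derives each descendant's effective mask top-down: intersect the configured mask with the parent's effective mask, and fall back to the parent's effective mask if the intersection is empty. A user-space sketch of that rule with plain bitmasks (the fixed-depth array and values are illustrative only, not the kernel's data structures):

#include <stdio.h>

/* Parent-first order: index 0 is the root, each entry i > 0 has parent i - 1. */
#define DEPTH 4

int main(void)
{
	unsigned long configured[DEPTH] = { 0xff, 0xf0, 0x0f, 0x03 };
	unsigned long effective[DEPTH];

	effective[0] = configured[0];	/* root: effective tracks the online CPUs here */

	for (int i = 1; i < DEPTH; i++) {
		unsigned long new_cpus = configured[i] & effective[i - 1];

		/* Empty intersection: inherit the parent's effective mask. */
		if (!new_cpus)
			new_cpus = effective[i - 1];
		effective[i] = new_cpus;
	}

	for (int i = 0; i < DEPTH; i++)
		printf("level %d: configured %#lx -> effective %#lx\n",
		       i, configured[i], effective[i]);
	return 0;
}

Levels 2 and 3 end up with the parent's 0xf0 because their configured masks no longer intersect it, which is exactly the fallback the diff adds.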
881 | 903 | ||
882 | /** | 904 | /** |
@@ -889,7 +911,6 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs, | |||
889 | const char *buf) | 911 | const char *buf) |
890 | { | 912 | { |
891 | int retval; | 913 | int retval; |
892 | int is_load_balanced; | ||
893 | 914 | ||
894 | /* top_cpuset.cpus_allowed tracks cpu_online_mask; it's read-only */ | 915 | /* top_cpuset.cpus_allowed tracks cpu_online_mask; it's read-only */ |
895 | if (cs == &top_cpuset) | 916 | if (cs == &top_cpuset) |
@@ -908,7 +929,8 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs, | |||
908 | if (retval < 0) | 929 | if (retval < 0) |
909 | return retval; | 930 | return retval; |
910 | 931 | ||
911 | if (!cpumask_subset(trialcs->cpus_allowed, cpu_active_mask)) | 932 | if (!cpumask_subset(trialcs->cpus_allowed, |
933 | top_cpuset.cpus_allowed)) | ||
912 | return -EINVAL; | 934 | return -EINVAL; |
913 | } | 935 | } |
914 | 936 | ||
@@ -920,16 +942,12 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs, | |||
920 | if (retval < 0) | 942 | if (retval < 0) |
921 | return retval; | 943 | return retval; |
922 | 944 | ||
923 | is_load_balanced = is_sched_load_balance(trialcs); | ||
924 | |||
925 | mutex_lock(&callback_mutex); | 945 | mutex_lock(&callback_mutex); |
926 | cpumask_copy(cs->cpus_allowed, trialcs->cpus_allowed); | 946 | cpumask_copy(cs->cpus_allowed, trialcs->cpus_allowed); |
927 | mutex_unlock(&callback_mutex); | 947 | mutex_unlock(&callback_mutex); |
928 | 948 | ||
929 | update_tasks_cpumask_hier(cs, true); | 949 | /* use trialcs->cpus_allowed as a temp variable */ |
930 | 950 | update_cpumasks_hier(cs, trialcs->cpus_allowed); | |
931 | if (is_load_balanced) | ||
932 | rebuild_sched_domains_locked(); | ||
933 | return 0; | 951 | return 0; |
934 | } | 952 | } |
935 | 953 | ||
@@ -951,15 +969,13 @@ static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from, | |||
951 | const nodemask_t *to) | 969 | const nodemask_t *to) |
952 | { | 970 | { |
953 | struct task_struct *tsk = current; | 971 | struct task_struct *tsk = current; |
954 | struct cpuset *mems_cs; | ||
955 | 972 | ||
956 | tsk->mems_allowed = *to; | 973 | tsk->mems_allowed = *to; |
957 | 974 | ||
958 | do_migrate_pages(mm, from, to, MPOL_MF_MOVE_ALL); | 975 | do_migrate_pages(mm, from, to, MPOL_MF_MOVE_ALL); |
959 | 976 | ||
960 | rcu_read_lock(); | 977 | rcu_read_lock(); |
961 | mems_cs = effective_nodemask_cpuset(task_cs(tsk)); | 978 | guarantee_online_mems(task_cs(tsk), &tsk->mems_allowed); |
962 | guarantee_online_mems(mems_cs, &tsk->mems_allowed); | ||
963 | rcu_read_unlock(); | 979 | rcu_read_unlock(); |
964 | } | 980 | } |
965 | 981 | ||
@@ -1028,13 +1044,12 @@ static void *cpuset_being_rebound; | |||
1028 | static void update_tasks_nodemask(struct cpuset *cs) | 1044 | static void update_tasks_nodemask(struct cpuset *cs) |
1029 | { | 1045 | { |
1030 | static nodemask_t newmems; /* protected by cpuset_mutex */ | 1046 | static nodemask_t newmems; /* protected by cpuset_mutex */ |
1031 | struct cpuset *mems_cs = effective_nodemask_cpuset(cs); | ||
1032 | struct css_task_iter it; | 1047 | struct css_task_iter it; |
1033 | struct task_struct *task; | 1048 | struct task_struct *task; |
1034 | 1049 | ||
1035 | cpuset_being_rebound = cs; /* causes mpol_dup() rebind */ | 1050 | cpuset_being_rebound = cs; /* causes mpol_dup() rebind */ |
1036 | 1051 | ||
1037 | guarantee_online_mems(mems_cs, &newmems); | 1052 | guarantee_online_mems(cs, &newmems); |
1038 | 1053 | ||
1039 | /* | 1054 | /* |
1040 | * The mpol_rebind_mm() call takes mmap_sem, which we couldn't | 1055 | * The mpol_rebind_mm() call takes mmap_sem, which we couldn't |
@@ -1077,36 +1092,52 @@ static void update_tasks_nodemask(struct cpuset *cs) | |||
1077 | } | 1092 | } |
1078 | 1093 | ||
1079 | /* | 1094 | /* |
1080 | * update_tasks_nodemask_hier - Update the nodemasks of tasks in the hierarchy. | 1095 | * update_nodemasks_hier - Update effective nodemasks and tasks in the subtree |
1081 | * @cs: the root cpuset of the hierarchy | 1096 | * @cs: the cpuset to consider |
1082 | * @update_root: update the root cpuset or not? | 1097 | * @new_mems: a temp variable for calculating new effective_mems |
1083 | * | 1098 | * |
1084 | * This will update nodemasks of tasks in @root_cs and all other empty cpusets | 1099 | * When configured nodemask is changed, the effective nodemasks of this cpuset |
1085 | * which take on nodemask of @root_cs. | 1100 | * and all its descendants need to be updated. |
1101 | * | ||
1102 | * On legacy hierarchy, effective_mems will be the same as mems_allowed. | ||
1086 | * | 1103 | * |
1087 | * Called with cpuset_mutex held | 1104 | * Called with cpuset_mutex held |
1088 | */ | 1105 | */ |
1089 | static void update_tasks_nodemask_hier(struct cpuset *root_cs, bool update_root) | 1106 | static void update_nodemasks_hier(struct cpuset *cs, nodemask_t *new_mems) |
1090 | { | 1107 | { |
1091 | struct cpuset *cp; | 1108 | struct cpuset *cp; |
1092 | struct cgroup_subsys_state *pos_css; | 1109 | struct cgroup_subsys_state *pos_css; |
1093 | 1110 | ||
1094 | rcu_read_lock(); | 1111 | rcu_read_lock(); |
1095 | cpuset_for_each_descendant_pre(cp, pos_css, root_cs) { | 1112 | cpuset_for_each_descendant_pre(cp, pos_css, cs) { |
1096 | if (cp == root_cs) { | 1113 | struct cpuset *parent = parent_cs(cp); |
1097 | if (!update_root) | 1114 | |
1098 | continue; | 1115 | nodes_and(*new_mems, cp->mems_allowed, parent->effective_mems); |
1099 | } else { | 1116 | |
1100 | /* skip the whole subtree if @cp have some CPU */ | 1117 | /* |
1101 | if (!nodes_empty(cp->mems_allowed)) { | 1118 | * If it becomes empty, inherit the effective mask of the |
1102 | pos_css = css_rightmost_descendant(pos_css); | 1119 | * parent, which is guaranteed to have some MEMs. |
1103 | continue; | 1120 | */ |
1104 | } | 1121 | if (nodes_empty(*new_mems)) |
1122 | *new_mems = parent->effective_mems; | ||
1123 | |||
1124 | /* Skip the whole subtree if the nodemask remains the same. */ | ||
1125 | if (nodes_equal(*new_mems, cp->effective_mems)) { | ||
1126 | pos_css = css_rightmost_descendant(pos_css); | ||
1127 | continue; | ||
1105 | } | 1128 | } |
1129 | |||
1106 | if (!css_tryget_online(&cp->css)) | 1130 | if (!css_tryget_online(&cp->css)) |
1107 | continue; | 1131 | continue; |
1108 | rcu_read_unlock(); | 1132 | rcu_read_unlock(); |
1109 | 1133 | ||
1134 | mutex_lock(&callback_mutex); | ||
1135 | cp->effective_mems = *new_mems; | ||
1136 | mutex_unlock(&callback_mutex); | ||
1137 | |||
1138 | WARN_ON(!cgroup_on_dfl(cp->css.cgroup) && | ||
1139 | !nodes_equal(cp->mems_allowed, cp->effective_mems)); | ||
1140 | |||
1110 | update_tasks_nodemask(cp); | 1141 | update_tasks_nodemask(cp); |
1111 | 1142 | ||
1112 | rcu_read_lock(); | 1143 | rcu_read_lock(); |
@@ -1156,8 +1187,8 @@ static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs, | |||
1156 | goto done; | 1187 | goto done; |
1157 | 1188 | ||
1158 | if (!nodes_subset(trialcs->mems_allowed, | 1189 | if (!nodes_subset(trialcs->mems_allowed, |
1159 | node_states[N_MEMORY])) { | 1190 | top_cpuset.mems_allowed)) { |
1160 | retval = -EINVAL; | 1191 | retval = -EINVAL; |
1161 | goto done; | 1192 | goto done; |
1162 | } | 1193 | } |
1163 | } | 1194 | } |
@@ -1174,7 +1205,8 @@ static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs, | |||
1174 | cs->mems_allowed = trialcs->mems_allowed; | 1205 | cs->mems_allowed = trialcs->mems_allowed; |
1175 | mutex_unlock(&callback_mutex); | 1206 | mutex_unlock(&callback_mutex); |
1176 | 1207 | ||
1177 | update_tasks_nodemask_hier(cs, true); | 1208 | /* use trialcs->mems_allowed as a temp variable */ |
1209 | update_nodemasks_hier(cs, &cs->mems_allowed); | ||
1178 | done: | 1210 | done: |
1179 | return retval; | 1211 | return retval; |
1180 | } | 1212 | } |
@@ -1389,12 +1421,9 @@ static int cpuset_can_attach(struct cgroup_subsys_state *css, | |||
1389 | 1421 | ||
1390 | mutex_lock(&cpuset_mutex); | 1422 | mutex_lock(&cpuset_mutex); |
1391 | 1423 | ||
1392 | /* | 1424 | /* allow moving tasks into an empty cpuset if on default hierarchy */ |
1393 | * We allow to move tasks into an empty cpuset if sane_behavior | ||
1394 | * flag is set. | ||
1395 | */ | ||
1396 | ret = -ENOSPC; | 1425 | ret = -ENOSPC; |
1397 | if (!cgroup_sane_behavior(css->cgroup) && | 1426 | if (!cgroup_on_dfl(css->cgroup) && |
1398 | (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed))) | 1427 | (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed))) |
1399 | goto out_unlock; | 1428 | goto out_unlock; |
1400 | 1429 | ||
@@ -1452,8 +1481,6 @@ static void cpuset_attach(struct cgroup_subsys_state *css, | |||
1452 | struct task_struct *leader = cgroup_taskset_first(tset); | 1481 | struct task_struct *leader = cgroup_taskset_first(tset); |
1453 | struct cpuset *cs = css_cs(css); | 1482 | struct cpuset *cs = css_cs(css); |
1454 | struct cpuset *oldcs = cpuset_attach_old_cs; | 1483 | struct cpuset *oldcs = cpuset_attach_old_cs; |
1455 | struct cpuset *cpus_cs = effective_cpumask_cpuset(cs); | ||
1456 | struct cpuset *mems_cs = effective_nodemask_cpuset(cs); | ||
1457 | 1484 | ||
1458 | mutex_lock(&cpuset_mutex); | 1485 | mutex_lock(&cpuset_mutex); |
1459 | 1486 | ||
@@ -1461,9 +1488,9 @@ static void cpuset_attach(struct cgroup_subsys_state *css, | |||
1461 | if (cs == &top_cpuset) | 1488 | if (cs == &top_cpuset) |
1462 | cpumask_copy(cpus_attach, cpu_possible_mask); | 1489 | cpumask_copy(cpus_attach, cpu_possible_mask); |
1463 | else | 1490 | else |
1464 | guarantee_online_cpus(cpus_cs, cpus_attach); | 1491 | guarantee_online_cpus(cs, cpus_attach); |
1465 | 1492 | ||
1466 | guarantee_online_mems(mems_cs, &cpuset_attach_nodemask_to); | 1493 | guarantee_online_mems(cs, &cpuset_attach_nodemask_to); |
1467 | 1494 | ||
1468 | cgroup_taskset_for_each(task, tset) { | 1495 | cgroup_taskset_for_each(task, tset) { |
1469 | /* | 1496 | /* |
@@ -1480,11 +1507,9 @@ static void cpuset_attach(struct cgroup_subsys_state *css, | |||
1480 | * Change mm, possibly for multiple threads in a threadgroup. This is | 1507 | * Change mm, possibly for multiple threads in a threadgroup. This is |
1481 | * expensive and may sleep. | 1508 | * expensive and may sleep. |
1482 | */ | 1509 | */ |
1483 | cpuset_attach_nodemask_to = cs->mems_allowed; | 1510 | cpuset_attach_nodemask_to = cs->effective_mems; |
1484 | mm = get_task_mm(leader); | 1511 | mm = get_task_mm(leader); |
1485 | if (mm) { | 1512 | if (mm) { |
1486 | struct cpuset *mems_oldcs = effective_nodemask_cpuset(oldcs); | ||
1487 | |||
1488 | mpol_rebind_mm(mm, &cpuset_attach_nodemask_to); | 1513 | mpol_rebind_mm(mm, &cpuset_attach_nodemask_to); |
1489 | 1514 | ||
1490 | /* | 1515 | /* |
@@ -1495,7 +1520,7 @@ static void cpuset_attach(struct cgroup_subsys_state *css, | |||
1495 | * mm from. | 1520 | * mm from. |
1496 | */ | 1521 | */ |
1497 | if (is_memory_migrate(cs)) { | 1522 | if (is_memory_migrate(cs)) { |
1498 | cpuset_migrate_mm(mm, &mems_oldcs->old_mems_allowed, | 1523 | cpuset_migrate_mm(mm, &oldcs->old_mems_allowed, |
1499 | &cpuset_attach_nodemask_to); | 1524 | &cpuset_attach_nodemask_to); |
1500 | } | 1525 | } |
1501 | mmput(mm); | 1526 | mmput(mm); |
@@ -1516,6 +1541,8 @@ typedef enum { | |||
1516 | FILE_MEMORY_MIGRATE, | 1541 | FILE_MEMORY_MIGRATE, |
1517 | FILE_CPULIST, | 1542 | FILE_CPULIST, |
1518 | FILE_MEMLIST, | 1543 | FILE_MEMLIST, |
1544 | FILE_EFFECTIVE_CPULIST, | ||
1545 | FILE_EFFECTIVE_MEMLIST, | ||
1519 | FILE_CPU_EXCLUSIVE, | 1546 | FILE_CPU_EXCLUSIVE, |
1520 | FILE_MEM_EXCLUSIVE, | 1547 | FILE_MEM_EXCLUSIVE, |
1521 | FILE_MEM_HARDWALL, | 1548 | FILE_MEM_HARDWALL, |
@@ -1694,6 +1721,12 @@ static int cpuset_common_seq_show(struct seq_file *sf, void *v) | |||
1694 | case FILE_MEMLIST: | 1721 | case FILE_MEMLIST: |
1695 | s += nodelist_scnprintf(s, count, cs->mems_allowed); | 1722 | s += nodelist_scnprintf(s, count, cs->mems_allowed); |
1696 | break; | 1723 | break; |
1724 | case FILE_EFFECTIVE_CPULIST: | ||
1725 | s += cpulist_scnprintf(s, count, cs->effective_cpus); | ||
1726 | break; | ||
1727 | case FILE_EFFECTIVE_MEMLIST: | ||
1728 | s += nodelist_scnprintf(s, count, cs->effective_mems); | ||
1729 | break; | ||
1697 | default: | 1730 | default: |
1698 | ret = -EINVAL; | 1731 | ret = -EINVAL; |
1699 | goto out_unlock; | 1732 | goto out_unlock; |
@@ -1779,6 +1812,18 @@ static struct cftype files[] = { | |||
1779 | }, | 1812 | }, |
1780 | 1813 | ||
1781 | { | 1814 | { |
1815 | .name = "effective_cpus", | ||
1816 | .seq_show = cpuset_common_seq_show, | ||
1817 | .private = FILE_EFFECTIVE_CPULIST, | ||
1818 | }, | ||
1819 | |||
1820 | { | ||
1821 | .name = "effective_mems", | ||
1822 | .seq_show = cpuset_common_seq_show, | ||
1823 | .private = FILE_EFFECTIVE_MEMLIST, | ||
1824 | }, | ||
1825 | |||
1826 | { | ||
1782 | .name = "cpu_exclusive", | 1827 | .name = "cpu_exclusive", |
1783 | .read_u64 = cpuset_read_u64, | 1828 | .read_u64 = cpuset_read_u64, |
1784 | .write_u64 = cpuset_write_u64, | 1829 | .write_u64 = cpuset_write_u64, |
@@ -1869,18 +1914,26 @@ cpuset_css_alloc(struct cgroup_subsys_state *parent_css) | |||
1869 | cs = kzalloc(sizeof(*cs), GFP_KERNEL); | 1914 | cs = kzalloc(sizeof(*cs), GFP_KERNEL); |
1870 | if (!cs) | 1915 | if (!cs) |
1871 | return ERR_PTR(-ENOMEM); | 1916 | return ERR_PTR(-ENOMEM); |
1872 | if (!alloc_cpumask_var(&cs->cpus_allowed, GFP_KERNEL)) { | 1917 | if (!alloc_cpumask_var(&cs->cpus_allowed, GFP_KERNEL)) |
1873 | kfree(cs); | 1918 | goto free_cs; |
1874 | return ERR_PTR(-ENOMEM); | 1919 | if (!alloc_cpumask_var(&cs->effective_cpus, GFP_KERNEL)) |
1875 | } | 1920 | goto free_cpus; |
1876 | 1921 | ||
1877 | set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags); | 1922 | set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags); |
1878 | cpumask_clear(cs->cpus_allowed); | 1923 | cpumask_clear(cs->cpus_allowed); |
1879 | nodes_clear(cs->mems_allowed); | 1924 | nodes_clear(cs->mems_allowed); |
1925 | cpumask_clear(cs->effective_cpus); | ||
1926 | nodes_clear(cs->effective_mems); | ||
1880 | fmeter_init(&cs->fmeter); | 1927 | fmeter_init(&cs->fmeter); |
1881 | cs->relax_domain_level = -1; | 1928 | cs->relax_domain_level = -1; |
1882 | 1929 | ||
1883 | return &cs->css; | 1930 | return &cs->css; |
1931 | |||
1932 | free_cpus: | ||
1933 | free_cpumask_var(cs->cpus_allowed); | ||
1934 | free_cs: | ||
1935 | kfree(cs); | ||
1936 | return ERR_PTR(-ENOMEM); | ||
1884 | } | 1937 | } |
1885 | 1938 | ||
1886 | static int cpuset_css_online(struct cgroup_subsys_state *css) | 1939 | static int cpuset_css_online(struct cgroup_subsys_state *css) |
@@ -1903,6 +1956,13 @@ static int cpuset_css_online(struct cgroup_subsys_state *css) | |||
1903 | 1956 | ||
1904 | cpuset_inc(); | 1957 | cpuset_inc(); |
1905 | 1958 | ||
1959 | mutex_lock(&callback_mutex); | ||
1960 | if (cgroup_on_dfl(cs->css.cgroup)) { | ||
1961 | cpumask_copy(cs->effective_cpus, parent->effective_cpus); | ||
1962 | cs->effective_mems = parent->effective_mems; | ||
1963 | } | ||
1964 | mutex_unlock(&callback_mutex); | ||
1965 | |||
1906 | if (!test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags)) | 1966 | if (!test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags)) |
1907 | goto out_unlock; | 1967 | goto out_unlock; |
1908 | 1968 | ||
@@ -1962,20 +2022,40 @@ static void cpuset_css_free(struct cgroup_subsys_state *css) | |||
1962 | { | 2022 | { |
1963 | struct cpuset *cs = css_cs(css); | 2023 | struct cpuset *cs = css_cs(css); |
1964 | 2024 | ||
2025 | free_cpumask_var(cs->effective_cpus); | ||
1965 | free_cpumask_var(cs->cpus_allowed); | 2026 | free_cpumask_var(cs->cpus_allowed); |
1966 | kfree(cs); | 2027 | kfree(cs); |
1967 | } | 2028 | } |
1968 | 2029 | ||
2030 | static void cpuset_bind(struct cgroup_subsys_state *root_css) | ||
2031 | { | ||
2032 | mutex_lock(&cpuset_mutex); | ||
2033 | mutex_lock(&callback_mutex); | ||
2034 | |||
2035 | if (cgroup_on_dfl(root_css->cgroup)) { | ||
2036 | cpumask_copy(top_cpuset.cpus_allowed, cpu_possible_mask); | ||
2037 | top_cpuset.mems_allowed = node_possible_map; | ||
2038 | } else { | ||
2039 | cpumask_copy(top_cpuset.cpus_allowed, | ||
2040 | top_cpuset.effective_cpus); | ||
2041 | top_cpuset.mems_allowed = top_cpuset.effective_mems; | ||
2042 | } | ||
2043 | |||
2044 | mutex_unlock(&callback_mutex); | ||
2045 | mutex_unlock(&cpuset_mutex); | ||
2046 | } | ||
2047 | |||
1969 | struct cgroup_subsys cpuset_cgrp_subsys = { | 2048 | struct cgroup_subsys cpuset_cgrp_subsys = { |
1970 | .css_alloc = cpuset_css_alloc, | 2049 | .css_alloc = cpuset_css_alloc, |
1971 | .css_online = cpuset_css_online, | 2050 | .css_online = cpuset_css_online, |
1972 | .css_offline = cpuset_css_offline, | 2051 | .css_offline = cpuset_css_offline, |
1973 | .css_free = cpuset_css_free, | 2052 | .css_free = cpuset_css_free, |
1974 | .can_attach = cpuset_can_attach, | 2053 | .can_attach = cpuset_can_attach, |
1975 | .cancel_attach = cpuset_cancel_attach, | 2054 | .cancel_attach = cpuset_cancel_attach, |
1976 | .attach = cpuset_attach, | 2055 | .attach = cpuset_attach, |
1977 | .base_cftypes = files, | 2056 | .bind = cpuset_bind, |
1978 | .early_init = 1, | 2057 | .legacy_cftypes = files, |
2058 | .early_init = 1, | ||
1979 | }; | 2059 | }; |
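With these cftypes registered, cpuset.effective_cpus and cpuset.effective_mems appear as read-only files next to cpuset.cpus and cpuset.mems. A small sketch that reads them from user space; the mount point and path assume a conventional legacy-hierarchy cpuset mount and may differ on a given system:

#include <stdio.h>

void dump_file(const char *path)
{
	char line[256];
	FILE *f = fopen(path, "r");	/* path is an assumed mount layout */

	if (!f) {
		perror(path);
		return;
	}
	if (fgets(line, sizeof(line), f))
		printf("%s: %s", path, line);
	fclose(f);
}

int main(void)
{
	/* Hypothetical legacy-hierarchy mount at /sys/fs/cgroup/cpuset. */
	dump_file("/sys/fs/cgroup/cpuset/cpuset.effective_cpus");
	dump_file("/sys/fs/cgroup/cpuset/cpuset.effective_mems");
	return 0;
}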
1980 | 2060 | ||
1981 | /** | 2061 | /** |
@@ -1990,9 +2070,13 @@ int __init cpuset_init(void) | |||
1990 | 2070 | ||
1991 | if (!alloc_cpumask_var(&top_cpuset.cpus_allowed, GFP_KERNEL)) | 2071 | if (!alloc_cpumask_var(&top_cpuset.cpus_allowed, GFP_KERNEL)) |
1992 | BUG(); | 2072 | BUG(); |
2073 | if (!alloc_cpumask_var(&top_cpuset.effective_cpus, GFP_KERNEL)) | ||
2074 | BUG(); | ||
1993 | 2075 | ||
1994 | cpumask_setall(top_cpuset.cpus_allowed); | 2076 | cpumask_setall(top_cpuset.cpus_allowed); |
1995 | nodes_setall(top_cpuset.mems_allowed); | 2077 | nodes_setall(top_cpuset.mems_allowed); |
2078 | cpumask_setall(top_cpuset.effective_cpus); | ||
2079 | nodes_setall(top_cpuset.effective_mems); | ||
1996 | 2080 | ||
1997 | fmeter_init(&top_cpuset.fmeter); | 2081 | fmeter_init(&top_cpuset.fmeter); |
1998 | set_bit(CS_SCHED_LOAD_BALANCE, &top_cpuset.flags); | 2082 | set_bit(CS_SCHED_LOAD_BALANCE, &top_cpuset.flags); |
@@ -2035,6 +2119,66 @@ static void remove_tasks_in_empty_cpuset(struct cpuset *cs) | |||
2035 | } | 2119 | } |
2036 | } | 2120 | } |
2037 | 2121 | ||
2122 | static void | ||
2123 | hotplug_update_tasks_legacy(struct cpuset *cs, | ||
2124 | struct cpumask *new_cpus, nodemask_t *new_mems, | ||
2125 | bool cpus_updated, bool mems_updated) | ||
2126 | { | ||
2127 | bool is_empty; | ||
2128 | |||
2129 | mutex_lock(&callback_mutex); | ||
2130 | cpumask_copy(cs->cpus_allowed, new_cpus); | ||
2131 | cpumask_copy(cs->effective_cpus, new_cpus); | ||
2132 | cs->mems_allowed = *new_mems; | ||
2133 | cs->effective_mems = *new_mems; | ||
2134 | mutex_unlock(&callback_mutex); | ||
2135 | |||
2136 | /* | ||
2137 | * Don't call update_tasks_cpumask() if the cpuset becomes empty, | ||
2139 | * as the tasks will be migrated to an ancestor. | ||
2139 | */ | ||
2140 | if (cpus_updated && !cpumask_empty(cs->cpus_allowed)) | ||
2141 | update_tasks_cpumask(cs); | ||
2142 | if (mems_updated && !nodes_empty(cs->mems_allowed)) | ||
2143 | update_tasks_nodemask(cs); | ||
2144 | |||
2145 | is_empty = cpumask_empty(cs->cpus_allowed) || | ||
2146 | nodes_empty(cs->mems_allowed); | ||
2147 | |||
2148 | mutex_unlock(&cpuset_mutex); | ||
2149 | |||
2150 | /* | ||
2151 | * Move tasks to the nearest ancestor with execution resources. | ||
2152 | * This is a full cgroup operation which will also call back into | ||
2153 | * cpuset. Should be done outside any lock. | ||
2154 | */ | ||
2155 | if (is_empty) | ||
2156 | remove_tasks_in_empty_cpuset(cs); | ||
2157 | |||
2158 | mutex_lock(&cpuset_mutex); | ||
2159 | } | ||
2160 | |||
2161 | static void | ||
2162 | hotplug_update_tasks(struct cpuset *cs, | ||
2163 | struct cpumask *new_cpus, nodemask_t *new_mems, | ||
2164 | bool cpus_updated, bool mems_updated) | ||
2165 | { | ||
2166 | if (cpumask_empty(new_cpus)) | ||
2167 | cpumask_copy(new_cpus, parent_cs(cs)->effective_cpus); | ||
2168 | if (nodes_empty(*new_mems)) | ||
2169 | *new_mems = parent_cs(cs)->effective_mems; | ||
2170 | |||
2171 | mutex_lock(&callback_mutex); | ||
2172 | cpumask_copy(cs->effective_cpus, new_cpus); | ||
2173 | cs->effective_mems = *new_mems; | ||
2174 | mutex_unlock(&callback_mutex); | ||
2175 | |||
2176 | if (cpus_updated) | ||
2177 | update_tasks_cpumask(cs); | ||
2178 | if (mems_updated) | ||
2179 | update_tasks_nodemask(cs); | ||
2180 | } | ||
2181 | |||
2038 | /** | 2182 | /** |
2039 | * cpuset_hotplug_update_tasks - update tasks in a cpuset for hotunplug | 2183 | * cpuset_hotplug_update_tasks - update tasks in a cpuset for hotunplug |
2040 | * @cs: cpuset in interest | 2184 | * @cs: cpuset in interest |
@@ -2045,11 +2189,10 @@ static void remove_tasks_in_empty_cpuset(struct cpuset *cs) | |||
2045 | */ | 2189 | */ |
2046 | static void cpuset_hotplug_update_tasks(struct cpuset *cs) | 2190 | static void cpuset_hotplug_update_tasks(struct cpuset *cs) |
2047 | { | 2191 | { |
2048 | static cpumask_t off_cpus; | 2192 | static cpumask_t new_cpus; |
2049 | static nodemask_t off_mems; | 2193 | static nodemask_t new_mems; |
2050 | bool is_empty; | 2194 | bool cpus_updated; |
2051 | bool sane = cgroup_sane_behavior(cs->css.cgroup); | 2195 | bool mems_updated; |
2052 | |||
2053 | retry: | 2196 | retry: |
2054 | wait_event(cpuset_attach_wq, cs->attach_in_progress == 0); | 2197 | wait_event(cpuset_attach_wq, cs->attach_in_progress == 0); |
2055 | 2198 | ||
@@ -2064,51 +2207,20 @@ retry: | |||
2064 | goto retry; | 2207 | goto retry; |
2065 | } | 2208 | } |
2066 | 2209 | ||
2067 | cpumask_andnot(&off_cpus, cs->cpus_allowed, top_cpuset.cpus_allowed); | 2210 | cpumask_and(&new_cpus, cs->cpus_allowed, parent_cs(cs)->effective_cpus); |
2068 | nodes_andnot(off_mems, cs->mems_allowed, top_cpuset.mems_allowed); | 2211 | nodes_and(new_mems, cs->mems_allowed, parent_cs(cs)->effective_mems); |
2069 | |||
2070 | mutex_lock(&callback_mutex); | ||
2071 | cpumask_andnot(cs->cpus_allowed, cs->cpus_allowed, &off_cpus); | ||
2072 | mutex_unlock(&callback_mutex); | ||
2073 | |||
2074 | /* | ||
2075 | * If sane_behavior flag is set, we need to update tasks' cpumask | ||
2076 | * for empty cpuset to take on ancestor's cpumask. Otherwise, don't | ||
2077 | * call update_tasks_cpumask() if the cpuset becomes empty, as | ||
2078 | * the tasks in it will be migrated to an ancestor. | ||
2079 | */ | ||
2080 | if ((sane && cpumask_empty(cs->cpus_allowed)) || | ||
2081 | (!cpumask_empty(&off_cpus) && !cpumask_empty(cs->cpus_allowed))) | ||
2082 | update_tasks_cpumask(cs); | ||
2083 | 2212 | ||
2084 | mutex_lock(&callback_mutex); | 2213 | cpus_updated = !cpumask_equal(&new_cpus, cs->effective_cpus); |
2085 | nodes_andnot(cs->mems_allowed, cs->mems_allowed, off_mems); | 2214 | mems_updated = !nodes_equal(new_mems, cs->effective_mems); |
2086 | mutex_unlock(&callback_mutex); | ||
2087 | |||
2088 | /* | ||
2089 | * If sane_behavior flag is set, we need to update tasks' nodemask | ||
2090 | * for empty cpuset to take on ancestor's nodemask. Otherwise, don't | ||
2091 | * call update_tasks_nodemask() if the cpuset becomes empty, as | ||
2092 | * the tasks in it will be migratd to an ancestor. | ||
2093 | */ | ||
2094 | if ((sane && nodes_empty(cs->mems_allowed)) || | ||
2095 | (!nodes_empty(off_mems) && !nodes_empty(cs->mems_allowed))) | ||
2096 | update_tasks_nodemask(cs); | ||
2097 | 2215 | ||
2098 | is_empty = cpumask_empty(cs->cpus_allowed) || | 2216 | if (cgroup_on_dfl(cs->css.cgroup)) |
2099 | nodes_empty(cs->mems_allowed); | 2217 | hotplug_update_tasks(cs, &new_cpus, &new_mems, |
2218 | cpus_updated, mems_updated); | ||
2219 | else | ||
2220 | hotplug_update_tasks_legacy(cs, &new_cpus, &new_mems, | ||
2221 | cpus_updated, mems_updated); | ||
2100 | 2222 | ||
2101 | mutex_unlock(&cpuset_mutex); | 2223 | mutex_unlock(&cpuset_mutex); |
2102 | |||
2103 | /* | ||
2104 | * If sane_behavior flag is set, we'll keep tasks in empty cpusets. | ||
2105 | * | ||
2106 | * Otherwise move tasks to the nearest ancestor with execution | ||
2107 | * resources. This is full cgroup operation which will | ||
2108 | * also call back into cpuset. Should be done outside any lock. | ||
2109 | */ | ||
2110 | if (!sane && is_empty) | ||
2111 | remove_tasks_in_empty_cpuset(cs); | ||
2112 | } | 2224 | } |
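cpuset_hotplug_update_tasks() now computes the would-be masks once and branches on the hierarchy type: the default hierarchy keeps the configured masks and only shrinks the effective ones, while the legacy hierarchy overwrites the configuration and evacuates cpusets that become empty. A condensed, hedged sketch of that dispatch (the struct and helper are stand-ins, not the kernel API):

#include <stdbool.h>

struct cs {
	unsigned long cpus_allowed;	/* configured mask */
	unsigned long effective_cpus;	/* derived mask */
	unsigned long parent_effective;	/* parent's effective mask */
	bool on_dfl;			/* on the default hierarchy? */
};

/* Recompute effective CPUs after hotplug; legacy mode also rewrites the config. */
void hotplug_update(struct cs *cs)
{
	unsigned long new_cpus = cs->cpus_allowed & cs->parent_effective;

	if (cs->on_dfl) {
		/* Default hierarchy: keep the config, fall back to the parent. */
		if (!new_cpus)
			new_cpus = cs->parent_effective;
		cs->effective_cpus = new_cpus;
	} else {
		/*
		 * Legacy hierarchy: the config follows hotplug; a cpuset that
		 * becomes empty would have its tasks moved to an ancestor.
		 */
		cs->cpus_allowed = new_cpus;
		cs->effective_cpus = new_cpus;
	}
}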
2113 | 2225 | ||
2114 | /** | 2226 | /** |
@@ -2132,6 +2244,7 @@ static void cpuset_hotplug_workfn(struct work_struct *work) | |||
2132 | static cpumask_t new_cpus; | 2244 | static cpumask_t new_cpus; |
2133 | static nodemask_t new_mems; | 2245 | static nodemask_t new_mems; |
2134 | bool cpus_updated, mems_updated; | 2246 | bool cpus_updated, mems_updated; |
2247 | bool on_dfl = cgroup_on_dfl(top_cpuset.css.cgroup); | ||
2135 | 2248 | ||
2136 | mutex_lock(&cpuset_mutex); | 2249 | mutex_lock(&cpuset_mutex); |
2137 | 2250 | ||
@@ -2139,13 +2252,15 @@ static void cpuset_hotplug_workfn(struct work_struct *work) | |||
2139 | cpumask_copy(&new_cpus, cpu_active_mask); | 2252 | cpumask_copy(&new_cpus, cpu_active_mask); |
2140 | new_mems = node_states[N_MEMORY]; | 2253 | new_mems = node_states[N_MEMORY]; |
2141 | 2254 | ||
2142 | cpus_updated = !cpumask_equal(top_cpuset.cpus_allowed, &new_cpus); | 2255 | cpus_updated = !cpumask_equal(top_cpuset.effective_cpus, &new_cpus); |
2143 | mems_updated = !nodes_equal(top_cpuset.mems_allowed, new_mems); | 2256 | mems_updated = !nodes_equal(top_cpuset.effective_mems, new_mems); |
2144 | 2257 | ||
2145 | /* synchronize cpus_allowed to cpu_active_mask */ | 2258 | /* synchronize cpus_allowed to cpu_active_mask */ |
2146 | if (cpus_updated) { | 2259 | if (cpus_updated) { |
2147 | mutex_lock(&callback_mutex); | 2260 | mutex_lock(&callback_mutex); |
2148 | cpumask_copy(top_cpuset.cpus_allowed, &new_cpus); | 2261 | if (!on_dfl) |
2262 | cpumask_copy(top_cpuset.cpus_allowed, &new_cpus); | ||
2263 | cpumask_copy(top_cpuset.effective_cpus, &new_cpus); | ||
2149 | mutex_unlock(&callback_mutex); | 2264 | mutex_unlock(&callback_mutex); |
2150 | /* we don't mess with cpumasks of tasks in top_cpuset */ | 2265 | /* we don't mess with cpumasks of tasks in top_cpuset */ |
2151 | } | 2266 | } |
@@ -2153,7 +2268,9 @@ static void cpuset_hotplug_workfn(struct work_struct *work) | |||
2153 | /* synchronize mems_allowed to N_MEMORY */ | 2268 | /* synchronize mems_allowed to N_MEMORY */ |
2154 | if (mems_updated) { | 2269 | if (mems_updated) { |
2155 | mutex_lock(&callback_mutex); | 2270 | mutex_lock(&callback_mutex); |
2156 | top_cpuset.mems_allowed = new_mems; | 2271 | if (!on_dfl) |
2272 | top_cpuset.mems_allowed = new_mems; | ||
2273 | top_cpuset.effective_mems = new_mems; | ||
2157 | mutex_unlock(&callback_mutex); | 2274 | mutex_unlock(&callback_mutex); |
2158 | update_tasks_nodemask(&top_cpuset); | 2275 | update_tasks_nodemask(&top_cpuset); |
2159 | } | 2276 | } |
@@ -2228,6 +2345,9 @@ void __init cpuset_init_smp(void) | |||
2228 | top_cpuset.mems_allowed = node_states[N_MEMORY]; | 2345 | top_cpuset.mems_allowed = node_states[N_MEMORY]; |
2229 | top_cpuset.old_mems_allowed = top_cpuset.mems_allowed; | 2346 | top_cpuset.old_mems_allowed = top_cpuset.mems_allowed; |
2230 | 2347 | ||
2348 | cpumask_copy(top_cpuset.effective_cpus, cpu_active_mask); | ||
2349 | top_cpuset.effective_mems = node_states[N_MEMORY]; | ||
2350 | |||
2231 | register_hotmemory_notifier(&cpuset_track_online_nodes_nb); | 2351 | register_hotmemory_notifier(&cpuset_track_online_nodes_nb); |
2232 | } | 2352 | } |
2233 | 2353 | ||
@@ -2244,23 +2364,17 @@ void __init cpuset_init_smp(void) | |||
2244 | 2364 | ||
2245 | void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask) | 2365 | void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask) |
2246 | { | 2366 | { |
2247 | struct cpuset *cpus_cs; | ||
2248 | |||
2249 | mutex_lock(&callback_mutex); | 2367 | mutex_lock(&callback_mutex); |
2250 | rcu_read_lock(); | 2368 | rcu_read_lock(); |
2251 | cpus_cs = effective_cpumask_cpuset(task_cs(tsk)); | 2369 | guarantee_online_cpus(task_cs(tsk), pmask); |
2252 | guarantee_online_cpus(cpus_cs, pmask); | ||
2253 | rcu_read_unlock(); | 2370 | rcu_read_unlock(); |
2254 | mutex_unlock(&callback_mutex); | 2371 | mutex_unlock(&callback_mutex); |
2255 | } | 2372 | } |
2256 | 2373 | ||
2257 | void cpuset_cpus_allowed_fallback(struct task_struct *tsk) | 2374 | void cpuset_cpus_allowed_fallback(struct task_struct *tsk) |
2258 | { | 2375 | { |
2259 | struct cpuset *cpus_cs; | ||
2260 | |||
2261 | rcu_read_lock(); | 2376 | rcu_read_lock(); |
2262 | cpus_cs = effective_cpumask_cpuset(task_cs(tsk)); | 2377 | do_set_cpus_allowed(tsk, task_cs(tsk)->effective_cpus); |
2263 | do_set_cpus_allowed(tsk, cpus_cs->cpus_allowed); | ||
2264 | rcu_read_unlock(); | 2378 | rcu_read_unlock(); |
2265 | 2379 | ||
2266 | /* | 2380 | /* |
@@ -2299,13 +2413,11 @@ void cpuset_init_current_mems_allowed(void) | |||
2299 | 2413 | ||
2300 | nodemask_t cpuset_mems_allowed(struct task_struct *tsk) | 2414 | nodemask_t cpuset_mems_allowed(struct task_struct *tsk) |
2301 | { | 2415 | { |
2302 | struct cpuset *mems_cs; | ||
2303 | nodemask_t mask; | 2416 | nodemask_t mask; |
2304 | 2417 | ||
2305 | mutex_lock(&callback_mutex); | 2418 | mutex_lock(&callback_mutex); |
2306 | rcu_read_lock(); | 2419 | rcu_read_lock(); |
2307 | mems_cs = effective_nodemask_cpuset(task_cs(tsk)); | 2420 | guarantee_online_mems(task_cs(tsk), &mask); |
2308 | guarantee_online_mems(mems_cs, &mask); | ||
2309 | rcu_read_unlock(); | 2421 | rcu_read_unlock(); |
2310 | mutex_unlock(&callback_mutex); | 2422 | mutex_unlock(&callback_mutex); |
2311 | 2423 | ||
diff --git a/kernel/events/core.c b/kernel/events/core.c index a33d9a2bcbd7..1cf24b3e42ec 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c | |||
@@ -2320,7 +2320,7 @@ static void perf_event_context_sched_out(struct task_struct *task, int ctxn, | |||
2320 | next_parent = rcu_dereference(next_ctx->parent_ctx); | 2320 | next_parent = rcu_dereference(next_ctx->parent_ctx); |
2321 | 2321 | ||
2322 | /* If neither context have a parent context; they cannot be clones. */ | 2322 | /* If neither context have a parent context; they cannot be clones. */ |
2323 | if (!parent && !next_parent) | 2323 | if (!parent || !next_parent) |
2324 | goto unlock; | 2324 | goto unlock; |
2325 | 2325 | ||
2326 | if (next_parent == ctx || next_ctx == parent || next_parent == parent) { | 2326 | if (next_parent == ctx || next_ctx == parent || next_parent == parent) { |
@@ -5266,6 +5266,12 @@ static void perf_event_mmap_event(struct perf_mmap_event *mmap_event) | |||
5266 | 5266 | ||
5267 | goto got_name; | 5267 | goto got_name; |
5268 | } else { | 5268 | } else { |
5269 | if (vma->vm_ops && vma->vm_ops->name) { | ||
5270 | name = (char *) vma->vm_ops->name(vma); | ||
5271 | if (name) | ||
5272 | goto cpy_name; | ||
5273 | } | ||
5274 | |||
5269 | name = (char *)arch_vma_name(vma); | 5275 | name = (char *)arch_vma_name(vma); |
5270 | if (name) | 5276 | if (name) |
5271 | goto cpy_name; | 5277 | goto cpy_name; |
@@ -7458,7 +7464,19 @@ __perf_event_exit_task(struct perf_event *child_event, | |||
7458 | struct perf_event_context *child_ctx, | 7464 | struct perf_event_context *child_ctx, |
7459 | struct task_struct *child) | 7465 | struct task_struct *child) |
7460 | { | 7466 | { |
7461 | perf_remove_from_context(child_event, true); | 7467 | /* |
7468 | * Do not destroy the 'original' grouping; because of the context | ||
7469 | * switch optimization the original events could've ended up in a | ||
7470 | * random child task. | ||
7471 | * | ||
7472 | * If we were to destroy the original group, all group related | ||
7473 | * operations would cease to function properly after this random | ||
7474 | * child dies. | ||
7475 | * | ||
7476 | * Do destroy all inherited groups, we don't care about those | ||
7477 | * and being thorough is better. | ||
7478 | */ | ||
7479 | perf_remove_from_context(child_event, !!child_event->parent); | ||
7462 | 7480 | ||
7463 | /* | 7481 | /* |
7464 | * It can happen that the parent exits first, and has events | 7482 | * It can happen that the parent exits first, and has events |
@@ -7474,7 +7492,7 @@ __perf_event_exit_task(struct perf_event *child_event, | |||
7474 | static void perf_event_exit_task_context(struct task_struct *child, int ctxn) | 7492 | static void perf_event_exit_task_context(struct task_struct *child, int ctxn) |
7475 | { | 7493 | { |
7476 | struct perf_event *child_event, *next; | 7494 | struct perf_event *child_event, *next; |
7477 | struct perf_event_context *child_ctx; | 7495 | struct perf_event_context *child_ctx, *parent_ctx; |
7478 | unsigned long flags; | 7496 | unsigned long flags; |
7479 | 7497 | ||
7480 | if (likely(!child->perf_event_ctxp[ctxn])) { | 7498 | if (likely(!child->perf_event_ctxp[ctxn])) { |
@@ -7499,6 +7517,15 @@ static void perf_event_exit_task_context(struct task_struct *child, int ctxn) | |||
7499 | raw_spin_lock(&child_ctx->lock); | 7517 | raw_spin_lock(&child_ctx->lock); |
7500 | task_ctx_sched_out(child_ctx); | 7518 | task_ctx_sched_out(child_ctx); |
7501 | child->perf_event_ctxp[ctxn] = NULL; | 7519 | child->perf_event_ctxp[ctxn] = NULL; |
7520 | |||
7521 | /* | ||
7522 | * In order to avoid freeing: child_ctx->parent_ctx->task | ||
7523 | * under perf_event_context::lock, grab another reference. | ||
7524 | */ | ||
7525 | parent_ctx = child_ctx->parent_ctx; | ||
7526 | if (parent_ctx) | ||
7527 | get_ctx(parent_ctx); | ||
7528 | |||
7502 | /* | 7529 | /* |
7503 | * If this context is a clone; unclone it so it can't get | 7530 | * If this context is a clone; unclone it so it can't get |
7504 | * swapped to another process while we're removing all | 7531 | * swapped to another process while we're removing all |
@@ -7509,6 +7536,13 @@ static void perf_event_exit_task_context(struct task_struct *child, int ctxn) | |||
7509 | raw_spin_unlock_irqrestore(&child_ctx->lock, flags); | 7536 | raw_spin_unlock_irqrestore(&child_ctx->lock, flags); |
7510 | 7537 | ||
7511 | /* | 7538 | /* |
7539 | * Now that we no longer hold perf_event_context::lock, drop | ||
7540 | * our extra child_ctx->parent_ctx reference. | ||
7541 | */ | ||
7542 | if (parent_ctx) | ||
7543 | put_ctx(parent_ctx); | ||
7544 | |||
7545 | /* | ||
7512 | * Report the task dead after unscheduling the events so that we | 7546 | * Report the task dead after unscheduling the events so that we |
7513 | * won't get any samples after PERF_RECORD_EXIT. We can however still | 7547 | * won't get any samples after PERF_RECORD_EXIT. We can however still |
7514 | * get a few PERF_RECORD_READ events. | 7548 | * get a few PERF_RECORD_READ events. |
@@ -7776,7 +7810,7 @@ inherit_task_group(struct perf_event *event, struct task_struct *parent, | |||
7776 | /* | 7810 | /* |
7777 | * Initialize the perf_event context in task_struct | 7811 | * Initialize the perf_event context in task_struct |
7778 | */ | 7812 | */ |
7779 | int perf_event_init_context(struct task_struct *child, int ctxn) | 7813 | static int perf_event_init_context(struct task_struct *child, int ctxn) |
7780 | { | 7814 | { |
7781 | struct perf_event_context *child_ctx, *parent_ctx; | 7815 | struct perf_event_context *child_ctx, *parent_ctx; |
7782 | struct perf_event_context *cloned_ctx; | 7816 | struct perf_event_context *cloned_ctx; |
diff --git a/kernel/fork.c b/kernel/fork.c index 627b7f80afb0..5f1bf3bebb4f 100644 --- a/kernel/fork.c +++ b/kernel/fork.c | |||
@@ -1095,7 +1095,6 @@ static void rt_mutex_init_task(struct task_struct *p) | |||
1095 | p->pi_waiters = RB_ROOT; | 1095 | p->pi_waiters = RB_ROOT; |
1096 | p->pi_waiters_leftmost = NULL; | 1096 | p->pi_waiters_leftmost = NULL; |
1097 | p->pi_blocked_on = NULL; | 1097 | p->pi_blocked_on = NULL; |
1098 | p->pi_top_task = NULL; | ||
1099 | #endif | 1098 | #endif |
1100 | } | 1099 | } |
1101 | 1100 | ||
diff --git a/kernel/futex.c b/kernel/futex.c index b632b5f3f094..d3a9d946d0b7 100644 --- a/kernel/futex.c +++ b/kernel/futex.c | |||
@@ -792,94 +792,91 @@ void exit_pi_state_list(struct task_struct *curr) | |||
792 | * [10] There is no transient state which leaves owner and user space | 792 | * [10] There is no transient state which leaves owner and user space |
793 | * TID out of sync. | 793 | * TID out of sync. |
794 | */ | 794 | */ |
795 | static int | 795 | |
796 | lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, | 796 | /* |
797 | union futex_key *key, struct futex_pi_state **ps) | 797 | * Validate that the existing waiter has a pi_state and sanity check |
798 | * the pi_state against the user space value. If correct, attach to | ||
799 | * it. | ||
800 | */ | ||
801 | static int attach_to_pi_state(u32 uval, struct futex_pi_state *pi_state, | ||
802 | struct futex_pi_state **ps) | ||
798 | { | 803 | { |
799 | struct futex_pi_state *pi_state = NULL; | ||
800 | struct futex_q *this, *next; | ||
801 | struct task_struct *p; | ||
802 | pid_t pid = uval & FUTEX_TID_MASK; | 804 | pid_t pid = uval & FUTEX_TID_MASK; |
803 | 805 | ||
804 | plist_for_each_entry_safe(this, next, &hb->chain, list) { | 806 | /* |
805 | if (match_futex(&this->key, key)) { | 807 | * Userspace might have messed up non-PI and PI futexes [3] |
806 | /* | 808 | */ |
807 | * Sanity check the waiter before increasing | 809 | if (unlikely(!pi_state)) |
808 | * the refcount and attaching to it. | 810 | return -EINVAL; |
809 | */ | ||
810 | pi_state = this->pi_state; | ||
811 | /* | ||
812 | * Userspace might have messed up non-PI and | ||
813 | * PI futexes [3] | ||
814 | */ | ||
815 | if (unlikely(!pi_state)) | ||
816 | return -EINVAL; | ||
817 | 811 | ||
818 | WARN_ON(!atomic_read(&pi_state->refcount)); | 812 | WARN_ON(!atomic_read(&pi_state->refcount)); |
819 | 813 | ||
814 | /* | ||
815 | * Handle the owner died case: | ||
816 | */ | ||
817 | if (uval & FUTEX_OWNER_DIED) { | ||
818 | /* | ||
819 | * exit_pi_state_list sets owner to NULL and wakes the | ||
820 | * topmost waiter. The task which acquires the | ||
821 | * pi_state->rt_mutex will fixup owner. | ||
822 | */ | ||
823 | if (!pi_state->owner) { | ||
820 | /* | 824 | /* |
821 | * Handle the owner died case: | 825 | * No pi state owner, but the user space TID |
826 | * is not 0. Inconsistent state. [5] | ||
822 | */ | 827 | */ |
823 | if (uval & FUTEX_OWNER_DIED) { | 828 | if (pid) |
824 | /* | 829 | return -EINVAL; |
825 | * exit_pi_state_list sets owner to NULL and | ||
826 | * wakes the topmost waiter. The task which | ||
827 | * acquires the pi_state->rt_mutex will fixup | ||
828 | * owner. | ||
829 | */ | ||
830 | if (!pi_state->owner) { | ||
831 | /* | ||
832 | * No pi state owner, but the user | ||
833 | * space TID is not 0. Inconsistent | ||
834 | * state. [5] | ||
835 | */ | ||
836 | if (pid) | ||
837 | return -EINVAL; | ||
838 | /* | ||
839 | * Take a ref on the state and | ||
840 | * return. [4] | ||
841 | */ | ||
842 | goto out_state; | ||
843 | } | ||
844 | |||
845 | /* | ||
846 | * If TID is 0, then either the dying owner | ||
847 | * has not yet executed exit_pi_state_list() | ||
848 | * or some waiter acquired the rtmutex in the | ||
849 | * pi state, but did not yet fixup the TID in | ||
850 | * user space. | ||
851 | * | ||
852 | * Take a ref on the state and return. [6] | ||
853 | */ | ||
854 | if (!pid) | ||
855 | goto out_state; | ||
856 | } else { | ||
857 | /* | ||
858 | * If the owner died bit is not set, | ||
859 | * then the pi_state must have an | ||
860 | * owner. [7] | ||
861 | */ | ||
862 | if (!pi_state->owner) | ||
863 | return -EINVAL; | ||
864 | } | ||
865 | |||
866 | /* | 830 | /* |
867 | * Bail out if user space manipulated the | 831 | * Take a ref on the state and return success. [4] |
868 | * futex value. If pi state exists then the | ||
869 | * owner TID must be the same as the user | ||
870 | * space TID. [9/10] | ||
871 | */ | 832 | */ |
872 | if (pid != task_pid_vnr(pi_state->owner)) | 833 | goto out_state; |
873 | return -EINVAL; | ||
874 | |||
875 | out_state: | ||
876 | atomic_inc(&pi_state->refcount); | ||
877 | *ps = pi_state; | ||
878 | return 0; | ||
879 | } | 834 | } |
835 | |||
836 | /* | ||
837 | * If TID is 0, then either the dying owner has not | ||
838 | * yet executed exit_pi_state_list() or some waiter | ||
839 | * acquired the rtmutex in the pi state, but did not | ||
840 | * yet fixup the TID in user space. | ||
841 | * | ||
842 | * Take a ref on the state and return success. [6] | ||
843 | */ | ||
844 | if (!pid) | ||
845 | goto out_state; | ||
846 | } else { | ||
847 | /* | ||
848 | * If the owner died bit is not set, then the pi_state | ||
849 | * must have an owner. [7] | ||
850 | */ | ||
851 | if (!pi_state->owner) | ||
852 | return -EINVAL; | ||
880 | } | 853 | } |
881 | 854 | ||
882 | /* | 855 | /* |
856 | * Bail out if user space manipulated the futex value. If pi | ||
857 | * state exists then the owner TID must be the same as the | ||
858 | * user space TID. [9/10] | ||
859 | */ | ||
860 | if (pid != task_pid_vnr(pi_state->owner)) | ||
861 | return -EINVAL; | ||
862 | out_state: | ||
863 | atomic_inc(&pi_state->refcount); | ||
864 | *ps = pi_state; | ||
865 | return 0; | ||
866 | } | ||
867 | |||
868 | /* | ||
869 | * Lookup the task for the TID provided from user space and attach to | ||
870 | * it after doing proper sanity checks. | ||
871 | */ | ||
872 | static int attach_to_pi_owner(u32 uval, union futex_key *key, | ||
873 | struct futex_pi_state **ps) | ||
874 | { | ||
875 | pid_t pid = uval & FUTEX_TID_MASK; | ||
876 | struct futex_pi_state *pi_state; | ||
877 | struct task_struct *p; | ||
878 | |||
879 | /* | ||
883 | * We are the first waiter - try to look up the real owner and attach | 880 | * We are the first waiter - try to look up the real owner and attach |
884 | * the new pi_state to it, but bail out when TID = 0 [1] | 881 | * the new pi_state to it, but bail out when TID = 0 [1] |
885 | */ | 882 | */ |
@@ -920,7 +917,7 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, | |||
920 | pi_state = alloc_pi_state(); | 917 | pi_state = alloc_pi_state(); |
921 | 918 | ||
922 | /* | 919 | /* |
923 | * Initialize the pi_mutex in locked state and make 'p' | 920 | * Initialize the pi_mutex in locked state and make @p |
924 | * the owner of it: | 921 | * the owner of it: |
925 | */ | 922 | */ |
926 | rt_mutex_init_proxy_locked(&pi_state->pi_mutex, p); | 923 | rt_mutex_init_proxy_locked(&pi_state->pi_mutex, p); |
@@ -940,6 +937,36 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, | |||
940 | return 0; | 937 | return 0; |
941 | } | 938 | } |
942 | 939 | ||
940 | static int lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, | ||
941 | union futex_key *key, struct futex_pi_state **ps) | ||
942 | { | ||
943 | struct futex_q *match = futex_top_waiter(hb, key); | ||
944 | |||
945 | /* | ||
946 | * If there is a waiter on that futex, validate it and | ||
947 | * attach to the pi_state when the validation succeeds. | ||
948 | */ | ||
949 | if (match) | ||
950 | return attach_to_pi_state(uval, match->pi_state, ps); | ||
951 | |||
952 | /* | ||
953 | * We are the first waiter - try to look up the owner based on | ||
954 | * @uval and attach to it. | ||
955 | */ | ||
956 | return attach_to_pi_owner(uval, key, ps); | ||
957 | } | ||
958 | |||
959 | static int lock_pi_update_atomic(u32 __user *uaddr, u32 uval, u32 newval) | ||
960 | { | ||
961 | u32 uninitialized_var(curval); | ||
962 | |||
963 | if (unlikely(cmpxchg_futex_value_locked(&curval, uaddr, uval, newval))) | ||
964 | return -EFAULT; | ||
965 | |||
966 | /* If the user space value changed, let the caller retry */ | ||
967 | return curval != uval ? -EAGAIN : 0; | ||
968 | } | ||
969 | |||
943 | /** | 970 | /** |
944 | * futex_lock_pi_atomic() - Atomic work required to acquire a pi aware futex | 971 | * futex_lock_pi_atomic() - Atomic work required to acquire a pi aware futex |
945 | * @uaddr: the pi futex user address | 972 | * @uaddr: the pi futex user address |
@@ -963,113 +990,69 @@ static int futex_lock_pi_atomic(u32 __user *uaddr, struct futex_hash_bucket *hb, | |||
963 | struct futex_pi_state **ps, | 990 | struct futex_pi_state **ps, |
964 | struct task_struct *task, int set_waiters) | 991 | struct task_struct *task, int set_waiters) |
965 | { | 992 | { |
966 | int lock_taken, ret, force_take = 0; | 993 | u32 uval, newval, vpid = task_pid_vnr(task); |
967 | u32 uval, newval, curval, vpid = task_pid_vnr(task); | 994 | struct futex_q *match; |
968 | 995 | int ret; | |
969 | retry: | ||
970 | ret = lock_taken = 0; | ||
971 | 996 | ||
972 | /* | 997 | /* |
973 | * To avoid races, we attempt to take the lock here again | 998 | * Read the user space value first so we can validate a few |
974 | * (by doing a 0 -> TID atomic cmpxchg), while holding all | 999 | * things before proceeding further. |
975 | * the locks. It will most likely not succeed. | ||
976 | */ | 1000 | */ |
977 | newval = vpid; | 1001 | if (get_futex_value_locked(&uval, uaddr)) |
978 | if (set_waiters) | ||
979 | newval |= FUTEX_WAITERS; | ||
980 | |||
981 | if (unlikely(cmpxchg_futex_value_locked(&curval, uaddr, 0, newval))) | ||
982 | return -EFAULT; | 1002 | return -EFAULT; |
983 | 1003 | ||
984 | /* | 1004 | /* |
985 | * Detect deadlocks. | 1005 | * Detect deadlocks. |
986 | */ | 1006 | */ |
987 | if ((unlikely((curval & FUTEX_TID_MASK) == vpid))) | 1007 | if ((unlikely((uval & FUTEX_TID_MASK) == vpid))) |
988 | return -EDEADLK; | 1008 | return -EDEADLK; |
989 | 1009 | ||
990 | /* | 1010 | /* |
991 | * Surprise - we got the lock, but we do not trust user space at all. | 1011 | * Lookup existing state first. If it exists, try to attach to |
992 | */ | 1012 | * its pi_state. |
993 | if (unlikely(!curval)) { | ||
994 | /* | ||
995 | * We verify whether there is kernel state for this | ||
996 | * futex. If not, we can safely assume, that the 0 -> | ||
997 | * TID transition is correct. If state exists, we do | ||
998 | * not bother to fixup the user space state as it was | ||
999 | * corrupted already. | ||
1000 | */ | ||
1001 | return futex_top_waiter(hb, key) ? -EINVAL : 1; | ||
1002 | } | ||
1003 | |||
1004 | uval = curval; | ||
1005 | |||
1006 | /* | ||
1007 | * Set the FUTEX_WAITERS flag, so the owner will know it has someone | ||
1008 | * to wake at the next unlock. | ||
1009 | */ | 1013 | */ |
1010 | newval = curval | FUTEX_WAITERS; | 1014 | match = futex_top_waiter(hb, key); |
1015 | if (match) | ||
1016 | return attach_to_pi_state(uval, match->pi_state, ps); | ||
1011 | 1017 | ||
1012 | /* | 1018 | /* |
1013 | * Should we force take the futex? See below. | 1019 | * No waiter and user TID is 0. We are here because the |
1020 | * waiters or the owner died bit is set or called from | ||
1021 | * requeue_cmp_pi or for whatever reason something took the | ||
1022 | * syscall. | ||
1014 | */ | 1023 | */ |
1015 | if (unlikely(force_take)) { | 1024 | if (!(uval & FUTEX_TID_MASK)) { |
1016 | /* | 1025 | /* |
1017 | * Keep the OWNER_DIED and the WAITERS bit and set the | 1026 | * We take over the futex. No other waiters and the user space |
1018 | * new TID value. | 1027 | * TID is 0. We preserve the owner died bit. |
1019 | */ | 1028 | */ |
1020 | newval = (curval & ~FUTEX_TID_MASK) | vpid; | 1029 | newval = uval & FUTEX_OWNER_DIED; |
1021 | force_take = 0; | 1030 | newval |= vpid; |
1022 | lock_taken = 1; | ||
1023 | } | ||
1024 | 1031 | ||
1025 | if (unlikely(cmpxchg_futex_value_locked(&curval, uaddr, uval, newval))) | 1032 | /* The futex requeue_pi code can enforce the waiters bit */ |
1026 | return -EFAULT; | 1033 | if (set_waiters) |
1027 | if (unlikely(curval != uval)) | 1034 | newval |= FUTEX_WAITERS; |
1028 | goto retry; | 1035 | |
1036 | ret = lock_pi_update_atomic(uaddr, uval, newval); | ||
1037 | /* If the take over worked, return 1 */ | ||
1038 | return ret < 0 ? ret : 1; | ||
1039 | } | ||
1029 | 1040 | ||
1030 | /* | 1041 | /* |
1031 | * We took the lock due to forced take over. | 1042 | * First waiter. Set the waiters bit before attaching ourself to |
1043 | * the owner. If owner tries to unlock, it will be forced into | ||
1044 | * the kernel and blocked on hb->lock. | ||
1032 | */ | 1045 | */ |
1033 | if (unlikely(lock_taken)) | 1046 | newval = uval | FUTEX_WAITERS; |
1034 | return 1; | 1047 | ret = lock_pi_update_atomic(uaddr, uval, newval); |
1035 | 1048 | if (ret) | |
1049 | return ret; | ||
1036 | /* | 1050 | /* |
1037 | * We dont have the lock. Look up the PI state (or create it if | 1051 | * If the update of the user space value succeeded, we try to |
1038 | * we are the first waiter): | 1052 | * attach to the owner. If that fails, no harm done, we only |
1053 | * set the FUTEX_WAITERS bit in the user space variable. | ||
1039 | */ | 1054 | */ |
1040 | ret = lookup_pi_state(uval, hb, key, ps); | 1055 | return attach_to_pi_owner(uval, key, ps); |
1041 | |||
1042 | if (unlikely(ret)) { | ||
1043 | switch (ret) { | ||
1044 | case -ESRCH: | ||
1045 | /* | ||
1046 | * We failed to find an owner for this | ||
1047 | * futex. So we have no pi_state to block | ||
1048 | * on. This can happen in two cases: | ||
1049 | * | ||
1050 | * 1) The owner died | ||
1051 | * 2) A stale FUTEX_WAITERS bit | ||
1052 | * | ||
1053 | * Re-read the futex value. | ||
1054 | */ | ||
1055 | if (get_futex_value_locked(&curval, uaddr)) | ||
1056 | return -EFAULT; | ||
1057 | |||
1058 | /* | ||
1059 | * If the owner died or we have a stale | ||
1060 | * WAITERS bit the owner TID in the user space | ||
1061 | * futex is 0. | ||
1062 | */ | ||
1063 | if (!(curval & FUTEX_TID_MASK)) { | ||
1064 | force_take = 1; | ||
1065 | goto retry; | ||
1066 | } | ||
1067 | default: | ||
1068 | break; | ||
1069 | } | ||
1070 | } | ||
1071 | |||
1072 | return ret; | ||
1073 | } | 1056 | } |
1074 | 1057 | ||
1075 | /** | 1058 | /** |
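For context, the PI futex word protocol that futex_lock_pi_atomic() backs is driven from user space: an uncontended acquire is a compare-and-swap of 0 -> TID, and only the contended paths above are reached via FUTEX_LOCK_PI/FUTEX_UNLOCK_PI. A minimal sketch, assuming the documented futex(2) semantics; the pi_lock()/pi_unlock() wrappers are invented for illustration and error handling is omitted:

#include <linux/futex.h>
#include <stdatomic.h>
#include <sys/syscall.h>
#include <unistd.h>

static _Atomic unsigned int futex_word;		/* 0 == unlocked, else owner TID */

static void pi_lock(void)
{
	unsigned int zero = 0;
	unsigned int tid = syscall(SYS_gettid);

	/* Uncontended fast path: 0 -> TID, no syscall at all. */
	if (atomic_compare_exchange_strong(&futex_word, &zero, tid))
		return;
	/* Contended: the kernel sets FUTEX_WAITERS and attaches us to the
	 * owner's pi_state, i.e. the paths shown in the hunk above. */
	syscall(SYS_futex, &futex_word, FUTEX_LOCK_PI, 0, NULL, NULL, 0);
}

static void pi_unlock(void)
{
	unsigned int tid = syscall(SYS_gettid);

	/* Uncontended fast path: TID -> 0. */
	if (atomic_compare_exchange_strong(&futex_word, &tid, 0))
		return;
	/* FUTEX_WAITERS (or FUTEX_OWNER_DIED) is set: let the kernel
	 * hand the lock to the top waiter. */
	syscall(SYS_futex, &futex_word, FUTEX_UNLOCK_PI, 0, NULL, NULL, 0);
}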
@@ -1186,22 +1169,6 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this) | |||
1186 | return 0; | 1169 | return 0; |
1187 | } | 1170 | } |
1188 | 1171 | ||
1189 | static int unlock_futex_pi(u32 __user *uaddr, u32 uval) | ||
1190 | { | ||
1191 | u32 uninitialized_var(oldval); | ||
1192 | |||
1193 | /* | ||
1194 | * There is no waiter, so we unlock the futex. The owner died | ||
1195 | * bit has not to be preserved here. We are the owner: | ||
1196 | */ | ||
1197 | if (cmpxchg_futex_value_locked(&oldval, uaddr, uval, 0)) | ||
1198 | return -EFAULT; | ||
1199 | if (oldval != uval) | ||
1200 | return -EAGAIN; | ||
1201 | |||
1202 | return 0; | ||
1203 | } | ||
1204 | |||
1205 | /* | 1172 | /* |
1206 | * Express the locking dependencies for lockdep: | 1173 | * Express the locking dependencies for lockdep: |
1207 | */ | 1174 | */ |
@@ -1659,7 +1626,12 @@ retry_private: | |||
1659 | goto retry; | 1626 | goto retry; |
1660 | goto out; | 1627 | goto out; |
1661 | case -EAGAIN: | 1628 | case -EAGAIN: |
1662 | /* The owner was exiting, try again. */ | 1629 | /* |
1630 | * Two reasons for this: | ||
1631 | * - Owner is exiting and we just wait for the | ||
1632 | * exit to complete. | ||
1633 | * - The user space value changed. | ||
1634 | */ | ||
1663 | double_unlock_hb(hb1, hb2); | 1635 | double_unlock_hb(hb1, hb2); |
1664 | hb_waiters_dec(hb2); | 1636 | hb_waiters_dec(hb2); |
1665 | put_futex_key(&key2); | 1637 | put_futex_key(&key2); |
@@ -1718,7 +1690,7 @@ retry_private: | |||
1718 | this->pi_state = pi_state; | 1690 | this->pi_state = pi_state; |
1719 | ret = rt_mutex_start_proxy_lock(&pi_state->pi_mutex, | 1691 | ret = rt_mutex_start_proxy_lock(&pi_state->pi_mutex, |
1720 | this->rt_waiter, | 1692 | this->rt_waiter, |
1721 | this->task, 1); | 1693 | this->task); |
1722 | if (ret == 1) { | 1694 | if (ret == 1) { |
1723 | /* We got the lock. */ | 1695 | /* We got the lock. */ |
1724 | requeue_pi_wake_futex(this, &key2, hb2); | 1696 | requeue_pi_wake_futex(this, &key2, hb2); |
@@ -2316,8 +2288,10 @@ retry_private: | |||
2316 | goto uaddr_faulted; | 2288 | goto uaddr_faulted; |
2317 | case -EAGAIN: | 2289 | case -EAGAIN: |
2318 | /* | 2290 | /* |
2319 | * Task is exiting and we just wait for the | 2291 | * Two reasons for this: |
2320 | * exit to complete. | 2292 | * - Task is exiting and we just wait for the |
2293 | * exit to complete. | ||
2294 | * - The user space value changed. | ||
2321 | */ | 2295 | */ |
2322 | queue_unlock(hb); | 2296 | queue_unlock(hb); |
2323 | put_futex_key(&q.key); | 2297 | put_futex_key(&q.key); |
@@ -2337,9 +2311,9 @@ retry_private: | |||
2337 | /* | 2311 | /* |
2338 | * Block on the PI mutex: | 2312 | * Block on the PI mutex: |
2339 | */ | 2313 | */ |
2340 | if (!trylock) | 2314 | if (!trylock) { |
2341 | ret = rt_mutex_timed_lock(&q.pi_state->pi_mutex, to, 1); | 2315 | ret = rt_mutex_timed_futex_lock(&q.pi_state->pi_mutex, to); |
2342 | else { | 2316 | } else { |
2343 | ret = rt_mutex_trylock(&q.pi_state->pi_mutex); | 2317 | ret = rt_mutex_trylock(&q.pi_state->pi_mutex); |
2344 | /* Fixup the trylock return value: */ | 2318 | /* Fixup the trylock return value: */ |
2345 | ret = ret ? 0 : -EWOULDBLOCK; | 2319 | ret = ret ? 0 : -EWOULDBLOCK; |
@@ -2401,10 +2375,10 @@ uaddr_faulted: | |||
2401 | */ | 2375 | */ |
2402 | static int futex_unlock_pi(u32 __user *uaddr, unsigned int flags) | 2376 | static int futex_unlock_pi(u32 __user *uaddr, unsigned int flags) |
2403 | { | 2377 | { |
2404 | struct futex_hash_bucket *hb; | 2378 | u32 uninitialized_var(curval), uval, vpid = task_pid_vnr(current); |
2405 | struct futex_q *this, *next; | ||
2406 | union futex_key key = FUTEX_KEY_INIT; | 2379 | union futex_key key = FUTEX_KEY_INIT; |
2407 | u32 uval, vpid = task_pid_vnr(current); | 2380 | struct futex_hash_bucket *hb; |
2381 | struct futex_q *match; | ||
2408 | int ret; | 2382 | int ret; |
2409 | 2383 | ||
2410 | retry: | 2384 | retry: |
@@ -2417,57 +2391,47 @@ retry: | |||
2417 | return -EPERM; | 2391 | return -EPERM; |
2418 | 2392 | ||
2419 | ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &key, VERIFY_WRITE); | 2393 | ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &key, VERIFY_WRITE); |
2420 | if (unlikely(ret != 0)) | 2394 | if (ret) |
2421 | goto out; | 2395 | return ret; |
2422 | 2396 | ||
2423 | hb = hash_futex(&key); | 2397 | hb = hash_futex(&key); |
2424 | spin_lock(&hb->lock); | 2398 | spin_lock(&hb->lock); |
2425 | 2399 | ||
2426 | /* | 2400 | /* |
2427 | * To avoid races, try to do the TID -> 0 atomic transition | 2401 | * Check waiters first. We do not trust user space values at |
2428 | * again. If it succeeds then we can return without waking | 2402 | * all and we at least want to know if user space fiddled |
2429 | * anyone else up. We only try this if neither the waiters nor | 2403 | * with the futex value instead of blindly unlocking. |
2430 | * the owner died bit are set. | ||
2431 | */ | ||
2432 | if (!(uval & ~FUTEX_TID_MASK) && | ||
2433 | cmpxchg_futex_value_locked(&uval, uaddr, vpid, 0)) | ||
2434 | goto pi_faulted; | ||
2435 | /* | ||
2436 | * Rare case: we managed to release the lock atomically, | ||
2437 | * no need to wake anyone else up: | ||
2438 | */ | ||
2439 | if (unlikely(uval == vpid)) | ||
2440 | goto out_unlock; | ||
2441 | |||
2442 | /* | ||
2443 | * Ok, other tasks may need to be woken up - check waiters | ||
2444 | * and do the wakeup if necessary: | ||
2445 | */ | 2404 | */ |
2446 | plist_for_each_entry_safe(this, next, &hb->chain, list) { | 2405 | match = futex_top_waiter(hb, &key); |
2447 | if (!match_futex (&this->key, &key)) | 2406 | if (match) { |
2448 | continue; | 2407 | ret = wake_futex_pi(uaddr, uval, match); |
2449 | ret = wake_futex_pi(uaddr, uval, this); | ||
2450 | /* | 2408 | /* |
2451 | * The atomic access to the futex value | 2409 | * The atomic access to the futex value generated a |
2452 | * generated a pagefault, so retry the | 2410 | * pagefault, so retry the user-access and the wakeup: |
2453 | * user-access and the wakeup: | ||
2454 | */ | 2411 | */ |
2455 | if (ret == -EFAULT) | 2412 | if (ret == -EFAULT) |
2456 | goto pi_faulted; | 2413 | goto pi_faulted; |
2457 | goto out_unlock; | 2414 | goto out_unlock; |
2458 | } | 2415 | } |
2416 | |||
2459 | /* | 2417 | /* |
2460 | * No waiters - kernel unlocks the futex: | 2418 | * We have no kernel internal state, i.e. no waiters in the |
2419 | * kernel. Waiters which are about to queue themselves are stuck | ||
2420 | * on hb->lock. So we can safely ignore them. We do neither | ||
2421 | * preserve the WAITERS bit not the OWNER_DIED one. We are the | ||
2422 | * owner. | ||
2461 | */ | 2423 | */ |
2462 | ret = unlock_futex_pi(uaddr, uval); | 2424 | if (cmpxchg_futex_value_locked(&curval, uaddr, uval, 0)) |
2463 | if (ret == -EFAULT) | ||
2464 | goto pi_faulted; | 2425 | goto pi_faulted; |
2465 | 2426 | ||
2427 | /* | ||
2428 | * If uval has changed, let user space handle it. | ||
2429 | */ | ||
2430 | ret = (curval == uval) ? 0 : -EAGAIN; | ||
2431 | |||
2466 | out_unlock: | 2432 | out_unlock: |
2467 | spin_unlock(&hb->lock); | 2433 | spin_unlock(&hb->lock); |
2468 | put_futex_key(&key); | 2434 | put_futex_key(&key); |
2469 | |||
2470 | out: | ||
2471 | return ret; | 2435 | return ret; |
2472 | 2436 | ||
2473 | pi_faulted: | 2437 | pi_faulted: |
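Read on its own, the reworked unlock logic above boils down to the following shape; this is a consolidated sketch of the added lines (hash-bucket locking and the fault-retry plumbing left out), not additional code from the patch:

	match = futex_top_waiter(hb, &key);
	if (match) {
		/* Kernel waiters exist: hand the lock over via the rt_mutex. */
		ret = wake_futex_pi(uaddr, uval, match);
		if (ret == -EFAULT)
			goto pi_faulted;	/* retry the user access */
		goto out_unlock;
	}

	/* No kernel-internal state: we are the owner, do TID -> 0 ourselves. */
	if (cmpxchg_futex_value_locked(&curval, uaddr, uval, 0))
		goto pi_faulted;

	/* The word changed under us: report it instead of guessing. */
	ret = (curval == uval) ? 0 : -EAGAIN;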
@@ -2669,7 +2633,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, | |||
2669 | */ | 2633 | */ |
2670 | WARN_ON(!q.pi_state); | 2634 | WARN_ON(!q.pi_state); |
2671 | pi_mutex = &q.pi_state->pi_mutex; | 2635 | pi_mutex = &q.pi_state->pi_mutex; |
2672 | ret = rt_mutex_finish_proxy_lock(pi_mutex, to, &rt_waiter, 1); | 2636 | ret = rt_mutex_finish_proxy_lock(pi_mutex, to, &rt_waiter); |
2673 | debug_rt_mutex_free_waiter(&rt_waiter); | 2637 | debug_rt_mutex_free_waiter(&rt_waiter); |
2674 | 2638 | ||
2675 | spin_lock(q.lock_ptr); | 2639 | spin_lock(q.lock_ptr); |
diff --git a/kernel/irq/generic-chip.c b/kernel/irq/generic-chip.c index 452d6f2ba21d..cf80e7b0ddab 100644 --- a/kernel/irq/generic-chip.c +++ b/kernel/irq/generic-chip.c | |||
@@ -341,8 +341,8 @@ static struct lock_class_key irq_nested_lock_class; | |||
341 | /* | 341 | /* |
342 | * irq_map_generic_chip - Map a generic chip for an irq domain | 342 | * irq_map_generic_chip - Map a generic chip for an irq domain |
343 | */ | 343 | */ |
344 | static int irq_map_generic_chip(struct irq_domain *d, unsigned int virq, | 344 | int irq_map_generic_chip(struct irq_domain *d, unsigned int virq, |
345 | irq_hw_number_t hw_irq) | 345 | irq_hw_number_t hw_irq) |
346 | { | 346 | { |
347 | struct irq_data *data = irq_get_irq_data(virq); | 347 | struct irq_data *data = irq_get_irq_data(virq); |
348 | struct irq_domain_chip_generic *dgc = d->gc; | 348 | struct irq_domain_chip_generic *dgc = d->gc; |
@@ -394,6 +394,7 @@ static int irq_map_generic_chip(struct irq_domain *d, unsigned int virq, | |||
394 | irq_modify_status(virq, dgc->irq_flags_to_clear, dgc->irq_flags_to_set); | 394 | irq_modify_status(virq, dgc->irq_flags_to_clear, dgc->irq_flags_to_set); |
395 | return 0; | 395 | return 0; |
396 | } | 396 | } |
397 | EXPORT_SYMBOL_GPL(irq_map_generic_chip); | ||
397 | 398 | ||
398 | struct irq_domain_ops irq_generic_chip_ops = { | 399 | struct irq_domain_ops irq_generic_chip_ops = { |
399 | .map = irq_map_generic_chip, | 400 | .map = irq_map_generic_chip, |
diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index eb5e10e32e05..6534ff6ce02e 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c | |||
@@ -231,7 +231,7 @@ void irq_set_default_host(struct irq_domain *domain) | |||
231 | } | 231 | } |
232 | EXPORT_SYMBOL_GPL(irq_set_default_host); | 232 | EXPORT_SYMBOL_GPL(irq_set_default_host); |
233 | 233 | ||
234 | static void irq_domain_disassociate(struct irq_domain *domain, unsigned int irq) | 234 | void irq_domain_disassociate(struct irq_domain *domain, unsigned int irq) |
235 | { | 235 | { |
236 | struct irq_data *irq_data = irq_get_irq_data(irq); | 236 | struct irq_data *irq_data = irq_get_irq_data(irq); |
237 | irq_hw_number_t hwirq; | 237 | irq_hw_number_t hwirq; |
diff --git a/kernel/irq_work.c b/kernel/irq_work.c index a82170e2fa78..e6bcbe756663 100644 --- a/kernel/irq_work.c +++ b/kernel/irq_work.c | |||
@@ -16,11 +16,12 @@ | |||
16 | #include <linux/tick.h> | 16 | #include <linux/tick.h> |
17 | #include <linux/cpu.h> | 17 | #include <linux/cpu.h> |
18 | #include <linux/notifier.h> | 18 | #include <linux/notifier.h> |
19 | #include <linux/smp.h> | ||
19 | #include <asm/processor.h> | 20 | #include <asm/processor.h> |
20 | 21 | ||
21 | 22 | ||
22 | static DEFINE_PER_CPU(struct llist_head, irq_work_list); | 23 | static DEFINE_PER_CPU(struct llist_head, raised_list); |
23 | static DEFINE_PER_CPU(int, irq_work_raised); | 24 | static DEFINE_PER_CPU(struct llist_head, lazy_list); |
24 | 25 | ||
25 | /* | 26 | /* |
26 | * Claim the entry so that no one else will poke at it. | 27 | * Claim the entry so that no one else will poke at it. |
@@ -55,12 +56,34 @@ void __weak arch_irq_work_raise(void) | |||
55 | */ | 56 | */ |
56 | } | 57 | } |
57 | 58 | ||
59 | #ifdef CONFIG_SMP | ||
58 | /* | 60 | /* |
59 | * Enqueue the irq_work @entry unless it's already pending | 61 | * Enqueue the irq_work @work on @cpu unless it's already pending |
60 | * somewhere. | 62 | * somewhere. |
61 | * | 63 | * |
62 | * Can be re-enqueued while the callback is still in progress. | 64 | * Can be re-enqueued while the callback is still in progress. |
63 | */ | 65 | */ |
66 | bool irq_work_queue_on(struct irq_work *work, int cpu) | ||
67 | { | ||
68 | /* All work should have been flushed before going offline */ | ||
69 | WARN_ON_ONCE(cpu_is_offline(cpu)); | ||
70 | |||
71 | /* Arch remote IPI send/receive backends aren't NMI safe */ | ||
72 | WARN_ON_ONCE(in_nmi()); | ||
73 | |||
74 | /* Only queue if not already pending */ | ||
75 | if (!irq_work_claim(work)) | ||
76 | return false; | ||
77 | |||
78 | if (llist_add(&work->llnode, &per_cpu(raised_list, cpu))) | ||
79 | arch_send_call_function_single_ipi(cpu); | ||
80 | |||
81 | return true; | ||
82 | } | ||
83 | EXPORT_SYMBOL_GPL(irq_work_queue_on); | ||
84 | #endif | ||
85 | |||
86 | /* Enqueue the irq work @work on the current CPU */ | ||
64 | bool irq_work_queue(struct irq_work *work) | 87 | bool irq_work_queue(struct irq_work *work) |
65 | { | 88 | { |
66 | /* Only queue if not already pending */ | 89 | /* Only queue if not already pending */ |
@@ -70,15 +93,13 @@ bool irq_work_queue(struct irq_work *work) | |||
70 | /* Queue the entry and raise the IPI if needed. */ | 93 | /* Queue the entry and raise the IPI if needed. */ |
71 | preempt_disable(); | 94 | preempt_disable(); |
72 | 95 | ||
73 | llist_add(&work->llnode, &__get_cpu_var(irq_work_list)); | 96 | /* If the work is "lazy", handle it from next tick if any */ |
74 | 97 | if (work->flags & IRQ_WORK_LAZY) { | |
75 | /* | 98 | if (llist_add(&work->llnode, &__get_cpu_var(lazy_list)) && |
76 | * If the work is not "lazy" or the tick is stopped, raise the irq | 99 | tick_nohz_tick_stopped()) |
77 | * work interrupt (if supported by the arch), otherwise, just wait | 100 | arch_irq_work_raise(); |
78 | * for the next tick. | 101 | } else { |
79 | */ | 102 | if (llist_add(&work->llnode, &__get_cpu_var(raised_list))) |
80 | if (!(work->flags & IRQ_WORK_LAZY) || tick_nohz_tick_stopped()) { | ||
81 | if (!this_cpu_cmpxchg(irq_work_raised, 0, 1)) | ||
82 | arch_irq_work_raise(); | 103 | arch_irq_work_raise(); |
83 | } | 104 | } |
84 | 105 | ||
@@ -90,10 +111,11 @@ EXPORT_SYMBOL_GPL(irq_work_queue); | |||
90 | 111 | ||
91 | bool irq_work_needs_cpu(void) | 112 | bool irq_work_needs_cpu(void) |
92 | { | 113 | { |
93 | struct llist_head *this_list; | 114 | struct llist_head *raised, *lazy; |
94 | 115 | ||
95 | this_list = &__get_cpu_var(irq_work_list); | 116 | raised = &__get_cpu_var(raised_list); |
96 | if (llist_empty(this_list)) | 117 | lazy = &__get_cpu_var(lazy_list); |
118 | if (llist_empty(raised) && llist_empty(lazy)) | ||
97 | return false; | 119 | return false; |
98 | 120 | ||
99 | /* All work should have been flushed before going offline */ | 121 | /* All work should have been flushed before going offline */ |
@@ -102,28 +124,18 @@ bool irq_work_needs_cpu(void) | |||
102 | return true; | 124 | return true; |
103 | } | 125 | } |
104 | 126 | ||
105 | static void __irq_work_run(void) | 127 | static void irq_work_run_list(struct llist_head *list) |
106 | { | 128 | { |
107 | unsigned long flags; | 129 | unsigned long flags; |
108 | struct irq_work *work; | 130 | struct irq_work *work; |
109 | struct llist_head *this_list; | ||
110 | struct llist_node *llnode; | 131 | struct llist_node *llnode; |
111 | 132 | ||
133 | BUG_ON(!irqs_disabled()); | ||
112 | 134 | ||
113 | /* | 135 | if (llist_empty(list)) |
114 | * Reset the "raised" state right before we check the list because | ||
115 | * an NMI may enqueue after we find the list empty from the runner. | ||
116 | */ | ||
117 | __this_cpu_write(irq_work_raised, 0); | ||
118 | barrier(); | ||
119 | |||
120 | this_list = &__get_cpu_var(irq_work_list); | ||
121 | if (llist_empty(this_list)) | ||
122 | return; | 136 | return; |
123 | 137 | ||
124 | BUG_ON(!irqs_disabled()); | 138 | llnode = llist_del_all(list); |
125 | |||
126 | llnode = llist_del_all(this_list); | ||
127 | while (llnode != NULL) { | 139 | while (llnode != NULL) { |
128 | work = llist_entry(llnode, struct irq_work, llnode); | 140 | work = llist_entry(llnode, struct irq_work, llnode); |
129 | 141 | ||
@@ -149,13 +161,13 @@ static void __irq_work_run(void) | |||
149 | } | 161 | } |
150 | 162 | ||
151 | /* | 163 | /* |
152 | * Run the irq_work entries on this cpu. Requires to be ran from hardirq | 164 | * hotplug calls this through: |
153 | * context with local IRQs disabled. | 165 | * hotplug_cfd() -> flush_smp_call_function_queue() |
154 | */ | 166 | */ |
155 | void irq_work_run(void) | 167 | void irq_work_run(void) |
156 | { | 168 | { |
157 | BUG_ON(!in_irq()); | 169 | irq_work_run_list(&__get_cpu_var(raised_list)); |
158 | __irq_work_run(); | 170 | irq_work_run_list(&__get_cpu_var(lazy_list)); |
159 | } | 171 | } |
160 | EXPORT_SYMBOL_GPL(irq_work_run); | 172 | EXPORT_SYMBOL_GPL(irq_work_run); |
161 | 173 | ||
@@ -171,35 +183,3 @@ void irq_work_sync(struct irq_work *work) | |||
171 | cpu_relax(); | 183 | cpu_relax(); |
172 | } | 184 | } |
173 | EXPORT_SYMBOL_GPL(irq_work_sync); | 185 | EXPORT_SYMBOL_GPL(irq_work_sync); |
174 | |||
175 | #ifdef CONFIG_HOTPLUG_CPU | ||
176 | static int irq_work_cpu_notify(struct notifier_block *self, | ||
177 | unsigned long action, void *hcpu) | ||
178 | { | ||
179 | long cpu = (long)hcpu; | ||
180 | |||
181 | switch (action) { | ||
182 | case CPU_DYING: | ||
183 | /* Called from stop_machine */ | ||
184 | if (WARN_ON_ONCE(cpu != smp_processor_id())) | ||
185 | break; | ||
186 | __irq_work_run(); | ||
187 | break; | ||
188 | default: | ||
189 | break; | ||
190 | } | ||
191 | return NOTIFY_OK; | ||
192 | } | ||
193 | |||
194 | static struct notifier_block cpu_notify; | ||
195 | |||
196 | static __init int irq_work_init_cpu_notifier(void) | ||
197 | { | ||
198 | cpu_notify.notifier_call = irq_work_cpu_notify; | ||
199 | cpu_notify.priority = 0; | ||
200 | register_cpu_notifier(&cpu_notify); | ||
201 | return 0; | ||
202 | } | ||
203 | device_initcall(irq_work_init_cpu_notifier); | ||
204 | |||
205 | #endif /* CONFIG_HOTPLUG_CPU */ | ||
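The net effect of the irq_work changes is that the single per-CPU irq_work_list plus the irq_work_raised flag are replaced by separate raised_list/lazy_list queues, and irq_work_queue_on() can target a remote CPU through the IPI machinery. A hedged usage sketch, assuming CONFIG_SMP; the names my_work, my_work_func and kick_cpu are illustrative, not from the patch:

#include <linux/irq_work.h>
#include <linux/printk.h>
#include <linux/smp.h>

static void my_work_func(struct irq_work *work)
{
	/* Runs from the irq_work/IPI path on the target CPU with
	 * interrupts disabled; must be short and non-blocking. */
	pr_info("irq_work ran on CPU %d\n", smp_processor_id());
}

static struct irq_work my_work = {
	.func	= my_work_func,
};

static void kick_cpu(int cpu)
{
	/* Returns false if the work item was already pending somewhere. */
	if (!irq_work_queue_on(&my_work, cpu))
		pr_debug("irq_work already queued\n");
}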
diff --git a/kernel/kexec.c b/kernel/kexec.c index 369f41a94124..4b8f0c925884 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c | |||
@@ -33,6 +33,7 @@ | |||
33 | #include <linux/swap.h> | 33 | #include <linux/swap.h> |
34 | #include <linux/syscore_ops.h> | 34 | #include <linux/syscore_ops.h> |
35 | #include <linux/compiler.h> | 35 | #include <linux/compiler.h> |
36 | #include <linux/hugetlb.h> | ||
36 | 37 | ||
37 | #include <asm/page.h> | 38 | #include <asm/page.h> |
38 | #include <asm/uaccess.h> | 39 | #include <asm/uaccess.h> |
@@ -1619,6 +1620,9 @@ static int __init crash_save_vmcoreinfo_init(void) | |||
1619 | #endif | 1620 | #endif |
1620 | VMCOREINFO_NUMBER(PG_head_mask); | 1621 | VMCOREINFO_NUMBER(PG_head_mask); |
1621 | VMCOREINFO_NUMBER(PAGE_BUDDY_MAPCOUNT_VALUE); | 1622 | VMCOREINFO_NUMBER(PAGE_BUDDY_MAPCOUNT_VALUE); |
1623 | #ifdef CONFIG_HUGETLBFS | ||
1624 | VMCOREINFO_SYMBOL(free_huge_page); | ||
1625 | #endif | ||
1622 | 1626 | ||
1623 | arch_crash_save_vmcoreinfo(); | 1627 | arch_crash_save_vmcoreinfo(); |
1624 | update_vmcoreinfo_note(); | 1628 | update_vmcoreinfo_note(); |
diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 3214289df5a7..734e9a7d280b 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c | |||
@@ -2037,19 +2037,23 @@ static int __init populate_kprobe_blacklist(unsigned long *start, | |||
2037 | { | 2037 | { |
2038 | unsigned long *iter; | 2038 | unsigned long *iter; |
2039 | struct kprobe_blacklist_entry *ent; | 2039 | struct kprobe_blacklist_entry *ent; |
2040 | unsigned long offset = 0, size = 0; | 2040 | unsigned long entry, offset = 0, size = 0; |
2041 | 2041 | ||
2042 | for (iter = start; iter < end; iter++) { | 2042 | for (iter = start; iter < end; iter++) { |
2043 | if (!kallsyms_lookup_size_offset(*iter, &size, &offset)) { | 2043 | entry = arch_deref_entry_point((void *)*iter); |
2044 | pr_err("Failed to find blacklist %p\n", (void *)*iter); | 2044 | |
2045 | if (!kernel_text_address(entry) || | ||
2046 | !kallsyms_lookup_size_offset(entry, &size, &offset)) { | ||
2047 | pr_err("Failed to find blacklist at %p\n", | ||
2048 | (void *)entry); | ||
2045 | continue; | 2049 | continue; |
2046 | } | 2050 | } |
2047 | 2051 | ||
2048 | ent = kmalloc(sizeof(*ent), GFP_KERNEL); | 2052 | ent = kmalloc(sizeof(*ent), GFP_KERNEL); |
2049 | if (!ent) | 2053 | if (!ent) |
2050 | return -ENOMEM; | 2054 | return -ENOMEM; |
2051 | ent->start_addr = *iter; | 2055 | ent->start_addr = entry; |
2052 | ent->end_addr = *iter + size; | 2056 | ent->end_addr = entry + size; |
2053 | INIT_LIST_HEAD(&ent->list); | 2057 | INIT_LIST_HEAD(&ent->list); |
2054 | list_add_tail(&ent->list, &kprobe_blacklist); | 2058 | list_add_tail(&ent->list, &kprobe_blacklist); |
2055 | } | 2059 | } |
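The kprobes change matters on ABIs where a function symbol resolves to a function descriptor rather than to its text address (ppc64 ABIv1, ia64): arch_deref_entry_point() maps the descriptor to the real entry point before the kallsyms lookup, so the blacklist ranges cover actual kernel text. For reference, the generic weak fallback the loop relies on is simply an identity mapping; this is pre-existing code shown as a sketch, not part of the hunk:

/*
 * Generic fallback: most architectures have no function descriptors,
 * so the entry point is the symbol address itself. Architectures such
 * as ppc64 (ABIv1) override this to dereference the descriptor.
 */
unsigned long __weak arch_deref_entry_point(void *entry)
{
	return (unsigned long)entry;
}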
diff --git a/kernel/kthread.c b/kernel/kthread.c index c2390f41307b..ef483220e855 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c | |||
@@ -591,7 +591,7 @@ static void insert_kthread_work(struct kthread_worker *worker, | |||
591 | 591 | ||
592 | list_add_tail(&work->node, pos); | 592 | list_add_tail(&work->node, pos); |
593 | work->worker = worker; | 593 | work->worker = worker; |
594 | if (likely(worker->task)) | 594 | if (!worker->current_work && likely(worker->task)) |
595 | wake_up_process(worker->task); | 595 | wake_up_process(worker->task); |
596 | } | 596 | } |
597 | 597 | ||
diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index d24e4339b46d..88d0d4420ad2 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c | |||
@@ -384,7 +384,9 @@ static void print_lockdep_off(const char *bug_msg) | |||
384 | { | 384 | { |
385 | printk(KERN_DEBUG "%s\n", bug_msg); | 385 | printk(KERN_DEBUG "%s\n", bug_msg); |
386 | printk(KERN_DEBUG "turning off the locking correctness validator.\n"); | 386 | printk(KERN_DEBUG "turning off the locking correctness validator.\n"); |
387 | #ifdef CONFIG_LOCK_STAT | ||
387 | printk(KERN_DEBUG "Please attach the output of /proc/lock_stat to the bug report\n"); | 388 | printk(KERN_DEBUG "Please attach the output of /proc/lock_stat to the bug report\n"); |
389 | #endif | ||
388 | } | 390 | } |
389 | 391 | ||
390 | static int save_trace(struct stack_trace *trace) | 392 | static int save_trace(struct stack_trace *trace) |
diff --git a/kernel/locking/mcs_spinlock.c b/kernel/locking/mcs_spinlock.c index 838dc9e00669..9887a905a762 100644 --- a/kernel/locking/mcs_spinlock.c +++ b/kernel/locking/mcs_spinlock.c | |||
@@ -1,6 +1,4 @@ | |||
1 | |||
2 | #include <linux/percpu.h> | 1 | #include <linux/percpu.h> |
3 | #include <linux/mutex.h> | ||
4 | #include <linux/sched.h> | 2 | #include <linux/sched.h> |
5 | #include "mcs_spinlock.h" | 3 | #include "mcs_spinlock.h" |
6 | 4 | ||
@@ -14,21 +12,47 @@ | |||
14 | * called from interrupt context and we have preemption disabled while | 12 | * called from interrupt context and we have preemption disabled while |
15 | * spinning. | 13 | * spinning. |
16 | */ | 14 | */ |
17 | static DEFINE_PER_CPU_SHARED_ALIGNED(struct optimistic_spin_queue, osq_node); | 15 | static DEFINE_PER_CPU_SHARED_ALIGNED(struct optimistic_spin_node, osq_node); |
16 | |||
17 | /* | ||
18 | * We use the value 0 to represent "no CPU", thus the encoded value | ||
19 | * will be the CPU number incremented by 1. | ||
20 | */ | ||
21 | static inline int encode_cpu(int cpu_nr) | ||
22 | { | ||
23 | return cpu_nr + 1; | ||
24 | } | ||
25 | |||
26 | static inline struct optimistic_spin_node *decode_cpu(int encoded_cpu_val) | ||
27 | { | ||
28 | int cpu_nr = encoded_cpu_val - 1; | ||
29 | |||
30 | return per_cpu_ptr(&osq_node, cpu_nr); | ||
31 | } | ||
18 | 32 | ||
19 | /* | 33 | /* |
20 | * Get a stable @node->next pointer, either for unlock() or unqueue() purposes. | 34 | * Get a stable @node->next pointer, either for unlock() or unqueue() purposes. |
21 | * Can return NULL in case we were the last queued and we updated @lock instead. | 35 | * Can return NULL in case we were the last queued and we updated @lock instead. |
22 | */ | 36 | */ |
23 | static inline struct optimistic_spin_queue * | 37 | static inline struct optimistic_spin_node * |
24 | osq_wait_next(struct optimistic_spin_queue **lock, | 38 | osq_wait_next(struct optimistic_spin_queue *lock, |
25 | struct optimistic_spin_queue *node, | 39 | struct optimistic_spin_node *node, |
26 | struct optimistic_spin_queue *prev) | 40 | struct optimistic_spin_node *prev) |
27 | { | 41 | { |
28 | struct optimistic_spin_queue *next = NULL; | 42 | struct optimistic_spin_node *next = NULL; |
43 | int curr = encode_cpu(smp_processor_id()); | ||
44 | int old; | ||
45 | |||
46 | /* | ||
47 | * If there is a prev node in queue, then the 'old' value will be | ||
48 | * the prev node's CPU #, else it's set to OSQ_UNLOCKED_VAL since if | ||
49 | * we're currently last in queue, then the queue will become empty. | ||
50 | */ | ||
51 | old = prev ? prev->cpu : OSQ_UNLOCKED_VAL; | ||
29 | 52 | ||
30 | for (;;) { | 53 | for (;;) { |
31 | if (*lock == node && cmpxchg(lock, node, prev) == node) { | 54 | if (atomic_read(&lock->tail) == curr && |
55 | atomic_cmpxchg(&lock->tail, curr, old) == curr) { | ||
32 | /* | 56 | /* |
33 | * We were the last queued, we moved @lock back. @prev | 57 | * We were the last queued, we moved @lock back. @prev |
34 | * will now observe @lock and will complete its | 58 | * will now observe @lock and will complete its |
@@ -53,24 +77,29 @@ osq_wait_next(struct optimistic_spin_queue **lock, | |||
53 | break; | 77 | break; |
54 | } | 78 | } |
55 | 79 | ||
56 | arch_mutex_cpu_relax(); | 80 | cpu_relax_lowlatency(); |
57 | } | 81 | } |
58 | 82 | ||
59 | return next; | 83 | return next; |
60 | } | 84 | } |
61 | 85 | ||
62 | bool osq_lock(struct optimistic_spin_queue **lock) | 86 | bool osq_lock(struct optimistic_spin_queue *lock) |
63 | { | 87 | { |
64 | struct optimistic_spin_queue *node = this_cpu_ptr(&osq_node); | 88 | struct optimistic_spin_node *node = this_cpu_ptr(&osq_node); |
65 | struct optimistic_spin_queue *prev, *next; | 89 | struct optimistic_spin_node *prev, *next; |
90 | int curr = encode_cpu(smp_processor_id()); | ||
91 | int old; | ||
66 | 92 | ||
67 | node->locked = 0; | 93 | node->locked = 0; |
68 | node->next = NULL; | 94 | node->next = NULL; |
95 | node->cpu = curr; | ||
69 | 96 | ||
70 | node->prev = prev = xchg(lock, node); | 97 | old = atomic_xchg(&lock->tail, curr); |
71 | if (likely(prev == NULL)) | 98 | if (old == OSQ_UNLOCKED_VAL) |
72 | return true; | 99 | return true; |
73 | 100 | ||
101 | prev = decode_cpu(old); | ||
102 | node->prev = prev; | ||
74 | ACCESS_ONCE(prev->next) = node; | 103 | ACCESS_ONCE(prev->next) = node; |
75 | 104 | ||
76 | /* | 105 | /* |
@@ -89,7 +118,7 @@ bool osq_lock(struct optimistic_spin_queue **lock) | |||
89 | if (need_resched()) | 118 | if (need_resched()) |
90 | goto unqueue; | 119 | goto unqueue; |
91 | 120 | ||
92 | arch_mutex_cpu_relax(); | 121 | cpu_relax_lowlatency(); |
93 | } | 122 | } |
94 | return true; | 123 | return true; |
95 | 124 | ||
@@ -115,7 +144,7 @@ unqueue: | |||
115 | if (smp_load_acquire(&node->locked)) | 144 | if (smp_load_acquire(&node->locked)) |
116 | return true; | 145 | return true; |
117 | 146 | ||
118 | arch_mutex_cpu_relax(); | 147 | cpu_relax_lowlatency(); |
119 | 148 | ||
120 | /* | 149 | /* |
121 | * Or we race against a concurrent unqueue()'s step-B, in which | 150 | * Or we race against a concurrent unqueue()'s step-B, in which |
@@ -149,20 +178,21 @@ unqueue: | |||
149 | return false; | 178 | return false; |
150 | } | 179 | } |
151 | 180 | ||
152 | void osq_unlock(struct optimistic_spin_queue **lock) | 181 | void osq_unlock(struct optimistic_spin_queue *lock) |
153 | { | 182 | { |
154 | struct optimistic_spin_queue *node = this_cpu_ptr(&osq_node); | 183 | struct optimistic_spin_node *node, *next; |
155 | struct optimistic_spin_queue *next; | 184 | int curr = encode_cpu(smp_processor_id()); |
156 | 185 | ||
157 | /* | 186 | /* |
158 | * Fast path for the uncontended case. | 187 | * Fast path for the uncontended case. |
159 | */ | 188 | */ |
160 | if (likely(cmpxchg(lock, node, NULL) == node)) | 189 | if (likely(atomic_cmpxchg(&lock->tail, curr, OSQ_UNLOCKED_VAL) == curr)) |
161 | return; | 190 | return; |
162 | 191 | ||
163 | /* | 192 | /* |
164 | * Second most likely case. | 193 | * Second most likely case. |
165 | */ | 194 | */ |
195 | node = this_cpu_ptr(&osq_node); | ||
166 | next = xchg(&node->next, NULL); | 196 | next = xchg(&node->next, NULL); |
167 | if (next) { | 197 | if (next) { |
168 | ACCESS_ONCE(next->locked) = 1; | 198 | ACCESS_ONCE(next->locked) = 1; |
diff --git a/kernel/locking/mcs_spinlock.h b/kernel/locking/mcs_spinlock.h index a2dbac4aca6b..23e89c5930e9 100644 --- a/kernel/locking/mcs_spinlock.h +++ b/kernel/locking/mcs_spinlock.h | |||
@@ -27,7 +27,7 @@ struct mcs_spinlock { | |||
27 | #define arch_mcs_spin_lock_contended(l) \ | 27 | #define arch_mcs_spin_lock_contended(l) \ |
28 | do { \ | 28 | do { \ |
29 | while (!(smp_load_acquire(l))) \ | 29 | while (!(smp_load_acquire(l))) \ |
30 | arch_mutex_cpu_relax(); \ | 30 | cpu_relax_lowlatency(); \ |
31 | } while (0) | 31 | } while (0) |
32 | #endif | 32 | #endif |
33 | 33 | ||
@@ -104,7 +104,7 @@ void mcs_spin_unlock(struct mcs_spinlock **lock, struct mcs_spinlock *node) | |||
104 | return; | 104 | return; |
105 | /* Wait until the next pointer is set */ | 105 | /* Wait until the next pointer is set */ |
106 | while (!(next = ACCESS_ONCE(node->next))) | 106 | while (!(next = ACCESS_ONCE(node->next))) |
107 | arch_mutex_cpu_relax(); | 107 | cpu_relax_lowlatency(); |
108 | } | 108 | } |
109 | 109 | ||
110 | /* Pass lock to next waiter. */ | 110 | /* Pass lock to next waiter. */ |
@@ -118,12 +118,13 @@ void mcs_spin_unlock(struct mcs_spinlock **lock, struct mcs_spinlock *node) | |||
118 | * mutex_lock()/rwsem_down_{read,write}() etc. | 118 | * mutex_lock()/rwsem_down_{read,write}() etc. |
119 | */ | 119 | */ |
120 | 120 | ||
121 | struct optimistic_spin_queue { | 121 | struct optimistic_spin_node { |
122 | struct optimistic_spin_queue *next, *prev; | 122 | struct optimistic_spin_node *next, *prev; |
123 | int locked; /* 1 if lock acquired */ | 123 | int locked; /* 1 if lock acquired */ |
124 | int cpu; /* encoded CPU # value */ | ||
124 | }; | 125 | }; |
125 | 126 | ||
126 | extern bool osq_lock(struct optimistic_spin_queue **lock); | 127 | extern bool osq_lock(struct optimistic_spin_queue *lock); |
127 | extern void osq_unlock(struct optimistic_spin_queue **lock); | 128 | extern void osq_unlock(struct optimistic_spin_queue *lock); |
128 | 129 | ||
129 | #endif /* __LINUX_MCS_SPINLOCK_H */ | 130 | #endif /* __LINUX_MCS_SPINLOCK_H */ |
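The MCS/OSQ rework stops storing a node pointer in the lock word: the lock now holds only an atomic tail containing an encoded CPU number (0 meaning "no CPU"), while the actual optimistic_spin_node lives in per-CPU data. The queue-head type this implies is defined in headers elsewhere in this series; a sketch consistent with the code above and with the osq_lock_init() call in mutex.c:

#define OSQ_UNLOCKED_VAL	(0)

/*
 * The lock word itself: just the encoded CPU number of the tail node,
 * or OSQ_UNLOCKED_VAL when the queue is empty. Fits in 32 bits, unlike
 * the old "pointer to the tail node" representation.
 */
struct optimistic_spin_queue {
	atomic_t tail;
};

static inline void osq_lock_init(struct optimistic_spin_queue *lock)
{
	atomic_set(&lock->tail, OSQ_UNLOCKED_VAL);
}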
diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c index bc73d33c6760..ae712b25e492 100644 --- a/kernel/locking/mutex.c +++ b/kernel/locking/mutex.c | |||
@@ -46,12 +46,6 @@ | |||
46 | # include <asm/mutex.h> | 46 | # include <asm/mutex.h> |
47 | #endif | 47 | #endif |
48 | 48 | ||
49 | /* | ||
50 | * A negative mutex count indicates that waiters are sleeping waiting for the | ||
51 | * mutex. | ||
52 | */ | ||
53 | #define MUTEX_SHOW_NO_WAITER(mutex) (atomic_read(&(mutex)->count) >= 0) | ||
54 | |||
55 | void | 49 | void |
56 | __mutex_init(struct mutex *lock, const char *name, struct lock_class_key *key) | 50 | __mutex_init(struct mutex *lock, const char *name, struct lock_class_key *key) |
57 | { | 51 | { |
@@ -60,7 +54,7 @@ __mutex_init(struct mutex *lock, const char *name, struct lock_class_key *key) | |||
60 | INIT_LIST_HEAD(&lock->wait_list); | 54 | INIT_LIST_HEAD(&lock->wait_list); |
61 | mutex_clear_owner(lock); | 55 | mutex_clear_owner(lock); |
62 | #ifdef CONFIG_MUTEX_SPIN_ON_OWNER | 56 | #ifdef CONFIG_MUTEX_SPIN_ON_OWNER |
63 | lock->osq = NULL; | 57 | osq_lock_init(&lock->osq); |
64 | #endif | 58 | #endif |
65 | 59 | ||
66 | debug_mutex_init(lock, name, key); | 60 | debug_mutex_init(lock, name, key); |
@@ -152,7 +146,7 @@ int mutex_spin_on_owner(struct mutex *lock, struct task_struct *owner) | |||
152 | if (need_resched()) | 146 | if (need_resched()) |
153 | break; | 147 | break; |
154 | 148 | ||
155 | arch_mutex_cpu_relax(); | 149 | cpu_relax_lowlatency(); |
156 | } | 150 | } |
157 | rcu_read_unlock(); | 151 | rcu_read_unlock(); |
158 | 152 | ||
@@ -388,12 +382,10 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, | |||
388 | /* | 382 | /* |
389 | * Optimistic spinning. | 383 | * Optimistic spinning. |
390 | * | 384 | * |
391 | * We try to spin for acquisition when we find that there are no | 385 | * We try to spin for acquisition when we find that the lock owner |
392 | * pending waiters and the lock owner is currently running on a | 386 | * is currently running on a (different) CPU and while we don't |
393 | * (different) CPU. | 387 | * need to reschedule. The rationale is that if the lock owner is |
394 | * | 388 | * running, it is likely to release the lock soon. |
395 | * The rationale is that if the lock owner is running, it is likely to | ||
396 | * release the lock soon. | ||
397 | * | 389 | * |
398 | * Since this needs the lock owner, and this mutex implementation | 390 | * Since this needs the lock owner, and this mutex implementation |
399 | * doesn't track the owner atomically in the lock field, we need to | 391 | * doesn't track the owner atomically in the lock field, we need to |
@@ -440,7 +432,8 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, | |||
440 | if (owner && !mutex_spin_on_owner(lock, owner)) | 432 | if (owner && !mutex_spin_on_owner(lock, owner)) |
441 | break; | 433 | break; |
442 | 434 | ||
443 | if ((atomic_read(&lock->count) == 1) && | 435 | /* Try to acquire the mutex if it is unlocked. */ |
436 | if (!mutex_is_locked(lock) && | ||
444 | (atomic_cmpxchg(&lock->count, 1, 0) == 1)) { | 437 | (atomic_cmpxchg(&lock->count, 1, 0) == 1)) { |
445 | lock_acquired(&lock->dep_map, ip); | 438 | lock_acquired(&lock->dep_map, ip); |
446 | if (use_ww_ctx) { | 439 | if (use_ww_ctx) { |
@@ -471,7 +464,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, | |||
471 | * memory barriers as we'll eventually observe the right | 464 | * memory barriers as we'll eventually observe the right |
472 | * values at the cost of a few extra spins. | 465 | * values at the cost of a few extra spins. |
473 | */ | 466 | */ |
474 | arch_mutex_cpu_relax(); | 467 | cpu_relax_lowlatency(); |
475 | } | 468 | } |
476 | osq_unlock(&lock->osq); | 469 | osq_unlock(&lock->osq); |
477 | slowpath: | 470 | slowpath: |
@@ -485,8 +478,11 @@ slowpath: | |||
485 | #endif | 478 | #endif |
486 | spin_lock_mutex(&lock->wait_lock, flags); | 479 | spin_lock_mutex(&lock->wait_lock, flags); |
487 | 480 | ||
488 | /* once more, can we acquire the lock? */ | 481 | /* |
489 | if (MUTEX_SHOW_NO_WAITER(lock) && (atomic_xchg(&lock->count, 0) == 1)) | 482 | * Once more, try to acquire the lock. Only try-lock the mutex if |
483 | * it is unlocked to reduce unnecessary xchg() operations. | ||
484 | */ | ||
485 | if (!mutex_is_locked(lock) && (atomic_xchg(&lock->count, 0) == 1)) | ||
490 | goto skip_wait; | 486 | goto skip_wait; |
491 | 487 | ||
492 | debug_mutex_lock_common(lock, &waiter); | 488 | debug_mutex_lock_common(lock, &waiter); |
@@ -506,9 +502,10 @@ slowpath: | |||
506 | * it's unlocked. Later on, if we sleep, this is the | 502 | * it's unlocked. Later on, if we sleep, this is the |
507 | * operation that gives us the lock. We xchg it to -1, so | 503 | * operation that gives us the lock. We xchg it to -1, so |
508 | * that when we release the lock, we properly wake up the | 504 | * that when we release the lock, we properly wake up the |
509 | * other waiters: | 505 | * other waiters. We only attempt the xchg if the count is |
506 | * non-negative in order to avoid unnecessary xchg operations: | ||
510 | */ | 507 | */ |
511 | if (MUTEX_SHOW_NO_WAITER(lock) && | 508 | if (atomic_read(&lock->count) >= 0 && |
512 | (atomic_xchg(&lock->count, -1) == 1)) | 509 | (atomic_xchg(&lock->count, -1) == 1)) |
513 | break; | 510 | break; |
514 | 511 | ||
@@ -823,6 +820,10 @@ static inline int __mutex_trylock_slowpath(atomic_t *lock_count) | |||
823 | unsigned long flags; | 820 | unsigned long flags; |
824 | int prev; | 821 | int prev; |
825 | 822 | ||
823 | /* No need to trylock if the mutex is locked. */ | ||
824 | if (mutex_is_locked(lock)) | ||
825 | return 0; | ||
826 | |||
826 | spin_lock_mutex(&lock->wait_lock, flags); | 827 | spin_lock_mutex(&lock->wait_lock, flags); |
827 | 828 | ||
828 | prev = atomic_xchg(&lock->count, -1); | 829 | prev = atomic_xchg(&lock->count, -1); |
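Several of the mutex hunks apply the same micro-optimization: read the lock word first and only issue the atomic cmpxchg/xchg when it can actually succeed, so spinning CPUs stop bouncing the lock cacheline with failing read-modify-write operations. The idea in isolation, as a generic test-and-test-and-set sketch in plain C11 (not kernel code):

#include <stdatomic.h>
#include <stdbool.h>

/* count convention mirrors the mutex: 1 = unlocked, 0 = locked. */
static bool try_acquire(atomic_int *count)
{
	/* Cheap shared read; bail out without a write if it cannot succeed. */
	if (atomic_load_explicit(count, memory_order_relaxed) != 1)
		return false;

	int expected = 1;
	/* Only now pay for the exclusive cacheline access. */
	return atomic_compare_exchange_strong(count, &expected, 0);
}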
diff --git a/kernel/locking/qrwlock.c b/kernel/locking/qrwlock.c index fb5b8ac411a5..f956ede7f90d 100644 --- a/kernel/locking/qrwlock.c +++ b/kernel/locking/qrwlock.c | |||
@@ -20,7 +20,6 @@ | |||
20 | #include <linux/cpumask.h> | 20 | #include <linux/cpumask.h> |
21 | #include <linux/percpu.h> | 21 | #include <linux/percpu.h> |
22 | #include <linux/hardirq.h> | 22 | #include <linux/hardirq.h> |
23 | #include <linux/mutex.h> | ||
24 | #include <asm/qrwlock.h> | 23 | #include <asm/qrwlock.h> |
25 | 24 | ||
26 | /** | 25 | /** |
@@ -35,7 +34,7 @@ static __always_inline void | |||
35 | rspin_until_writer_unlock(struct qrwlock *lock, u32 cnts) | 34 | rspin_until_writer_unlock(struct qrwlock *lock, u32 cnts) |
36 | { | 35 | { |
37 | while ((cnts & _QW_WMASK) == _QW_LOCKED) { | 36 | while ((cnts & _QW_WMASK) == _QW_LOCKED) { |
38 | arch_mutex_cpu_relax(); | 37 | cpu_relax_lowlatency(); |
39 | cnts = smp_load_acquire((u32 *)&lock->cnts); | 38 | cnts = smp_load_acquire((u32 *)&lock->cnts); |
40 | } | 39 | } |
41 | } | 40 | } |
@@ -75,7 +74,7 @@ void queue_read_lock_slowpath(struct qrwlock *lock) | |||
75 | * to make sure that the write lock isn't taken. | 74 | * to make sure that the write lock isn't taken. |
76 | */ | 75 | */ |
77 | while (atomic_read(&lock->cnts) & _QW_WMASK) | 76 | while (atomic_read(&lock->cnts) & _QW_WMASK) |
78 | arch_mutex_cpu_relax(); | 77 | cpu_relax_lowlatency(); |
79 | 78 | ||
80 | cnts = atomic_add_return(_QR_BIAS, &lock->cnts) - _QR_BIAS; | 79 | cnts = atomic_add_return(_QR_BIAS, &lock->cnts) - _QR_BIAS; |
81 | rspin_until_writer_unlock(lock, cnts); | 80 | rspin_until_writer_unlock(lock, cnts); |
@@ -114,7 +113,7 @@ void queue_write_lock_slowpath(struct qrwlock *lock) | |||
114 | cnts | _QW_WAITING) == cnts)) | 113 | cnts | _QW_WAITING) == cnts)) |
115 | break; | 114 | break; |
116 | 115 | ||
117 | arch_mutex_cpu_relax(); | 116 | cpu_relax_lowlatency(); |
118 | } | 117 | } |
119 | 118 | ||
120 | /* When no more readers, set the locked flag */ | 119 | /* When no more readers, set the locked flag */ |
@@ -125,7 +124,7 @@ void queue_write_lock_slowpath(struct qrwlock *lock) | |||
125 | _QW_LOCKED) == _QW_WAITING)) | 124 | _QW_LOCKED) == _QW_WAITING)) |
126 | break; | 125 | break; |
127 | 126 | ||
128 | arch_mutex_cpu_relax(); | 127 | cpu_relax_lowlatency(); |
129 | } | 128 | } |
130 | unlock: | 129 | unlock: |
131 | arch_spin_unlock(&lock->lock); | 130 | arch_spin_unlock(&lock->lock); |
diff --git a/kernel/locking/rtmutex-debug.c b/kernel/locking/rtmutex-debug.c index 49b2ed3dced8..62b6cee8ea7f 100644 --- a/kernel/locking/rtmutex-debug.c +++ b/kernel/locking/rtmutex-debug.c | |||
@@ -66,12 +66,13 @@ void rt_mutex_debug_task_free(struct task_struct *task) | |||
66 | * the deadlock. We print when we return. act_waiter can be NULL in | 66 | * the deadlock. We print when we return. act_waiter can be NULL in |
67 | * case of a remove waiter operation. | 67 | * case of a remove waiter operation. |
68 | */ | 68 | */ |
69 | void debug_rt_mutex_deadlock(int detect, struct rt_mutex_waiter *act_waiter, | 69 | void debug_rt_mutex_deadlock(enum rtmutex_chainwalk chwalk, |
70 | struct rt_mutex_waiter *act_waiter, | ||
70 | struct rt_mutex *lock) | 71 | struct rt_mutex *lock) |
71 | { | 72 | { |
72 | struct task_struct *task; | 73 | struct task_struct *task; |
73 | 74 | ||
74 | if (!debug_locks || detect || !act_waiter) | 75 | if (!debug_locks || chwalk == RT_MUTEX_FULL_CHAINWALK || !act_waiter) |
75 | return; | 76 | return; |
76 | 77 | ||
77 | task = rt_mutex_owner(act_waiter->lock); | 78 | task = rt_mutex_owner(act_waiter->lock); |
diff --git a/kernel/locking/rtmutex-debug.h b/kernel/locking/rtmutex-debug.h index ab29b6a22669..d0519c3432b6 100644 --- a/kernel/locking/rtmutex-debug.h +++ b/kernel/locking/rtmutex-debug.h | |||
@@ -20,14 +20,15 @@ extern void debug_rt_mutex_unlock(struct rt_mutex *lock); | |||
20 | extern void debug_rt_mutex_proxy_lock(struct rt_mutex *lock, | 20 | extern void debug_rt_mutex_proxy_lock(struct rt_mutex *lock, |
21 | struct task_struct *powner); | 21 | struct task_struct *powner); |
22 | extern void debug_rt_mutex_proxy_unlock(struct rt_mutex *lock); | 22 | extern void debug_rt_mutex_proxy_unlock(struct rt_mutex *lock); |
23 | extern void debug_rt_mutex_deadlock(int detect, struct rt_mutex_waiter *waiter, | 23 | extern void debug_rt_mutex_deadlock(enum rtmutex_chainwalk chwalk, |
24 | struct rt_mutex_waiter *waiter, | ||
24 | struct rt_mutex *lock); | 25 | struct rt_mutex *lock); |
25 | extern void debug_rt_mutex_print_deadlock(struct rt_mutex_waiter *waiter); | 26 | extern void debug_rt_mutex_print_deadlock(struct rt_mutex_waiter *waiter); |
26 | # define debug_rt_mutex_reset_waiter(w) \ | 27 | # define debug_rt_mutex_reset_waiter(w) \ |
27 | do { (w)->deadlock_lock = NULL; } while (0) | 28 | do { (w)->deadlock_lock = NULL; } while (0) |
28 | 29 | ||
29 | static inline int debug_rt_mutex_detect_deadlock(struct rt_mutex_waiter *waiter, | 30 | static inline bool debug_rt_mutex_detect_deadlock(struct rt_mutex_waiter *waiter, |
30 | int detect) | 31 | enum rtmutex_chainwalk walk) |
31 | { | 32 | { |
32 | return (waiter != NULL); | 33 | return (waiter != NULL); |
33 | } | 34 | } |
diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c index fc605941b9b8..a0ea2a141b3b 100644 --- a/kernel/locking/rtmutex.c +++ b/kernel/locking/rtmutex.c | |||
@@ -308,6 +308,32 @@ static void rt_mutex_adjust_prio(struct task_struct *task) | |||
308 | } | 308 | } |
309 | 309 | ||
310 | /* | 310 | /* |
311 | * Deadlock detection is conditional: | ||
312 | * | ||
313 | * If CONFIG_DEBUG_RT_MUTEXES=n, deadlock detection is only conducted | ||
314 | * if the detect argument is == RT_MUTEX_FULL_CHAINWALK. | ||
315 | * | ||
316 | * If CONFIG_DEBUG_RT_MUTEXES=y, deadlock detection is always | ||
317 | * conducted independent of the detect argument. | ||
318 | * | ||
319 | * If the waiter argument is NULL this indicates the deboost path and | ||
320 | * deadlock detection is disabled independent of the detect argument | ||
321 | * and the config settings. | ||
322 | */ | ||
323 | static bool rt_mutex_cond_detect_deadlock(struct rt_mutex_waiter *waiter, | ||
324 | enum rtmutex_chainwalk chwalk) | ||
325 | { | ||
326 | /* | ||
327 | * This is just a wrapper function for the following call, | ||
328 | * because debug_rt_mutex_detect_deadlock() smells like a magic | ||
329 | * debug feature and I wanted to keep the cond function in the | ||
330 | * main source file along with the comments instead of having | ||
331 | * two of the same in the headers. | ||
332 | */ | ||
333 | return debug_rt_mutex_detect_deadlock(waiter, chwalk); | ||
334 | } | ||
335 | |||
336 | /* | ||
311 | * Max number of times we'll walk the boosting chain: | 337 | * Max number of times we'll walk the boosting chain: |
312 | */ | 338 | */ |
313 | int max_lock_depth = 1024; | 339 | int max_lock_depth = 1024; |
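rt_mutex_cond_detect_deadlock() takes an enum rtmutex_chainwalk instead of the old int flag; the enum and the non-debug variant of debug_rt_mutex_detect_deadlock() live in the rtmutex headers, which are outside this section. A sketch consistent with the comment above (debug builds always detect, production builds only on an explicit full chain walk); the exact header-side definitions are presumed, not quoted:

enum rtmutex_chainwalk {
	RT_MUTEX_MIN_CHAINWALK,		/* Stop the walk as early as possible */
	RT_MUTEX_FULL_CHAINWALK,	/* Walk the full chain for deadlock detection */
};

/* !CONFIG_DEBUG_RT_MUTEXES: detect only when explicitly asked to. */
static inline bool debug_rt_mutex_detect_deadlock(struct rt_mutex_waiter *waiter,
						  enum rtmutex_chainwalk walk)
{
	return walk == RT_MUTEX_FULL_CHAINWALK;
}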
@@ -337,21 +363,65 @@ static inline struct rt_mutex *task_blocked_on_lock(struct task_struct *p) | |||
337 | * @top_task: the current top waiter | 363 | * @top_task: the current top waiter |
338 | * | 364 | * |
339 | * Returns 0 or -EDEADLK. | 365 | * Returns 0 or -EDEADLK. |
366 | * | ||
367 | * Chain walk basics and protection scope | ||
368 | * | ||
369 | * [R] refcount on task | ||
370 | * [P] task->pi_lock held | ||
371 | * [L] rtmutex->wait_lock held | ||
372 | * | ||
373 | * Step Description Protected by | ||
374 | * function arguments: | ||
375 | * @task [R] | ||
376 | * @orig_lock if != NULL @top_task is blocked on it | ||
377 | * @next_lock Unprotected. Cannot be | ||
378 | * dereferenced. Only used for | ||
379 | * comparison. | ||
380 | * @orig_waiter if != NULL @top_task is blocked on it | ||
381 | * @top_task current, or in case of proxy | ||
382 | * locking protected by calling | ||
383 | * code | ||
384 | * again: | ||
385 | * loop_sanity_check(); | ||
386 | * retry: | ||
387 | * [1] lock(task->pi_lock); [R] acquire [P] | ||
388 | * [2] waiter = task->pi_blocked_on; [P] | ||
389 | * [3] check_exit_conditions_1(); [P] | ||
390 | * [4] lock = waiter->lock; [P] | ||
391 | * [5] if (!try_lock(lock->wait_lock)) { [P] try to acquire [L] | ||
392 | * unlock(task->pi_lock); release [P] | ||
393 | * goto retry; | ||
394 | * } | ||
395 | * [6] check_exit_conditions_2(); [P] + [L] | ||
396 | * [7] requeue_lock_waiter(lock, waiter); [P] + [L] | ||
397 | * [8] unlock(task->pi_lock); release [P] | ||
398 | * put_task_struct(task); release [R] | ||
399 | * [9] check_exit_conditions_3(); [L] | ||
400 | * [10] task = owner(lock); [L] | ||
401 | * get_task_struct(task); [L] acquire [R] | ||
402 | * lock(task->pi_lock); [L] acquire [P] | ||
403 | * [11] requeue_pi_waiter(tsk, waiters(lock));[P] + [L] | ||
404 | * [12] check_exit_conditions_4(); [P] + [L] | ||
405 | * [13] unlock(task->pi_lock); release [P] | ||
406 | * unlock(lock->wait_lock); release [L] | ||
407 | * goto again; | ||
340 | */ | 408 | */ |
341 | static int rt_mutex_adjust_prio_chain(struct task_struct *task, | 409 | static int rt_mutex_adjust_prio_chain(struct task_struct *task, |
342 | int deadlock_detect, | 410 | enum rtmutex_chainwalk chwalk, |
343 | struct rt_mutex *orig_lock, | 411 | struct rt_mutex *orig_lock, |
344 | struct rt_mutex *next_lock, | 412 | struct rt_mutex *next_lock, |
345 | struct rt_mutex_waiter *orig_waiter, | 413 | struct rt_mutex_waiter *orig_waiter, |
346 | struct task_struct *top_task) | 414 | struct task_struct *top_task) |
347 | { | 415 | { |
348 | struct rt_mutex *lock; | ||
349 | struct rt_mutex_waiter *waiter, *top_waiter = orig_waiter; | 416 | struct rt_mutex_waiter *waiter, *top_waiter = orig_waiter; |
350 | int detect_deadlock, ret = 0, depth = 0; | 417 | struct rt_mutex_waiter *prerequeue_top_waiter; |
418 | int ret = 0, depth = 0; | ||
419 | struct rt_mutex *lock; | ||
420 | bool detect_deadlock; | ||
351 | unsigned long flags; | 421 | unsigned long flags; |
422 | bool requeue = true; | ||
352 | 423 | ||
353 | detect_deadlock = debug_rt_mutex_detect_deadlock(orig_waiter, | 424 | detect_deadlock = rt_mutex_cond_detect_deadlock(orig_waiter, chwalk); |
354 | deadlock_detect); | ||
355 | 425 | ||
356 | /* | 426 | /* |
357 | * The (de)boosting is a step by step approach with a lot of | 427 | * The (de)boosting is a step by step approach with a lot of |
@@ -360,6 +430,9 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, | |||
360 | * carefully whether things change under us. | 430 | * carefully whether things change under us. |
361 | */ | 431 | */ |
362 | again: | 432 | again: |
433 | /* | ||
434 | * We limit the lock chain length for each invocation. | ||
435 | */ | ||
363 | if (++depth > max_lock_depth) { | 436 | if (++depth > max_lock_depth) { |
364 | static int prev_max; | 437 | static int prev_max; |
365 | 438 | ||
@@ -377,13 +450,28 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, | |||
377 | 450 | ||
378 | return -EDEADLK; | 451 | return -EDEADLK; |
379 | } | 452 | } |
453 | |||
454 | /* | ||
455 | * We are fully preemptible here and only hold the refcount on | ||
456 | * @task. So everything can have changed under us since the | ||
457 | * caller or our own code below (goto retry/again) dropped all | ||
458 | * locks. | ||
459 | */ | ||
380 | retry: | 460 | retry: |
381 | /* | 461 | /* |
382 | * Task can not go away as we did a get_task() before ! | 462 | * [1] Task cannot go away as we did a get_task() before ! |
383 | */ | 463 | */ |
384 | raw_spin_lock_irqsave(&task->pi_lock, flags); | 464 | raw_spin_lock_irqsave(&task->pi_lock, flags); |
385 | 465 | ||
466 | /* | ||
467 | * [2] Get the waiter on which @task is blocked on. | ||
468 | */ | ||
386 | waiter = task->pi_blocked_on; | 469 | waiter = task->pi_blocked_on; |
470 | |||
471 | /* | ||
472 | * [3] check_exit_conditions_1() protected by task->pi_lock. | ||
473 | */ | ||
474 | |||
387 | /* | 475 | /* |
388 | * Check whether the end of the boosting chain has been | 476 | * Check whether the end of the boosting chain has been |
389 | * reached or the state of the chain has changed while we | 477 | * reached or the state of the chain has changed while we |
@@ -421,20 +509,41 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, | |||
421 | goto out_unlock_pi; | 509 | goto out_unlock_pi; |
422 | /* | 510 | /* |
423 | * If deadlock detection is off, we stop here if we | 511 | * If deadlock detection is off, we stop here if we |
424 | * are not the top pi waiter of the task. | 512 | * are not the top pi waiter of the task. If deadlock |
513 | * detection is enabled we continue, but stop the | ||
514 | * requeueing in the chain walk. | ||
425 | */ | 515 | */ |
426 | if (!detect_deadlock && top_waiter != task_top_pi_waiter(task)) | 516 | if (top_waiter != task_top_pi_waiter(task)) { |
427 | goto out_unlock_pi; | 517 | if (!detect_deadlock) |
518 | goto out_unlock_pi; | ||
519 | else | ||
520 | requeue = false; | ||
521 | } | ||
428 | } | 522 | } |
429 | 523 | ||
430 | /* | 524 | /* |
431 | * When deadlock detection is off then we check, if further | 525 | * If the waiter priority is the same as the task priority |
432 | * priority adjustment is necessary. | 526 | * then there is no further priority adjustment necessary. If |
527 | * deadlock detection is off, we stop the chain walk. If it's | ||
528 | * enabled we continue, but stop the requeueing in the chain | ||
529 | * walk. | ||
433 | */ | 530 | */ |
434 | if (!detect_deadlock && waiter->prio == task->prio) | 531 | if (waiter->prio == task->prio) { |
435 | goto out_unlock_pi; | 532 | if (!detect_deadlock) |
533 | goto out_unlock_pi; | ||
534 | else | ||
535 | requeue = false; | ||
536 | } | ||
436 | 537 | ||
538 | /* | ||
539 | * [4] Get the next lock | ||
540 | */ | ||
437 | lock = waiter->lock; | 541 | lock = waiter->lock; |
542 | /* | ||
543 | * [5] We need to trylock here as we are holding task->pi_lock, | ||
544 | * which is the reverse lock order versus the other rtmutex | ||
545 | * operations. | ||
546 | */ | ||
438 | if (!raw_spin_trylock(&lock->wait_lock)) { | 547 | if (!raw_spin_trylock(&lock->wait_lock)) { |
439 | raw_spin_unlock_irqrestore(&task->pi_lock, flags); | 548 | raw_spin_unlock_irqrestore(&task->pi_lock, flags); |
440 | cpu_relax(); | 549 | cpu_relax(); |
@@ -442,79 +551,180 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, | |||
442 | } | 551 | } |
443 | 552 | ||
444 | /* | 553 | /* |
554 | * [6] check_exit_conditions_2() protected by task->pi_lock and | ||
555 | * lock->wait_lock. | ||
556 | * | ||
445 | * Deadlock detection. If the lock is the same as the original | 557 | * Deadlock detection. If the lock is the same as the original |
446 | * lock which caused us to walk the lock chain or if the | 558 | * lock which caused us to walk the lock chain or if the |
447 | * current lock is owned by the task which initiated the chain | 559 | * current lock is owned by the task which initiated the chain |
448 | * walk, we detected a deadlock. | 560 | * walk, we detected a deadlock. |
449 | */ | 561 | */ |
450 | if (lock == orig_lock || rt_mutex_owner(lock) == top_task) { | 562 | if (lock == orig_lock || rt_mutex_owner(lock) == top_task) { |
451 | debug_rt_mutex_deadlock(deadlock_detect, orig_waiter, lock); | 563 | debug_rt_mutex_deadlock(chwalk, orig_waiter, lock); |
452 | raw_spin_unlock(&lock->wait_lock); | 564 | raw_spin_unlock(&lock->wait_lock); |
453 | ret = -EDEADLK; | 565 | ret = -EDEADLK; |
454 | goto out_unlock_pi; | 566 | goto out_unlock_pi; |
455 | } | 567 | } |
456 | 568 | ||
457 | top_waiter = rt_mutex_top_waiter(lock); | 569 | /* |
570 | * If we just follow the lock chain for deadlock detection, no | ||
571 | * need to do all the requeue operations. To avoid a truckload | ||
572 | * of conditionals around the various places below, just do the | ||
573 | * minimum chain walk checks. | ||
574 | */ | ||
575 | if (!requeue) { | ||
576 | /* | ||
577 | * No requeue[7] here. Just release @task [8] | ||
578 | */ | ||
579 | raw_spin_unlock_irqrestore(&task->pi_lock, flags); | ||
580 | put_task_struct(task); | ||
581 | |||
582 | /* | ||
583 | * [9] check_exit_conditions_3 protected by lock->wait_lock. | ||
584 | * If there is no owner of the lock, end of chain. | ||
585 | */ | ||
586 | if (!rt_mutex_owner(lock)) { | ||
587 | raw_spin_unlock(&lock->wait_lock); | ||
588 | return 0; | ||
589 | } | ||
590 | |||
591 | /* [10] Grab the next task, i.e. owner of @lock */ | ||
592 | task = rt_mutex_owner(lock); | ||
593 | get_task_struct(task); | ||
594 | raw_spin_lock_irqsave(&task->pi_lock, flags); | ||
595 | |||
596 | /* | ||
597 | * No requeue [11] here. We just do deadlock detection. | ||
598 | * | ||
599 | * [12] Store whether owner is blocked | ||
600 | * itself. Decision is made after dropping the locks | ||
601 | */ | ||
602 | next_lock = task_blocked_on_lock(task); | ||
603 | /* | ||
604 | * Get the top waiter for the next iteration | ||
605 | */ | ||
606 | top_waiter = rt_mutex_top_waiter(lock); | ||
607 | |||
608 | /* [13] Drop locks */ | ||
609 | raw_spin_unlock_irqrestore(&task->pi_lock, flags); | ||
610 | raw_spin_unlock(&lock->wait_lock); | ||
611 | |||
612 | /* If owner is not blocked, end of chain. */ | ||
613 | if (!next_lock) | ||
614 | goto out_put_task; | ||
615 | goto again; | ||
616 | } | ||
458 | 617 | ||
459 | /* Requeue the waiter */ | 618 | /* |
619 | * Store the current top waiter before doing the requeue | ||
620 | * operation on @lock. We need it for the boost/deboost | ||
621 | * decision below. | ||
622 | */ | ||
623 | prerequeue_top_waiter = rt_mutex_top_waiter(lock); | ||
624 | |||
625 | /* [7] Requeue the waiter in the lock waiter list. */ | ||
460 | rt_mutex_dequeue(lock, waiter); | 626 | rt_mutex_dequeue(lock, waiter); |
461 | waiter->prio = task->prio; | 627 | waiter->prio = task->prio; |
462 | rt_mutex_enqueue(lock, waiter); | 628 | rt_mutex_enqueue(lock, waiter); |
463 | 629 | ||
464 | /* Release the task */ | 630 | /* [8] Release the task */ |
465 | raw_spin_unlock_irqrestore(&task->pi_lock, flags); | 631 | raw_spin_unlock_irqrestore(&task->pi_lock, flags); |
632 | put_task_struct(task); | ||
633 | |||
634 | /* | ||
635 | * [9] check_exit_conditions_3 protected by lock->wait_lock. | ||
636 | * | ||
637 | * We must abort the chain walk if there is no lock owner even | ||
638 | * in the deadlock detection case, as we have nothing to | ||
639 | * follow here. This is the end of the chain we are walking. | ||
640 | */ | ||
466 | if (!rt_mutex_owner(lock)) { | 641 | if (!rt_mutex_owner(lock)) { |
467 | /* | 642 | /* |
468 | * If the requeue above changed the top waiter, then we need | 643 | * If the requeue [7] above changed the top waiter, |
469 | * to wake the new top waiter up to try to get the lock. | 644 | * then we need to wake the new top waiter up to try |
645 | * to get the lock. | ||
470 | */ | 646 | */ |
471 | 647 | if (prerequeue_top_waiter != rt_mutex_top_waiter(lock)) | |
472 | if (top_waiter != rt_mutex_top_waiter(lock)) | ||
473 | wake_up_process(rt_mutex_top_waiter(lock)->task); | 648 | wake_up_process(rt_mutex_top_waiter(lock)->task); |
474 | raw_spin_unlock(&lock->wait_lock); | 649 | raw_spin_unlock(&lock->wait_lock); |
475 | goto out_put_task; | 650 | return 0; |
476 | } | 651 | } |
477 | put_task_struct(task); | ||
478 | 652 | ||
479 | /* Grab the next task */ | 653 | /* [10] Grab the next task, i.e. the owner of @lock */ |
480 | task = rt_mutex_owner(lock); | 654 | task = rt_mutex_owner(lock); |
481 | get_task_struct(task); | 655 | get_task_struct(task); |
482 | raw_spin_lock_irqsave(&task->pi_lock, flags); | 656 | raw_spin_lock_irqsave(&task->pi_lock, flags); |
483 | 657 | ||
658 | /* [11] requeue the pi waiters if necessary */ | ||
484 | if (waiter == rt_mutex_top_waiter(lock)) { | 659 | if (waiter == rt_mutex_top_waiter(lock)) { |
485 | /* Boost the owner */ | 660 | /* |
486 | rt_mutex_dequeue_pi(task, top_waiter); | 661 | * The waiter became the new top (highest priority) |
662 | * waiter on the lock. Replace the previous top waiter | ||
663 | * in the owner task's pi waiters list with this waiter | ||
664 | * and adjust the priority of the owner. | ||
665 | */ | ||
666 | rt_mutex_dequeue_pi(task, prerequeue_top_waiter); | ||
487 | rt_mutex_enqueue_pi(task, waiter); | 667 | rt_mutex_enqueue_pi(task, waiter); |
488 | __rt_mutex_adjust_prio(task); | 668 | __rt_mutex_adjust_prio(task); |
489 | 669 | ||
490 | } else if (top_waiter == waiter) { | 670 | } else if (prerequeue_top_waiter == waiter) { |
491 | /* Deboost the owner */ | 671 | /* |
672 | * The waiter was the top waiter on the lock, but is | ||
673 | * no longer the top priority waiter. Replace waiter in | ||
674 | * the owner task's pi waiters list with the new top | ||
675 | * (highest priority) waiter and adjust the priority | ||
676 | * of the owner. | ||
677 | * The new top waiter is stored in @waiter so that | ||
678 | * @waiter == @top_waiter evaluates to true below and | ||
679 | * we continue to deboost the rest of the chain. | ||
680 | */ | ||
492 | rt_mutex_dequeue_pi(task, waiter); | 681 | rt_mutex_dequeue_pi(task, waiter); |
493 | waiter = rt_mutex_top_waiter(lock); | 682 | waiter = rt_mutex_top_waiter(lock); |
494 | rt_mutex_enqueue_pi(task, waiter); | 683 | rt_mutex_enqueue_pi(task, waiter); |
495 | __rt_mutex_adjust_prio(task); | 684 | __rt_mutex_adjust_prio(task); |
685 | } else { | ||
686 | /* | ||
687 | * Nothing changed. No need to do any priority | ||
688 | * adjustment. | ||
689 | */ | ||
496 | } | 690 | } |
497 | 691 | ||
498 | /* | 692 | /* |
693 | * [12] check_exit_conditions_4() protected by task->pi_lock | ||
694 | * and lock->wait_lock. The actual decisions are made after we | ||
695 | * dropped the locks. | ||
696 | * | ||
499 | * Check whether the task which owns the current lock is pi | 697 | * Check whether the task which owns the current lock is pi |
500 | * blocked itself. If yes we store a pointer to the lock for | 698 | * blocked itself. If yes we store a pointer to the lock for |
501 | * the lock chain change detection above. After we dropped | 699 | * the lock chain change detection above. After we dropped |
502 | * task->pi_lock next_lock cannot be dereferenced anymore. | 700 | * task->pi_lock next_lock cannot be dereferenced anymore. |
503 | */ | 701 | */ |
504 | next_lock = task_blocked_on_lock(task); | 702 | next_lock = task_blocked_on_lock(task); |
703 | /* | ||
704 | * Store the top waiter of @lock for the end of chain walk | ||
705 | * decision below. | ||
706 | */ | ||
707 | top_waiter = rt_mutex_top_waiter(lock); | ||
505 | 708 | ||
709 | /* [13] Drop the locks */ | ||
506 | raw_spin_unlock_irqrestore(&task->pi_lock, flags); | 710 | raw_spin_unlock_irqrestore(&task->pi_lock, flags); |
507 | |||
508 | top_waiter = rt_mutex_top_waiter(lock); | ||
509 | raw_spin_unlock(&lock->wait_lock); | 711 | raw_spin_unlock(&lock->wait_lock); |
510 | 712 | ||
511 | /* | 713 | /* |
714 | * Make the actual exit decisions [12], based on the stored | ||
715 | * values. | ||
716 | * | ||
512 | * We reached the end of the lock chain. Stop right here. No | 717 | * We reached the end of the lock chain. Stop right here. No |
513 | * point to go back just to figure that out. | 718 | * point to go back just to figure that out. |
514 | */ | 719 | */ |
515 | if (!next_lock) | 720 | if (!next_lock) |
516 | goto out_put_task; | 721 | goto out_put_task; |
517 | 722 | ||
723 | /* | ||
724 | * If the current waiter is not the top waiter on the lock, | ||
725 | * then we can stop the chain walk here if we are not in full | ||
726 | * deadlock detection mode. | ||
727 | */ | ||
518 | if (!detect_deadlock && waiter != top_waiter) | 728 | if (!detect_deadlock && waiter != top_waiter) |
519 | goto out_put_task; | 729 | goto out_put_task; |
520 | 730 | ||
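
The requeue/boost/deboost steps [7]-[13] above are easier to see in a deliberately simplified, single-threaded userspace model. Everything in the sketch below (struct task, struct lock, the single top_waiter pointer) is invented for illustration; the real code uses rbtree-backed waiter lists, pi_lock/wait_lock ordering and the reference counting shown in this hunk.

/*
 * Simplified, single-threaded model of the priority-inheritance chain walk.
 * No rbtrees, no locking, no reference counting; lower numeric prio means
 * higher priority, as in the kernel.
 */
#include <stdio.h>

struct lock;

struct task {
	const char *name;
	int prio;			/* effective (possibly boosted) priority */
	int normal_prio;		/* priority without boosting */
	struct lock *blocked_on;	/* lock this task waits for, or NULL */
};

struct lock {
	struct task *owner;
	struct task *top_waiter;	/* highest-priority waiter, or NULL */
};

/* Walk the chain: boost/deboost each owner, stop when an owner is not blocked. */
static void adjust_prio_chain(struct lock *lock)
{
	while (lock && lock->owner) {
		struct task *owner = lock->owner;
		int want = lock->top_waiter ? lock->top_waiter->prio
					    : owner->normal_prio;

		/* boost if the top waiter outranks the owner, else deboost */
		owner->prio = (want < owner->normal_prio) ? want
							  : owner->normal_prio;

		/* follow the chain only if the owner is itself blocked */
		lock = owner->blocked_on;
	}
}

int main(void)
{
	struct task low  = { "low",  30, 30, NULL };
	struct task mid  = { "mid",  20, 20, NULL };
	struct task high = { "high", 10, 10, NULL };
	struct lock l1 = { &low, NULL };	/* low owns l1 */
	struct lock l2 = { &mid, NULL };	/* mid owns l2 */

	mid.blocked_on  = &l1; l1.top_waiter = &mid;	/* mid waits for l1 */
	high.blocked_on = &l2; l2.top_waiter = &high;	/* high waits for l2 */

	adjust_prio_chain(&l2);		/* walk: l2 -> mid -> l1 -> low */
	printf("low=%d mid=%d high=%d\n", low.prio, mid.prio, high.prio);
	return 0;			/* prints low=10 mid=10 high=10 */
}
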
@@ -533,76 +743,119 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, | |||
533 | * | 743 | * |
534 | * Must be called with lock->wait_lock held. | 744 | * Must be called with lock->wait_lock held. |
535 | * | 745 | * |
536 | * @lock: the lock to be acquired. | 746 | * @lock: The lock to be acquired. |
537 | * @task: the task which wants to acquire the lock | 747 | * @task: The task which wants to acquire the lock |
538 | * @waiter: the waiter that is queued to the lock's wait list. (could be NULL) | 748 | * @waiter: The waiter that is queued to the lock's wait list if the |
749 | * callsite called task_blocked_on_lock(), otherwise NULL | ||
539 | */ | 750 | */ |
540 | static int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task, | 751 | static int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task, |
541 | struct rt_mutex_waiter *waiter) | 752 | struct rt_mutex_waiter *waiter) |
542 | { | 753 | { |
754 | unsigned long flags; | ||
755 | |||
543 | /* | 756 | /* |
544 | * We have to be careful here if the atomic speedups are | 757 | * Before testing whether we can acquire @lock, we set the |
545 | * enabled, such that, when | 758 | * RT_MUTEX_HAS_WAITERS bit in @lock->owner. This forces all |
546 | * - no other waiter is on the lock | 759 | * other tasks which try to modify @lock into the slow path |
547 | * - the lock has been released since we did the cmpxchg | 760 | * and they serialize on @lock->wait_lock. |
548 | * the lock can be released or taken while we are doing the | 761 | * |
549 | * checks and marking the lock with RT_MUTEX_HAS_WAITERS. | 762 | * The RT_MUTEX_HAS_WAITERS bit can have a transitional state |
763 | * as explained at the top of this file if and only if: | ||
550 | * | 764 | * |
551 | * The atomic acquire/release aware variant of | 765 | * - There is a lock owner. The caller must fixup the |
552 | * mark_rt_mutex_waiters uses a cmpxchg loop. After setting | 766 | * transient state if it does a trylock or leaves the lock |
553 | * the WAITERS bit, the atomic release / acquire can not | 767 | * function due to a signal or timeout. |
554 | * happen anymore and lock->wait_lock protects us from the | ||
555 | * non-atomic case. | ||
556 | * | 768 | * |
557 | * Note, that this might set lock->owner = | 769 | * - @task acquires the lock and there are no other |
558 | * RT_MUTEX_HAS_WAITERS in the case the lock is not contended | 770 | * waiters. This is undone in rt_mutex_set_owner(@task) at |
559 | * any more. This is fixed up when we take the ownership. | 771 | * the end of this function. |
560 | * This is the transitional state explained at the top of this file. | ||
561 | */ | 772 | */ |
562 | mark_rt_mutex_waiters(lock); | 773 | mark_rt_mutex_waiters(lock); |
563 | 774 | ||
775 | /* | ||
776 | * If @lock has an owner, give up. | ||
777 | */ | ||
564 | if (rt_mutex_owner(lock)) | 778 | if (rt_mutex_owner(lock)) |
565 | return 0; | 779 | return 0; |
566 | 780 | ||
567 | /* | 781 | /* |
568 | * It will get the lock because of one of these conditions: | 782 | * If @waiter != NULL, @task has already enqueued the waiter |
569 | * 1) there is no waiter | 783 | * into @lock waiter list. If @waiter == NULL then this is a |
570 | * 2) higher priority than waiters | 784 | * trylock attempt. |
571 | * 3) it is top waiter | ||
572 | */ | 785 | */ |
573 | if (rt_mutex_has_waiters(lock)) { | 786 | if (waiter) { |
574 | if (task->prio >= rt_mutex_top_waiter(lock)->prio) { | 787 | /* |
575 | if (!waiter || waiter != rt_mutex_top_waiter(lock)) | 788 | * If waiter is not the highest priority waiter of |
576 | return 0; | 789 | * @lock, give up. |
577 | } | 790 | */ |
578 | } | 791 | if (waiter != rt_mutex_top_waiter(lock)) |
579 | 792 | return 0; | |
580 | if (waiter || rt_mutex_has_waiters(lock)) { | ||
581 | unsigned long flags; | ||
582 | struct rt_mutex_waiter *top; | ||
583 | |||
584 | raw_spin_lock_irqsave(&task->pi_lock, flags); | ||
585 | 793 | ||
586 | /* remove the queued waiter. */ | 794 | /* |
587 | if (waiter) { | 795 | * We can acquire the lock. Remove the waiter from the |
588 | rt_mutex_dequeue(lock, waiter); | 796 | * lock waiters list. |
589 | task->pi_blocked_on = NULL; | 797 | */ |
590 | } | 798 | rt_mutex_dequeue(lock, waiter); |
591 | 799 | ||
800 | } else { | ||
592 | /* | 801 | /* |
593 | * We have to enqueue the top waiter(if it exists) into | 802 | * If the lock has waiters already we check whether @task is |
594 | * task->pi_waiters list. | 803 | * eligible to take over the lock. |
804 | * | ||
805 | * If there are no other waiters, @task can acquire | ||
806 | * the lock. @task->pi_blocked_on is NULL, so it does | ||
807 | * not need to be dequeued. | ||
595 | */ | 808 | */ |
596 | if (rt_mutex_has_waiters(lock)) { | 809 | if (rt_mutex_has_waiters(lock)) { |
597 | top = rt_mutex_top_waiter(lock); | 810 | /* |
598 | rt_mutex_enqueue_pi(task, top); | 811 | * If @task->prio is greater than or equal to |
812 | * the top waiter priority (kernel view), | ||
813 | * @task lost. | ||
814 | */ | ||
815 | if (task->prio >= rt_mutex_top_waiter(lock)->prio) | ||
816 | return 0; | ||
817 | |||
818 | /* | ||
819 | * The current top waiter stays enqueued. We | ||
820 | * don't have to change anything in the lock | ||
821 | * waiters order. | ||
822 | */ | ||
823 | } else { | ||
824 | /* | ||
825 | * No waiters. Take the lock without the | ||
826 | * pi_lock dance. @task->pi_blocked_on is NULL | ||
827 | * and we have no waiters to enqueue in @task | ||
828 | * pi waiters list. | ||
829 | */ | ||
830 | goto takeit; | ||
599 | } | 831 | } |
600 | raw_spin_unlock_irqrestore(&task->pi_lock, flags); | ||
601 | } | 832 | } |
602 | 833 | ||
834 | /* | ||
835 | * Clear @task->pi_blocked_on. Requires protection by | ||
836 | * @task->pi_lock. Redundant operation for the @waiter == NULL | ||
837 | * case, but conditionals are more expensive than a redundant | ||
838 | * store. | ||
839 | */ | ||
840 | raw_spin_lock_irqsave(&task->pi_lock, flags); | ||
841 | task->pi_blocked_on = NULL; | ||
842 | /* | ||
843 | * Finish the lock acquisition. @task is the new owner. If | ||
844 | * other waiters exist we have to insert the highest priority | ||
845 | * waiter into @task->pi_waiters list. | ||
846 | */ | ||
847 | if (rt_mutex_has_waiters(lock)) | ||
848 | rt_mutex_enqueue_pi(task, rt_mutex_top_waiter(lock)); | ||
849 | raw_spin_unlock_irqrestore(&task->pi_lock, flags); | ||
850 | |||
851 | takeit: | ||
603 | /* We got the lock. */ | 852 | /* We got the lock. */ |
604 | debug_rt_mutex_lock(lock); | 853 | debug_rt_mutex_lock(lock); |
605 | 854 | ||
855 | /* | ||
856 | * This either preserves the RT_MUTEX_HAS_WAITERS bit if there | ||
857 | * are still waiters or clears it. | ||
858 | */ | ||
606 | rt_mutex_set_owner(lock, task); | 859 | rt_mutex_set_owner(lock, task); |
607 | 860 | ||
608 | rt_mutex_deadlock_account_lock(lock, task); | 861 | rt_mutex_deadlock_account_lock(lock, task); |
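
The comment block above relies on RT_MUTEX_HAS_WAITERS being kept as a bit inside @lock->owner. The minimal userspace sketch below uses C11 atomics to show the transitional state it describes; HAS_WAITERS, mark_waiters(), fast_trylock() and set_owner() are invented names, not kernel code. It demonstrates why a free-but-marked lock defeats the cmpxchg fast path and why the eventual owner can clear the stale bit when it takes over.

/*
 * Userspace sketch of "the waiters bit lives in the owner word".
 */
#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

#define HAS_WAITERS	1UL

static atomic_uintptr_t owner;		/* "task pointer" | HAS_WAITERS */

static void mark_waiters(void)
{
	uintptr_t old = atomic_load(&owner);

	/* cmpxchg loop: keep whatever owner is there, just OR in the bit */
	while (!atomic_compare_exchange_weak(&owner, &old, old | HAS_WAITERS))
		;
}

static int fast_trylock(uintptr_t me)
{
	uintptr_t expected = 0;		/* only succeeds if free AND unmarked */

	return atomic_compare_exchange_strong(&owner, &expected, me);
}

static void set_owner(uintptr_t me, int have_waiters)
{
	/* slow-path take-over: preserves or clears the transitional bit */
	atomic_store(&owner, me | (have_waiters ? HAS_WAITERS : 0));
}

int main(void)
{
	uintptr_t a = 0x1000;		/* stand-in for a task pointer */

	mark_waiters();			/* lock is free: owner word == HAS_WAITERS */
	printf("fast path with stale bit: %d\n", fast_trylock(a));	/* 0 */

	set_owner(a, 0);		/* slow path acquires and fixes it up */
	printf("owner word: %#lx\n", (unsigned long)atomic_load(&owner));
	return 0;
}
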
@@ -620,7 +873,7 @@ static int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task, | |||
620 | static int task_blocks_on_rt_mutex(struct rt_mutex *lock, | 873 | static int task_blocks_on_rt_mutex(struct rt_mutex *lock, |
621 | struct rt_mutex_waiter *waiter, | 874 | struct rt_mutex_waiter *waiter, |
622 | struct task_struct *task, | 875 | struct task_struct *task, |
623 | int detect_deadlock) | 876 | enum rtmutex_chainwalk chwalk) |
624 | { | 877 | { |
625 | struct task_struct *owner = rt_mutex_owner(lock); | 878 | struct task_struct *owner = rt_mutex_owner(lock); |
626 | struct rt_mutex_waiter *top_waiter = waiter; | 879 | struct rt_mutex_waiter *top_waiter = waiter; |
@@ -666,7 +919,7 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock, | |||
666 | __rt_mutex_adjust_prio(owner); | 919 | __rt_mutex_adjust_prio(owner); |
667 | if (owner->pi_blocked_on) | 920 | if (owner->pi_blocked_on) |
668 | chain_walk = 1; | 921 | chain_walk = 1; |
669 | } else if (debug_rt_mutex_detect_deadlock(waiter, detect_deadlock)) { | 922 | } else if (rt_mutex_cond_detect_deadlock(waiter, chwalk)) { |
670 | chain_walk = 1; | 923 | chain_walk = 1; |
671 | } | 924 | } |
672 | 925 | ||
@@ -691,7 +944,7 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock, | |||
691 | 944 | ||
692 | raw_spin_unlock(&lock->wait_lock); | 945 | raw_spin_unlock(&lock->wait_lock); |
693 | 946 | ||
694 | res = rt_mutex_adjust_prio_chain(owner, detect_deadlock, lock, | 947 | res = rt_mutex_adjust_prio_chain(owner, chwalk, lock, |
695 | next_lock, waiter, task); | 948 | next_lock, waiter, task); |
696 | 949 | ||
697 | raw_spin_lock(&lock->wait_lock); | 950 | raw_spin_lock(&lock->wait_lock); |
@@ -753,9 +1006,9 @@ static void wakeup_next_waiter(struct rt_mutex *lock) | |||
753 | static void remove_waiter(struct rt_mutex *lock, | 1006 | static void remove_waiter(struct rt_mutex *lock, |
754 | struct rt_mutex_waiter *waiter) | 1007 | struct rt_mutex_waiter *waiter) |
755 | { | 1008 | { |
756 | int first = (waiter == rt_mutex_top_waiter(lock)); | 1009 | bool is_top_waiter = (waiter == rt_mutex_top_waiter(lock)); |
757 | struct task_struct *owner = rt_mutex_owner(lock); | 1010 | struct task_struct *owner = rt_mutex_owner(lock); |
758 | struct rt_mutex *next_lock = NULL; | 1011 | struct rt_mutex *next_lock; |
759 | unsigned long flags; | 1012 | unsigned long flags; |
760 | 1013 | ||
761 | raw_spin_lock_irqsave(¤t->pi_lock, flags); | 1014 | raw_spin_lock_irqsave(¤t->pi_lock, flags); |
@@ -763,29 +1016,31 @@ static void remove_waiter(struct rt_mutex *lock, | |||
763 | current->pi_blocked_on = NULL; | 1016 | current->pi_blocked_on = NULL; |
764 | raw_spin_unlock_irqrestore(¤t->pi_lock, flags); | 1017 | raw_spin_unlock_irqrestore(¤t->pi_lock, flags); |
765 | 1018 | ||
766 | if (!owner) | 1019 | /* |
1020 | * Only update priority if the waiter was the highest priority | ||
1021 | * waiter of the lock and there is an owner to update. | ||
1022 | */ | ||
1023 | if (!owner || !is_top_waiter) | ||
767 | return; | 1024 | return; |
768 | 1025 | ||
769 | if (first) { | 1026 | raw_spin_lock_irqsave(&owner->pi_lock, flags); |
770 | |||
771 | raw_spin_lock_irqsave(&owner->pi_lock, flags); | ||
772 | 1027 | ||
773 | rt_mutex_dequeue_pi(owner, waiter); | 1028 | rt_mutex_dequeue_pi(owner, waiter); |
774 | 1029 | ||
775 | if (rt_mutex_has_waiters(lock)) { | 1030 | if (rt_mutex_has_waiters(lock)) |
776 | struct rt_mutex_waiter *next; | 1031 | rt_mutex_enqueue_pi(owner, rt_mutex_top_waiter(lock)); |
777 | 1032 | ||
778 | next = rt_mutex_top_waiter(lock); | 1033 | __rt_mutex_adjust_prio(owner); |
779 | rt_mutex_enqueue_pi(owner, next); | ||
780 | } | ||
781 | __rt_mutex_adjust_prio(owner); | ||
782 | 1034 | ||
783 | /* Store the lock on which owner is blocked or NULL */ | 1035 | /* Store the lock on which owner is blocked or NULL */ |
784 | next_lock = task_blocked_on_lock(owner); | 1036 | next_lock = task_blocked_on_lock(owner); |
785 | 1037 | ||
786 | raw_spin_unlock_irqrestore(&owner->pi_lock, flags); | 1038 | raw_spin_unlock_irqrestore(&owner->pi_lock, flags); |
787 | } | ||
788 | 1039 | ||
1040 | /* | ||
1041 | * Don't walk the chain, if the owner task is not blocked | ||
1042 | * itself. | ||
1043 | */ | ||
789 | if (!next_lock) | 1044 | if (!next_lock) |
790 | return; | 1045 | return; |
791 | 1046 | ||
@@ -794,7 +1049,8 @@ static void remove_waiter(struct rt_mutex *lock, | |||
794 | 1049 | ||
795 | raw_spin_unlock(&lock->wait_lock); | 1050 | raw_spin_unlock(&lock->wait_lock); |
796 | 1051 | ||
797 | rt_mutex_adjust_prio_chain(owner, 0, lock, next_lock, NULL, current); | 1052 | rt_mutex_adjust_prio_chain(owner, RT_MUTEX_MIN_CHAINWALK, lock, |
1053 | next_lock, NULL, current); | ||
798 | 1054 | ||
799 | raw_spin_lock(&lock->wait_lock); | 1055 | raw_spin_lock(&lock->wait_lock); |
800 | } | 1056 | } |
@@ -824,7 +1080,8 @@ void rt_mutex_adjust_pi(struct task_struct *task) | |||
824 | /* gets dropped in rt_mutex_adjust_prio_chain()! */ | 1080 | /* gets dropped in rt_mutex_adjust_prio_chain()! */ |
825 | get_task_struct(task); | 1081 | get_task_struct(task); |
826 | 1082 | ||
827 | rt_mutex_adjust_prio_chain(task, 0, NULL, next_lock, NULL, task); | 1083 | rt_mutex_adjust_prio_chain(task, RT_MUTEX_MIN_CHAINWALK, NULL, |
1084 | next_lock, NULL, task); | ||
828 | } | 1085 | } |
829 | 1086 | ||
830 | /** | 1087 | /** |
@@ -902,7 +1159,7 @@ static void rt_mutex_handle_deadlock(int res, int detect_deadlock, | |||
902 | static int __sched | 1159 | static int __sched |
903 | rt_mutex_slowlock(struct rt_mutex *lock, int state, | 1160 | rt_mutex_slowlock(struct rt_mutex *lock, int state, |
904 | struct hrtimer_sleeper *timeout, | 1161 | struct hrtimer_sleeper *timeout, |
905 | int detect_deadlock) | 1162 | enum rtmutex_chainwalk chwalk) |
906 | { | 1163 | { |
907 | struct rt_mutex_waiter waiter; | 1164 | struct rt_mutex_waiter waiter; |
908 | int ret = 0; | 1165 | int ret = 0; |
@@ -928,7 +1185,7 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state, | |||
928 | timeout->task = NULL; | 1185 | timeout->task = NULL; |
929 | } | 1186 | } |
930 | 1187 | ||
931 | ret = task_blocks_on_rt_mutex(lock, &waiter, current, detect_deadlock); | 1188 | ret = task_blocks_on_rt_mutex(lock, &waiter, current, chwalk); |
932 | 1189 | ||
933 | if (likely(!ret)) | 1190 | if (likely(!ret)) |
934 | ret = __rt_mutex_slowlock(lock, state, timeout, &waiter); | 1191 | ret = __rt_mutex_slowlock(lock, state, timeout, &waiter); |
@@ -937,7 +1194,7 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state, | |||
937 | 1194 | ||
938 | if (unlikely(ret)) { | 1195 | if (unlikely(ret)) { |
939 | remove_waiter(lock, &waiter); | 1196 | remove_waiter(lock, &waiter); |
940 | rt_mutex_handle_deadlock(ret, detect_deadlock, &waiter); | 1197 | rt_mutex_handle_deadlock(ret, chwalk, &waiter); |
941 | } | 1198 | } |
942 | 1199 | ||
943 | /* | 1200 | /* |
@@ -960,22 +1217,31 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state, | |||
960 | /* | 1217 | /* |
961 | * Slow path try-lock function: | 1218 | * Slow path try-lock function: |
962 | */ | 1219 | */ |
963 | static inline int | 1220 | static inline int rt_mutex_slowtrylock(struct rt_mutex *lock) |
964 | rt_mutex_slowtrylock(struct rt_mutex *lock) | ||
965 | { | 1221 | { |
966 | int ret = 0; | 1222 | int ret; |
1223 | |||
1224 | /* | ||
1225 | * If the lock already has an owner we fail to get the lock. | ||
1226 | * This can be done without taking the @lock->wait_lock as | ||
1227 | * it is only being read, and this is a trylock anyway. | ||
1228 | */ | ||
1229 | if (rt_mutex_owner(lock)) | ||
1230 | return 0; | ||
967 | 1231 | ||
1232 | /* | ||
1233 | * The mutex has currently no owner. Lock the wait lock and | ||
1234 | * try to acquire the lock. | ||
1235 | */ | ||
968 | raw_spin_lock(&lock->wait_lock); | 1236 | raw_spin_lock(&lock->wait_lock); |
969 | 1237 | ||
970 | if (likely(rt_mutex_owner(lock) != current)) { | 1238 | ret = try_to_take_rt_mutex(lock, current, NULL); |
971 | 1239 | ||
972 | ret = try_to_take_rt_mutex(lock, current, NULL); | 1240 | /* |
973 | /* | 1241 | * try_to_take_rt_mutex() sets the lock waiters bit |
974 | * try_to_take_rt_mutex() sets the lock waiters | 1242 | * unconditionally. Clean this up. |
975 | * bit unconditionally. Clean this up. | 1243 | */ |
976 | */ | 1244 | fixup_rt_mutex_waiters(lock); |
977 | fixup_rt_mutex_waiters(lock); | ||
978 | } | ||
979 | 1245 | ||
980 | raw_spin_unlock(&lock->wait_lock); | 1246 | raw_spin_unlock(&lock->wait_lock); |
981 | 1247 | ||
@@ -1053,30 +1319,31 @@ rt_mutex_slowunlock(struct rt_mutex *lock) | |||
1053 | */ | 1319 | */ |
1054 | static inline int | 1320 | static inline int |
1055 | rt_mutex_fastlock(struct rt_mutex *lock, int state, | 1321 | rt_mutex_fastlock(struct rt_mutex *lock, int state, |
1056 | int detect_deadlock, | ||
1057 | int (*slowfn)(struct rt_mutex *lock, int state, | 1322 | int (*slowfn)(struct rt_mutex *lock, int state, |
1058 | struct hrtimer_sleeper *timeout, | 1323 | struct hrtimer_sleeper *timeout, |
1059 | int detect_deadlock)) | 1324 | enum rtmutex_chainwalk chwalk)) |
1060 | { | 1325 | { |
1061 | if (!detect_deadlock && likely(rt_mutex_cmpxchg(lock, NULL, current))) { | 1326 | if (likely(rt_mutex_cmpxchg(lock, NULL, current))) { |
1062 | rt_mutex_deadlock_account_lock(lock, current); | 1327 | rt_mutex_deadlock_account_lock(lock, current); |
1063 | return 0; | 1328 | return 0; |
1064 | } else | 1329 | } else |
1065 | return slowfn(lock, state, NULL, detect_deadlock); | 1330 | return slowfn(lock, state, NULL, RT_MUTEX_MIN_CHAINWALK); |
1066 | } | 1331 | } |
1067 | 1332 | ||
1068 | static inline int | 1333 | static inline int |
1069 | rt_mutex_timed_fastlock(struct rt_mutex *lock, int state, | 1334 | rt_mutex_timed_fastlock(struct rt_mutex *lock, int state, |
1070 | struct hrtimer_sleeper *timeout, int detect_deadlock, | 1335 | struct hrtimer_sleeper *timeout, |
1336 | enum rtmutex_chainwalk chwalk, | ||
1071 | int (*slowfn)(struct rt_mutex *lock, int state, | 1337 | int (*slowfn)(struct rt_mutex *lock, int state, |
1072 | struct hrtimer_sleeper *timeout, | 1338 | struct hrtimer_sleeper *timeout, |
1073 | int detect_deadlock)) | 1339 | enum rtmutex_chainwalk chwalk)) |
1074 | { | 1340 | { |
1075 | if (!detect_deadlock && likely(rt_mutex_cmpxchg(lock, NULL, current))) { | 1341 | if (chwalk == RT_MUTEX_MIN_CHAINWALK && |
1342 | likely(rt_mutex_cmpxchg(lock, NULL, current))) { | ||
1076 | rt_mutex_deadlock_account_lock(lock, current); | 1343 | rt_mutex_deadlock_account_lock(lock, current); |
1077 | return 0; | 1344 | return 0; |
1078 | } else | 1345 | } else |
1079 | return slowfn(lock, state, timeout, detect_deadlock); | 1346 | return slowfn(lock, state, timeout, chwalk); |
1080 | } | 1347 | } |
1081 | 1348 | ||
1082 | static inline int | 1349 | static inline int |
@@ -1109,54 +1376,61 @@ void __sched rt_mutex_lock(struct rt_mutex *lock) | |||
1109 | { | 1376 | { |
1110 | might_sleep(); | 1377 | might_sleep(); |
1111 | 1378 | ||
1112 | rt_mutex_fastlock(lock, TASK_UNINTERRUPTIBLE, 0, rt_mutex_slowlock); | 1379 | rt_mutex_fastlock(lock, TASK_UNINTERRUPTIBLE, rt_mutex_slowlock); |
1113 | } | 1380 | } |
1114 | EXPORT_SYMBOL_GPL(rt_mutex_lock); | 1381 | EXPORT_SYMBOL_GPL(rt_mutex_lock); |
1115 | 1382 | ||
1116 | /** | 1383 | /** |
1117 | * rt_mutex_lock_interruptible - lock a rt_mutex interruptible | 1384 | * rt_mutex_lock_interruptible - lock a rt_mutex interruptible |
1118 | * | 1385 | * |
1119 | * @lock: the rt_mutex to be locked | 1386 | * @lock: the rt_mutex to be locked |
1120 | * @detect_deadlock: deadlock detection on/off | ||
1121 | * | 1387 | * |
1122 | * Returns: | 1388 | * Returns: |
1123 | * 0 on success | 1389 | * 0 on success |
1124 | * -EINTR when interrupted by a signal | 1390 | * -EINTR when interrupted by a signal |
1125 | * -EDEADLK when the lock would deadlock (when deadlock detection is on) | ||
1126 | */ | 1391 | */ |
1127 | int __sched rt_mutex_lock_interruptible(struct rt_mutex *lock, | 1392 | int __sched rt_mutex_lock_interruptible(struct rt_mutex *lock) |
1128 | int detect_deadlock) | ||
1129 | { | 1393 | { |
1130 | might_sleep(); | 1394 | might_sleep(); |
1131 | 1395 | ||
1132 | return rt_mutex_fastlock(lock, TASK_INTERRUPTIBLE, | 1396 | return rt_mutex_fastlock(lock, TASK_INTERRUPTIBLE, rt_mutex_slowlock); |
1133 | detect_deadlock, rt_mutex_slowlock); | ||
1134 | } | 1397 | } |
1135 | EXPORT_SYMBOL_GPL(rt_mutex_lock_interruptible); | 1398 | EXPORT_SYMBOL_GPL(rt_mutex_lock_interruptible); |
1136 | 1399 | ||
1400 | /* | ||
1401 | * Futex variant with full deadlock detection. | ||
1402 | */ | ||
1403 | int rt_mutex_timed_futex_lock(struct rt_mutex *lock, | ||
1404 | struct hrtimer_sleeper *timeout) | ||
1405 | { | ||
1406 | might_sleep(); | ||
1407 | |||
1408 | return rt_mutex_timed_fastlock(lock, TASK_INTERRUPTIBLE, timeout, | ||
1409 | RT_MUTEX_FULL_CHAINWALK, | ||
1410 | rt_mutex_slowlock); | ||
1411 | } | ||
1412 | |||
1137 | /** | 1413 | /** |
1138 | * rt_mutex_timed_lock - lock a rt_mutex interruptible | 1414 | * rt_mutex_timed_lock - lock a rt_mutex interruptible |
1139 | * the timeout structure is provided | 1415 | * the timeout structure is provided |
1140 | * by the caller | 1416 | * by the caller |
1141 | * | 1417 | * |
1142 | * @lock: the rt_mutex to be locked | 1418 | * @lock: the rt_mutex to be locked |
1143 | * @timeout: timeout structure or NULL (no timeout) | 1419 | * @timeout: timeout structure or NULL (no timeout) |
1144 | * @detect_deadlock: deadlock detection on/off | ||
1145 | * | 1420 | * |
1146 | * Returns: | 1421 | * Returns: |
1147 | * 0 on success | 1422 | * 0 on success |
1148 | * -EINTR when interrupted by a signal | 1423 | * -EINTR when interrupted by a signal |
1149 | * -ETIMEDOUT when the timeout expired | 1424 | * -ETIMEDOUT when the timeout expired |
1150 | * -EDEADLK when the lock would deadlock (when deadlock detection is on) | ||
1151 | */ | 1425 | */ |
1152 | int | 1426 | int |
1153 | rt_mutex_timed_lock(struct rt_mutex *lock, struct hrtimer_sleeper *timeout, | 1427 | rt_mutex_timed_lock(struct rt_mutex *lock, struct hrtimer_sleeper *timeout) |
1154 | int detect_deadlock) | ||
1155 | { | 1428 | { |
1156 | might_sleep(); | 1429 | might_sleep(); |
1157 | 1430 | ||
1158 | return rt_mutex_timed_fastlock(lock, TASK_INTERRUPTIBLE, timeout, | 1431 | return rt_mutex_timed_fastlock(lock, TASK_INTERRUPTIBLE, timeout, |
1159 | detect_deadlock, rt_mutex_slowlock); | 1432 | RT_MUTEX_MIN_CHAINWALK, |
1433 | rt_mutex_slowlock); | ||
1160 | } | 1434 | } |
1161 | EXPORT_SYMBOL_GPL(rt_mutex_timed_lock); | 1435 | EXPORT_SYMBOL_GPL(rt_mutex_timed_lock); |
1162 | 1436 | ||
@@ -1262,7 +1536,6 @@ void rt_mutex_proxy_unlock(struct rt_mutex *lock, | |||
1262 | * @lock: the rt_mutex to take | 1536 | * @lock: the rt_mutex to take |
1263 | * @waiter: the pre-initialized rt_mutex_waiter | 1537 | * @waiter: the pre-initialized rt_mutex_waiter |
1264 | * @task: the task to prepare | 1538 | * @task: the task to prepare |
1265 | * @detect_deadlock: perform deadlock detection (1) or not (0) | ||
1266 | * | 1539 | * |
1267 | * Returns: | 1540 | * Returns: |
1268 | * 0 - task blocked on lock | 1541 | * 0 - task blocked on lock |
@@ -1273,7 +1546,7 @@ void rt_mutex_proxy_unlock(struct rt_mutex *lock, | |||
1273 | */ | 1546 | */ |
1274 | int rt_mutex_start_proxy_lock(struct rt_mutex *lock, | 1547 | int rt_mutex_start_proxy_lock(struct rt_mutex *lock, |
1275 | struct rt_mutex_waiter *waiter, | 1548 | struct rt_mutex_waiter *waiter, |
1276 | struct task_struct *task, int detect_deadlock) | 1549 | struct task_struct *task) |
1277 | { | 1550 | { |
1278 | int ret; | 1551 | int ret; |
1279 | 1552 | ||
@@ -1285,7 +1558,8 @@ int rt_mutex_start_proxy_lock(struct rt_mutex *lock, | |||
1285 | } | 1558 | } |
1286 | 1559 | ||
1287 | /* We enforce deadlock detection for futexes */ | 1560 | /* We enforce deadlock detection for futexes */ |
1288 | ret = task_blocks_on_rt_mutex(lock, waiter, task, 1); | 1561 | ret = task_blocks_on_rt_mutex(lock, waiter, task, |
1562 | RT_MUTEX_FULL_CHAINWALK); | ||
1289 | 1563 | ||
1290 | if (ret && !rt_mutex_owner(lock)) { | 1564 | if (ret && !rt_mutex_owner(lock)) { |
1291 | /* | 1565 | /* |
@@ -1331,22 +1605,20 @@ struct task_struct *rt_mutex_next_owner(struct rt_mutex *lock) | |||
1331 | * rt_mutex_finish_proxy_lock() - Complete lock acquisition | 1605 | * rt_mutex_finish_proxy_lock() - Complete lock acquisition |
1332 | * @lock: the rt_mutex we were woken on | 1606 | * @lock: the rt_mutex we were woken on |
1333 | * @to: the timeout, null if none. hrtimer should already have | 1607 | * @to: the timeout, null if none. hrtimer should already have |
1334 | * been started. | 1608 | * been started. |
1335 | * @waiter: the pre-initialized rt_mutex_waiter | 1609 | * @waiter: the pre-initialized rt_mutex_waiter |
1336 | * @detect_deadlock: perform deadlock detection (1) or not (0) | ||
1337 | * | 1610 | * |
1338 | * Complete the lock acquisition started on our behalf by another thread. | 1611 | * Complete the lock acquisition started on our behalf by another thread. |
1339 | * | 1612 | * |
1340 | * Returns: | 1613 | * Returns: |
1341 | * 0 - success | 1614 | * 0 - success |
1342 | * <0 - error, one of -EINTR, -ETIMEDOUT, or -EDEADLK | 1615 | * <0 - error, one of -EINTR, -ETIMEDOUT |
1343 | * | 1616 | * |
1344 | * Special API call for PI-futex requeue support | 1617 | * Special API call for PI-futex requeue support |
1345 | */ | 1618 | */ |
1346 | int rt_mutex_finish_proxy_lock(struct rt_mutex *lock, | 1619 | int rt_mutex_finish_proxy_lock(struct rt_mutex *lock, |
1347 | struct hrtimer_sleeper *to, | 1620 | struct hrtimer_sleeper *to, |
1348 | struct rt_mutex_waiter *waiter, | 1621 | struct rt_mutex_waiter *waiter) |
1349 | int detect_deadlock) | ||
1350 | { | 1622 | { |
1351 | int ret; | 1623 | int ret; |
1352 | 1624 | ||
diff --git a/kernel/locking/rtmutex.h b/kernel/locking/rtmutex.h index f6a1f3c133b1..c4060584c407 100644 --- a/kernel/locking/rtmutex.h +++ b/kernel/locking/rtmutex.h | |||
@@ -22,10 +22,15 @@ | |||
22 | #define debug_rt_mutex_init(m, n) do { } while (0) | 22 | #define debug_rt_mutex_init(m, n) do { } while (0) |
23 | #define debug_rt_mutex_deadlock(d, a ,l) do { } while (0) | 23 | #define debug_rt_mutex_deadlock(d, a ,l) do { } while (0) |
24 | #define debug_rt_mutex_print_deadlock(w) do { } while (0) | 24 | #define debug_rt_mutex_print_deadlock(w) do { } while (0) |
25 | #define debug_rt_mutex_detect_deadlock(w,d) (d) | ||
26 | #define debug_rt_mutex_reset_waiter(w) do { } while (0) | 25 | #define debug_rt_mutex_reset_waiter(w) do { } while (0) |
27 | 26 | ||
28 | static inline void rt_mutex_print_deadlock(struct rt_mutex_waiter *w) | 27 | static inline void rt_mutex_print_deadlock(struct rt_mutex_waiter *w) |
29 | { | 28 | { |
30 | WARN(1, "rtmutex deadlock detected\n"); | 29 | WARN(1, "rtmutex deadlock detected\n"); |
31 | } | 30 | } |
31 | |||
32 | static inline bool debug_rt_mutex_detect_deadlock(struct rt_mutex_waiter *w, | ||
33 | enum rtmutex_chainwalk walk) | ||
34 | { | ||
35 | return walk == RT_MUTEX_FULL_CHAINWALK; | ||
36 | } | ||
diff --git a/kernel/locking/rtmutex_common.h b/kernel/locking/rtmutex_common.h index 7431a9c86f35..855212501407 100644 --- a/kernel/locking/rtmutex_common.h +++ b/kernel/locking/rtmutex_common.h | |||
@@ -102,6 +102,21 @@ static inline struct task_struct *rt_mutex_owner(struct rt_mutex *lock) | |||
102 | } | 102 | } |
103 | 103 | ||
104 | /* | 104 | /* |
105 | * Constants for rt mutex functions which have a selectable deadlock | ||
106 | * detection. | ||
107 | * | ||
108 | * RT_MUTEX_MIN_CHAINWALK: Stops the lock chain walk when there are | ||
109 | * no further PI adjustments to be made. | ||
110 | * | ||
111 | * RT_MUTEX_FULL_CHAINWALK: Invoke deadlock detection with a full | ||
112 | * walk of the lock chain. | ||
113 | */ | ||
114 | enum rtmutex_chainwalk { | ||
115 | RT_MUTEX_MIN_CHAINWALK, | ||
116 | RT_MUTEX_FULL_CHAINWALK, | ||
117 | }; | ||
118 | |||
119 | /* | ||
105 | * PI-futex support (proxy locking functions, etc.): | 120 | * PI-futex support (proxy locking functions, etc.): |
106 | */ | 121 | */ |
107 | extern struct task_struct *rt_mutex_next_owner(struct rt_mutex *lock); | 122 | extern struct task_struct *rt_mutex_next_owner(struct rt_mutex *lock); |
@@ -111,12 +126,11 @@ extern void rt_mutex_proxy_unlock(struct rt_mutex *lock, | |||
111 | struct task_struct *proxy_owner); | 126 | struct task_struct *proxy_owner); |
112 | extern int rt_mutex_start_proxy_lock(struct rt_mutex *lock, | 127 | extern int rt_mutex_start_proxy_lock(struct rt_mutex *lock, |
113 | struct rt_mutex_waiter *waiter, | 128 | struct rt_mutex_waiter *waiter, |
114 | struct task_struct *task, | 129 | struct task_struct *task); |
115 | int detect_deadlock); | ||
116 | extern int rt_mutex_finish_proxy_lock(struct rt_mutex *lock, | 130 | extern int rt_mutex_finish_proxy_lock(struct rt_mutex *lock, |
117 | struct hrtimer_sleeper *to, | 131 | struct hrtimer_sleeper *to, |
118 | struct rt_mutex_waiter *waiter, | 132 | struct rt_mutex_waiter *waiter); |
119 | int detect_deadlock); | 133 | extern int rt_mutex_timed_futex_lock(struct rt_mutex *l, struct hrtimer_sleeper *to); |
120 | 134 | ||
121 | #ifdef CONFIG_DEBUG_RT_MUTEXES | 135 | #ifdef CONFIG_DEBUG_RT_MUTEXES |
122 | # include "rtmutex-debug.h" | 136 | # include "rtmutex-debug.h" |
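
rt_mutex_cond_detect_deadlock() is called from task_blocks_on_rt_mutex() in the hunks above, but only its debug flavor appears in this diff. Assuming the non-debug body simply mirrors the debug helper added to rtmutex.h, the whole enum-to-boolean plumbing collapses to a one-liner; the sketch below is a guess for orientation, not the kernel source (the enum itself is copied from the rtmutex_common.h hunk).

#include <stdbool.h>
#include <stdio.h>

struct rt_mutex_waiter;			/* opaque for this sketch */

enum rtmutex_chainwalk {
	RT_MUTEX_MIN_CHAINWALK,		/* stop once no PI adjustment is needed */
	RT_MUTEX_FULL_CHAINWALK,	/* walk the full chain, deadlock detection on */
};

/* Assumed to mirror debug_rt_mutex_detect_deadlock() from rtmutex.h above. */
static bool rt_mutex_cond_detect_deadlock(struct rt_mutex_waiter *waiter,
					  enum rtmutex_chainwalk chwalk)
{
	(void)waiter;
	return chwalk == RT_MUTEX_FULL_CHAINWALK;
}

int main(void)
{
	printf("min walk:  %d\n", rt_mutex_cond_detect_deadlock(0, RT_MUTEX_MIN_CHAINWALK));
	printf("full walk: %d\n", rt_mutex_cond_detect_deadlock(0, RT_MUTEX_FULL_CHAINWALK));
	return 0;
}

Callers then select a mode instead of passing a bare 0/1: regular lock paths and remove_waiter() use RT_MUTEX_MIN_CHAINWALK, while the futex paths force RT_MUTEX_FULL_CHAINWALK, as the rtmutex.c hunks above show.
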
diff --git a/kernel/locking/rwsem-spinlock.c b/kernel/locking/rwsem-spinlock.c index 9be8a9144978..2c93571162cb 100644 --- a/kernel/locking/rwsem-spinlock.c +++ b/kernel/locking/rwsem-spinlock.c | |||
@@ -26,7 +26,7 @@ int rwsem_is_locked(struct rw_semaphore *sem) | |||
26 | unsigned long flags; | 26 | unsigned long flags; |
27 | 27 | ||
28 | if (raw_spin_trylock_irqsave(&sem->wait_lock, flags)) { | 28 | if (raw_spin_trylock_irqsave(&sem->wait_lock, flags)) { |
29 | ret = (sem->activity != 0); | 29 | ret = (sem->count != 0); |
30 | raw_spin_unlock_irqrestore(&sem->wait_lock, flags); | 30 | raw_spin_unlock_irqrestore(&sem->wait_lock, flags); |
31 | } | 31 | } |
32 | return ret; | 32 | return ret; |
@@ -46,7 +46,7 @@ void __init_rwsem(struct rw_semaphore *sem, const char *name, | |||
46 | debug_check_no_locks_freed((void *)sem, sizeof(*sem)); | 46 | debug_check_no_locks_freed((void *)sem, sizeof(*sem)); |
47 | lockdep_init_map(&sem->dep_map, name, key, 0); | 47 | lockdep_init_map(&sem->dep_map, name, key, 0); |
48 | #endif | 48 | #endif |
49 | sem->activity = 0; | 49 | sem->count = 0; |
50 | raw_spin_lock_init(&sem->wait_lock); | 50 | raw_spin_lock_init(&sem->wait_lock); |
51 | INIT_LIST_HEAD(&sem->wait_list); | 51 | INIT_LIST_HEAD(&sem->wait_list); |
52 | } | 52 | } |
@@ -95,7 +95,7 @@ __rwsem_do_wake(struct rw_semaphore *sem, int wakewrite) | |||
95 | waiter = list_entry(next, struct rwsem_waiter, list); | 95 | waiter = list_entry(next, struct rwsem_waiter, list); |
96 | } while (waiter->type != RWSEM_WAITING_FOR_WRITE); | 96 | } while (waiter->type != RWSEM_WAITING_FOR_WRITE); |
97 | 97 | ||
98 | sem->activity += woken; | 98 | sem->count += woken; |
99 | 99 | ||
100 | out: | 100 | out: |
101 | return sem; | 101 | return sem; |
@@ -126,9 +126,9 @@ void __sched __down_read(struct rw_semaphore *sem) | |||
126 | 126 | ||
127 | raw_spin_lock_irqsave(&sem->wait_lock, flags); | 127 | raw_spin_lock_irqsave(&sem->wait_lock, flags); |
128 | 128 | ||
129 | if (sem->activity >= 0 && list_empty(&sem->wait_list)) { | 129 | if (sem->count >= 0 && list_empty(&sem->wait_list)) { |
130 | /* granted */ | 130 | /* granted */ |
131 | sem->activity++; | 131 | sem->count++; |
132 | raw_spin_unlock_irqrestore(&sem->wait_lock, flags); | 132 | raw_spin_unlock_irqrestore(&sem->wait_lock, flags); |
133 | goto out; | 133 | goto out; |
134 | } | 134 | } |
@@ -170,9 +170,9 @@ int __down_read_trylock(struct rw_semaphore *sem) | |||
170 | 170 | ||
171 | raw_spin_lock_irqsave(&sem->wait_lock, flags); | 171 | raw_spin_lock_irqsave(&sem->wait_lock, flags); |
172 | 172 | ||
173 | if (sem->activity >= 0 && list_empty(&sem->wait_list)) { | 173 | if (sem->count >= 0 && list_empty(&sem->wait_list)) { |
174 | /* granted */ | 174 | /* granted */ |
175 | sem->activity++; | 175 | sem->count++; |
176 | ret = 1; | 176 | ret = 1; |
177 | } | 177 | } |
178 | 178 | ||
@@ -206,7 +206,7 @@ void __sched __down_write_nested(struct rw_semaphore *sem, int subclass) | |||
206 | * itself to sleep, waiting for the system or someone else at | 206 | * itself to sleep, waiting for the system or someone else at |
207 | * the head of the wait list to wake it up. | 207 | * the head of the wait list to wake it up. |
208 | */ | 208 | */ |
209 | if (sem->activity == 0) | 209 | if (sem->count == 0) |
210 | break; | 210 | break; |
211 | set_task_state(tsk, TASK_UNINTERRUPTIBLE); | 211 | set_task_state(tsk, TASK_UNINTERRUPTIBLE); |
212 | raw_spin_unlock_irqrestore(&sem->wait_lock, flags); | 212 | raw_spin_unlock_irqrestore(&sem->wait_lock, flags); |
@@ -214,7 +214,7 @@ void __sched __down_write_nested(struct rw_semaphore *sem, int subclass) | |||
214 | raw_spin_lock_irqsave(&sem->wait_lock, flags); | 214 | raw_spin_lock_irqsave(&sem->wait_lock, flags); |
215 | } | 215 | } |
216 | /* got the lock */ | 216 | /* got the lock */ |
217 | sem->activity = -1; | 217 | sem->count = -1; |
218 | list_del(&waiter.list); | 218 | list_del(&waiter.list); |
219 | 219 | ||
220 | raw_spin_unlock_irqrestore(&sem->wait_lock, flags); | 220 | raw_spin_unlock_irqrestore(&sem->wait_lock, flags); |
@@ -235,9 +235,9 @@ int __down_write_trylock(struct rw_semaphore *sem) | |||
235 | 235 | ||
236 | raw_spin_lock_irqsave(&sem->wait_lock, flags); | 236 | raw_spin_lock_irqsave(&sem->wait_lock, flags); |
237 | 237 | ||
238 | if (sem->activity == 0) { | 238 | if (sem->count == 0) { |
239 | /* got the lock */ | 239 | /* got the lock */ |
240 | sem->activity = -1; | 240 | sem->count = -1; |
241 | ret = 1; | 241 | ret = 1; |
242 | } | 242 | } |
243 | 243 | ||
@@ -255,7 +255,7 @@ void __up_read(struct rw_semaphore *sem) | |||
255 | 255 | ||
256 | raw_spin_lock_irqsave(&sem->wait_lock, flags); | 256 | raw_spin_lock_irqsave(&sem->wait_lock, flags); |
257 | 257 | ||
258 | if (--sem->activity == 0 && !list_empty(&sem->wait_list)) | 258 | if (--sem->count == 0 && !list_empty(&sem->wait_list)) |
259 | sem = __rwsem_wake_one_writer(sem); | 259 | sem = __rwsem_wake_one_writer(sem); |
260 | 260 | ||
261 | raw_spin_unlock_irqrestore(&sem->wait_lock, flags); | 261 | raw_spin_unlock_irqrestore(&sem->wait_lock, flags); |
@@ -270,7 +270,7 @@ void __up_write(struct rw_semaphore *sem) | |||
270 | 270 | ||
271 | raw_spin_lock_irqsave(&sem->wait_lock, flags); | 271 | raw_spin_lock_irqsave(&sem->wait_lock, flags); |
272 | 272 | ||
273 | sem->activity = 0; | 273 | sem->count = 0; |
274 | if (!list_empty(&sem->wait_list)) | 274 | if (!list_empty(&sem->wait_list)) |
275 | sem = __rwsem_do_wake(sem, 1); | 275 | sem = __rwsem_do_wake(sem, 1); |
276 | 276 | ||
@@ -287,7 +287,7 @@ void __downgrade_write(struct rw_semaphore *sem) | |||
287 | 287 | ||
288 | raw_spin_lock_irqsave(&sem->wait_lock, flags); | 288 | raw_spin_lock_irqsave(&sem->wait_lock, flags); |
289 | 289 | ||
290 | sem->activity = 1; | 290 | sem->count = 1; |
291 | if (!list_empty(&sem->wait_list)) | 291 | if (!list_empty(&sem->wait_list)) |
292 | sem = __rwsem_do_wake(sem, 0); | 292 | sem = __rwsem_do_wake(sem, 0); |
293 | 293 | ||
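
The activity -> count rename above is mechanical, but the value convention is easy to lose in the noise: 0 means unlocked, a positive count is the number of readers, and -1 marks an exclusive writer. Below is a toy, single-threaded model of the two trylock paths; the real functions also consult the wait list and hold sem->wait_lock, both omitted here, and all names are invented.

#include <stdio.h>

struct toy_rwsem {
	int count;			/* 0: free, >0: readers, -1: writer */
};

static int toy_down_read_trylock(struct toy_rwsem *sem)
{
	if (sem->count >= 0) {		/* no writer holds it */
		sem->count++;
		return 1;
	}
	return 0;
}

static int toy_down_write_trylock(struct toy_rwsem *sem)
{
	if (sem->count == 0) {		/* nobody holds it at all */
		sem->count = -1;
		return 1;
	}
	return 0;
}

int main(void)
{
	struct toy_rwsem sem = { 0 };
	int r;

	r = toy_down_read_trylock(&sem);
	printf("reader 1: %d (count=%d)\n", r, sem.count);
	r = toy_down_read_trylock(&sem);
	printf("reader 2: %d (count=%d)\n", r, sem.count);
	r = toy_down_write_trylock(&sem);
	printf("writer:   %d (count=%d)\n", r, sem.count);
	return 0;
}
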
diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c index dacc32142fcc..d6203faf2eb1 100644 --- a/kernel/locking/rwsem-xadd.c +++ b/kernel/locking/rwsem-xadd.c | |||
@@ -82,9 +82,9 @@ void __init_rwsem(struct rw_semaphore *sem, const char *name, | |||
82 | sem->count = RWSEM_UNLOCKED_VALUE; | 82 | sem->count = RWSEM_UNLOCKED_VALUE; |
83 | raw_spin_lock_init(&sem->wait_lock); | 83 | raw_spin_lock_init(&sem->wait_lock); |
84 | INIT_LIST_HEAD(&sem->wait_list); | 84 | INIT_LIST_HEAD(&sem->wait_list); |
85 | #ifdef CONFIG_SMP | 85 | #ifdef CONFIG_RWSEM_SPIN_ON_OWNER |
86 | sem->owner = NULL; | 86 | sem->owner = NULL; |
87 | sem->osq = NULL; | 87 | osq_lock_init(&sem->osq); |
88 | #endif | 88 | #endif |
89 | } | 89 | } |
90 | 90 | ||
@@ -262,7 +262,7 @@ static inline bool rwsem_try_write_lock(long count, struct rw_semaphore *sem) | |||
262 | return false; | 262 | return false; |
263 | } | 263 | } |
264 | 264 | ||
265 | #ifdef CONFIG_SMP | 265 | #ifdef CONFIG_RWSEM_SPIN_ON_OWNER |
266 | /* | 266 | /* |
267 | * Try to acquire write lock before the writer has been put on wait queue. | 267 | * Try to acquire write lock before the writer has been put on wait queue. |
268 | */ | 268 | */ |
@@ -285,10 +285,10 @@ static inline bool rwsem_try_write_lock_unqueued(struct rw_semaphore *sem) | |||
285 | static inline bool rwsem_can_spin_on_owner(struct rw_semaphore *sem) | 285 | static inline bool rwsem_can_spin_on_owner(struct rw_semaphore *sem) |
286 | { | 286 | { |
287 | struct task_struct *owner; | 287 | struct task_struct *owner; |
288 | bool on_cpu = true; | 288 | bool on_cpu = false; |
289 | 289 | ||
290 | if (need_resched()) | 290 | if (need_resched()) |
291 | return 0; | 291 | return false; |
292 | 292 | ||
293 | rcu_read_lock(); | 293 | rcu_read_lock(); |
294 | owner = ACCESS_ONCE(sem->owner); | 294 | owner = ACCESS_ONCE(sem->owner); |
@@ -297,9 +297,9 @@ static inline bool rwsem_can_spin_on_owner(struct rw_semaphore *sem) | |||
297 | rcu_read_unlock(); | 297 | rcu_read_unlock(); |
298 | 298 | ||
299 | /* | 299 | /* |
300 | * If sem->owner is not set, the rwsem owner may have | 300 | * If sem->owner is not set, yet we have just recently entered the |
301 | * just acquired it and not set the owner yet or the rwsem | 301 | * slowpath, then there is a possibility reader(s) may have the lock. |
302 | * has been released. | 302 | * To be safe, avoid spinning in these situations. |
303 | */ | 303 | */ |
304 | return on_cpu; | 304 | return on_cpu; |
305 | } | 305 | } |
@@ -329,7 +329,7 @@ bool rwsem_spin_on_owner(struct rw_semaphore *sem, struct task_struct *owner) | |||
329 | if (need_resched()) | 329 | if (need_resched()) |
330 | break; | 330 | break; |
331 | 331 | ||
332 | arch_mutex_cpu_relax(); | 332 | cpu_relax_lowlatency(); |
333 | } | 333 | } |
334 | rcu_read_unlock(); | 334 | rcu_read_unlock(); |
335 | 335 | ||
@@ -381,7 +381,7 @@ static bool rwsem_optimistic_spin(struct rw_semaphore *sem) | |||
381 | * memory barriers as we'll eventually observe the right | 381 | * memory barriers as we'll eventually observe the right |
382 | * values at the cost of a few extra spins. | 382 | * values at the cost of a few extra spins. |
383 | */ | 383 | */ |
384 | arch_mutex_cpu_relax(); | 384 | cpu_relax_lowlatency(); |
385 | } | 385 | } |
386 | osq_unlock(&sem->osq); | 386 | osq_unlock(&sem->osq); |
387 | done: | 387 | done: |
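
The flipped default in rwsem_can_spin_on_owner() is the substantive change in this file: with on_cpu initialised to false, "no visible owner" now means "do not spin", because readers never set sem->owner. The toy model below captures only that decision; toy_task, toy_rwsem and the owner->on_cpu probe are assumptions made for illustration, and the RCU protection the kernel uses to dereference sem->owner is omitted.

#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

struct toy_task { bool on_cpu; };
struct toy_rwsem { struct toy_task *owner; };	/* NULL: reader-held or free */

static bool toy_can_spin_on_owner(struct toy_rwsem *sem, bool need_resched)
{
	bool on_cpu = false;		/* the default this hunk changes */

	if (need_resched)
		return false;
	if (sem->owner)			/* assumed probe of the owner task */
		on_cpu = sem->owner->on_cpu;
	return on_cpu;
}

int main(void)
{
	struct toy_task writer = { .on_cpu = true };
	struct toy_rwsem writer_held = { .owner = &writer };
	struct toy_rwsem reader_held = { .owner = NULL };

	printf("writer owner on cpu: %d\n", toy_can_spin_on_owner(&writer_held, false));
	printf("no visible owner:    %d\n", toy_can_spin_on_owner(&reader_held, false));
	return 0;
}
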
diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c index 42f806de49d4..e2d3bc7f03b4 100644 --- a/kernel/locking/rwsem.c +++ b/kernel/locking/rwsem.c | |||
@@ -12,7 +12,7 @@ | |||
12 | 12 | ||
13 | #include <linux/atomic.h> | 13 | #include <linux/atomic.h> |
14 | 14 | ||
15 | #if defined(CONFIG_SMP) && defined(CONFIG_RWSEM_XCHGADD_ALGORITHM) | 15 | #ifdef CONFIG_RWSEM_SPIN_ON_OWNER |
16 | static inline void rwsem_set_owner(struct rw_semaphore *sem) | 16 | static inline void rwsem_set_owner(struct rw_semaphore *sem) |
17 | { | 17 | { |
18 | sem->owner = current; | 18 | sem->owner = current; |
diff --git a/kernel/module.c b/kernel/module.c index 81e727cf6df9..ae79ce615cb9 100644 --- a/kernel/module.c +++ b/kernel/module.c | |||
@@ -60,7 +60,6 @@ | |||
60 | #include <linux/jump_label.h> | 60 | #include <linux/jump_label.h> |
61 | #include <linux/pfn.h> | 61 | #include <linux/pfn.h> |
62 | #include <linux/bsearch.h> | 62 | #include <linux/bsearch.h> |
63 | #include <linux/fips.h> | ||
64 | #include <uapi/linux/module.h> | 63 | #include <uapi/linux/module.h> |
65 | #include "module-internal.h" | 64 | #include "module-internal.h" |
66 | 65 | ||
@@ -2448,9 +2447,6 @@ static int module_sig_check(struct load_info *info) | |||
2448 | } | 2447 | } |
2449 | 2448 | ||
2450 | /* Not having a signature is only an error if we're strict. */ | 2449 | /* Not having a signature is only an error if we're strict. */ |
2451 | if (err < 0 && fips_enabled) | ||
2452 | panic("Module verification failed with error %d in FIPS mode\n", | ||
2453 | err); | ||
2454 | if (err == -ENOKEY && !sig_enforce) | 2450 | if (err == -ENOKEY && !sig_enforce) |
2455 | err = 0; | 2451 | err = 0; |
2456 | 2452 | ||
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index fcc2611d3f14..a9dfa79b6bab 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c | |||
@@ -371,7 +371,6 @@ int hibernation_snapshot(int platform_mode) | |||
371 | } | 371 | } |
372 | 372 | ||
373 | suspend_console(); | 373 | suspend_console(); |
374 | ftrace_stop(); | ||
375 | pm_restrict_gfp_mask(); | 374 | pm_restrict_gfp_mask(); |
376 | 375 | ||
377 | error = dpm_suspend(PMSG_FREEZE); | 376 | error = dpm_suspend(PMSG_FREEZE); |
@@ -397,7 +396,6 @@ int hibernation_snapshot(int platform_mode) | |||
397 | if (error || !in_suspend) | 396 | if (error || !in_suspend) |
398 | pm_restore_gfp_mask(); | 397 | pm_restore_gfp_mask(); |
399 | 398 | ||
400 | ftrace_start(); | ||
401 | resume_console(); | 399 | resume_console(); |
402 | dpm_complete(msg); | 400 | dpm_complete(msg); |
403 | 401 | ||
@@ -500,7 +498,6 @@ int hibernation_restore(int platform_mode) | |||
500 | 498 | ||
501 | pm_prepare_console(); | 499 | pm_prepare_console(); |
502 | suspend_console(); | 500 | suspend_console(); |
503 | ftrace_stop(); | ||
504 | pm_restrict_gfp_mask(); | 501 | pm_restrict_gfp_mask(); |
505 | error = dpm_suspend_start(PMSG_QUIESCE); | 502 | error = dpm_suspend_start(PMSG_QUIESCE); |
506 | if (!error) { | 503 | if (!error) { |
@@ -508,7 +505,6 @@ int hibernation_restore(int platform_mode) | |||
508 | dpm_resume_end(PMSG_RECOVER); | 505 | dpm_resume_end(PMSG_RECOVER); |
509 | } | 506 | } |
510 | pm_restore_gfp_mask(); | 507 | pm_restore_gfp_mask(); |
511 | ftrace_start(); | ||
512 | resume_console(); | 508 | resume_console(); |
513 | pm_restore_console(); | 509 | pm_restore_console(); |
514 | return error; | 510 | return error; |
@@ -535,7 +531,6 @@ int hibernation_platform_enter(void) | |||
535 | 531 | ||
536 | entering_platform_hibernation = true; | 532 | entering_platform_hibernation = true; |
537 | suspend_console(); | 533 | suspend_console(); |
538 | ftrace_stop(); | ||
539 | error = dpm_suspend_start(PMSG_HIBERNATE); | 534 | error = dpm_suspend_start(PMSG_HIBERNATE); |
540 | if (error) { | 535 | if (error) { |
541 | if (hibernation_ops->recover) | 536 | if (hibernation_ops->recover) |
@@ -579,7 +574,6 @@ int hibernation_platform_enter(void) | |||
579 | Resume_devices: | 574 | Resume_devices: |
580 | entering_platform_hibernation = false; | 575 | entering_platform_hibernation = false; |
581 | dpm_resume_end(PMSG_RESTORE); | 576 | dpm_resume_end(PMSG_RESTORE); |
582 | ftrace_start(); | ||
583 | resume_console(); | 577 | resume_console(); |
584 | 578 | ||
585 | Close: | 579 | Close: |
diff --git a/kernel/power/process.c b/kernel/power/process.c index 0ca8d83e2369..4ee194eb524b 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c | |||
@@ -186,6 +186,7 @@ void thaw_processes(void) | |||
186 | 186 | ||
187 | printk("Restarting tasks ... "); | 187 | printk("Restarting tasks ... "); |
188 | 188 | ||
189 | __usermodehelper_set_disable_depth(UMH_FREEZING); | ||
189 | thaw_workqueues(); | 190 | thaw_workqueues(); |
190 | 191 | ||
191 | read_lock(&tasklist_lock); | 192 | read_lock(&tasklist_lock); |
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index 4dd8822f732a..4b736b4dfa96 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c | |||
@@ -248,7 +248,6 @@ static int suspend_enter(suspend_state_t state, bool *wakeup) | |||
248 | goto Platform_wake; | 248 | goto Platform_wake; |
249 | } | 249 | } |
250 | 250 | ||
251 | ftrace_stop(); | ||
252 | error = disable_nonboot_cpus(); | 251 | error = disable_nonboot_cpus(); |
253 | if (error || suspend_test(TEST_CPUS)) | 252 | if (error || suspend_test(TEST_CPUS)) |
254 | goto Enable_cpus; | 253 | goto Enable_cpus; |
@@ -275,7 +274,6 @@ static int suspend_enter(suspend_state_t state, bool *wakeup) | |||
275 | 274 | ||
276 | Enable_cpus: | 275 | Enable_cpus: |
277 | enable_nonboot_cpus(); | 276 | enable_nonboot_cpus(); |
278 | ftrace_start(); | ||
279 | 277 | ||
280 | Platform_wake: | 278 | Platform_wake: |
281 | if (need_suspend_ops(state) && suspend_ops->wake) | 279 | if (need_suspend_ops(state) && suspend_ops->wake) |
@@ -306,7 +304,7 @@ int suspend_devices_and_enter(suspend_state_t state) | |||
306 | error = suspend_ops->begin(state); | 304 | error = suspend_ops->begin(state); |
307 | if (error) | 305 | if (error) |
308 | goto Close; | 306 | goto Close; |
309 | } else if (state == PM_SUSPEND_FREEZE && freeze_ops->begin) { | 307 | } else if (state == PM_SUSPEND_FREEZE && freeze_ops && freeze_ops->begin) { |
310 | error = freeze_ops->begin(); | 308 | error = freeze_ops->begin(); |
311 | if (error) | 309 | if (error) |
312 | goto Close; | 310 | goto Close; |
@@ -335,7 +333,7 @@ int suspend_devices_and_enter(suspend_state_t state) | |||
335 | Close: | 333 | Close: |
336 | if (need_suspend_ops(state) && suspend_ops->end) | 334 | if (need_suspend_ops(state) && suspend_ops->end) |
337 | suspend_ops->end(); | 335 | suspend_ops->end(); |
338 | else if (state == PM_SUSPEND_FREEZE && freeze_ops->end) | 336 | else if (state == PM_SUSPEND_FREEZE && freeze_ops && freeze_ops->end) |
339 | freeze_ops->end(); | 337 | freeze_ops->end(); |
340 | 338 | ||
341 | return error; | 339 | return error; |
diff --git a/kernel/ptrace.c b/kernel/ptrace.c index adf98622cb32..54e75226c2c4 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c | |||
@@ -28,12 +28,6 @@ | |||
28 | #include <linux/compat.h> | 28 | #include <linux/compat.h> |
29 | 29 | ||
30 | 30 | ||
31 | static int ptrace_trapping_sleep_fn(void *flags) | ||
32 | { | ||
33 | schedule(); | ||
34 | return 0; | ||
35 | } | ||
36 | |||
37 | /* | 31 | /* |
38 | * ptrace a task: make the debugger its new parent and | 32 | * ptrace a task: make the debugger its new parent and |
39 | * move it to the ptrace list. | 33 | * move it to the ptrace list. |
@@ -371,7 +365,7 @@ unlock_creds: | |||
371 | out: | 365 | out: |
372 | if (!retval) { | 366 | if (!retval) { |
373 | wait_on_bit(&task->jobctl, JOBCTL_TRAPPING_BIT, | 367 | wait_on_bit(&task->jobctl, JOBCTL_TRAPPING_BIT, |
374 | ptrace_trapping_sleep_fn, TASK_UNINTERRUPTIBLE); | 368 | TASK_UNINTERRUPTIBLE); |
375 | proc_ptrace_connector(task, PTRACE_ATTACH); | 369 | proc_ptrace_connector(task, PTRACE_ATTACH); |
376 | } | 370 | } |
377 | 371 | ||
diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h index bfda2726ca45..ff1a6de62f17 100644 --- a/kernel/rcu/rcu.h +++ b/kernel/rcu/rcu.h | |||
@@ -99,6 +99,10 @@ static inline void debug_rcu_head_unqueue(struct rcu_head *head) | |||
99 | 99 | ||
100 | void kfree(const void *); | 100 | void kfree(const void *); |
101 | 101 | ||
102 | /* | ||
103 | * Reclaim the specified callback, either by invoking it (non-lazy case) | ||
104 | * or freeing it directly (lazy case). Return true if lazy, false otherwise. | ||
105 | */ | ||
102 | static inline bool __rcu_reclaim(const char *rn, struct rcu_head *head) | 106 | static inline bool __rcu_reclaim(const char *rn, struct rcu_head *head) |
103 | { | 107 | { |
104 | unsigned long offset = (unsigned long)head->func; | 108 | unsigned long offset = (unsigned long)head->func; |
@@ -108,12 +112,12 @@ static inline bool __rcu_reclaim(const char *rn, struct rcu_head *head) | |||
108 | RCU_TRACE(trace_rcu_invoke_kfree_callback(rn, head, offset)); | 112 | RCU_TRACE(trace_rcu_invoke_kfree_callback(rn, head, offset)); |
109 | kfree((void *)head - offset); | 113 | kfree((void *)head - offset); |
110 | rcu_lock_release(&rcu_callback_map); | 114 | rcu_lock_release(&rcu_callback_map); |
111 | return 1; | 115 | return true; |
112 | } else { | 116 | } else { |
113 | RCU_TRACE(trace_rcu_invoke_callback(rn, head)); | 117 | RCU_TRACE(trace_rcu_invoke_callback(rn, head)); |
114 | head->func(head); | 118 | head->func(head); |
115 | rcu_lock_release(&rcu_callback_map); | 119 | rcu_lock_release(&rcu_callback_map); |
116 | return 0; | 120 | return false; |
117 | } | 121 | } |
118 | } | 122 | } |
119 | 123 | ||
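
The new comment on __rcu_reclaim() is terse; the underlying trick is that kfree_rcu() does not store a real callback but the offset of the rcu_head within the enclosing object, and the reclaim path subtracts that offset to recover the pointer to free. A userspace model follows: rcu_head_model, struct foo and is_kfree_offset() are invented, free() stands in for kfree(), and the 4096 cut-off models the "small offset rather than a function pointer" test.

#include <stdio.h>
#include <stdlib.h>
#include <stddef.h>
#include <stdint.h>

struct rcu_head_model {
	void (*func)(struct rcu_head_model *head);
};

struct foo {
	int payload;
	struct rcu_head_model rh;
};

static int is_kfree_offset(unsigned long offset)
{
	return offset < 4096;
}

static void reclaim(struct rcu_head_model *head)
{
	unsigned long offset = (unsigned long)head->func;

	if (is_kfree_offset(offset))
		free((char *)head - offset);	/* lazy case: free the container */
	else
		head->func(head);		/* non-lazy case: run the callback */
}

int main(void)
{
	struct foo *p = malloc(sizeof(*p));

	if (!p)
		return 1;
	p->payload = 42;
	/* what kfree_rcu() effectively does: smuggle the offset into ->func */
	p->rh.func = (void (*)(struct rcu_head_model *))
			(uintptr_t)offsetof(struct foo, rh);
	reclaim(&p->rh);			/* frees the whole struct foo */
	printf("reclaimed via offset %zu\n", offsetof(struct foo, rh));
	return 0;
}
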
diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 7fa34f86e5ba..948a7693748e 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c | |||
@@ -18,7 +18,7 @@ | |||
18 | * Copyright (C) IBM Corporation, 2005, 2006 | 18 | * Copyright (C) IBM Corporation, 2005, 2006 |
19 | * | 19 | * |
20 | * Authors: Paul E. McKenney <paulmck@us.ibm.com> | 20 | * Authors: Paul E. McKenney <paulmck@us.ibm.com> |
21 | * Josh Triplett <josh@freedesktop.org> | 21 | * Josh Triplett <josh@joshtriplett.org> |
22 | * | 22 | * |
23 | * See also: Documentation/RCU/torture.txt | 23 | * See also: Documentation/RCU/torture.txt |
24 | */ | 24 | */ |
@@ -51,7 +51,7 @@ | |||
51 | #include <linux/torture.h> | 51 | #include <linux/torture.h> |
52 | 52 | ||
53 | MODULE_LICENSE("GPL"); | 53 | MODULE_LICENSE("GPL"); |
54 | MODULE_AUTHOR("Paul E. McKenney <paulmck@us.ibm.com> and Josh Triplett <josh@freedesktop.org>"); | 54 | MODULE_AUTHOR("Paul E. McKenney <paulmck@us.ibm.com> and Josh Triplett <josh@joshtriplett.org>"); |
55 | 55 | ||
56 | 56 | ||
57 | torture_param(int, fqs_duration, 0, | 57 | torture_param(int, fqs_duration, 0, |
diff --git a/kernel/rcu/srcu.c b/kernel/rcu/srcu.c index c639556f3fa0..e037f3eb2f7b 100644 --- a/kernel/rcu/srcu.c +++ b/kernel/rcu/srcu.c | |||
@@ -298,9 +298,9 @@ int __srcu_read_lock(struct srcu_struct *sp) | |||
298 | 298 | ||
299 | idx = ACCESS_ONCE(sp->completed) & 0x1; | 299 | idx = ACCESS_ONCE(sp->completed) & 0x1; |
300 | preempt_disable(); | 300 | preempt_disable(); |
301 | ACCESS_ONCE(this_cpu_ptr(sp->per_cpu_ref)->c[idx]) += 1; | 301 | __this_cpu_inc(sp->per_cpu_ref->c[idx]); |
302 | smp_mb(); /* B */ /* Avoid leaking the critical section. */ | 302 | smp_mb(); /* B */ /* Avoid leaking the critical section. */ |
303 | ACCESS_ONCE(this_cpu_ptr(sp->per_cpu_ref)->seq[idx]) += 1; | 303 | __this_cpu_inc(sp->per_cpu_ref->seq[idx]); |
304 | preempt_enable(); | 304 | preempt_enable(); |
305 | return idx; | 305 | return idx; |
306 | } | 306 | } |
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index f1ba77363fbb..1b70cb6fbe3c 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c | |||
@@ -206,6 +206,70 @@ void rcu_bh_qs(int cpu) | |||
206 | rdp->passed_quiesce = 1; | 206 | rdp->passed_quiesce = 1; |
207 | } | 207 | } |
208 | 208 | ||
209 | static DEFINE_PER_CPU(int, rcu_sched_qs_mask); | ||
210 | |||
211 | static DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = { | ||
212 | .dynticks_nesting = DYNTICK_TASK_EXIT_IDLE, | ||
213 | .dynticks = ATOMIC_INIT(1), | ||
214 | #ifdef CONFIG_NO_HZ_FULL_SYSIDLE | ||
215 | .dynticks_idle_nesting = DYNTICK_TASK_NEST_VALUE, | ||
216 | .dynticks_idle = ATOMIC_INIT(1), | ||
217 | #endif /* #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */ | ||
218 | }; | ||
219 | |||
220 | /* | ||
221 | * Let the RCU core know that this CPU has gone through the scheduler, | ||
222 | * which is a quiescent state. This is called when the need for a | ||
223 | * quiescent state is urgent, so we burn an atomic operation and full | ||
224 | * memory barriers to let the RCU core know about it, regardless of what | ||
225 | * this CPU might (or might not) do in the near future. | ||
226 | * | ||
227 | * We inform the RCU core by emulating a zero-duration dyntick-idle | ||
228 | * period, which we in turn do by incrementing the ->dynticks counter | ||
229 | * by two. | ||
230 | */ | ||
231 | static void rcu_momentary_dyntick_idle(void) | ||
232 | { | ||
233 | unsigned long flags; | ||
234 | struct rcu_data *rdp; | ||
235 | struct rcu_dynticks *rdtp; | ||
236 | int resched_mask; | ||
237 | struct rcu_state *rsp; | ||
238 | |||
239 | local_irq_save(flags); | ||
240 | |||
241 | /* | ||
242 | * Yes, we can lose flag-setting operations. This is OK, because | ||
243 | * the flag will be set again after some delay. | ||
244 | */ | ||
245 | resched_mask = raw_cpu_read(rcu_sched_qs_mask); | ||
246 | raw_cpu_write(rcu_sched_qs_mask, 0); | ||
247 | |||
248 | /* Find the flavor that needs a quiescent state. */ | ||
249 | for_each_rcu_flavor(rsp) { | ||
250 | rdp = raw_cpu_ptr(rsp->rda); | ||
251 | if (!(resched_mask & rsp->flavor_mask)) | ||
252 | continue; | ||
253 | smp_mb(); /* rcu_sched_qs_mask before cond_resched_completed. */ | ||
254 | if (ACCESS_ONCE(rdp->mynode->completed) != | ||
255 | ACCESS_ONCE(rdp->cond_resched_completed)) | ||
256 | continue; | ||
257 | |||
258 | /* | ||
259 | * Pretend to be momentarily idle for the quiescent state. | ||
260 | * This allows the grace-period kthread to record the | ||
261 | * quiescent state, with no need for this CPU to do anything | ||
262 | * further. | ||
263 | */ | ||
264 | rdtp = this_cpu_ptr(&rcu_dynticks); | ||
265 | smp_mb__before_atomic(); /* Earlier stuff before QS. */ | ||
266 | atomic_add(2, &rdtp->dynticks); /* QS. */ | ||
267 | smp_mb__after_atomic(); /* Later stuff after QS. */ | ||
268 | break; | ||
269 | } | ||
270 | local_irq_restore(flags); | ||
271 | } | ||
272 | |||
209 | /* | 273 | /* |
210 | * Note a context switch. This is a quiescent state for RCU-sched, | 274 | * Note a context switch. This is a quiescent state for RCU-sched, |
211 | * and requires special handling for preemptible RCU. | 275 | * and requires special handling for preemptible RCU. |
@@ -216,19 +280,12 @@ void rcu_note_context_switch(int cpu) | |||
216 | trace_rcu_utilization(TPS("Start context switch")); | 280 | trace_rcu_utilization(TPS("Start context switch")); |
217 | rcu_sched_qs(cpu); | 281 | rcu_sched_qs(cpu); |
218 | rcu_preempt_note_context_switch(cpu); | 282 | rcu_preempt_note_context_switch(cpu); |
283 | if (unlikely(raw_cpu_read(rcu_sched_qs_mask))) | ||
284 | rcu_momentary_dyntick_idle(); | ||
219 | trace_rcu_utilization(TPS("End context switch")); | 285 | trace_rcu_utilization(TPS("End context switch")); |
220 | } | 286 | } |
221 | EXPORT_SYMBOL_GPL(rcu_note_context_switch); | 287 | EXPORT_SYMBOL_GPL(rcu_note_context_switch); |
222 | 288 | ||
223 | static DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = { | ||
224 | .dynticks_nesting = DYNTICK_TASK_EXIT_IDLE, | ||
225 | .dynticks = ATOMIC_INIT(1), | ||
226 | #ifdef CONFIG_NO_HZ_FULL_SYSIDLE | ||
227 | .dynticks_idle_nesting = DYNTICK_TASK_NEST_VALUE, | ||
228 | .dynticks_idle = ATOMIC_INIT(1), | ||
229 | #endif /* #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */ | ||
230 | }; | ||
231 | |||
232 | static long blimit = 10; /* Maximum callbacks per rcu_do_batch. */ | 289 | static long blimit = 10; /* Maximum callbacks per rcu_do_batch. */ |
233 | static long qhimark = 10000; /* If this many pending, ignore blimit. */ | 290 | static long qhimark = 10000; /* If this many pending, ignore blimit. */ |
234 | static long qlowmark = 100; /* Once only this many pending, use blimit. */ | 291 | static long qlowmark = 100; /* Once only this many pending, use blimit. */ |
@@ -243,6 +300,13 @@ static ulong jiffies_till_next_fqs = ULONG_MAX; | |||
243 | module_param(jiffies_till_first_fqs, ulong, 0644); | 300 | module_param(jiffies_till_first_fqs, ulong, 0644); |
244 | module_param(jiffies_till_next_fqs, ulong, 0644); | 301 | module_param(jiffies_till_next_fqs, ulong, 0644); |
245 | 302 | ||
303 | /* | ||
304 | * How long the grace period must be before we start recruiting | ||
305 | * quiescent-state help from rcu_note_context_switch(). | ||
306 | */ | ||
307 | static ulong jiffies_till_sched_qs = HZ / 20; | ||
308 | module_param(jiffies_till_sched_qs, ulong, 0644); | ||
309 | |||
246 | static bool rcu_start_gp_advanced(struct rcu_state *rsp, struct rcu_node *rnp, | 310 | static bool rcu_start_gp_advanced(struct rcu_state *rsp, struct rcu_node *rnp, |
247 | struct rcu_data *rdp); | 311 | struct rcu_data *rdp); |
248 | static void force_qs_rnp(struct rcu_state *rsp, | 312 | static void force_qs_rnp(struct rcu_state *rsp, |
@@ -853,6 +917,7 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp, | |||
853 | bool *isidle, unsigned long *maxj) | 917 | bool *isidle, unsigned long *maxj) |
854 | { | 918 | { |
855 | unsigned int curr; | 919 | unsigned int curr; |
920 | int *rcrmp; | ||
856 | unsigned int snap; | 921 | unsigned int snap; |
857 | 922 | ||
858 | curr = (unsigned int)atomic_add_return(0, &rdp->dynticks->dynticks); | 923 | curr = (unsigned int)atomic_add_return(0, &rdp->dynticks->dynticks); |
@@ -893,27 +958,43 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp, | |||
893 | } | 958 | } |
894 | 959 | ||
895 | /* | 960 | /* |
896 | * There is a possibility that a CPU in adaptive-ticks state | 961 | * A CPU running for an extended time within the kernel can |
897 | * might run in the kernel with the scheduling-clock tick disabled | 962 | * delay RCU grace periods. When the CPU is in NO_HZ_FULL mode, |
898 | * for an extended time period. Invoke rcu_kick_nohz_cpu() to | 963 | * even context-switching back and forth between a pair of |
899 | * force the CPU to restart the scheduling-clock tick in this | 964 | * in-kernel CPU-bound tasks cannot advance grace periods. |
900 | * CPU is in this state. | 965 | * So if the grace period is old enough, make the CPU pay attention. |
901 | */ | 966 | * Note that the unsynchronized assignments to the per-CPU |
902 | rcu_kick_nohz_cpu(rdp->cpu); | 967 | * rcu_sched_qs_mask variable are safe. Yes, setting of |
903 | 968 | * bits can be lost, but they will be set again on the next | |
904 | /* | 969 | * force-quiescent-state pass. So lost bit sets do not result |
905 | * Alternatively, the CPU might be running in the kernel | 970 | * in incorrect behavior, merely in a grace period lasting |
906 | * for an extended period of time without a quiescent state. | 971 | * a few jiffies longer than it might otherwise. Because |
907 | * Attempt to force the CPU through the scheduler to gain the | 972 | * there are at most four threads involved, and because the |
908 | * needed quiescent state, but only if the grace period has gone | 973 | * updates are only once every few jiffies, the probability of |
909 | * on for an uncommonly long time. If there are many stuck CPUs, | 974 | * lossage (and thus of slight grace-period extension) is |
910 | * we will beat on the first one until it gets unstuck, then move | 975 | * quite low. |
911 | * to the next. Only do this for the primary flavor of RCU. | 976 | * |
977 | * Note that if the jiffies_till_sched_qs boot/sysfs parameter | ||
978 | * is set too high, we override with half of the RCU CPU stall | ||
979 | * warning delay. | ||
912 | */ | 980 | */ |
913 | if (rdp->rsp == rcu_state_p && | 981 | rcrmp = &per_cpu(rcu_sched_qs_mask, rdp->cpu); |
982 | if (ULONG_CMP_GE(jiffies, | ||
983 | rdp->rsp->gp_start + jiffies_till_sched_qs) || | ||
914 | ULONG_CMP_GE(jiffies, rdp->rsp->jiffies_resched)) { | 984 | ULONG_CMP_GE(jiffies, rdp->rsp->jiffies_resched)) { |
915 | rdp->rsp->jiffies_resched += 5; | 985 | if (!(ACCESS_ONCE(*rcrmp) & rdp->rsp->flavor_mask)) { |
916 | resched_cpu(rdp->cpu); | 986 | ACCESS_ONCE(rdp->cond_resched_completed) = |
987 | ACCESS_ONCE(rdp->mynode->completed); | ||
988 | smp_mb(); /* ->cond_resched_completed before *rcrmp. */ | ||
989 | ACCESS_ONCE(*rcrmp) = | ||
990 | ACCESS_ONCE(*rcrmp) + rdp->rsp->flavor_mask; | ||
991 | resched_cpu(rdp->cpu); /* Force CPU into scheduler. */ | ||
992 | rdp->rsp->jiffies_resched += 5; /* Enable beating. */ | ||
993 | } else if (ULONG_CMP_GE(jiffies, rdp->rsp->jiffies_resched)) { | ||
994 | /* Time to beat on that CPU again! */ | ||
995 | resched_cpu(rdp->cpu); /* Force CPU into scheduler. */ | ||
996 | rdp->rsp->jiffies_resched += 5; /* Re-enable beating. */ | ||
997 | } | ||
917 | } | 998 | } |
918 | 999 | ||
919 | return 0; | 1000 | return 0; |
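The recruitment logic above has two stages: the first force-quiescent-state pass that finds the grace period older than jiffies_till_sched_qs (or jiffies_resched) snapshots ->completed into ->cond_resched_completed, sets this flavor's bit in the target CPU's rcu_sched_qs_mask, and kicks the CPU into the scheduler; later passes simply keep kicking every five jiffies. Below is a minimal, single-threaded sketch of just that decision flow; the field and variable names follow the patch, while the surrounding machinery is faked.

/*
 * Simplified, userspace-only sketch of the new "recruit help from the
 * scheduler" decision in rcu_implicit_dynticks_qs().
 */
#include <stdbool.h>
#include <stdio.h>

struct fake_cpu {
	int  qs_mask;			/* per-CPU rcu_sched_qs_mask */
	long cond_resched_completed;	/* GP snapshot taken when recruited */
	bool kicked;			/* did we call resched_cpu()? */
};

static void maybe_recruit(struct fake_cpu *cpu, int flavor_mask,
			  unsigned long jiffies, unsigned long gp_start,
			  unsigned long jiffies_till_sched_qs,
			  unsigned long *jiffies_resched, long gp_completed)
{
	if (jiffies < gp_start + jiffies_till_sched_qs &&
	    jiffies < *jiffies_resched)
		return;			/* grace period still young enough */

	if (!(cpu->qs_mask & flavor_mask)) {
		/* First pass: remember the GP and arm the context-switch hook. */
		cpu->cond_resched_completed = gp_completed;
		cpu->qs_mask |= flavor_mask;
		cpu->kicked = true;		/* resched_cpu() */
		*jiffies_resched += 5;		/* enable periodic beating */
	} else if (jiffies >= *jiffies_resched) {
		/* Later passes: just keep beating on the CPU. */
		cpu->kicked = true;		/* resched_cpu() again */
		*jiffies_resched += 5;
	}
}

int main(void)
{
	struct fake_cpu cpu = { 0 };
	unsigned long resched_at = 120;

	maybe_recruit(&cpu, 0x1, 100, 40, 50, &resched_at, 7);
	printf("mask=%#x kicked=%d next beat at %lu\n",
	       cpu.qs_mask, cpu.kicked, resched_at);
	return 0;
}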
@@ -932,10 +1013,7 @@ static void record_gp_stall_check_time(struct rcu_state *rsp) | |||
932 | } | 1013 | } |
933 | 1014 | ||
934 | /* | 1015 | /* |
935 | * Dump stacks of all tasks running on stalled CPUs. This is a fallback | 1016 | * Dump stacks of all tasks running on stalled CPUs. |
936 | * for architectures that do not implement trigger_all_cpu_backtrace(). | ||
937 | * The NMI-triggered stack traces are more accurate because they are | ||
938 | * printed by the target CPU. | ||
939 | */ | 1017 | */ |
940 | static void rcu_dump_cpu_stacks(struct rcu_state *rsp) | 1018 | static void rcu_dump_cpu_stacks(struct rcu_state *rsp) |
941 | { | 1019 | { |
@@ -1013,7 +1091,7 @@ static void print_other_cpu_stall(struct rcu_state *rsp) | |||
1013 | (long)rsp->gpnum, (long)rsp->completed, totqlen); | 1091 | (long)rsp->gpnum, (long)rsp->completed, totqlen); |
1014 | if (ndetected == 0) | 1092 | if (ndetected == 0) |
1015 | pr_err("INFO: Stall ended before state dump start\n"); | 1093 | pr_err("INFO: Stall ended before state dump start\n"); |
1016 | else if (!trigger_all_cpu_backtrace()) | 1094 | else |
1017 | rcu_dump_cpu_stacks(rsp); | 1095 | rcu_dump_cpu_stacks(rsp); |
1018 | 1096 | ||
1019 | /* Complain about tasks blocking the grace period. */ | 1097 | /* Complain about tasks blocking the grace period. */ |
@@ -1044,8 +1122,7 @@ static void print_cpu_stall(struct rcu_state *rsp) | |||
1044 | pr_cont(" (t=%lu jiffies g=%ld c=%ld q=%lu)\n", | 1122 | pr_cont(" (t=%lu jiffies g=%ld c=%ld q=%lu)\n", |
1045 | jiffies - rsp->gp_start, | 1123 | jiffies - rsp->gp_start, |
1046 | (long)rsp->gpnum, (long)rsp->completed, totqlen); | 1124 | (long)rsp->gpnum, (long)rsp->completed, totqlen); |
1047 | if (!trigger_all_cpu_backtrace()) | 1125 | rcu_dump_cpu_stacks(rsp); |
1048 | dump_stack(); | ||
1049 | 1126 | ||
1050 | raw_spin_lock_irqsave(&rnp->lock, flags); | 1127 | raw_spin_lock_irqsave(&rnp->lock, flags); |
1051 | if (ULONG_CMP_GE(jiffies, ACCESS_ONCE(rsp->jiffies_stall))) | 1128 | if (ULONG_CMP_GE(jiffies, ACCESS_ONCE(rsp->jiffies_stall))) |
@@ -1224,10 +1301,16 @@ rcu_start_future_gp(struct rcu_node *rnp, struct rcu_data *rdp, | |||
1224 | * believe that a grace period is in progress, then we must wait | 1301 | * believe that a grace period is in progress, then we must wait |
1225 | * for the one following, which is in "c". Because our request | 1302 | * for the one following, which is in "c". Because our request |
1226 | * will be noticed at the end of the current grace period, we don't | 1303 | * will be noticed at the end of the current grace period, we don't |
1227 | * need to explicitly start one. | 1304 | * need to explicitly start one. We only do the lockless check |
1305 | * of rnp_root's fields if the current rcu_node structure thinks | ||
1306 | * there is no grace period in flight, and because we hold rnp->lock, | ||
1307 | * the only possible change is when rnp_root's two fields are | ||
1308 | * equal, in which case rnp_root->gpnum might be concurrently | ||
1309 | * incremented. But that is OK, as it will just result in our | ||
1310 | * doing some extra useless work. | ||
1228 | */ | 1311 | */ |
1229 | if (rnp->gpnum != rnp->completed || | 1312 | if (rnp->gpnum != rnp->completed || |
1230 | ACCESS_ONCE(rnp->gpnum) != ACCESS_ONCE(rnp->completed)) { | 1313 | ACCESS_ONCE(rnp_root->gpnum) != ACCESS_ONCE(rnp_root->completed)) { |
1231 | rnp->need_future_gp[c & 0x1]++; | 1314 | rnp->need_future_gp[c & 0x1]++; |
1232 | trace_rcu_future_gp(rnp, rdp, c, TPS("Startedleaf")); | 1315 | trace_rcu_future_gp(rnp, rdp, c, TPS("Startedleaf")); |
1233 | goto out; | 1316 | goto out; |
@@ -1564,11 +1647,6 @@ static int rcu_gp_init(struct rcu_state *rsp) | |||
1564 | rnp->level, rnp->grplo, | 1647 | rnp->level, rnp->grplo, |
1565 | rnp->grphi, rnp->qsmask); | 1648 | rnp->grphi, rnp->qsmask); |
1566 | raw_spin_unlock_irq(&rnp->lock); | 1649 | raw_spin_unlock_irq(&rnp->lock); |
1567 | #ifdef CONFIG_PROVE_RCU_DELAY | ||
1568 | if ((prandom_u32() % (rcu_num_nodes + 1)) == 0 && | ||
1569 | system_state == SYSTEM_RUNNING) | ||
1570 | udelay(200); | ||
1571 | #endif /* #ifdef CONFIG_PROVE_RCU_DELAY */ | ||
1572 | cond_resched(); | 1650 | cond_resched(); |
1573 | } | 1651 | } |
1574 | 1652 | ||
@@ -2266,7 +2344,7 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp) | |||
2266 | } | 2344 | } |
2267 | smp_mb(); /* List handling before counting for rcu_barrier(). */ | 2345 | smp_mb(); /* List handling before counting for rcu_barrier(). */ |
2268 | rdp->qlen_lazy -= count_lazy; | 2346 | rdp->qlen_lazy -= count_lazy; |
2269 | ACCESS_ONCE(rdp->qlen) -= count; | 2347 | ACCESS_ONCE(rdp->qlen) = rdp->qlen - count; |
2270 | rdp->n_cbs_invoked += count; | 2348 | rdp->n_cbs_invoked += count; |
2271 | 2349 | ||
2272 | /* Reinstate batch limit if we have worked down the excess. */ | 2350 | /* Reinstate batch limit if we have worked down the excess. */ |
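This hunk is one of several in the patch that replace ACCESS_ONCE(x)-- style updates with a volatile store of a separately computed value: ACCESS_ONCE() is only a volatile cast, so applying ++ or -= to it requests a volatile read-modify-write that the compiler may implement in surprising ways and that is easy to misread as atomic. A hedged userspace illustration of the rewrite pattern follows; the counter and values are made up, and only the macro is copied from the kernel.

/*
 * Illustration of the ACCESS_ONCE() rewrite used throughout this patch.
 */
#include <stdio.h>

#define ACCESS_ONCE(x) (*(volatile __typeof__(x) *)&(x))

static unsigned long qlen = 10;

int main(void)
{
	unsigned long count = 3;

	/* Old style: volatile read-modify-write, easy to misread as atomic. */
	/* ACCESS_ONCE(qlen) -= count; */

	/* New style: one plain read, one explicit volatile store. */
	ACCESS_ONCE(qlen) = qlen - count;

	printf("qlen = %lu\n", qlen);	/* 7 */
	return 0;
}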
@@ -2404,14 +2482,14 @@ static void force_quiescent_state(struct rcu_state *rsp) | |||
2404 | struct rcu_node *rnp_old = NULL; | 2482 | struct rcu_node *rnp_old = NULL; |
2405 | 2483 | ||
2406 | /* Funnel through hierarchy to reduce memory contention. */ | 2484 | /* Funnel through hierarchy to reduce memory contention. */ |
2407 | rnp = per_cpu_ptr(rsp->rda, raw_smp_processor_id())->mynode; | 2485 | rnp = __this_cpu_read(rsp->rda->mynode); |
2408 | for (; rnp != NULL; rnp = rnp->parent) { | 2486 | for (; rnp != NULL; rnp = rnp->parent) { |
2409 | ret = (ACCESS_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) || | 2487 | ret = (ACCESS_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) || |
2410 | !raw_spin_trylock(&rnp->fqslock); | 2488 | !raw_spin_trylock(&rnp->fqslock); |
2411 | if (rnp_old != NULL) | 2489 | if (rnp_old != NULL) |
2412 | raw_spin_unlock(&rnp_old->fqslock); | 2490 | raw_spin_unlock(&rnp_old->fqslock); |
2413 | if (ret) { | 2491 | if (ret) { |
2414 | ACCESS_ONCE(rsp->n_force_qs_lh)++; | 2492 | rsp->n_force_qs_lh++; |
2415 | return; | 2493 | return; |
2416 | } | 2494 | } |
2417 | rnp_old = rnp; | 2495 | rnp_old = rnp; |
@@ -2423,7 +2501,7 @@ static void force_quiescent_state(struct rcu_state *rsp) | |||
2423 | smp_mb__after_unlock_lock(); | 2501 | smp_mb__after_unlock_lock(); |
2424 | raw_spin_unlock(&rnp_old->fqslock); | 2502 | raw_spin_unlock(&rnp_old->fqslock); |
2425 | if (ACCESS_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) { | 2503 | if (ACCESS_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) { |
2426 | ACCESS_ONCE(rsp->n_force_qs_lh)++; | 2504 | rsp->n_force_qs_lh++; |
2427 | raw_spin_unlock_irqrestore(&rnp_old->lock, flags); | 2505 | raw_spin_unlock_irqrestore(&rnp_old->lock, flags); |
2428 | return; /* Someone beat us to it. */ | 2506 | return; /* Someone beat us to it. */ |
2429 | } | 2507 | } |
@@ -2581,7 +2659,7 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu), | |||
2581 | unsigned long flags; | 2659 | unsigned long flags; |
2582 | struct rcu_data *rdp; | 2660 | struct rcu_data *rdp; |
2583 | 2661 | ||
2584 | WARN_ON_ONCE((unsigned long)head & 0x3); /* Misaligned rcu_head! */ | 2662 | WARN_ON_ONCE((unsigned long)head & 0x1); /* Misaligned rcu_head! */ |
2585 | if (debug_rcu_head_queue(head)) { | 2663 | if (debug_rcu_head_queue(head)) { |
2586 | /* Probable double call_rcu(), so leak the callback. */ | 2664 | /* Probable double call_rcu(), so leak the callback. */ |
2587 | ACCESS_ONCE(head->func) = rcu_leak_callback; | 2665 | ACCESS_ONCE(head->func) = rcu_leak_callback; |
@@ -2612,7 +2690,7 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu), | |||
2612 | local_irq_restore(flags); | 2690 | local_irq_restore(flags); |
2613 | return; | 2691 | return; |
2614 | } | 2692 | } |
2615 | ACCESS_ONCE(rdp->qlen)++; | 2693 | ACCESS_ONCE(rdp->qlen) = rdp->qlen + 1; |
2616 | if (lazy) | 2694 | if (lazy) |
2617 | rdp->qlen_lazy++; | 2695 | rdp->qlen_lazy++; |
2618 | else | 2696 | else |
@@ -3176,7 +3254,7 @@ static void _rcu_barrier(struct rcu_state *rsp) | |||
3176 | * ACCESS_ONCE() to prevent the compiler from speculating | 3254 | * ACCESS_ONCE() to prevent the compiler from speculating |
3177 | * the increment to precede the early-exit check. | 3255 | * the increment to precede the early-exit check. |
3178 | */ | 3256 | */ |
3179 | ACCESS_ONCE(rsp->n_barrier_done)++; | 3257 | ACCESS_ONCE(rsp->n_barrier_done) = rsp->n_barrier_done + 1; |
3180 | WARN_ON_ONCE((rsp->n_barrier_done & 0x1) != 1); | 3258 | WARN_ON_ONCE((rsp->n_barrier_done & 0x1) != 1); |
3181 | _rcu_barrier_trace(rsp, "Inc1", -1, rsp->n_barrier_done); | 3259 | _rcu_barrier_trace(rsp, "Inc1", -1, rsp->n_barrier_done); |
3182 | smp_mb(); /* Order ->n_barrier_done increment with below mechanism. */ | 3260 | smp_mb(); /* Order ->n_barrier_done increment with below mechanism. */ |
@@ -3226,7 +3304,7 @@ static void _rcu_barrier(struct rcu_state *rsp) | |||
3226 | 3304 | ||
3227 | /* Increment ->n_barrier_done to prevent duplicate work. */ | 3305 | /* Increment ->n_barrier_done to prevent duplicate work. */ |
3228 | smp_mb(); /* Keep increment after above mechanism. */ | 3306 | smp_mb(); /* Keep increment after above mechanism. */ |
3229 | ACCESS_ONCE(rsp->n_barrier_done)++; | 3307 | ACCESS_ONCE(rsp->n_barrier_done) = rsp->n_barrier_done + 1; |
3230 | WARN_ON_ONCE((rsp->n_barrier_done & 0x1) != 0); | 3308 | WARN_ON_ONCE((rsp->n_barrier_done & 0x1) != 0); |
3231 | _rcu_barrier_trace(rsp, "Inc2", -1, rsp->n_barrier_done); | 3309 | _rcu_barrier_trace(rsp, "Inc2", -1, rsp->n_barrier_done); |
3232 | smp_mb(); /* Keep increment before caller's subsequent code. */ | 3310 | smp_mb(); /* Keep increment before caller's subsequent code. */ |
@@ -3483,14 +3561,17 @@ static void __init rcu_init_levelspread(struct rcu_state *rsp) | |||
3483 | static void __init rcu_init_one(struct rcu_state *rsp, | 3561 | static void __init rcu_init_one(struct rcu_state *rsp, |
3484 | struct rcu_data __percpu *rda) | 3562 | struct rcu_data __percpu *rda) |
3485 | { | 3563 | { |
3486 | static char *buf[] = { "rcu_node_0", | 3564 | static const char * const buf[] = { |
3487 | "rcu_node_1", | 3565 | "rcu_node_0", |
3488 | "rcu_node_2", | 3566 | "rcu_node_1", |
3489 | "rcu_node_3" }; /* Match MAX_RCU_LVLS */ | 3567 | "rcu_node_2", |
3490 | static char *fqs[] = { "rcu_node_fqs_0", | 3568 | "rcu_node_3" }; /* Match MAX_RCU_LVLS */ |
3491 | "rcu_node_fqs_1", | 3569 | static const char * const fqs[] = { |
3492 | "rcu_node_fqs_2", | 3570 | "rcu_node_fqs_0", |
3493 | "rcu_node_fqs_3" }; /* Match MAX_RCU_LVLS */ | 3571 | "rcu_node_fqs_1", |
3572 | "rcu_node_fqs_2", | ||
3573 | "rcu_node_fqs_3" }; /* Match MAX_RCU_LVLS */ | ||
3574 | static u8 fl_mask = 0x1; | ||
3494 | int cpustride = 1; | 3575 | int cpustride = 1; |
3495 | int i; | 3576 | int i; |
3496 | int j; | 3577 | int j; |
@@ -3509,6 +3590,8 @@ static void __init rcu_init_one(struct rcu_state *rsp, | |||
3509 | for (i = 1; i < rcu_num_lvls; i++) | 3590 | for (i = 1; i < rcu_num_lvls; i++) |
3510 | rsp->level[i] = rsp->level[i - 1] + rsp->levelcnt[i - 1]; | 3591 | rsp->level[i] = rsp->level[i - 1] + rsp->levelcnt[i - 1]; |
3511 | rcu_init_levelspread(rsp); | 3592 | rcu_init_levelspread(rsp); |
3593 | rsp->flavor_mask = fl_mask; | ||
3594 | fl_mask <<= 1; | ||
3512 | 3595 | ||
3513 | /* Initialize the elements themselves, starting from the leaves. */ | 3596 | /* Initialize the elements themselves, starting from the leaves. */ |
3514 | 3597 | ||
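The new flavor_mask/fl_mask lines give each rcu_state initialized by rcu_init_one() its own bit, which rcu_momentary_dyntick_idle() and rcu_implicit_dynticks_qs() then test and set in the per-CPU rcu_sched_qs_mask. A minimal sketch of that allocation, assuming two flavors purely for illustration:

/*
 * Sketch of the flavor_mask allocation added to rcu_init_one(): every
 * flavor initialized gets the next bit.  Pure illustration, no kernel code.
 */
#include <stdio.h>

static unsigned char next_flavor_bit = 0x1;	/* mirrors "static u8 fl_mask" */

static unsigned char init_one_flavor(const char *name)
{
	unsigned char mask = next_flavor_bit;

	next_flavor_bit <<= 1;
	printf("%-12s flavor_mask = %#x\n", name, mask);
	return mask;
}

int main(void)
{
	unsigned char sched = init_one_flavor("rcu_sched");
	unsigned char bh    = init_one_flavor("rcu_bh");
	unsigned char qs_mask = sched | bh;	/* like rcu_sched_qs_mask */

	printf("both flavors need help: qs_mask = %#x\n", qs_mask);
	return 0;
}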
diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index bf2c1e669691..71e64c718f75 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h | |||
@@ -172,6 +172,14 @@ struct rcu_node { | |||
172 | /* queued on this rcu_node structure that */ | 172 | /* queued on this rcu_node structure that */ |
173 | /* are blocking the current grace period, */ | 173 | /* are blocking the current grace period, */ |
174 | /* there can be no such task. */ | 174 | /* there can be no such task. */ |
175 | struct completion boost_completion; | ||
176 | /* Used to ensure that the rt_mutex used */ | ||
177 | /* to carry out the boosting is fully */ | ||
178 | /* released with no future boostee accesses */ | ||
179 | /* before that rt_mutex is re-initialized. */ | ||
180 | struct rt_mutex boost_mtx; | ||
181 | /* Used only for the priority-boosting */ | ||
182 | /* side effect, not as a lock. */ | ||
175 | unsigned long boost_time; | 183 | unsigned long boost_time; |
176 | /* When to start boosting (jiffies). */ | 184 | /* When to start boosting (jiffies). */ |
177 | struct task_struct *boost_kthread_task; | 185 | struct task_struct *boost_kthread_task; |
@@ -307,6 +315,9 @@ struct rcu_data { | |||
307 | /* 4) reasons this CPU needed to be kicked by force_quiescent_state */ | 315 | /* 4) reasons this CPU needed to be kicked by force_quiescent_state */ |
308 | unsigned long dynticks_fqs; /* Kicked due to dynticks idle. */ | 316 | unsigned long dynticks_fqs; /* Kicked due to dynticks idle. */ |
309 | unsigned long offline_fqs; /* Kicked due to being offline. */ | 317 | unsigned long offline_fqs; /* Kicked due to being offline. */ |
318 | unsigned long cond_resched_completed; | ||
319 | /* Grace period that needs help */ | ||
320 | /* from cond_resched(). */ | ||
310 | 321 | ||
311 | /* 5) __rcu_pending() statistics. */ | 322 | /* 5) __rcu_pending() statistics. */ |
312 | unsigned long n_rcu_pending; /* rcu_pending() calls since boot. */ | 323 | unsigned long n_rcu_pending; /* rcu_pending() calls since boot. */ |
@@ -331,11 +342,29 @@ struct rcu_data { | |||
331 | struct rcu_head **nocb_tail; | 342 | struct rcu_head **nocb_tail; |
332 | atomic_long_t nocb_q_count; /* # CBs waiting for kthread */ | 343 | atomic_long_t nocb_q_count; /* # CBs waiting for kthread */ |
333 | atomic_long_t nocb_q_count_lazy; /* (approximate). */ | 344 | atomic_long_t nocb_q_count_lazy; /* (approximate). */ |
345 | struct rcu_head *nocb_follower_head; /* CBs ready to invoke. */ | ||
346 | struct rcu_head **nocb_follower_tail; | ||
347 | atomic_long_t nocb_follower_count; /* # CBs ready to invoke. */ | ||
348 | atomic_long_t nocb_follower_count_lazy; /* (approximate). */ | ||
334 | int nocb_p_count; /* # CBs being invoked by kthread */ | 349 | int nocb_p_count; /* # CBs being invoked by kthread */ |
335 | int nocb_p_count_lazy; /* (approximate). */ | 350 | int nocb_p_count_lazy; /* (approximate). */ |
336 | wait_queue_head_t nocb_wq; /* For nocb kthreads to sleep on. */ | 351 | wait_queue_head_t nocb_wq; /* For nocb kthreads to sleep on. */ |
337 | struct task_struct *nocb_kthread; | 352 | struct task_struct *nocb_kthread; |
338 | bool nocb_defer_wakeup; /* Defer wakeup of nocb_kthread. */ | 353 | bool nocb_defer_wakeup; /* Defer wakeup of nocb_kthread. */ |
354 | |||
355 | /* The following fields are used by the leader, hence own cacheline. */ | ||
356 | struct rcu_head *nocb_gp_head ____cacheline_internodealigned_in_smp; | ||
357 | /* CBs waiting for GP. */ | ||
358 | struct rcu_head **nocb_gp_tail; | ||
359 | long nocb_gp_count; | ||
360 | long nocb_gp_count_lazy; | ||
361 | bool nocb_leader_wake; /* Is the nocb leader thread awake? */ | ||
362 | struct rcu_data *nocb_next_follower; | ||
363 | /* Next follower in wakeup chain. */ | ||
364 | |||
365 | /* The following fields are used by the follower, hence new cacheline. */ | ||
366 | struct rcu_data *nocb_leader ____cacheline_internodealigned_in_smp; | ||
367 | /* Leader CPU takes GP-end wakeups. */ | ||
339 | #endif /* #ifdef CONFIG_RCU_NOCB_CPU */ | 368 | #endif /* #ifdef CONFIG_RCU_NOCB_CPU */ |
340 | 369 | ||
341 | /* 8) RCU CPU stall data. */ | 370 | /* 8) RCU CPU stall data. */ |
@@ -392,6 +421,7 @@ struct rcu_state { | |||
392 | struct rcu_node *level[RCU_NUM_LVLS]; /* Hierarchy levels. */ | 421 | struct rcu_node *level[RCU_NUM_LVLS]; /* Hierarchy levels. */ |
393 | u32 levelcnt[MAX_RCU_LVLS + 1]; /* # nodes in each level. */ | 422 | u32 levelcnt[MAX_RCU_LVLS + 1]; /* # nodes in each level. */ |
394 | u8 levelspread[RCU_NUM_LVLS]; /* kids/node in each level. */ | 423 | u8 levelspread[RCU_NUM_LVLS]; /* kids/node in each level. */ |
424 | u8 flavor_mask; /* bit in flavor mask. */ | ||
395 | struct rcu_data __percpu *rda; /* pointer to per-CPU rcu_data. */ | 425 | struct rcu_data __percpu *rda; /* pointer to per-CPU rcu_data. */ |
396 | void (*call)(struct rcu_head *head, /* call_rcu() flavor. */ | 426 | void (*call)(struct rcu_head *head, /* call_rcu() flavor. */ |
397 | void (*func)(struct rcu_head *head)); | 427 | void (*func)(struct rcu_head *head)); |
@@ -563,7 +593,7 @@ static bool rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp); | |||
563 | static void do_nocb_deferred_wakeup(struct rcu_data *rdp); | 593 | static void do_nocb_deferred_wakeup(struct rcu_data *rdp); |
564 | static void rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp); | 594 | static void rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp); |
565 | static void rcu_spawn_nocb_kthreads(struct rcu_state *rsp); | 595 | static void rcu_spawn_nocb_kthreads(struct rcu_state *rsp); |
566 | static void rcu_kick_nohz_cpu(int cpu); | 596 | static void __maybe_unused rcu_kick_nohz_cpu(int cpu); |
567 | static bool init_nocb_callback_list(struct rcu_data *rdp); | 597 | static bool init_nocb_callback_list(struct rcu_data *rdp); |
568 | static void rcu_sysidle_enter(struct rcu_dynticks *rdtp, int irq); | 598 | static void rcu_sysidle_enter(struct rcu_dynticks *rdtp, int irq); |
569 | static void rcu_sysidle_exit(struct rcu_dynticks *rdtp, int irq); | 599 | static void rcu_sysidle_exit(struct rcu_dynticks *rdtp, int irq); |
@@ -583,8 +613,14 @@ static bool rcu_nohz_full_cpu(struct rcu_state *rsp); | |||
583 | /* Sum up queue lengths for tracing. */ | 613 | /* Sum up queue lengths for tracing. */ |
584 | static inline void rcu_nocb_q_lengths(struct rcu_data *rdp, long *ql, long *qll) | 614 | static inline void rcu_nocb_q_lengths(struct rcu_data *rdp, long *ql, long *qll) |
585 | { | 615 | { |
586 | *ql = atomic_long_read(&rdp->nocb_q_count) + rdp->nocb_p_count; | 616 | *ql = atomic_long_read(&rdp->nocb_q_count) + |
587 | *qll = atomic_long_read(&rdp->nocb_q_count_lazy) + rdp->nocb_p_count_lazy; | 617 | rdp->nocb_p_count + |
618 | atomic_long_read(&rdp->nocb_follower_count) + | ||
619 | rdp->nocb_p_count + rdp->nocb_gp_count; | ||
620 | *qll = atomic_long_read(&rdp->nocb_q_count_lazy) + | ||
621 | rdp->nocb_p_count_lazy + | ||
622 | atomic_long_read(&rdp->nocb_follower_count_lazy) + | ||
623 | rdp->nocb_p_count_lazy + rdp->nocb_gp_count_lazy; | ||
588 | } | 624 | } |
589 | #else /* #ifdef CONFIG_RCU_NOCB_CPU */ | 625 | #else /* #ifdef CONFIG_RCU_NOCB_CPU */ |
590 | static inline void rcu_nocb_q_lengths(struct rcu_data *rdp, long *ql, long *qll) | 626 | static inline void rcu_nocb_q_lengths(struct rcu_data *rdp, long *ql, long *qll) |
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index cbc2c45265e2..00dc411e9676 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h | |||
@@ -33,6 +33,7 @@ | |||
33 | #define RCU_KTHREAD_PRIO 1 | 33 | #define RCU_KTHREAD_PRIO 1 |
34 | 34 | ||
35 | #ifdef CONFIG_RCU_BOOST | 35 | #ifdef CONFIG_RCU_BOOST |
36 | #include "../locking/rtmutex_common.h" | ||
36 | #define RCU_BOOST_PRIO CONFIG_RCU_BOOST_PRIO | 37 | #define RCU_BOOST_PRIO CONFIG_RCU_BOOST_PRIO |
37 | #else | 38 | #else |
38 | #define RCU_BOOST_PRIO RCU_KTHREAD_PRIO | 39 | #define RCU_BOOST_PRIO RCU_KTHREAD_PRIO |
@@ -336,7 +337,7 @@ void rcu_read_unlock_special(struct task_struct *t) | |||
336 | unsigned long flags; | 337 | unsigned long flags; |
337 | struct list_head *np; | 338 | struct list_head *np; |
338 | #ifdef CONFIG_RCU_BOOST | 339 | #ifdef CONFIG_RCU_BOOST |
339 | struct rt_mutex *rbmp = NULL; | 340 | bool drop_boost_mutex = false; |
340 | #endif /* #ifdef CONFIG_RCU_BOOST */ | 341 | #endif /* #ifdef CONFIG_RCU_BOOST */ |
341 | struct rcu_node *rnp; | 342 | struct rcu_node *rnp; |
342 | int special; | 343 | int special; |
@@ -398,11 +399,8 @@ void rcu_read_unlock_special(struct task_struct *t) | |||
398 | #ifdef CONFIG_RCU_BOOST | 399 | #ifdef CONFIG_RCU_BOOST |
399 | if (&t->rcu_node_entry == rnp->boost_tasks) | 400 | if (&t->rcu_node_entry == rnp->boost_tasks) |
400 | rnp->boost_tasks = np; | 401 | rnp->boost_tasks = np; |
401 | /* Snapshot/clear ->rcu_boost_mutex with rcu_node lock held. */ | 402 | /* Snapshot ->boost_mtx ownership with rcu_node lock held. */ |
402 | if (t->rcu_boost_mutex) { | 403 | drop_boost_mutex = rt_mutex_owner(&rnp->boost_mtx) == t; |
403 | rbmp = t->rcu_boost_mutex; | ||
404 | t->rcu_boost_mutex = NULL; | ||
405 | } | ||
406 | #endif /* #ifdef CONFIG_RCU_BOOST */ | 404 | #endif /* #ifdef CONFIG_RCU_BOOST */ |
407 | 405 | ||
408 | /* | 406 | /* |
@@ -427,8 +425,10 @@ void rcu_read_unlock_special(struct task_struct *t) | |||
427 | 425 | ||
428 | #ifdef CONFIG_RCU_BOOST | 426 | #ifdef CONFIG_RCU_BOOST |
429 | /* Unboost if we were boosted. */ | 427 | /* Unboost if we were boosted. */ |
430 | if (rbmp) | 428 | if (drop_boost_mutex) { |
431 | rt_mutex_unlock(rbmp); | 429 | rt_mutex_unlock(&rnp->boost_mtx); |
430 | complete(&rnp->boost_completion); | ||
431 | } | ||
432 | #endif /* #ifdef CONFIG_RCU_BOOST */ | 432 | #endif /* #ifdef CONFIG_RCU_BOOST */ |
433 | 433 | ||
434 | /* | 434 | /* |
@@ -988,6 +988,7 @@ static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp) | |||
988 | 988 | ||
989 | /* Because preemptible RCU does not exist, no quieting of tasks. */ | 989 | /* Because preemptible RCU does not exist, no quieting of tasks. */ |
990 | static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, unsigned long flags) | 990 | static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, unsigned long flags) |
991 | __releases(rnp->lock) | ||
991 | { | 992 | { |
992 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | 993 | raw_spin_unlock_irqrestore(&rnp->lock, flags); |
993 | } | 994 | } |
@@ -1149,7 +1150,6 @@ static void rcu_wake_cond(struct task_struct *t, int status) | |||
1149 | static int rcu_boost(struct rcu_node *rnp) | 1150 | static int rcu_boost(struct rcu_node *rnp) |
1150 | { | 1151 | { |
1151 | unsigned long flags; | 1152 | unsigned long flags; |
1152 | struct rt_mutex mtx; | ||
1153 | struct task_struct *t; | 1153 | struct task_struct *t; |
1154 | struct list_head *tb; | 1154 | struct list_head *tb; |
1155 | 1155 | ||
@@ -1200,11 +1200,15 @@ static int rcu_boost(struct rcu_node *rnp) | |||
1200 | * section. | 1200 | * section. |
1201 | */ | 1201 | */ |
1202 | t = container_of(tb, struct task_struct, rcu_node_entry); | 1202 | t = container_of(tb, struct task_struct, rcu_node_entry); |
1203 | rt_mutex_init_proxy_locked(&mtx, t); | 1203 | rt_mutex_init_proxy_locked(&rnp->boost_mtx, t); |
1204 | t->rcu_boost_mutex = &mtx; | 1204 | init_completion(&rnp->boost_completion); |
1205 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | 1205 | raw_spin_unlock_irqrestore(&rnp->lock, flags); |
1206 | rt_mutex_lock(&mtx); /* Side effect: boosts task t's priority. */ | 1206 | /* Lock only for side effect: boosts task t's priority. */ |
1207 | rt_mutex_unlock(&mtx); /* Keep lockdep happy. */ | 1207 | rt_mutex_lock(&rnp->boost_mtx); |
1208 | rt_mutex_unlock(&rnp->boost_mtx); /* Then keep lockdep happy. */ | ||
1209 | |||
1210 | /* Wait for boostee to be done w/boost_mtx before reinitializing. */ | ||
1211 | wait_for_completion(&rnp->boost_completion); | ||
1208 | 1212 | ||
1209 | return ACCESS_ONCE(rnp->exp_tasks) != NULL || | 1213 | return ACCESS_ONCE(rnp->exp_tasks) != NULL || |
1210 | ACCESS_ONCE(rnp->boost_tasks) != NULL; | 1214 | ACCESS_ONCE(rnp->boost_tasks) != NULL; |
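The reworked rcu_boost() proxy-locks the per-rcu_node boost_mtx on behalf of the blocked reader, acquires and releases it purely for the priority-inheritance side effect, and then blocks on boost_completion, which rcu_read_unlock_special() completes only after the boosted task has fully released the mutex; only then is it safe to re-initialize boost_mtx for the next boost cycle. The userspace sketch below mimics just that handshake with pthreads; proxy locking and the actual priority boost are not modeled, and all names are stand-ins.

/*
 * Userspace sketch of the boost_mtx/boost_completion handshake added to
 * rcu_boost(): the booster may not recycle the mutex object until the
 * boostee has unlocked it *and* signalled a completion.
 */
#include <pthread.h>
#include <stdio.h>
#include <unistd.h>

static pthread_mutex_t boost_mtx = PTHREAD_MUTEX_INITIALIZER;

/* Minimal "struct completion": flag + mutex + condvar. */
static pthread_mutex_t done_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  done_cv   = PTHREAD_COND_INITIALIZER;
static int done;

static void complete(void)
{
	pthread_mutex_lock(&done_lock);
	done = 1;
	pthread_cond_signal(&done_cv);
	pthread_mutex_unlock(&done_lock);
}

static void wait_for_completion(void)
{
	pthread_mutex_lock(&done_lock);
	while (!done)
		pthread_cond_wait(&done_cv, &done_lock);
	pthread_mutex_unlock(&done_lock);
}

static void *boostee(void *arg)
{
	(void)arg;
	pthread_mutex_lock(&boost_mtx);		/* kernel: proxy-locked on our behalf */
	usleep(1000);				/* finish the RCU read-side section */
	pthread_mutex_unlock(&boost_mtx);	/* rt_mutex_unlock(&rnp->boost_mtx) */
	complete();				/* complete(&rnp->boost_completion) */
	return NULL;
}

int main(void)
{
	pthread_t t;

	pthread_create(&t, NULL, boostee, NULL);

	/* Booster: lock/unlock only for the side effect, as in rcu_boost(). */
	pthread_mutex_lock(&boost_mtx);
	pthread_mutex_unlock(&boost_mtx);

	/* Wait for the boostee to be entirely done with boost_mtx... */
	wait_for_completion();
	pthread_join(t, NULL);

	/* ...and only now is it safe to tear it down / re-initialize it. */
	pthread_mutex_destroy(&boost_mtx);
	pthread_mutex_init(&boost_mtx, NULL);
	puts("boost_mtx safely recycled");
	return 0;
}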
@@ -1256,6 +1260,7 @@ static int rcu_boost_kthread(void *arg) | |||
1256 | * about it going away. | 1260 | * about it going away. |
1257 | */ | 1261 | */ |
1258 | static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags) | 1262 | static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags) |
1263 | __releases(rnp->lock) | ||
1259 | { | 1264 | { |
1260 | struct task_struct *t; | 1265 | struct task_struct *t; |
1261 | 1266 | ||
@@ -1491,6 +1496,7 @@ static void rcu_prepare_kthreads(int cpu) | |||
1491 | #else /* #ifdef CONFIG_RCU_BOOST */ | 1496 | #else /* #ifdef CONFIG_RCU_BOOST */ |
1492 | 1497 | ||
1493 | static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags) | 1498 | static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags) |
1499 | __releases(rnp->lock) | ||
1494 | { | 1500 | { |
1495 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | 1501 | raw_spin_unlock_irqrestore(&rnp->lock, flags); |
1496 | } | 1502 | } |
@@ -2060,6 +2066,22 @@ bool rcu_is_nocb_cpu(int cpu) | |||
2060 | #endif /* #ifndef CONFIG_RCU_NOCB_CPU_ALL */ | 2066 | #endif /* #ifndef CONFIG_RCU_NOCB_CPU_ALL */ |
2061 | 2067 | ||
2062 | /* | 2068 | /* |
2069 | * Kick the leader kthread for this NOCB group. | ||
2070 | */ | ||
2071 | static void wake_nocb_leader(struct rcu_data *rdp, bool force) | ||
2072 | { | ||
2073 | struct rcu_data *rdp_leader = rdp->nocb_leader; | ||
2074 | |||
2075 | if (!ACCESS_ONCE(rdp_leader->nocb_kthread)) | ||
2076 | return; | ||
2077 | if (!ACCESS_ONCE(rdp_leader->nocb_leader_wake) || force) { | ||
2078 | /* Prior xchg orders against prior callback enqueue. */ | ||
2079 | ACCESS_ONCE(rdp_leader->nocb_leader_wake) = true; | ||
2080 | wake_up(&rdp_leader->nocb_wq); | ||
2081 | } | ||
2082 | } | ||
2083 | |||
2084 | /* | ||
2063 | * Enqueue the specified string of rcu_head structures onto the specified | 2085 | * Enqueue the specified string of rcu_head structures onto the specified |
2064 | * CPU's no-CBs lists. The CPU is specified by rdp, the head of the | 2086 | * CPU's no-CBs lists. The CPU is specified by rdp, the head of the |
2065 | * string by rhp, and the tail of the string by rhtp. The non-lazy/lazy | 2087 | * string by rhp, and the tail of the string by rhtp. The non-lazy/lazy |
@@ -2093,7 +2115,8 @@ static void __call_rcu_nocb_enqueue(struct rcu_data *rdp, | |||
2093 | len = atomic_long_read(&rdp->nocb_q_count); | 2115 | len = atomic_long_read(&rdp->nocb_q_count); |
2094 | if (old_rhpp == &rdp->nocb_head) { | 2116 | if (old_rhpp == &rdp->nocb_head) { |
2095 | if (!irqs_disabled_flags(flags)) { | 2117 | if (!irqs_disabled_flags(flags)) { |
2096 | wake_up(&rdp->nocb_wq); /* ... if queue was empty ... */ | 2118 | /* ... if queue was empty ... */ |
2119 | wake_nocb_leader(rdp, false); | ||
2097 | trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, | 2120 | trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, |
2098 | TPS("WakeEmpty")); | 2121 | TPS("WakeEmpty")); |
2099 | } else { | 2122 | } else { |
@@ -2103,7 +2126,8 @@ static void __call_rcu_nocb_enqueue(struct rcu_data *rdp, | |||
2103 | } | 2126 | } |
2104 | rdp->qlen_last_fqs_check = 0; | 2127 | rdp->qlen_last_fqs_check = 0; |
2105 | } else if (len > rdp->qlen_last_fqs_check + qhimark) { | 2128 | } else if (len > rdp->qlen_last_fqs_check + qhimark) { |
2106 | wake_up_process(t); /* ... or if many callbacks queued. */ | 2129 | /* ... or if many callbacks queued. */ |
2130 | wake_nocb_leader(rdp, true); | ||
2107 | rdp->qlen_last_fqs_check = LONG_MAX / 2; | 2131 | rdp->qlen_last_fqs_check = LONG_MAX / 2; |
2108 | trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("WakeOvf")); | 2132 | trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("WakeOvf")); |
2109 | } else { | 2133 | } else { |
@@ -2213,13 +2237,150 @@ static void rcu_nocb_wait_gp(struct rcu_data *rdp) | |||
2213 | } | 2237 | } |
2214 | 2238 | ||
2215 | /* | 2239 | /* |
2240 | * Leaders come here to wait for additional callbacks to show up. | ||
2241 | * This function does not return until callbacks appear. | ||
2242 | */ | ||
2243 | static void nocb_leader_wait(struct rcu_data *my_rdp) | ||
2244 | { | ||
2245 | bool firsttime = true; | ||
2246 | bool gotcbs; | ||
2247 | struct rcu_data *rdp; | ||
2248 | struct rcu_head **tail; | ||
2249 | |||
2250 | wait_again: | ||
2251 | |||
2252 | /* Wait for callbacks to appear. */ | ||
2253 | if (!rcu_nocb_poll) { | ||
2254 | trace_rcu_nocb_wake(my_rdp->rsp->name, my_rdp->cpu, "Sleep"); | ||
2255 | wait_event_interruptible(my_rdp->nocb_wq, | ||
2256 | ACCESS_ONCE(my_rdp->nocb_leader_wake)); | ||
2257 | /* Memory barrier handled by smp_mb() calls below and repoll. */ | ||
2258 | } else if (firsttime) { | ||
2259 | firsttime = false; /* Don't drown trace log with "Poll"! */ | ||
2260 | trace_rcu_nocb_wake(my_rdp->rsp->name, my_rdp->cpu, "Poll"); | ||
2261 | } | ||
2262 | |||
2263 | /* | ||
2264 | * Each pass through the following loop checks a follower for CBs. | ||
2265 | * We are our own first follower. Any CBs found are moved to | ||
2266 | * nocb_gp_head, where they await a grace period. | ||
2267 | */ | ||
2268 | gotcbs = false; | ||
2269 | for (rdp = my_rdp; rdp; rdp = rdp->nocb_next_follower) { | ||
2270 | rdp->nocb_gp_head = ACCESS_ONCE(rdp->nocb_head); | ||
2271 | if (!rdp->nocb_gp_head) | ||
2272 | continue; /* No CBs here, try next follower. */ | ||
2273 | |||
2274 | /* Move callbacks to wait-for-GP list, which is empty. */ | ||
2275 | ACCESS_ONCE(rdp->nocb_head) = NULL; | ||
2276 | rdp->nocb_gp_tail = xchg(&rdp->nocb_tail, &rdp->nocb_head); | ||
2277 | rdp->nocb_gp_count = atomic_long_xchg(&rdp->nocb_q_count, 0); | ||
2278 | rdp->nocb_gp_count_lazy = | ||
2279 | atomic_long_xchg(&rdp->nocb_q_count_lazy, 0); | ||
2280 | gotcbs = true; | ||
2281 | } | ||
2282 | |||
2283 | /* | ||
2284 | * If there were no callbacks, sleep a bit, rescan after a | ||
2285 | * memory barrier, and go retry. | ||
2286 | */ | ||
2287 | if (unlikely(!gotcbs)) { | ||
2288 | if (!rcu_nocb_poll) | ||
2289 | trace_rcu_nocb_wake(my_rdp->rsp->name, my_rdp->cpu, | ||
2290 | "WokeEmpty"); | ||
2291 | flush_signals(current); | ||
2292 | schedule_timeout_interruptible(1); | ||
2293 | |||
2294 | /* Rescan in case we were a victim of memory ordering. */ | ||
2295 | my_rdp->nocb_leader_wake = false; | ||
2296 | smp_mb(); /* Ensure _wake false before scan. */ | ||
2297 | for (rdp = my_rdp; rdp; rdp = rdp->nocb_next_follower) | ||
2298 | if (ACCESS_ONCE(rdp->nocb_head)) { | ||
2299 | /* Found CB, so short-circuit next wait. */ | ||
2300 | my_rdp->nocb_leader_wake = true; | ||
2301 | break; | ||
2302 | } | ||
2303 | goto wait_again; | ||
2304 | } | ||
2305 | |||
2306 | /* Wait for one grace period. */ | ||
2307 | rcu_nocb_wait_gp(my_rdp); | ||
2308 | |||
2309 | /* | ||
2310 | * We left ->nocb_leader_wake set to reduce cache thrashing. | ||
2311 | * We clear it now, but recheck for new callbacks while | ||
2312 | * traversing our follower list. | ||
2313 | */ | ||
2314 | my_rdp->nocb_leader_wake = false; | ||
2315 | smp_mb(); /* Ensure _wake false before scan of ->nocb_head. */ | ||
2316 | |||
2317 | /* Each pass through the following loop wakes a follower, if needed. */ | ||
2318 | for (rdp = my_rdp; rdp; rdp = rdp->nocb_next_follower) { | ||
2319 | if (ACCESS_ONCE(rdp->nocb_head)) | ||
2320 | my_rdp->nocb_leader_wake = true; /* No need to wait. */ | ||
2321 | if (!rdp->nocb_gp_head) | ||
2322 | continue; /* No CBs, so no need to wake follower. */ | ||
2323 | |||
2324 | /* Append callbacks to follower's "done" list. */ | ||
2325 | tail = xchg(&rdp->nocb_follower_tail, rdp->nocb_gp_tail); | ||
2326 | *tail = rdp->nocb_gp_head; | ||
2327 | atomic_long_add(rdp->nocb_gp_count, &rdp->nocb_follower_count); | ||
2328 | atomic_long_add(rdp->nocb_gp_count_lazy, | ||
2329 | &rdp->nocb_follower_count_lazy); | ||
2330 | if (rdp != my_rdp && tail == &rdp->nocb_follower_head) { | ||
2331 | /* | ||
2332 | * List was empty, wake up the follower. | ||
2333 | * Memory barriers supplied by atomic_long_add(). | ||
2334 | */ | ||
2335 | wake_up(&rdp->nocb_wq); | ||
2336 | } | ||
2337 | } | ||
2338 | |||
2339 | /* If we (the leader) don't have CBs, go wait some more. */ | ||
2340 | if (!my_rdp->nocb_follower_head) | ||
2341 | goto wait_again; | ||
2342 | } | ||
2343 | |||
2344 | /* | ||
2345 | * Followers come here to wait for additional callbacks to show up. | ||
2346 | * This function does not return until callbacks appear. | ||
2347 | */ | ||
2348 | static void nocb_follower_wait(struct rcu_data *rdp) | ||
2349 | { | ||
2350 | bool firsttime = true; | ||
2351 | |||
2352 | for (;;) { | ||
2353 | if (!rcu_nocb_poll) { | ||
2354 | trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, | ||
2355 | "FollowerSleep"); | ||
2356 | wait_event_interruptible(rdp->nocb_wq, | ||
2357 | ACCESS_ONCE(rdp->nocb_follower_head)); | ||
2358 | } else if (firsttime) { | ||
2359 | /* Don't drown trace log with "Poll"! */ | ||
2360 | firsttime = false; | ||
2361 | trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, "Poll"); | ||
2362 | } | ||
2363 | if (smp_load_acquire(&rdp->nocb_follower_head)) { | ||
2364 | /* ^^^ Ensure CB invocation follows _head test. */ | ||
2365 | return; | ||
2366 | } | ||
2367 | if (!rcu_nocb_poll) | ||
2368 | trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, | ||
2369 | "WokeEmpty"); | ||
2370 | flush_signals(current); | ||
2371 | schedule_timeout_interruptible(1); | ||
2372 | } | ||
2373 | } | ||
2374 | |||
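Both of the new wait loops above shuffle callbacks between singly linked lists described by a head pointer plus a tail pointer that addresses the last element's ->next field (or the head itself when the list is empty); splicing one such list onto another is then a single pointer store plus a tail update, which the kernel performs with xchg() so it can race safely with enqueuers. The sketch below shows the list representation and the splice in plain single-threaded C; the structure and function names are illustrative only.

/*
 * Sketch of the head/tail-pointer list handoff used by nocb_leader_wait().
 */
#include <stdio.h>

struct cb {
	struct cb *next;
	int id;
};

struct cblist {
	struct cb *head;
	struct cb **tail;	/* points at head, or at last element's ->next */
};

static void cblist_init(struct cblist *l)
{
	l->head = NULL;
	l->tail = &l->head;
}

static void cblist_enqueue(struct cblist *l, struct cb *c)
{
	c->next = NULL;
	*l->tail = c;
	l->tail = &c->next;
}

/* Move everything from @from onto the end of @to, leaving @from empty. */
static void cblist_splice_tail(struct cblist *to, struct cblist *from)
{
	if (from->head == NULL)
		return;
	*to->tail = from->head;	/* like "*tail = rdp->nocb_gp_head" */
	to->tail = from->tail;
	cblist_init(from);	/* like the xchg back to &rdp->nocb_head */
}

int main(void)
{
	struct cblist queued, ready;	/* ->nocb_head list, ->nocb_follower list */
	struct cb cbs[3] = { { .id = 0 }, { .id = 1 }, { .id = 2 } };

	cblist_init(&queued);
	cblist_init(&ready);
	for (int i = 0; i < 3; i++)
		cblist_enqueue(&queued, &cbs[i]);

	cblist_splice_tail(&ready, &queued);	/* leader hands CBs to follower */

	for (struct cb *c = ready.head; c; c = c->next)
		printf("invoke callback %d\n", c->id);
	return 0;
}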
2375 | /* | ||
2216 | * Per-rcu_data kthread, but only for no-CBs CPUs. Each kthread invokes | 2376 | * Per-rcu_data kthread, but only for no-CBs CPUs. Each kthread invokes |
2217 | * callbacks queued by the corresponding no-CBs CPU. | 2377 | * callbacks queued by the corresponding no-CBs CPU, however, there is |
2378 | * an optional leader-follower relationship so that the grace-period | ||
2379 | * kthreads don't have to do quite so many wakeups. | ||
2218 | */ | 2380 | */ |
2219 | static int rcu_nocb_kthread(void *arg) | 2381 | static int rcu_nocb_kthread(void *arg) |
2220 | { | 2382 | { |
2221 | int c, cl; | 2383 | int c, cl; |
2222 | bool firsttime = 1; | ||
2223 | struct rcu_head *list; | 2384 | struct rcu_head *list; |
2224 | struct rcu_head *next; | 2385 | struct rcu_head *next; |
2225 | struct rcu_head **tail; | 2386 | struct rcu_head **tail; |
@@ -2227,41 +2388,22 @@ static int rcu_nocb_kthread(void *arg) | |||
2227 | 2388 | ||
2228 | /* Each pass through this loop invokes one batch of callbacks */ | 2389 | /* Each pass through this loop invokes one batch of callbacks */ |
2229 | for (;;) { | 2390 | for (;;) { |
2230 | /* If not polling, wait for next batch of callbacks. */ | 2391 | /* Wait for callbacks. */ |
2231 | if (!rcu_nocb_poll) { | 2392 | if (rdp->nocb_leader == rdp) |
2232 | trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, | 2393 | nocb_leader_wait(rdp); |
2233 | TPS("Sleep")); | 2394 | else |
2234 | wait_event_interruptible(rdp->nocb_wq, rdp->nocb_head); | 2395 | nocb_follower_wait(rdp); |
2235 | /* Memory barrier provide by xchg() below. */ | 2396 | |
2236 | } else if (firsttime) { | 2397 | /* Pull the ready-to-invoke callbacks onto local list. */ |
2237 | firsttime = 0; | 2398 | list = ACCESS_ONCE(rdp->nocb_follower_head); |
2238 | trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, | 2399 | BUG_ON(!list); |
2239 | TPS("Poll")); | 2400 | trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, "WokeNonEmpty"); |
2240 | } | 2401 | ACCESS_ONCE(rdp->nocb_follower_head) = NULL; |
2241 | list = ACCESS_ONCE(rdp->nocb_head); | 2402 | tail = xchg(&rdp->nocb_follower_tail, &rdp->nocb_follower_head); |
2242 | if (!list) { | 2403 | c = atomic_long_xchg(&rdp->nocb_follower_count, 0); |
2243 | if (!rcu_nocb_poll) | 2404 | cl = atomic_long_xchg(&rdp->nocb_follower_count_lazy, 0); |
2244 | trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, | 2405 | rdp->nocb_p_count += c; |
2245 | TPS("WokeEmpty")); | 2406 | rdp->nocb_p_count_lazy += cl; |
2246 | schedule_timeout_interruptible(1); | ||
2247 | flush_signals(current); | ||
2248 | continue; | ||
2249 | } | ||
2250 | firsttime = 1; | ||
2251 | trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, | ||
2252 | TPS("WokeNonEmpty")); | ||
2253 | |||
2254 | /* | ||
2255 | * Extract queued callbacks, update counts, and wait | ||
2256 | * for a grace period to elapse. | ||
2257 | */ | ||
2258 | ACCESS_ONCE(rdp->nocb_head) = NULL; | ||
2259 | tail = xchg(&rdp->nocb_tail, &rdp->nocb_head); | ||
2260 | c = atomic_long_xchg(&rdp->nocb_q_count, 0); | ||
2261 | cl = atomic_long_xchg(&rdp->nocb_q_count_lazy, 0); | ||
2262 | ACCESS_ONCE(rdp->nocb_p_count) += c; | ||
2263 | ACCESS_ONCE(rdp->nocb_p_count_lazy) += cl; | ||
2264 | rcu_nocb_wait_gp(rdp); | ||
2265 | 2407 | ||
2266 | /* Each pass through the following loop invokes a callback. */ | 2408 | /* Each pass through the following loop invokes a callback. */ |
2267 | trace_rcu_batch_start(rdp->rsp->name, cl, c, -1); | 2409 | trace_rcu_batch_start(rdp->rsp->name, cl, c, -1); |
@@ -2305,7 +2447,7 @@ static void do_nocb_deferred_wakeup(struct rcu_data *rdp) | |||
2305 | if (!rcu_nocb_need_deferred_wakeup(rdp)) | 2447 | if (!rcu_nocb_need_deferred_wakeup(rdp)) |
2306 | return; | 2448 | return; |
2307 | ACCESS_ONCE(rdp->nocb_defer_wakeup) = false; | 2449 | ACCESS_ONCE(rdp->nocb_defer_wakeup) = false; |
2308 | wake_up(&rdp->nocb_wq); | 2450 | wake_nocb_leader(rdp, false); |
2309 | trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("DeferredWakeEmpty")); | 2451 | trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("DeferredWakeEmpty")); |
2310 | } | 2452 | } |
2311 | 2453 | ||
@@ -2314,19 +2456,57 @@ static void __init rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp) | |||
2314 | { | 2456 | { |
2315 | rdp->nocb_tail = &rdp->nocb_head; | 2457 | rdp->nocb_tail = &rdp->nocb_head; |
2316 | init_waitqueue_head(&rdp->nocb_wq); | 2458 | init_waitqueue_head(&rdp->nocb_wq); |
2459 | rdp->nocb_follower_tail = &rdp->nocb_follower_head; | ||
2317 | } | 2460 | } |
2318 | 2461 | ||
2319 | /* Create a kthread for each RCU flavor for each no-CBs CPU. */ | 2462 | /* How many follower CPU IDs per leader? Default of -1 for sqrt(nr_cpu_ids). */ |
2463 | static int rcu_nocb_leader_stride = -1; | ||
2464 | module_param(rcu_nocb_leader_stride, int, 0444); | ||
2465 | |||
2466 | /* | ||
2467 | * Create a kthread for each RCU flavor for each no-CBs CPU. | ||
2468 | * Also initialize leader-follower relationships. | ||
2469 | */ | ||
2320 | static void __init rcu_spawn_nocb_kthreads(struct rcu_state *rsp) | 2470 | static void __init rcu_spawn_nocb_kthreads(struct rcu_state *rsp) |
2321 | { | 2471 | { |
2322 | int cpu; | 2472 | int cpu; |
2473 | int ls = rcu_nocb_leader_stride; | ||
2474 | int nl = 0; /* Next leader. */ | ||
2323 | struct rcu_data *rdp; | 2475 | struct rcu_data *rdp; |
2476 | struct rcu_data *rdp_leader = NULL; /* Suppress misguided gcc warn. */ | ||
2477 | struct rcu_data *rdp_prev = NULL; | ||
2324 | struct task_struct *t; | 2478 | struct task_struct *t; |
2325 | 2479 | ||
2326 | if (rcu_nocb_mask == NULL) | 2480 | if (rcu_nocb_mask == NULL) |
2327 | return; | 2481 | return; |
2482 | #if defined(CONFIG_NO_HZ_FULL) && !defined(CONFIG_NO_HZ_FULL_ALL) | ||
2483 | if (tick_nohz_full_running) | ||
2484 | cpumask_or(rcu_nocb_mask, rcu_nocb_mask, tick_nohz_full_mask); | ||
2485 | #endif /* #if defined(CONFIG_NO_HZ_FULL) && !defined(CONFIG_NO_HZ_FULL_ALL) */ | ||
2486 | if (ls == -1) { | ||
2487 | ls = int_sqrt(nr_cpu_ids); | ||
2488 | rcu_nocb_leader_stride = ls; | ||
2489 | } | ||
2490 | |||
2491 | /* | ||
2492 | * Each pass through this loop sets up one rcu_data structure and | ||
2493 | * spawns one rcu_nocb_kthread(). | ||
2494 | */ | ||
2328 | for_each_cpu(cpu, rcu_nocb_mask) { | 2495 | for_each_cpu(cpu, rcu_nocb_mask) { |
2329 | rdp = per_cpu_ptr(rsp->rda, cpu); | 2496 | rdp = per_cpu_ptr(rsp->rda, cpu); |
2497 | if (rdp->cpu >= nl) { | ||
2498 | /* New leader, set up for followers & next leader. */ | ||
2499 | nl = DIV_ROUND_UP(rdp->cpu + 1, ls) * ls; | ||
2500 | rdp->nocb_leader = rdp; | ||
2501 | rdp_leader = rdp; | ||
2502 | } else { | ||
2503 | /* Another follower, link to previous leader. */ | ||
2504 | rdp->nocb_leader = rdp_leader; | ||
2505 | rdp_prev->nocb_next_follower = rdp; | ||
2506 | } | ||
2507 | rdp_prev = rdp; | ||
2508 | |||
2509 | /* Spawn the kthread for this CPU. */ | ||
2330 | t = kthread_run(rcu_nocb_kthread, rdp, | 2510 | t = kthread_run(rcu_nocb_kthread, rdp, |
2331 | "rcuo%c/%d", rsp->abbr, cpu); | 2511 | "rcuo%c/%d", rsp->abbr, cpu); |
2332 | BUG_ON(IS_ERR(t)); | 2512 | BUG_ON(IS_ERR(t)); |
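The leader/follower topology set up here defaults to a stride of int_sqrt(nr_cpu_ids): a no-CBs CPU becomes the next leader when its number reaches the next stride boundary, and every CPU up to that boundary follows it. The arithmetic is easy to check in isolation; the sketch below assumes 16 CPUs and mirrors the DIV_ROUND_UP computation from the patch, with nothing actually spawned.

/*
 * Sketch of how rcu_spawn_nocb_kthreads() carves no-CBs CPUs into
 * leader/follower groups.  Standalone arithmetic only.
 */
#include <stdio.h>

#define DIV_ROUND_UP(n, d)	(((n) + (d) - 1) / (d))

int main(void)
{
	int nr_cpu_ids = 16;
	int ls = 1;			/* int_sqrt(nr_cpu_ids) in the kernel */
	int nl = 0;			/* next leader boundary */
	int leader = -1;

	while ((ls + 1) * (ls + 1) <= nr_cpu_ids)
		ls++;			/* integer square root */

	for (int cpu = 0; cpu < nr_cpu_ids; cpu++) {
		if (cpu >= nl) {
			nl = DIV_ROUND_UP(cpu + 1, ls) * ls;
			leader = cpu;	/* this CPU leads the new group */
		}
		printf("cpu %2d -> leader %2d\n", cpu, leader);
	}
	return 0;
}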
@@ -2404,7 +2584,7 @@ static bool init_nocb_callback_list(struct rcu_data *rdp) | |||
2404 | * if an adaptive-ticks CPU is failing to respond to the current grace | 2584 | * if an adaptive-ticks CPU is failing to respond to the current grace |
2405 | * period and has not been idle from an RCU perspective, kick it. | 2585 | * period and has not been idle from an RCU perspective, kick it. |
2406 | */ | 2586 | */ |
2407 | static void rcu_kick_nohz_cpu(int cpu) | 2587 | static void __maybe_unused rcu_kick_nohz_cpu(int cpu) |
2408 | { | 2588 | { |
2409 | #ifdef CONFIG_NO_HZ_FULL | 2589 | #ifdef CONFIG_NO_HZ_FULL |
2410 | if (tick_nohz_full_cpu(cpu)) | 2590 | if (tick_nohz_full_cpu(cpu)) |
@@ -2843,12 +3023,16 @@ static bool rcu_nohz_full_cpu(struct rcu_state *rsp) | |||
2843 | */ | 3023 | */ |
2844 | static void rcu_bind_gp_kthread(void) | 3024 | static void rcu_bind_gp_kthread(void) |
2845 | { | 3025 | { |
2846 | #ifdef CONFIG_NO_HZ_FULL | 3026 | int __maybe_unused cpu; |
2847 | int cpu = ACCESS_ONCE(tick_do_timer_cpu); | ||
2848 | 3027 | ||
2849 | if (cpu < 0 || cpu >= nr_cpu_ids) | 3028 | if (!tick_nohz_full_enabled()) |
2850 | return; | 3029 | return; |
2851 | if (raw_smp_processor_id() != cpu) | 3030 | #ifdef CONFIG_NO_HZ_FULL_SYSIDLE |
3031 | cpu = tick_do_timer_cpu; | ||
3032 | if (cpu >= 0 && cpu < nr_cpu_ids && raw_smp_processor_id() != cpu) | ||
2852 | set_cpus_allowed_ptr(current, cpumask_of(cpu)); | 3033 | set_cpus_allowed_ptr(current, cpumask_of(cpu)); |
2853 | #endif /* #ifdef CONFIG_NO_HZ_FULL */ | 3034 | #else /* #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */ |
3035 | if (!is_housekeeping_cpu(raw_smp_processor_id())) | ||
3036 | housekeeping_affine(current); | ||
3037 | #endif /* #else #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */ | ||
2854 | } | 3038 | } |
diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index a2aeb4df0f60..4056d7992a6c 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c | |||
@@ -90,9 +90,6 @@ void __rcu_read_unlock(void) | |||
90 | } else { | 90 | } else { |
91 | barrier(); /* critical section before exit code. */ | 91 | barrier(); /* critical section before exit code. */ |
92 | t->rcu_read_lock_nesting = INT_MIN; | 92 | t->rcu_read_lock_nesting = INT_MIN; |
93 | #ifdef CONFIG_PROVE_RCU_DELAY | ||
94 | udelay(10); /* Make preemption more probable. */ | ||
95 | #endif /* #ifdef CONFIG_PROVE_RCU_DELAY */ | ||
96 | barrier(); /* assign before ->rcu_read_unlock_special load */ | 93 | barrier(); /* assign before ->rcu_read_unlock_special load */ |
97 | if (unlikely(ACCESS_ONCE(t->rcu_read_unlock_special))) | 94 | if (unlikely(ACCESS_ONCE(t->rcu_read_unlock_special))) |
98 | rcu_read_unlock_special(t); | 95 | rcu_read_unlock_special(t); |
@@ -200,12 +197,12 @@ void wait_rcu_gp(call_rcu_func_t crf) | |||
200 | EXPORT_SYMBOL_GPL(wait_rcu_gp); | 197 | EXPORT_SYMBOL_GPL(wait_rcu_gp); |
201 | 198 | ||
202 | #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD | 199 | #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD |
203 | static inline void debug_init_rcu_head(struct rcu_head *head) | 200 | void init_rcu_head(struct rcu_head *head) |
204 | { | 201 | { |
205 | debug_object_init(head, &rcuhead_debug_descr); | 202 | debug_object_init(head, &rcuhead_debug_descr); |
206 | } | 203 | } |
207 | 204 | ||
208 | static inline void debug_rcu_head_free(struct rcu_head *head) | 205 | void destroy_rcu_head(struct rcu_head *head) |
209 | { | 206 | { |
210 | debug_object_free(head, &rcuhead_debug_descr); | 207 | debug_object_free(head, &rcuhead_debug_descr); |
211 | } | 208 | } |
@@ -350,21 +347,3 @@ static int __init check_cpu_stall_init(void) | |||
350 | early_initcall(check_cpu_stall_init); | 347 | early_initcall(check_cpu_stall_init); |
351 | 348 | ||
352 | #endif /* #ifdef CONFIG_RCU_STALL_COMMON */ | 349 | #endif /* #ifdef CONFIG_RCU_STALL_COMMON */ |
353 | |||
354 | /* | ||
355 | * Hooks for cond_resched() and friends to avoid RCU CPU stall warnings. | ||
356 | */ | ||
357 | |||
358 | DEFINE_PER_CPU(int, rcu_cond_resched_count); | ||
359 | |||
360 | /* | ||
361 | * Report a set of RCU quiescent states, for use by cond_resched() | ||
362 | * and friends. Out of line due to being called infrequently. | ||
363 | */ | ||
364 | void rcu_resched(void) | ||
365 | { | ||
366 | preempt_disable(); | ||
367 | __this_cpu_write(rcu_cond_resched_count, 0); | ||
368 | rcu_note_context_switch(smp_processor_id()); | ||
369 | preempt_enable(); | ||
370 | } | ||
diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 3bdf01b494fe..1211575a2208 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c | |||
@@ -139,6 +139,8 @@ void update_rq_clock(struct rq *rq) | |||
139 | return; | 139 | return; |
140 | 140 | ||
141 | delta = sched_clock_cpu(cpu_of(rq)) - rq->clock; | 141 | delta = sched_clock_cpu(cpu_of(rq)) - rq->clock; |
142 | if (delta < 0) | ||
143 | return; | ||
142 | rq->clock += delta; | 144 | rq->clock += delta; |
143 | update_rq_clock_task(rq, delta); | 145 | update_rq_clock_task(rq, delta); |
144 | } | 146 | } |
@@ -243,6 +245,7 @@ sched_feat_write(struct file *filp, const char __user *ubuf, | |||
243 | char buf[64]; | 245 | char buf[64]; |
244 | char *cmp; | 246 | char *cmp; |
245 | int i; | 247 | int i; |
248 | struct inode *inode; | ||
246 | 249 | ||
247 | if (cnt > 63) | 250 | if (cnt > 63) |
248 | cnt = 63; | 251 | cnt = 63; |
@@ -253,7 +256,11 @@ sched_feat_write(struct file *filp, const char __user *ubuf, | |||
253 | buf[cnt] = 0; | 256 | buf[cnt] = 0; |
254 | cmp = strstrip(buf); | 257 | cmp = strstrip(buf); |
255 | 258 | ||
259 | /* Ensure the static_key remains in a consistent state */ | ||
260 | inode = file_inode(filp); | ||
261 | mutex_lock(&inode->i_mutex); | ||
256 | i = sched_feat_set(cmp); | 262 | i = sched_feat_set(cmp); |
263 | mutex_unlock(&inode->i_mutex); | ||
257 | if (i == __SCHED_FEAT_NR) | 264 | if (i == __SCHED_FEAT_NR) |
258 | return -EINVAL; | 265 | return -EINVAL; |
259 | 266 | ||
@@ -587,30 +594,31 @@ static bool set_nr_if_polling(struct task_struct *p) | |||
587 | #endif | 594 | #endif |
588 | 595 | ||
589 | /* | 596 | /* |
590 | * resched_task - mark a task 'to be rescheduled now'. | 597 | * resched_curr - mark rq's current task 'to be rescheduled now'. |
591 | * | 598 | * |
592 | * On UP this means the setting of the need_resched flag, on SMP it | 599 | * On UP this means the setting of the need_resched flag, on SMP it |
593 | * might also involve a cross-CPU call to trigger the scheduler on | 600 | * might also involve a cross-CPU call to trigger the scheduler on |
594 | * the target CPU. | 601 | * the target CPU. |
595 | */ | 602 | */ |
596 | void resched_task(struct task_struct *p) | 603 | void resched_curr(struct rq *rq) |
597 | { | 604 | { |
605 | struct task_struct *curr = rq->curr; | ||
598 | int cpu; | 606 | int cpu; |
599 | 607 | ||
600 | lockdep_assert_held(&task_rq(p)->lock); | 608 | lockdep_assert_held(&rq->lock); |
601 | 609 | ||
602 | if (test_tsk_need_resched(p)) | 610 | if (test_tsk_need_resched(curr)) |
603 | return; | 611 | return; |
604 | 612 | ||
605 | cpu = task_cpu(p); | 613 | cpu = cpu_of(rq); |
606 | 614 | ||
607 | if (cpu == smp_processor_id()) { | 615 | if (cpu == smp_processor_id()) { |
608 | set_tsk_need_resched(p); | 616 | set_tsk_need_resched(curr); |
609 | set_preempt_need_resched(); | 617 | set_preempt_need_resched(); |
610 | return; | 618 | return; |
611 | } | 619 | } |
612 | 620 | ||
613 | if (set_nr_and_not_polling(p)) | 621 | if (set_nr_and_not_polling(curr)) |
614 | smp_send_reschedule(cpu); | 622 | smp_send_reschedule(cpu); |
615 | else | 623 | else |
616 | trace_sched_wake_idle_without_ipi(cpu); | 624 | trace_sched_wake_idle_without_ipi(cpu); |
@@ -623,7 +631,7 @@ void resched_cpu(int cpu) | |||
623 | 631 | ||
624 | if (!raw_spin_trylock_irqsave(&rq->lock, flags)) | 632 | if (!raw_spin_trylock_irqsave(&rq->lock, flags)) |
625 | return; | 633 | return; |
626 | resched_task(cpu_curr(cpu)); | 634 | resched_curr(rq); |
627 | raw_spin_unlock_irqrestore(&rq->lock, flags); | 635 | raw_spin_unlock_irqrestore(&rq->lock, flags); |
628 | } | 636 | } |
629 | 637 | ||
@@ -684,10 +692,16 @@ static void wake_up_idle_cpu(int cpu) | |||
684 | 692 | ||
685 | static bool wake_up_full_nohz_cpu(int cpu) | 693 | static bool wake_up_full_nohz_cpu(int cpu) |
686 | { | 694 | { |
695 | /* | ||
696 | * We just need the target to call irq_exit() and re-evaluate | ||
697 | * the next tick. The nohz full kick at least implies that. | ||
698 | * If needed we can still optimize that later with an | ||
699 | * empty IRQ. | ||
700 | */ | ||
687 | if (tick_nohz_full_cpu(cpu)) { | 701 | if (tick_nohz_full_cpu(cpu)) { |
688 | if (cpu != smp_processor_id() || | 702 | if (cpu != smp_processor_id() || |
689 | tick_nohz_tick_stopped()) | 703 | tick_nohz_tick_stopped()) |
690 | smp_send_reschedule(cpu); | 704 | tick_nohz_full_kick_cpu(cpu); |
691 | return true; | 705 | return true; |
692 | } | 706 | } |
693 | 707 | ||
@@ -730,18 +744,15 @@ static inline bool got_nohz_idle_kick(void) | |||
730 | #ifdef CONFIG_NO_HZ_FULL | 744 | #ifdef CONFIG_NO_HZ_FULL |
731 | bool sched_can_stop_tick(void) | 745 | bool sched_can_stop_tick(void) |
732 | { | 746 | { |
733 | struct rq *rq; | 747 | /* |
734 | 748 | * More than one running task need preemption. | |
735 | rq = this_rq(); | 749 | * nr_running update is assumed to be visible |
736 | 750 | * after IPI is sent from wakers. | |
737 | /* Make sure rq->nr_running update is visible after the IPI */ | 751 | */ |
738 | smp_rmb(); | 752 | if (this_rq()->nr_running > 1) |
739 | 753 | return false; | |
740 | /* More than one running task need preemption */ | ||
741 | if (rq->nr_running > 1) | ||
742 | return false; | ||
743 | 754 | ||
744 | return true; | 755 | return true; |
745 | } | 756 | } |
746 | #endif /* CONFIG_NO_HZ_FULL */ | 757 | #endif /* CONFIG_NO_HZ_FULL */ |
747 | 758 | ||
@@ -1022,7 +1033,7 @@ void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags) | |||
1022 | if (class == rq->curr->sched_class) | 1033 | if (class == rq->curr->sched_class) |
1023 | break; | 1034 | break; |
1024 | if (class == p->sched_class) { | 1035 | if (class == p->sched_class) { |
1025 | resched_task(rq->curr); | 1036 | resched_curr(rq); |
1026 | break; | 1037 | break; |
1027 | } | 1038 | } |
1028 | } | 1039 | } |
@@ -1568,9 +1579,7 @@ void scheduler_ipi(void) | |||
1568 | */ | 1579 | */ |
1569 | preempt_fold_need_resched(); | 1580 | preempt_fold_need_resched(); |
1570 | 1581 | ||
1571 | if (llist_empty(&this_rq()->wake_list) | 1582 | if (llist_empty(&this_rq()->wake_list) && !got_nohz_idle_kick()) |
1572 | && !tick_nohz_full_cpu(smp_processor_id()) | ||
1573 | && !got_nohz_idle_kick()) | ||
1574 | return; | 1583 | return; |
1575 | 1584 | ||
1576 | /* | 1585 | /* |
@@ -1587,7 +1596,6 @@ void scheduler_ipi(void) | |||
1587 | * somewhat pessimize the simple resched case. | 1596 | * somewhat pessimize the simple resched case. |
1588 | */ | 1597 | */ |
1589 | irq_enter(); | 1598 | irq_enter(); |
1590 | tick_nohz_full_check(); | ||
1591 | sched_ttwu_pending(); | 1599 | sched_ttwu_pending(); |
1592 | 1600 | ||
1593 | /* | 1601 | /* |
@@ -2431,7 +2439,12 @@ static u64 do_task_delta_exec(struct task_struct *p, struct rq *rq) | |||
2431 | { | 2439 | { |
2432 | u64 ns = 0; | 2440 | u64 ns = 0; |
2433 | 2441 | ||
2434 | if (task_current(rq, p)) { | 2442 | /* |
2443 | * Must be ->curr _and_ ->on_rq. If dequeued, we would | ||
2444 | * project cycles that may never be accounted to this | ||
2445 | * thread, breaking clock_gettime(). | ||
2446 | */ | ||
2447 | if (task_current(rq, p) && p->on_rq) { | ||
2435 | update_rq_clock(rq); | 2448 | update_rq_clock(rq); |
2436 | ns = rq_clock_task(rq) - p->se.exec_start; | 2449 | ns = rq_clock_task(rq) - p->se.exec_start; |
2437 | if ((s64)ns < 0) | 2450 | if ((s64)ns < 0) |
@@ -2474,8 +2487,10 @@ unsigned long long task_sched_runtime(struct task_struct *p) | |||
2474 | * If we race with it leaving cpu, we'll take a lock. So we're correct. | 2487 | * If we race with it leaving cpu, we'll take a lock. So we're correct. |
2475 | * If we race with it entering cpu, unaccounted time is 0. This is | 2488 | * If we race with it entering cpu, unaccounted time is 0. This is |
2476 | * indistinguishable from the read occurring a few cycles earlier. | 2489 | * indistinguishable from the read occurring a few cycles earlier. |
2490 | * If we see ->on_cpu without ->on_rq, the task is leaving, and has | ||
2491 | * been accounted, so we're correct here as well. | ||
2477 | */ | 2492 | */ |
2478 | if (!p->on_cpu) | 2493 | if (!p->on_cpu || !p->on_rq) |
2479 | return p->se.sum_exec_runtime; | 2494 | return p->se.sum_exec_runtime; |
2480 | #endif | 2495 | #endif |
2481 | 2496 | ||
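
Editor's note: the two hunks above require a task to be both ->curr and ->on_rq before unaccounted runtime is projected onto it, which is what keeps per-thread CPU-clock reads sane from userspace (the comment's clock_gettime() reference). As a reminder of the consumer this protects, here is a plain POSIX read of a thread's CPU clock; nothing kernel-internal is reproduced.

#include <stdio.h>
#include <time.h>

/* Read this thread's accumulated CPU time; the kernel-side ->curr && ->on_rq
 * check keeps the reported value from including cycles that were never
 * actually consumed by the thread. */
int main(void)
{
    struct timespec ts;

    if (clock_gettime(CLOCK_THREAD_CPUTIME_ID, &ts) != 0) {
        perror("clock_gettime");
        return 1;
    }
    printf("thread cpu time: %ld.%09ld s\n", (long)ts.tv_sec, ts.tv_nsec);
    return 0;
}
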
@@ -2971,7 +2986,6 @@ void rt_mutex_setprio(struct task_struct *p, int prio) | |||
2971 | } | 2986 | } |
2972 | 2987 | ||
2973 | trace_sched_pi_setprio(p, prio); | 2988 | trace_sched_pi_setprio(p, prio); |
2974 | p->pi_top_task = rt_mutex_get_top_task(p); | ||
2975 | oldprio = p->prio; | 2989 | oldprio = p->prio; |
2976 | prev_class = p->sched_class; | 2990 | prev_class = p->sched_class; |
2977 | on_rq = p->on_rq; | 2991 | on_rq = p->on_rq; |
@@ -2991,8 +3005,9 @@ void rt_mutex_setprio(struct task_struct *p, int prio) | |||
2991 | * running task | 3005 | * running task |
2992 | */ | 3006 | */ |
2993 | if (dl_prio(prio)) { | 3007 | if (dl_prio(prio)) { |
2994 | if (!dl_prio(p->normal_prio) || (p->pi_top_task && | 3008 | struct task_struct *pi_task = rt_mutex_get_top_task(p); |
2995 | dl_entity_preempt(&p->pi_top_task->dl, &p->dl))) { | 3009 | if (!dl_prio(p->normal_prio) || |
3010 | (pi_task && dl_entity_preempt(&pi_task->dl, &p->dl))) { | ||
2996 | p->dl.dl_boosted = 1; | 3011 | p->dl.dl_boosted = 1; |
2997 | p->dl.dl_throttled = 0; | 3012 | p->dl.dl_throttled = 0; |
2998 | enqueue_flag = ENQUEUE_REPLENISH; | 3013 | enqueue_flag = ENQUEUE_REPLENISH; |
@@ -3064,7 +3079,7 @@ void set_user_nice(struct task_struct *p, long nice) | |||
3064 | * lowered its priority, then reschedule its CPU: | 3079 | * lowered its priority, then reschedule its CPU: |
3065 | */ | 3080 | */ |
3066 | if (delta < 0 || (delta > 0 && task_running(rq, p))) | 3081 | if (delta < 0 || (delta > 0 && task_running(rq, p))) |
3067 | resched_task(rq->curr); | 3082 | resched_curr(rq); |
3068 | } | 3083 | } |
3069 | out_unlock: | 3084 | out_unlock: |
3070 | task_rq_unlock(rq, p, &flags); | 3085 | task_rq_unlock(rq, p, &flags); |
@@ -3203,12 +3218,18 @@ __setparam_dl(struct task_struct *p, const struct sched_attr *attr) | |||
3203 | dl_se->dl_yielded = 0; | 3218 | dl_se->dl_yielded = 0; |
3204 | } | 3219 | } |
3205 | 3220 | ||
3221 | /* | ||
3222 | * sched_setparam() passes in -1 for its policy, to let the functions | ||
3223 | * it calls know not to change it. | ||
3224 | */ | ||
3225 | #define SETPARAM_POLICY -1 | ||
3226 | |||
3206 | static void __setscheduler_params(struct task_struct *p, | 3227 | static void __setscheduler_params(struct task_struct *p, |
3207 | const struct sched_attr *attr) | 3228 | const struct sched_attr *attr) |
3208 | { | 3229 | { |
3209 | int policy = attr->sched_policy; | 3230 | int policy = attr->sched_policy; |
3210 | 3231 | ||
3211 | if (policy == -1) /* setparam */ | 3232 | if (policy == SETPARAM_POLICY) |
3212 | policy = p->policy; | 3233 | policy = p->policy; |
3213 | 3234 | ||
3214 | p->policy = policy; | 3235 | p->policy = policy; |
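
Editor's note: SETPARAM_POLICY just names the existing convention that sched_setparam() reaches the setscheduler path with a policy of -1 meaning "leave the policy alone". A compact userspace sketch of that convention; struct task and set_sched_params() are illustrative stand-ins, not the kernel's __setscheduler_params().

#include <stdio.h>

#define SETPARAM_POLICY -1  /* "do not change the policy" marker */

struct task {
    int policy;
    int rt_priority;
};

/* Hypothetical helper mirroring the idea in __setscheduler_params():
 * a policy of SETPARAM_POLICY keeps whatever policy the task already has. */
static void set_sched_params(struct task *t, int policy, int rt_priority)
{
    if (policy == SETPARAM_POLICY)
        policy = t->policy;

    t->policy = policy;
    t->rt_priority = rt_priority;
}

int main(void)
{
    struct task t = { .policy = 2 /* e.g. SCHED_RR */, .rt_priority = 10 };

    /* sched_setparam()-style call: the priority changes, the policy does not. */
    set_sched_params(&t, SETPARAM_POLICY, 20);
    printf("policy=%d rt_priority=%d\n", t.policy, t.rt_priority);
    return 0;
}
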
@@ -3557,10 +3578,8 @@ static int _sched_setscheduler(struct task_struct *p, int policy, | |||
3557 | .sched_nice = PRIO_TO_NICE(p->static_prio), | 3578 | .sched_nice = PRIO_TO_NICE(p->static_prio), |
3558 | }; | 3579 | }; |
3559 | 3580 | ||
3560 | /* | 3581 | /* Fixup the legacy SCHED_RESET_ON_FORK hack. */ |
3561 | * Fixup the legacy SCHED_RESET_ON_FORK hack | 3582 | if ((policy != SETPARAM_POLICY) && (policy & SCHED_RESET_ON_FORK)) { |
3562 | */ | ||
3563 | if (policy & SCHED_RESET_ON_FORK) { | ||
3564 | attr.sched_flags |= SCHED_FLAG_RESET_ON_FORK; | 3583 | attr.sched_flags |= SCHED_FLAG_RESET_ON_FORK; |
3565 | policy &= ~SCHED_RESET_ON_FORK; | 3584 | policy &= ~SCHED_RESET_ON_FORK; |
3566 | attr.sched_policy = policy; | 3585 | attr.sched_policy = policy; |
@@ -3730,7 +3749,7 @@ SYSCALL_DEFINE3(sched_setscheduler, pid_t, pid, int, policy, | |||
3730 | */ | 3749 | */ |
3731 | SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param) | 3750 | SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param) |
3732 | { | 3751 | { |
3733 | return do_sched_setscheduler(pid, -1, param); | 3752 | return do_sched_setscheduler(pid, SETPARAM_POLICY, param); |
3734 | } | 3753 | } |
3735 | 3754 | ||
3736 | /** | 3755 | /** |
@@ -4147,7 +4166,6 @@ static void __cond_resched(void) | |||
4147 | 4166 | ||
4148 | int __sched _cond_resched(void) | 4167 | int __sched _cond_resched(void) |
4149 | { | 4168 | { |
4150 | rcu_cond_resched(); | ||
4151 | if (should_resched()) { | 4169 | if (should_resched()) { |
4152 | __cond_resched(); | 4170 | __cond_resched(); |
4153 | return 1; | 4171 | return 1; |
@@ -4166,18 +4184,15 @@ EXPORT_SYMBOL(_cond_resched); | |||
4166 | */ | 4184 | */ |
4167 | int __cond_resched_lock(spinlock_t *lock) | 4185 | int __cond_resched_lock(spinlock_t *lock) |
4168 | { | 4186 | { |
4169 | bool need_rcu_resched = rcu_should_resched(); | ||
4170 | int resched = should_resched(); | 4187 | int resched = should_resched(); |
4171 | int ret = 0; | 4188 | int ret = 0; |
4172 | 4189 | ||
4173 | lockdep_assert_held(lock); | 4190 | lockdep_assert_held(lock); |
4174 | 4191 | ||
4175 | if (spin_needbreak(lock) || resched || need_rcu_resched) { | 4192 | if (spin_needbreak(lock) || resched) { |
4176 | spin_unlock(lock); | 4193 | spin_unlock(lock); |
4177 | if (resched) | 4194 | if (resched) |
4178 | __cond_resched(); | 4195 | __cond_resched(); |
4179 | else if (unlikely(need_rcu_resched)) | ||
4180 | rcu_resched(); | ||
4181 | else | 4196 | else |
4182 | cpu_relax(); | 4197 | cpu_relax(); |
4183 | ret = 1; | 4198 | ret = 1; |
@@ -4191,7 +4206,6 @@ int __sched __cond_resched_softirq(void) | |||
4191 | { | 4206 | { |
4192 | BUG_ON(!in_softirq()); | 4207 | BUG_ON(!in_softirq()); |
4193 | 4208 | ||
4194 | rcu_cond_resched(); /* BH disabled OK, just recording QSes. */ | ||
4195 | if (should_resched()) { | 4209 | if (should_resched()) { |
4196 | local_bh_enable(); | 4210 | local_bh_enable(); |
4197 | __cond_resched(); | 4211 | __cond_resched(); |
@@ -4290,7 +4304,7 @@ again: | |||
4290 | * fairness. | 4304 | * fairness. |
4291 | */ | 4305 | */ |
4292 | if (preempt && rq != p_rq) | 4306 | if (preempt && rq != p_rq) |
4293 | resched_task(p_rq->curr); | 4307 | resched_curr(p_rq); |
4294 | } | 4308 | } |
4295 | 4309 | ||
4296 | out_unlock: | 4310 | out_unlock: |
@@ -6470,6 +6484,20 @@ struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl, | |||
6470 | sched_domain_level_max = max(sched_domain_level_max, sd->level); | 6484 | sched_domain_level_max = max(sched_domain_level_max, sd->level); |
6471 | child->parent = sd; | 6485 | child->parent = sd; |
6472 | sd->child = child; | 6486 | sd->child = child; |
6487 | |||
6488 | if (!cpumask_subset(sched_domain_span(child), | ||
6489 | sched_domain_span(sd))) { | ||
6490 | pr_err("BUG: arch topology borken\n"); | ||
6491 | #ifdef CONFIG_SCHED_DEBUG | ||
6492 | pr_err(" the %s domain not a subset of the %s domain\n", | ||
6493 | child->name, sd->name); | ||
6494 | #endif | ||
6495 | /* Fixup, ensure @sd has at least @child cpus. */ | ||
6496 | cpumask_or(sched_domain_span(sd), | ||
6497 | sched_domain_span(sd), | ||
6498 | sched_domain_span(child)); | ||
6499 | } | ||
6500 | |||
6473 | } | 6501 | } |
6474 | set_domain_attribute(sd, attr); | 6502 | set_domain_attribute(sd, attr); |
6475 | 6503 | ||
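
Editor's note: the new check above reports a broken architecture topology where a child domain's CPU span is not contained in its parent's, then widens the parent as a fixup. On a machine whose cpumask fits in one word, the subset test and the OR fixup are plain bit arithmetic, sketched below with invented names and a deliberately broken example mask.

#include <stdint.h>
#include <stdio.h>

/* Model a small system's cpumask as one 64-bit word. */
typedef uint64_t cpumask_t;

static int is_subset(cpumask_t child, cpumask_t parent)
{
    return (child & ~parent) == 0;
}

int main(void)
{
    cpumask_t child  = 0x0f;  /* CPUs 0-3 */
    cpumask_t parent = 0x07;  /* CPUs 0-2: broken, misses CPU 3 */

    if (!is_subset(child, parent)) {
        printf("BUG: child span not a subset of parent span\n");
        parent |= child;      /* fixup: widen the parent to cover the child */
    }
    printf("parent span after fixup: 0x%llx\n", (unsigned long long)parent);
    return 0;
}
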
@@ -7097,7 +7125,7 @@ static void normalize_task(struct rq *rq, struct task_struct *p) | |||
7097 | __setscheduler(rq, p, &attr); | 7125 | __setscheduler(rq, p, &attr); |
7098 | if (on_rq) { | 7126 | if (on_rq) { |
7099 | enqueue_task(rq, p, 0); | 7127 | enqueue_task(rq, p, 0); |
7100 | resched_task(rq->curr); | 7128 | resched_curr(rq); |
7101 | } | 7129 | } |
7102 | 7130 | ||
7103 | check_class_changed(rq, p, prev_class, old_prio); | 7131 | check_class_changed(rq, p, prev_class, old_prio); |
@@ -7808,6 +7836,11 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota) | |||
7808 | if (period > max_cfs_quota_period) | 7836 | if (period > max_cfs_quota_period) |
7809 | return -EINVAL; | 7837 | return -EINVAL; |
7810 | 7838 | ||
7839 | /* | ||
7840 | * Prevent race between setting of cfs_rq->runtime_enabled and | ||
7841 | * unthrottle_offline_cfs_rqs(). | ||
7842 | */ | ||
7843 | get_online_cpus(); | ||
7811 | mutex_lock(&cfs_constraints_mutex); | 7844 | mutex_lock(&cfs_constraints_mutex); |
7812 | ret = __cfs_schedulable(tg, period, quota); | 7845 | ret = __cfs_schedulable(tg, period, quota); |
7813 | if (ret) | 7846 | if (ret) |
@@ -7833,7 +7866,7 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota) | |||
7833 | } | 7866 | } |
7834 | raw_spin_unlock_irq(&cfs_b->lock); | 7867 | raw_spin_unlock_irq(&cfs_b->lock); |
7835 | 7868 | ||
7836 | for_each_possible_cpu(i) { | 7869 | for_each_online_cpu(i) { |
7837 | struct cfs_rq *cfs_rq = tg->cfs_rq[i]; | 7870 | struct cfs_rq *cfs_rq = tg->cfs_rq[i]; |
7838 | struct rq *rq = cfs_rq->rq; | 7871 | struct rq *rq = cfs_rq->rq; |
7839 | 7872 | ||
@@ -7849,6 +7882,7 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota) | |||
7849 | cfs_bandwidth_usage_dec(); | 7882 | cfs_bandwidth_usage_dec(); |
7850 | out_unlock: | 7883 | out_unlock: |
7851 | mutex_unlock(&cfs_constraints_mutex); | 7884 | mutex_unlock(&cfs_constraints_mutex); |
7885 | put_online_cpus(); | ||
7852 | 7886 | ||
7853 | return ret; | 7887 | return ret; |
7854 | } | 7888 | } |
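
Editor's note: get_online_cpus()/put_online_cpus() above pin the online CPU set so that tg_set_cfs_bandwidth() cannot race with unthrottle_offline_cfs_rqs(), letting the loop safely iterate only online CPUs. Reduced to its essence, that is a read lock held around a walk of a shared set; the pthread sketch below models only that idea and none of the kernel's hotplug machinery.

#include <pthread.h>
#include <stdio.h>

#define NR_CPUS 4

static int cpu_online[NR_CPUS] = { 1, 1, 1, 0 };
static pthread_rwlock_t hotplug_lock = PTHREAD_RWLOCK_INITIALIZER;

/* Analogue of the tg_set_cfs_bandwidth() loop: hold the hotplug lock for
 * reading so the online set cannot change underneath the walk. */
static void update_online_cpus(void)
{
    pthread_rwlock_rdlock(&hotplug_lock);   /* get_online_cpus() */
    for (int i = 0; i < NR_CPUS; i++)
        if (cpu_online[i])
            printf("updating runtime state on cpu%d\n", i);
    pthread_rwlock_unlock(&hotplug_lock);   /* put_online_cpus() */
}

int main(void)
{
    update_online_cpus();
    return 0;
}
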
@@ -8088,7 +8122,7 @@ struct cgroup_subsys cpu_cgrp_subsys = { | |||
8088 | .can_attach = cpu_cgroup_can_attach, | 8122 | .can_attach = cpu_cgroup_can_attach, |
8089 | .attach = cpu_cgroup_attach, | 8123 | .attach = cpu_cgroup_attach, |
8090 | .exit = cpu_cgroup_exit, | 8124 | .exit = cpu_cgroup_exit, |
8091 | .base_cftypes = cpu_files, | 8125 | .legacy_cftypes = cpu_files, |
8092 | .early_init = 1, | 8126 | .early_init = 1, |
8093 | }; | 8127 | }; |
8094 | 8128 | ||
diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c index 9cf350c94ec4..dd7cbb55bbf2 100644 --- a/kernel/sched/cpuacct.c +++ b/kernel/sched/cpuacct.c | |||
@@ -278,6 +278,6 @@ void cpuacct_account_field(struct task_struct *p, int index, u64 val) | |||
278 | struct cgroup_subsys cpuacct_cgrp_subsys = { | 278 | struct cgroup_subsys cpuacct_cgrp_subsys = { |
279 | .css_alloc = cpuacct_css_alloc, | 279 | .css_alloc = cpuacct_css_alloc, |
280 | .css_free = cpuacct_css_free, | 280 | .css_free = cpuacct_css_free, |
281 | .base_cftypes = files, | 281 | .legacy_cftypes = files, |
282 | .early_init = 1, | 282 | .early_init = 1, |
283 | }; | 283 | }; |
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index fc4f98b1258f..255ce138b652 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c | |||
@@ -306,7 +306,7 @@ static inline void setup_new_dl_entity(struct sched_dl_entity *dl_se, | |||
306 | * the overrunning entity can't interfere with other entity in the system and | 306 | * the overrunning entity can't interfere with other entity in the system and |
307 | * can't make them miss their deadlines. Reasons why this kind of overruns | 307 | * can't make them miss their deadlines. Reasons why this kind of overruns |
308 | * could happen are, typically, a entity voluntarily trying to overcome its | 308 | * could happen are, typically, a entity voluntarily trying to overcome its |
309 | * runtime, or it just underestimated it during sched_setscheduler_ex(). | 309 | * runtime, or it just underestimated it during sched_setattr(). |
310 | */ | 310 | */ |
311 | static void replenish_dl_entity(struct sched_dl_entity *dl_se, | 311 | static void replenish_dl_entity(struct sched_dl_entity *dl_se, |
312 | struct sched_dl_entity *pi_se) | 312 | struct sched_dl_entity *pi_se) |
@@ -535,7 +535,7 @@ again: | |||
535 | if (task_has_dl_policy(rq->curr)) | 535 | if (task_has_dl_policy(rq->curr)) |
536 | check_preempt_curr_dl(rq, p, 0); | 536 | check_preempt_curr_dl(rq, p, 0); |
537 | else | 537 | else |
538 | resched_task(rq->curr); | 538 | resched_curr(rq); |
539 | #ifdef CONFIG_SMP | 539 | #ifdef CONFIG_SMP |
540 | /* | 540 | /* |
541 | * Queueing this task back might have overloaded rq, | 541 | * Queueing this task back might have overloaded rq, |
@@ -634,7 +634,7 @@ static void update_curr_dl(struct rq *rq) | |||
634 | enqueue_task_dl(rq, curr, ENQUEUE_REPLENISH); | 634 | enqueue_task_dl(rq, curr, ENQUEUE_REPLENISH); |
635 | 635 | ||
636 | if (!is_leftmost(curr, &rq->dl)) | 636 | if (!is_leftmost(curr, &rq->dl)) |
637 | resched_task(curr); | 637 | resched_curr(rq); |
638 | } | 638 | } |
639 | 639 | ||
640 | /* | 640 | /* |
@@ -964,7 +964,7 @@ static void check_preempt_equal_dl(struct rq *rq, struct task_struct *p) | |||
964 | cpudl_find(&rq->rd->cpudl, p, NULL) != -1) | 964 | cpudl_find(&rq->rd->cpudl, p, NULL) != -1) |
965 | return; | 965 | return; |
966 | 966 | ||
967 | resched_task(rq->curr); | 967 | resched_curr(rq); |
968 | } | 968 | } |
969 | 969 | ||
970 | static int pull_dl_task(struct rq *this_rq); | 970 | static int pull_dl_task(struct rq *this_rq); |
@@ -979,7 +979,7 @@ static void check_preempt_curr_dl(struct rq *rq, struct task_struct *p, | |||
979 | int flags) | 979 | int flags) |
980 | { | 980 | { |
981 | if (dl_entity_preempt(&p->dl, &rq->curr->dl)) { | 981 | if (dl_entity_preempt(&p->dl, &rq->curr->dl)) { |
982 | resched_task(rq->curr); | 982 | resched_curr(rq); |
983 | return; | 983 | return; |
984 | } | 984 | } |
985 | 985 | ||
@@ -1333,7 +1333,7 @@ retry: | |||
1333 | if (dl_task(rq->curr) && | 1333 | if (dl_task(rq->curr) && |
1334 | dl_time_before(next_task->dl.deadline, rq->curr->dl.deadline) && | 1334 | dl_time_before(next_task->dl.deadline, rq->curr->dl.deadline) && |
1335 | rq->curr->nr_cpus_allowed > 1) { | 1335 | rq->curr->nr_cpus_allowed > 1) { |
1336 | resched_task(rq->curr); | 1336 | resched_curr(rq); |
1337 | return 0; | 1337 | return 0; |
1338 | } | 1338 | } |
1339 | 1339 | ||
@@ -1373,7 +1373,7 @@ retry: | |||
1373 | set_task_cpu(next_task, later_rq->cpu); | 1373 | set_task_cpu(next_task, later_rq->cpu); |
1374 | activate_task(later_rq, next_task, 0); | 1374 | activate_task(later_rq, next_task, 0); |
1375 | 1375 | ||
1376 | resched_task(later_rq->curr); | 1376 | resched_curr(later_rq); |
1377 | 1377 | ||
1378 | double_unlock_balance(rq, later_rq); | 1378 | double_unlock_balance(rq, later_rq); |
1379 | 1379 | ||
@@ -1632,14 +1632,14 @@ static void prio_changed_dl(struct rq *rq, struct task_struct *p, | |||
1632 | */ | 1632 | */ |
1633 | if (dl_time_before(rq->dl.earliest_dl.curr, p->dl.deadline) && | 1633 | if (dl_time_before(rq->dl.earliest_dl.curr, p->dl.deadline) && |
1634 | rq->curr == p) | 1634 | rq->curr == p) |
1635 | resched_task(p); | 1635 | resched_curr(rq); |
1636 | #else | 1636 | #else |
1637 | /* | 1637 | /* |
1638 | * Again, we don't know if p has a earlier | 1638 | * Again, we don't know if p has a earlier |
1639 | * or later deadline, so let's blindly set a | 1639 | * or later deadline, so let's blindly set a |
1640 | * (maybe not needed) rescheduling point. | 1640 | * (maybe not needed) rescheduling point. |
1641 | */ | 1641 | */ |
1642 | resched_task(p); | 1642 | resched_curr(rq); |
1643 | #endif /* CONFIG_SMP */ | 1643 | #endif /* CONFIG_SMP */ |
1644 | } else | 1644 | } else |
1645 | switched_to_dl(rq, p); | 1645 | switched_to_dl(rq, p); |
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 695f9773bb60..627b3c34b821 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c | |||
@@ -608,7 +608,7 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m) | |||
608 | 608 | ||
609 | avg_atom = p->se.sum_exec_runtime; | 609 | avg_atom = p->se.sum_exec_runtime; |
610 | if (nr_switches) | 610 | if (nr_switches) |
611 | do_div(avg_atom, nr_switches); | 611 | avg_atom = div64_ul(avg_atom, nr_switches); |
612 | else | 612 | else |
613 | avg_atom = -1LL; | 613 | avg_atom = -1LL; |
614 | 614 | ||
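
Editor's note: the debug.c change swaps do_div() for div64_ul() because do_div() performs a 64-by-32 division — a divisor wider than 32 bits would be silently truncated, making avg_atom meaningless. A small standalone illustration of that truncation hazard using plain C division; the kernel macros themselves are not reproduced.

#include <stdint.h>
#include <stdio.h>

int main(void)
{
    uint64_t sum_exec    = 1000000000000ULL;  /* ns of runtime */
    uint64_t nr_switches = 0x100000001ULL;    /* wider than 32 bits */

    /* do_div()-style: the divisor is treated as 32 bits and truncates to 1. */
    uint32_t truncated = (uint32_t)nr_switches;
    printf("truncated divisor %u -> avg %llu\n", truncated,
           (unsigned long long)(sum_exec / truncated));

    /* div64_ul()-style: a full 64-by-64 division gives the sane answer. */
    printf("full divisor -> avg %llu\n",
           (unsigned long long)(sum_exec / nr_switches));
    return 0;
}
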
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index fea7d3335e1f..bfa3c86d0d68 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c | |||
@@ -1062,7 +1062,6 @@ static void update_numa_stats(struct numa_stats *ns, int nid) | |||
1062 | if (!cpus) | 1062 | if (!cpus) |
1063 | return; | 1063 | return; |
1064 | 1064 | ||
1065 | ns->load = (ns->load * SCHED_CAPACITY_SCALE) / ns->compute_capacity; | ||
1066 | ns->task_capacity = | 1065 | ns->task_capacity = |
1067 | DIV_ROUND_CLOSEST(ns->compute_capacity, SCHED_CAPACITY_SCALE); | 1066 | DIV_ROUND_CLOSEST(ns->compute_capacity, SCHED_CAPACITY_SCALE); |
1068 | ns->has_free_capacity = (ns->nr_running < ns->task_capacity); | 1067 | ns->has_free_capacity = (ns->nr_running < ns->task_capacity); |
@@ -1096,18 +1095,30 @@ static void task_numa_assign(struct task_numa_env *env, | |||
1096 | env->best_cpu = env->dst_cpu; | 1095 | env->best_cpu = env->dst_cpu; |
1097 | } | 1096 | } |
1098 | 1097 | ||
1099 | static bool load_too_imbalanced(long orig_src_load, long orig_dst_load, | 1098 | static bool load_too_imbalanced(long src_load, long dst_load, |
1100 | long src_load, long dst_load, | ||
1101 | struct task_numa_env *env) | 1099 | struct task_numa_env *env) |
1102 | { | 1100 | { |
1103 | long imb, old_imb; | 1101 | long imb, old_imb; |
1102 | long orig_src_load, orig_dst_load; | ||
1103 | long src_capacity, dst_capacity; | ||
1104 | |||
1105 | /* | ||
1106 | * The load is corrected for the CPU capacity available on each node. | ||
1107 | * | ||
1108 | * src_load dst_load | ||
1109 | * ------------ vs --------- | ||
1110 | * src_capacity dst_capacity | ||
1111 | */ | ||
1112 | src_capacity = env->src_stats.compute_capacity; | ||
1113 | dst_capacity = env->dst_stats.compute_capacity; | ||
1104 | 1114 | ||
1105 | /* We care about the slope of the imbalance, not the direction. */ | 1115 | /* We care about the slope of the imbalance, not the direction. */ |
1106 | if (dst_load < src_load) | 1116 | if (dst_load < src_load) |
1107 | swap(dst_load, src_load); | 1117 | swap(dst_load, src_load); |
1108 | 1118 | ||
1109 | /* Is the difference below the threshold? */ | 1119 | /* Is the difference below the threshold? */ |
1110 | imb = dst_load * 100 - src_load * env->imbalance_pct; | 1120 | imb = dst_load * src_capacity * 100 - |
1121 | src_load * dst_capacity * env->imbalance_pct; | ||
1111 | if (imb <= 0) | 1122 | if (imb <= 0) |
1112 | return false; | 1123 | return false; |
1113 | 1124 | ||
@@ -1115,10 +1126,14 @@ static bool load_too_imbalanced(long orig_src_load, long orig_dst_load, | |||
1115 | * The imbalance is above the allowed threshold. | 1126 | * The imbalance is above the allowed threshold. |
1116 | * Compare it with the old imbalance. | 1127 | * Compare it with the old imbalance. |
1117 | */ | 1128 | */ |
1129 | orig_src_load = env->src_stats.load; | ||
1130 | orig_dst_load = env->dst_stats.load; | ||
1131 | |||
1118 | if (orig_dst_load < orig_src_load) | 1132 | if (orig_dst_load < orig_src_load) |
1119 | swap(orig_dst_load, orig_src_load); | 1133 | swap(orig_dst_load, orig_src_load); |
1120 | 1134 | ||
1121 | old_imb = orig_dst_load * 100 - orig_src_load * env->imbalance_pct; | 1135 | old_imb = orig_dst_load * src_capacity * 100 - |
1136 | orig_src_load * dst_capacity * env->imbalance_pct; | ||
1122 | 1137 | ||
1123 | /* Would this change make things worse? */ | 1138 | /* Would this change make things worse? */ |
1124 | return (imb > old_imb); | 1139 | return (imb > old_imb); |
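
Editor's note: the capacity terms added to load_too_imbalanced() implement the comparison spelled out in its new comment — src_load/src_capacity versus dst_load/dst_capacity — cross-multiplied so no division is needed and scaled by imbalance_pct. The sketch below shows only that comparison with made-up numbers; the kernel additionally folds in the pre-move imbalance and looks only at the slope, which is omitted here.

#include <stdbool.h>
#include <stdio.h>

/* Capacity-corrected imbalance test: the destination's load per unit of
 * compute capacity may not exceed the source's by more than the
 * imbalance_pct margin (e.g. 125 == 25% slack). */
static bool too_imbalanced(long src_load, long dst_load,
                           long src_capacity, long dst_capacity,
                           int imbalance_pct)
{
    return dst_load * src_capacity * 100 >
           src_load * dst_capacity * imbalance_pct;
}

int main(void)
{
    /* A node with twice the compute capacity can carry nearly twice the
     * load before a move there counts as imbalanced. */
    printf("%d\n", too_imbalanced(1000, 1800, 1024, 2048, 125)); /* 0: fine */
    printf("%d\n", too_imbalanced(1000, 1800, 1024, 1024, 125)); /* 1: too much */
    return 0;
}
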
@@ -1136,10 +1151,10 @@ static void task_numa_compare(struct task_numa_env *env, | |||
1136 | struct rq *src_rq = cpu_rq(env->src_cpu); | 1151 | struct rq *src_rq = cpu_rq(env->src_cpu); |
1137 | struct rq *dst_rq = cpu_rq(env->dst_cpu); | 1152 | struct rq *dst_rq = cpu_rq(env->dst_cpu); |
1138 | struct task_struct *cur; | 1153 | struct task_struct *cur; |
1139 | long orig_src_load, src_load; | 1154 | long src_load, dst_load; |
1140 | long orig_dst_load, dst_load; | ||
1141 | long load; | 1155 | long load; |
1142 | long imp = (groupimp > 0) ? groupimp : taskimp; | 1156 | long imp = env->p->numa_group ? groupimp : taskimp; |
1157 | long moveimp = imp; | ||
1143 | 1158 | ||
1144 | rcu_read_lock(); | 1159 | rcu_read_lock(); |
1145 | cur = ACCESS_ONCE(dst_rq->curr); | 1160 | cur = ACCESS_ONCE(dst_rq->curr); |
@@ -1177,11 +1192,6 @@ static void task_numa_compare(struct task_numa_env *env, | |||
1177 | * itself (not part of a group), use the task weight | 1192 | * itself (not part of a group), use the task weight |
1178 | * instead. | 1193 | * instead. |
1179 | */ | 1194 | */ |
1180 | if (env->p->numa_group) | ||
1181 | imp = groupimp; | ||
1182 | else | ||
1183 | imp = taskimp; | ||
1184 | |||
1185 | if (cur->numa_group) | 1195 | if (cur->numa_group) |
1186 | imp += group_weight(cur, env->src_nid) - | 1196 | imp += group_weight(cur, env->src_nid) - |
1187 | group_weight(cur, env->dst_nid); | 1197 | group_weight(cur, env->dst_nid); |
@@ -1191,7 +1201,7 @@ static void task_numa_compare(struct task_numa_env *env, | |||
1191 | } | 1201 | } |
1192 | } | 1202 | } |
1193 | 1203 | ||
1194 | if (imp < env->best_imp) | 1204 | if (imp <= env->best_imp && moveimp <= env->best_imp) |
1195 | goto unlock; | 1205 | goto unlock; |
1196 | 1206 | ||
1197 | if (!cur) { | 1207 | if (!cur) { |
@@ -1204,20 +1214,34 @@ static void task_numa_compare(struct task_numa_env *env, | |||
1204 | } | 1214 | } |
1205 | 1215 | ||
1206 | /* Balance doesn't matter much if we're running a task per cpu */ | 1216 | /* Balance doesn't matter much if we're running a task per cpu */ |
1207 | if (src_rq->nr_running == 1 && dst_rq->nr_running == 1) | 1217 | if (imp > env->best_imp && src_rq->nr_running == 1 && |
1218 | dst_rq->nr_running == 1) | ||
1208 | goto assign; | 1219 | goto assign; |
1209 | 1220 | ||
1210 | /* | 1221 | /* |
1211 | * In the overloaded case, try and keep the load balanced. | 1222 | * In the overloaded case, try and keep the load balanced. |
1212 | */ | 1223 | */ |
1213 | balance: | 1224 | balance: |
1214 | orig_dst_load = env->dst_stats.load; | ||
1215 | orig_src_load = env->src_stats.load; | ||
1216 | |||
1217 | /* XXX missing capacity terms */ | ||
1218 | load = task_h_load(env->p); | 1225 | load = task_h_load(env->p); |
1219 | dst_load = orig_dst_load + load; | 1226 | dst_load = env->dst_stats.load + load; |
1220 | src_load = orig_src_load - load; | 1227 | src_load = env->src_stats.load - load; |
1228 | |||
1229 | if (moveimp > imp && moveimp > env->best_imp) { | ||
1230 | /* | ||
1231 | * If the improvement from just moving env->p in this direction is | ||
1232 | * better than swapping tasks around, check if a move is | ||
1233 | * possible. Store a slightly smaller score than moveimp, | ||
1234 | * so an actually idle CPU will win. | ||
1235 | */ | ||
1236 | if (!load_too_imbalanced(src_load, dst_load, env)) { | ||
1237 | imp = moveimp - 1; | ||
1238 | cur = NULL; | ||
1239 | goto assign; | ||
1240 | } | ||
1241 | } | ||
1242 | |||
1243 | if (imp <= env->best_imp) | ||
1244 | goto unlock; | ||
1221 | 1245 | ||
1222 | if (cur) { | 1246 | if (cur) { |
1223 | load = task_h_load(cur); | 1247 | load = task_h_load(cur); |
@@ -1225,8 +1249,7 @@ balance: | |||
1225 | src_load += load; | 1249 | src_load += load; |
1226 | } | 1250 | } |
1227 | 1251 | ||
1228 | if (load_too_imbalanced(orig_src_load, orig_dst_load, | 1252 | if (load_too_imbalanced(src_load, dst_load, env)) |
1229 | src_load, dst_load, env)) | ||
1230 | goto unlock; | 1253 | goto unlock; |
1231 | 1254 | ||
1232 | assign: | 1255 | assign: |
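
Editor's note: the moveimp logic above prefers plainly moving env->p over swapping it with the destination's current task whenever the move alone scores better and keeps the nodes balanced, and it records moveimp - 1 so a truly idle destination CPU still wins later. A toy distillation of that decision follows; numa_choice() and its arguments are invented, and the real scoring is far richer.

#include <stdbool.h>
#include <stdio.h>

/* imp scores swapping with the destination's current task, moveimp scores
 * simply moving the task; best_imp is the best candidate seen so far. */
static const char *numa_choice(long imp, long moveimp, long best_imp,
                               bool move_keeps_balance)
{
    if (moveimp > imp && moveimp > best_imp && move_keeps_balance)
        return "move (recorded as moveimp - 1)";
    if (imp > best_imp)
        return "swap with dst current";
    return "keep previous best";
}

int main(void)
{
    printf("%s\n", numa_choice(10, 25, 5, true));  /* plain move wins */
    printf("%s\n", numa_choice(30, 25, 5, true));  /* swap wins */
    printf("%s\n", numa_choice(3, 4, 5, true));    /* neither beats best */
    return 0;
}
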
@@ -1302,9 +1325,8 @@ static int task_numa_migrate(struct task_struct *p) | |||
1302 | groupimp = group_weight(p, env.dst_nid) - groupweight; | 1325 | groupimp = group_weight(p, env.dst_nid) - groupweight; |
1303 | update_numa_stats(&env.dst_stats, env.dst_nid); | 1326 | update_numa_stats(&env.dst_stats, env.dst_nid); |
1304 | 1327 | ||
1305 | /* If the preferred nid has free capacity, try to use it. */ | 1328 | /* Try to find a spot on the preferred nid. */ |
1306 | if (env.dst_stats.has_free_capacity) | 1329 | task_numa_find_cpu(&env, taskimp, groupimp); |
1307 | task_numa_find_cpu(&env, taskimp, groupimp); | ||
1308 | 1330 | ||
1309 | /* No space available on the preferred nid. Look elsewhere. */ | 1331 | /* No space available on the preferred nid. Look elsewhere. */ |
1310 | if (env.best_cpu == -1) { | 1332 | if (env.best_cpu == -1) { |
@@ -1324,10 +1346,6 @@ static int task_numa_migrate(struct task_struct *p) | |||
1324 | } | 1346 | } |
1325 | } | 1347 | } |
1326 | 1348 | ||
1327 | /* No better CPU than the current one was found. */ | ||
1328 | if (env.best_cpu == -1) | ||
1329 | return -EAGAIN; | ||
1330 | |||
1331 | /* | 1349 | /* |
1332 | * If the task is part of a workload that spans multiple NUMA nodes, | 1350 | * If the task is part of a workload that spans multiple NUMA nodes, |
1333 | * and is migrating into one of the workload's active nodes, remember | 1351 | * and is migrating into one of the workload's active nodes, remember |
@@ -1336,8 +1354,19 @@ static int task_numa_migrate(struct task_struct *p) | |||
1336 | * A task that migrated to a second choice node will be better off | 1354 | * A task that migrated to a second choice node will be better off |
1337 | * trying for a better one later. Do not set the preferred node here. | 1355 | * trying for a better one later. Do not set the preferred node here. |
1338 | */ | 1356 | */ |
1339 | if (p->numa_group && node_isset(env.dst_nid, p->numa_group->active_nodes)) | 1357 | if (p->numa_group) { |
1340 | sched_setnuma(p, env.dst_nid); | 1358 | if (env.best_cpu == -1) |
1359 | nid = env.src_nid; | ||
1360 | else | ||
1361 | nid = env.dst_nid; | ||
1362 | |||
1363 | if (node_isset(nid, p->numa_group->active_nodes)) | ||
1364 | sched_setnuma(p, env.dst_nid); | ||
1365 | } | ||
1366 | |||
1367 | /* No better CPU than the current one was found. */ | ||
1368 | if (env.best_cpu == -1) | ||
1369 | return -EAGAIN; | ||
1341 | 1370 | ||
1342 | /* | 1371 | /* |
1343 | * Reset the scan period if the task is being rescheduled on an | 1372 | * Reset the scan period if the task is being rescheduled on an |
@@ -1415,12 +1444,12 @@ static void update_numa_active_node_mask(struct numa_group *numa_group) | |||
1415 | /* | 1444 | /* |
1416 | * When adapting the scan rate, the period is divided into NUMA_PERIOD_SLOTS | 1445 | * When adapting the scan rate, the period is divided into NUMA_PERIOD_SLOTS |
1417 | * increments. The more local the fault statistics are, the higher the scan | 1446 | * increments. The more local the fault statistics are, the higher the scan |
1418 | * period will be for the next scan window. If local/remote ratio is below | 1447 | * period will be for the next scan window. If local/(local+remote) ratio is |
1419 | * NUMA_PERIOD_THRESHOLD (where range of ratio is 1..NUMA_PERIOD_SLOTS) the | 1448 | * below NUMA_PERIOD_THRESHOLD (where range of ratio is 1..NUMA_PERIOD_SLOTS) |
1420 | * scan period will decrease | 1449 | * the scan period will decrease. Aim for 70% local accesses. |
1421 | */ | 1450 | */ |
1422 | #define NUMA_PERIOD_SLOTS 10 | 1451 | #define NUMA_PERIOD_SLOTS 10 |
1423 | #define NUMA_PERIOD_THRESHOLD 3 | 1452 | #define NUMA_PERIOD_THRESHOLD 7 |
1424 | 1453 | ||
1425 | /* | 1454 | /* |
1426 | * Increase the scan period (slow down scanning) if the majority of | 1455 | * Increase the scan period (slow down scanning) if the majority of |
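
Editor's note: raising NUMA_PERIOD_THRESHOLD from 3 to 7 out of NUMA_PERIOD_SLOTS == 10 moves the target from roughly 30% to about 70% local faults before scanning slows down, as the reworded comment says. A quick arithmetic sketch of the slot calculation; the actual period adjustment in update_task_scan_period() is left out.

#include <stdio.h>

#define NUMA_PERIOD_SLOTS     10
#define NUMA_PERIOD_THRESHOLD 7   /* aim for ~70% local accesses */

int main(void)
{
    unsigned long local = 700, remote = 300;  /* sampled fault counts */
    int slots = (int)(local * NUMA_PERIOD_SLOTS / (local + remote));

    printf("local ratio: %d/%d slots -> %s scan period\n",
           slots, NUMA_PERIOD_SLOTS,
           slots >= NUMA_PERIOD_THRESHOLD ? "increase (scan less)"
                                          : "decrease (scan more)");
    return 0;
}
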
@@ -1595,30 +1624,17 @@ static void task_numa_placement(struct task_struct *p) | |||
1595 | 1624 | ||
1596 | if (p->numa_group) { | 1625 | if (p->numa_group) { |
1597 | update_numa_active_node_mask(p->numa_group); | 1626 | update_numa_active_node_mask(p->numa_group); |
1598 | /* | ||
1599 | * If the preferred task and group nids are different, | ||
1600 | * iterate over the nodes again to find the best place. | ||
1601 | */ | ||
1602 | if (max_nid != max_group_nid) { | ||
1603 | unsigned long weight, max_weight = 0; | ||
1604 | |||
1605 | for_each_online_node(nid) { | ||
1606 | weight = task_weight(p, nid) + group_weight(p, nid); | ||
1607 | if (weight > max_weight) { | ||
1608 | max_weight = weight; | ||
1609 | max_nid = nid; | ||
1610 | } | ||
1611 | } | ||
1612 | } | ||
1613 | |||
1614 | spin_unlock_irq(group_lock); | 1627 | spin_unlock_irq(group_lock); |
1628 | max_nid = max_group_nid; | ||
1615 | } | 1629 | } |
1616 | 1630 | ||
1617 | /* Preferred node as the node with the most faults */ | 1631 | if (max_faults) { |
1618 | if (max_faults && max_nid != p->numa_preferred_nid) { | 1632 | /* Set the new preferred node */ |
1619 | /* Update the preferred nid and migrate task if possible */ | 1633 | if (max_nid != p->numa_preferred_nid) |
1620 | sched_setnuma(p, max_nid); | 1634 | sched_setnuma(p, max_nid); |
1621 | numa_migrate_preferred(p); | 1635 | |
1636 | if (task_node(p) != p->numa_preferred_nid) | ||
1637 | numa_migrate_preferred(p); | ||
1622 | } | 1638 | } |
1623 | } | 1639 | } |
1624 | 1640 | ||
@@ -2899,7 +2915,7 @@ check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr) | |||
2899 | ideal_runtime = sched_slice(cfs_rq, curr); | 2915 | ideal_runtime = sched_slice(cfs_rq, curr); |
2900 | delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime; | 2916 | delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime; |
2901 | if (delta_exec > ideal_runtime) { | 2917 | if (delta_exec > ideal_runtime) { |
2902 | resched_task(rq_of(cfs_rq)->curr); | 2918 | resched_curr(rq_of(cfs_rq)); |
2903 | /* | 2919 | /* |
2904 | * The current task ran long enough, ensure it doesn't get | 2920 | * The current task ran long enough, ensure it doesn't get |
2905 | * re-elected due to buddy favours. | 2921 | * re-elected due to buddy favours. |
@@ -2923,7 +2939,7 @@ check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr) | |||
2923 | return; | 2939 | return; |
2924 | 2940 | ||
2925 | if (delta > ideal_runtime) | 2941 | if (delta > ideal_runtime) |
2926 | resched_task(rq_of(cfs_rq)->curr); | 2942 | resched_curr(rq_of(cfs_rq)); |
2927 | } | 2943 | } |
2928 | 2944 | ||
2929 | static void | 2945 | static void |
@@ -3063,7 +3079,7 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued) | |||
3063 | * validating it and just reschedule. | 3079 | * validating it and just reschedule. |
3064 | */ | 3080 | */ |
3065 | if (queued) { | 3081 | if (queued) { |
3066 | resched_task(rq_of(cfs_rq)->curr); | 3082 | resched_curr(rq_of(cfs_rq)); |
3067 | return; | 3083 | return; |
3068 | } | 3084 | } |
3069 | /* | 3085 | /* |
@@ -3254,7 +3270,7 @@ static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) | |||
3254 | * hierarchy can be throttled | 3270 | * hierarchy can be throttled |
3255 | */ | 3271 | */ |
3256 | if (!assign_cfs_rq_runtime(cfs_rq) && likely(cfs_rq->curr)) | 3272 | if (!assign_cfs_rq_runtime(cfs_rq) && likely(cfs_rq->curr)) |
3257 | resched_task(rq_of(cfs_rq)->curr); | 3273 | resched_curr(rq_of(cfs_rq)); |
3258 | } | 3274 | } |
3259 | 3275 | ||
3260 | static __always_inline | 3276 | static __always_inline |
@@ -3360,7 +3376,11 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq) | |||
3360 | cfs_rq->throttled = 1; | 3376 | cfs_rq->throttled = 1; |
3361 | cfs_rq->throttled_clock = rq_clock(rq); | 3377 | cfs_rq->throttled_clock = rq_clock(rq); |
3362 | raw_spin_lock(&cfs_b->lock); | 3378 | raw_spin_lock(&cfs_b->lock); |
3363 | list_add_tail_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq); | 3379 | /* |
3380 | * Add to the _head_ of the list, so that an already-started | ||
3381 | * distribute_cfs_runtime will not see us | ||
3382 | */ | ||
3383 | list_add_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq); | ||
3364 | if (!cfs_b->timer_active) | 3384 | if (!cfs_b->timer_active) |
3365 | __start_cfs_bandwidth(cfs_b, false); | 3385 | __start_cfs_bandwidth(cfs_b, false); |
3366 | raw_spin_unlock(&cfs_b->lock); | 3386 | raw_spin_unlock(&cfs_b->lock); |
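
Editor's note: the switch from list_add_tail_rcu() to list_add_rcu() above relies on a property of forward traversal — a walker already inside the list can still run into entries appended at the tail, but can never see one inserted at the head. A minimal singly-linked sketch of that property, with no RCU, invented names, and error handling/freeing omitted.

#include <stdio.h>
#include <stdlib.h>

struct node { int id; struct node *next; };

static struct node *head;

static void add_head(int id)
{
    struct node *n = malloc(sizeof(*n));
    n->id = id;
    n->next = head;
    head = n;   /* a walker already past the head never reaches n */
}

int main(void)
{
    add_head(1);
    add_head(2);

    /* A traversal starts... then a new entry is added at the head. */
    struct node *walk = head;
    add_head(3);

    for (; walk; walk = walk->next)
        printf("visited %d\n", walk->id);   /* prints 2 then 1, never 3 */
    return 0;
}
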
@@ -3410,14 +3430,15 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) | |||
3410 | 3430 | ||
3411 | /* determine whether we need to wake up potentially idle cpu */ | 3431 | /* determine whether we need to wake up potentially idle cpu */ |
3412 | if (rq->curr == rq->idle && rq->cfs.nr_running) | 3432 | if (rq->curr == rq->idle && rq->cfs.nr_running) |
3413 | resched_task(rq->curr); | 3433 | resched_curr(rq); |
3414 | } | 3434 | } |
3415 | 3435 | ||
3416 | static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b, | 3436 | static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b, |
3417 | u64 remaining, u64 expires) | 3437 | u64 remaining, u64 expires) |
3418 | { | 3438 | { |
3419 | struct cfs_rq *cfs_rq; | 3439 | struct cfs_rq *cfs_rq; |
3420 | u64 runtime = remaining; | 3440 | u64 runtime; |
3441 | u64 starting_runtime = remaining; | ||
3421 | 3442 | ||
3422 | rcu_read_lock(); | 3443 | rcu_read_lock(); |
3423 | list_for_each_entry_rcu(cfs_rq, &cfs_b->throttled_cfs_rq, | 3444 | list_for_each_entry_rcu(cfs_rq, &cfs_b->throttled_cfs_rq, |
@@ -3448,7 +3469,7 @@ next: | |||
3448 | } | 3469 | } |
3449 | rcu_read_unlock(); | 3470 | rcu_read_unlock(); |
3450 | 3471 | ||
3451 | return remaining; | 3472 | return starting_runtime - remaining; |
3452 | } | 3473 | } |
3453 | 3474 | ||
3454 | /* | 3475 | /* |
@@ -3494,22 +3515,17 @@ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun) | |||
3494 | /* account preceding periods in which throttling occurred */ | 3515 | /* account preceding periods in which throttling occurred */ |
3495 | cfs_b->nr_throttled += overrun; | 3516 | cfs_b->nr_throttled += overrun; |
3496 | 3517 | ||
3497 | /* | ||
3498 | * There are throttled entities so we must first use the new bandwidth | ||
3499 | * to unthrottle them before making it generally available. This | ||
3500 | * ensures that all existing debts will be paid before a new cfs_rq is | ||
3501 | * allowed to run. | ||
3502 | */ | ||
3503 | runtime = cfs_b->runtime; | ||
3504 | runtime_expires = cfs_b->runtime_expires; | 3518 | runtime_expires = cfs_b->runtime_expires; |
3505 | cfs_b->runtime = 0; | ||
3506 | 3519 | ||
3507 | /* | 3520 | /* |
3508 | * This check is repeated as we are holding onto the new bandwidth | 3521 | * This check is repeated as we are holding onto the new bandwidth while |
3509 | * while we unthrottle. This can potentially race with an unthrottled | 3522 | * we unthrottle. This can potentially race with an unthrottled group |
3510 | * group trying to acquire new bandwidth from the global pool. | 3523 | * trying to acquire new bandwidth from the global pool. This can result |
3524 | * in us over-using our runtime if it is all used during this loop, but | ||
3525 | * only by limited amounts in that extreme case. | ||
3511 | */ | 3526 | */ |
3512 | while (throttled && runtime > 0) { | 3527 | while (throttled && cfs_b->runtime > 0) { |
3528 | runtime = cfs_b->runtime; | ||
3513 | raw_spin_unlock(&cfs_b->lock); | 3529 | raw_spin_unlock(&cfs_b->lock); |
3514 | /* we can't nest cfs_b->lock while distributing bandwidth */ | 3530 | /* we can't nest cfs_b->lock while distributing bandwidth */ |
3515 | runtime = distribute_cfs_runtime(cfs_b, runtime, | 3531 | runtime = distribute_cfs_runtime(cfs_b, runtime, |
@@ -3517,10 +3533,10 @@ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun) | |||
3517 | raw_spin_lock(&cfs_b->lock); | 3533 | raw_spin_lock(&cfs_b->lock); |
3518 | 3534 | ||
3519 | throttled = !list_empty(&cfs_b->throttled_cfs_rq); | 3535 | throttled = !list_empty(&cfs_b->throttled_cfs_rq); |
3536 | |||
3537 | cfs_b->runtime -= min(runtime, cfs_b->runtime); | ||
3520 | } | 3538 | } |
3521 | 3539 | ||
3522 | /* return (any) remaining runtime */ | ||
3523 | cfs_b->runtime = runtime; | ||
3524 | /* | 3540 | /* |
3525 | * While we are ensured activity in the period following an | 3541 | * While we are ensured activity in the period following an |
3526 | * unthrottle, this also covers the case in which the new bandwidth is | 3542 | * unthrottle, this also covers the case in which the new bandwidth is |
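
Editor's note: with the hunks above, distribute_cfs_runtime() reports how much runtime it actually handed out, and the period timer subtracts that from cfs_b->runtime under the lock on every pass instead of zeroing the pool up front and writing back the leftover. The resulting loop shape, with the throttled-list walk replaced by a stub, might look like the sketch below; all names and numbers are illustrative.

#include <stdio.h>

/* Stub: pretend each pass hands out up to 30 units to throttled groups
 * and returns the amount actually distributed. */
static unsigned long distribute(unsigned long runtime)
{
    return runtime < 30 ? runtime : 30;
}

int main(void)
{
    unsigned long runtime_pool = 100;   /* models cfs_b->runtime */
    int throttled = 1;

    while (throttled && runtime_pool > 0) {
        unsigned long used = distribute(runtime_pool);

        /* Subtract what was handed out, never going below zero. */
        runtime_pool -= used < runtime_pool ? used : runtime_pool;
        if (runtime_pool <= 40)
            throttled = 0;              /* stub: everyone unthrottled */
        printf("pool now %lu\n", runtime_pool);
    }
    return 0;
}
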
@@ -3631,10 +3647,9 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b) | |||
3631 | return; | 3647 | return; |
3632 | } | 3648 | } |
3633 | 3649 | ||
3634 | if (cfs_b->quota != RUNTIME_INF && cfs_b->runtime > slice) { | 3650 | if (cfs_b->quota != RUNTIME_INF && cfs_b->runtime > slice) |
3635 | runtime = cfs_b->runtime; | 3651 | runtime = cfs_b->runtime; |
3636 | cfs_b->runtime = 0; | 3652 | |
3637 | } | ||
3638 | expires = cfs_b->runtime_expires; | 3653 | expires = cfs_b->runtime_expires; |
3639 | raw_spin_unlock(&cfs_b->lock); | 3654 | raw_spin_unlock(&cfs_b->lock); |
3640 | 3655 | ||
@@ -3645,7 +3660,7 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b) | |||
3645 | 3660 | ||
3646 | raw_spin_lock(&cfs_b->lock); | 3661 | raw_spin_lock(&cfs_b->lock); |
3647 | if (expires == cfs_b->runtime_expires) | 3662 | if (expires == cfs_b->runtime_expires) |
3648 | cfs_b->runtime = runtime; | 3663 | cfs_b->runtime -= min(runtime, cfs_b->runtime); |
3649 | raw_spin_unlock(&cfs_b->lock); | 3664 | raw_spin_unlock(&cfs_b->lock); |
3650 | } | 3665 | } |
3651 | 3666 | ||
@@ -3775,6 +3790,19 @@ static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) | |||
3775 | hrtimer_cancel(&cfs_b->slack_timer); | 3790 | hrtimer_cancel(&cfs_b->slack_timer); |
3776 | } | 3791 | } |
3777 | 3792 | ||
3793 | static void __maybe_unused update_runtime_enabled(struct rq *rq) | ||
3794 | { | ||
3795 | struct cfs_rq *cfs_rq; | ||
3796 | |||
3797 | for_each_leaf_cfs_rq(rq, cfs_rq) { | ||
3798 | struct cfs_bandwidth *cfs_b = &cfs_rq->tg->cfs_bandwidth; | ||
3799 | |||
3800 | raw_spin_lock(&cfs_b->lock); | ||
3801 | cfs_rq->runtime_enabled = cfs_b->quota != RUNTIME_INF; | ||
3802 | raw_spin_unlock(&cfs_b->lock); | ||
3803 | } | ||
3804 | } | ||
3805 | |||
3778 | static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq) | 3806 | static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq) |
3779 | { | 3807 | { |
3780 | struct cfs_rq *cfs_rq; | 3808 | struct cfs_rq *cfs_rq; |
@@ -3788,6 +3816,12 @@ static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq) | |||
3788 | * there's some valid quota amount | 3816 | * there's some valid quota amount |
3789 | */ | 3817 | */ |
3790 | cfs_rq->runtime_remaining = 1; | 3818 | cfs_rq->runtime_remaining = 1; |
3819 | /* | ||
3820 | * Offline rq is schedulable till cpu is completely disabled | ||
3821 | * in take_cpu_down(), so we prevent new cfs throttling here. | ||
3822 | */ | ||
3823 | cfs_rq->runtime_enabled = 0; | ||
3824 | |||
3791 | if (cfs_rq_throttled(cfs_rq)) | 3825 | if (cfs_rq_throttled(cfs_rq)) |
3792 | unthrottle_cfs_rq(cfs_rq); | 3826 | unthrottle_cfs_rq(cfs_rq); |
3793 | } | 3827 | } |
@@ -3831,6 +3865,7 @@ static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg) | |||
3831 | return NULL; | 3865 | return NULL; |
3832 | } | 3866 | } |
3833 | static inline void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {} | 3867 | static inline void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {} |
3868 | static inline void update_runtime_enabled(struct rq *rq) {} | ||
3834 | static inline void unthrottle_offline_cfs_rqs(struct rq *rq) {} | 3869 | static inline void unthrottle_offline_cfs_rqs(struct rq *rq) {} |
3835 | 3870 | ||
3836 | #endif /* CONFIG_CFS_BANDWIDTH */ | 3871 | #endif /* CONFIG_CFS_BANDWIDTH */ |
@@ -3854,7 +3889,7 @@ static void hrtick_start_fair(struct rq *rq, struct task_struct *p) | |||
3854 | 3889 | ||
3855 | if (delta < 0) { | 3890 | if (delta < 0) { |
3856 | if (rq->curr == p) | 3891 | if (rq->curr == p) |
3857 | resched_task(p); | 3892 | resched_curr(rq); |
3858 | return; | 3893 | return; |
3859 | } | 3894 | } |
3860 | 3895 | ||
@@ -4723,7 +4758,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_ | |||
4723 | return; | 4758 | return; |
4724 | 4759 | ||
4725 | preempt: | 4760 | preempt: |
4726 | resched_task(curr); | 4761 | resched_curr(rq); |
4727 | /* | 4762 | /* |
4728 | * Only set the backward buddy when the current task is still | 4763 | * Only set the backward buddy when the current task is still |
4729 | * on the rq. This can happen when a wakeup gets interleaved | 4764 | * on the rq. This can happen when a wakeup gets interleaved |
@@ -5094,8 +5129,7 @@ static void move_task(struct task_struct *p, struct lb_env *env) | |||
5094 | /* | 5129 | /* |
5095 | * Is this task likely cache-hot: | 5130 | * Is this task likely cache-hot: |
5096 | */ | 5131 | */ |
5097 | static int | 5132 | static int task_hot(struct task_struct *p, struct lb_env *env) |
5098 | task_hot(struct task_struct *p, u64 now) | ||
5099 | { | 5133 | { |
5100 | s64 delta; | 5134 | s64 delta; |
5101 | 5135 | ||
@@ -5108,7 +5142,7 @@ task_hot(struct task_struct *p, u64 now) | |||
5108 | /* | 5142 | /* |
5109 | * Buddy candidates are cache hot: | 5143 | * Buddy candidates are cache hot: |
5110 | */ | 5144 | */ |
5111 | if (sched_feat(CACHE_HOT_BUDDY) && this_rq()->nr_running && | 5145 | if (sched_feat(CACHE_HOT_BUDDY) && env->dst_rq->nr_running && |
5112 | (&p->se == cfs_rq_of(&p->se)->next || | 5146 | (&p->se == cfs_rq_of(&p->se)->next || |
5113 | &p->se == cfs_rq_of(&p->se)->last)) | 5147 | &p->se == cfs_rq_of(&p->se)->last)) |
5114 | return 1; | 5148 | return 1; |
@@ -5118,7 +5152,7 @@ task_hot(struct task_struct *p, u64 now) | |||
5118 | if (sysctl_sched_migration_cost == 0) | 5152 | if (sysctl_sched_migration_cost == 0) |
5119 | return 0; | 5153 | return 0; |
5120 | 5154 | ||
5121 | delta = now - p->se.exec_start; | 5155 | delta = rq_clock_task(env->src_rq) - p->se.exec_start; |
5122 | 5156 | ||
5123 | return delta < (s64)sysctl_sched_migration_cost; | 5157 | return delta < (s64)sysctl_sched_migration_cost; |
5124 | } | 5158 | } |
@@ -5272,7 +5306,7 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) | |||
5272 | * 2) task is cache cold, or | 5306 | * 2) task is cache cold, or |
5273 | * 3) too many balance attempts have failed. | 5307 | * 3) too many balance attempts have failed. |
5274 | */ | 5308 | */ |
5275 | tsk_cache_hot = task_hot(p, rq_clock_task(env->src_rq)); | 5309 | tsk_cache_hot = task_hot(p, env); |
5276 | if (!tsk_cache_hot) | 5310 | if (!tsk_cache_hot) |
5277 | tsk_cache_hot = migrate_degrades_locality(p, env); | 5311 | tsk_cache_hot = migrate_degrades_locality(p, env); |
5278 | 5312 | ||
@@ -5864,10 +5898,12 @@ static inline int sg_capacity_factor(struct lb_env *env, struct sched_group *gro | |||
5864 | * @load_idx: Load index of sched_domain of this_cpu for load calc. | 5898 | * @load_idx: Load index of sched_domain of this_cpu for load calc. |
5865 | * @local_group: Does group contain this_cpu. | 5899 | * @local_group: Does group contain this_cpu. |
5866 | * @sgs: variable to hold the statistics for this group. | 5900 | * @sgs: variable to hold the statistics for this group. |
5901 | * @overload: Indicate more than one runnable task for any CPU. | ||
5867 | */ | 5902 | */ |
5868 | static inline void update_sg_lb_stats(struct lb_env *env, | 5903 | static inline void update_sg_lb_stats(struct lb_env *env, |
5869 | struct sched_group *group, int load_idx, | 5904 | struct sched_group *group, int load_idx, |
5870 | int local_group, struct sg_lb_stats *sgs) | 5905 | int local_group, struct sg_lb_stats *sgs, |
5906 | bool *overload) | ||
5871 | { | 5907 | { |
5872 | unsigned long load; | 5908 | unsigned long load; |
5873 | int i; | 5909 | int i; |
@@ -5885,6 +5921,10 @@ static inline void update_sg_lb_stats(struct lb_env *env, | |||
5885 | 5921 | ||
5886 | sgs->group_load += load; | 5922 | sgs->group_load += load; |
5887 | sgs->sum_nr_running += rq->nr_running; | 5923 | sgs->sum_nr_running += rq->nr_running; |
5924 | |||
5925 | if (rq->nr_running > 1) | ||
5926 | *overload = true; | ||
5927 | |||
5888 | #ifdef CONFIG_NUMA_BALANCING | 5928 | #ifdef CONFIG_NUMA_BALANCING |
5889 | sgs->nr_numa_running += rq->nr_numa_running; | 5929 | sgs->nr_numa_running += rq->nr_numa_running; |
5890 | sgs->nr_preferred_running += rq->nr_preferred_running; | 5930 | sgs->nr_preferred_running += rq->nr_preferred_running; |
@@ -5995,6 +6035,7 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd | |||
5995 | struct sched_group *sg = env->sd->groups; | 6035 | struct sched_group *sg = env->sd->groups; |
5996 | struct sg_lb_stats tmp_sgs; | 6036 | struct sg_lb_stats tmp_sgs; |
5997 | int load_idx, prefer_sibling = 0; | 6037 | int load_idx, prefer_sibling = 0; |
6038 | bool overload = false; | ||
5998 | 6039 | ||
5999 | if (child && child->flags & SD_PREFER_SIBLING) | 6040 | if (child && child->flags & SD_PREFER_SIBLING) |
6000 | prefer_sibling = 1; | 6041 | prefer_sibling = 1; |
@@ -6015,7 +6056,8 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd | |||
6015 | update_group_capacity(env->sd, env->dst_cpu); | 6056 | update_group_capacity(env->sd, env->dst_cpu); |
6016 | } | 6057 | } |
6017 | 6058 | ||
6018 | update_sg_lb_stats(env, sg, load_idx, local_group, sgs); | 6059 | update_sg_lb_stats(env, sg, load_idx, local_group, sgs, |
6060 | &overload); | ||
6019 | 6061 | ||
6020 | if (local_group) | 6062 | if (local_group) |
6021 | goto next_group; | 6063 | goto next_group; |
@@ -6049,6 +6091,13 @@ next_group: | |||
6049 | 6091 | ||
6050 | if (env->sd->flags & SD_NUMA) | 6092 | if (env->sd->flags & SD_NUMA) |
6051 | env->fbq_type = fbq_classify_group(&sds->busiest_stat); | 6093 | env->fbq_type = fbq_classify_group(&sds->busiest_stat); |
6094 | |||
6095 | if (!env->sd->parent) { | ||
6096 | /* update overload indicator if we are at root domain */ | ||
6097 | if (env->dst_rq->rd->overload != overload) | ||
6098 | env->dst_rq->rd->overload = overload; | ||
6099 | } | ||
6100 | |||
6052 | } | 6101 | } |
6053 | 6102 | ||
6054 | /** | 6103 | /** |
@@ -6767,7 +6816,8 @@ static int idle_balance(struct rq *this_rq) | |||
6767 | */ | 6816 | */ |
6768 | this_rq->idle_stamp = rq_clock(this_rq); | 6817 | this_rq->idle_stamp = rq_clock(this_rq); |
6769 | 6818 | ||
6770 | if (this_rq->avg_idle < sysctl_sched_migration_cost) { | 6819 | if (this_rq->avg_idle < sysctl_sched_migration_cost || |
6820 | !this_rq->rd->overload) { | ||
6771 | rcu_read_lock(); | 6821 | rcu_read_lock(); |
6772 | sd = rcu_dereference_check_sched_domain(this_rq->sd); | 6822 | sd = rcu_dereference_check_sched_domain(this_rq->sd); |
6773 | if (sd) | 6823 | if (sd) |
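
Editor's note: the rd->overload test added to idle_balance() lets a newly idle CPU bail out cheaply when no runqueue in the root domain has more than one runnable task, since there is then nothing to pull. A toy model of the indicator follows: set when any queue reaches two runnable tasks, consulted before the expensive domain scan. Names are illustrative, and the clearing done in update_sd_lb_stats() is not modeled.

#include <stdbool.h>
#include <stdio.h>

#define NR_CPUS 4

static int nr_running[NR_CPUS];
static bool rd_overload;            /* root-domain wide indicator */

static void add_task(int cpu)
{
    if (++nr_running[cpu] >= 2)
        rd_overload = true;         /* some CPU has work to spare */
}

static void idle_balance(int cpu)
{
    if (!rd_overload) {
        printf("cpu%d: idle, but nothing to pull - skip balance\n", cpu);
        return;
    }
    printf("cpu%d: scanning sched domains for tasks to pull\n", cpu);
}

int main(void)
{
    add_task(0);    /* one task on CPU0: no overload */
    idle_balance(1);
    add_task(0);    /* second task on CPU0: overload set */
    idle_balance(1);
    return 0;
}
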
@@ -7325,6 +7375,8 @@ void trigger_load_balance(struct rq *rq) | |||
7325 | static void rq_online_fair(struct rq *rq) | 7375 | static void rq_online_fair(struct rq *rq) |
7326 | { | 7376 | { |
7327 | update_sysctl(); | 7377 | update_sysctl(); |
7378 | |||
7379 | update_runtime_enabled(rq); | ||
7328 | } | 7380 | } |
7329 | 7381 | ||
7330 | static void rq_offline_fair(struct rq *rq) | 7382 | static void rq_offline_fair(struct rq *rq) |
@@ -7398,7 +7450,7 @@ static void task_fork_fair(struct task_struct *p) | |||
7398 | * 'current' within the tree based on its new key value. | 7450 | * 'current' within the tree based on its new key value. |
7399 | */ | 7451 | */ |
7400 | swap(curr->vruntime, se->vruntime); | 7452 | swap(curr->vruntime, se->vruntime); |
7401 | resched_task(rq->curr); | 7453 | resched_curr(rq); |
7402 | } | 7454 | } |
7403 | 7455 | ||
7404 | se->vruntime -= cfs_rq->min_vruntime; | 7456 | se->vruntime -= cfs_rq->min_vruntime; |
@@ -7423,7 +7475,7 @@ prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio) | |||
7423 | */ | 7475 | */ |
7424 | if (rq->curr == p) { | 7476 | if (rq->curr == p) { |
7425 | if (p->prio > oldprio) | 7477 | if (p->prio > oldprio) |
7426 | resched_task(rq->curr); | 7478 | resched_curr(rq); |
7427 | } else | 7479 | } else |
7428 | check_preempt_curr(rq, p, 0); | 7480 | check_preempt_curr(rq, p, 0); |
7429 | } | 7481 | } |
@@ -7486,7 +7538,7 @@ static void switched_to_fair(struct rq *rq, struct task_struct *p) | |||
7486 | * if we can still preempt the current task. | 7538 | * if we can still preempt the current task. |
7487 | */ | 7539 | */ |
7488 | if (rq->curr == p) | 7540 | if (rq->curr == p) |
7489 | resched_task(rq->curr); | 7541 | resched_curr(rq); |
7490 | else | 7542 | else |
7491 | check_preempt_curr(rq, p, 0); | 7543 | check_preempt_curr(rq, p, 0); |
7492 | } | 7544 | } |
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index cf009fb0bc25..9f1608f99819 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c | |||
@@ -79,7 +79,7 @@ static void cpuidle_idle_call(void) | |||
79 | struct cpuidle_device *dev = __this_cpu_read(cpuidle_devices); | 79 | struct cpuidle_device *dev = __this_cpu_read(cpuidle_devices); |
80 | struct cpuidle_driver *drv = cpuidle_get_cpu_driver(dev); | 80 | struct cpuidle_driver *drv = cpuidle_get_cpu_driver(dev); |
81 | int next_state, entered_state; | 81 | int next_state, entered_state; |
82 | bool broadcast; | 82 | unsigned int broadcast; |
83 | 83 | ||
84 | /* | 84 | /* |
85 | * Check if the idle task must be rescheduled. If it is the | 85 | * Check if the idle task must be rescheduled. If it is the |
@@ -135,7 +135,7 @@ use_default: | |||
135 | goto exit_idle; | 135 | goto exit_idle; |
136 | } | 136 | } |
137 | 137 | ||
138 | broadcast = !!(drv->states[next_state].flags & CPUIDLE_FLAG_TIMER_STOP); | 138 | broadcast = drv->states[next_state].flags & CPUIDLE_FLAG_TIMER_STOP; |
139 | 139 | ||
140 | /* | 140 | /* |
141 | * Tell the time framework to switch to a broadcast timer | 141 | * Tell the time framework to switch to a broadcast timer |
diff --git a/kernel/sched/idle_task.c b/kernel/sched/idle_task.c index 879f2b75266a..67ad4e7f506a 100644 --- a/kernel/sched/idle_task.c +++ b/kernel/sched/idle_task.c | |||
@@ -20,7 +20,7 @@ select_task_rq_idle(struct task_struct *p, int cpu, int sd_flag, int flags) | |||
20 | */ | 20 | */ |
21 | static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int flags) | 21 | static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int flags) |
22 | { | 22 | { |
23 | resched_task(rq->idle); | 23 | resched_curr(rq); |
24 | } | 24 | } |
25 | 25 | ||
26 | static struct task_struct * | 26 | static struct task_struct * |
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index a49083192c64..5f6edca4fafd 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c | |||
@@ -463,9 +463,10 @@ static void dequeue_rt_entity(struct sched_rt_entity *rt_se); | |||
463 | static void sched_rt_rq_enqueue(struct rt_rq *rt_rq) | 463 | static void sched_rt_rq_enqueue(struct rt_rq *rt_rq) |
464 | { | 464 | { |
465 | struct task_struct *curr = rq_of_rt_rq(rt_rq)->curr; | 465 | struct task_struct *curr = rq_of_rt_rq(rt_rq)->curr; |
466 | struct rq *rq = rq_of_rt_rq(rt_rq); | ||
466 | struct sched_rt_entity *rt_se; | 467 | struct sched_rt_entity *rt_se; |
467 | 468 | ||
468 | int cpu = cpu_of(rq_of_rt_rq(rt_rq)); | 469 | int cpu = cpu_of(rq); |
469 | 470 | ||
470 | rt_se = rt_rq->tg->rt_se[cpu]; | 471 | rt_se = rt_rq->tg->rt_se[cpu]; |
471 | 472 | ||
@@ -476,7 +477,7 @@ static void sched_rt_rq_enqueue(struct rt_rq *rt_rq) | |||
476 | enqueue_rt_entity(rt_se, false); | 477 | enqueue_rt_entity(rt_se, false); |
477 | 478 | ||
478 | if (rt_rq->highest_prio.curr < curr->prio) | 479 | if (rt_rq->highest_prio.curr < curr->prio) |
479 | resched_task(curr); | 480 | resched_curr(rq); |
480 | } | 481 | } |
481 | } | 482 | } |
482 | 483 | ||
@@ -566,7 +567,7 @@ static inline void sched_rt_rq_enqueue(struct rt_rq *rt_rq) | |||
566 | return; | 567 | return; |
567 | 568 | ||
568 | enqueue_top_rt_rq(rt_rq); | 569 | enqueue_top_rt_rq(rt_rq); |
569 | resched_task(rq->curr); | 570 | resched_curr(rq); |
570 | } | 571 | } |
571 | 572 | ||
572 | static inline void sched_rt_rq_dequeue(struct rt_rq *rt_rq) | 573 | static inline void sched_rt_rq_dequeue(struct rt_rq *rt_rq) |
@@ -740,6 +741,9 @@ balanced: | |||
740 | rt_rq->rt_throttled = 0; | 741 | rt_rq->rt_throttled = 0; |
741 | raw_spin_unlock(&rt_rq->rt_runtime_lock); | 742 | raw_spin_unlock(&rt_rq->rt_runtime_lock); |
742 | raw_spin_unlock(&rt_b->rt_runtime_lock); | 743 | raw_spin_unlock(&rt_b->rt_runtime_lock); |
744 | |||
745 | /* Make rt_rq available for pick_next_task() */ | ||
746 | sched_rt_rq_enqueue(rt_rq); | ||
743 | } | 747 | } |
744 | } | 748 | } |
745 | 749 | ||
@@ -948,7 +952,7 @@ static void update_curr_rt(struct rq *rq) | |||
948 | raw_spin_lock(&rt_rq->rt_runtime_lock); | 952 | raw_spin_lock(&rt_rq->rt_runtime_lock); |
949 | rt_rq->rt_time += delta_exec; | 953 | rt_rq->rt_time += delta_exec; |
950 | if (sched_rt_runtime_exceeded(rt_rq)) | 954 | if (sched_rt_runtime_exceeded(rt_rq)) |
951 | resched_task(curr); | 955 | resched_curr(rq); |
952 | raw_spin_unlock(&rt_rq->rt_runtime_lock); | 956 | raw_spin_unlock(&rt_rq->rt_runtime_lock); |
953 | } | 957 | } |
954 | } | 958 | } |
@@ -1363,7 +1367,7 @@ static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p) | |||
1363 | * to try and push current away: | 1367 | * to try and push current away: |
1364 | */ | 1368 | */ |
1365 | requeue_task_rt(rq, p, 1); | 1369 | requeue_task_rt(rq, p, 1); |
1366 | resched_task(rq->curr); | 1370 | resched_curr(rq); |
1367 | } | 1371 | } |
1368 | 1372 | ||
1369 | #endif /* CONFIG_SMP */ | 1373 | #endif /* CONFIG_SMP */ |
@@ -1374,7 +1378,7 @@ static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p) | |||
1374 | static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p, int flags) | 1378 | static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p, int flags) |
1375 | { | 1379 | { |
1376 | if (p->prio < rq->curr->prio) { | 1380 | if (p->prio < rq->curr->prio) { |
1377 | resched_task(rq->curr); | 1381 | resched_curr(rq); |
1378 | return; | 1382 | return; |
1379 | } | 1383 | } |
1380 | 1384 | ||
@@ -1690,7 +1694,7 @@ retry: | |||
1690 | * just reschedule current. | 1694 | * just reschedule current. |
1691 | */ | 1695 | */ |
1692 | if (unlikely(next_task->prio < rq->curr->prio)) { | 1696 | if (unlikely(next_task->prio < rq->curr->prio)) { |
1693 | resched_task(rq->curr); | 1697 | resched_curr(rq); |
1694 | return 0; | 1698 | return 0; |
1695 | } | 1699 | } |
1696 | 1700 | ||
@@ -1737,7 +1741,7 @@ retry: | |||
1737 | activate_task(lowest_rq, next_task, 0); | 1741 | activate_task(lowest_rq, next_task, 0); |
1738 | ret = 1; | 1742 | ret = 1; |
1739 | 1743 | ||
1740 | resched_task(lowest_rq->curr); | 1744 | resched_curr(lowest_rq); |
1741 | 1745 | ||
1742 | double_unlock_balance(rq, lowest_rq); | 1746 | double_unlock_balance(rq, lowest_rq); |
1743 | 1747 | ||
@@ -1936,7 +1940,7 @@ static void switched_from_rt(struct rq *rq, struct task_struct *p) | |||
1936 | return; | 1940 | return; |
1937 | 1941 | ||
1938 | if (pull_rt_task(rq)) | 1942 | if (pull_rt_task(rq)) |
1939 | resched_task(rq->curr); | 1943 | resched_curr(rq); |
1940 | } | 1944 | } |
1941 | 1945 | ||
1942 | void __init init_sched_rt_class(void) | 1946 | void __init init_sched_rt_class(void) |
@@ -1974,7 +1978,7 @@ static void switched_to_rt(struct rq *rq, struct task_struct *p) | |||
1974 | check_resched = 0; | 1978 | check_resched = 0; |
1975 | #endif /* CONFIG_SMP */ | 1979 | #endif /* CONFIG_SMP */ |
1976 | if (check_resched && p->prio < rq->curr->prio) | 1980 | if (check_resched && p->prio < rq->curr->prio) |
1977 | resched_task(rq->curr); | 1981 | resched_curr(rq); |
1978 | } | 1982 | } |
1979 | } | 1983 | } |
1980 | 1984 | ||
@@ -2003,11 +2007,11 @@ prio_changed_rt(struct rq *rq, struct task_struct *p, int oldprio) | |||
2003 | * Only reschedule if p is still on the same runqueue. | 2007 | * Only reschedule if p is still on the same runqueue. |
2004 | */ | 2008 | */ |
2005 | if (p->prio > rq->rt.highest_prio.curr && rq->curr == p) | 2009 | if (p->prio > rq->rt.highest_prio.curr && rq->curr == p) |
2006 | resched_task(p); | 2010 | resched_curr(rq); |
2007 | #else | 2011 | #else |
2008 | /* For UP simply resched on drop of prio */ | 2012 | /* For UP simply resched on drop of prio */ |
2009 | if (oldprio < p->prio) | 2013 | if (oldprio < p->prio) |
2010 | resched_task(p); | 2014 | resched_curr(rq); |
2011 | #endif /* CONFIG_SMP */ | 2015 | #endif /* CONFIG_SMP */ |
2012 | } else { | 2016 | } else { |
2013 | /* | 2017 | /* |
@@ -2016,7 +2020,7 @@ prio_changed_rt(struct rq *rq, struct task_struct *p, int oldprio) | |||
2016 | * then reschedule. | 2020 | * then reschedule. |
2017 | */ | 2021 | */ |
2018 | if (p->prio < rq->curr->prio) | 2022 | if (p->prio < rq->curr->prio) |
2019 | resched_task(rq->curr); | 2023 | resched_curr(rq); |
2020 | } | 2024 | } |
2021 | } | 2025 | } |
2022 | 2026 | ||
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 31cc02ebc54e..579712f4e9d5 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h | |||
@@ -477,6 +477,9 @@ struct root_domain { | |||
477 | cpumask_var_t span; | 477 | cpumask_var_t span; |
478 | cpumask_var_t online; | 478 | cpumask_var_t online; |
479 | 479 | ||
480 | /* Indicate more than one runnable task for any CPU */ | ||
481 | bool overload; | ||
482 | |||
480 | /* | 483 | /* |
481 | * The bit corresponding to a CPU gets set here if such CPU has more | 484 | * The bit corresponding to a CPU gets set here if such CPU has more |
482 | * than one runnable -deadline task (as it is below for RT tasks). | 485 | * than one runnable -deadline task (as it is below for RT tasks). |
@@ -884,20 +887,10 @@ enum { | |||
884 | #undef SCHED_FEAT | 887 | #undef SCHED_FEAT |
885 | 888 | ||
886 | #if defined(CONFIG_SCHED_DEBUG) && defined(HAVE_JUMP_LABEL) | 889 | #if defined(CONFIG_SCHED_DEBUG) && defined(HAVE_JUMP_LABEL) |
887 | static __always_inline bool static_branch__true(struct static_key *key) | ||
888 | { | ||
889 | return static_key_true(key); /* Not out of line branch. */ | ||
890 | } | ||
891 | |||
892 | static __always_inline bool static_branch__false(struct static_key *key) | ||
893 | { | ||
894 | return static_key_false(key); /* Out of line branch. */ | ||
895 | } | ||
896 | |||
897 | #define SCHED_FEAT(name, enabled) \ | 890 | #define SCHED_FEAT(name, enabled) \ |
898 | static __always_inline bool static_branch_##name(struct static_key *key) \ | 891 | static __always_inline bool static_branch_##name(struct static_key *key) \ |
899 | { \ | 892 | { \ |
900 | return static_branch__##enabled(key); \ | 893 | return static_key_##enabled(key); \ |
901 | } | 894 | } |
902 | 895 | ||
903 | #include "features.h" | 896 | #include "features.h" |
@@ -1196,7 +1189,7 @@ extern void init_sched_rt_class(void); | |||
1196 | extern void init_sched_fair_class(void); | 1189 | extern void init_sched_fair_class(void); |
1197 | extern void init_sched_dl_class(void); | 1190 | extern void init_sched_dl_class(void); |
1198 | 1191 | ||
1199 | extern void resched_task(struct task_struct *p); | 1192 | extern void resched_curr(struct rq *rq); |
1200 | extern void resched_cpu(int cpu); | 1193 | extern void resched_cpu(int cpu); |
1201 | 1194 | ||
1202 | extern struct rt_bandwidth def_rt_bandwidth; | 1195 | extern struct rt_bandwidth def_rt_bandwidth; |
@@ -1218,15 +1211,26 @@ static inline void add_nr_running(struct rq *rq, unsigned count) | |||
1218 | 1211 | ||
1219 | rq->nr_running = prev_nr + count; | 1212 | rq->nr_running = prev_nr + count; |
1220 | 1213 | ||
1221 | #ifdef CONFIG_NO_HZ_FULL | ||
1222 | if (prev_nr < 2 && rq->nr_running >= 2) { | 1214 | if (prev_nr < 2 && rq->nr_running >= 2) { |
1215 | #ifdef CONFIG_SMP | ||
1216 | if (!rq->rd->overload) | ||
1217 | rq->rd->overload = true; | ||
1218 | #endif | ||
1219 | |||
1220 | #ifdef CONFIG_NO_HZ_FULL | ||
1223 | if (tick_nohz_full_cpu(rq->cpu)) { | 1221 | if (tick_nohz_full_cpu(rq->cpu)) { |
1224 | /* Order rq->nr_running write against the IPI */ | 1222 | /* |
1225 | smp_wmb(); | 1223 | * Tick is needed if more than one task runs on a CPU. |
1226 | smp_send_reschedule(rq->cpu); | 1224 | * Send the target an IPI to kick it out of nohz mode. |
1225 | * | ||
1226 | * We assume that IPI implies full memory barrier and the | ||
1227 | * new value of rq->nr_running is visible on reception | ||
1228 | * from the target. | ||
1229 | */ | ||
1230 | tick_nohz_full_kick_cpu(rq->cpu); | ||
1227 | } | 1231 | } |
1228 | } | ||
1229 | #endif | 1232 | #endif |
1233 | } | ||
1230 | } | 1234 | } |
1231 | 1235 | ||
1232 | static inline void sub_nr_running(struct rq *rq, unsigned count) | 1236 | static inline void sub_nr_running(struct rq *rq, unsigned count) |
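In the sched.h hunk, add_nr_running() now does two things at the moment a runqueue goes from one to two runnable tasks: it marks the root domain as overloaded so idle balancing knows there is work to pull, and on a nohz_full CPU it kicks the target so the tick is restarted. A simplified, self-contained model of that threshold logic (plain C structs, with a boolean standing in for the nohz state rather than the real tick_nohz_full machinery):

#include <stdbool.h>
#include <stdio.h>

struct root_domain { bool overload; };
struct rq {
	unsigned int nr_running;
	struct root_domain *rd;
	bool tick_stopped;               /* stand-in for nohz_full state */
};

static void kick_cpu(struct rq *rq)
{
	/* stand-in for tick_nohz_full_kick_cpu(): restart the tick remotely */
	rq->tick_stopped = false;
}

static void add_nr_running(struct rq *rq, unsigned int count)
{
	unsigned int prev_nr = rq->nr_running;

	rq->nr_running = prev_nr + count;

	if (prev_nr < 2 && rq->nr_running >= 2) {
		if (!rq->rd->overload)
			rq->rd->overload = true;  /* more than one runnable task */
		if (rq->tick_stopped)
			kick_cpu(rq);             /* the tick is needed again */
	}
}

int main(void)
{
	struct root_domain rd = { false };
	struct rq rq = { 1, &rd, true };

	add_nr_running(&rq, 1);
	printf("overload=%d tick_stopped=%d\n", rd.overload, rq.tick_stopped);
	return 0;
}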
diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c index 0ffa20ae657b..15cab1a4f84e 100644 --- a/kernel/sched/wait.c +++ b/kernel/sched/wait.c | |||
@@ -319,14 +319,14 @@ EXPORT_SYMBOL(wake_bit_function); | |||
319 | */ | 319 | */ |
320 | int __sched | 320 | int __sched |
321 | __wait_on_bit(wait_queue_head_t *wq, struct wait_bit_queue *q, | 321 | __wait_on_bit(wait_queue_head_t *wq, struct wait_bit_queue *q, |
322 | int (*action)(void *), unsigned mode) | 322 | wait_bit_action_f *action, unsigned mode) |
323 | { | 323 | { |
324 | int ret = 0; | 324 | int ret = 0; |
325 | 325 | ||
326 | do { | 326 | do { |
327 | prepare_to_wait(wq, &q->wait, mode); | 327 | prepare_to_wait(wq, &q->wait, mode); |
328 | if (test_bit(q->key.bit_nr, q->key.flags)) | 328 | if (test_bit(q->key.bit_nr, q->key.flags)) |
329 | ret = (*action)(q->key.flags); | 329 | ret = (*action)(&q->key); |
330 | } while (test_bit(q->key.bit_nr, q->key.flags) && !ret); | 330 | } while (test_bit(q->key.bit_nr, q->key.flags) && !ret); |
331 | finish_wait(wq, &q->wait); | 331 | finish_wait(wq, &q->wait); |
332 | return ret; | 332 | return ret; |
@@ -334,7 +334,7 @@ __wait_on_bit(wait_queue_head_t *wq, struct wait_bit_queue *q, | |||
334 | EXPORT_SYMBOL(__wait_on_bit); | 334 | EXPORT_SYMBOL(__wait_on_bit); |
335 | 335 | ||
336 | int __sched out_of_line_wait_on_bit(void *word, int bit, | 336 | int __sched out_of_line_wait_on_bit(void *word, int bit, |
337 | int (*action)(void *), unsigned mode) | 337 | wait_bit_action_f *action, unsigned mode) |
338 | { | 338 | { |
339 | wait_queue_head_t *wq = bit_waitqueue(word, bit); | 339 | wait_queue_head_t *wq = bit_waitqueue(word, bit); |
340 | DEFINE_WAIT_BIT(wait, word, bit); | 340 | DEFINE_WAIT_BIT(wait, word, bit); |
@@ -345,7 +345,7 @@ EXPORT_SYMBOL(out_of_line_wait_on_bit); | |||
345 | 345 | ||
346 | int __sched | 346 | int __sched |
347 | __wait_on_bit_lock(wait_queue_head_t *wq, struct wait_bit_queue *q, | 347 | __wait_on_bit_lock(wait_queue_head_t *wq, struct wait_bit_queue *q, |
348 | int (*action)(void *), unsigned mode) | 348 | wait_bit_action_f *action, unsigned mode) |
349 | { | 349 | { |
350 | do { | 350 | do { |
351 | int ret; | 351 | int ret; |
@@ -353,7 +353,7 @@ __wait_on_bit_lock(wait_queue_head_t *wq, struct wait_bit_queue *q, | |||
353 | prepare_to_wait_exclusive(wq, &q->wait, mode); | 353 | prepare_to_wait_exclusive(wq, &q->wait, mode); |
354 | if (!test_bit(q->key.bit_nr, q->key.flags)) | 354 | if (!test_bit(q->key.bit_nr, q->key.flags)) |
355 | continue; | 355 | continue; |
356 | ret = action(q->key.flags); | 356 | ret = action(&q->key); |
357 | if (!ret) | 357 | if (!ret) |
358 | continue; | 358 | continue; |
359 | abort_exclusive_wait(wq, &q->wait, mode, &q->key); | 359 | abort_exclusive_wait(wq, &q->wait, mode, &q->key); |
@@ -365,7 +365,7 @@ __wait_on_bit_lock(wait_queue_head_t *wq, struct wait_bit_queue *q, | |||
365 | EXPORT_SYMBOL(__wait_on_bit_lock); | 365 | EXPORT_SYMBOL(__wait_on_bit_lock); |
366 | 366 | ||
367 | int __sched out_of_line_wait_on_bit_lock(void *word, int bit, | 367 | int __sched out_of_line_wait_on_bit_lock(void *word, int bit, |
368 | int (*action)(void *), unsigned mode) | 368 | wait_bit_action_f *action, unsigned mode) |
369 | { | 369 | { |
370 | wait_queue_head_t *wq = bit_waitqueue(word, bit); | 370 | wait_queue_head_t *wq = bit_waitqueue(word, bit); |
371 | DEFINE_WAIT_BIT(wait, word, bit); | 371 | DEFINE_WAIT_BIT(wait, word, bit); |
@@ -502,3 +502,21 @@ void wake_up_atomic_t(atomic_t *p) | |||
502 | __wake_up_bit(atomic_t_waitqueue(p), p, WAIT_ATOMIC_T_BIT_NR); | 502 | __wake_up_bit(atomic_t_waitqueue(p), p, WAIT_ATOMIC_T_BIT_NR); |
503 | } | 503 | } |
504 | EXPORT_SYMBOL(wake_up_atomic_t); | 504 | EXPORT_SYMBOL(wake_up_atomic_t); |
505 | |||
506 | __sched int bit_wait(struct wait_bit_key *word) | ||
507 | { | ||
508 | if (signal_pending_state(current->state, current)) | ||
509 | return 1; | ||
510 | schedule(); | ||
511 | return 0; | ||
512 | } | ||
513 | EXPORT_SYMBOL(bit_wait); | ||
514 | |||
515 | __sched int bit_wait_io(struct wait_bit_key *word) | ||
516 | { | ||
517 | if (signal_pending_state(current->state, current)) | ||
518 | return 1; | ||
519 | io_schedule(); | ||
520 | return 0; | ||
521 | } | ||
522 | EXPORT_SYMBOL(bit_wait_io); | ||
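The wait.c changes switch the bit-wait action callback from int (*)(void *) to the wait_bit_action_f type, which receives the whole struct wait_bit_key instead of just the flags word, and add the stock actions bit_wait() and bit_wait_io(). A rough userspace model of the new callback shape; the key struct below is a stand-in, not the kernel definition:

#include <stdio.h>

struct wait_bit_key {
	void *flags;        /* word containing the bit */
	int   bit_nr;       /* which bit we are waiting on */
};

/* New-style action: sees the whole key, not just the flags word. */
typedef int (wait_bit_action_f)(struct wait_bit_key *key);

static int my_bit_wait(struct wait_bit_key *key)
{
	printf("waiting on bit %d of word %p\n", key->bit_nr, key->flags);
	return 0;           /* 0 = keep waiting, nonzero = abort (e.g. signal) */
}

static int wait_on_bit_sketch(void *word, int bit, wait_bit_action_f *action)
{
	struct wait_bit_key key = { word, bit };

	return action(&key);        /* old code passed only key.flags */
}

int main(void)
{
	unsigned long word = 0;

	return wait_on_bit_sketch(&word, 3, my_bit_wait);
}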
diff --git a/kernel/signal.c b/kernel/signal.c index a4077e90f19f..40b76e351e64 100644 --- a/kernel/signal.c +++ b/kernel/signal.c | |||
@@ -1263,6 +1263,10 @@ struct sighand_struct *__lock_task_sighand(struct task_struct *tsk, | |||
1263 | struct sighand_struct *sighand; | 1263 | struct sighand_struct *sighand; |
1264 | 1264 | ||
1265 | for (;;) { | 1265 | for (;;) { |
1266 | /* | ||
1267 | * Disable interrupts early to avoid deadlocks. | ||
1268 | * See rcu_read_unlock() comment header for details. | ||
1269 | */ | ||
1266 | local_irq_save(*flags); | 1270 | local_irq_save(*flags); |
1267 | rcu_read_lock(); | 1271 | rcu_read_lock(); |
1268 | sighand = rcu_dereference(tsk->sighand); | 1272 | sighand = rcu_dereference(tsk->sighand); |
diff --git a/kernel/smp.c b/kernel/smp.c index 80c33f8de14f..487653b5844f 100644 --- a/kernel/smp.c +++ b/kernel/smp.c | |||
@@ -3,6 +3,7 @@ | |||
3 | * | 3 | * |
4 | * (C) Jens Axboe <jens.axboe@oracle.com> 2008 | 4 | * (C) Jens Axboe <jens.axboe@oracle.com> 2008 |
5 | */ | 5 | */ |
6 | #include <linux/irq_work.h> | ||
6 | #include <linux/rcupdate.h> | 7 | #include <linux/rcupdate.h> |
7 | #include <linux/rculist.h> | 8 | #include <linux/rculist.h> |
8 | #include <linux/kernel.h> | 9 | #include <linux/kernel.h> |
@@ -251,6 +252,14 @@ static void flush_smp_call_function_queue(bool warn_cpu_offline) | |||
251 | csd->func(csd->info); | 252 | csd->func(csd->info); |
252 | csd_unlock(csd); | 253 | csd_unlock(csd); |
253 | } | 254 | } |
255 | |||
256 | /* | ||
257 | * Handle irq works queued remotely by irq_work_queue_on(). | ||
258 | * Smp functions above are typically synchronous so they | ||
259 | * better run first since some other CPUs may be busy waiting | ||
260 | * for them. | ||
261 | */ | ||
262 | irq_work_run(); | ||
254 | } | 263 | } |
255 | 264 | ||
256 | /* | 265 | /* |
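The smp.c hunk runs irq_work_run() after the queued smp_call_function callbacks: those callbacks are typically synchronous, so their senders may be spin-waiting on csd_unlock() and should be serviced before work queued with irq_work_queue_on(). A toy model of that ordering, with plain function-pointer arrays standing in for the kernel's llist queues:

#include <stdio.h>

#define N 4

static void (*csd_queue[N])(void);      /* synchronous cross-CPU calls */
static void (*deferred_work[N])(void);  /* work queued via irq_work_queue_on() */
static int ncsd, nwork;

static void flush_queues(void)
{
	int i;

	/* Run the synchronous calls first: their senders may be spin-waiting. */
	for (i = 0; i < ncsd; i++)
		csd_queue[i]();

	/* Then handle the irq work queued remotely. */
	for (i = 0; i < nwork; i++)
		deferred_work[i]();
}

static void csd_fn(void)  { printf("csd callback\n"); }
static void work_fn(void) { printf("irq work\n"); }

int main(void)
{
	csd_queue[ncsd++] = csd_fn;
	deferred_work[nwork++] = work_fn;
	flush_queues();
	return 0;
}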
diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c index 88c9c65a430d..4aec4a457431 100644 --- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c | |||
@@ -71,7 +71,7 @@ struct rtc_device *alarmtimer_get_rtcdev(void) | |||
71 | 71 | ||
72 | return ret; | 72 | return ret; |
73 | } | 73 | } |
74 | 74 | EXPORT_SYMBOL_GPL(alarmtimer_get_rtcdev); | |
75 | 75 | ||
76 | static int alarmtimer_rtc_add_device(struct device *dev, | 76 | static int alarmtimer_rtc_add_device(struct device *dev, |
77 | struct class_interface *class_intf) | 77 | struct class_interface *class_intf) |
@@ -585,9 +585,14 @@ static int alarm_timer_set(struct k_itimer *timr, int flags, | |||
585 | struct itimerspec *new_setting, | 585 | struct itimerspec *new_setting, |
586 | struct itimerspec *old_setting) | 586 | struct itimerspec *old_setting) |
587 | { | 587 | { |
588 | ktime_t exp; | ||
589 | |||
588 | if (!rtcdev) | 590 | if (!rtcdev) |
589 | return -ENOTSUPP; | 591 | return -ENOTSUPP; |
590 | 592 | ||
593 | if (flags & ~TIMER_ABSTIME) | ||
594 | return -EINVAL; | ||
595 | |||
591 | if (old_setting) | 596 | if (old_setting) |
592 | alarm_timer_get(timr, old_setting); | 597 | alarm_timer_get(timr, old_setting); |
593 | 598 | ||
@@ -597,8 +602,16 @@ static int alarm_timer_set(struct k_itimer *timr, int flags, | |||
597 | 602 | ||
598 | /* start the timer */ | 603 | /* start the timer */ |
599 | timr->it.alarm.interval = timespec_to_ktime(new_setting->it_interval); | 604 | timr->it.alarm.interval = timespec_to_ktime(new_setting->it_interval); |
600 | alarm_start(&timr->it.alarm.alarmtimer, | 605 | exp = timespec_to_ktime(new_setting->it_value); |
601 | timespec_to_ktime(new_setting->it_value)); | 606 | /* Convert (if necessary) to absolute time */ |
607 | if (flags != TIMER_ABSTIME) { | ||
608 | ktime_t now; | ||
609 | |||
610 | now = alarm_bases[timr->it.alarm.alarmtimer.type].gettime(); | ||
611 | exp = ktime_add(now, exp); | ||
612 | } | ||
613 | |||
614 | alarm_start(&timr->it.alarm.alarmtimer, exp); | ||
602 | return 0; | 615 | return 0; |
603 | } | 616 | } |
604 | 617 | ||
@@ -730,6 +743,9 @@ static int alarm_timer_nsleep(const clockid_t which_clock, int flags, | |||
730 | if (!alarmtimer_get_rtcdev()) | 743 | if (!alarmtimer_get_rtcdev()) |
731 | return -ENOTSUPP; | 744 | return -ENOTSUPP; |
732 | 745 | ||
746 | if (flags & ~TIMER_ABSTIME) | ||
747 | return -EINVAL; | ||
748 | |||
733 | if (!capable(CAP_WAKE_ALARM)) | 749 | if (!capable(CAP_WAKE_ALARM)) |
734 | return -EPERM; | 750 | return -EPERM; |
735 | 751 | ||
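alarm_timer_set() and alarm_timer_nsleep() now reject any flag other than TIMER_ABSTIME and, for relative timers, convert the requested expiry to absolute time by adding the current reading of the matching alarm base. A standalone sketch of that validation and conversion, with a signed 64-bit integer standing in for ktime_t and a fixed fake base time:

#include <stdint.h>
#include <stdio.h>

#define TIMER_ABSTIME 0x01

typedef int64_t ktime_ns;            /* stand-in for ktime_t, in nanoseconds */

static ktime_ns base_now(void)
{
	return 1000000000LL;         /* pretend the alarm base reads 1s */
}

/* Returns the absolute expiry, or -1 for invalid flags. */
static ktime_ns alarm_expiry(int flags, ktime_ns value)
{
	if (flags & ~TIMER_ABSTIME)
		return -1;                      /* reject unknown flags */

	if (flags != TIMER_ABSTIME)
		value += base_now();            /* relative: convert to absolute */

	return value;
}

int main(void)
{
	printf("abs: %lld\n", (long long)alarm_expiry(TIMER_ABSTIME, 5000000000LL));
	printf("rel: %lld\n", (long long)alarm_expiry(0, 5000000000LL));
	printf("bad: %lld\n", (long long)alarm_expiry(0x80, 5000000000LL));
	return 0;
}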
diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c index ad362c260ef4..9c94c19f1305 100644 --- a/kernel/time/clockevents.c +++ b/kernel/time/clockevents.c | |||
@@ -146,7 +146,8 @@ static int clockevents_increase_min_delta(struct clock_event_device *dev) | |||
146 | { | 146 | { |
147 | /* Nothing to do if we already reached the limit */ | 147 | /* Nothing to do if we already reached the limit */ |
148 | if (dev->min_delta_ns >= MIN_DELTA_LIMIT) { | 148 | if (dev->min_delta_ns >= MIN_DELTA_LIMIT) { |
149 | printk(KERN_WARNING "CE: Reprogramming failure. Giving up\n"); | 149 | printk_deferred(KERN_WARNING |
150 | "CE: Reprogramming failure. Giving up\n"); | ||
150 | dev->next_event.tv64 = KTIME_MAX; | 151 | dev->next_event.tv64 = KTIME_MAX; |
151 | return -ETIME; | 152 | return -ETIME; |
152 | } | 153 | } |
@@ -159,9 +160,10 @@ static int clockevents_increase_min_delta(struct clock_event_device *dev) | |||
159 | if (dev->min_delta_ns > MIN_DELTA_LIMIT) | 160 | if (dev->min_delta_ns > MIN_DELTA_LIMIT) |
160 | dev->min_delta_ns = MIN_DELTA_LIMIT; | 161 | dev->min_delta_ns = MIN_DELTA_LIMIT; |
161 | 162 | ||
162 | printk(KERN_WARNING "CE: %s increased min_delta_ns to %llu nsec\n", | 163 | printk_deferred(KERN_WARNING |
163 | dev->name ? dev->name : "?", | 164 | "CE: %s increased min_delta_ns to %llu nsec\n", |
164 | (unsigned long long) dev->min_delta_ns); | 165 | dev->name ? dev->name : "?", |
166 | (unsigned long long) dev->min_delta_ns); | ||
165 | return 0; | 167 | return 0; |
166 | } | 168 | } |
167 | 169 | ||
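The clockevents warnings move from printk() to printk_deferred() because these paths can run with timekeeping or runqueue locks held; printk_deferred() only records the message and lets it reach the console later from a safe context. The sketch below is a loose userspace model of that record-now, flush-later idea, not the kernel's implementation:

#include <stdarg.h>
#include <stdio.h>

static char deferred_buf[4][128];
static int ndeferred;

/* Model of printk_deferred(): record the message now, push it to the
 * console later from a context where that is safe. */
static void log_deferred(const char *fmt, ...)
{
	va_list ap;

	if (ndeferred >= 4)
		return;
	va_start(ap, fmt);
	vsnprintf(deferred_buf[ndeferred++], sizeof(deferred_buf[0]), fmt, ap);
	va_end(ap);
}

static void flush_deferred(void)
{
	int i;

	for (i = 0; i < ndeferred; i++)
		fputs(deferred_buf[i], stderr);
	ndeferred = 0;
}

int main(void)
{
	/* "holding a lock": only record the warning */
	log_deferred("CE: %s increased min_delta_ns to %llu nsec\n",
		     "dummy", 1000ULL);
	/* "lock dropped": safe to hit the console */
	flush_deferred();
	return 0;
}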
diff --git a/kernel/time/sched_clock.c b/kernel/time/sched_clock.c index 445106d2c729..01d2d15aa662 100644 --- a/kernel/time/sched_clock.c +++ b/kernel/time/sched_clock.c | |||
@@ -191,7 +191,8 @@ void __init sched_clock_postinit(void) | |||
191 | 191 | ||
192 | static int sched_clock_suspend(void) | 192 | static int sched_clock_suspend(void) |
193 | { | 193 | { |
194 | sched_clock_poll(&sched_clock_timer); | 194 | update_sched_clock(); |
195 | hrtimer_cancel(&sched_clock_timer); | ||
195 | cd.suspended = true; | 196 | cd.suspended = true; |
196 | return 0; | 197 | return 0; |
197 | } | 198 | } |
@@ -199,6 +200,7 @@ static int sched_clock_suspend(void) | |||
199 | static void sched_clock_resume(void) | 200 | static void sched_clock_resume(void) |
200 | { | 201 | { |
201 | cd.epoch_cyc = read_sched_clock(); | 202 | cd.epoch_cyc = read_sched_clock(); |
203 | hrtimer_start(&sched_clock_timer, cd.wrap_kt, HRTIMER_MODE_REL); | ||
202 | cd.suspended = false; | 204 | cd.suspended = false; |
203 | } | 205 | } |
204 | 206 | ||
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 6558b7ac112d..99aa6ee3908f 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c | |||
@@ -154,6 +154,7 @@ static void tick_sched_handle(struct tick_sched *ts, struct pt_regs *regs) | |||
154 | 154 | ||
155 | #ifdef CONFIG_NO_HZ_FULL | 155 | #ifdef CONFIG_NO_HZ_FULL |
156 | cpumask_var_t tick_nohz_full_mask; | 156 | cpumask_var_t tick_nohz_full_mask; |
157 | cpumask_var_t housekeeping_mask; | ||
157 | bool tick_nohz_full_running; | 158 | bool tick_nohz_full_running; |
158 | 159 | ||
159 | static bool can_stop_full_tick(void) | 160 | static bool can_stop_full_tick(void) |
@@ -224,13 +225,15 @@ static DEFINE_PER_CPU(struct irq_work, nohz_full_kick_work) = { | |||
224 | }; | 225 | }; |
225 | 226 | ||
226 | /* | 227 | /* |
227 | * Kick the current CPU if it's full dynticks in order to force it to | 228 | * Kick the CPU if it's full dynticks in order to force it to |
228 | * re-evaluate its dependency on the tick and restart it if necessary. | 229 | * re-evaluate its dependency on the tick and restart it if necessary. |
229 | */ | 230 | */ |
230 | void tick_nohz_full_kick(void) | 231 | void tick_nohz_full_kick_cpu(int cpu) |
231 | { | 232 | { |
232 | if (tick_nohz_full_cpu(smp_processor_id())) | 233 | if (!tick_nohz_full_cpu(cpu)) |
233 | irq_work_queue(&__get_cpu_var(nohz_full_kick_work)); | 234 | return; |
235 | |||
236 | irq_work_queue_on(&per_cpu(nohz_full_kick_work, cpu), cpu); | ||
234 | } | 237 | } |
235 | 238 | ||
236 | static void nohz_full_kick_ipi(void *info) | 239 | static void nohz_full_kick_ipi(void *info) |
@@ -281,6 +284,7 @@ static int __init tick_nohz_full_setup(char *str) | |||
281 | int cpu; | 284 | int cpu; |
282 | 285 | ||
283 | alloc_bootmem_cpumask_var(&tick_nohz_full_mask); | 286 | alloc_bootmem_cpumask_var(&tick_nohz_full_mask); |
287 | alloc_bootmem_cpumask_var(&housekeeping_mask); | ||
284 | if (cpulist_parse(str, tick_nohz_full_mask) < 0) { | 288 | if (cpulist_parse(str, tick_nohz_full_mask) < 0) { |
285 | pr_warning("NOHZ: Incorrect nohz_full cpumask\n"); | 289 | pr_warning("NOHZ: Incorrect nohz_full cpumask\n"); |
286 | return 1; | 290 | return 1; |
@@ -291,6 +295,8 @@ static int __init tick_nohz_full_setup(char *str) | |||
291 | pr_warning("NO_HZ: Clearing %d from nohz_full range for timekeeping\n", cpu); | 295 | pr_warning("NO_HZ: Clearing %d from nohz_full range for timekeeping\n", cpu); |
292 | cpumask_clear_cpu(cpu, tick_nohz_full_mask); | 296 | cpumask_clear_cpu(cpu, tick_nohz_full_mask); |
293 | } | 297 | } |
298 | cpumask_andnot(housekeeping_mask, | ||
299 | cpu_possible_mask, tick_nohz_full_mask); | ||
294 | tick_nohz_full_running = true; | 300 | tick_nohz_full_running = true; |
295 | 301 | ||
296 | return 1; | 302 | return 1; |
@@ -332,9 +338,15 @@ static int tick_nohz_init_all(void) | |||
332 | pr_err("NO_HZ: Can't allocate full dynticks cpumask\n"); | 338 | pr_err("NO_HZ: Can't allocate full dynticks cpumask\n"); |
333 | return err; | 339 | return err; |
334 | } | 340 | } |
341 | if (!alloc_cpumask_var(&housekeeping_mask, GFP_KERNEL)) { | ||
342 | pr_err("NO_HZ: Can't allocate not-full dynticks cpumask\n"); | ||
343 | return err; | ||
344 | } | ||
335 | err = 0; | 345 | err = 0; |
336 | cpumask_setall(tick_nohz_full_mask); | 346 | cpumask_setall(tick_nohz_full_mask); |
337 | cpumask_clear_cpu(smp_processor_id(), tick_nohz_full_mask); | 347 | cpumask_clear_cpu(smp_processor_id(), tick_nohz_full_mask); |
348 | cpumask_clear(housekeeping_mask); | ||
349 | cpumask_set_cpu(smp_processor_id(), housekeeping_mask); | ||
338 | tick_nohz_full_running = true; | 350 | tick_nohz_full_running = true; |
339 | #endif | 351 | #endif |
340 | return err; | 352 | return err; |
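tick_nohz_full_kick() becomes tick_nohz_full_kick_cpu(int cpu), using irq_work_queue_on() so any nohz_full CPU can be kicked remotely, and a housekeeping_mask is maintained as the complement of tick_nohz_full_mask among the possible CPUs. A small bitmask sketch of that bookkeeping (eight CPUs and plain unsigned masks, purely illustrative):

#include <stdio.h>

#define NR_CPUS 8u

static unsigned int possible_mask = (1u << NR_CPUS) - 1u;
static unsigned int nohz_full_mask;
static unsigned int housekeeping_mask;

static int tick_nohz_full_cpu(int cpu)
{
	return !!(nohz_full_mask & (1u << cpu));
}

static void kick_cpu(int cpu)
{
	/* stand-in for irq_work_queue_on(&nohz_full_kick_work, cpu) */
	if (!tick_nohz_full_cpu(cpu))
		return;
	printf("kick cpu %d to re-evaluate its tick\n", cpu);
}

int main(void)
{
	nohz_full_mask = 0x0eu;                          /* CPUs 1-3 run tickless */
	housekeeping_mask = possible_mask & ~nohz_full_mask;

	kick_cpu(2);                                     /* kicked */
	kick_cpu(0);                                     /* housekeeping: no-op */
	printf("housekeeping mask: 0x%02x\n", housekeeping_mask);
	return 0;
}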
diff --git a/kernel/torture.c b/kernel/torture.c index 40bb511cca48..d600af21f022 100644 --- a/kernel/torture.c +++ b/kernel/torture.c | |||
@@ -708,7 +708,7 @@ int _torture_create_kthread(int (*fn)(void *arg), void *arg, char *s, char *m, | |||
708 | int ret = 0; | 708 | int ret = 0; |
709 | 709 | ||
710 | VERBOSE_TOROUT_STRING(m); | 710 | VERBOSE_TOROUT_STRING(m); |
711 | *tp = kthread_run(fn, arg, s); | 711 | *tp = kthread_run(fn, arg, "%s", s); |
712 | if (IS_ERR(*tp)) { | 712 | if (IS_ERR(*tp)) { |
713 | ret = PTR_ERR(*tp); | 713 | ret = PTR_ERR(*tp); |
714 | VERBOSE_TOROUT_ERRSTRING(f); | 714 | VERBOSE_TOROUT_ERRSTRING(f); |
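The torture.c fix passes the thread name through an explicit "%s" format instead of letting the caller-supplied string act as the format, so a name containing '%' cannot be parsed as conversion specifiers. The same rule applies to any printf-style API; the thread name below is made up for illustration:

#include <stdio.h>

int main(void)
{
	const char *name = "torture_100%_cpu";   /* hypothetical thread name */

	/* Wrong: the string becomes the format; '%' sequences are interpreted. */
	/* printf(name); */

	/* Right: pass it as an argument to a fixed "%s" format. */
	printf("%s\n", name);
	return 0;
}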
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index d4409356f40d..a5da09c899dd 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig | |||
@@ -29,11 +29,6 @@ config HAVE_FUNCTION_GRAPH_FP_TEST | |||
29 | help | 29 | help |
30 | See Documentation/trace/ftrace-design.txt | 30 | See Documentation/trace/ftrace-design.txt |
31 | 31 | ||
32 | config HAVE_FUNCTION_TRACE_MCOUNT_TEST | ||
33 | bool | ||
34 | help | ||
35 | See Documentation/trace/ftrace-design.txt | ||
36 | |||
37 | config HAVE_DYNAMIC_FTRACE | 32 | config HAVE_DYNAMIC_FTRACE |
38 | bool | 33 | bool |
39 | help | 34 | help |
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index 2611613f14f1..67d6369ddf83 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile | |||
@@ -28,6 +28,7 @@ obj-$(CONFIG_RING_BUFFER_BENCHMARK) += ring_buffer_benchmark.o | |||
28 | 28 | ||
29 | obj-$(CONFIG_TRACING) += trace.o | 29 | obj-$(CONFIG_TRACING) += trace.o |
30 | obj-$(CONFIG_TRACING) += trace_output.o | 30 | obj-$(CONFIG_TRACING) += trace_output.o |
31 | obj-$(CONFIG_TRACING) += trace_seq.o | ||
31 | obj-$(CONFIG_TRACING) += trace_stat.o | 32 | obj-$(CONFIG_TRACING) += trace_stat.o |
32 | obj-$(CONFIG_TRACING) += trace_printk.o | 33 | obj-$(CONFIG_TRACING) += trace_printk.o |
33 | obj-$(CONFIG_CONTEXT_SWITCH_TRACER) += trace_sched_switch.o | 34 | obj-$(CONFIG_CONTEXT_SWITCH_TRACER) += trace_sched_switch.o |
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 5b372e3ed675..1654b12c891a 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c | |||
@@ -80,9 +80,6 @@ static struct ftrace_ops ftrace_list_end __read_mostly = { | |||
80 | int ftrace_enabled __read_mostly; | 80 | int ftrace_enabled __read_mostly; |
81 | static int last_ftrace_enabled; | 81 | static int last_ftrace_enabled; |
82 | 82 | ||
83 | /* Quick disabling of function tracer. */ | ||
84 | int function_trace_stop __read_mostly; | ||
85 | |||
86 | /* Current function tracing op */ | 83 | /* Current function tracing op */ |
87 | struct ftrace_ops *function_trace_op __read_mostly = &ftrace_list_end; | 84 | struct ftrace_ops *function_trace_op __read_mostly = &ftrace_list_end; |
88 | /* What to set function_trace_op to */ | 85 | /* What to set function_trace_op to */ |
@@ -265,12 +262,12 @@ static void update_ftrace_function(void) | |||
265 | func = ftrace_ops_list_func; | 262 | func = ftrace_ops_list_func; |
266 | } | 263 | } |
267 | 264 | ||
265 | update_function_graph_func(); | ||
266 | |||
268 | /* If there's no change, then do nothing more here */ | 267 | /* If there's no change, then do nothing more here */ |
269 | if (ftrace_trace_function == func) | 268 | if (ftrace_trace_function == func) |
270 | return; | 269 | return; |
271 | 270 | ||
272 | update_function_graph_func(); | ||
273 | |||
274 | /* | 271 | /* |
275 | * If we are using the list function, it doesn't care | 272 | * If we are using the list function, it doesn't care |
276 | * about the function_trace_ops. | 273 | * about the function_trace_ops. |
@@ -1042,6 +1039,8 @@ static struct pid * const ftrace_swapper_pid = &init_struct_pid; | |||
1042 | 1039 | ||
1043 | #ifdef CONFIG_DYNAMIC_FTRACE | 1040 | #ifdef CONFIG_DYNAMIC_FTRACE |
1044 | 1041 | ||
1042 | static struct ftrace_ops *removed_ops; | ||
1043 | |||
1045 | #ifndef CONFIG_FTRACE_MCOUNT_RECORD | 1044 | #ifndef CONFIG_FTRACE_MCOUNT_RECORD |
1046 | # error Dynamic ftrace depends on MCOUNT_RECORD | 1045 | # error Dynamic ftrace depends on MCOUNT_RECORD |
1047 | #endif | 1046 | #endif |
@@ -1304,25 +1303,15 @@ ftrace_hash_move(struct ftrace_ops *ops, int enable, | |||
1304 | struct ftrace_hash *new_hash; | 1303 | struct ftrace_hash *new_hash; |
1305 | int size = src->count; | 1304 | int size = src->count; |
1306 | int bits = 0; | 1305 | int bits = 0; |
1307 | int ret; | ||
1308 | int i; | 1306 | int i; |
1309 | 1307 | ||
1310 | /* | 1308 | /* |
1311 | * Remove the current set, update the hash and add | ||
1312 | * them back. | ||
1313 | */ | ||
1314 | ftrace_hash_rec_disable(ops, enable); | ||
1315 | |||
1316 | /* | ||
1317 | * If the new source is empty, just free dst and assign it | 1309 | * If the new source is empty, just free dst and assign it |
1318 | * the empty_hash. | 1310 | * the empty_hash. |
1319 | */ | 1311 | */ |
1320 | if (!src->count) { | 1312 | if (!src->count) { |
1321 | free_ftrace_hash_rcu(*dst); | 1313 | new_hash = EMPTY_HASH; |
1322 | rcu_assign_pointer(*dst, EMPTY_HASH); | 1314 | goto update; |
1323 | /* still need to update the function records */ | ||
1324 | ret = 0; | ||
1325 | goto out; | ||
1326 | } | 1315 | } |
1327 | 1316 | ||
1328 | /* | 1317 | /* |
@@ -1335,10 +1324,9 @@ ftrace_hash_move(struct ftrace_ops *ops, int enable, | |||
1335 | if (bits > FTRACE_HASH_MAX_BITS) | 1324 | if (bits > FTRACE_HASH_MAX_BITS) |
1336 | bits = FTRACE_HASH_MAX_BITS; | 1325 | bits = FTRACE_HASH_MAX_BITS; |
1337 | 1326 | ||
1338 | ret = -ENOMEM; | ||
1339 | new_hash = alloc_ftrace_hash(bits); | 1327 | new_hash = alloc_ftrace_hash(bits); |
1340 | if (!new_hash) | 1328 | if (!new_hash) |
1341 | goto out; | 1329 | return -ENOMEM; |
1342 | 1330 | ||
1343 | size = 1 << src->size_bits; | 1331 | size = 1 << src->size_bits; |
1344 | for (i = 0; i < size; i++) { | 1332 | for (i = 0; i < size; i++) { |
@@ -1349,20 +1337,20 @@ ftrace_hash_move(struct ftrace_ops *ops, int enable, | |||
1349 | } | 1337 | } |
1350 | } | 1338 | } |
1351 | 1339 | ||
1340 | update: | ||
1341 | /* | ||
1342 | * Remove the current set, update the hash and add | ||
1343 | * them back. | ||
1344 | */ | ||
1345 | ftrace_hash_rec_disable(ops, enable); | ||
1346 | |||
1352 | old_hash = *dst; | 1347 | old_hash = *dst; |
1353 | rcu_assign_pointer(*dst, new_hash); | 1348 | rcu_assign_pointer(*dst, new_hash); |
1354 | free_ftrace_hash_rcu(old_hash); | 1349 | free_ftrace_hash_rcu(old_hash); |
1355 | 1350 | ||
1356 | ret = 0; | ||
1357 | out: | ||
1358 | /* | ||
1359 | * Enable regardless of ret: | ||
1360 | * On success, we enable the new hash. | ||
1361 | * On failure, we re-enable the original hash. | ||
1362 | */ | ||
1363 | ftrace_hash_rec_enable(ops, enable); | 1351 | ftrace_hash_rec_enable(ops, enable); |
1364 | 1352 | ||
1365 | return ret; | 1353 | return 0; |
1366 | } | 1354 | } |
1367 | 1355 | ||
1368 | /* | 1356 | /* |
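ftrace_hash_move() is restructured so the new hash is fully built before any accounting is touched: on allocation failure it returns -ENOMEM with nothing disabled, and only in the success path does it disable record accounting, publish the new hash with rcu_assign_pointer(), free the old one after a grace period, and re-enable accounting. The sketch below models just the build-then-swap shape with ordinary pointers and free(); the RCU publication and deferred free are noted in comments rather than implemented:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct hash { int *slots; int count; };

static struct hash *alloc_hash(int count)
{
	struct hash *h = malloc(sizeof(*h));

	if (!h)
		return NULL;
	h->slots = calloc(count, sizeof(int));
	if (!h->slots) {
		free(h);
		return NULL;
	}
	h->count = count;
	return h;
}

/* Replace *dst with a freshly built copy of src.  In the kernel the pointer
 * store is rcu_assign_pointer() and the old hash is freed after a grace
 * period (free_ftrace_hash_rcu()); here a plain store and free() stand in. */
static int hash_move(struct hash **dst, const struct hash *src)
{
	struct hash *new_hash, *old_hash;

	new_hash = alloc_hash(src->count);
	if (!new_hash)
		return -1;                       /* fail before disabling anything */
	memcpy(new_hash->slots, src->slots, src->count * sizeof(int));

	/* disable record accounting ... */
	old_hash = *dst;
	*dst = new_hash;                         /* publish the new hash */
	free(old_hash->slots);
	free(old_hash);                          /* kernel: RCU-deferred free */
	/* ... re-enable record accounting */
	return 0;
}

int main(void)
{
	struct hash *cur = alloc_hash(4);
	struct hash src = { (int[]){1, 2, 3, 4}, 4 };

	if (!cur || hash_move(&cur, &src))
		return 1;
	printf("slot0=%d\n", cur->slots[0]);
	return 0;
}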
@@ -1492,6 +1480,53 @@ int ftrace_text_reserved(const void *start, const void *end) | |||
1492 | return (int)!!ret; | 1480 | return (int)!!ret; |
1493 | } | 1481 | } |
1494 | 1482 | ||
1483 | /* Test if ops registered to this rec needs regs */ | ||
1484 | static bool test_rec_ops_needs_regs(struct dyn_ftrace *rec) | ||
1485 | { | ||
1486 | struct ftrace_ops *ops; | ||
1487 | bool keep_regs = false; | ||
1488 | |||
1489 | for (ops = ftrace_ops_list; | ||
1490 | ops != &ftrace_list_end; ops = ops->next) { | ||
1491 | /* pass rec in as regs to have non-NULL val */ | ||
1492 | if (ftrace_ops_test(ops, rec->ip, rec)) { | ||
1493 | if (ops->flags & FTRACE_OPS_FL_SAVE_REGS) { | ||
1494 | keep_regs = true; | ||
1495 | break; | ||
1496 | } | ||
1497 | } | ||
1498 | } | ||
1499 | |||
1500 | return keep_regs; | ||
1501 | } | ||
1502 | |||
1503 | static void ftrace_remove_tramp(struct ftrace_ops *ops, | ||
1504 | struct dyn_ftrace *rec) | ||
1505 | { | ||
1506 | struct ftrace_func_entry *entry; | ||
1507 | |||
1508 | entry = ftrace_lookup_ip(ops->tramp_hash, rec->ip); | ||
1509 | if (!entry) | ||
1510 | return; | ||
1511 | |||
1512 | /* | ||
1513 | * The tramp_hash entry will be removed at time | ||
1514 | * of update. | ||
1515 | */ | ||
1516 | ops->nr_trampolines--; | ||
1517 | rec->flags &= ~FTRACE_FL_TRAMP; | ||
1518 | } | ||
1519 | |||
1520 | static void ftrace_clear_tramps(struct dyn_ftrace *rec) | ||
1521 | { | ||
1522 | struct ftrace_ops *op; | ||
1523 | |||
1524 | do_for_each_ftrace_op(op, ftrace_ops_list) { | ||
1525 | if (op->nr_trampolines) | ||
1526 | ftrace_remove_tramp(op, rec); | ||
1527 | } while_for_each_ftrace_op(op); | ||
1528 | } | ||
1529 | |||
1495 | static void __ftrace_hash_rec_update(struct ftrace_ops *ops, | 1530 | static void __ftrace_hash_rec_update(struct ftrace_ops *ops, |
1496 | int filter_hash, | 1531 | int filter_hash, |
1497 | bool inc) | 1532 | bool inc) |
@@ -1572,8 +1607,30 @@ static void __ftrace_hash_rec_update(struct ftrace_ops *ops, | |||
1572 | 1607 | ||
1573 | if (inc) { | 1608 | if (inc) { |
1574 | rec->flags++; | 1609 | rec->flags++; |
1575 | if (FTRACE_WARN_ON((rec->flags & ~FTRACE_FL_MASK) == FTRACE_REF_MAX)) | 1610 | if (FTRACE_WARN_ON(ftrace_rec_count(rec) == FTRACE_REF_MAX)) |
1576 | return; | 1611 | return; |
1612 | |||
1613 | /* | ||
1614 | * If there's only a single callback registered to a | ||
1615 | * function, and the ops has a trampoline registered | ||
1616 | * for it, then we can call it directly. | ||
1617 | */ | ||
1618 | if (ftrace_rec_count(rec) == 1 && ops->trampoline) { | ||
1619 | rec->flags |= FTRACE_FL_TRAMP; | ||
1620 | ops->nr_trampolines++; | ||
1621 | } else { | ||
1622 | /* | ||
1623 | * If we are adding another function callback | ||
1624 | * to this function, and the previous had a | ||
1625 | * trampoline used, then we need to go back to | ||
1626 | * the default trampoline. | ||
1627 | */ | ||
1628 | rec->flags &= ~FTRACE_FL_TRAMP; | ||
1629 | |||
1630 | /* remove trampolines from any ops for this rec */ | ||
1631 | ftrace_clear_tramps(rec); | ||
1632 | } | ||
1633 | |||
1577 | /* | 1634 | /* |
1578 | * If any ops wants regs saved for this function | 1635 | * If any ops wants regs saved for this function |
1579 | * then all ops will get saved regs. | 1636 | * then all ops will get saved regs. |
@@ -1581,9 +1638,30 @@ static void __ftrace_hash_rec_update(struct ftrace_ops *ops, | |||
1581 | if (ops->flags & FTRACE_OPS_FL_SAVE_REGS) | 1638 | if (ops->flags & FTRACE_OPS_FL_SAVE_REGS) |
1582 | rec->flags |= FTRACE_FL_REGS; | 1639 | rec->flags |= FTRACE_FL_REGS; |
1583 | } else { | 1640 | } else { |
1584 | if (FTRACE_WARN_ON((rec->flags & ~FTRACE_FL_MASK) == 0)) | 1641 | if (FTRACE_WARN_ON(ftrace_rec_count(rec) == 0)) |
1585 | return; | 1642 | return; |
1586 | rec->flags--; | 1643 | rec->flags--; |
1644 | |||
1645 | if (ops->trampoline && !ftrace_rec_count(rec)) | ||
1646 | ftrace_remove_tramp(ops, rec); | ||
1647 | |||
1648 | /* | ||
1649 | * If the rec had REGS enabled and the ops that is | ||
1650 | * being removed had REGS set, then see if there is | ||
1651 | * still any ops for this record that wants regs. | ||
1652 | * If not, we can stop recording them. | ||
1653 | */ | ||
1654 | if (ftrace_rec_count(rec) > 0 && | ||
1655 | rec->flags & FTRACE_FL_REGS && | ||
1656 | ops->flags & FTRACE_OPS_FL_SAVE_REGS) { | ||
1657 | if (!test_rec_ops_needs_regs(rec)) | ||
1658 | rec->flags &= ~FTRACE_FL_REGS; | ||
1659 | } | ||
1660 | |||
1661 | /* | ||
1662 | * flags will be cleared in ftrace_check_record() | ||
1663 | * if rec count is zero. | ||
1664 | */ | ||
1587 | } | 1665 | } |
1588 | count++; | 1666 | count++; |
1589 | /* Shortcut, if we handled all records, we are done. */ | 1667 | /* Shortcut, if we handled all records, we are done. */ |
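These record-update hunks read the per-record reference count through ftrace_rec_count() rather than masking rec->flags by hand, and manage the new FTRACE_FL_TRAMP bit: a record may use an ops trampoline only while exactly one callback is attached, and it loses the trampoline as soon as a second callback arrives. A self-contained sketch of a count packed into the low bits of a flags word; the bit positions are illustrative, not the kernel's:

#include <stdio.h>

#define FL_ENABLED   (1u << 31)
#define FL_REGS      (1u << 30)
#define FL_TRAMP     (1u << 29)
#define FL_MASK      (FL_ENABLED | FL_REGS | FL_TRAMP)   /* flag bits */
#define REF_MAX      ((1u << 29) - 1)    /* count saturates here, like FTRACE_REF_MAX */

static unsigned int rec_count(unsigned int flags)
{
	return flags & ~FL_MASK;             /* low bits hold the refcount */
}

int main(void)
{
	unsigned int flags = 0;

	flags++;                    /* first ops attaches */
	flags |= FL_TRAMP;          /* single caller: may use its trampoline */
	flags++;                    /* second ops attaches */
	flags &= ~FL_TRAMP;         /* more than one caller: back to the default */

	printf("count=%u tramp=%u\n", rec_count(flags), !!(flags & FL_TRAMP));
	return 0;
}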
@@ -1668,17 +1746,23 @@ static int ftrace_check_record(struct dyn_ftrace *rec, int enable, int update) | |||
1668 | * If we are disabling calls, then disable all records that | 1746 | * If we are disabling calls, then disable all records that |
1669 | * are enabled. | 1747 | * are enabled. |
1670 | */ | 1748 | */ |
1671 | if (enable && (rec->flags & ~FTRACE_FL_MASK)) | 1749 | if (enable && ftrace_rec_count(rec)) |
1672 | flag = FTRACE_FL_ENABLED; | 1750 | flag = FTRACE_FL_ENABLED; |
1673 | 1751 | ||
1674 | /* | 1752 | /* |
1675 | * If enabling and the REGS flag does not match the REGS_EN, then | 1753 | * If enabling and the REGS flag does not match the REGS_EN, or |
1676 | * do not ignore this record. Set flags to fail the compare against | 1754 | * the TRAMP flag doesn't match the TRAMP_EN, then do not ignore |
1677 | * ENABLED. | 1755 | * this record. Set flags to fail the compare against ENABLED. |
1678 | */ | 1756 | */ |
1679 | if (flag && | 1757 | if (flag) { |
1680 | (!(rec->flags & FTRACE_FL_REGS) != !(rec->flags & FTRACE_FL_REGS_EN))) | 1758 | if (!(rec->flags & FTRACE_FL_REGS) != |
1681 | flag |= FTRACE_FL_REGS; | 1759 | !(rec->flags & FTRACE_FL_REGS_EN)) |
1760 | flag |= FTRACE_FL_REGS; | ||
1761 | |||
1762 | if (!(rec->flags & FTRACE_FL_TRAMP) != | ||
1763 | !(rec->flags & FTRACE_FL_TRAMP_EN)) | ||
1764 | flag |= FTRACE_FL_TRAMP; | ||
1765 | } | ||
1682 | 1766 | ||
1683 | /* If the state of this record hasn't changed, then do nothing */ | 1767 | /* If the state of this record hasn't changed, then do nothing */ |
1684 | if ((rec->flags & FTRACE_FL_ENABLED) == flag) | 1768 | if ((rec->flags & FTRACE_FL_ENABLED) == flag) |
@@ -1696,6 +1780,12 @@ static int ftrace_check_record(struct dyn_ftrace *rec, int enable, int update) | |||
1696 | else | 1780 | else |
1697 | rec->flags &= ~FTRACE_FL_REGS_EN; | 1781 | rec->flags &= ~FTRACE_FL_REGS_EN; |
1698 | } | 1782 | } |
1783 | if (flag & FTRACE_FL_TRAMP) { | ||
1784 | if (rec->flags & FTRACE_FL_TRAMP) | ||
1785 | rec->flags |= FTRACE_FL_TRAMP_EN; | ||
1786 | else | ||
1787 | rec->flags &= ~FTRACE_FL_TRAMP_EN; | ||
1788 | } | ||
1699 | } | 1789 | } |
1700 | 1790 | ||
1701 | /* | 1791 | /* |
@@ -1704,7 +1794,7 @@ static int ftrace_check_record(struct dyn_ftrace *rec, int enable, int update) | |||
1704 | * Otherwise, | 1794 | * Otherwise, |
1705 | * return UPDATE_MODIFY_CALL to tell the caller to convert | 1795 | * return UPDATE_MODIFY_CALL to tell the caller to convert |
1706 | * from the save regs, to a non-save regs function or | 1796 | * from the save regs, to a non-save regs function or |
1707 | * vice versa. | 1797 | * vice versa, or from a trampoline call. |
1708 | */ | 1798 | */ |
1709 | if (flag & FTRACE_FL_ENABLED) | 1799 | if (flag & FTRACE_FL_ENABLED) |
1710 | return FTRACE_UPDATE_MAKE_CALL; | 1800 | return FTRACE_UPDATE_MAKE_CALL; |
@@ -1714,7 +1804,7 @@ static int ftrace_check_record(struct dyn_ftrace *rec, int enable, int update) | |||
1714 | 1804 | ||
1715 | if (update) { | 1805 | if (update) { |
1716 | /* If there's no more users, clear all flags */ | 1806 | /* If there's no more users, clear all flags */ |
1717 | if (!(rec->flags & ~FTRACE_FL_MASK)) | 1807 | if (!ftrace_rec_count(rec)) |
1718 | rec->flags = 0; | 1808 | rec->flags = 0; |
1719 | else | 1809 | else |
1720 | /* Just disable the record (keep REGS state) */ | 1810 | /* Just disable the record (keep REGS state) */ |
@@ -1751,6 +1841,43 @@ int ftrace_test_record(struct dyn_ftrace *rec, int enable) | |||
1751 | return ftrace_check_record(rec, enable, 0); | 1841 | return ftrace_check_record(rec, enable, 0); |
1752 | } | 1842 | } |
1753 | 1843 | ||
1844 | static struct ftrace_ops * | ||
1845 | ftrace_find_tramp_ops_curr(struct dyn_ftrace *rec) | ||
1846 | { | ||
1847 | struct ftrace_ops *op; | ||
1848 | |||
1849 | /* Removed ops need to be tested first */ | ||
1850 | if (removed_ops && removed_ops->tramp_hash) { | ||
1851 | if (ftrace_lookup_ip(removed_ops->tramp_hash, rec->ip)) | ||
1852 | return removed_ops; | ||
1853 | } | ||
1854 | |||
1855 | do_for_each_ftrace_op(op, ftrace_ops_list) { | ||
1856 | if (!op->tramp_hash) | ||
1857 | continue; | ||
1858 | |||
1859 | if (ftrace_lookup_ip(op->tramp_hash, rec->ip)) | ||
1860 | return op; | ||
1861 | |||
1862 | } while_for_each_ftrace_op(op); | ||
1863 | |||
1864 | return NULL; | ||
1865 | } | ||
1866 | |||
1867 | static struct ftrace_ops * | ||
1868 | ftrace_find_tramp_ops_new(struct dyn_ftrace *rec) | ||
1869 | { | ||
1870 | struct ftrace_ops *op; | ||
1871 | |||
1872 | do_for_each_ftrace_op(op, ftrace_ops_list) { | ||
1873 | /* pass rec in as regs to have non-NULL val */ | ||
1874 | if (ftrace_ops_test(op, rec->ip, rec)) | ||
1875 | return op; | ||
1876 | } while_for_each_ftrace_op(op); | ||
1877 | |||
1878 | return NULL; | ||
1879 | } | ||
1880 | |||
1754 | /** | 1881 | /** |
1755 | * ftrace_get_addr_new - Get the call address to set to | 1882 | * ftrace_get_addr_new - Get the call address to set to |
1756 | * @rec: The ftrace record descriptor | 1883 | * @rec: The ftrace record descriptor |
@@ -1763,6 +1890,20 @@ int ftrace_test_record(struct dyn_ftrace *rec, int enable) | |||
1763 | */ | 1890 | */ |
1764 | unsigned long ftrace_get_addr_new(struct dyn_ftrace *rec) | 1891 | unsigned long ftrace_get_addr_new(struct dyn_ftrace *rec) |
1765 | { | 1892 | { |
1893 | struct ftrace_ops *ops; | ||
1894 | |||
1895 | /* Trampolines take precedence over regs */ | ||
1896 | if (rec->flags & FTRACE_FL_TRAMP) { | ||
1897 | ops = ftrace_find_tramp_ops_new(rec); | ||
1898 | if (FTRACE_WARN_ON(!ops || !ops->trampoline)) { | ||
1899 | pr_warning("Bad trampoline accounting at: %p (%pS)\n", | ||
1900 | (void *)rec->ip, (void *)rec->ip); | ||
1901 | /* Ftrace is shutting down, return anything */ | ||
1902 | return (unsigned long)FTRACE_ADDR; | ||
1903 | } | ||
1904 | return ops->trampoline; | ||
1905 | } | ||
1906 | |||
1766 | if (rec->flags & FTRACE_FL_REGS) | 1907 | if (rec->flags & FTRACE_FL_REGS) |
1767 | return (unsigned long)FTRACE_REGS_ADDR; | 1908 | return (unsigned long)FTRACE_REGS_ADDR; |
1768 | else | 1909 | else |
@@ -1781,6 +1922,20 @@ unsigned long ftrace_get_addr_new(struct dyn_ftrace *rec) | |||
1781 | */ | 1922 | */ |
1782 | unsigned long ftrace_get_addr_curr(struct dyn_ftrace *rec) | 1923 | unsigned long ftrace_get_addr_curr(struct dyn_ftrace *rec) |
1783 | { | 1924 | { |
1925 | struct ftrace_ops *ops; | ||
1926 | |||
1927 | /* Trampolines take precedence over regs */ | ||
1928 | if (rec->flags & FTRACE_FL_TRAMP_EN) { | ||
1929 | ops = ftrace_find_tramp_ops_curr(rec); | ||
1930 | if (FTRACE_WARN_ON(!ops)) { | ||
1931 | pr_warning("Bad trampoline accounting at: %p (%pS)\n", | ||
1932 | (void *)rec->ip, (void *)rec->ip); | ||
1933 | /* Ftrace is shutting down, return anything */ | ||
1934 | return (unsigned long)FTRACE_ADDR; | ||
1935 | } | ||
1936 | return ops->trampoline; | ||
1937 | } | ||
1938 | |||
1784 | if (rec->flags & FTRACE_FL_REGS_EN) | 1939 | if (rec->flags & FTRACE_FL_REGS_EN) |
1785 | return (unsigned long)FTRACE_REGS_ADDR; | 1940 | return (unsigned long)FTRACE_REGS_ADDR; |
1786 | else | 1941 | else |
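ftrace_get_addr_new() and ftrace_get_addr_curr() now prefer a per-ops trampoline whenever the record's TRAMP (or TRAMP_EN) flag is set, looking the owning ops up through its tramp_hash and checking removed_ops first while a shutdown is in progress; otherwise they fall back to the regs-saving or plain ftrace entry point as before. A condensed sketch of the selection order, with made-up addresses:

#include <stdio.h>

#define FL_TRAMP  0x4u
#define FL_REGS   0x2u

#define FTRACE_REGS_ADDR  0x1000ul   /* illustrative addresses only */
#define FTRACE_ADDR       0x2000ul

struct ops { unsigned long trampoline; };

static unsigned long get_addr(unsigned int flags, const struct ops *ops)
{
	/* Trampolines take precedence over regs */
	if ((flags & FL_TRAMP) && ops && ops->trampoline)
		return ops->trampoline;

	if (flags & FL_REGS)
		return FTRACE_REGS_ADDR;

	return FTRACE_ADDR;
}

int main(void)
{
	struct ops graph_ops = { 0x3000ul };

	printf("%#lx\n", get_addr(FL_TRAMP, &graph_ops)); /* 0x3000 */
	printf("%#lx\n", get_addr(FL_REGS, NULL));        /* 0x1000 */
	printf("%#lx\n", get_addr(0, NULL));              /* 0x2000 */
	return 0;
}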
@@ -2023,6 +2178,89 @@ void __weak arch_ftrace_update_code(int command) | |||
2023 | ftrace_run_stop_machine(command); | 2178 | ftrace_run_stop_machine(command); |
2024 | } | 2179 | } |
2025 | 2180 | ||
2181 | static int ftrace_save_ops_tramp_hash(struct ftrace_ops *ops) | ||
2182 | { | ||
2183 | struct ftrace_page *pg; | ||
2184 | struct dyn_ftrace *rec; | ||
2185 | int size, bits; | ||
2186 | int ret; | ||
2187 | |||
2188 | size = ops->nr_trampolines; | ||
2189 | bits = 0; | ||
2190 | /* | ||
2191 | * Make the hash size about 1/2 the # found | ||
2192 | */ | ||
2193 | for (size /= 2; size; size >>= 1) | ||
2194 | bits++; | ||
2195 | |||
2196 | ops->tramp_hash = alloc_ftrace_hash(bits); | ||
2197 | /* | ||
2198 | * TODO: a failed allocation is going to screw up | ||
2199 | * the accounting of what needs to be modified | ||
2200 | * and not. For now, we kill ftrace if we fail | ||
2201 | * to allocate here. But there are ways around this, | ||
2202 | * but that will take a little more work. | ||
2203 | */ | ||
2204 | if (!ops->tramp_hash) | ||
2205 | return -ENOMEM; | ||
2206 | |||
2207 | do_for_each_ftrace_rec(pg, rec) { | ||
2208 | if (ftrace_rec_count(rec) == 1 && | ||
2209 | ftrace_ops_test(ops, rec->ip, rec)) { | ||
2210 | |||
2211 | /* | ||
2212 | * If another ops adds to a rec, the rec will | ||
2213 | * lose its trampoline and never get it back | ||
2214 | * until all ops are off of it. | ||
2215 | */ | ||
2216 | if (!(rec->flags & FTRACE_FL_TRAMP)) | ||
2217 | continue; | ||
2218 | |||
2219 | /* This record had better have a trampoline */ | ||
2220 | if (FTRACE_WARN_ON(!(rec->flags & FTRACE_FL_TRAMP_EN))) | ||
2221 | return -1; | ||
2222 | |||
2223 | ret = add_hash_entry(ops->tramp_hash, rec->ip); | ||
2224 | if (ret < 0) | ||
2225 | return ret; | ||
2226 | } | ||
2227 | } while_for_each_ftrace_rec(); | ||
2228 | |||
2229 | /* The number of recs in the hash must match nr_trampolines */ | ||
2230 | FTRACE_WARN_ON(ops->tramp_hash->count != ops->nr_trampolines); | ||
2231 | |||
2232 | return 0; | ||
2233 | } | ||
2234 | |||
2235 | static int ftrace_save_tramp_hashes(void) | ||
2236 | { | ||
2237 | struct ftrace_ops *op; | ||
2238 | int ret; | ||
2239 | |||
2240 | /* | ||
2241 | * Now that any trampoline is being used, we need to save the | ||
2242 | * hashes for the ops that have them. This allows the mapping | ||
2243 | * back from the record to the ops that has the trampoline to | ||
2244 | * know what code is being replaced. Modifying code must always | ||
2245 | * verify what it is changing. | ||
2246 | */ | ||
2247 | do_for_each_ftrace_op(op, ftrace_ops_list) { | ||
2248 | |||
2249 | /* The tramp_hash is recreated each time. */ | ||
2250 | free_ftrace_hash(op->tramp_hash); | ||
2251 | op->tramp_hash = NULL; | ||
2252 | |||
2253 | if (op->nr_trampolines) { | ||
2254 | ret = ftrace_save_ops_tramp_hash(op); | ||
2255 | if (ret) | ||
2256 | return ret; | ||
2257 | } | ||
2258 | |||
2259 | } while_for_each_ftrace_op(op); | ||
2260 | |||
2261 | return 0; | ||
2262 | } | ||
2263 | |||
2026 | static void ftrace_run_update_code(int command) | 2264 | static void ftrace_run_update_code(int command) |
2027 | { | 2265 | { |
2028 | int ret; | 2266 | int ret; |
@@ -2031,11 +2269,6 @@ static void ftrace_run_update_code(int command) | |||
2031 | FTRACE_WARN_ON(ret); | 2269 | FTRACE_WARN_ON(ret); |
2032 | if (ret) | 2270 | if (ret) |
2033 | return; | 2271 | return; |
2034 | /* | ||
2035 | * Do not call function tracer while we update the code. | ||
2036 | * We are in stop machine. | ||
2037 | */ | ||
2038 | function_trace_stop++; | ||
2039 | 2272 | ||
2040 | /* | 2273 | /* |
2041 | * By default we use stop_machine() to modify the code. | 2274 | * By default we use stop_machine() to modify the code. |
@@ -2045,15 +2278,15 @@ static void ftrace_run_update_code(int command) | |||
2045 | */ | 2278 | */ |
2046 | arch_ftrace_update_code(command); | 2279 | arch_ftrace_update_code(command); |
2047 | 2280 | ||
2048 | function_trace_stop--; | ||
2049 | |||
2050 | ret = ftrace_arch_code_modify_post_process(); | 2281 | ret = ftrace_arch_code_modify_post_process(); |
2051 | FTRACE_WARN_ON(ret); | 2282 | FTRACE_WARN_ON(ret); |
2283 | |||
2284 | ret = ftrace_save_tramp_hashes(); | ||
2285 | FTRACE_WARN_ON(ret); | ||
2052 | } | 2286 | } |
2053 | 2287 | ||
2054 | static ftrace_func_t saved_ftrace_func; | 2288 | static ftrace_func_t saved_ftrace_func; |
2055 | static int ftrace_start_up; | 2289 | static int ftrace_start_up; |
2056 | static int global_start_up; | ||
2057 | 2290 | ||
2058 | static void control_ops_free(struct ftrace_ops *ops) | 2291 | static void control_ops_free(struct ftrace_ops *ops) |
2059 | { | 2292 | { |
@@ -2117,8 +2350,7 @@ static int ftrace_shutdown(struct ftrace_ops *ops, int command) | |||
2117 | 2350 | ||
2118 | ftrace_hash_rec_disable(ops, 1); | 2351 | ftrace_hash_rec_disable(ops, 1); |
2119 | 2352 | ||
2120 | if (!global_start_up) | 2353 | ops->flags &= ~FTRACE_OPS_FL_ENABLED; |
2121 | ops->flags &= ~FTRACE_OPS_FL_ENABLED; | ||
2122 | 2354 | ||
2123 | command |= FTRACE_UPDATE_CALLS; | 2355 | command |= FTRACE_UPDATE_CALLS; |
2124 | 2356 | ||
@@ -2139,8 +2371,16 @@ static int ftrace_shutdown(struct ftrace_ops *ops, int command) | |||
2139 | return 0; | 2371 | return 0; |
2140 | } | 2372 | } |
2141 | 2373 | ||
2374 | /* | ||
2375 | * If the ops uses a trampoline, then it needs to be | ||
2376 | * tested first on update. | ||
2377 | */ | ||
2378 | removed_ops = ops; | ||
2379 | |||
2142 | ftrace_run_update_code(command); | 2380 | ftrace_run_update_code(command); |
2143 | 2381 | ||
2382 | removed_ops = NULL; | ||
2383 | |||
2144 | /* | 2384 | /* |
2145 | * Dynamic ops may be freed, we must make sure that all | 2385 | * Dynamic ops may be freed, we must make sure that all |
2146 | * callers are done before leaving this function. | 2386 | * callers are done before leaving this function. |
@@ -2398,7 +2638,8 @@ ftrace_allocate_pages(unsigned long num_to_init) | |||
2398 | return start_pg; | 2638 | return start_pg; |
2399 | 2639 | ||
2400 | free_pages: | 2640 | free_pages: |
2401 | while (start_pg) { | 2641 | pg = start_pg; |
2642 | while (pg) { | ||
2402 | order = get_count_order(pg->size / ENTRIES_PER_PAGE); | 2643 | order = get_count_order(pg->size / ENTRIES_PER_PAGE); |
2403 | free_pages((unsigned long)pg->records, order); | 2644 | free_pages((unsigned long)pg->records, order); |
2404 | start_pg = pg->next; | 2645 | start_pg = pg->next; |
@@ -2595,8 +2836,10 @@ static void *t_start(struct seq_file *m, loff_t *pos) | |||
2595 | * off, we can short cut and just print out that all | 2836 | * off, we can short cut and just print out that all |
2596 | * functions are enabled. | 2837 | * functions are enabled. |
2597 | */ | 2838 | */ |
2598 | if (iter->flags & FTRACE_ITER_FILTER && | 2839 | if ((iter->flags & FTRACE_ITER_FILTER && |
2599 | ftrace_hash_empty(ops->filter_hash)) { | 2840 | ftrace_hash_empty(ops->filter_hash)) || |
2841 | (iter->flags & FTRACE_ITER_NOTRACE && | ||
2842 | ftrace_hash_empty(ops->notrace_hash))) { | ||
2600 | if (*pos > 0) | 2843 | if (*pos > 0) |
2601 | return t_hash_start(m, pos); | 2844 | return t_hash_start(m, pos); |
2602 | iter->flags |= FTRACE_ITER_PRINTALL; | 2845 | iter->flags |= FTRACE_ITER_PRINTALL; |
@@ -2641,7 +2884,10 @@ static int t_show(struct seq_file *m, void *v) | |||
2641 | return t_hash_show(m, iter); | 2884 | return t_hash_show(m, iter); |
2642 | 2885 | ||
2643 | if (iter->flags & FTRACE_ITER_PRINTALL) { | 2886 | if (iter->flags & FTRACE_ITER_PRINTALL) { |
2644 | seq_printf(m, "#### all functions enabled ####\n"); | 2887 | if (iter->flags & FTRACE_ITER_NOTRACE) |
2888 | seq_printf(m, "#### no functions disabled ####\n"); | ||
2889 | else | ||
2890 | seq_printf(m, "#### all functions enabled ####\n"); | ||
2645 | return 0; | 2891 | return 0; |
2646 | } | 2892 | } |
2647 | 2893 | ||
@@ -2651,10 +2897,22 @@ static int t_show(struct seq_file *m, void *v) | |||
2651 | return 0; | 2897 | return 0; |
2652 | 2898 | ||
2653 | seq_printf(m, "%ps", (void *)rec->ip); | 2899 | seq_printf(m, "%ps", (void *)rec->ip); |
2654 | if (iter->flags & FTRACE_ITER_ENABLED) | 2900 | if (iter->flags & FTRACE_ITER_ENABLED) { |
2655 | seq_printf(m, " (%ld)%s", | 2901 | seq_printf(m, " (%ld)%s", |
2656 | rec->flags & ~FTRACE_FL_MASK, | 2902 | ftrace_rec_count(rec), |
2657 | rec->flags & FTRACE_FL_REGS ? " R" : ""); | 2903 | rec->flags & FTRACE_FL_REGS ? " R" : " "); |
2904 | if (rec->flags & FTRACE_FL_TRAMP_EN) { | ||
2905 | struct ftrace_ops *ops; | ||
2906 | |||
2907 | ops = ftrace_find_tramp_ops_curr(rec); | ||
2908 | if (ops && ops->trampoline) | ||
2909 | seq_printf(m, "\ttramp: %pS", | ||
2910 | (void *)ops->trampoline); | ||
2911 | else | ||
2912 | seq_printf(m, "\ttramp: ERROR!"); | ||
2913 | } | ||
2914 | } | ||
2915 | |||
2658 | seq_printf(m, "\n"); | 2916 | seq_printf(m, "\n"); |
2659 | 2917 | ||
2660 | return 0; | 2918 | return 0; |
@@ -2702,13 +2960,6 @@ ftrace_enabled_open(struct inode *inode, struct file *file) | |||
2702 | return iter ? 0 : -ENOMEM; | 2960 | return iter ? 0 : -ENOMEM; |
2703 | } | 2961 | } |
2704 | 2962 | ||
2705 | static void ftrace_filter_reset(struct ftrace_hash *hash) | ||
2706 | { | ||
2707 | mutex_lock(&ftrace_lock); | ||
2708 | ftrace_hash_clear(hash); | ||
2709 | mutex_unlock(&ftrace_lock); | ||
2710 | } | ||
2711 | |||
2712 | /** | 2963 | /** |
2713 | * ftrace_regex_open - initialize function tracer filter files | 2964 | * ftrace_regex_open - initialize function tracer filter files |
2714 | * @ops: The ftrace_ops that hold the hash filters | 2965 | * @ops: The ftrace_ops that hold the hash filters |
@@ -2758,7 +3009,13 @@ ftrace_regex_open(struct ftrace_ops *ops, int flag, | |||
2758 | hash = ops->filter_hash; | 3009 | hash = ops->filter_hash; |
2759 | 3010 | ||
2760 | if (file->f_mode & FMODE_WRITE) { | 3011 | if (file->f_mode & FMODE_WRITE) { |
2761 | iter->hash = alloc_and_copy_ftrace_hash(FTRACE_HASH_DEFAULT_BITS, hash); | 3012 | const int size_bits = FTRACE_HASH_DEFAULT_BITS; |
3013 | |||
3014 | if (file->f_flags & O_TRUNC) | ||
3015 | iter->hash = alloc_ftrace_hash(size_bits); | ||
3016 | else | ||
3017 | iter->hash = alloc_and_copy_ftrace_hash(size_bits, hash); | ||
3018 | |||
2762 | if (!iter->hash) { | 3019 | if (!iter->hash) { |
2763 | trace_parser_put(&iter->parser); | 3020 | trace_parser_put(&iter->parser); |
2764 | kfree(iter); | 3021 | kfree(iter); |
@@ -2767,10 +3024,6 @@ ftrace_regex_open(struct ftrace_ops *ops, int flag, | |||
2767 | } | 3024 | } |
2768 | } | 3025 | } |
2769 | 3026 | ||
2770 | if ((file->f_mode & FMODE_WRITE) && | ||
2771 | (file->f_flags & O_TRUNC)) | ||
2772 | ftrace_filter_reset(iter->hash); | ||
2773 | |||
2774 | if (file->f_mode & FMODE_READ) { | 3027 | if (file->f_mode & FMODE_READ) { |
2775 | iter->pg = ftrace_pages_start; | 3028 | iter->pg = ftrace_pages_start; |
2776 | 3029 | ||
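ftrace_regex_open() (and ftrace_set_hash() further down) now allocate an empty hash up front when the file is opened with O_TRUNC, and copy the current one otherwise, instead of copying and then clearing it via the removed ftrace_filter_reset(); the live hash is never touched until the final swap. A small model of that open-time decision, with a toy hash struct in place of struct ftrace_hash:

#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct hash { int nentries; int entries[16]; };

static struct hash *alloc_hash(void)
{
	return calloc(1, sizeof(struct hash));
}

static struct hash *copy_hash(const struct hash *src)
{
	struct hash *h = alloc_hash();

	if (h)
		memcpy(h, src, sizeof(*h));
	return h;
}

/* Build the hash the writer will edit: empty on O_TRUNC, a copy otherwise. */
static struct hash *open_hash(const struct hash *cur, int open_flags)
{
	if (open_flags & O_TRUNC)
		return alloc_hash();
	return copy_hash(cur);
}

int main(void)
{
	struct hash live = { 2, { 10, 20 } };
	struct hash *h;

	h = open_hash(&live, O_WRONLY | O_TRUNC);
	printf("truncated copy has %d entries, live still has %d\n",
	       h ? h->nentries : -1, live.nentries);
	free(h);
	return 0;
}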
@@ -3471,14 +3724,16 @@ ftrace_set_hash(struct ftrace_ops *ops, unsigned char *buf, int len, | |||
3471 | else | 3724 | else |
3472 | orig_hash = &ops->notrace_hash; | 3725 | orig_hash = &ops->notrace_hash; |
3473 | 3726 | ||
3474 | hash = alloc_and_copy_ftrace_hash(FTRACE_HASH_DEFAULT_BITS, *orig_hash); | 3727 | if (reset) |
3728 | hash = alloc_ftrace_hash(FTRACE_HASH_DEFAULT_BITS); | ||
3729 | else | ||
3730 | hash = alloc_and_copy_ftrace_hash(FTRACE_HASH_DEFAULT_BITS, *orig_hash); | ||
3731 | |||
3475 | if (!hash) { | 3732 | if (!hash) { |
3476 | ret = -ENOMEM; | 3733 | ret = -ENOMEM; |
3477 | goto out_regex_unlock; | 3734 | goto out_regex_unlock; |
3478 | } | 3735 | } |
3479 | 3736 | ||
3480 | if (reset) | ||
3481 | ftrace_filter_reset(hash); | ||
3482 | if (buf && !ftrace_match_records(hash, buf, len)) { | 3737 | if (buf && !ftrace_match_records(hash, buf, len)) { |
3483 | ret = -EINVAL; | 3738 | ret = -EINVAL; |
3484 | goto out_regex_unlock; | 3739 | goto out_regex_unlock; |
@@ -3630,6 +3885,7 @@ __setup("ftrace_filter=", set_ftrace_filter); | |||
3630 | 3885 | ||
3631 | #ifdef CONFIG_FUNCTION_GRAPH_TRACER | 3886 | #ifdef CONFIG_FUNCTION_GRAPH_TRACER |
3632 | static char ftrace_graph_buf[FTRACE_FILTER_SIZE] __initdata; | 3887 | static char ftrace_graph_buf[FTRACE_FILTER_SIZE] __initdata; |
3888 | static char ftrace_graph_notrace_buf[FTRACE_FILTER_SIZE] __initdata; | ||
3633 | static int ftrace_set_func(unsigned long *array, int *idx, int size, char *buffer); | 3889 | static int ftrace_set_func(unsigned long *array, int *idx, int size, char *buffer); |
3634 | 3890 | ||
3635 | static int __init set_graph_function(char *str) | 3891 | static int __init set_graph_function(char *str) |
@@ -3639,16 +3895,29 @@ static int __init set_graph_function(char *str) | |||
3639 | } | 3895 | } |
3640 | __setup("ftrace_graph_filter=", set_graph_function); | 3896 | __setup("ftrace_graph_filter=", set_graph_function); |
3641 | 3897 | ||
3642 | static void __init set_ftrace_early_graph(char *buf) | 3898 | static int __init set_graph_notrace_function(char *str) |
3899 | { | ||
3900 | strlcpy(ftrace_graph_notrace_buf, str, FTRACE_FILTER_SIZE); | ||
3901 | return 1; | ||
3902 | } | ||
3903 | __setup("ftrace_graph_notrace=", set_graph_notrace_function); | ||
3904 | |||
3905 | static void __init set_ftrace_early_graph(char *buf, int enable) | ||
3643 | { | 3906 | { |
3644 | int ret; | 3907 | int ret; |
3645 | char *func; | 3908 | char *func; |
3909 | unsigned long *table = ftrace_graph_funcs; | ||
3910 | int *count = &ftrace_graph_count; | ||
3911 | |||
3912 | if (!enable) { | ||
3913 | table = ftrace_graph_notrace_funcs; | ||
3914 | count = &ftrace_graph_notrace_count; | ||
3915 | } | ||
3646 | 3916 | ||
3647 | while (buf) { | 3917 | while (buf) { |
3648 | func = strsep(&buf, ","); | 3918 | func = strsep(&buf, ","); |
3649 | /* we allow only one expression at a time */ | 3919 | /* we allow only one expression at a time */ |
3650 | ret = ftrace_set_func(ftrace_graph_funcs, &ftrace_graph_count, | 3920 | ret = ftrace_set_func(table, count, FTRACE_GRAPH_MAX_FUNCS, func); |
3651 | FTRACE_GRAPH_MAX_FUNCS, func); | ||
3652 | if (ret) | 3921 | if (ret) |
3653 | printk(KERN_DEBUG "ftrace: function %s not " | 3922 | printk(KERN_DEBUG "ftrace: function %s not " |
3654 | "traceable\n", func); | 3923 | "traceable\n", func); |
@@ -3677,7 +3946,9 @@ static void __init set_ftrace_early_filters(void) | |||
3677 | ftrace_set_early_filter(&global_ops, ftrace_notrace_buf, 0); | 3946 | ftrace_set_early_filter(&global_ops, ftrace_notrace_buf, 0); |
3678 | #ifdef CONFIG_FUNCTION_GRAPH_TRACER | 3947 | #ifdef CONFIG_FUNCTION_GRAPH_TRACER |
3679 | if (ftrace_graph_buf[0]) | 3948 | if (ftrace_graph_buf[0]) |
3680 | set_ftrace_early_graph(ftrace_graph_buf); | 3949 | set_ftrace_early_graph(ftrace_graph_buf, 1); |
3950 | if (ftrace_graph_notrace_buf[0]) | ||
3951 | set_ftrace_early_graph(ftrace_graph_notrace_buf, 0); | ||
3681 | #endif /* CONFIG_FUNCTION_GRAPH_TRACER */ | 3952 | #endif /* CONFIG_FUNCTION_GRAPH_TRACER */ |
3682 | } | 3953 | } |
3683 | 3954 | ||
@@ -3819,7 +4090,12 @@ static int g_show(struct seq_file *m, void *v) | |||
3819 | return 0; | 4090 | return 0; |
3820 | 4091 | ||
3821 | if (ptr == (unsigned long *)1) { | 4092 | if (ptr == (unsigned long *)1) { |
3822 | seq_printf(m, "#### all functions enabled ####\n"); | 4093 | struct ftrace_graph_data *fgd = m->private; |
4094 | |||
4095 | if (fgd->table == ftrace_graph_funcs) | ||
4096 | seq_printf(m, "#### all functions enabled ####\n"); | ||
4097 | else | ||
4098 | seq_printf(m, "#### no functions disabled ####\n"); | ||
3823 | return 0; | 4099 | return 0; |
3824 | } | 4100 | } |
3825 | 4101 | ||
@@ -4447,9 +4723,6 @@ __ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip, | |||
4447 | struct ftrace_ops *op; | 4723 | struct ftrace_ops *op; |
4448 | int bit; | 4724 | int bit; |
4449 | 4725 | ||
4450 | if (function_trace_stop) | ||
4451 | return; | ||
4452 | |||
4453 | bit = trace_test_and_set_recursion(TRACE_LIST_START, TRACE_LIST_MAX); | 4726 | bit = trace_test_and_set_recursion(TRACE_LIST_START, TRACE_LIST_MAX); |
4454 | if (bit < 0) | 4727 | if (bit < 0) |
4455 | return; | 4728 | return; |
@@ -4461,9 +4734,8 @@ __ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip, | |||
4461 | preempt_disable_notrace(); | 4734 | preempt_disable_notrace(); |
4462 | do_for_each_ftrace_op(op, ftrace_ops_list) { | 4735 | do_for_each_ftrace_op(op, ftrace_ops_list) { |
4463 | if (ftrace_ops_test(op, ip, regs)) { | 4736 | if (ftrace_ops_test(op, ip, regs)) { |
4464 | if (WARN_ON(!op->func)) { | 4737 | if (FTRACE_WARN_ON(!op->func)) { |
4465 | function_trace_stop = 1; | 4738 | pr_warn("op=%p %pS\n", op, op); |
4466 | printk("op=%p %pS\n", op, op); | ||
4467 | goto out; | 4739 | goto out; |
4468 | } | 4740 | } |
4469 | op->func(ip, parent_ip, op, regs); | 4741 | op->func(ip, parent_ip, op, regs); |
@@ -5084,6 +5356,12 @@ int register_ftrace_graph(trace_func_graph_ret_t retfunc, | |||
5084 | /* Function graph doesn't use the .func field of global_ops */ | 5356 | /* Function graph doesn't use the .func field of global_ops */ |
5085 | global_ops.flags |= FTRACE_OPS_FL_STUB; | 5357 | global_ops.flags |= FTRACE_OPS_FL_STUB; |
5086 | 5358 | ||
5359 | #ifdef CONFIG_DYNAMIC_FTRACE | ||
5360 | /* Optimize function graph calling (if implemented by arch) */ | ||
5361 | if (FTRACE_GRAPH_TRAMP_ADDR != 0) | ||
5362 | global_ops.trampoline = FTRACE_GRAPH_TRAMP_ADDR; | ||
5363 | #endif | ||
5364 | |||
5087 | ret = ftrace_startup(&global_ops, FTRACE_START_FUNC_RET); | 5365 | ret = ftrace_startup(&global_ops, FTRACE_START_FUNC_RET); |
5088 | 5366 | ||
5089 | out: | 5367 | out: |
@@ -5104,6 +5382,10 @@ void unregister_ftrace_graph(void) | |||
5104 | __ftrace_graph_entry = ftrace_graph_entry_stub; | 5382 | __ftrace_graph_entry = ftrace_graph_entry_stub; |
5105 | ftrace_shutdown(&global_ops, FTRACE_STOP_FUNC_RET); | 5383 | ftrace_shutdown(&global_ops, FTRACE_STOP_FUNC_RET); |
5106 | global_ops.flags &= ~FTRACE_OPS_FL_STUB; | 5384 | global_ops.flags &= ~FTRACE_OPS_FL_STUB; |
5385 | #ifdef CONFIG_DYNAMIC_FTRACE | ||
5386 | if (FTRACE_GRAPH_TRAMP_ADDR != 0) | ||
5387 | global_ops.trampoline = 0; | ||
5388 | #endif | ||
5107 | unregister_pm_notifier(&ftrace_suspend_notifier); | 5389 | unregister_pm_notifier(&ftrace_suspend_notifier); |
5108 | unregister_trace_sched_switch(ftrace_graph_probe_sched_switch, NULL); | 5390 | unregister_trace_sched_switch(ftrace_graph_probe_sched_switch, NULL); |
5109 | 5391 | ||
@@ -5183,9 +5465,4 @@ void ftrace_graph_exit_task(struct task_struct *t) | |||
5183 | 5465 | ||
5184 | kfree(ret_stack); | 5466 | kfree(ret_stack); |
5185 | } | 5467 | } |
5186 | |||
5187 | void ftrace_graph_stop(void) | ||
5188 | { | ||
5189 | ftrace_stop(); | ||
5190 | } | ||
5191 | #endif | 5468 | #endif |
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 7c56c3d06943..925f629658d6 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c | |||
@@ -616,10 +616,6 @@ int ring_buffer_poll_wait(struct ring_buffer *buffer, int cpu, | |||
616 | struct ring_buffer_per_cpu *cpu_buffer; | 616 | struct ring_buffer_per_cpu *cpu_buffer; |
617 | struct rb_irq_work *work; | 617 | struct rb_irq_work *work; |
618 | 618 | ||
619 | if ((cpu == RING_BUFFER_ALL_CPUS && !ring_buffer_empty(buffer)) || | ||
620 | (cpu != RING_BUFFER_ALL_CPUS && !ring_buffer_empty_cpu(buffer, cpu))) | ||
621 | return POLLIN | POLLRDNORM; | ||
622 | |||
623 | if (cpu == RING_BUFFER_ALL_CPUS) | 619 | if (cpu == RING_BUFFER_ALL_CPUS) |
624 | work = &buffer->irq_work; | 620 | work = &buffer->irq_work; |
625 | else { | 621 | else { |
@@ -1693,22 +1689,14 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size, | |||
1693 | if (!cpu_buffer->nr_pages_to_update) | 1689 | if (!cpu_buffer->nr_pages_to_update) |
1694 | continue; | 1690 | continue; |
1695 | 1691 | ||
1696 | /* The update must run on the CPU that is being updated. */ | 1692 | /* Can't run something on an offline CPU. */ |
1697 | preempt_disable(); | 1693 | if (!cpu_online(cpu)) { |
1698 | if (cpu == smp_processor_id() || !cpu_online(cpu)) { | ||
1699 | rb_update_pages(cpu_buffer); | 1694 | rb_update_pages(cpu_buffer); |
1700 | cpu_buffer->nr_pages_to_update = 0; | 1695 | cpu_buffer->nr_pages_to_update = 0; |
1701 | } else { | 1696 | } else { |
1702 | /* | ||
1703 | * Can not disable preemption for schedule_work_on() | ||
1704 | * on PREEMPT_RT. | ||
1705 | */ | ||
1706 | preempt_enable(); | ||
1707 | schedule_work_on(cpu, | 1697 | schedule_work_on(cpu, |
1708 | &cpu_buffer->update_pages_work); | 1698 | &cpu_buffer->update_pages_work); |
1709 | preempt_disable(); | ||
1710 | } | 1699 | } |
1711 | preempt_enable(); | ||
1712 | } | 1700 | } |
1713 | 1701 | ||
1714 | /* wait for all the updates to complete */ | 1702 | /* wait for all the updates to complete */ |
@@ -1746,22 +1734,14 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size, | |||
1746 | 1734 | ||
1747 | get_online_cpus(); | 1735 | get_online_cpus(); |
1748 | 1736 | ||
1749 | preempt_disable(); | 1737 | /* Can't run something on an offline CPU. */ |
1750 | /* The update must run on the CPU that is being updated. */ | 1738 | if (!cpu_online(cpu_id)) |
1751 | if (cpu_id == smp_processor_id() || !cpu_online(cpu_id)) | ||
1752 | rb_update_pages(cpu_buffer); | 1739 | rb_update_pages(cpu_buffer); |
1753 | else { | 1740 | else { |
1754 | /* | ||
1755 | * Can not disable preemption for schedule_work_on() | ||
1756 | * on PREEMPT_RT. | ||
1757 | */ | ||
1758 | preempt_enable(); | ||
1759 | schedule_work_on(cpu_id, | 1741 | schedule_work_on(cpu_id, |
1760 | &cpu_buffer->update_pages_work); | 1742 | &cpu_buffer->update_pages_work); |
1761 | wait_for_completion(&cpu_buffer->update_done); | 1743 | wait_for_completion(&cpu_buffer->update_done); |
1762 | preempt_disable(); | ||
1763 | } | 1744 | } |
1764 | preempt_enable(); | ||
1765 | 1745 | ||
1766 | cpu_buffer->nr_pages_to_update = 0; | 1746 | cpu_buffer->nr_pages_to_update = 0; |
1767 | put_online_cpus(); | 1747 | put_online_cpus(); |
@@ -3779,7 +3759,7 @@ rb_iter_peek(struct ring_buffer_iter *iter, u64 *ts) | |||
3779 | if (rb_per_cpu_empty(cpu_buffer)) | 3759 | if (rb_per_cpu_empty(cpu_buffer)) |
3780 | return NULL; | 3760 | return NULL; |
3781 | 3761 | ||
3782 | if (iter->head >= local_read(&iter->head_page->page->commit)) { | 3762 | if (iter->head >= rb_page_size(iter->head_page)) { |
3783 | rb_inc_iter(iter); | 3763 | rb_inc_iter(iter); |
3784 | goto again; | 3764 | goto again; |
3785 | } | 3765 | } |
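The ring_buffer_resize() hunks above collapse the old preempt_disable() dance into a single question per CPU: is the target CPU online? A condensed sketch of the resulting control flow (illustrative only; the real code keeps the locking and bookkeeping shown in the hunks):

    /* Illustrative sketch of the simplified per-CPU resize step */
    static void resize_one_cpu(struct ring_buffer_per_cpu *cpu_buffer, int cpu)
    {
            if (!cpu_online(cpu)) {
                    /* Offline CPU: nothing runs there, update in place. */
                    rb_update_pages(cpu_buffer);
                    cpu_buffer->nr_pages_to_update = 0;
            } else {
                    /* Online CPU: run the update on that CPU and wait for it. */
                    schedule_work_on(cpu, &cpu_buffer->update_pages_work);
                    wait_for_completion(&cpu_buffer->update_done);
            }
    }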
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 84e2b45c0934..8a528392b1f4 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c | |||
@@ -466,6 +466,12 @@ int __trace_puts(unsigned long ip, const char *str, int size) | |||
466 | struct print_entry *entry; | 466 | struct print_entry *entry; |
467 | unsigned long irq_flags; | 467 | unsigned long irq_flags; |
468 | int alloc; | 468 | int alloc; |
469 | int pc; | ||
470 | |||
471 | if (!(trace_flags & TRACE_ITER_PRINTK)) | ||
472 | return 0; | ||
473 | |||
474 | pc = preempt_count(); | ||
469 | 475 | ||
470 | if (unlikely(tracing_selftest_running || tracing_disabled)) | 476 | if (unlikely(tracing_selftest_running || tracing_disabled)) |
471 | return 0; | 477 | return 0; |
@@ -475,7 +481,7 @@ int __trace_puts(unsigned long ip, const char *str, int size) | |||
475 | local_save_flags(irq_flags); | 481 | local_save_flags(irq_flags); |
476 | buffer = global_trace.trace_buffer.buffer; | 482 | buffer = global_trace.trace_buffer.buffer; |
477 | event = trace_buffer_lock_reserve(buffer, TRACE_PRINT, alloc, | 483 | event = trace_buffer_lock_reserve(buffer, TRACE_PRINT, alloc, |
478 | irq_flags, preempt_count()); | 484 | irq_flags, pc); |
479 | if (!event) | 485 | if (!event) |
480 | return 0; | 486 | return 0; |
481 | 487 | ||
@@ -492,6 +498,7 @@ int __trace_puts(unsigned long ip, const char *str, int size) | |||
492 | entry->buf[size] = '\0'; | 498 | entry->buf[size] = '\0'; |
493 | 499 | ||
494 | __buffer_unlock_commit(buffer, event); | 500 | __buffer_unlock_commit(buffer, event); |
501 | ftrace_trace_stack(buffer, irq_flags, 4, pc); | ||
495 | 502 | ||
496 | return size; | 503 | return size; |
497 | } | 504 | } |
@@ -509,6 +516,12 @@ int __trace_bputs(unsigned long ip, const char *str) | |||
509 | struct bputs_entry *entry; | 516 | struct bputs_entry *entry; |
510 | unsigned long irq_flags; | 517 | unsigned long irq_flags; |
511 | int size = sizeof(struct bputs_entry); | 518 | int size = sizeof(struct bputs_entry); |
519 | int pc; | ||
520 | |||
521 | if (!(trace_flags & TRACE_ITER_PRINTK)) | ||
522 | return 0; | ||
523 | |||
524 | pc = preempt_count(); | ||
512 | 525 | ||
513 | if (unlikely(tracing_selftest_running || tracing_disabled)) | 526 | if (unlikely(tracing_selftest_running || tracing_disabled)) |
514 | return 0; | 527 | return 0; |
@@ -516,7 +529,7 @@ int __trace_bputs(unsigned long ip, const char *str) | |||
516 | local_save_flags(irq_flags); | 529 | local_save_flags(irq_flags); |
517 | buffer = global_trace.trace_buffer.buffer; | 530 | buffer = global_trace.trace_buffer.buffer; |
518 | event = trace_buffer_lock_reserve(buffer, TRACE_BPUTS, size, | 531 | event = trace_buffer_lock_reserve(buffer, TRACE_BPUTS, size, |
519 | irq_flags, preempt_count()); | 532 | irq_flags, pc); |
520 | if (!event) | 533 | if (!event) |
521 | return 0; | 534 | return 0; |
522 | 535 | ||
@@ -525,6 +538,7 @@ int __trace_bputs(unsigned long ip, const char *str) | |||
525 | entry->str = str; | 538 | entry->str = str; |
526 | 539 | ||
527 | __buffer_unlock_commit(buffer, event); | 540 | __buffer_unlock_commit(buffer, event); |
541 | ftrace_trace_stack(buffer, irq_flags, 4, pc); | ||
528 | 542 | ||
529 | return 1; | 543 | return 1; |
530 | } | 544 | } |
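Both helpers above are normally reached through the trace_puts() wrapper in linux/kernel.h; with these hunks applied they bail out early when TRACE_ITER_PRINTK is clear, and otherwise record preempt_count() plus a short stack trace alongside the string. A minimal caller (hypothetical function name):

    #include <linux/kernel.h>

    static void my_probe(void)                  /* hypothetical caller */
    {
            /* A constant string is typically routed to __trace_bputs();
             * non-constant strings go through __trace_puts(). */
            trace_puts("my_probe: entered\n");
    }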
@@ -809,7 +823,7 @@ static struct { | |||
809 | { trace_clock_local, "local", 1 }, | 823 | { trace_clock_local, "local", 1 }, |
810 | { trace_clock_global, "global", 1 }, | 824 | { trace_clock_global, "global", 1 }, |
811 | { trace_clock_counter, "counter", 0 }, | 825 | { trace_clock_counter, "counter", 0 }, |
812 | { trace_clock_jiffies, "uptime", 1 }, | 826 | { trace_clock_jiffies, "uptime", 0 }, |
813 | { trace_clock, "perf", 1 }, | 827 | { trace_clock, "perf", 1 }, |
814 | { ktime_get_mono_fast_ns, "mono", 1 }, | 828 | { ktime_get_mono_fast_ns, "mono", 1 }, |
815 | ARCH_TRACE_CLOCKS | 829 | ARCH_TRACE_CLOCKS |
@@ -924,30 +938,6 @@ out: | |||
924 | return ret; | 938 | return ret; |
925 | } | 939 | } |
926 | 940 | ||
927 | ssize_t trace_seq_to_user(struct trace_seq *s, char __user *ubuf, size_t cnt) | ||
928 | { | ||
929 | int len; | ||
930 | int ret; | ||
931 | |||
932 | if (!cnt) | ||
933 | return 0; | ||
934 | |||
935 | if (s->len <= s->readpos) | ||
936 | return -EBUSY; | ||
937 | |||
938 | len = s->len - s->readpos; | ||
939 | if (cnt > len) | ||
940 | cnt = len; | ||
941 | ret = copy_to_user(ubuf, s->buffer + s->readpos, cnt); | ||
942 | if (ret == cnt) | ||
943 | return -EFAULT; | ||
944 | |||
945 | cnt -= ret; | ||
946 | |||
947 | s->readpos += cnt; | ||
948 | return cnt; | ||
949 | } | ||
950 | |||
951 | static ssize_t trace_seq_to_buffer(struct trace_seq *s, void *buf, size_t cnt) | 941 | static ssize_t trace_seq_to_buffer(struct trace_seq *s, void *buf, size_t cnt) |
952 | { | 942 | { |
953 | int len; | 943 | int len; |
@@ -3686,6 +3676,7 @@ static const char readme_msg[] = | |||
3686 | #endif | 3676 | #endif |
3687 | #ifdef CONFIG_FUNCTION_GRAPH_TRACER | 3677 | #ifdef CONFIG_FUNCTION_GRAPH_TRACER |
3688 | " set_graph_function\t- Trace the nested calls of a function (function_graph)\n" | 3678 | " set_graph_function\t- Trace the nested calls of a function (function_graph)\n" |
3679 | " set_graph_notrace\t- Do not trace the nested calls of a function (function_graph)\n" | ||
3689 | " max_graph_depth\t- Trace a limited depth of nested calls (0 is unlimited)\n" | 3680 | " max_graph_depth\t- Trace a limited depth of nested calls (0 is unlimited)\n" |
3690 | #endif | 3681 | #endif |
3691 | #ifdef CONFIG_TRACER_SNAPSHOT | 3682 | #ifdef CONFIG_TRACER_SNAPSHOT |
@@ -4225,10 +4216,9 @@ tracing_set_trace_write(struct file *filp, const char __user *ubuf, | |||
4225 | } | 4216 | } |
4226 | 4217 | ||
4227 | static ssize_t | 4218 | static ssize_t |
4228 | tracing_max_lat_read(struct file *filp, char __user *ubuf, | 4219 | tracing_nsecs_read(unsigned long *ptr, char __user *ubuf, |
4229 | size_t cnt, loff_t *ppos) | 4220 | size_t cnt, loff_t *ppos) |
4230 | { | 4221 | { |
4231 | unsigned long *ptr = filp->private_data; | ||
4232 | char buf[64]; | 4222 | char buf[64]; |
4233 | int r; | 4223 | int r; |
4234 | 4224 | ||
@@ -4240,10 +4230,9 @@ tracing_max_lat_read(struct file *filp, char __user *ubuf, | |||
4240 | } | 4230 | } |
4241 | 4231 | ||
4242 | static ssize_t | 4232 | static ssize_t |
4243 | tracing_max_lat_write(struct file *filp, const char __user *ubuf, | 4233 | tracing_nsecs_write(unsigned long *ptr, const char __user *ubuf, |
4244 | size_t cnt, loff_t *ppos) | 4234 | size_t cnt, loff_t *ppos) |
4245 | { | 4235 | { |
4246 | unsigned long *ptr = filp->private_data; | ||
4247 | unsigned long val; | 4236 | unsigned long val; |
4248 | int ret; | 4237 | int ret; |
4249 | 4238 | ||
@@ -4256,6 +4245,52 @@ tracing_max_lat_write(struct file *filp, const char __user *ubuf, | |||
4256 | return cnt; | 4245 | return cnt; |
4257 | } | 4246 | } |
4258 | 4247 | ||
4248 | static ssize_t | ||
4249 | tracing_thresh_read(struct file *filp, char __user *ubuf, | ||
4250 | size_t cnt, loff_t *ppos) | ||
4251 | { | ||
4252 | return tracing_nsecs_read(&tracing_thresh, ubuf, cnt, ppos); | ||
4253 | } | ||
4254 | |||
4255 | static ssize_t | ||
4256 | tracing_thresh_write(struct file *filp, const char __user *ubuf, | ||
4257 | size_t cnt, loff_t *ppos) | ||
4258 | { | ||
4259 | struct trace_array *tr = filp->private_data; | ||
4260 | int ret; | ||
4261 | |||
4262 | mutex_lock(&trace_types_lock); | ||
4263 | ret = tracing_nsecs_write(&tracing_thresh, ubuf, cnt, ppos); | ||
4264 | if (ret < 0) | ||
4265 | goto out; | ||
4266 | |||
4267 | if (tr->current_trace->update_thresh) { | ||
4268 | ret = tr->current_trace->update_thresh(tr); | ||
4269 | if (ret < 0) | ||
4270 | goto out; | ||
4271 | } | ||
4272 | |||
4273 | ret = cnt; | ||
4274 | out: | ||
4275 | mutex_unlock(&trace_types_lock); | ||
4276 | |||
4277 | return ret; | ||
4278 | } | ||
4279 | |||
4280 | static ssize_t | ||
4281 | tracing_max_lat_read(struct file *filp, char __user *ubuf, | ||
4282 | size_t cnt, loff_t *ppos) | ||
4283 | { | ||
4284 | return tracing_nsecs_read(filp->private_data, ubuf, cnt, ppos); | ||
4285 | } | ||
4286 | |||
4287 | static ssize_t | ||
4288 | tracing_max_lat_write(struct file *filp, const char __user *ubuf, | ||
4289 | size_t cnt, loff_t *ppos) | ||
4290 | { | ||
4291 | return tracing_nsecs_write(filp->private_data, ubuf, cnt, ppos); | ||
4292 | } | ||
4293 | |||
4259 | static int tracing_open_pipe(struct inode *inode, struct file *filp) | 4294 | static int tracing_open_pipe(struct inode *inode, struct file *filp) |
4260 | { | 4295 | { |
4261 | struct trace_array *tr = inode->i_private; | 4296 | struct trace_array *tr = inode->i_private; |
@@ -5157,6 +5192,13 @@ static int snapshot_raw_open(struct inode *inode, struct file *filp) | |||
5157 | #endif /* CONFIG_TRACER_SNAPSHOT */ | 5192 | #endif /* CONFIG_TRACER_SNAPSHOT */ |
5158 | 5193 | ||
5159 | 5194 | ||
5195 | static const struct file_operations tracing_thresh_fops = { | ||
5196 | .open = tracing_open_generic, | ||
5197 | .read = tracing_thresh_read, | ||
5198 | .write = tracing_thresh_write, | ||
5199 | .llseek = generic_file_llseek, | ||
5200 | }; | ||
5201 | |||
5160 | static const struct file_operations tracing_max_lat_fops = { | 5202 | static const struct file_operations tracing_max_lat_fops = { |
5161 | .open = tracing_open_generic, | 5203 | .open = tracing_open_generic, |
5162 | .read = tracing_max_lat_read, | 5204 | .read = tracing_max_lat_read, |
@@ -6094,10 +6136,8 @@ destroy_trace_option_files(struct trace_option_dentry *topts) | |||
6094 | if (!topts) | 6136 | if (!topts) |
6095 | return; | 6137 | return; |
6096 | 6138 | ||
6097 | for (cnt = 0; topts[cnt].opt; cnt++) { | 6139 | for (cnt = 0; topts[cnt].opt; cnt++) |
6098 | if (topts[cnt].entry) | 6140 | debugfs_remove(topts[cnt].entry); |
6099 | debugfs_remove(topts[cnt].entry); | ||
6100 | } | ||
6101 | 6141 | ||
6102 | kfree(topts); | 6142 | kfree(topts); |
6103 | } | 6143 | } |
@@ -6520,7 +6560,7 @@ static __init int tracer_init_debugfs(void) | |||
6520 | init_tracer_debugfs(&global_trace, d_tracer); | 6560 | init_tracer_debugfs(&global_trace, d_tracer); |
6521 | 6561 | ||
6522 | trace_create_file("tracing_thresh", 0644, d_tracer, | 6562 | trace_create_file("tracing_thresh", 0644, d_tracer, |
6523 | &tracing_thresh, &tracing_max_lat_fops); | 6563 | &global_trace, &tracing_thresh_fops); |
6524 | 6564 | ||
6525 | trace_create_file("README", 0444, d_tracer, | 6565 | trace_create_file("README", 0444, d_tracer, |
6526 | NULL, &tracing_readme_fops); | 6566 | NULL, &tracing_readme_fops); |
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 9258f5a815db..385391fb1d3b 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h | |||
@@ -339,6 +339,7 @@ struct tracer_flags { | |||
339 | * @reset: called when one switches to another tracer | 339 | * @reset: called when one switches to another tracer |
340 | * @start: called when tracing is unpaused (echo 1 > tracing_enabled) | 340 | * @start: called when tracing is unpaused (echo 1 > tracing_enabled) |
341 | * @stop: called when tracing is paused (echo 0 > tracing_enabled) | 341 | * @stop: called when tracing is paused (echo 0 > tracing_enabled) |
342 | * @update_thresh: called when tracing_thresh is updated | ||
342 | * @open: called when the trace file is opened | 343 | * @open: called when the trace file is opened |
343 | * @pipe_open: called when the trace_pipe file is opened | 344 | * @pipe_open: called when the trace_pipe file is opened |
344 | * @close: called when the trace file is released | 345 | * @close: called when the trace file is released |
@@ -357,6 +358,7 @@ struct tracer { | |||
357 | void (*reset)(struct trace_array *tr); | 358 | void (*reset)(struct trace_array *tr); |
358 | void (*start)(struct trace_array *tr); | 359 | void (*start)(struct trace_array *tr); |
359 | void (*stop)(struct trace_array *tr); | 360 | void (*stop)(struct trace_array *tr); |
361 | int (*update_thresh)(struct trace_array *tr); | ||
360 | void (*open)(struct trace_iterator *iter); | 362 | void (*open)(struct trace_iterator *iter); |
361 | void (*pipe_open)(struct trace_iterator *iter); | 363 | void (*pipe_open)(struct trace_iterator *iter); |
362 | void (*close)(struct trace_iterator *iter); | 364 | void (*close)(struct trace_iterator *iter); |
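The new ->update_thresh() hook gives a tracer a chance to re-arm itself whenever tracing_thresh changes (the value written to the debugfs file is interpreted as microseconds). A minimal sketch of a tracer wiring it up (hypothetical tracer; the function_graph tracer further down in this diff does essentially the same via graph_trace_update_thresh()):

    /* Sketch: re-initialize the tracer when tracing_thresh is rewritten.
     * my_trace_reset()/my_trace_init() are assumed tracer-local helpers. */
    static int my_trace_update_thresh(struct trace_array *tr)
    {
            my_trace_reset(tr);
            return my_trace_init(tr);
    }

    static struct tracer my_tracer = {
            .name          = "my_tracer",
            .update_thresh = my_trace_update_thresh,
            /* .init, .reset, ... as usual */
    };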
diff --git a/kernel/trace/trace_clock.c b/kernel/trace/trace_clock.c index 26dc348332b7..57b67b1f24d1 100644 --- a/kernel/trace/trace_clock.c +++ b/kernel/trace/trace_clock.c | |||
@@ -59,13 +59,14 @@ u64 notrace trace_clock(void) | |||
59 | 59 | ||
60 | /* | 60 | /* |
61 | * trace_jiffy_clock(): Simply use jiffies as a clock counter. | 61 | * trace_jiffy_clock(): Simply use jiffies as a clock counter. |
62 | * Note that this use of jiffies_64 is not completely safe on | ||
63 | * 32-bit systems. But the window is tiny, and if we do hit it | ||
64 | * the effect is merely an obviously bogus timestamp on a | ||
65 | * trace event - i.e. not life threatening. | ||
62 | */ | 66 | */ |
63 | u64 notrace trace_clock_jiffies(void) | 67 | u64 notrace trace_clock_jiffies(void) |
64 | { | 68 | { |
65 | u64 jiffy = jiffies - INITIAL_JIFFIES; | 69 | return jiffies_64_to_clock_t(jiffies_64 - INITIAL_JIFFIES); |
66 | |||
67 | /* Return nsecs */ | ||
68 | return (u64)jiffies_to_usecs(jiffy) * 1000ULL; | ||
69 | } | 70 | } |
70 | 71 | ||
71 | /* | 72 | /* |
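The new trace_clock_jiffies() reports time in clock_t ticks (1/USER_HZ) instead of fabricated nanoseconds, which is why the "uptime" entry in the trace.c clock table earlier in this diff has its in_ns flag cleared. A small worked example, assuming the common HZ=1000, USER_HZ=100 configuration:

    /* Worked example (assumed HZ=1000, USER_HZ=100):
     *   jiffies_64 - INITIAL_JIFFIES == 2500   (2.5 s of uptime)
     *   jiffies_64_to_clock_t(2500)  == 250    (ticks of 10 ms each)
     * so the "uptime" clock now counts USER_HZ ticks, not nanoseconds. */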
diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c index 5d12bb407b44..4b9c114ee9de 100644 --- a/kernel/trace/trace_event_perf.c +++ b/kernel/trace/trace_event_perf.c | |||
@@ -30,6 +30,18 @@ static int perf_trace_event_perm(struct ftrace_event_call *tp_event, | |||
30 | return ret; | 30 | return ret; |
31 | } | 31 | } |
32 | 32 | ||
33 | /* | ||
34 | * We already checked and allowed creating the parent, | ||
35 | * so allow children without checking. | ||
36 | */ | ||
37 | if (p_event->parent) | ||
38 | return 0; | ||
39 | |||
40 | /* | ||
41 | * It's ok to check current process (owner) permissions in here, | ||
42 | * because code below is called only via perf_event_open syscall. | ||
43 | */ | ||
44 | |||
33 | /* The ftrace function trace is allowed only for root. */ | 45 | /* The ftrace function trace is allowed only for root. */ |
34 | if (ftrace_event_is_function(tp_event)) { | 46 | if (ftrace_event_is_function(tp_event)) { |
35 | if (perf_paranoid_tracepoint_raw() && !capable(CAP_SYS_ADMIN)) | 47 | if (perf_paranoid_tracepoint_raw() && !capable(CAP_SYS_ADMIN)) |
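The early return for child events works because inherited events are cloned inside the kernel during fork and never pass through perf_event_open() themselves; the parent was already vetted, so re-checking current's credentials for each child would test the wrong task. For reference, the paranoia helper used in the check here reduces to a sysctl comparison (assumed definition, not part of this patch):

    /* Assumed shape of the helper referenced in the hunk above */
    static inline bool perf_paranoid_tracepoint_raw(void)
    {
            return sysctl_perf_event_paranoid > -1;   /* -1 == "not paranoid" */
    }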
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index f99e0b3bca8c..ef06ce7e9cf8 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c | |||
@@ -8,6 +8,8 @@ | |||
8 | * | 8 | * |
9 | */ | 9 | */ |
10 | 10 | ||
11 | #define pr_fmt(fmt) fmt | ||
12 | |||
11 | #include <linux/workqueue.h> | 13 | #include <linux/workqueue.h> |
12 | #include <linux/spinlock.h> | 14 | #include <linux/spinlock.h> |
13 | #include <linux/kthread.h> | 15 | #include <linux/kthread.h> |
@@ -470,6 +472,7 @@ static void remove_event_file_dir(struct ftrace_event_file *file) | |||
470 | 472 | ||
471 | list_del(&file->list); | 473 | list_del(&file->list); |
472 | remove_subsystem(file->system); | 474 | remove_subsystem(file->system); |
475 | free_event_filter(file->filter); | ||
473 | kmem_cache_free(file_cachep, file); | 476 | kmem_cache_free(file_cachep, file); |
474 | } | 477 | } |
475 | 478 | ||
@@ -1490,7 +1493,7 @@ event_subsystem_dir(struct trace_array *tr, const char *name, | |||
1490 | 1493 | ||
1491 | dir->entry = debugfs_create_dir(name, parent); | 1494 | dir->entry = debugfs_create_dir(name, parent); |
1492 | if (!dir->entry) { | 1495 | if (!dir->entry) { |
1493 | pr_warning("Failed to create system directory %s\n", name); | 1496 | pr_warn("Failed to create system directory %s\n", name); |
1494 | __put_system(system); | 1497 | __put_system(system); |
1495 | goto out_free; | 1498 | goto out_free; |
1496 | } | 1499 | } |
@@ -1506,7 +1509,7 @@ event_subsystem_dir(struct trace_array *tr, const char *name, | |||
1506 | if (!entry) { | 1509 | if (!entry) { |
1507 | kfree(system->filter); | 1510 | kfree(system->filter); |
1508 | system->filter = NULL; | 1511 | system->filter = NULL; |
1509 | pr_warning("Could not create debugfs '%s/filter' entry\n", name); | 1512 | pr_warn("Could not create debugfs '%s/filter' entry\n", name); |
1510 | } | 1513 | } |
1511 | 1514 | ||
1512 | trace_create_file("enable", 0644, dir->entry, dir, | 1515 | trace_create_file("enable", 0644, dir->entry, dir, |
@@ -1521,8 +1524,7 @@ event_subsystem_dir(struct trace_array *tr, const char *name, | |||
1521 | out_fail: | 1524 | out_fail: |
1522 | /* Only print this message if failed on memory allocation */ | 1525 | /* Only print this message if failed on memory allocation */ |
1523 | if (!dir || !system) | 1526 | if (!dir || !system) |
1524 | pr_warning("No memory to create event subsystem %s\n", | 1527 | pr_warn("No memory to create event subsystem %s\n", name); |
1525 | name); | ||
1526 | return NULL; | 1528 | return NULL; |
1527 | } | 1529 | } |
1528 | 1530 | ||
@@ -1550,8 +1552,7 @@ event_create_dir(struct dentry *parent, struct ftrace_event_file *file) | |||
1550 | name = ftrace_event_name(call); | 1552 | name = ftrace_event_name(call); |
1551 | file->dir = debugfs_create_dir(name, d_events); | 1553 | file->dir = debugfs_create_dir(name, d_events); |
1552 | if (!file->dir) { | 1554 | if (!file->dir) { |
1553 | pr_warning("Could not create debugfs '%s' directory\n", | 1555 | pr_warn("Could not create debugfs '%s' directory\n", name); |
1554 | name); | ||
1555 | return -1; | 1556 | return -1; |
1556 | } | 1557 | } |
1557 | 1558 | ||
@@ -1574,8 +1575,8 @@ event_create_dir(struct dentry *parent, struct ftrace_event_file *file) | |||
1574 | if (list_empty(head)) { | 1575 | if (list_empty(head)) { |
1575 | ret = call->class->define_fields(call); | 1576 | ret = call->class->define_fields(call); |
1576 | if (ret < 0) { | 1577 | if (ret < 0) { |
1577 | pr_warning("Could not initialize trace point" | 1578 | pr_warn("Could not initialize trace point events/%s\n", |
1578 | " events/%s\n", name); | 1579 | name); |
1579 | return -1; | 1580 | return -1; |
1580 | } | 1581 | } |
1581 | } | 1582 | } |
@@ -1620,7 +1621,6 @@ static void event_remove(struct ftrace_event_call *call) | |||
1620 | if (file->event_call != call) | 1621 | if (file->event_call != call) |
1621 | continue; | 1622 | continue; |
1622 | ftrace_event_enable_disable(file, 0); | 1623 | ftrace_event_enable_disable(file, 0); |
1623 | destroy_preds(file); | ||
1624 | /* | 1624 | /* |
1625 | * The do_for_each_event_file() is | 1625 | * The do_for_each_event_file() is |
1626 | * a double loop. After finding the call for this | 1626 | * a double loop. After finding the call for this |
@@ -1648,8 +1648,7 @@ static int event_init(struct ftrace_event_call *call) | |||
1648 | if (call->class->raw_init) { | 1648 | if (call->class->raw_init) { |
1649 | ret = call->class->raw_init(call); | 1649 | ret = call->class->raw_init(call); |
1650 | if (ret < 0 && ret != -ENOSYS) | 1650 | if (ret < 0 && ret != -ENOSYS) |
1651 | pr_warn("Could not initialize trace events/%s\n", | 1651 | pr_warn("Could not initialize trace events/%s\n", name); |
1652 | name); | ||
1653 | } | 1652 | } |
1654 | 1653 | ||
1655 | return ret; | 1654 | return ret; |
@@ -1748,7 +1747,8 @@ static void __trace_remove_event_call(struct ftrace_event_call *call) | |||
1748 | { | 1747 | { |
1749 | event_remove(call); | 1748 | event_remove(call); |
1750 | trace_destroy_fields(call); | 1749 | trace_destroy_fields(call); |
1751 | destroy_call_preds(call); | 1750 | free_event_filter(call->filter); |
1751 | call->filter = NULL; | ||
1752 | } | 1752 | } |
1753 | 1753 | ||
1754 | static int probe_remove_event_call(struct ftrace_event_call *call) | 1754 | static int probe_remove_event_call(struct ftrace_event_call *call) |
@@ -1894,8 +1894,8 @@ __trace_add_event_dirs(struct trace_array *tr) | |||
1894 | list_for_each_entry(call, &ftrace_events, list) { | 1894 | list_for_each_entry(call, &ftrace_events, list) { |
1895 | ret = __trace_add_new_event(call, tr); | 1895 | ret = __trace_add_new_event(call, tr); |
1896 | if (ret < 0) | 1896 | if (ret < 0) |
1897 | pr_warning("Could not create directory for event %s\n", | 1897 | pr_warn("Could not create directory for event %s\n", |
1898 | ftrace_event_name(call)); | 1898 | ftrace_event_name(call)); |
1899 | } | 1899 | } |
1900 | } | 1900 | } |
1901 | 1901 | ||
@@ -2207,8 +2207,8 @@ __trace_early_add_event_dirs(struct trace_array *tr) | |||
2207 | list_for_each_entry(file, &tr->events, list) { | 2207 | list_for_each_entry(file, &tr->events, list) { |
2208 | ret = event_create_dir(tr->event_dir, file); | 2208 | ret = event_create_dir(tr->event_dir, file); |
2209 | if (ret < 0) | 2209 | if (ret < 0) |
2210 | pr_warning("Could not create directory for event %s\n", | 2210 | pr_warn("Could not create directory for event %s\n", |
2211 | ftrace_event_name(file->event_call)); | 2211 | ftrace_event_name(file->event_call)); |
2212 | } | 2212 | } |
2213 | } | 2213 | } |
2214 | 2214 | ||
@@ -2231,8 +2231,8 @@ __trace_early_add_events(struct trace_array *tr) | |||
2231 | 2231 | ||
2232 | ret = __trace_early_add_new_event(call, tr); | 2232 | ret = __trace_early_add_new_event(call, tr); |
2233 | if (ret < 0) | 2233 | if (ret < 0) |
2234 | pr_warning("Could not create early event %s\n", | 2234 | pr_warn("Could not create early event %s\n", |
2235 | ftrace_event_name(call)); | 2235 | ftrace_event_name(call)); |
2236 | } | 2236 | } |
2237 | } | 2237 | } |
2238 | 2238 | ||
@@ -2279,13 +2279,13 @@ create_event_toplevel_files(struct dentry *parent, struct trace_array *tr) | |||
2279 | entry = debugfs_create_file("set_event", 0644, parent, | 2279 | entry = debugfs_create_file("set_event", 0644, parent, |
2280 | tr, &ftrace_set_event_fops); | 2280 | tr, &ftrace_set_event_fops); |
2281 | if (!entry) { | 2281 | if (!entry) { |
2282 | pr_warning("Could not create debugfs 'set_event' entry\n"); | 2282 | pr_warn("Could not create debugfs 'set_event' entry\n"); |
2283 | return -ENOMEM; | 2283 | return -ENOMEM; |
2284 | } | 2284 | } |
2285 | 2285 | ||
2286 | d_events = debugfs_create_dir("events", parent); | 2286 | d_events = debugfs_create_dir("events", parent); |
2287 | if (!d_events) { | 2287 | if (!d_events) { |
2288 | pr_warning("Could not create debugfs 'events' directory\n"); | 2288 | pr_warn("Could not create debugfs 'events' directory\n"); |
2289 | return -ENOMEM; | 2289 | return -ENOMEM; |
2290 | } | 2290 | } |
2291 | 2291 | ||
@@ -2461,11 +2461,10 @@ static __init int event_trace_init(void) | |||
2461 | entry = debugfs_create_file("available_events", 0444, d_tracer, | 2461 | entry = debugfs_create_file("available_events", 0444, d_tracer, |
2462 | tr, &ftrace_avail_fops); | 2462 | tr, &ftrace_avail_fops); |
2463 | if (!entry) | 2463 | if (!entry) |
2464 | pr_warning("Could not create debugfs " | 2464 | pr_warn("Could not create debugfs 'available_events' entry\n"); |
2465 | "'available_events' entry\n"); | ||
2466 | 2465 | ||
2467 | if (trace_define_common_fields()) | 2466 | if (trace_define_common_fields()) |
2468 | pr_warning("tracing: Failed to allocate common fields"); | 2467 | pr_warn("tracing: Failed to allocate common fields"); |
2469 | 2468 | ||
2470 | ret = early_event_add_tracer(d_tracer, tr); | 2469 | ret = early_event_add_tracer(d_tracer, tr); |
2471 | if (ret) | 2470 | if (ret) |
@@ -2474,7 +2473,7 @@ static __init int event_trace_init(void) | |||
2474 | #ifdef CONFIG_MODULES | 2473 | #ifdef CONFIG_MODULES |
2475 | ret = register_module_notifier(&trace_module_nb); | 2474 | ret = register_module_notifier(&trace_module_nb); |
2476 | if (ret) | 2475 | if (ret) |
2477 | pr_warning("Failed to register trace events module notifier\n"); | 2476 | pr_warn("Failed to register trace events module notifier\n"); |
2478 | #endif | 2477 | #endif |
2479 | return 0; | 2478 | return 0; |
2480 | } | 2479 | } |
@@ -2578,7 +2577,7 @@ static __init void event_trace_self_tests(void) | |||
2578 | * it and the self test should not be on. | 2577 | * it and the self test should not be on. |
2579 | */ | 2578 | */ |
2580 | if (file->flags & FTRACE_EVENT_FL_ENABLED) { | 2579 | if (file->flags & FTRACE_EVENT_FL_ENABLED) { |
2581 | pr_warning("Enabled event during self test!\n"); | 2580 | pr_warn("Enabled event during self test!\n"); |
2582 | WARN_ON_ONCE(1); | 2581 | WARN_ON_ONCE(1); |
2583 | continue; | 2582 | continue; |
2584 | } | 2583 | } |
@@ -2606,8 +2605,8 @@ static __init void event_trace_self_tests(void) | |||
2606 | 2605 | ||
2607 | ret = __ftrace_set_clr_event(tr, NULL, system->name, NULL, 1); | 2606 | ret = __ftrace_set_clr_event(tr, NULL, system->name, NULL, 1); |
2608 | if (WARN_ON_ONCE(ret)) { | 2607 | if (WARN_ON_ONCE(ret)) { |
2609 | pr_warning("error enabling system %s\n", | 2608 | pr_warn("error enabling system %s\n", |
2610 | system->name); | 2609 | system->name); |
2611 | continue; | 2610 | continue; |
2612 | } | 2611 | } |
2613 | 2612 | ||
@@ -2615,8 +2614,8 @@ static __init void event_trace_self_tests(void) | |||
2615 | 2614 | ||
2616 | ret = __ftrace_set_clr_event(tr, NULL, system->name, NULL, 0); | 2615 | ret = __ftrace_set_clr_event(tr, NULL, system->name, NULL, 0); |
2617 | if (WARN_ON_ONCE(ret)) { | 2616 | if (WARN_ON_ONCE(ret)) { |
2618 | pr_warning("error disabling system %s\n", | 2617 | pr_warn("error disabling system %s\n", |
2619 | system->name); | 2618 | system->name); |
2620 | continue; | 2619 | continue; |
2621 | } | 2620 | } |
2622 | 2621 | ||
@@ -2630,7 +2629,7 @@ static __init void event_trace_self_tests(void) | |||
2630 | 2629 | ||
2631 | ret = __ftrace_set_clr_event(tr, NULL, NULL, NULL, 1); | 2630 | ret = __ftrace_set_clr_event(tr, NULL, NULL, NULL, 1); |
2632 | if (WARN_ON_ONCE(ret)) { | 2631 | if (WARN_ON_ONCE(ret)) { |
2633 | pr_warning("error enabling all events\n"); | 2632 | pr_warn("error enabling all events\n"); |
2634 | return; | 2633 | return; |
2635 | } | 2634 | } |
2636 | 2635 | ||
@@ -2639,7 +2638,7 @@ static __init void event_trace_self_tests(void) | |||
2639 | /* reset sysname */ | 2638 | /* reset sysname */ |
2640 | ret = __ftrace_set_clr_event(tr, NULL, NULL, NULL, 0); | 2639 | ret = __ftrace_set_clr_event(tr, NULL, NULL, NULL, 0); |
2641 | if (WARN_ON_ONCE(ret)) { | 2640 | if (WARN_ON_ONCE(ret)) { |
2642 | pr_warning("error disabling all events\n"); | 2641 | pr_warn("error disabling all events\n"); |
2643 | return; | 2642 | return; |
2644 | } | 2643 | } |
2645 | 2644 | ||
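The pr_fmt() definition added at the top of the file deliberately expands to the bare format string, so the pr_warn() conversions above keep their existing wording. For contrast, the more common pattern prefixes every printk in the file (illustrative only, not what trace_events.c does):

    /* Typical pr_fmt usage elsewhere in the kernel: every pr_warn()/pr_info()
     * in the file picks up the prefix automatically. */
    #define pr_fmt(fmt) "my_subsys: " fmt
    #include <linux/printk.h>

    /* pr_warn("failed: %d\n", err)  ->  "my_subsys: failed: -12" */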
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 8a8631926a07..7a8c1528e141 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c | |||
@@ -774,17 +774,12 @@ static void __free_preds(struct event_filter *filter) | |||
774 | filter->n_preds = 0; | 774 | filter->n_preds = 0; |
775 | } | 775 | } |
776 | 776 | ||
777 | static void call_filter_disable(struct ftrace_event_call *call) | ||
778 | { | ||
779 | call->flags &= ~TRACE_EVENT_FL_FILTERED; | ||
780 | } | ||
781 | |||
782 | static void filter_disable(struct ftrace_event_file *file) | 777 | static void filter_disable(struct ftrace_event_file *file) |
783 | { | 778 | { |
784 | struct ftrace_event_call *call = file->event_call; | 779 | struct ftrace_event_call *call = file->event_call; |
785 | 780 | ||
786 | if (call->flags & TRACE_EVENT_FL_USE_CALL_FILTER) | 781 | if (call->flags & TRACE_EVENT_FL_USE_CALL_FILTER) |
787 | call_filter_disable(call); | 782 | call->flags &= ~TRACE_EVENT_FL_FILTERED; |
788 | else | 783 | else |
789 | file->flags &= ~FTRACE_EVENT_FL_FILTERED; | 784 | file->flags &= ~FTRACE_EVENT_FL_FILTERED; |
790 | } | 785 | } |
@@ -804,32 +799,6 @@ void free_event_filter(struct event_filter *filter) | |||
804 | __free_filter(filter); | 799 | __free_filter(filter); |
805 | } | 800 | } |
806 | 801 | ||
807 | void destroy_call_preds(struct ftrace_event_call *call) | ||
808 | { | ||
809 | __free_filter(call->filter); | ||
810 | call->filter = NULL; | ||
811 | } | ||
812 | |||
813 | static void destroy_file_preds(struct ftrace_event_file *file) | ||
814 | { | ||
815 | __free_filter(file->filter); | ||
816 | file->filter = NULL; | ||
817 | } | ||
818 | |||
819 | /* | ||
820 | * Called when destroying the ftrace_event_file. | ||
821 | * The file is being freed, so we do not need to worry about | ||
822 | * the file being currently used. This is for module code removing | ||
823 | * the tracepoints from within it. | ||
824 | */ | ||
825 | void destroy_preds(struct ftrace_event_file *file) | ||
826 | { | ||
827 | if (file->event_call->flags & TRACE_EVENT_FL_USE_CALL_FILTER) | ||
828 | destroy_call_preds(file->event_call); | ||
829 | else | ||
830 | destroy_file_preds(file); | ||
831 | } | ||
832 | |||
833 | static struct event_filter *__alloc_filter(void) | 802 | static struct event_filter *__alloc_filter(void) |
834 | { | 803 | { |
835 | struct event_filter *filter; | 804 | struct event_filter *filter; |
@@ -873,17 +842,14 @@ static inline void __remove_filter(struct ftrace_event_file *file) | |||
873 | remove_filter_string(file->filter); | 842 | remove_filter_string(file->filter); |
874 | } | 843 | } |
875 | 844 | ||
876 | static void filter_free_subsystem_preds(struct event_subsystem *system, | 845 | static void filter_free_subsystem_preds(struct ftrace_subsystem_dir *dir, |
877 | struct trace_array *tr) | 846 | struct trace_array *tr) |
878 | { | 847 | { |
879 | struct ftrace_event_file *file; | 848 | struct ftrace_event_file *file; |
880 | struct ftrace_event_call *call; | ||
881 | 849 | ||
882 | list_for_each_entry(file, &tr->events, list) { | 850 | list_for_each_entry(file, &tr->events, list) { |
883 | call = file->event_call; | 851 | if (file->system != dir) |
884 | if (strcmp(call->class->system, system->name) != 0) | ||
885 | continue; | 852 | continue; |
886 | |||
887 | __remove_filter(file); | 853 | __remove_filter(file); |
888 | } | 854 | } |
889 | } | 855 | } |
@@ -901,15 +867,13 @@ static inline void __free_subsystem_filter(struct ftrace_event_file *file) | |||
901 | } | 867 | } |
902 | } | 868 | } |
903 | 869 | ||
904 | static void filter_free_subsystem_filters(struct event_subsystem *system, | 870 | static void filter_free_subsystem_filters(struct ftrace_subsystem_dir *dir, |
905 | struct trace_array *tr) | 871 | struct trace_array *tr) |
906 | { | 872 | { |
907 | struct ftrace_event_file *file; | 873 | struct ftrace_event_file *file; |
908 | struct ftrace_event_call *call; | ||
909 | 874 | ||
910 | list_for_each_entry(file, &tr->events, list) { | 875 | list_for_each_entry(file, &tr->events, list) { |
911 | call = file->event_call; | 876 | if (file->system != dir) |
912 | if (strcmp(call->class->system, system->name) != 0) | ||
913 | continue; | 877 | continue; |
914 | __free_subsystem_filter(file); | 878 | __free_subsystem_filter(file); |
915 | } | 879 | } |
@@ -1582,7 +1546,6 @@ static int fold_pred_tree(struct event_filter *filter, | |||
1582 | static int replace_preds(struct ftrace_event_call *call, | 1546 | static int replace_preds(struct ftrace_event_call *call, |
1583 | struct event_filter *filter, | 1547 | struct event_filter *filter, |
1584 | struct filter_parse_state *ps, | 1548 | struct filter_parse_state *ps, |
1585 | char *filter_string, | ||
1586 | bool dry_run) | 1549 | bool dry_run) |
1587 | { | 1550 | { |
1588 | char *operand1 = NULL, *operand2 = NULL; | 1551 | char *operand1 = NULL, *operand2 = NULL; |
@@ -1755,13 +1718,12 @@ struct filter_list { | |||
1755 | struct event_filter *filter; | 1718 | struct event_filter *filter; |
1756 | }; | 1719 | }; |
1757 | 1720 | ||
1758 | static int replace_system_preds(struct event_subsystem *system, | 1721 | static int replace_system_preds(struct ftrace_subsystem_dir *dir, |
1759 | struct trace_array *tr, | 1722 | struct trace_array *tr, |
1760 | struct filter_parse_state *ps, | 1723 | struct filter_parse_state *ps, |
1761 | char *filter_string) | 1724 | char *filter_string) |
1762 | { | 1725 | { |
1763 | struct ftrace_event_file *file; | 1726 | struct ftrace_event_file *file; |
1764 | struct ftrace_event_call *call; | ||
1765 | struct filter_list *filter_item; | 1727 | struct filter_list *filter_item; |
1766 | struct filter_list *tmp; | 1728 | struct filter_list *tmp; |
1767 | LIST_HEAD(filter_list); | 1729 | LIST_HEAD(filter_list); |
@@ -1769,15 +1731,14 @@ static int replace_system_preds(struct event_subsystem *system, | |||
1769 | int err; | 1731 | int err; |
1770 | 1732 | ||
1771 | list_for_each_entry(file, &tr->events, list) { | 1733 | list_for_each_entry(file, &tr->events, list) { |
1772 | call = file->event_call; | 1734 | if (file->system != dir) |
1773 | if (strcmp(call->class->system, system->name) != 0) | ||
1774 | continue; | 1735 | continue; |
1775 | 1736 | ||
1776 | /* | 1737 | /* |
1777 | * Try to see if the filter can be applied | 1738 | * Try to see if the filter can be applied |
1778 | * (filter arg is ignored on dry_run) | 1739 | * (filter arg is ignored on dry_run) |
1779 | */ | 1740 | */ |
1780 | err = replace_preds(call, NULL, ps, filter_string, true); | 1741 | err = replace_preds(file->event_call, NULL, ps, true); |
1781 | if (err) | 1742 | if (err) |
1782 | event_set_no_set_filter_flag(file); | 1743 | event_set_no_set_filter_flag(file); |
1783 | else | 1744 | else |
@@ -1787,9 +1748,7 @@ static int replace_system_preds(struct event_subsystem *system, | |||
1787 | list_for_each_entry(file, &tr->events, list) { | 1748 | list_for_each_entry(file, &tr->events, list) { |
1788 | struct event_filter *filter; | 1749 | struct event_filter *filter; |
1789 | 1750 | ||
1790 | call = file->event_call; | 1751 | if (file->system != dir) |
1791 | |||
1792 | if (strcmp(call->class->system, system->name) != 0) | ||
1793 | continue; | 1752 | continue; |
1794 | 1753 | ||
1795 | if (event_no_set_filter_flag(file)) | 1754 | if (event_no_set_filter_flag(file)) |
@@ -1811,7 +1770,7 @@ static int replace_system_preds(struct event_subsystem *system, | |||
1811 | if (err) | 1770 | if (err) |
1812 | goto fail_mem; | 1771 | goto fail_mem; |
1813 | 1772 | ||
1814 | err = replace_preds(call, filter, ps, filter_string, false); | 1773 | err = replace_preds(file->event_call, filter, ps, false); |
1815 | if (err) { | 1774 | if (err) { |
1816 | filter_disable(file); | 1775 | filter_disable(file); |
1817 | parse_error(ps, FILT_ERR_BAD_SUBSYS_FILTER, 0); | 1776 | parse_error(ps, FILT_ERR_BAD_SUBSYS_FILTER, 0); |
@@ -1933,7 +1892,7 @@ static int create_filter(struct ftrace_event_call *call, | |||
1933 | 1892 | ||
1934 | err = create_filter_start(filter_str, set_str, &ps, &filter); | 1893 | err = create_filter_start(filter_str, set_str, &ps, &filter); |
1935 | if (!err) { | 1894 | if (!err) { |
1936 | err = replace_preds(call, filter, ps, filter_str, false); | 1895 | err = replace_preds(call, filter, ps, false); |
1937 | if (err && set_str) | 1896 | if (err && set_str) |
1938 | append_filter_err(ps, filter); | 1897 | append_filter_err(ps, filter); |
1939 | } | 1898 | } |
@@ -1959,7 +1918,7 @@ int create_event_filter(struct ftrace_event_call *call, | |||
1959 | * Identical to create_filter() except that it creates a subsystem filter | 1918 | * Identical to create_filter() except that it creates a subsystem filter |
1960 | * and always remembers @filter_str. | 1919 | * and always remembers @filter_str. |
1961 | */ | 1920 | */ |
1962 | static int create_system_filter(struct event_subsystem *system, | 1921 | static int create_system_filter(struct ftrace_subsystem_dir *dir, |
1963 | struct trace_array *tr, | 1922 | struct trace_array *tr, |
1964 | char *filter_str, struct event_filter **filterp) | 1923 | char *filter_str, struct event_filter **filterp) |
1965 | { | 1924 | { |
@@ -1969,7 +1928,7 @@ static int create_system_filter(struct event_subsystem *system, | |||
1969 | 1928 | ||
1970 | err = create_filter_start(filter_str, true, &ps, &filter); | 1929 | err = create_filter_start(filter_str, true, &ps, &filter); |
1971 | if (!err) { | 1930 | if (!err) { |
1972 | err = replace_system_preds(system, tr, ps, filter_str); | 1931 | err = replace_system_preds(dir, tr, ps, filter_str); |
1973 | if (!err) { | 1932 | if (!err) { |
1974 | /* System filters just show a default message */ | 1933 | /* System filters just show a default message */ |
1975 | kfree(filter->filter_string); | 1934 | kfree(filter->filter_string); |
@@ -2053,18 +2012,18 @@ int apply_subsystem_event_filter(struct ftrace_subsystem_dir *dir, | |||
2053 | } | 2012 | } |
2054 | 2013 | ||
2055 | if (!strcmp(strstrip(filter_string), "0")) { | 2014 | if (!strcmp(strstrip(filter_string), "0")) { |
2056 | filter_free_subsystem_preds(system, tr); | 2015 | filter_free_subsystem_preds(dir, tr); |
2057 | remove_filter_string(system->filter); | 2016 | remove_filter_string(system->filter); |
2058 | filter = system->filter; | 2017 | filter = system->filter; |
2059 | system->filter = NULL; | 2018 | system->filter = NULL; |
2060 | /* Ensure all filters are no longer used */ | 2019 | /* Ensure all filters are no longer used */ |
2061 | synchronize_sched(); | 2020 | synchronize_sched(); |
2062 | filter_free_subsystem_filters(system, tr); | 2021 | filter_free_subsystem_filters(dir, tr); |
2063 | __free_filter(filter); | 2022 | __free_filter(filter); |
2064 | goto out_unlock; | 2023 | goto out_unlock; |
2065 | } | 2024 | } |
2066 | 2025 | ||
2067 | err = create_system_filter(system, tr, filter_string, &filter); | 2026 | err = create_system_filter(dir, tr, filter_string, &filter); |
2068 | if (filter) { | 2027 | if (filter) { |
2069 | /* | 2028 | /* |
2070 | * No event actually uses the system filter | 2029 | * No event actually uses the system filter |
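Passing the ftrace_subsystem_dir down lets the filter code match events by identity instead of by name: each ftrace_event_file already points at the subsystem directory it was created under, so a single pointer compare replaces the strcmp() on system names. A reduced sketch of the relationship being relied on (other fields omitted):

    /* Sketch of the data relationship behind the pointer compare */
    struct ftrace_event_file {
            struct ftrace_subsystem_dir *system;   /* dir this event lives in */
            /* ... */
    };

    /* Per-event check inside the loops above: */
    if (file->system != dir)        /* identity compare, no name lookup */
            continue;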
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index 4de3e57f723c..f0a0c982cde3 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c | |||
@@ -15,6 +15,33 @@ | |||
15 | #include "trace.h" | 15 | #include "trace.h" |
16 | #include "trace_output.h" | 16 | #include "trace_output.h" |
17 | 17 | ||
18 | static bool kill_ftrace_graph; | ||
19 | |||
20 | /** | ||
21 | * ftrace_graph_is_dead - returns true if ftrace_graph_stop() was called | ||
22 | * | ||
23 | * ftrace_graph_stop() is called when a severe error is detected in | ||
24 | * the function graph tracing. This function is called by the critical | ||
25 | * paths of function graph to keep those paths from doing any more harm. | ||
26 | */ | ||
27 | bool ftrace_graph_is_dead(void) | ||
28 | { | ||
29 | return kill_ftrace_graph; | ||
30 | } | ||
31 | |||
32 | /** | ||
33 | * ftrace_graph_stop - set to permanently disable function graph tracing | ||
34 | * | ||
35 | * In case of an error in function graph tracing, this is called | ||
36 | * to try to keep function graph tracing from causing any more harm. | ||
37 | * The error is usually severe, so this also tries to at least | ||
38 | * get a warning out to the user. | ||
39 | */ | ||
40 | void ftrace_graph_stop(void) | ||
41 | { | ||
42 | kill_ftrace_graph = true; | ||
43 | } | ||
44 | |||
18 | /* When set, irq functions will be ignored */ | 45 | /* When set, irq functions will be ignored */ |
19 | static int ftrace_graph_skip_irqs; | 46 | static int ftrace_graph_skip_irqs; |
20 | 47 | ||
@@ -92,6 +119,9 @@ ftrace_push_return_trace(unsigned long ret, unsigned long func, int *depth, | |||
92 | unsigned long long calltime; | 119 | unsigned long long calltime; |
93 | int index; | 120 | int index; |
94 | 121 | ||
122 | if (unlikely(ftrace_graph_is_dead())) | ||
123 | return -EBUSY; | ||
124 | |||
95 | if (!current->ret_stack) | 125 | if (!current->ret_stack) |
96 | return -EBUSY; | 126 | return -EBUSY; |
97 | 127 | ||
@@ -323,7 +353,7 @@ int trace_graph_entry(struct ftrace_graph_ent *trace) | |||
323 | return ret; | 353 | return ret; |
324 | } | 354 | } |
325 | 355 | ||
326 | int trace_graph_thresh_entry(struct ftrace_graph_ent *trace) | 356 | static int trace_graph_thresh_entry(struct ftrace_graph_ent *trace) |
327 | { | 357 | { |
328 | if (tracing_thresh) | 358 | if (tracing_thresh) |
329 | return 1; | 359 | return 1; |
@@ -412,7 +442,7 @@ void set_graph_array(struct trace_array *tr) | |||
412 | smp_mb(); | 442 | smp_mb(); |
413 | } | 443 | } |
414 | 444 | ||
415 | void trace_graph_thresh_return(struct ftrace_graph_ret *trace) | 445 | static void trace_graph_thresh_return(struct ftrace_graph_ret *trace) |
416 | { | 446 | { |
417 | if (tracing_thresh && | 447 | if (tracing_thresh && |
418 | (trace->rettime - trace->calltime < tracing_thresh)) | 448 | (trace->rettime - trace->calltime < tracing_thresh)) |
@@ -445,6 +475,12 @@ static void graph_trace_reset(struct trace_array *tr) | |||
445 | unregister_ftrace_graph(); | 475 | unregister_ftrace_graph(); |
446 | } | 476 | } |
447 | 477 | ||
478 | static int graph_trace_update_thresh(struct trace_array *tr) | ||
479 | { | ||
480 | graph_trace_reset(tr); | ||
481 | return graph_trace_init(tr); | ||
482 | } | ||
483 | |||
448 | static int max_bytes_for_cpu; | 484 | static int max_bytes_for_cpu; |
449 | 485 | ||
450 | static enum print_line_t | 486 | static enum print_line_t |
@@ -1399,7 +1435,7 @@ static void __print_graph_headers_flags(struct seq_file *s, u32 flags) | |||
1399 | seq_printf(s, " | | | |\n"); | 1435 | seq_printf(s, " | | | |\n"); |
1400 | } | 1436 | } |
1401 | 1437 | ||
1402 | void print_graph_headers(struct seq_file *s) | 1438 | static void print_graph_headers(struct seq_file *s) |
1403 | { | 1439 | { |
1404 | print_graph_headers_flags(s, tracer_flags.val); | 1440 | print_graph_headers_flags(s, tracer_flags.val); |
1405 | } | 1441 | } |
@@ -1495,6 +1531,7 @@ static struct trace_event graph_trace_ret_event = { | |||
1495 | 1531 | ||
1496 | static struct tracer graph_trace __tracer_data = { | 1532 | static struct tracer graph_trace __tracer_data = { |
1497 | .name = "function_graph", | 1533 | .name = "function_graph", |
1534 | .update_thresh = graph_trace_update_thresh, | ||
1498 | .open = graph_trace_open, | 1535 | .open = graph_trace_open, |
1499 | .pipe_open = graph_trace_open, | 1536 | .pipe_open = graph_trace_open, |
1500 | .close = graph_trace_close, | 1537 | .close = graph_trace_close, |
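ftrace_graph_stop() is now just a kill switch: it sets kill_ftrace_graph, and the hot paths (such as ftrace_push_return_trace() above) refuse to push new return entries once it is set. Error paths are expected to use it roughly like this (illustrative sketch, not a specific call site from this patch):

    /* Sketch of the intended use from a function-graph error path */
    if (unlikely(something_went_badly_wrong)) {     /* hypothetical condition */
            ftrace_graph_stop();    /* stop doing further harm ... */
            WARN_ON(1);             /* ... and at least tell the user */
    }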
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index f3dad80c20b2..c6977d5a9b12 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c | |||
@@ -20,23 +20,6 @@ static struct hlist_head event_hash[EVENT_HASHSIZE] __read_mostly; | |||
20 | 20 | ||
21 | static int next_event_type = __TRACE_LAST_TYPE + 1; | 21 | static int next_event_type = __TRACE_LAST_TYPE + 1; |
22 | 22 | ||
23 | int trace_print_seq(struct seq_file *m, struct trace_seq *s) | ||
24 | { | ||
25 | int len = s->len >= PAGE_SIZE ? PAGE_SIZE - 1 : s->len; | ||
26 | int ret; | ||
27 | |||
28 | ret = seq_write(m, s->buffer, len); | ||
29 | |||
30 | /* | ||
31 | * Only reset this buffer if we successfully wrote to the | ||
32 | * seq_file buffer. | ||
33 | */ | ||
34 | if (!ret) | ||
35 | trace_seq_init(s); | ||
36 | |||
37 | return ret; | ||
38 | } | ||
39 | |||
40 | enum print_line_t trace_print_bputs_msg_only(struct trace_iterator *iter) | 23 | enum print_line_t trace_print_bputs_msg_only(struct trace_iterator *iter) |
41 | { | 24 | { |
42 | struct trace_seq *s = &iter->seq; | 25 | struct trace_seq *s = &iter->seq; |
@@ -85,257 +68,6 @@ enum print_line_t trace_print_printk_msg_only(struct trace_iterator *iter) | |||
85 | return TRACE_TYPE_HANDLED; | 68 | return TRACE_TYPE_HANDLED; |
86 | } | 69 | } |
87 | 70 | ||
88 | /** | ||
89 | * trace_seq_printf - sequence printing of trace information | ||
90 | * @s: trace sequence descriptor | ||
91 | * @fmt: printf format string | ||
92 | * | ||
93 | * It returns 0 if the trace oversizes the buffer's free | ||
94 | * space, 1 otherwise. | ||
95 | * | ||
96 | * The tracer may use either sequence operations or its own | ||
97 | * copy to user routines. To simplify formating of a trace | ||
98 | * trace_seq_printf is used to store strings into a special | ||
99 | * buffer (@s). Then the output may be either used by | ||
100 | * the sequencer or pulled into another buffer. | ||
101 | */ | ||
102 | int | ||
103 | trace_seq_printf(struct trace_seq *s, const char *fmt, ...) | ||
104 | { | ||
105 | int len = (PAGE_SIZE - 1) - s->len; | ||
106 | va_list ap; | ||
107 | int ret; | ||
108 | |||
109 | if (s->full || !len) | ||
110 | return 0; | ||
111 | |||
112 | va_start(ap, fmt); | ||
113 | ret = vsnprintf(s->buffer + s->len, len, fmt, ap); | ||
114 | va_end(ap); | ||
115 | |||
116 | /* If we can't write it all, don't bother writing anything */ | ||
117 | if (ret >= len) { | ||
118 | s->full = 1; | ||
119 | return 0; | ||
120 | } | ||
121 | |||
122 | s->len += ret; | ||
123 | |||
124 | return 1; | ||
125 | } | ||
126 | EXPORT_SYMBOL_GPL(trace_seq_printf); | ||
127 | |||
128 | /** | ||
129 | * trace_seq_bitmask - put a list of longs as a bitmask print output | ||
130 | * @s: trace sequence descriptor | ||
131 | * @maskp: points to an array of unsigned longs that represent a bitmask | ||
132 | * @nmaskbits: The number of bits that are valid in @maskp | ||
133 | * | ||
134 | * It returns 0 if the trace oversizes the buffer's free | ||
135 | * space, 1 otherwise. | ||
136 | * | ||
137 | * Writes a ASCII representation of a bitmask string into @s. | ||
138 | */ | ||
139 | int | ||
140 | trace_seq_bitmask(struct trace_seq *s, const unsigned long *maskp, | ||
141 | int nmaskbits) | ||
142 | { | ||
143 | int len = (PAGE_SIZE - 1) - s->len; | ||
144 | int ret; | ||
145 | |||
146 | if (s->full || !len) | ||
147 | return 0; | ||
148 | |||
149 | ret = bitmap_scnprintf(s->buffer, len, maskp, nmaskbits); | ||
150 | s->len += ret; | ||
151 | |||
152 | return 1; | ||
153 | } | ||
154 | EXPORT_SYMBOL_GPL(trace_seq_bitmask); | ||
155 | |||
156 | /** | ||
157 | * trace_seq_vprintf - sequence printing of trace information | ||
158 | * @s: trace sequence descriptor | ||
159 | * @fmt: printf format string | ||
160 | * | ||
161 | * The tracer may use either sequence operations or its own | ||
162 | * copy to user routines. To simplify formating of a trace | ||
163 | * trace_seq_printf is used to store strings into a special | ||
164 | * buffer (@s). Then the output may be either used by | ||
165 | * the sequencer or pulled into another buffer. | ||
166 | */ | ||
167 | int | ||
168 | trace_seq_vprintf(struct trace_seq *s, const char *fmt, va_list args) | ||
169 | { | ||
170 | int len = (PAGE_SIZE - 1) - s->len; | ||
171 | int ret; | ||
172 | |||
173 | if (s->full || !len) | ||
174 | return 0; | ||
175 | |||
176 | ret = vsnprintf(s->buffer + s->len, len, fmt, args); | ||
177 | |||
178 | /* If we can't write it all, don't bother writing anything */ | ||
179 | if (ret >= len) { | ||
180 | s->full = 1; | ||
181 | return 0; | ||
182 | } | ||
183 | |||
184 | s->len += ret; | ||
185 | |||
186 | return len; | ||
187 | } | ||
188 | EXPORT_SYMBOL_GPL(trace_seq_vprintf); | ||
189 | |||
190 | int trace_seq_bprintf(struct trace_seq *s, const char *fmt, const u32 *binary) | ||
191 | { | ||
192 | int len = (PAGE_SIZE - 1) - s->len; | ||
193 | int ret; | ||
194 | |||
195 | if (s->full || !len) | ||
196 | return 0; | ||
197 | |||
198 | ret = bstr_printf(s->buffer + s->len, len, fmt, binary); | ||
199 | |||
200 | /* If we can't write it all, don't bother writing anything */ | ||
201 | if (ret >= len) { | ||
202 | s->full = 1; | ||
203 | return 0; | ||
204 | } | ||
205 | |||
206 | s->len += ret; | ||
207 | |||
208 | return len; | ||
209 | } | ||
210 | |||
211 | /** | ||
212 | * trace_seq_puts - trace sequence printing of simple string | ||
213 | * @s: trace sequence descriptor | ||
214 | * @str: simple string to record | ||
215 | * | ||
216 | * The tracer may use either the sequence operations or its own | ||
217 | * copy to user routines. This function records a simple string | ||
218 | * into a special buffer (@s) for later retrieval by a sequencer | ||
219 | * or other mechanism. | ||
220 | */ | ||
221 | int trace_seq_puts(struct trace_seq *s, const char *str) | ||
222 | { | ||
223 | int len = strlen(str); | ||
224 | |||
225 | if (s->full) | ||
226 | return 0; | ||
227 | |||
228 | if (len > ((PAGE_SIZE - 1) - s->len)) { | ||
229 | s->full = 1; | ||
230 | return 0; | ||
231 | } | ||
232 | |||
233 | memcpy(s->buffer + s->len, str, len); | ||
234 | s->len += len; | ||
235 | |||
236 | return len; | ||
237 | } | ||
238 | |||
239 | int trace_seq_putc(struct trace_seq *s, unsigned char c) | ||
240 | { | ||
241 | if (s->full) | ||
242 | return 0; | ||
243 | |||
244 | if (s->len >= (PAGE_SIZE - 1)) { | ||
245 | s->full = 1; | ||
246 | return 0; | ||
247 | } | ||
248 | |||
249 | s->buffer[s->len++] = c; | ||
250 | |||
251 | return 1; | ||
252 | } | ||
253 | EXPORT_SYMBOL(trace_seq_putc); | ||
254 | |||
255 | int trace_seq_putmem(struct trace_seq *s, const void *mem, size_t len) | ||
256 | { | ||
257 | if (s->full) | ||
258 | return 0; | ||
259 | |||
260 | if (len > ((PAGE_SIZE - 1) - s->len)) { | ||
261 | s->full = 1; | ||
262 | return 0; | ||
263 | } | ||
264 | |||
265 | memcpy(s->buffer + s->len, mem, len); | ||
266 | s->len += len; | ||
267 | |||
268 | return len; | ||
269 | } | ||
270 | |||
271 | int trace_seq_putmem_hex(struct trace_seq *s, const void *mem, size_t len) | ||
272 | { | ||
273 | unsigned char hex[HEX_CHARS]; | ||
274 | const unsigned char *data = mem; | ||
275 | int i, j; | ||
276 | |||
277 | if (s->full) | ||
278 | return 0; | ||
279 | |||
280 | #ifdef __BIG_ENDIAN | ||
281 | for (i = 0, j = 0; i < len; i++) { | ||
282 | #else | ||
283 | for (i = len-1, j = 0; i >= 0; i--) { | ||
284 | #endif | ||
285 | hex[j++] = hex_asc_hi(data[i]); | ||
286 | hex[j++] = hex_asc_lo(data[i]); | ||
287 | } | ||
288 | hex[j++] = ' '; | ||
289 | |||
290 | return trace_seq_putmem(s, hex, j); | ||
291 | } | ||
292 | |||
293 | void *trace_seq_reserve(struct trace_seq *s, size_t len) | ||
294 | { | ||
295 | void *ret; | ||
296 | |||
297 | if (s->full) | ||
298 | return NULL; | ||
299 | |||
300 | if (len > ((PAGE_SIZE - 1) - s->len)) { | ||
301 | s->full = 1; | ||
302 | return NULL; | ||
303 | } | ||
304 | |||
305 | ret = s->buffer + s->len; | ||
306 | s->len += len; | ||
307 | |||
308 | return ret; | ||
309 | } | ||
310 | |||
311 | int trace_seq_path(struct trace_seq *s, const struct path *path) | ||
312 | { | ||
313 | unsigned char *p; | ||
314 | |||
315 | if (s->full) | ||
316 | return 0; | ||
317 | |||
318 | if (s->len >= (PAGE_SIZE - 1)) { | ||
319 | s->full = 1; | ||
320 | return 0; | ||
321 | } | ||
322 | |||
323 | p = d_path(path, s->buffer + s->len, PAGE_SIZE - s->len); | ||
324 | if (!IS_ERR(p)) { | ||
325 | p = mangle_path(s->buffer + s->len, p, "\n"); | ||
326 | if (p) { | ||
327 | s->len = p - s->buffer; | ||
328 | return 1; | ||
329 | } | ||
330 | } else { | ||
331 | s->buffer[s->len++] = '?'; | ||
332 | return 1; | ||
333 | } | ||
334 | |||
335 | s->full = 1; | ||
336 | return 0; | ||
337 | } | ||
338 | |||
339 | const char * | 71 | const char * |
340 | ftrace_print_flags_seq(struct trace_seq *p, const char *delim, | 72 | ftrace_print_flags_seq(struct trace_seq *p, const char *delim, |
341 | unsigned long flags, | 73 | unsigned long flags, |
@@ -343,7 +75,7 @@ ftrace_print_flags_seq(struct trace_seq *p, const char *delim, | |||
343 | { | 75 | { |
344 | unsigned long mask; | 76 | unsigned long mask; |
345 | const char *str; | 77 | const char *str; |
346 | const char *ret = p->buffer + p->len; | 78 | const char *ret = trace_seq_buffer_ptr(p); |
347 | int i, first = 1; | 79 | int i, first = 1; |
348 | 80 | ||
349 | for (i = 0; flag_array[i].name && flags; i++) { | 81 | for (i = 0; flag_array[i].name && flags; i++) { |
@@ -379,7 +111,7 @@ ftrace_print_symbols_seq(struct trace_seq *p, unsigned long val, | |||
379 | const struct trace_print_flags *symbol_array) | 111 | const struct trace_print_flags *symbol_array) |
380 | { | 112 | { |
381 | int i; | 113 | int i; |
382 | const char *ret = p->buffer + p->len; | 114 | const char *ret = trace_seq_buffer_ptr(p); |
383 | 115 | ||
384 | for (i = 0; symbol_array[i].name; i++) { | 116 | for (i = 0; symbol_array[i].name; i++) { |
385 | 117 | ||
@@ -390,7 +122,7 @@ ftrace_print_symbols_seq(struct trace_seq *p, unsigned long val, | |||
390 | break; | 122 | break; |
391 | } | 123 | } |
392 | 124 | ||
393 | if (ret == (const char *)(p->buffer + p->len)) | 125 | if (ret == (const char *)(trace_seq_buffer_ptr(p))) |
394 | trace_seq_printf(p, "0x%lx", val); | 126 | trace_seq_printf(p, "0x%lx", val); |
395 | 127 | ||
396 | trace_seq_putc(p, 0); | 128 | trace_seq_putc(p, 0); |
@@ -405,7 +137,7 @@ ftrace_print_symbols_seq_u64(struct trace_seq *p, unsigned long long val, | |||
405 | const struct trace_print_flags_u64 *symbol_array) | 137 | const struct trace_print_flags_u64 *symbol_array) |
406 | { | 138 | { |
407 | int i; | 139 | int i; |
408 | const char *ret = p->buffer + p->len; | 140 | const char *ret = trace_seq_buffer_ptr(p); |
409 | 141 | ||
410 | for (i = 0; symbol_array[i].name; i++) { | 142 | for (i = 0; symbol_array[i].name; i++) { |
411 | 143 | ||
@@ -416,7 +148,7 @@ ftrace_print_symbols_seq_u64(struct trace_seq *p, unsigned long long val, | |||
416 | break; | 148 | break; |
417 | } | 149 | } |
418 | 150 | ||
419 | if (ret == (const char *)(p->buffer + p->len)) | 151 | if (ret == (const char *)(trace_seq_buffer_ptr(p))) |
420 | trace_seq_printf(p, "0x%llx", val); | 152 | trace_seq_printf(p, "0x%llx", val); |
421 | 153 | ||
422 | trace_seq_putc(p, 0); | 154 | trace_seq_putc(p, 0); |
@@ -430,7 +162,7 @@ const char * | |||
430 | ftrace_print_bitmask_seq(struct trace_seq *p, void *bitmask_ptr, | 162 | ftrace_print_bitmask_seq(struct trace_seq *p, void *bitmask_ptr, |
431 | unsigned int bitmask_size) | 163 | unsigned int bitmask_size) |
432 | { | 164 | { |
433 | const char *ret = p->buffer + p->len; | 165 | const char *ret = trace_seq_buffer_ptr(p); |
434 | 166 | ||
435 | trace_seq_bitmask(p, bitmask_ptr, bitmask_size * 8); | 167 | trace_seq_bitmask(p, bitmask_ptr, bitmask_size * 8); |
436 | trace_seq_putc(p, 0); | 168 | trace_seq_putc(p, 0); |
@@ -443,7 +175,7 @@ const char * | |||
443 | ftrace_print_hex_seq(struct trace_seq *p, const unsigned char *buf, int buf_len) | 175 | ftrace_print_hex_seq(struct trace_seq *p, const unsigned char *buf, int buf_len) |
444 | { | 176 | { |
445 | int i; | 177 | int i; |
446 | const char *ret = p->buffer + p->len; | 178 | const char *ret = trace_seq_buffer_ptr(p); |
447 | 179 | ||
448 | for (i = 0; i < buf_len; i++) | 180 | for (i = 0; i < buf_len; i++) |
449 | trace_seq_printf(p, "%s%2.2x", i == 0 ? "" : " ", buf[i]); | 181 | trace_seq_printf(p, "%s%2.2x", i == 0 ? "" : " ", buf[i]); |
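The hunks above replace the open-coded p->buffer + p->len expressions with the trace_seq_buffer_ptr() accessor used by this series. A minimal sketch of the pattern, assuming trace_seq_buffer_ptr() simply returns the current write position (s->buffer + s->len); the helper name print_hex_example is made up for illustration:

/*
 * Sketch only: remember where our output starts so the caller gets back
 * a NUL-terminated string that lives inside the trace_seq buffer.
 */
static const char *print_hex_example(struct trace_seq *p,
				     const unsigned char *buf, int buf_len)
{
	const char *ret = trace_seq_buffer_ptr(p);	/* start of our output */
	int i;

	for (i = 0; i < buf_len; i++)
		trace_seq_printf(p, "%s%2.2x", i == 0 ? "" : " ", buf[i]);
	trace_seq_putc(p, 0);				/* terminate the string */

	return ret;
}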
diff --git a/kernel/trace/trace_output.h b/kernel/trace/trace_output.h index 127a9d8c8357..80b25b585a70 100644 --- a/kernel/trace/trace_output.h +++ b/kernel/trace/trace_output.h | |||
@@ -35,9 +35,6 @@ trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry); | |||
35 | extern int __unregister_ftrace_event(struct trace_event *event); | 35 | extern int __unregister_ftrace_event(struct trace_event *event); |
36 | extern struct rw_semaphore trace_event_sem; | 36 | extern struct rw_semaphore trace_event_sem; |
37 | 37 | ||
38 | #define MAX_MEMHEX_BYTES 8 | ||
39 | #define HEX_CHARS (MAX_MEMHEX_BYTES*2 + 1) | ||
40 | |||
41 | #define SEQ_PUT_FIELD_RET(s, x) \ | 38 | #define SEQ_PUT_FIELD_RET(s, x) \ |
42 | do { \ | 39 | do { \ |
43 | if (!trace_seq_putmem(s, &(x), sizeof(x))) \ | 40 | if (!trace_seq_putmem(s, &(x), sizeof(x))) \ |
@@ -46,7 +43,6 @@ do { \ | |||
46 | 43 | ||
47 | #define SEQ_PUT_HEX_FIELD_RET(s, x) \ | 44 | #define SEQ_PUT_HEX_FIELD_RET(s, x) \ |
48 | do { \ | 45 | do { \ |
49 | BUILD_BUG_ON(sizeof(x) > MAX_MEMHEX_BYTES); \ | ||
50 | if (!trace_seq_putmem_hex(s, &(x), sizeof(x))) \ | 46 | if (!trace_seq_putmem_hex(s, &(x), sizeof(x))) \ |
51 | return TRACE_TYPE_PARTIAL_LINE; \ | 47 | return TRACE_TYPE_PARTIAL_LINE; \ |
52 | } while (0) | 48 | } while (0) |
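With MAX_MEMHEX_BYTES and its BUILD_BUG_ON moved into trace_seq.c, the remaining macro only bails out of the enclosing output callback when the buffer fills up. A hypothetical callback using it might look like the sketch below; the function name and field value are made up for illustration:

static enum print_line_t example_hex_output(struct trace_iterator *iter)
{
	struct trace_seq *s = &iter->seq;
	unsigned long addr = 0xdeadbeef;	/* stand-in field value */

	/* returns TRACE_TYPE_PARTIAL_LINE from this function if @s is full */
	SEQ_PUT_HEX_FIELD_RET(s, addr);

	return TRACE_TYPE_HANDLED;
}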
diff --git a/kernel/trace/trace_seq.c b/kernel/trace/trace_seq.c new file mode 100644 index 000000000000..1f24ed99dca2 --- /dev/null +++ b/kernel/trace/trace_seq.c | |||
@@ -0,0 +1,428 @@ | |||
1 | /* | ||
2 | * trace_seq.c | ||
3 | * | ||
4 | * Copyright (C) 2008-2014 Red Hat Inc, Steven Rostedt <srostedt@redhat.com> | ||
5 | * | ||
6 | * The trace_seq is a handy tool that allows you to pass a descriptor around | ||
7 | * to a buffer that other functions can write to. It is similar to the | ||
8 | * seq_file functionality but has some differences. | ||
9 | * | ||
10 | * To use it, the trace_seq must be initialized with trace_seq_init(). | ||
11 | * This will set up the counters within the descriptor. You can call | ||
12 | * trace_seq_init() more than once to reset the trace_seq to start | ||
13 | * from scratch. | ||
14 | * | ||
15 | * The buffer size is currently PAGE_SIZE, although it may become dynamic | ||
16 | * in the future. | ||
17 | * | ||
18 | * A write to the buffer will either succeed or fail. That is, unlike | ||
19 | * sprintf() there will not be a partial write (well it may write into | ||
20 | * the buffer but it won't update the pointers). This allows users to | ||
21 | * try to write something into the trace_seq buffer and if it fails | ||
22 | * they can flush it and try again. | ||
23 | * | ||
24 | */ | ||
25 | #include <linux/uaccess.h> | ||
26 | #include <linux/seq_file.h> | ||
27 | #include <linux/trace_seq.h> | ||
28 | |||
29 | /* How much buffer is left on the trace_seq? */ | ||
30 | #define TRACE_SEQ_BUF_LEFT(s) ((PAGE_SIZE - 1) - (s)->len) | ||
31 | |||
32 | /* How much buffer is written? */ | ||
33 | #define TRACE_SEQ_BUF_USED(s) min((s)->len, (unsigned int)(PAGE_SIZE - 1)) | ||
34 | |||
35 | /** | ||
36 | * trace_print_seq - move the contents of trace_seq into a seq_file | ||
37 | * @m: the seq_file descriptor that is the destination | ||
38 | * @s: the trace_seq descriptor that is the source. | ||
39 | * | ||
40 | * Returns 0 on success and non-zero on error. If the write to | ||
41 | * the seq_file succeeds, the trace_seq is reset; otherwise it is | ||
42 | * left unmodified so the caller can try again. | ||
43 | */ | ||
44 | int trace_print_seq(struct seq_file *m, struct trace_seq *s) | ||
45 | { | ||
46 | unsigned int len = TRACE_SEQ_BUF_USED(s); | ||
47 | int ret; | ||
48 | |||
49 | ret = seq_write(m, s->buffer, len); | ||
50 | |||
51 | /* | ||
52 | * Only reset this buffer if we successfully wrote to the | ||
53 | * seq_file buffer. This lets the caller try again or | ||
54 | * do something else with the contents. | ||
55 | */ | ||
56 | if (!ret) | ||
57 | trace_seq_init(s); | ||
58 | |||
59 | return ret; | ||
60 | } | ||
61 | |||
62 | /** | ||
63 | * trace_seq_printf - sequence printing of trace information | ||
64 | * @s: trace sequence descriptor | ||
65 | * @fmt: printf format string | ||
66 | * | ||
67 | * The tracer may use either sequence operations or its own | ||
68 | * copy to user routines. To simplify formatting of a trace, | ||
69 | * trace_seq_printf() is used to store strings into a special | ||
70 | * buffer (@s). Then the output may be either used by | ||
71 | * the sequencer or pulled into another buffer. | ||
72 | * | ||
73 | * Returns 1 if all the contents were successfully written to | ||
74 | * the buffer. | ||
75 | * Returns 0 if the length to write is bigger than the | ||
76 | * reserved buffer space. In this case, nothing gets written. | ||
77 | */ | ||
78 | int trace_seq_printf(struct trace_seq *s, const char *fmt, ...) | ||
79 | { | ||
80 | unsigned int len = TRACE_SEQ_BUF_LEFT(s); | ||
81 | va_list ap; | ||
82 | int ret; | ||
83 | |||
84 | if (s->full || !len) | ||
85 | return 0; | ||
86 | |||
87 | va_start(ap, fmt); | ||
88 | ret = vsnprintf(s->buffer + s->len, len, fmt, ap); | ||
89 | va_end(ap); | ||
90 | |||
91 | /* If we can't write it all, don't bother writing anything */ | ||
92 | if (ret >= len) { | ||
93 | s->full = 1; | ||
94 | return 0; | ||
95 | } | ||
96 | |||
97 | s->len += ret; | ||
98 | |||
99 | return 1; | ||
100 | } | ||
101 | EXPORT_SYMBOL_GPL(trace_seq_printf); | ||
102 | |||
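The all-or-nothing behaviour documented above is what lets callers retry: a failed trace_seq_printf() only sets ->full and leaves the accounted length untouched. A minimal sketch of that flush-and-retry pattern, assuming a seq_file as the flush target; the helper name is made up:

static void example_flush_and_retry(struct seq_file *m, struct trace_seq *s,
				    const char *msg)
{
	if (!trace_seq_printf(s, "%s\n", msg)) {
		/* flush what we have; trace_print_seq() reinits @s on success */
		trace_print_seq(m, s);
		trace_seq_printf(s, "%s\n", msg);	/* then try again */
	}
}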
103 | /** | ||
104 | * trace_seq_bitmask - write a bitmask array in its ASCII representation | ||
105 | * @s: trace sequence descriptor | ||
106 | * @maskp: points to an array of unsigned longs that represent a bitmask | ||
107 | * @nmaskbits: The number of bits that are valid in @maskp | ||
108 | * | ||
109 | * Writes an ASCII representation of a bitmask string into @s. | ||
110 | * | ||
111 | * Returns 1 if all the contents were successfully written to | ||
112 | * the buffer. | ||
113 | * Returns 0 if the length to write is bigger than the | ||
114 | * reserved buffer space. In this case, nothing gets written. | ||
115 | */ | ||
116 | int trace_seq_bitmask(struct trace_seq *s, const unsigned long *maskp, | ||
117 | int nmaskbits) | ||
118 | { | ||
119 | unsigned int len = TRACE_SEQ_BUF_LEFT(s); | ||
120 | int ret; | ||
121 | |||
122 | if (s->full || !len) | ||
123 | return 0; | ||
124 | |||
125 | ret = bitmap_scnprintf(s->buffer + s->len, len, maskp, nmaskbits); | ||
126 | s->len += ret; | ||
127 | |||
128 | return 1; | ||
129 | } | ||
130 | EXPORT_SYMBOL_GPL(trace_seq_bitmask); | ||
131 | |||
132 | /** | ||
133 | * trace_seq_vprintf - sequence printing of trace information | ||
134 | * @s: trace sequence descriptor | ||
135 | * @fmt: printf format string | ||
136 | * | ||
137 | * The tracer may use either sequence operations or its own | ||
138 | * copy to user routines. To simplify formatting of a trace, | ||
139 | * trace_seq_vprintf() is used to store strings into a special | ||
140 | * buffer (@s). Then the output may be either used by | ||
141 | * the sequencer or pulled into another buffer. | ||
142 | * | ||
143 | * Returns a positive value if the string was written in full, 0 otherwise. | ||
144 | */ | ||
145 | int trace_seq_vprintf(struct trace_seq *s, const char *fmt, va_list args) | ||
146 | { | ||
147 | unsigned int len = TRACE_SEQ_BUF_LEFT(s); | ||
148 | int ret; | ||
149 | |||
150 | if (s->full || !len) | ||
151 | return 0; | ||
152 | |||
153 | ret = vsnprintf(s->buffer + s->len, len, fmt, args); | ||
154 | |||
155 | /* If we can't write it all, don't bother writing anything */ | ||
156 | if (ret >= len) { | ||
157 | s->full = 1; | ||
158 | return 0; | ||
159 | } | ||
160 | |||
161 | s->len += ret; | ||
162 | |||
163 | return len; | ||
164 | } | ||
165 | EXPORT_SYMBOL_GPL(trace_seq_vprintf); | ||
166 | |||
167 | /** | ||
168 | * trace_seq_bprintf - Write the printf string from binary arguments | ||
169 | * @s: trace sequence descriptor | ||
170 | * @fmt: The format string for the @binary arguments | ||
171 | * @binary: The binary arguments for @fmt. | ||
172 | * | ||
173 | * When recording in a fast path, a printf may be recorded with just | ||
174 | * saving the format and the arguments as they were passed to the | ||
175 | * function, instead of wasting cycles converting the arguments into | ||
176 | * ASCII characters. Instead, the arguments are saved in a 32 bit | ||
177 | * word array that is defined by the format string constraints. | ||
178 | * | ||
179 | * This function will take the format and the binary array and finish | ||
180 | * the conversion into the ASCII string within the buffer. | ||
181 | * | ||
182 | * Returns a positive value if the string was written in full, 0 otherwise. | ||
183 | */ | ||
184 | int trace_seq_bprintf(struct trace_seq *s, const char *fmt, const u32 *binary) | ||
185 | { | ||
186 | unsigned int len = TRACE_SEQ_BUF_LEFT(s); | ||
187 | int ret; | ||
188 | |||
189 | if (s->full || !len) | ||
190 | return 0; | ||
191 | |||
192 | ret = bstr_printf(s->buffer + s->len, len, fmt, binary); | ||
193 | |||
194 | /* If we can't write it all, don't bother writing anything */ | ||
195 | if (ret >= len) { | ||
196 | s->full = 1; | ||
197 | return 0; | ||
198 | } | ||
199 | |||
200 | s->len += ret; | ||
201 | |||
202 | return len; | ||
203 | } | ||
204 | EXPORT_SYMBOL_GPL(trace_seq_bprintf); | ||
205 | |||
206 | /** | ||
207 | * trace_seq_puts - trace sequence printing of simple string | ||
208 | * @s: trace sequence descriptor | ||
209 | * @str: simple string to record | ||
210 | * | ||
211 | * The tracer may use either the sequence operations or its own | ||
212 | * copy to user routines. This function records a simple string | ||
213 | * into a special buffer (@s) for later retrieval by a sequencer | ||
214 | * or other mechanism. | ||
215 | * | ||
216 | * Returns how much it wrote to the buffer. | ||
217 | */ | ||
218 | int trace_seq_puts(struct trace_seq *s, const char *str) | ||
219 | { | ||
220 | unsigned int len = strlen(str); | ||
221 | |||
222 | if (s->full) | ||
223 | return 0; | ||
224 | |||
225 | if (len > TRACE_SEQ_BUF_LEFT(s)) { | ||
226 | s->full = 1; | ||
227 | return 0; | ||
228 | } | ||
229 | |||
230 | memcpy(s->buffer + s->len, str, len); | ||
231 | s->len += len; | ||
232 | |||
233 | return len; | ||
234 | } | ||
235 | EXPORT_SYMBOL_GPL(trace_seq_puts); | ||
236 | |||
237 | /** | ||
238 | * trace_seq_putc - trace sequence printing of simple character | ||
239 | * @s: trace sequence descriptor | ||
240 | * @c: simple character to record | ||
241 | * | ||
242 | * The tracer may use either the sequence operations or its own | ||
243 | * copy to user routines. This function records a simple character | ||
244 | * into a special buffer (@s) for later retrieval by a sequencer | ||
245 | * or other mechanism. | ||
246 | * | ||
247 | * Returns how much it wrote to the buffer. | ||
248 | */ | ||
249 | int trace_seq_putc(struct trace_seq *s, unsigned char c) | ||
250 | { | ||
251 | if (s->full) | ||
252 | return 0; | ||
253 | |||
254 | if (TRACE_SEQ_BUF_LEFT(s) < 1) { | ||
255 | s->full = 1; | ||
256 | return 0; | ||
257 | } | ||
258 | |||
259 | s->buffer[s->len++] = c; | ||
260 | |||
261 | return 1; | ||
262 | } | ||
263 | EXPORT_SYMBOL_GPL(trace_seq_putc); | ||
264 | |||
265 | /** | ||
266 | * trace_seq_putmem - write raw data into the trace_seq buffer | ||
267 | * @s: trace sequence descriptor | ||
268 | * @mem: The raw memory to copy into the buffer | ||
269 | * @len: The length of the raw memory to copy (in bytes) | ||
270 | * | ||
271 | * There may be cases where raw memory needs to be written into the | ||
272 | * buffer and a strcpy() would not work. Using this function allows | ||
273 | * for such cases. | ||
274 | * | ||
275 | * Returns how much it wrote to the buffer. | ||
276 | */ | ||
277 | int trace_seq_putmem(struct trace_seq *s, const void *mem, unsigned int len) | ||
278 | { | ||
279 | if (s->full) | ||
280 | return 0; | ||
281 | |||
282 | if (len > TRACE_SEQ_BUF_LEFT(s)) { | ||
283 | s->full = 1; | ||
284 | return 0; | ||
285 | } | ||
286 | |||
287 | memcpy(s->buffer + s->len, mem, len); | ||
288 | s->len += len; | ||
289 | |||
290 | return len; | ||
291 | } | ||
292 | EXPORT_SYMBOL_GPL(trace_seq_putmem); | ||
293 | |||
294 | #define MAX_MEMHEX_BYTES 8U | ||
295 | #define HEX_CHARS (MAX_MEMHEX_BYTES*2 + 1) | ||
296 | |||
297 | /** | ||
298 | * trace_seq_putmem_hex - write raw memory into the buffer in ASCII hex | ||
299 | * @s: trace sequence descriptor | ||
300 | * @mem: The raw memory to write its hex ASCII representation of | ||
301 | * @len: The length of the raw memory to copy (in bytes) | ||
302 | * | ||
303 | * This is similar to trace_seq_putmem() except instead of just copying the | ||
304 | * raw memory into the buffer it writes its ASCII representation of it | ||
305 | * in hex characters. | ||
306 | * | ||
307 | * Returns how much it wrote to the buffer. | ||
308 | */ | ||
309 | int trace_seq_putmem_hex(struct trace_seq *s, const void *mem, | ||
310 | unsigned int len) | ||
311 | { | ||
312 | unsigned char hex[HEX_CHARS]; | ||
313 | const unsigned char *data = mem; | ||
314 | unsigned int start_len; | ||
315 | int i, j; | ||
316 | int cnt = 0; | ||
317 | |||
318 | if (s->full) | ||
319 | return 0; | ||
320 | |||
321 | while (len) { | ||
322 | start_len = min(len, HEX_CHARS - 1); | ||
323 | #ifdef __BIG_ENDIAN | ||
324 | for (i = 0, j = 0; i < start_len; i++) { | ||
325 | #else | ||
326 | for (i = start_len-1, j = 0; i >= 0; i--) { | ||
327 | #endif | ||
328 | hex[j++] = hex_asc_hi(data[i]); | ||
329 | hex[j++] = hex_asc_lo(data[i]); | ||
330 | } | ||
331 | if (WARN_ON_ONCE(j == 0 || j/2 > len)) | ||
332 | break; | ||
333 | |||
334 | /* j increments twice per loop */ | ||
335 | len -= j / 2; | ||
336 | hex[j++] = ' '; | ||
337 | |||
338 | cnt += trace_seq_putmem(s, hex, j); | ||
339 | } | ||
340 | return cnt; | ||
341 | } | ||
342 | EXPORT_SYMBOL_GPL(trace_seq_putmem_hex); | ||
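For illustration, a hedged sketch of a caller: each chunk is walked from its most significant byte to its least significant, so the hex output is byte-order independent. The function name and value are made up:

static void example_putmem_hex(struct trace_seq *s)
{
	u64 field = 0x1122334455667788ULL;	/* stand-in value */

	/* emits "1122334455667788 " regardless of host endianness */
	trace_seq_putmem_hex(s, &field, sizeof(field));
}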
343 | |||
344 | /** | ||
345 | * trace_seq_path - copy a path into the sequence buffer | ||
346 | * @s: trace sequence descriptor | ||
347 | * @path: path to write into the sequence buffer. | ||
348 | * | ||
349 | * Write a path name into the sequence buffer. | ||
350 | * | ||
351 | * Returns 1 if all the contents were successfully written to | ||
352 | * the buffer. | ||
353 | * Returns 0 if the length to write is bigger than the | ||
354 | * reserved buffer space. In this case, nothing gets written. | ||
355 | */ | ||
356 | int trace_seq_path(struct trace_seq *s, const struct path *path) | ||
357 | { | ||
358 | unsigned char *p; | ||
359 | |||
360 | if (s->full) | ||
361 | return 0; | ||
362 | |||
363 | if (TRACE_SEQ_BUF_LEFT(s) < 1) { | ||
364 | s->full = 1; | ||
365 | return 0; | ||
366 | } | ||
367 | |||
368 | p = d_path(path, s->buffer + s->len, PAGE_SIZE - s->len); | ||
369 | if (!IS_ERR(p)) { | ||
370 | p = mangle_path(s->buffer + s->len, p, "\n"); | ||
371 | if (p) { | ||
372 | s->len = p - s->buffer; | ||
373 | return 1; | ||
374 | } | ||
375 | } else { | ||
376 | s->buffer[s->len++] = '?'; | ||
377 | return 1; | ||
378 | } | ||
379 | |||
380 | s->full = 1; | ||
381 | return 0; | ||
382 | } | ||
383 | EXPORT_SYMBOL_GPL(trace_seq_path); | ||
384 | |||
385 | /** | ||
386 | * trace_seq_to_user - copy the sequence buffer to user space | ||
387 | * @s: trace sequence descriptor | ||
388 | * @ubuf: The userspace memory location to copy to | ||
389 | * @cnt: The amount to copy | ||
390 | * | ||
391 | * Copies the sequence buffer into the userspace memory pointed to | ||
392 | * by @ubuf. It starts from the last read position (@s->readpos) | ||
393 | * and writes up to @cnt characters or until it reaches the end of | ||
394 | * the content in the buffer (@s->len), whichever comes first. | ||
395 | * | ||
396 | * On success, it returns the number of bytes it copied | ||
397 | * (a positive number). | ||
398 | * | ||
399 | * On failure, it returns -EBUSY if all of the content in the | ||
400 | * sequence has already been read, which includes the case of an | ||
401 | * empty sequence (@s->len == @s->readpos). | ||
402 | * | ||
403 | * Returns -EFAULT if the copy to userspace fails. | ||
404 | */ | ||
405 | int trace_seq_to_user(struct trace_seq *s, char __user *ubuf, int cnt) | ||
406 | { | ||
407 | int len; | ||
408 | int ret; | ||
409 | |||
410 | if (!cnt) | ||
411 | return 0; | ||
412 | |||
413 | if (s->len <= s->readpos) | ||
414 | return -EBUSY; | ||
415 | |||
416 | len = s->len - s->readpos; | ||
417 | if (cnt > len) | ||
418 | cnt = len; | ||
419 | ret = copy_to_user(ubuf, s->buffer + s->readpos, cnt); | ||
420 | if (ret == cnt) | ||
421 | return -EFAULT; | ||
422 | |||
423 | cnt -= ret; | ||
424 | |||
425 | s->readpos += cnt; | ||
426 | return cnt; | ||
427 | } | ||
428 | EXPORT_SYMBOL_GPL(trace_seq_to_user); | ||
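A sketch of a read()-style consumer of the function above, with a made-up name; @s->readpos advances on each successful call, so repeated reads drain the sequence until -EBUSY is returned:

static ssize_t example_read(struct trace_seq *s, char __user *ubuf, size_t cnt)
{
	int ret;

	ret = trace_seq_to_user(s, ubuf, cnt);
	if (ret == -EBUSY)
		return 0;		/* everything already consumed */

	return ret;			/* bytes copied, or -EFAULT */
}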
diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index 3c9b97e6b1f4..33ff6a24b802 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c | |||
@@ -265,7 +265,6 @@ alloc_trace_uprobe(const char *group, const char *event, int nargs, bool is_ret) | |||
265 | if (is_ret) | 265 | if (is_ret) |
266 | tu->consumer.ret_handler = uretprobe_dispatcher; | 266 | tu->consumer.ret_handler = uretprobe_dispatcher; |
267 | init_trace_uprobe_filter(&tu->filter); | 267 | init_trace_uprobe_filter(&tu->filter); |
268 | tu->tp.call.flags |= TRACE_EVENT_FL_USE_CALL_FILTER; | ||
269 | return tu; | 268 | return tu; |
270 | 269 | ||
271 | error: | 270 | error: |
@@ -1292,7 +1291,7 @@ static int register_uprobe_event(struct trace_uprobe *tu) | |||
1292 | kfree(call->print_fmt); | 1291 | kfree(call->print_fmt); |
1293 | return -ENODEV; | 1292 | return -ENODEV; |
1294 | } | 1293 | } |
1295 | call->flags = 0; | 1294 | |
1296 | call->class->reg = trace_uprobe_register; | 1295 | call->class->reg = trace_uprobe_register; |
1297 | call->data = tu; | 1296 | call->data = tu; |
1298 | ret = trace_add_event_call(call); | 1297 | ret = trace_add_event_call(call); |
diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 35974ac69600..5dbe22aa3efd 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c | |||
@@ -265,7 +265,6 @@ struct workqueue_struct { | |||
265 | 265 | ||
266 | static struct kmem_cache *pwq_cache; | 266 | static struct kmem_cache *pwq_cache; |
267 | 267 | ||
268 | static int wq_numa_tbl_len; /* highest possible NUMA node id + 1 */ | ||
269 | static cpumask_var_t *wq_numa_possible_cpumask; | 268 | static cpumask_var_t *wq_numa_possible_cpumask; |
270 | /* possible CPUs of each node */ | 269 | /* possible CPUs of each node */ |
271 | 270 | ||
@@ -758,13 +757,6 @@ static bool too_many_workers(struct worker_pool *pool) | |||
758 | int nr_idle = pool->nr_idle + managing; /* manager is considered idle */ | 757 | int nr_idle = pool->nr_idle + managing; /* manager is considered idle */ |
759 | int nr_busy = pool->nr_workers - nr_idle; | 758 | int nr_busy = pool->nr_workers - nr_idle; |
760 | 759 | ||
761 | /* | ||
762 | * nr_idle and idle_list may disagree if idle rebinding is in | ||
763 | * progress. Never return %true if idle_list is empty. | ||
764 | */ | ||
765 | if (list_empty(&pool->idle_list)) | ||
766 | return false; | ||
767 | |||
768 | return nr_idle > 2 && (nr_idle - 2) * MAX_IDLE_WORKERS_RATIO >= nr_busy; | 760 | return nr_idle > 2 && (nr_idle - 2) * MAX_IDLE_WORKERS_RATIO >= nr_busy; |
769 | } | 761 | } |
770 | 762 | ||
@@ -850,7 +842,7 @@ struct task_struct *wq_worker_sleeping(struct task_struct *task, int cpu) | |||
850 | pool = worker->pool; | 842 | pool = worker->pool; |
851 | 843 | ||
852 | /* this can only happen on the local cpu */ | 844 | /* this can only happen on the local cpu */ |
853 | if (WARN_ON_ONCE(cpu != raw_smp_processor_id())) | 845 | if (WARN_ON_ONCE(cpu != raw_smp_processor_id() || pool->cpu != cpu)) |
854 | return NULL; | 846 | return NULL; |
855 | 847 | ||
856 | /* | 848 | /* |
@@ -874,35 +866,22 @@ struct task_struct *wq_worker_sleeping(struct task_struct *task, int cpu) | |||
874 | * worker_set_flags - set worker flags and adjust nr_running accordingly | 866 | * worker_set_flags - set worker flags and adjust nr_running accordingly |
875 | * @worker: self | 867 | * @worker: self |
876 | * @flags: flags to set | 868 | * @flags: flags to set |
877 | * @wakeup: wakeup an idle worker if necessary | ||
878 | * | 869 | * |
879 | * Set @flags in @worker->flags and adjust nr_running accordingly. If | 870 | * Set @flags in @worker->flags and adjust nr_running accordingly. |
880 | * nr_running becomes zero and @wakeup is %true, an idle worker is | ||
881 | * woken up. | ||
882 | * | 871 | * |
883 | * CONTEXT: | 872 | * CONTEXT: |
884 | * spin_lock_irq(pool->lock) | 873 | * spin_lock_irq(pool->lock) |
885 | */ | 874 | */ |
886 | static inline void worker_set_flags(struct worker *worker, unsigned int flags, | 875 | static inline void worker_set_flags(struct worker *worker, unsigned int flags) |
887 | bool wakeup) | ||
888 | { | 876 | { |
889 | struct worker_pool *pool = worker->pool; | 877 | struct worker_pool *pool = worker->pool; |
890 | 878 | ||
891 | WARN_ON_ONCE(worker->task != current); | 879 | WARN_ON_ONCE(worker->task != current); |
892 | 880 | ||
893 | /* | 881 | /* If transitioning into NOT_RUNNING, adjust nr_running. */ |
894 | * If transitioning into NOT_RUNNING, adjust nr_running and | ||
895 | * wake up an idle worker as necessary if requested by | ||
896 | * @wakeup. | ||
897 | */ | ||
898 | if ((flags & WORKER_NOT_RUNNING) && | 882 | if ((flags & WORKER_NOT_RUNNING) && |
899 | !(worker->flags & WORKER_NOT_RUNNING)) { | 883 | !(worker->flags & WORKER_NOT_RUNNING)) { |
900 | if (wakeup) { | 884 | atomic_dec(&pool->nr_running); |
901 | if (atomic_dec_and_test(&pool->nr_running) && | ||
902 | !list_empty(&pool->worklist)) | ||
903 | wake_up_worker(pool); | ||
904 | } else | ||
905 | atomic_dec(&pool->nr_running); | ||
906 | } | 885 | } |
907 | 886 | ||
908 | worker->flags |= flags; | 887 | worker->flags |= flags; |
@@ -1232,7 +1211,7 @@ static int try_to_grab_pending(struct work_struct *work, bool is_dwork, | |||
1232 | pwq_activate_delayed_work(work); | 1211 | pwq_activate_delayed_work(work); |
1233 | 1212 | ||
1234 | list_del_init(&work->entry); | 1213 | list_del_init(&work->entry); |
1235 | pwq_dec_nr_in_flight(get_work_pwq(work), get_work_color(work)); | 1214 | pwq_dec_nr_in_flight(pwq, get_work_color(work)); |
1236 | 1215 | ||
1237 | /* work->data points to pwq iff queued, point to pool */ | 1216 | /* work->data points to pwq iff queued, point to pool */ |
1238 | set_work_pool_and_keep_pending(work, pool->id); | 1217 | set_work_pool_and_keep_pending(work, pool->id); |
@@ -1560,7 +1539,7 @@ static void worker_enter_idle(struct worker *worker) | |||
1560 | (worker->hentry.next || worker->hentry.pprev))) | 1539 | (worker->hentry.next || worker->hentry.pprev))) |
1561 | return; | 1540 | return; |
1562 | 1541 | ||
1563 | /* can't use worker_set_flags(), also called from start_worker() */ | 1542 | /* can't use worker_set_flags(), also called from create_worker() */ |
1564 | worker->flags |= WORKER_IDLE; | 1543 | worker->flags |= WORKER_IDLE; |
1565 | pool->nr_idle++; | 1544 | pool->nr_idle++; |
1566 | worker->last_active = jiffies; | 1545 | worker->last_active = jiffies; |
@@ -1602,11 +1581,11 @@ static void worker_leave_idle(struct worker *worker) | |||
1602 | list_del_init(&worker->entry); | 1581 | list_del_init(&worker->entry); |
1603 | } | 1582 | } |
1604 | 1583 | ||
1605 | static struct worker *alloc_worker(void) | 1584 | static struct worker *alloc_worker(int node) |
1606 | { | 1585 | { |
1607 | struct worker *worker; | 1586 | struct worker *worker; |
1608 | 1587 | ||
1609 | worker = kzalloc(sizeof(*worker), GFP_KERNEL); | 1588 | worker = kzalloc_node(sizeof(*worker), GFP_KERNEL, node); |
1610 | if (worker) { | 1589 | if (worker) { |
1611 | INIT_LIST_HEAD(&worker->entry); | 1590 | INIT_LIST_HEAD(&worker->entry); |
1612 | INIT_LIST_HEAD(&worker->scheduled); | 1591 | INIT_LIST_HEAD(&worker->scheduled); |
@@ -1670,6 +1649,9 @@ static void worker_detach_from_pool(struct worker *worker, | |||
1670 | detach_completion = pool->detach_completion; | 1649 | detach_completion = pool->detach_completion; |
1671 | mutex_unlock(&pool->attach_mutex); | 1650 | mutex_unlock(&pool->attach_mutex); |
1672 | 1651 | ||
1652 | /* clear leftover flags without pool->lock after it is detached */ | ||
1653 | worker->flags &= ~(WORKER_UNBOUND | WORKER_REBOUND); | ||
1654 | |||
1673 | if (detach_completion) | 1655 | if (detach_completion) |
1674 | complete(detach_completion); | 1656 | complete(detach_completion); |
1675 | } | 1657 | } |
@@ -1678,8 +1660,7 @@ static void worker_detach_from_pool(struct worker *worker, | |||
1678 | * create_worker - create a new workqueue worker | 1660 | * create_worker - create a new workqueue worker |
1679 | * @pool: pool the new worker will belong to | 1661 | * @pool: pool the new worker will belong to |
1680 | * | 1662 | * |
1681 | * Create a new worker which is attached to @pool. The new worker must be | 1663 | * Create and start a new worker which is attached to @pool. |
1682 | * started by start_worker(). | ||
1683 | * | 1664 | * |
1684 | * CONTEXT: | 1665 | * CONTEXT: |
1685 | * Might sleep. Does GFP_KERNEL allocations. | 1666 | * Might sleep. Does GFP_KERNEL allocations. |
@@ -1698,7 +1679,7 @@ static struct worker *create_worker(struct worker_pool *pool) | |||
1698 | if (id < 0) | 1679 | if (id < 0) |
1699 | goto fail; | 1680 | goto fail; |
1700 | 1681 | ||
1701 | worker = alloc_worker(); | 1682 | worker = alloc_worker(pool->node); |
1702 | if (!worker) | 1683 | if (!worker) |
1703 | goto fail; | 1684 | goto fail; |
1704 | 1685 | ||
@@ -1724,6 +1705,13 @@ static struct worker *create_worker(struct worker_pool *pool) | |||
1724 | /* successful, attach the worker to the pool */ | 1705 | /* successful, attach the worker to the pool */ |
1725 | worker_attach_to_pool(worker, pool); | 1706 | worker_attach_to_pool(worker, pool); |
1726 | 1707 | ||
1708 | /* start the newly created worker */ | ||
1709 | spin_lock_irq(&pool->lock); | ||
1710 | worker->pool->nr_workers++; | ||
1711 | worker_enter_idle(worker); | ||
1712 | wake_up_process(worker->task); | ||
1713 | spin_unlock_irq(&pool->lock); | ||
1714 | |||
1727 | return worker; | 1715 | return worker; |
1728 | 1716 | ||
1729 | fail: | 1717 | fail: |
@@ -1734,44 +1722,6 @@ fail: | |||
1734 | } | 1722 | } |
1735 | 1723 | ||
1736 | /** | 1724 | /** |
1737 | * start_worker - start a newly created worker | ||
1738 | * @worker: worker to start | ||
1739 | * | ||
1740 | * Make the pool aware of @worker and start it. | ||
1741 | * | ||
1742 | * CONTEXT: | ||
1743 | * spin_lock_irq(pool->lock). | ||
1744 | */ | ||
1745 | static void start_worker(struct worker *worker) | ||
1746 | { | ||
1747 | worker->pool->nr_workers++; | ||
1748 | worker_enter_idle(worker); | ||
1749 | wake_up_process(worker->task); | ||
1750 | } | ||
1751 | |||
1752 | /** | ||
1753 | * create_and_start_worker - create and start a worker for a pool | ||
1754 | * @pool: the target pool | ||
1755 | * | ||
1756 | * Grab the managership of @pool and create and start a new worker for it. | ||
1757 | * | ||
1758 | * Return: 0 on success. A negative error code otherwise. | ||
1759 | */ | ||
1760 | static int create_and_start_worker(struct worker_pool *pool) | ||
1761 | { | ||
1762 | struct worker *worker; | ||
1763 | |||
1764 | worker = create_worker(pool); | ||
1765 | if (worker) { | ||
1766 | spin_lock_irq(&pool->lock); | ||
1767 | start_worker(worker); | ||
1768 | spin_unlock_irq(&pool->lock); | ||
1769 | } | ||
1770 | |||
1771 | return worker ? 0 : -ENOMEM; | ||
1772 | } | ||
1773 | |||
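With start_worker() and create_and_start_worker() removed, create_worker() attaches and starts the worker itself, so callers (see the init_workqueues() and CPU-hotplug hunks later in this diff) only test its return value. A minimal sketch of the new caller pattern, with a made-up helper name:

static int example_ensure_worker(struct worker_pool *pool)
{
	/*
	 * create_worker() now returns NULL on failure and starts the
	 * worker on success, so no separate start step is needed.
	 */
	if (!create_worker(pool))
		return -ENOMEM;

	return 0;
}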
1774 | /** | ||
1775 | * destroy_worker - destroy a workqueue worker | 1725 | * destroy_worker - destroy a workqueue worker |
1776 | * @worker: worker to be destroyed | 1726 | * @worker: worker to be destroyed |
1777 | * | 1727 | * |
@@ -1909,23 +1859,10 @@ restart: | |||
1909 | mod_timer(&pool->mayday_timer, jiffies + MAYDAY_INITIAL_TIMEOUT); | 1859 | mod_timer(&pool->mayday_timer, jiffies + MAYDAY_INITIAL_TIMEOUT); |
1910 | 1860 | ||
1911 | while (true) { | 1861 | while (true) { |
1912 | struct worker *worker; | 1862 | if (create_worker(pool) || !need_to_create_worker(pool)) |
1913 | |||
1914 | worker = create_worker(pool); | ||
1915 | if (worker) { | ||
1916 | del_timer_sync(&pool->mayday_timer); | ||
1917 | spin_lock_irq(&pool->lock); | ||
1918 | start_worker(worker); | ||
1919 | if (WARN_ON_ONCE(need_to_create_worker(pool))) | ||
1920 | goto restart; | ||
1921 | return true; | ||
1922 | } | ||
1923 | |||
1924 | if (!need_to_create_worker(pool)) | ||
1925 | break; | 1863 | break; |
1926 | 1864 | ||
1927 | __set_current_state(TASK_INTERRUPTIBLE); | 1865 | schedule_timeout_interruptible(CREATE_COOLDOWN); |
1928 | schedule_timeout(CREATE_COOLDOWN); | ||
1929 | 1866 | ||
1930 | if (!need_to_create_worker(pool)) | 1867 | if (!need_to_create_worker(pool)) |
1931 | break; | 1868 | break; |
@@ -1933,6 +1870,11 @@ restart: | |||
1933 | 1870 | ||
1934 | del_timer_sync(&pool->mayday_timer); | 1871 | del_timer_sync(&pool->mayday_timer); |
1935 | spin_lock_irq(&pool->lock); | 1872 | spin_lock_irq(&pool->lock); |
1873 | /* | ||
1874 | * This is necessary even after a new worker was just successfully | ||
1875 | * created as @pool->lock was dropped and the new worker might have | ||
1876 | * already become busy. | ||
1877 | */ | ||
1936 | if (need_to_create_worker(pool)) | 1878 | if (need_to_create_worker(pool)) |
1937 | goto restart; | 1879 | goto restart; |
1938 | return true; | 1880 | return true; |
@@ -2020,13 +1962,8 @@ __acquires(&pool->lock) | |||
2020 | 1962 | ||
2021 | lockdep_copy_map(&lockdep_map, &work->lockdep_map); | 1963 | lockdep_copy_map(&lockdep_map, &work->lockdep_map); |
2022 | #endif | 1964 | #endif |
2023 | /* | 1965 | /* ensure we're on the correct CPU */ |
2024 | * Ensure we're on the correct CPU. DISASSOCIATED test is | 1966 | WARN_ON_ONCE(!(pool->flags & POOL_DISASSOCIATED) && |
2025 | * necessary to avoid spurious warnings from rescuers servicing the | ||
2026 | * unbound or a disassociated pool. | ||
2027 | */ | ||
2028 | WARN_ON_ONCE(!(worker->flags & WORKER_UNBOUND) && | ||
2029 | !(pool->flags & POOL_DISASSOCIATED) && | ||
2030 | raw_smp_processor_id() != pool->cpu); | 1967 | raw_smp_processor_id() != pool->cpu); |
2031 | 1968 | ||
2032 | /* | 1969 | /* |
@@ -2052,17 +1989,22 @@ __acquires(&pool->lock) | |||
2052 | list_del_init(&work->entry); | 1989 | list_del_init(&work->entry); |
2053 | 1990 | ||
2054 | /* | 1991 | /* |
2055 | * CPU intensive works don't participate in concurrency | 1992 | * CPU intensive works don't participate in concurrency management. |
2056 | * management. They're the scheduler's responsibility. | 1993 | * They're the scheduler's responsibility. This takes @worker out |
1994 | * of concurrency management and the next code block will chain | ||
1995 | * execution of the pending work items. | ||
2057 | */ | 1996 | */ |
2058 | if (unlikely(cpu_intensive)) | 1997 | if (unlikely(cpu_intensive)) |
2059 | worker_set_flags(worker, WORKER_CPU_INTENSIVE, true); | 1998 | worker_set_flags(worker, WORKER_CPU_INTENSIVE); |
2060 | 1999 | ||
2061 | /* | 2000 | /* |
2062 | * Unbound pool isn't concurrency managed and work items should be | 2001 | * Wake up another worker if necessary. The condition is always |
2063 | * executed ASAP. Wake up another worker if necessary. | 2002 | * false for normal per-cpu workers since nr_running would always |
2003 | * be >= 1 at this point. This is used to chain execution of the | ||
2004 | * pending work items for WORKER_NOT_RUNNING workers such as the | ||
2005 | * UNBOUND and CPU_INTENSIVE ones. | ||
2064 | */ | 2006 | */ |
2065 | if ((worker->flags & WORKER_UNBOUND) && need_more_worker(pool)) | 2007 | if (need_more_worker(pool)) |
2066 | wake_up_worker(pool); | 2008 | wake_up_worker(pool); |
2067 | 2009 | ||
2068 | /* | 2010 | /* |
@@ -2218,7 +2160,7 @@ recheck: | |||
2218 | } | 2160 | } |
2219 | } while (keep_working(pool)); | 2161 | } while (keep_working(pool)); |
2220 | 2162 | ||
2221 | worker_set_flags(worker, WORKER_PREP, false); | 2163 | worker_set_flags(worker, WORKER_PREP); |
2222 | sleep: | 2164 | sleep: |
2223 | /* | 2165 | /* |
2224 | * pool->lock is held and there's no work to process and no need to | 2166 | * pool->lock is held and there's no work to process and no need to |
@@ -2311,29 +2253,27 @@ repeat: | |||
2311 | move_linked_works(work, scheduled, &n); | 2253 | move_linked_works(work, scheduled, &n); |
2312 | 2254 | ||
2313 | process_scheduled_works(rescuer); | 2255 | process_scheduled_works(rescuer); |
2314 | spin_unlock_irq(&pool->lock); | ||
2315 | |||
2316 | worker_detach_from_pool(rescuer, pool); | ||
2317 | |||
2318 | spin_lock_irq(&pool->lock); | ||
2319 | 2256 | ||
2320 | /* | 2257 | /* |
2321 | * Put the reference grabbed by send_mayday(). @pool won't | 2258 | * Put the reference grabbed by send_mayday(). @pool won't |
2322 | * go away while we're holding its lock. | 2259 | * go away while we're still attached to it. |
2323 | */ | 2260 | */ |
2324 | put_pwq(pwq); | 2261 | put_pwq(pwq); |
2325 | 2262 | ||
2326 | /* | 2263 | /* |
2327 | * Leave this pool. If keep_working() is %true, notify a | 2264 | * Leave this pool. If need_more_worker() is %true, notify a |
2328 | * regular worker; otherwise, we end up with 0 concurrency | 2265 | * regular worker; otherwise, we end up with 0 concurrency |
2329 | * and stalling the execution. | 2266 | * and stalling the execution. |
2330 | */ | 2267 | */ |
2331 | if (keep_working(pool)) | 2268 | if (need_more_worker(pool)) |
2332 | wake_up_worker(pool); | 2269 | wake_up_worker(pool); |
2333 | 2270 | ||
2334 | rescuer->pool = NULL; | 2271 | rescuer->pool = NULL; |
2335 | spin_unlock(&pool->lock); | 2272 | spin_unlock_irq(&pool->lock); |
2336 | spin_lock(&wq_mayday_lock); | 2273 | |
2274 | worker_detach_from_pool(rescuer, pool); | ||
2275 | |||
2276 | spin_lock_irq(&wq_mayday_lock); | ||
2337 | } | 2277 | } |
2338 | 2278 | ||
2339 | spin_unlock_irq(&wq_mayday_lock); | 2279 | spin_unlock_irq(&wq_mayday_lock); |
@@ -3458,7 +3398,7 @@ static void put_unbound_pool(struct worker_pool *pool) | |||
3458 | return; | 3398 | return; |
3459 | 3399 | ||
3460 | /* sanity checks */ | 3400 | /* sanity checks */ |
3461 | if (WARN_ON(!(pool->flags & POOL_DISASSOCIATED)) || | 3401 | if (WARN_ON(!(pool->cpu < 0)) || |
3462 | WARN_ON(!list_empty(&pool->worklist))) | 3402 | WARN_ON(!list_empty(&pool->worklist))) |
3463 | return; | 3403 | return; |
3464 | 3404 | ||
@@ -3524,7 +3464,7 @@ static struct worker_pool *get_unbound_pool(const struct workqueue_attrs *attrs) | |||
3524 | hash_for_each_possible(unbound_pool_hash, pool, hash_node, hash) { | 3464 | hash_for_each_possible(unbound_pool_hash, pool, hash_node, hash) { |
3525 | if (wqattrs_equal(pool->attrs, attrs)) { | 3465 | if (wqattrs_equal(pool->attrs, attrs)) { |
3526 | pool->refcnt++; | 3466 | pool->refcnt++; |
3527 | goto out_unlock; | 3467 | return pool; |
3528 | } | 3468 | } |
3529 | } | 3469 | } |
3530 | 3470 | ||
@@ -3557,12 +3497,12 @@ static struct worker_pool *get_unbound_pool(const struct workqueue_attrs *attrs) | |||
3557 | goto fail; | 3497 | goto fail; |
3558 | 3498 | ||
3559 | /* create and start the initial worker */ | 3499 | /* create and start the initial worker */ |
3560 | if (create_and_start_worker(pool) < 0) | 3500 | if (!create_worker(pool)) |
3561 | goto fail; | 3501 | goto fail; |
3562 | 3502 | ||
3563 | /* install */ | 3503 | /* install */ |
3564 | hash_add(unbound_pool_hash, &pool->hash_node, hash); | 3504 | hash_add(unbound_pool_hash, &pool->hash_node, hash); |
3565 | out_unlock: | 3505 | |
3566 | return pool; | 3506 | return pool; |
3567 | fail: | 3507 | fail: |
3568 | if (pool) | 3508 | if (pool) |
@@ -3591,11 +3531,6 @@ static void pwq_unbound_release_workfn(struct work_struct *work) | |||
3591 | if (WARN_ON_ONCE(!(wq->flags & WQ_UNBOUND))) | 3531 | if (WARN_ON_ONCE(!(wq->flags & WQ_UNBOUND))) |
3592 | return; | 3532 | return; |
3593 | 3533 | ||
3594 | /* | ||
3595 | * Unlink @pwq. Synchronization against wq->mutex isn't strictly | ||
3596 | * necessary on release but do it anyway. It's easier to verify | ||
3597 | * and consistent with the linking path. | ||
3598 | */ | ||
3599 | mutex_lock(&wq->mutex); | 3534 | mutex_lock(&wq->mutex); |
3600 | list_del_rcu(&pwq->pwqs_node); | 3535 | list_del_rcu(&pwq->pwqs_node); |
3601 | is_last = list_empty(&wq->pwqs); | 3536 | is_last = list_empty(&wq->pwqs); |
@@ -3692,10 +3627,7 @@ static void link_pwq(struct pool_workqueue *pwq) | |||
3692 | if (!list_empty(&pwq->pwqs_node)) | 3627 | if (!list_empty(&pwq->pwqs_node)) |
3693 | return; | 3628 | return; |
3694 | 3629 | ||
3695 | /* | 3630 | /* set the matching work_color */ |
3696 | * Set the matching work_color. This is synchronized with | ||
3697 | * wq->mutex to avoid confusing flush_workqueue(). | ||
3698 | */ | ||
3699 | pwq->work_color = wq->work_color; | 3631 | pwq->work_color = wq->work_color; |
3700 | 3632 | ||
3701 | /* sync max_active to the current setting */ | 3633 | /* sync max_active to the current setting */ |
@@ -3832,7 +3764,7 @@ int apply_workqueue_attrs(struct workqueue_struct *wq, | |||
3832 | if (WARN_ON((wq->flags & __WQ_ORDERED) && !list_empty(&wq->pwqs))) | 3764 | if (WARN_ON((wq->flags & __WQ_ORDERED) && !list_empty(&wq->pwqs))) |
3833 | return -EINVAL; | 3765 | return -EINVAL; |
3834 | 3766 | ||
3835 | pwq_tbl = kzalloc(wq_numa_tbl_len * sizeof(pwq_tbl[0]), GFP_KERNEL); | 3767 | pwq_tbl = kzalloc(nr_node_ids * sizeof(pwq_tbl[0]), GFP_KERNEL); |
3836 | new_attrs = alloc_workqueue_attrs(GFP_KERNEL); | 3768 | new_attrs = alloc_workqueue_attrs(GFP_KERNEL); |
3837 | tmp_attrs = alloc_workqueue_attrs(GFP_KERNEL); | 3769 | tmp_attrs = alloc_workqueue_attrs(GFP_KERNEL); |
3838 | if (!pwq_tbl || !new_attrs || !tmp_attrs) | 3770 | if (!pwq_tbl || !new_attrs || !tmp_attrs) |
@@ -4080,7 +4012,7 @@ struct workqueue_struct *__alloc_workqueue_key(const char *fmt, | |||
4080 | 4012 | ||
4081 | /* allocate wq and format name */ | 4013 | /* allocate wq and format name */ |
4082 | if (flags & WQ_UNBOUND) | 4014 | if (flags & WQ_UNBOUND) |
4083 | tbl_size = wq_numa_tbl_len * sizeof(wq->numa_pwq_tbl[0]); | 4015 | tbl_size = nr_node_ids * sizeof(wq->numa_pwq_tbl[0]); |
4084 | 4016 | ||
4085 | wq = kzalloc(sizeof(*wq) + tbl_size, GFP_KERNEL); | 4017 | wq = kzalloc(sizeof(*wq) + tbl_size, GFP_KERNEL); |
4086 | if (!wq) | 4018 | if (!wq) |
@@ -4122,7 +4054,7 @@ struct workqueue_struct *__alloc_workqueue_key(const char *fmt, | |||
4122 | if (flags & WQ_MEM_RECLAIM) { | 4054 | if (flags & WQ_MEM_RECLAIM) { |
4123 | struct worker *rescuer; | 4055 | struct worker *rescuer; |
4124 | 4056 | ||
4125 | rescuer = alloc_worker(); | 4057 | rescuer = alloc_worker(NUMA_NO_NODE); |
4126 | if (!rescuer) | 4058 | if (!rescuer) |
4127 | goto err_destroy; | 4059 | goto err_destroy; |
4128 | 4060 | ||
@@ -4470,8 +4402,6 @@ static void wq_unbind_fn(struct work_struct *work) | |||
4470 | struct worker *worker; | 4402 | struct worker *worker; |
4471 | 4403 | ||
4472 | for_each_cpu_worker_pool(pool, cpu) { | 4404 | for_each_cpu_worker_pool(pool, cpu) { |
4473 | WARN_ON_ONCE(cpu != smp_processor_id()); | ||
4474 | |||
4475 | mutex_lock(&pool->attach_mutex); | 4405 | mutex_lock(&pool->attach_mutex); |
4476 | spin_lock_irq(&pool->lock); | 4406 | spin_lock_irq(&pool->lock); |
4477 | 4407 | ||
@@ -4543,6 +4473,7 @@ static void rebind_workers(struct worker_pool *pool) | |||
4543 | pool->attrs->cpumask) < 0); | 4473 | pool->attrs->cpumask) < 0); |
4544 | 4474 | ||
4545 | spin_lock_irq(&pool->lock); | 4475 | spin_lock_irq(&pool->lock); |
4476 | pool->flags &= ~POOL_DISASSOCIATED; | ||
4546 | 4477 | ||
4547 | for_each_pool_worker(worker, pool) { | 4478 | for_each_pool_worker(worker, pool) { |
4548 | unsigned int worker_flags = worker->flags; | 4479 | unsigned int worker_flags = worker->flags; |
@@ -4632,7 +4563,7 @@ static int workqueue_cpu_up_callback(struct notifier_block *nfb, | |||
4632 | for_each_cpu_worker_pool(pool, cpu) { | 4563 | for_each_cpu_worker_pool(pool, cpu) { |
4633 | if (pool->nr_workers) | 4564 | if (pool->nr_workers) |
4634 | continue; | 4565 | continue; |
4635 | if (create_and_start_worker(pool) < 0) | 4566 | if (!create_worker(pool)) |
4636 | return NOTIFY_BAD; | 4567 | return NOTIFY_BAD; |
4637 | } | 4568 | } |
4638 | break; | 4569 | break; |
@@ -4644,15 +4575,10 @@ static int workqueue_cpu_up_callback(struct notifier_block *nfb, | |||
4644 | for_each_pool(pool, pi) { | 4575 | for_each_pool(pool, pi) { |
4645 | mutex_lock(&pool->attach_mutex); | 4576 | mutex_lock(&pool->attach_mutex); |
4646 | 4577 | ||
4647 | if (pool->cpu == cpu) { | 4578 | if (pool->cpu == cpu) |
4648 | spin_lock_irq(&pool->lock); | ||
4649 | pool->flags &= ~POOL_DISASSOCIATED; | ||
4650 | spin_unlock_irq(&pool->lock); | ||
4651 | |||
4652 | rebind_workers(pool); | 4579 | rebind_workers(pool); |
4653 | } else if (pool->cpu < 0) { | 4580 | else if (pool->cpu < 0) |
4654 | restore_unbound_workers_cpumask(pool, cpu); | 4581 | restore_unbound_workers_cpumask(pool, cpu); |
4655 | } | ||
4656 | 4582 | ||
4657 | mutex_unlock(&pool->attach_mutex); | 4583 | mutex_unlock(&pool->attach_mutex); |
4658 | } | 4584 | } |
@@ -4856,10 +4782,6 @@ static void __init wq_numa_init(void) | |||
4856 | cpumask_var_t *tbl; | 4782 | cpumask_var_t *tbl; |
4857 | int node, cpu; | 4783 | int node, cpu; |
4858 | 4784 | ||
4859 | /* determine NUMA pwq table len - highest node id + 1 */ | ||
4860 | for_each_node(node) | ||
4861 | wq_numa_tbl_len = max(wq_numa_tbl_len, node + 1); | ||
4862 | |||
4863 | if (num_possible_nodes() <= 1) | 4785 | if (num_possible_nodes() <= 1) |
4864 | return; | 4786 | return; |
4865 | 4787 | ||
@@ -4876,7 +4798,7 @@ static void __init wq_numa_init(void) | |||
4876 | * available. Build one from cpu_to_node() which should have been | 4798 | * available. Build one from cpu_to_node() which should have been |
4877 | * fully initialized by now. | 4799 | * fully initialized by now. |
4878 | */ | 4800 | */ |
4879 | tbl = kzalloc(wq_numa_tbl_len * sizeof(tbl[0]), GFP_KERNEL); | 4801 | tbl = kzalloc(nr_node_ids * sizeof(tbl[0]), GFP_KERNEL); |
4880 | BUG_ON(!tbl); | 4802 | BUG_ON(!tbl); |
4881 | 4803 | ||
4882 | for_each_node(node) | 4804 | for_each_node(node) |
@@ -4936,7 +4858,7 @@ static int __init init_workqueues(void) | |||
4936 | 4858 | ||
4937 | for_each_cpu_worker_pool(pool, cpu) { | 4859 | for_each_cpu_worker_pool(pool, cpu) { |
4938 | pool->flags &= ~POOL_DISASSOCIATED; | 4860 | pool->flags &= ~POOL_DISASSOCIATED; |
4939 | BUG_ON(create_and_start_worker(pool) < 0); | 4861 | BUG_ON(!create_worker(pool)); |
4940 | } | 4862 | } |
4941 | } | 4863 | } |
4942 | 4864 | ||