aboutsummaryrefslogtreecommitdiffstats
path: root/kernel
diff options
context:
space:
mode:
authorIngo Molnar <mingo@kernel.org>2018-04-05 03:20:34 -0400
committerIngo Molnar <mingo@kernel.org>2018-04-05 03:20:34 -0400
commitea2a6af517714c52a1209795a03e863e96b460bb (patch)
tree3bd443bc9b23ceeaf3743eaf2d6d35ec63c620c9 /kernel
parent1b5d43cfb69759d8ef8d30469cea31d0c037aed5 (diff)
parent642e7fd23353e22290e3d51719fcb658dc252342 (diff)
Merge branch 'linus' into sched/urgent, to pick up fixes and updates
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Diffstat (limited to 'kernel')
-rw-r--r--kernel/compat.c55
-rw-r--r--kernel/cpu.c60
-rw-r--r--kernel/events/core.c779
-rw-r--r--kernel/events/hw_breakpoint.c124
-rw-r--r--kernel/exit.c2
-rw-r--r--kernel/fork.c11
-rw-r--r--kernel/kexec.c52
-rw-r--r--kernel/locking/lockdep.c26
-rw-r--r--kernel/locking/rtmutex.c3
-rw-r--r--kernel/locking/rtmutex_common.h11
-rw-r--r--kernel/locking/rwsem.c4
-rw-r--r--kernel/locking/rwsem.h8
-rw-r--r--kernel/module.c4
-rw-r--r--kernel/panic.c2
-rw-r--r--kernel/pid_namespace.c6
-rw-r--r--kernel/power/hibernate.c2
-rw-r--r--kernel/power/suspend.c2
-rw-r--r--kernel/power/user.c2
-rw-r--r--kernel/rcu/rcu.h38
-rw-r--r--kernel/rcu/rcuperf.c21
-rw-r--r--kernel/rcu/rcutorture.c72
-rw-r--r--kernel/rcu/srcutree.c29
-rw-r--r--kernel/rcu/tree.c72
-rw-r--r--kernel/rcu/tree.h36
-rw-r--r--kernel/rcu/tree_exp.h36
-rw-r--r--kernel/rcu/tree_plugin.h34
-rw-r--r--kernel/sched/Makefile5
-rw-r--r--kernel/sched/autogroup.c21
-rw-r--r--kernel/sched/autogroup.h12
-rw-r--r--kernel/sched/clock.c36
-rw-r--r--kernel/sched/completion.c11
-rw-r--r--kernel/sched/core.c194
-rw-r--r--kernel/sched/cpuacct.c33
-rw-r--r--kernel/sched/cpudeadline.c23
-rw-r--r--kernel/sched/cpudeadline.h29
-rw-r--r--kernel/sched/cpufreq.c1
-rw-r--r--kernel/sched/cpufreq_schedutil.c221
-rw-r--r--kernel/sched/cpupri.c15
-rw-r--r--kernel/sched/cpupri.h25
-rw-r--r--kernel/sched/cputime.c58
-rw-r--r--kernel/sched/deadline.c82
-rw-r--r--kernel/sched/debug.c103
-rw-r--r--kernel/sched/fair.c1415
-rw-r--r--kernel/sched/features.h5
-rw-r--r--kernel/sched/idle.c142
-rw-r--r--kernel/sched/idle_task.c110
-rw-r--r--kernel/sched/isolation.c14
-rw-r--r--kernel/sched/loadavg.c34
-rw-r--r--kernel/sched/membarrier.c27
-rw-r--r--kernel/sched/rt.c60
-rw-r--r--kernel/sched/sched.h650
-rw-r--r--kernel/sched/stats.c20
-rw-r--r--kernel/sched/stats.h86
-rw-r--r--kernel/sched/stop_task.c11
-rw-r--r--kernel/sched/swait.c6
-rw-r--r--kernel/sched/topology.c46
-rw-r--r--kernel/sched/wait.c13
-rw-r--r--kernel/sched/wait_bit.c127
-rw-r--r--kernel/signal.c29
-rw-r--r--kernel/sys.c74
-rw-r--r--kernel/sys_ni.c617
-rw-r--r--kernel/time/Kconfig10
-rw-r--r--kernel/time/tick-sched.c44
-rw-r--r--kernel/trace/trace_event_perf.c102
-rw-r--r--kernel/trace/trace_kprobe.c91
-rw-r--r--kernel/trace/trace_probe.h11
-rw-r--r--kernel/trace/trace_uprobe.c86
-rw-r--r--kernel/uid16.c25
-rw-r--r--kernel/uid16.h14
-rw-r--r--kernel/umh.c4
-rw-r--r--kernel/workqueue.c3
71 files changed, 3840 insertions, 2396 deletions
diff --git a/kernel/compat.c b/kernel/compat.c
index 3f5fa8902e7d..6d21894806b4 100644
--- a/kernel/compat.c
+++ b/kernel/compat.c
@@ -488,61 +488,6 @@ get_compat_sigset(sigset_t *set, const compat_sigset_t __user *compat)
488} 488}
489EXPORT_SYMBOL_GPL(get_compat_sigset); 489EXPORT_SYMBOL_GPL(get_compat_sigset);
490 490
491#ifdef CONFIG_NUMA
492COMPAT_SYSCALL_DEFINE6(move_pages, pid_t, pid, compat_ulong_t, nr_pages,
493 compat_uptr_t __user *, pages32,
494 const int __user *, nodes,
495 int __user *, status,
496 int, flags)
497{
498 const void __user * __user *pages;
499 int i;
500
501 pages = compat_alloc_user_space(nr_pages * sizeof(void *));
502 for (i = 0; i < nr_pages; i++) {
503 compat_uptr_t p;
504
505 if (get_user(p, pages32 + i) ||
506 put_user(compat_ptr(p), pages + i))
507 return -EFAULT;
508 }
509 return sys_move_pages(pid, nr_pages, pages, nodes, status, flags);
510}
511
512COMPAT_SYSCALL_DEFINE4(migrate_pages, compat_pid_t, pid,
513 compat_ulong_t, maxnode,
514 const compat_ulong_t __user *, old_nodes,
515 const compat_ulong_t __user *, new_nodes)
516{
517 unsigned long __user *old = NULL;
518 unsigned long __user *new = NULL;
519 nodemask_t tmp_mask;
520 unsigned long nr_bits;
521 unsigned long size;
522
523 nr_bits = min_t(unsigned long, maxnode - 1, MAX_NUMNODES);
524 size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
525 if (old_nodes) {
526 if (compat_get_bitmap(nodes_addr(tmp_mask), old_nodes, nr_bits))
527 return -EFAULT;
528 old = compat_alloc_user_space(new_nodes ? size * 2 : size);
529 if (new_nodes)
530 new = old + size / sizeof(unsigned long);
531 if (copy_to_user(old, nodes_addr(tmp_mask), size))
532 return -EFAULT;
533 }
534 if (new_nodes) {
535 if (compat_get_bitmap(nodes_addr(tmp_mask), new_nodes, nr_bits))
536 return -EFAULT;
537 if (new == NULL)
538 new = compat_alloc_user_space(size);
539 if (copy_to_user(new, nodes_addr(tmp_mask), size))
540 return -EFAULT;
541 }
542 return sys_migrate_pages(pid, nr_bits + 1, old, new);
543}
544#endif
545
546/* 491/*
547 * Allocate user-space memory for the duration of a single system call, 492 * Allocate user-space memory for the duration of a single system call,
548 * in order to marshall parameters inside a compat thunk. 493 * in order to marshall parameters inside a compat thunk.
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 53f7dc65f9a3..0db8938fbb23 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -124,24 +124,11 @@ struct cpuhp_step {
124}; 124};
125 125
126static DEFINE_MUTEX(cpuhp_state_mutex); 126static DEFINE_MUTEX(cpuhp_state_mutex);
127static struct cpuhp_step cpuhp_bp_states[]; 127static struct cpuhp_step cpuhp_hp_states[];
128static struct cpuhp_step cpuhp_ap_states[];
129
130static bool cpuhp_is_ap_state(enum cpuhp_state state)
131{
132 /*
133 * The extra check for CPUHP_TEARDOWN_CPU is only for documentation
134 * purposes as that state is handled explicitly in cpu_down.
135 */
136 return state > CPUHP_BRINGUP_CPU && state != CPUHP_TEARDOWN_CPU;
137}
138 128
139static struct cpuhp_step *cpuhp_get_step(enum cpuhp_state state) 129static struct cpuhp_step *cpuhp_get_step(enum cpuhp_state state)
140{ 130{
141 struct cpuhp_step *sp; 131 return cpuhp_hp_states + state;
142
143 sp = cpuhp_is_ap_state(state) ? cpuhp_ap_states : cpuhp_bp_states;
144 return sp + state;
145} 132}
146 133
147/** 134/**
@@ -239,6 +226,15 @@ err:
239} 226}
240 227
241#ifdef CONFIG_SMP 228#ifdef CONFIG_SMP
229static bool cpuhp_is_ap_state(enum cpuhp_state state)
230{
231 /*
232 * The extra check for CPUHP_TEARDOWN_CPU is only for documentation
233 * purposes as that state is handled explicitly in cpu_down.
234 */
235 return state > CPUHP_BRINGUP_CPU && state != CPUHP_TEARDOWN_CPU;
236}
237
242static inline void wait_for_ap_thread(struct cpuhp_cpu_state *st, bool bringup) 238static inline void wait_for_ap_thread(struct cpuhp_cpu_state *st, bool bringup)
243{ 239{
244 struct completion *done = bringup ? &st->done_up : &st->done_down; 240 struct completion *done = bringup ? &st->done_up : &st->done_down;
@@ -1224,7 +1220,7 @@ int __boot_cpu_id;
1224#endif /* CONFIG_SMP */ 1220#endif /* CONFIG_SMP */
1225 1221
1226/* Boot processor state steps */ 1222/* Boot processor state steps */
1227static struct cpuhp_step cpuhp_bp_states[] = { 1223static struct cpuhp_step cpuhp_hp_states[] = {
1228 [CPUHP_OFFLINE] = { 1224 [CPUHP_OFFLINE] = {
1229 .name = "offline", 1225 .name = "offline",
1230 .startup.single = NULL, 1226 .startup.single = NULL,
@@ -1289,24 +1285,6 @@ static struct cpuhp_step cpuhp_bp_states[] = {
1289 .teardown.single = NULL, 1285 .teardown.single = NULL,
1290 .cant_stop = true, 1286 .cant_stop = true,
1291 }, 1287 },
1292 /*
1293 * Handled on controll processor until the plugged processor manages
1294 * this itself.
1295 */
1296 [CPUHP_TEARDOWN_CPU] = {
1297 .name = "cpu:teardown",
1298 .startup.single = NULL,
1299 .teardown.single = takedown_cpu,
1300 .cant_stop = true,
1301 },
1302#else
1303 [CPUHP_BRINGUP_CPU] = { },
1304#endif
1305};
1306
1307/* Application processor state steps */
1308static struct cpuhp_step cpuhp_ap_states[] = {
1309#ifdef CONFIG_SMP
1310 /* Final state before CPU kills itself */ 1288 /* Final state before CPU kills itself */
1311 [CPUHP_AP_IDLE_DEAD] = { 1289 [CPUHP_AP_IDLE_DEAD] = {
1312 .name = "idle:dead", 1290 .name = "idle:dead",
@@ -1340,6 +1318,16 @@ static struct cpuhp_step cpuhp_ap_states[] = {
1340 [CPUHP_AP_ONLINE] = { 1318 [CPUHP_AP_ONLINE] = {
1341 .name = "ap:online", 1319 .name = "ap:online",
1342 }, 1320 },
1321 /*
1322 * Handled on controll processor until the plugged processor manages
1323 * this itself.
1324 */
1325 [CPUHP_TEARDOWN_CPU] = {
1326 .name = "cpu:teardown",
1327 .startup.single = NULL,
1328 .teardown.single = takedown_cpu,
1329 .cant_stop = true,
1330 },
1343 /* Handle smpboot threads park/unpark */ 1331 /* Handle smpboot threads park/unpark */
1344 [CPUHP_AP_SMPBOOT_THREADS] = { 1332 [CPUHP_AP_SMPBOOT_THREADS] = {
1345 .name = "smpboot/threads:online", 1333 .name = "smpboot/threads:online",
@@ -1408,11 +1396,11 @@ static int cpuhp_reserve_state(enum cpuhp_state state)
1408 1396
1409 switch (state) { 1397 switch (state) {
1410 case CPUHP_AP_ONLINE_DYN: 1398 case CPUHP_AP_ONLINE_DYN:
1411 step = cpuhp_ap_states + CPUHP_AP_ONLINE_DYN; 1399 step = cpuhp_hp_states + CPUHP_AP_ONLINE_DYN;
1412 end = CPUHP_AP_ONLINE_DYN_END; 1400 end = CPUHP_AP_ONLINE_DYN_END;
1413 break; 1401 break;
1414 case CPUHP_BP_PREPARE_DYN: 1402 case CPUHP_BP_PREPARE_DYN:
1415 step = cpuhp_bp_states + CPUHP_BP_PREPARE_DYN; 1403 step = cpuhp_hp_states + CPUHP_BP_PREPARE_DYN;
1416 end = CPUHP_BP_PREPARE_DYN_END; 1404 end = CPUHP_BP_PREPARE_DYN_END;
1417 break; 1405 break;
1418 default: 1406 default:
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 709a55b9ad97..fc1c330c6bd6 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -430,7 +430,7 @@ static void update_perf_cpu_limits(void)
430 WRITE_ONCE(perf_sample_allowed_ns, tmp); 430 WRITE_ONCE(perf_sample_allowed_ns, tmp);
431} 431}
432 432
433static int perf_rotate_context(struct perf_cpu_context *cpuctx); 433static bool perf_rotate_context(struct perf_cpu_context *cpuctx);
434 434
435int perf_proc_update_handler(struct ctl_table *table, int write, 435int perf_proc_update_handler(struct ctl_table *table, int write,
436 void __user *buffer, size_t *lenp, 436 void __user *buffer, size_t *lenp,
@@ -643,7 +643,7 @@ static void perf_event_update_sibling_time(struct perf_event *leader)
643{ 643{
644 struct perf_event *sibling; 644 struct perf_event *sibling;
645 645
646 list_for_each_entry(sibling, &leader->sibling_list, group_entry) 646 for_each_sibling_event(sibling, leader)
647 perf_event_update_time(sibling); 647 perf_event_update_time(sibling);
648} 648}
649 649
@@ -948,27 +948,39 @@ list_update_cgroup_event(struct perf_event *event,
948 if (!is_cgroup_event(event)) 948 if (!is_cgroup_event(event))
949 return; 949 return;
950 950
951 if (add && ctx->nr_cgroups++)
952 return;
953 else if (!add && --ctx->nr_cgroups)
954 return;
955 /* 951 /*
956 * Because cgroup events are always per-cpu events, 952 * Because cgroup events are always per-cpu events,
957 * this will always be called from the right CPU. 953 * this will always be called from the right CPU.
958 */ 954 */
959 cpuctx = __get_cpu_context(ctx); 955 cpuctx = __get_cpu_context(ctx);
960 cpuctx_entry = &cpuctx->cgrp_cpuctx_entry; 956
961 /* cpuctx->cgrp is NULL unless a cgroup event is active in this CPU .*/ 957 /*
962 if (add) { 958 * Since setting cpuctx->cgrp is conditional on the current @cgrp
959 * matching the event's cgroup, we must do this for every new event,
960 * because if the first would mismatch, the second would not try again
961 * and we would leave cpuctx->cgrp unset.
962 */
963 if (add && !cpuctx->cgrp) {
963 struct perf_cgroup *cgrp = perf_cgroup_from_task(current, ctx); 964 struct perf_cgroup *cgrp = perf_cgroup_from_task(current, ctx);
964 965
965 list_add(cpuctx_entry, this_cpu_ptr(&cgrp_cpuctx_list));
966 if (cgroup_is_descendant(cgrp->css.cgroup, event->cgrp->css.cgroup)) 966 if (cgroup_is_descendant(cgrp->css.cgroup, event->cgrp->css.cgroup))
967 cpuctx->cgrp = cgrp; 967 cpuctx->cgrp = cgrp;
968 } else {
969 list_del(cpuctx_entry);
970 cpuctx->cgrp = NULL;
971 } 968 }
969
970 if (add && ctx->nr_cgroups++)
971 return;
972 else if (!add && --ctx->nr_cgroups)
973 return;
974
975 /* no cgroup running */
976 if (!add)
977 cpuctx->cgrp = NULL;
978
979 cpuctx_entry = &cpuctx->cgrp_cpuctx_entry;
980 if (add)
981 list_add(cpuctx_entry, this_cpu_ptr(&cgrp_cpuctx_list));
982 else
983 list_del(cpuctx_entry);
972} 984}
973 985
974#else /* !CONFIG_CGROUP_PERF */ 986#else /* !CONFIG_CGROUP_PERF */
@@ -1052,7 +1064,7 @@ list_update_cgroup_event(struct perf_event *event,
1052static enum hrtimer_restart perf_mux_hrtimer_handler(struct hrtimer *hr) 1064static enum hrtimer_restart perf_mux_hrtimer_handler(struct hrtimer *hr)
1053{ 1065{
1054 struct perf_cpu_context *cpuctx; 1066 struct perf_cpu_context *cpuctx;
1055 int rotations = 0; 1067 bool rotations;
1056 1068
1057 lockdep_assert_irqs_disabled(); 1069 lockdep_assert_irqs_disabled();
1058 1070
@@ -1471,8 +1483,21 @@ static enum event_type_t get_event_type(struct perf_event *event)
1471 return event_type; 1483 return event_type;
1472} 1484}
1473 1485
1474static struct list_head * 1486/*
1475ctx_group_list(struct perf_event *event, struct perf_event_context *ctx) 1487 * Helper function to initialize event group nodes.
1488 */
1489static void init_event_group(struct perf_event *event)
1490{
1491 RB_CLEAR_NODE(&event->group_node);
1492 event->group_index = 0;
1493}
1494
1495/*
1496 * Extract pinned or flexible groups from the context
1497 * based on event attrs bits.
1498 */
1499static struct perf_event_groups *
1500get_event_groups(struct perf_event *event, struct perf_event_context *ctx)
1476{ 1501{
1477 if (event->attr.pinned) 1502 if (event->attr.pinned)
1478 return &ctx->pinned_groups; 1503 return &ctx->pinned_groups;
@@ -1481,6 +1506,156 @@ ctx_group_list(struct perf_event *event, struct perf_event_context *ctx)
1481} 1506}
1482 1507
1483/* 1508/*
1509 * Helper function to initializes perf_event_group trees.
1510 */
1511static void perf_event_groups_init(struct perf_event_groups *groups)
1512{
1513 groups->tree = RB_ROOT;
1514 groups->index = 0;
1515}
1516
1517/*
1518 * Compare function for event groups;
1519 *
1520 * Implements complex key that first sorts by CPU and then by virtual index
1521 * which provides ordering when rotating groups for the same CPU.
1522 */
1523static bool
1524perf_event_groups_less(struct perf_event *left, struct perf_event *right)
1525{
1526 if (left->cpu < right->cpu)
1527 return true;
1528 if (left->cpu > right->cpu)
1529 return false;
1530
1531 if (left->group_index < right->group_index)
1532 return true;
1533 if (left->group_index > right->group_index)
1534 return false;
1535
1536 return false;
1537}
1538
1539/*
1540 * Insert @event into @groups' tree; using {@event->cpu, ++@groups->index} for
1541 * key (see perf_event_groups_less). This places it last inside the CPU
1542 * subtree.
1543 */
1544static void
1545perf_event_groups_insert(struct perf_event_groups *groups,
1546 struct perf_event *event)
1547{
1548 struct perf_event *node_event;
1549 struct rb_node *parent;
1550 struct rb_node **node;
1551
1552 event->group_index = ++groups->index;
1553
1554 node = &groups->tree.rb_node;
1555 parent = *node;
1556
1557 while (*node) {
1558 parent = *node;
1559 node_event = container_of(*node, struct perf_event, group_node);
1560
1561 if (perf_event_groups_less(event, node_event))
1562 node = &parent->rb_left;
1563 else
1564 node = &parent->rb_right;
1565 }
1566
1567 rb_link_node(&event->group_node, parent, node);
1568 rb_insert_color(&event->group_node, &groups->tree);
1569}
1570
1571/*
1572 * Helper function to insert event into the pinned or flexible groups.
1573 */
1574static void
1575add_event_to_groups(struct perf_event *event, struct perf_event_context *ctx)
1576{
1577 struct perf_event_groups *groups;
1578
1579 groups = get_event_groups(event, ctx);
1580 perf_event_groups_insert(groups, event);
1581}
1582
1583/*
1584 * Delete a group from a tree.
1585 */
1586static void
1587perf_event_groups_delete(struct perf_event_groups *groups,
1588 struct perf_event *event)
1589{
1590 WARN_ON_ONCE(RB_EMPTY_NODE(&event->group_node) ||
1591 RB_EMPTY_ROOT(&groups->tree));
1592
1593 rb_erase(&event->group_node, &groups->tree);
1594 init_event_group(event);
1595}
1596
1597/*
1598 * Helper function to delete event from its groups.
1599 */
1600static void
1601del_event_from_groups(struct perf_event *event, struct perf_event_context *ctx)
1602{
1603 struct perf_event_groups *groups;
1604
1605 groups = get_event_groups(event, ctx);
1606 perf_event_groups_delete(groups, event);
1607}
1608
1609/*
1610 * Get the leftmost event in the @cpu subtree.
1611 */
1612static struct perf_event *
1613perf_event_groups_first(struct perf_event_groups *groups, int cpu)
1614{
1615 struct perf_event *node_event = NULL, *match = NULL;
1616 struct rb_node *node = groups->tree.rb_node;
1617
1618 while (node) {
1619 node_event = container_of(node, struct perf_event, group_node);
1620
1621 if (cpu < node_event->cpu) {
1622 node = node->rb_left;
1623 } else if (cpu > node_event->cpu) {
1624 node = node->rb_right;
1625 } else {
1626 match = node_event;
1627 node = node->rb_left;
1628 }
1629 }
1630
1631 return match;
1632}
1633
1634/*
1635 * Like rb_entry_next_safe() for the @cpu subtree.
1636 */
1637static struct perf_event *
1638perf_event_groups_next(struct perf_event *event)
1639{
1640 struct perf_event *next;
1641
1642 next = rb_entry_safe(rb_next(&event->group_node), typeof(*event), group_node);
1643 if (next && next->cpu == event->cpu)
1644 return next;
1645
1646 return NULL;
1647}
1648
1649/*
1650 * Iterate through the whole groups tree.
1651 */
1652#define perf_event_groups_for_each(event, groups) \
1653 for (event = rb_entry_safe(rb_first(&((groups)->tree)), \
1654 typeof(*event), group_node); event; \
1655 event = rb_entry_safe(rb_next(&event->group_node), \
1656 typeof(*event), group_node))
1657
1658/*
1484 * Add a event from the lists for its context. 1659 * Add a event from the lists for its context.
1485 * Must be called with ctx->mutex and ctx->lock held. 1660 * Must be called with ctx->mutex and ctx->lock held.
1486 */ 1661 */
@@ -1500,12 +1675,8 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx)
1500 * perf_group_detach can, at all times, locate all siblings. 1675 * perf_group_detach can, at all times, locate all siblings.
1501 */ 1676 */
1502 if (event->group_leader == event) { 1677 if (event->group_leader == event) {
1503 struct list_head *list;
1504
1505 event->group_caps = event->event_caps; 1678 event->group_caps = event->event_caps;
1506 1679 add_event_to_groups(event, ctx);
1507 list = ctx_group_list(event, ctx);
1508 list_add_tail(&event->group_entry, list);
1509 } 1680 }
1510 1681
1511 list_update_cgroup_event(event, ctx, true); 1682 list_update_cgroup_event(event, ctx, true);
@@ -1663,12 +1834,12 @@ static void perf_group_attach(struct perf_event *event)
1663 1834
1664 group_leader->group_caps &= event->event_caps; 1835 group_leader->group_caps &= event->event_caps;
1665 1836
1666 list_add_tail(&event->group_entry, &group_leader->sibling_list); 1837 list_add_tail(&event->sibling_list, &group_leader->sibling_list);
1667 group_leader->nr_siblings++; 1838 group_leader->nr_siblings++;
1668 1839
1669 perf_event__header_size(group_leader); 1840 perf_event__header_size(group_leader);
1670 1841
1671 list_for_each_entry(pos, &group_leader->sibling_list, group_entry) 1842 for_each_sibling_event(pos, group_leader)
1672 perf_event__header_size(pos); 1843 perf_event__header_size(pos);
1673} 1844}
1674 1845
@@ -1699,7 +1870,7 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx)
1699 list_del_rcu(&event->event_entry); 1870 list_del_rcu(&event->event_entry);
1700 1871
1701 if (event->group_leader == event) 1872 if (event->group_leader == event)
1702 list_del_init(&event->group_entry); 1873 del_event_from_groups(event, ctx);
1703 1874
1704 /* 1875 /*
1705 * If event was in error state, then keep it 1876 * If event was in error state, then keep it
@@ -1717,9 +1888,9 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx)
1717static void perf_group_detach(struct perf_event *event) 1888static void perf_group_detach(struct perf_event *event)
1718{ 1889{
1719 struct perf_event *sibling, *tmp; 1890 struct perf_event *sibling, *tmp;
1720 struct list_head *list = NULL; 1891 struct perf_event_context *ctx = event->ctx;
1721 1892
1722 lockdep_assert_held(&event->ctx->lock); 1893 lockdep_assert_held(&ctx->lock);
1723 1894
1724 /* 1895 /*
1725 * We can have double detach due to exit/hot-unplug + close. 1896 * We can have double detach due to exit/hot-unplug + close.
@@ -1733,34 +1904,42 @@ static void perf_group_detach(struct perf_event *event)
1733 * If this is a sibling, remove it from its group. 1904 * If this is a sibling, remove it from its group.
1734 */ 1905 */
1735 if (event->group_leader != event) { 1906 if (event->group_leader != event) {
1736 list_del_init(&event->group_entry); 1907 list_del_init(&event->sibling_list);
1737 event->group_leader->nr_siblings--; 1908 event->group_leader->nr_siblings--;
1738 goto out; 1909 goto out;
1739 } 1910 }
1740 1911
1741 if (!list_empty(&event->group_entry))
1742 list = &event->group_entry;
1743
1744 /* 1912 /*
1745 * If this was a group event with sibling events then 1913 * If this was a group event with sibling events then
1746 * upgrade the siblings to singleton events by adding them 1914 * upgrade the siblings to singleton events by adding them
1747 * to whatever list we are on. 1915 * to whatever list we are on.
1748 */ 1916 */
1749 list_for_each_entry_safe(sibling, tmp, &event->sibling_list, group_entry) { 1917 list_for_each_entry_safe(sibling, tmp, &event->sibling_list, sibling_list) {
1750 if (list) 1918
1751 list_move_tail(&sibling->group_entry, list);
1752 sibling->group_leader = sibling; 1919 sibling->group_leader = sibling;
1920 list_del_init(&sibling->sibling_list);
1753 1921
1754 /* Inherit group flags from the previous leader */ 1922 /* Inherit group flags from the previous leader */
1755 sibling->group_caps = event->group_caps; 1923 sibling->group_caps = event->group_caps;
1756 1924
1925 if (!RB_EMPTY_NODE(&event->group_node)) {
1926 add_event_to_groups(sibling, event->ctx);
1927
1928 if (sibling->state == PERF_EVENT_STATE_ACTIVE) {
1929 struct list_head *list = sibling->attr.pinned ?
1930 &ctx->pinned_active : &ctx->flexible_active;
1931
1932 list_add_tail(&sibling->active_list, list);
1933 }
1934 }
1935
1757 WARN_ON_ONCE(sibling->ctx != event->ctx); 1936 WARN_ON_ONCE(sibling->ctx != event->ctx);
1758 } 1937 }
1759 1938
1760out: 1939out:
1761 perf_event__header_size(event->group_leader); 1940 perf_event__header_size(event->group_leader);
1762 1941
1763 list_for_each_entry(tmp, &event->group_leader->sibling_list, group_entry) 1942 for_each_sibling_event(tmp, event->group_leader)
1764 perf_event__header_size(tmp); 1943 perf_event__header_size(tmp);
1765} 1944}
1766 1945
@@ -1783,13 +1962,13 @@ static inline int __pmu_filter_match(struct perf_event *event)
1783 */ 1962 */
1784static inline int pmu_filter_match(struct perf_event *event) 1963static inline int pmu_filter_match(struct perf_event *event)
1785{ 1964{
1786 struct perf_event *child; 1965 struct perf_event *sibling;
1787 1966
1788 if (!__pmu_filter_match(event)) 1967 if (!__pmu_filter_match(event))
1789 return 0; 1968 return 0;
1790 1969
1791 list_for_each_entry(child, &event->sibling_list, group_entry) { 1970 for_each_sibling_event(sibling, event) {
1792 if (!__pmu_filter_match(child)) 1971 if (!__pmu_filter_match(sibling))
1793 return 0; 1972 return 0;
1794 } 1973 }
1795 1974
@@ -1816,6 +1995,13 @@ event_sched_out(struct perf_event *event,
1816 if (event->state != PERF_EVENT_STATE_ACTIVE) 1995 if (event->state != PERF_EVENT_STATE_ACTIVE)
1817 return; 1996 return;
1818 1997
1998 /*
1999 * Asymmetry; we only schedule events _IN_ through ctx_sched_in(), but
2000 * we can schedule events _OUT_ individually through things like
2001 * __perf_remove_from_context().
2002 */
2003 list_del_init(&event->active_list);
2004
1819 perf_pmu_disable(event->pmu); 2005 perf_pmu_disable(event->pmu);
1820 2006
1821 event->pmu->del(event, 0); 2007 event->pmu->del(event, 0);
@@ -1856,7 +2042,7 @@ group_sched_out(struct perf_event *group_event,
1856 /* 2042 /*
1857 * Schedule out siblings (if any): 2043 * Schedule out siblings (if any):
1858 */ 2044 */
1859 list_for_each_entry(event, &group_event->sibling_list, group_entry) 2045 for_each_sibling_event(event, group_event)
1860 event_sched_out(event, cpuctx, ctx); 2046 event_sched_out(event, cpuctx, ctx);
1861 2047
1862 perf_pmu_enable(ctx->pmu); 2048 perf_pmu_enable(ctx->pmu);
@@ -2135,7 +2321,7 @@ group_sched_in(struct perf_event *group_event,
2135 /* 2321 /*
2136 * Schedule in siblings as one group (if any): 2322 * Schedule in siblings as one group (if any):
2137 */ 2323 */
2138 list_for_each_entry(event, &group_event->sibling_list, group_entry) { 2324 for_each_sibling_event(event, group_event) {
2139 if (event_sched_in(event, cpuctx, ctx)) { 2325 if (event_sched_in(event, cpuctx, ctx)) {
2140 partial_group = event; 2326 partial_group = event;
2141 goto group_error; 2327 goto group_error;
@@ -2151,7 +2337,7 @@ group_error:
2151 * partial group before returning: 2337 * partial group before returning:
2152 * The events up to the failed event are scheduled out normally. 2338 * The events up to the failed event are scheduled out normally.
2153 */ 2339 */
2154 list_for_each_entry(event, &group_event->sibling_list, group_entry) { 2340 for_each_sibling_event(event, group_event) {
2155 if (event == partial_group) 2341 if (event == partial_group)
2156 break; 2342 break;
2157 2343
@@ -2328,6 +2514,18 @@ static int __perf_install_in_context(void *info)
2328 raw_spin_lock(&task_ctx->lock); 2514 raw_spin_lock(&task_ctx->lock);
2329 } 2515 }
2330 2516
2517#ifdef CONFIG_CGROUP_PERF
2518 if (is_cgroup_event(event)) {
2519 /*
2520 * If the current cgroup doesn't match the event's
2521 * cgroup, we should not try to schedule it.
2522 */
2523 struct perf_cgroup *cgrp = perf_cgroup_from_task(current, ctx);
2524 reprogram = cgroup_is_descendant(cgrp->css.cgroup,
2525 event->cgrp->css.cgroup);
2526 }
2527#endif
2528
2331 if (reprogram) { 2529 if (reprogram) {
2332 ctx_sched_out(ctx, cpuctx, EVENT_TIME); 2530 ctx_sched_out(ctx, cpuctx, EVENT_TIME);
2333 add_event_to_ctx(event, ctx); 2531 add_event_to_ctx(event, ctx);
@@ -2661,12 +2859,47 @@ int perf_event_refresh(struct perf_event *event, int refresh)
2661} 2859}
2662EXPORT_SYMBOL_GPL(perf_event_refresh); 2860EXPORT_SYMBOL_GPL(perf_event_refresh);
2663 2861
2862static int perf_event_modify_breakpoint(struct perf_event *bp,
2863 struct perf_event_attr *attr)
2864{
2865 int err;
2866
2867 _perf_event_disable(bp);
2868
2869 err = modify_user_hw_breakpoint_check(bp, attr, true);
2870 if (err) {
2871 if (!bp->attr.disabled)
2872 _perf_event_enable(bp);
2873
2874 return err;
2875 }
2876
2877 if (!attr->disabled)
2878 _perf_event_enable(bp);
2879 return 0;
2880}
2881
2882static int perf_event_modify_attr(struct perf_event *event,
2883 struct perf_event_attr *attr)
2884{
2885 if (event->attr.type != attr->type)
2886 return -EINVAL;
2887
2888 switch (event->attr.type) {
2889 case PERF_TYPE_BREAKPOINT:
2890 return perf_event_modify_breakpoint(event, attr);
2891 default:
2892 /* Place holder for future additions. */
2893 return -EOPNOTSUPP;
2894 }
2895}
2896
2664static void ctx_sched_out(struct perf_event_context *ctx, 2897static void ctx_sched_out(struct perf_event_context *ctx,
2665 struct perf_cpu_context *cpuctx, 2898 struct perf_cpu_context *cpuctx,
2666 enum event_type_t event_type) 2899 enum event_type_t event_type)
2667{ 2900{
2901 struct perf_event *event, *tmp;
2668 int is_active = ctx->is_active; 2902 int is_active = ctx->is_active;
2669 struct perf_event *event;
2670 2903
2671 lockdep_assert_held(&ctx->lock); 2904 lockdep_assert_held(&ctx->lock);
2672 2905
@@ -2713,12 +2946,12 @@ static void ctx_sched_out(struct perf_event_context *ctx,
2713 2946
2714 perf_pmu_disable(ctx->pmu); 2947 perf_pmu_disable(ctx->pmu);
2715 if (is_active & EVENT_PINNED) { 2948 if (is_active & EVENT_PINNED) {
2716 list_for_each_entry(event, &ctx->pinned_groups, group_entry) 2949 list_for_each_entry_safe(event, tmp, &ctx->pinned_active, active_list)
2717 group_sched_out(event, cpuctx, ctx); 2950 group_sched_out(event, cpuctx, ctx);
2718 } 2951 }
2719 2952
2720 if (is_active & EVENT_FLEXIBLE) { 2953 if (is_active & EVENT_FLEXIBLE) {
2721 list_for_each_entry(event, &ctx->flexible_groups, group_entry) 2954 list_for_each_entry_safe(event, tmp, &ctx->flexible_active, active_list)
2722 group_sched_out(event, cpuctx, ctx); 2955 group_sched_out(event, cpuctx, ctx);
2723 } 2956 }
2724 perf_pmu_enable(ctx->pmu); 2957 perf_pmu_enable(ctx->pmu);
@@ -3005,53 +3238,116 @@ static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx,
3005 ctx_sched_out(&cpuctx->ctx, cpuctx, event_type); 3238 ctx_sched_out(&cpuctx->ctx, cpuctx, event_type);
3006} 3239}
3007 3240
3008static void 3241static int visit_groups_merge(struct perf_event_groups *groups, int cpu,
3009ctx_pinned_sched_in(struct perf_event_context *ctx, 3242 int (*func)(struct perf_event *, void *), void *data)
3010 struct perf_cpu_context *cpuctx)
3011{ 3243{
3012 struct perf_event *event; 3244 struct perf_event **evt, *evt1, *evt2;
3245 int ret;
3013 3246
3014 list_for_each_entry(event, &ctx->pinned_groups, group_entry) { 3247 evt1 = perf_event_groups_first(groups, -1);
3015 if (event->state <= PERF_EVENT_STATE_OFF) 3248 evt2 = perf_event_groups_first(groups, cpu);
3016 continue; 3249
3017 if (!event_filter_match(event)) 3250 while (evt1 || evt2) {
3018 continue; 3251 if (evt1 && evt2) {
3252 if (evt1->group_index < evt2->group_index)
3253 evt = &evt1;
3254 else
3255 evt = &evt2;
3256 } else if (evt1) {
3257 evt = &evt1;
3258 } else {
3259 evt = &evt2;
3260 }
3019 3261
3020 if (group_can_go_on(event, cpuctx, 1)) 3262 ret = func(*evt, data);
3021 group_sched_in(event, cpuctx, ctx); 3263 if (ret)
3264 return ret;
3022 3265
3023 /* 3266 *evt = perf_event_groups_next(*evt);
3024 * If this pinned group hasn't been scheduled,
3025 * put it in error state.
3026 */
3027 if (event->state == PERF_EVENT_STATE_INACTIVE)
3028 perf_event_set_state(event, PERF_EVENT_STATE_ERROR);
3029 } 3267 }
3268
3269 return 0;
3270}
3271
3272struct sched_in_data {
3273 struct perf_event_context *ctx;
3274 struct perf_cpu_context *cpuctx;
3275 int can_add_hw;
3276};
3277
3278static int pinned_sched_in(struct perf_event *event, void *data)
3279{
3280 struct sched_in_data *sid = data;
3281
3282 if (event->state <= PERF_EVENT_STATE_OFF)
3283 return 0;
3284
3285 if (!event_filter_match(event))
3286 return 0;
3287
3288 if (group_can_go_on(event, sid->cpuctx, sid->can_add_hw)) {
3289 if (!group_sched_in(event, sid->cpuctx, sid->ctx))
3290 list_add_tail(&event->active_list, &sid->ctx->pinned_active);
3291 }
3292
3293 /*
3294 * If this pinned group hasn't been scheduled,
3295 * put it in error state.
3296 */
3297 if (event->state == PERF_EVENT_STATE_INACTIVE)
3298 perf_event_set_state(event, PERF_EVENT_STATE_ERROR);
3299
3300 return 0;
3301}
3302
3303static int flexible_sched_in(struct perf_event *event, void *data)
3304{
3305 struct sched_in_data *sid = data;
3306
3307 if (event->state <= PERF_EVENT_STATE_OFF)
3308 return 0;
3309
3310 if (!event_filter_match(event))
3311 return 0;
3312
3313 if (group_can_go_on(event, sid->cpuctx, sid->can_add_hw)) {
3314 if (!group_sched_in(event, sid->cpuctx, sid->ctx))
3315 list_add_tail(&event->active_list, &sid->ctx->flexible_active);
3316 else
3317 sid->can_add_hw = 0;
3318 }
3319
3320 return 0;
3321}
3322
3323static void
3324ctx_pinned_sched_in(struct perf_event_context *ctx,
3325 struct perf_cpu_context *cpuctx)
3326{
3327 struct sched_in_data sid = {
3328 .ctx = ctx,
3329 .cpuctx = cpuctx,
3330 .can_add_hw = 1,
3331 };
3332
3333 visit_groups_merge(&ctx->pinned_groups,
3334 smp_processor_id(),
3335 pinned_sched_in, &sid);
3030} 3336}
3031 3337
3032static void 3338static void
3033ctx_flexible_sched_in(struct perf_event_context *ctx, 3339ctx_flexible_sched_in(struct perf_event_context *ctx,
3034 struct perf_cpu_context *cpuctx) 3340 struct perf_cpu_context *cpuctx)
3035{ 3341{
3036 struct perf_event *event; 3342 struct sched_in_data sid = {
3037 int can_add_hw = 1; 3343 .ctx = ctx,
3038 3344 .cpuctx = cpuctx,
3039 list_for_each_entry(event, &ctx->flexible_groups, group_entry) { 3345 .can_add_hw = 1,
3040 /* Ignore events in OFF or ERROR state */ 3346 };
3041 if (event->state <= PERF_EVENT_STATE_OFF)
3042 continue;
3043 /*
3044 * Listen to the 'cpu' scheduling filter constraint
3045 * of events:
3046 */
3047 if (!event_filter_match(event))
3048 continue;
3049 3347
3050 if (group_can_go_on(event, cpuctx, can_add_hw)) { 3348 visit_groups_merge(&ctx->flexible_groups,
3051 if (group_sched_in(event, cpuctx, ctx)) 3349 smp_processor_id(),
3052 can_add_hw = 0; 3350 flexible_sched_in, &sid);
3053 }
3054 }
3055} 3351}
3056 3352
3057static void 3353static void
@@ -3132,7 +3428,7 @@ static void perf_event_context_sched_in(struct perf_event_context *ctx,
3132 * However, if task's ctx is not carrying any pinned 3428 * However, if task's ctx is not carrying any pinned
3133 * events, no need to flip the cpuctx's events around. 3429 * events, no need to flip the cpuctx's events around.
3134 */ 3430 */
3135 if (!list_empty(&ctx->pinned_groups)) 3431 if (!RB_EMPTY_ROOT(&ctx->pinned_groups.tree))
3136 cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE); 3432 cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
3137 perf_event_sched_in(cpuctx, ctx, task); 3433 perf_event_sched_in(cpuctx, ctx, task);
3138 perf_pmu_enable(ctx->pmu); 3434 perf_pmu_enable(ctx->pmu);
@@ -3361,55 +3657,81 @@ static void perf_adjust_freq_unthr_context(struct perf_event_context *ctx,
3361} 3657}
3362 3658
3363/* 3659/*
3364 * Round-robin a context's events: 3660 * Move @event to the tail of the @ctx's elegible events.
3365 */ 3661 */
3366static void rotate_ctx(struct perf_event_context *ctx) 3662static void rotate_ctx(struct perf_event_context *ctx, struct perf_event *event)
3367{ 3663{
3368 /* 3664 /*
3369 * Rotate the first entry last of non-pinned groups. Rotation might be 3665 * Rotate the first entry last of non-pinned groups. Rotation might be
3370 * disabled by the inheritance code. 3666 * disabled by the inheritance code.
3371 */ 3667 */
3372 if (!ctx->rotate_disable) 3668 if (ctx->rotate_disable)
3373 list_rotate_left(&ctx->flexible_groups); 3669 return;
3670
3671 perf_event_groups_delete(&ctx->flexible_groups, event);
3672 perf_event_groups_insert(&ctx->flexible_groups, event);
3374} 3673}
3375 3674
3376static int perf_rotate_context(struct perf_cpu_context *cpuctx) 3675static inline struct perf_event *
3676ctx_first_active(struct perf_event_context *ctx)
3377{ 3677{
3678 return list_first_entry_or_null(&ctx->flexible_active,
3679 struct perf_event, active_list);
3680}
3681
3682static bool perf_rotate_context(struct perf_cpu_context *cpuctx)
3683{
3684 struct perf_event *cpu_event = NULL, *task_event = NULL;
3685 bool cpu_rotate = false, task_rotate = false;
3378 struct perf_event_context *ctx = NULL; 3686 struct perf_event_context *ctx = NULL;
3379 int rotate = 0; 3687
3688 /*
3689 * Since we run this from IRQ context, nobody can install new
3690 * events, thus the event count values are stable.
3691 */
3380 3692
3381 if (cpuctx->ctx.nr_events) { 3693 if (cpuctx->ctx.nr_events) {
3382 if (cpuctx->ctx.nr_events != cpuctx->ctx.nr_active) 3694 if (cpuctx->ctx.nr_events != cpuctx->ctx.nr_active)
3383 rotate = 1; 3695 cpu_rotate = true;
3384 } 3696 }
3385 3697
3386 ctx = cpuctx->task_ctx; 3698 ctx = cpuctx->task_ctx;
3387 if (ctx && ctx->nr_events) { 3699 if (ctx && ctx->nr_events) {
3388 if (ctx->nr_events != ctx->nr_active) 3700 if (ctx->nr_events != ctx->nr_active)
3389 rotate = 1; 3701 task_rotate = true;
3390 } 3702 }
3391 3703
3392 if (!rotate) 3704 if (!(cpu_rotate || task_rotate))
3393 goto done; 3705 return false;
3394 3706
3395 perf_ctx_lock(cpuctx, cpuctx->task_ctx); 3707 perf_ctx_lock(cpuctx, cpuctx->task_ctx);
3396 perf_pmu_disable(cpuctx->ctx.pmu); 3708 perf_pmu_disable(cpuctx->ctx.pmu);
3397 3709
3398 cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE); 3710 if (task_rotate)
3399 if (ctx) 3711 task_event = ctx_first_active(ctx);
3712 if (cpu_rotate)
3713 cpu_event = ctx_first_active(&cpuctx->ctx);
3714
3715 /*
3716 * As per the order given at ctx_resched() first 'pop' task flexible
3717 * and then, if needed CPU flexible.
3718 */
3719 if (task_event || (ctx && cpu_event))
3400 ctx_sched_out(ctx, cpuctx, EVENT_FLEXIBLE); 3720 ctx_sched_out(ctx, cpuctx, EVENT_FLEXIBLE);
3721 if (cpu_event)
3722 cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
3401 3723
3402 rotate_ctx(&cpuctx->ctx); 3724 if (task_event)
3403 if (ctx) 3725 rotate_ctx(ctx, task_event);
3404 rotate_ctx(ctx); 3726 if (cpu_event)
3727 rotate_ctx(&cpuctx->ctx, cpu_event);
3405 3728
3406 perf_event_sched_in(cpuctx, ctx, current); 3729 perf_event_sched_in(cpuctx, ctx, current);
3407 3730
3408 perf_pmu_enable(cpuctx->ctx.pmu); 3731 perf_pmu_enable(cpuctx->ctx.pmu);
3409 perf_ctx_unlock(cpuctx, cpuctx->task_ctx); 3732 perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
3410done:
3411 3733
3412 return rotate; 3734 return true;
3413} 3735}
3414 3736
3415void perf_event_task_tick(void) 3737void perf_event_task_tick(void)
@@ -3554,7 +3876,7 @@ static void __perf_event_read(void *info)
3554 3876
3555 pmu->read(event); 3877 pmu->read(event);
3556 3878
3557 list_for_each_entry(sub, &event->sibling_list, group_entry) { 3879 for_each_sibling_event(sub, event) {
3558 if (sub->state == PERF_EVENT_STATE_ACTIVE) { 3880 if (sub->state == PERF_EVENT_STATE_ACTIVE) {
3559 /* 3881 /*
3560 * Use sibling's PMU rather than @event's since 3882 * Use sibling's PMU rather than @event's since
@@ -3728,9 +4050,11 @@ static void __perf_event_init_context(struct perf_event_context *ctx)
3728 raw_spin_lock_init(&ctx->lock); 4050 raw_spin_lock_init(&ctx->lock);
3729 mutex_init(&ctx->mutex); 4051 mutex_init(&ctx->mutex);
3730 INIT_LIST_HEAD(&ctx->active_ctx_list); 4052 INIT_LIST_HEAD(&ctx->active_ctx_list);
3731 INIT_LIST_HEAD(&ctx->pinned_groups); 4053 perf_event_groups_init(&ctx->pinned_groups);
3732 INIT_LIST_HEAD(&ctx->flexible_groups); 4054 perf_event_groups_init(&ctx->flexible_groups);
3733 INIT_LIST_HEAD(&ctx->event_list); 4055 INIT_LIST_HEAD(&ctx->event_list);
4056 INIT_LIST_HEAD(&ctx->pinned_active);
4057 INIT_LIST_HEAD(&ctx->flexible_active);
3734 atomic_set(&ctx->refcount, 1); 4058 atomic_set(&ctx->refcount, 1);
3735} 4059}
3736 4060
@@ -4400,7 +4724,7 @@ static int __perf_read_group_add(struct perf_event *leader,
4400 if (read_format & PERF_FORMAT_ID) 4724 if (read_format & PERF_FORMAT_ID)
4401 values[n++] = primary_event_id(leader); 4725 values[n++] = primary_event_id(leader);
4402 4726
4403 list_for_each_entry(sub, &leader->sibling_list, group_entry) { 4727 for_each_sibling_event(sub, leader) {
4404 values[n++] += perf_event_count(sub); 4728 values[n++] += perf_event_count(sub);
4405 if (read_format & PERF_FORMAT_ID) 4729 if (read_format & PERF_FORMAT_ID)
4406 values[n++] = primary_event_id(sub); 4730 values[n++] = primary_event_id(sub);
@@ -4594,7 +4918,7 @@ static void perf_event_for_each(struct perf_event *event,
4594 event = event->group_leader; 4918 event = event->group_leader;
4595 4919
4596 perf_event_for_each_child(event, func); 4920 perf_event_for_each_child(event, func);
4597 list_for_each_entry(sibling, &event->sibling_list, group_entry) 4921 for_each_sibling_event(sibling, event)
4598 perf_event_for_each_child(sibling, func); 4922 perf_event_for_each_child(sibling, func);
4599} 4923}
4600 4924
@@ -4676,6 +5000,8 @@ static int perf_event_set_output(struct perf_event *event,
4676 struct perf_event *output_event); 5000 struct perf_event *output_event);
4677static int perf_event_set_filter(struct perf_event *event, void __user *arg); 5001static int perf_event_set_filter(struct perf_event *event, void __user *arg);
4678static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd); 5002static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd);
5003static int perf_copy_attr(struct perf_event_attr __user *uattr,
5004 struct perf_event_attr *attr);
4679 5005
4680static long _perf_ioctl(struct perf_event *event, unsigned int cmd, unsigned long arg) 5006static long _perf_ioctl(struct perf_event *event, unsigned int cmd, unsigned long arg)
4681{ 5007{
@@ -4748,6 +5074,17 @@ static long _perf_ioctl(struct perf_event *event, unsigned int cmd, unsigned lon
4748 5074
4749 case PERF_EVENT_IOC_QUERY_BPF: 5075 case PERF_EVENT_IOC_QUERY_BPF:
4750 return perf_event_query_prog_array(event, (void __user *)arg); 5076 return perf_event_query_prog_array(event, (void __user *)arg);
5077
5078 case PERF_EVENT_IOC_MODIFY_ATTRIBUTES: {
5079 struct perf_event_attr new_attr;
5080 int err = perf_copy_attr((struct perf_event_attr __user *)arg,
5081 &new_attr);
5082
5083 if (err)
5084 return err;
5085
5086 return perf_event_modify_attr(event, &new_attr);
5087 }
4751 default: 5088 default:
4752 return -ENOTTY; 5089 return -ENOTTY;
4753 } 5090 }
@@ -5743,7 +6080,8 @@ static void perf_output_read_group(struct perf_output_handle *handle,
5743 if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) 6080 if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
5744 values[n++] = running; 6081 values[n++] = running;
5745 6082
5746 if (leader != event) 6083 if ((leader != event) &&
6084 (leader->state == PERF_EVENT_STATE_ACTIVE))
5747 leader->pmu->read(leader); 6085 leader->pmu->read(leader);
5748 6086
5749 values[n++] = perf_event_count(leader); 6087 values[n++] = perf_event_count(leader);
@@ -5752,7 +6090,7 @@ static void perf_output_read_group(struct perf_output_handle *handle,
5752 6090
5753 __output_copy(handle, values, n * sizeof(u64)); 6091 __output_copy(handle, values, n * sizeof(u64));
5754 6092
5755 list_for_each_entry(sub, &leader->sibling_list, group_entry) { 6093 for_each_sibling_event(sub, leader) {
5756 n = 0; 6094 n = 0;
5757 6095
5758 if ((sub != event) && 6096 if ((sub != event) &&
@@ -8009,9 +8347,119 @@ static struct pmu perf_tracepoint = {
8009 .read = perf_swevent_read, 8347 .read = perf_swevent_read,
8010}; 8348};
8011 8349
8350#if defined(CONFIG_KPROBE_EVENTS) || defined(CONFIG_UPROBE_EVENTS)
8351/*
8352 * Flags in config, used by dynamic PMU kprobe and uprobe
8353 * The flags should match following PMU_FORMAT_ATTR().
8354 *
8355 * PERF_PROBE_CONFIG_IS_RETPROBE if set, create kretprobe/uretprobe
8356 * if not set, create kprobe/uprobe
8357 */
8358enum perf_probe_config {
8359 PERF_PROBE_CONFIG_IS_RETPROBE = 1U << 0, /* [k,u]retprobe */
8360};
8361
8362PMU_FORMAT_ATTR(retprobe, "config:0");
8363
8364static struct attribute *probe_attrs[] = {
8365 &format_attr_retprobe.attr,
8366 NULL,
8367};
8368
8369static struct attribute_group probe_format_group = {
8370 .name = "format",
8371 .attrs = probe_attrs,
8372};
8373
8374static const struct attribute_group *probe_attr_groups[] = {
8375 &probe_format_group,
8376 NULL,
8377};
8378#endif
8379
8380#ifdef CONFIG_KPROBE_EVENTS
8381static int perf_kprobe_event_init(struct perf_event *event);
8382static struct pmu perf_kprobe = {
8383 .task_ctx_nr = perf_sw_context,
8384 .event_init = perf_kprobe_event_init,
8385 .add = perf_trace_add,
8386 .del = perf_trace_del,
8387 .start = perf_swevent_start,
8388 .stop = perf_swevent_stop,
8389 .read = perf_swevent_read,
8390 .attr_groups = probe_attr_groups,
8391};
8392
8393static int perf_kprobe_event_init(struct perf_event *event)
8394{
8395 int err;
8396 bool is_retprobe;
8397
8398 if (event->attr.type != perf_kprobe.type)
8399 return -ENOENT;
8400 /*
8401 * no branch sampling for probe events
8402 */
8403 if (has_branch_stack(event))
8404 return -EOPNOTSUPP;
8405
8406 is_retprobe = event->attr.config & PERF_PROBE_CONFIG_IS_RETPROBE;
8407 err = perf_kprobe_init(event, is_retprobe);
8408 if (err)
8409 return err;
8410
8411 event->destroy = perf_kprobe_destroy;
8412
8413 return 0;
8414}
8415#endif /* CONFIG_KPROBE_EVENTS */
8416
8417#ifdef CONFIG_UPROBE_EVENTS
8418static int perf_uprobe_event_init(struct perf_event *event);
8419static struct pmu perf_uprobe = {
8420 .task_ctx_nr = perf_sw_context,
8421 .event_init = perf_uprobe_event_init,
8422 .add = perf_trace_add,
8423 .del = perf_trace_del,
8424 .start = perf_swevent_start,
8425 .stop = perf_swevent_stop,
8426 .read = perf_swevent_read,
8427 .attr_groups = probe_attr_groups,
8428};
8429
8430static int perf_uprobe_event_init(struct perf_event *event)
8431{
8432 int err;
8433 bool is_retprobe;
8434
8435 if (event->attr.type != perf_uprobe.type)
8436 return -ENOENT;
8437 /*
8438 * no branch sampling for probe events
8439 */
8440 if (has_branch_stack(event))
8441 return -EOPNOTSUPP;
8442
8443 is_retprobe = event->attr.config & PERF_PROBE_CONFIG_IS_RETPROBE;
8444 err = perf_uprobe_init(event, is_retprobe);
8445 if (err)
8446 return err;
8447
8448 event->destroy = perf_uprobe_destroy;
8449
8450 return 0;
8451}
8452#endif /* CONFIG_UPROBE_EVENTS */
8453
8012static inline void perf_tp_register(void) 8454static inline void perf_tp_register(void)
8013{ 8455{
8014 perf_pmu_register(&perf_tracepoint, "tracepoint", PERF_TYPE_TRACEPOINT); 8456 perf_pmu_register(&perf_tracepoint, "tracepoint", PERF_TYPE_TRACEPOINT);
8457#ifdef CONFIG_KPROBE_EVENTS
8458 perf_pmu_register(&perf_kprobe, "kprobe", -1);
8459#endif
8460#ifdef CONFIG_UPROBE_EVENTS
8461 perf_pmu_register(&perf_uprobe, "uprobe", -1);
8462#endif
8015} 8463}
8016 8464
8017static void perf_event_free_filter(struct perf_event *event) 8465static void perf_event_free_filter(struct perf_event *event)
@@ -8088,13 +8536,32 @@ static void perf_event_free_bpf_handler(struct perf_event *event)
8088} 8536}
8089#endif 8537#endif
8090 8538
8539/*
8540 * returns true if the event is a tracepoint, or a kprobe/upprobe created
8541 * with perf_event_open()
8542 */
8543static inline bool perf_event_is_tracing(struct perf_event *event)
8544{
8545 if (event->pmu == &perf_tracepoint)
8546 return true;
8547#ifdef CONFIG_KPROBE_EVENTS
8548 if (event->pmu == &perf_kprobe)
8549 return true;
8550#endif
8551#ifdef CONFIG_UPROBE_EVENTS
8552 if (event->pmu == &perf_uprobe)
8553 return true;
8554#endif
8555 return false;
8556}
8557
8091static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd) 8558static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd)
8092{ 8559{
8093 bool is_kprobe, is_tracepoint, is_syscall_tp; 8560 bool is_kprobe, is_tracepoint, is_syscall_tp;
8094 struct bpf_prog *prog; 8561 struct bpf_prog *prog;
8095 int ret; 8562 int ret;
8096 8563
8097 if (event->attr.type != PERF_TYPE_TRACEPOINT) 8564 if (!perf_event_is_tracing(event))
8098 return perf_event_set_bpf_handler(event, prog_fd); 8565 return perf_event_set_bpf_handler(event, prog_fd);
8099 8566
8100 is_kprobe = event->tp_event->flags & TRACE_EVENT_FL_UKPROBE; 8567 is_kprobe = event->tp_event->flags & TRACE_EVENT_FL_UKPROBE;
@@ -8140,7 +8607,7 @@ static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd)
8140 8607
8141static void perf_event_free_bpf_prog(struct perf_event *event) 8608static void perf_event_free_bpf_prog(struct perf_event *event)
8142{ 8609{
8143 if (event->attr.type != PERF_TYPE_TRACEPOINT) { 8610 if (!perf_event_is_tracing(event)) {
8144 perf_event_free_bpf_handler(event); 8611 perf_event_free_bpf_handler(event);
8145 return; 8612 return;
8146 } 8613 }
@@ -8336,7 +8803,8 @@ restart:
8336 * * for kernel addresses: <start address>[/<size>] 8803 * * for kernel addresses: <start address>[/<size>]
8337 * * for object files: <start address>[/<size>]@</path/to/object/file> 8804 * * for object files: <start address>[/<size>]@</path/to/object/file>
8338 * 8805 *
8339 * if <size> is not specified, the range is treated as a single address. 8806 * if <size> is not specified or is zero, the range is treated as a single
8807 * address; not valid for ACTION=="filter".
8340 */ 8808 */
8341enum { 8809enum {
8342 IF_ACT_NONE = -1, 8810 IF_ACT_NONE = -1,
@@ -8386,6 +8854,11 @@ perf_event_parse_addr_filter(struct perf_event *event, char *fstr,
8386 return -ENOMEM; 8854 return -ENOMEM;
8387 8855
8388 while ((start = strsep(&fstr, " ,\n")) != NULL) { 8856 while ((start = strsep(&fstr, " ,\n")) != NULL) {
8857 static const enum perf_addr_filter_action_t actions[] = {
8858 [IF_ACT_FILTER] = PERF_ADDR_FILTER_ACTION_FILTER,
8859 [IF_ACT_START] = PERF_ADDR_FILTER_ACTION_START,
8860 [IF_ACT_STOP] = PERF_ADDR_FILTER_ACTION_STOP,
8861 };
8389 ret = -EINVAL; 8862 ret = -EINVAL;
8390 8863
8391 if (!*start) 8864 if (!*start)
@@ -8402,12 +8875,11 @@ perf_event_parse_addr_filter(struct perf_event *event, char *fstr,
8402 switch (token) { 8875 switch (token) {
8403 case IF_ACT_FILTER: 8876 case IF_ACT_FILTER:
8404 case IF_ACT_START: 8877 case IF_ACT_START:
8405 filter->filter = 1;
8406
8407 case IF_ACT_STOP: 8878 case IF_ACT_STOP:
8408 if (state != IF_STATE_ACTION) 8879 if (state != IF_STATE_ACTION)
8409 goto fail; 8880 goto fail;
8410 8881
8882 filter->action = actions[token];
8411 state = IF_STATE_SOURCE; 8883 state = IF_STATE_SOURCE;
8412 break; 8884 break;
8413 8885
@@ -8420,15 +8892,12 @@ perf_event_parse_addr_filter(struct perf_event *event, char *fstr,
8420 if (state != IF_STATE_SOURCE) 8892 if (state != IF_STATE_SOURCE)
8421 goto fail; 8893 goto fail;
8422 8894
8423 if (token == IF_SRC_FILE || token == IF_SRC_KERNEL)
8424 filter->range = 1;
8425
8426 *args[0].to = 0; 8895 *args[0].to = 0;
8427 ret = kstrtoul(args[0].from, 0, &filter->offset); 8896 ret = kstrtoul(args[0].from, 0, &filter->offset);
8428 if (ret) 8897 if (ret)
8429 goto fail; 8898 goto fail;
8430 8899
8431 if (filter->range) { 8900 if (token == IF_SRC_KERNEL || token == IF_SRC_FILE) {
8432 *args[1].to = 0; 8901 *args[1].to = 0;
8433 ret = kstrtoul(args[1].from, 0, &filter->size); 8902 ret = kstrtoul(args[1].from, 0, &filter->size);
8434 if (ret) 8903 if (ret)
@@ -8436,7 +8905,7 @@ perf_event_parse_addr_filter(struct perf_event *event, char *fstr,
8436 } 8905 }
8437 8906
8438 if (token == IF_SRC_FILE || token == IF_SRC_FILEADDR) { 8907 if (token == IF_SRC_FILE || token == IF_SRC_FILEADDR) {
8439 int fpos = filter->range ? 2 : 1; 8908 int fpos = token == IF_SRC_FILE ? 2 : 1;
8440 8909
8441 filename = match_strdup(&args[fpos]); 8910 filename = match_strdup(&args[fpos]);
8442 if (!filename) { 8911 if (!filename) {
@@ -8462,6 +8931,14 @@ perf_event_parse_addr_filter(struct perf_event *event, char *fstr,
8462 if (kernel && event->attr.exclude_kernel) 8931 if (kernel && event->attr.exclude_kernel)
8463 goto fail; 8932 goto fail;
8464 8933
8934 /*
8935 * ACTION "filter" must have a non-zero length region
8936 * specified.
8937 */
8938 if (filter->action == PERF_ADDR_FILTER_ACTION_FILTER &&
8939 !filter->size)
8940 goto fail;
8941
8465 if (!kernel) { 8942 if (!kernel) {
8466 if (!filename) 8943 if (!filename)
8467 goto fail; 8944 goto fail;
@@ -8559,47 +9036,36 @@ fail_clear_files:
8559 return ret; 9036 return ret;
8560} 9037}
8561 9038
8562static int
8563perf_tracepoint_set_filter(struct perf_event *event, char *filter_str)
8564{
8565 struct perf_event_context *ctx = event->ctx;
8566 int ret;
8567
8568 /*
8569 * Beware, here be dragons!!
8570 *
8571 * the tracepoint muck will deadlock against ctx->mutex, but the tracepoint
8572 * stuff does not actually need it. So temporarily drop ctx->mutex. As per
8573 * perf_event_ctx_lock() we already have a reference on ctx.
8574 *
8575 * This can result in event getting moved to a different ctx, but that
8576 * does not affect the tracepoint state.
8577 */
8578 mutex_unlock(&ctx->mutex);
8579 ret = ftrace_profile_set_filter(event, event->attr.config, filter_str);
8580 mutex_lock(&ctx->mutex);
8581
8582 return ret;
8583}
8584
8585static int perf_event_set_filter(struct perf_event *event, void __user *arg) 9039static int perf_event_set_filter(struct perf_event *event, void __user *arg)
8586{ 9040{
8587 char *filter_str;
8588 int ret = -EINVAL; 9041 int ret = -EINVAL;
8589 9042 char *filter_str;
8590 if ((event->attr.type != PERF_TYPE_TRACEPOINT ||
8591 !IS_ENABLED(CONFIG_EVENT_TRACING)) &&
8592 !has_addr_filter(event))
8593 return -EINVAL;
8594 9043
8595 filter_str = strndup_user(arg, PAGE_SIZE); 9044 filter_str = strndup_user(arg, PAGE_SIZE);
8596 if (IS_ERR(filter_str)) 9045 if (IS_ERR(filter_str))
8597 return PTR_ERR(filter_str); 9046 return PTR_ERR(filter_str);
8598 9047
8599 if (IS_ENABLED(CONFIG_EVENT_TRACING) && 9048#ifdef CONFIG_EVENT_TRACING
8600 event->attr.type == PERF_TYPE_TRACEPOINT) 9049 if (perf_event_is_tracing(event)) {
8601 ret = perf_tracepoint_set_filter(event, filter_str); 9050 struct perf_event_context *ctx = event->ctx;
8602 else if (has_addr_filter(event)) 9051
9052 /*
9053 * Beware, here be dragons!!
9054 *
9055 * the tracepoint muck will deadlock against ctx->mutex, but
9056 * the tracepoint stuff does not actually need it. So
9057 * temporarily drop ctx->mutex. As per perf_event_ctx_lock() we
9058 * already have a reference on ctx.
9059 *
9060 * This can result in event getting moved to a different ctx,
9061 * but that does not affect the tracepoint state.
9062 */
9063 mutex_unlock(&ctx->mutex);
9064 ret = ftrace_profile_set_filter(event, event->attr.config, filter_str);
9065 mutex_lock(&ctx->mutex);
9066 } else
9067#endif
9068 if (has_addr_filter(event))
8603 ret = perf_event_set_addr_filter(event, filter_str); 9069 ret = perf_event_set_addr_filter(event, filter_str);
8604 9070
8605 kfree(filter_str); 9071 kfree(filter_str);
@@ -9452,9 +9918,10 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
9452 mutex_init(&event->child_mutex); 9918 mutex_init(&event->child_mutex);
9453 INIT_LIST_HEAD(&event->child_list); 9919 INIT_LIST_HEAD(&event->child_list);
9454 9920
9455 INIT_LIST_HEAD(&event->group_entry);
9456 INIT_LIST_HEAD(&event->event_entry); 9921 INIT_LIST_HEAD(&event->event_entry);
9457 INIT_LIST_HEAD(&event->sibling_list); 9922 INIT_LIST_HEAD(&event->sibling_list);
9923 INIT_LIST_HEAD(&event->active_list);
9924 init_event_group(event);
9458 INIT_LIST_HEAD(&event->rb_entry); 9925 INIT_LIST_HEAD(&event->rb_entry);
9459 INIT_LIST_HEAD(&event->active_entry); 9926 INIT_LIST_HEAD(&event->active_entry);
9460 INIT_LIST_HEAD(&event->addr_filters.list); 9927 INIT_LIST_HEAD(&event->addr_filters.list);
@@ -9729,6 +10196,9 @@ static int perf_copy_attr(struct perf_event_attr __user *uattr,
9729 ret = -EINVAL; 10196 ret = -EINVAL;
9730 } 10197 }
9731 10198
10199 if (!attr->sample_max_stack)
10200 attr->sample_max_stack = sysctl_perf_event_max_stack;
10201
9732 if (attr->sample_type & PERF_SAMPLE_REGS_INTR) 10202 if (attr->sample_type & PERF_SAMPLE_REGS_INTR)
9733 ret = perf_reg_validate(attr->sample_regs_intr); 10203 ret = perf_reg_validate(attr->sample_regs_intr);
9734out: 10204out:
@@ -9942,9 +10412,6 @@ SYSCALL_DEFINE5(perf_event_open,
9942 perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN)) 10412 perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN))
9943 return -EACCES; 10413 return -EACCES;
9944 10414
9945 if (!attr.sample_max_stack)
9946 attr.sample_max_stack = sysctl_perf_event_max_stack;
9947
9948 /* 10415 /*
9949 * In cgroup mode, the pid argument is used to pass the fd 10416 * In cgroup mode, the pid argument is used to pass the fd
9950 * opened to the cgroup directory in cgroupfs. The cpu argument 10417 * opened to the cgroup directory in cgroupfs. The cpu argument
@@ -10218,8 +10685,7 @@ SYSCALL_DEFINE5(perf_event_open,
10218 perf_remove_from_context(group_leader, 0); 10685 perf_remove_from_context(group_leader, 0);
10219 put_ctx(gctx); 10686 put_ctx(gctx);
10220 10687
10221 list_for_each_entry(sibling, &group_leader->sibling_list, 10688 for_each_sibling_event(sibling, group_leader) {
10222 group_entry) {
10223 perf_remove_from_context(sibling, 0); 10689 perf_remove_from_context(sibling, 0);
10224 put_ctx(gctx); 10690 put_ctx(gctx);
10225 } 10691 }
@@ -10240,8 +10706,7 @@ SYSCALL_DEFINE5(perf_event_open,
10240 * By installing siblings first we NO-OP because they're not 10706 * By installing siblings first we NO-OP because they're not
10241 * reachable through the group lists. 10707 * reachable through the group lists.
10242 */ 10708 */
10243 list_for_each_entry(sibling, &group_leader->sibling_list, 10709 for_each_sibling_event(sibling, group_leader) {
10244 group_entry) {
10245 perf_event__state_init(sibling); 10710 perf_event__state_init(sibling);
10246 perf_install_in_context(ctx, sibling, sibling->cpu); 10711 perf_install_in_context(ctx, sibling, sibling->cpu);
10247 get_ctx(ctx); 10712 get_ctx(ctx);
@@ -10880,7 +11345,7 @@ static int inherit_group(struct perf_event *parent_event,
10880 * case inherit_event() will create individual events, similar to what 11345 * case inherit_event() will create individual events, similar to what
10881 * perf_group_detach() would do anyway. 11346 * perf_group_detach() would do anyway.
10882 */ 11347 */
10883 list_for_each_entry(sub, &parent_event->sibling_list, group_entry) { 11348 for_each_sibling_event(sub, parent_event) {
10884 child_ctr = inherit_event(sub, parent, parent_ctx, 11349 child_ctr = inherit_event(sub, parent, parent_ctx,
10885 child, leader, child_ctx); 11350 child, leader, child_ctx);
10886 if (IS_ERR(child_ctr)) 11351 if (IS_ERR(child_ctr))
@@ -10979,7 +11444,7 @@ static int perf_event_init_context(struct task_struct *child, int ctxn)
10979 * We dont have to disable NMIs - we are only looking at 11444 * We dont have to disable NMIs - we are only looking at
10980 * the list, not manipulating it: 11445 * the list, not manipulating it:
10981 */ 11446 */
10982 list_for_each_entry(event, &parent_ctx->pinned_groups, group_entry) { 11447 perf_event_groups_for_each(event, &parent_ctx->pinned_groups) {
10983 ret = inherit_task_group(event, parent, parent_ctx, 11448 ret = inherit_task_group(event, parent, parent_ctx,
10984 child, ctxn, &inherited_all); 11449 child, ctxn, &inherited_all);
10985 if (ret) 11450 if (ret)
@@ -10995,7 +11460,7 @@ static int perf_event_init_context(struct task_struct *child, int ctxn)
10995 parent_ctx->rotate_disable = 1; 11460 parent_ctx->rotate_disable = 1;
10996 raw_spin_unlock_irqrestore(&parent_ctx->lock, flags); 11461 raw_spin_unlock_irqrestore(&parent_ctx->lock, flags);
10997 11462
10998 list_for_each_entry(event, &parent_ctx->flexible_groups, group_entry) { 11463 perf_event_groups_for_each(event, &parent_ctx->flexible_groups) {
10999 ret = inherit_task_group(event, parent, parent_ctx, 11464 ret = inherit_task_group(event, parent, parent_ctx,
11000 child, ctxn, &inherited_all); 11465 child, ctxn, &inherited_all);
11001 if (ret) 11466 if (ret)
diff --git a/kernel/events/hw_breakpoint.c b/kernel/events/hw_breakpoint.c
index 3f8cb1e14588..6e28d2866be5 100644
--- a/kernel/events/hw_breakpoint.c
+++ b/kernel/events/hw_breakpoint.c
@@ -44,6 +44,7 @@
44#include <linux/list.h> 44#include <linux/list.h>
45#include <linux/cpu.h> 45#include <linux/cpu.h>
46#include <linux/smp.h> 46#include <linux/smp.h>
47#include <linux/bug.h>
47 48
48#include <linux/hw_breakpoint.h> 49#include <linux/hw_breakpoint.h>
49/* 50/*
@@ -85,9 +86,9 @@ __weak int hw_breakpoint_weight(struct perf_event *bp)
85 return 1; 86 return 1;
86} 87}
87 88
88static inline enum bp_type_idx find_slot_idx(struct perf_event *bp) 89static inline enum bp_type_idx find_slot_idx(u64 bp_type)
89{ 90{
90 if (bp->attr.bp_type & HW_BREAKPOINT_RW) 91 if (bp_type & HW_BREAKPOINT_RW)
91 return TYPE_DATA; 92 return TYPE_DATA;
92 93
93 return TYPE_INST; 94 return TYPE_INST;
@@ -122,7 +123,7 @@ static int task_bp_pinned(int cpu, struct perf_event *bp, enum bp_type_idx type)
122 123
123 list_for_each_entry(iter, &bp_task_head, hw.bp_list) { 124 list_for_each_entry(iter, &bp_task_head, hw.bp_list) {
124 if (iter->hw.target == tsk && 125 if (iter->hw.target == tsk &&
125 find_slot_idx(iter) == type && 126 find_slot_idx(iter->attr.bp_type) == type &&
126 (iter->cpu < 0 || cpu == iter->cpu)) 127 (iter->cpu < 0 || cpu == iter->cpu))
127 count += hw_breakpoint_weight(iter); 128 count += hw_breakpoint_weight(iter);
128 } 129 }
@@ -277,7 +278,7 @@ __weak void arch_unregister_hw_breakpoint(struct perf_event *bp)
277 * ((per_cpu(info->flexible, *) > 1) + max(per_cpu(info->cpu_pinned, *)) 278 * ((per_cpu(info->flexible, *) > 1) + max(per_cpu(info->cpu_pinned, *))
278 * + max(per_cpu(info->tsk_pinned, *))) < HBP_NUM 279 * + max(per_cpu(info->tsk_pinned, *))) < HBP_NUM
279 */ 280 */
280static int __reserve_bp_slot(struct perf_event *bp) 281static int __reserve_bp_slot(struct perf_event *bp, u64 bp_type)
281{ 282{
282 struct bp_busy_slots slots = {0}; 283 struct bp_busy_slots slots = {0};
283 enum bp_type_idx type; 284 enum bp_type_idx type;
@@ -288,11 +289,11 @@ static int __reserve_bp_slot(struct perf_event *bp)
288 return -ENOMEM; 289 return -ENOMEM;
289 290
290 /* Basic checks */ 291 /* Basic checks */
291 if (bp->attr.bp_type == HW_BREAKPOINT_EMPTY || 292 if (bp_type == HW_BREAKPOINT_EMPTY ||
292 bp->attr.bp_type == HW_BREAKPOINT_INVALID) 293 bp_type == HW_BREAKPOINT_INVALID)
293 return -EINVAL; 294 return -EINVAL;
294 295
295 type = find_slot_idx(bp); 296 type = find_slot_idx(bp_type);
296 weight = hw_breakpoint_weight(bp); 297 weight = hw_breakpoint_weight(bp);
297 298
298 fetch_bp_busy_slots(&slots, bp, type); 299 fetch_bp_busy_slots(&slots, bp, type);
@@ -317,19 +318,19 @@ int reserve_bp_slot(struct perf_event *bp)
317 318
318 mutex_lock(&nr_bp_mutex); 319 mutex_lock(&nr_bp_mutex);
319 320
320 ret = __reserve_bp_slot(bp); 321 ret = __reserve_bp_slot(bp, bp->attr.bp_type);
321 322
322 mutex_unlock(&nr_bp_mutex); 323 mutex_unlock(&nr_bp_mutex);
323 324
324 return ret; 325 return ret;
325} 326}
326 327
327static void __release_bp_slot(struct perf_event *bp) 328static void __release_bp_slot(struct perf_event *bp, u64 bp_type)
328{ 329{
329 enum bp_type_idx type; 330 enum bp_type_idx type;
330 int weight; 331 int weight;
331 332
332 type = find_slot_idx(bp); 333 type = find_slot_idx(bp_type);
333 weight = hw_breakpoint_weight(bp); 334 weight = hw_breakpoint_weight(bp);
334 toggle_bp_slot(bp, false, type, weight); 335 toggle_bp_slot(bp, false, type, weight);
335} 336}
@@ -339,11 +340,43 @@ void release_bp_slot(struct perf_event *bp)
339 mutex_lock(&nr_bp_mutex); 340 mutex_lock(&nr_bp_mutex);
340 341
341 arch_unregister_hw_breakpoint(bp); 342 arch_unregister_hw_breakpoint(bp);
342 __release_bp_slot(bp); 343 __release_bp_slot(bp, bp->attr.bp_type);
343 344
344 mutex_unlock(&nr_bp_mutex); 345 mutex_unlock(&nr_bp_mutex);
345} 346}
346 347
348static int __modify_bp_slot(struct perf_event *bp, u64 old_type)
349{
350 int err;
351
352 __release_bp_slot(bp, old_type);
353
354 err = __reserve_bp_slot(bp, bp->attr.bp_type);
355 if (err) {
356 /*
357 * Reserve the old_type slot back in case
358 * there's no space for the new type.
359 *
360 * This must succeed, because we just released
361 * the old_type slot in the __release_bp_slot
362 * call above. If not, something is broken.
363 */
364 WARN_ON(__reserve_bp_slot(bp, old_type));
365 }
366
367 return err;
368}
369
370static int modify_bp_slot(struct perf_event *bp, u64 old_type)
371{
372 int ret;
373
374 mutex_lock(&nr_bp_mutex);
375 ret = __modify_bp_slot(bp, old_type);
376 mutex_unlock(&nr_bp_mutex);
377 return ret;
378}
379
347/* 380/*
348 * Allow the kernel debugger to reserve breakpoint slots without 381 * Allow the kernel debugger to reserve breakpoint slots without
349 * taking a lock using the dbg_* variant of for the reserve and 382 * taking a lock using the dbg_* variant of for the reserve and
@@ -354,7 +387,7 @@ int dbg_reserve_bp_slot(struct perf_event *bp)
354 if (mutex_is_locked(&nr_bp_mutex)) 387 if (mutex_is_locked(&nr_bp_mutex))
355 return -1; 388 return -1;
356 389
357 return __reserve_bp_slot(bp); 390 return __reserve_bp_slot(bp, bp->attr.bp_type);
358} 391}
359 392
360int dbg_release_bp_slot(struct perf_event *bp) 393int dbg_release_bp_slot(struct perf_event *bp)
@@ -362,7 +395,7 @@ int dbg_release_bp_slot(struct perf_event *bp)
362 if (mutex_is_locked(&nr_bp_mutex)) 395 if (mutex_is_locked(&nr_bp_mutex))
363 return -1; 396 return -1;
364 397
365 __release_bp_slot(bp); 398 __release_bp_slot(bp, bp->attr.bp_type);
366 399
367 return 0; 400 return 0;
368} 401}
@@ -423,20 +456,45 @@ register_user_hw_breakpoint(struct perf_event_attr *attr,
423} 456}
424EXPORT_SYMBOL_GPL(register_user_hw_breakpoint); 457EXPORT_SYMBOL_GPL(register_user_hw_breakpoint);
425 458
459int
460modify_user_hw_breakpoint_check(struct perf_event *bp, struct perf_event_attr *attr,
461 bool check)
462{
463 u64 old_addr = bp->attr.bp_addr;
464 u64 old_len = bp->attr.bp_len;
465 int old_type = bp->attr.bp_type;
466 bool modify = attr->bp_type != old_type;
467 int err = 0;
468
469 bp->attr.bp_addr = attr->bp_addr;
470 bp->attr.bp_type = attr->bp_type;
471 bp->attr.bp_len = attr->bp_len;
472
473 if (check && memcmp(&bp->attr, attr, sizeof(*attr)))
474 return -EINVAL;
475
476 err = validate_hw_breakpoint(bp);
477 if (!err && modify)
478 err = modify_bp_slot(bp, old_type);
479
480 if (err) {
481 bp->attr.bp_addr = old_addr;
482 bp->attr.bp_type = old_type;
483 bp->attr.bp_len = old_len;
484 return err;
485 }
486
487 bp->attr.disabled = attr->disabled;
488 return 0;
489}
490
426/** 491/**
427 * modify_user_hw_breakpoint - modify a user-space hardware breakpoint 492 * modify_user_hw_breakpoint - modify a user-space hardware breakpoint
428 * @bp: the breakpoint structure to modify 493 * @bp: the breakpoint structure to modify
429 * @attr: new breakpoint attributes 494 * @attr: new breakpoint attributes
430 * @triggered: callback to trigger when we hit the breakpoint
431 * @tsk: pointer to 'task_struct' of the process to which the address belongs
432 */ 495 */
433int modify_user_hw_breakpoint(struct perf_event *bp, struct perf_event_attr *attr) 496int modify_user_hw_breakpoint(struct perf_event *bp, struct perf_event_attr *attr)
434{ 497{
435 u64 old_addr = bp->attr.bp_addr;
436 u64 old_len = bp->attr.bp_len;
437 int old_type = bp->attr.bp_type;
438 int err = 0;
439
440 /* 498 /*
441 * modify_user_hw_breakpoint can be invoked with IRQs disabled and hence it 499 * modify_user_hw_breakpoint can be invoked with IRQs disabled and hence it
442 * will not be possible to raise IPIs that invoke __perf_event_disable. 500 * will not be possible to raise IPIs that invoke __perf_event_disable.
@@ -448,30 +506,14 @@ int modify_user_hw_breakpoint(struct perf_event *bp, struct perf_event_attr *att
448 else 506 else
449 perf_event_disable(bp); 507 perf_event_disable(bp);
450 508
451 bp->attr.bp_addr = attr->bp_addr; 509 if (!attr->disabled) {
452 bp->attr.bp_type = attr->bp_type; 510 int err = modify_user_hw_breakpoint_check(bp, attr, false);
453 bp->attr.bp_len = attr->bp_len;
454
455 if (attr->disabled)
456 goto end;
457 511
458 err = validate_hw_breakpoint(bp); 512 if (err)
459 if (!err) 513 return err;
460 perf_event_enable(bp); 514 perf_event_enable(bp);
461 515 bp->attr.disabled = 0;
462 if (err) {
463 bp->attr.bp_addr = old_addr;
464 bp->attr.bp_type = old_type;
465 bp->attr.bp_len = old_len;
466 if (!bp->attr.disabled)
467 perf_event_enable(bp);
468
469 return err;
470 } 516 }
471
472end:
473 bp->attr.disabled = attr->disabled;
474
475 return 0; 517 return 0;
476} 518}
477EXPORT_SYMBOL_GPL(modify_user_hw_breakpoint); 519EXPORT_SYMBOL_GPL(modify_user_hw_breakpoint);
diff --git a/kernel/exit.c b/kernel/exit.c
index 995453d9fb55..c3c7ac560114 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -1691,7 +1691,7 @@ SYSCALL_DEFINE4(wait4, pid_t, upid, int __user *, stat_addr,
1691 */ 1691 */
1692SYSCALL_DEFINE3(waitpid, pid_t, pid, int __user *, stat_addr, int, options) 1692SYSCALL_DEFINE3(waitpid, pid_t, pid, int __user *, stat_addr, int, options)
1693{ 1693{
1694 return sys_wait4(pid, stat_addr, options, NULL); 1694 return kernel_wait4(pid, stat_addr, options, NULL);
1695} 1695}
1696 1696
1697#endif 1697#endif
diff --git a/kernel/fork.c b/kernel/fork.c
index e5d9d405ae4e..f71b67dc156d 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1198,8 +1198,8 @@ void mm_release(struct task_struct *tsk, struct mm_struct *mm)
1198 * not set up a proper pointer then tough luck. 1198 * not set up a proper pointer then tough luck.
1199 */ 1199 */
1200 put_user(0, tsk->clear_child_tid); 1200 put_user(0, tsk->clear_child_tid);
1201 sys_futex(tsk->clear_child_tid, FUTEX_WAKE, 1201 do_futex(tsk->clear_child_tid, FUTEX_WAKE,
1202 1, NULL, NULL, 0); 1202 1, NULL, NULL, 0, 0);
1203 } 1203 }
1204 tsk->clear_child_tid = NULL; 1204 tsk->clear_child_tid = NULL;
1205 } 1205 }
@@ -2354,7 +2354,7 @@ static int unshare_fd(unsigned long unshare_flags, struct files_struct **new_fdp
2354 * constructed. Here we are modifying the current, active, 2354 * constructed. Here we are modifying the current, active,
2355 * task_struct. 2355 * task_struct.
2356 */ 2356 */
2357SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags) 2357int ksys_unshare(unsigned long unshare_flags)
2358{ 2358{
2359 struct fs_struct *fs, *new_fs = NULL; 2359 struct fs_struct *fs, *new_fs = NULL;
2360 struct files_struct *fd, *new_fd = NULL; 2360 struct files_struct *fd, *new_fd = NULL;
@@ -2470,6 +2470,11 @@ bad_unshare_out:
2470 return err; 2470 return err;
2471} 2471}
2472 2472
2473SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags)
2474{
2475 return ksys_unshare(unshare_flags);
2476}
2477
2473/* 2478/*
2474 * Helper to unshare the files of the current task. 2479 * Helper to unshare the files of the current task.
2475 * We don't want to expose copy_files internals to 2480 * We don't want to expose copy_files internals to
diff --git a/kernel/kexec.c b/kernel/kexec.c
index e62ec4dc6620..aed8fb2564b3 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -192,11 +192,9 @@ out:
192 * that to happen you need to do that yourself. 192 * that to happen you need to do that yourself.
193 */ 193 */
194 194
195SYSCALL_DEFINE4(kexec_load, unsigned long, entry, unsigned long, nr_segments, 195static inline int kexec_load_check(unsigned long nr_segments,
196 struct kexec_segment __user *, segments, unsigned long, flags) 196 unsigned long flags)
197{ 197{
198 int result;
199
200 /* We only trust the superuser with rebooting the system. */ 198 /* We only trust the superuser with rebooting the system. */
201 if (!capable(CAP_SYS_BOOT) || kexec_load_disabled) 199 if (!capable(CAP_SYS_BOOT) || kexec_load_disabled)
202 return -EPERM; 200 return -EPERM;
@@ -208,17 +206,29 @@ SYSCALL_DEFINE4(kexec_load, unsigned long, entry, unsigned long, nr_segments,
208 if ((flags & KEXEC_FLAGS) != (flags & ~KEXEC_ARCH_MASK)) 206 if ((flags & KEXEC_FLAGS) != (flags & ~KEXEC_ARCH_MASK))
209 return -EINVAL; 207 return -EINVAL;
210 208
211 /* Verify we are on the appropriate architecture */
212 if (((flags & KEXEC_ARCH_MASK) != KEXEC_ARCH) &&
213 ((flags & KEXEC_ARCH_MASK) != KEXEC_ARCH_DEFAULT))
214 return -EINVAL;
215
216 /* Put an artificial cap on the number 209 /* Put an artificial cap on the number
217 * of segments passed to kexec_load. 210 * of segments passed to kexec_load.
218 */ 211 */
219 if (nr_segments > KEXEC_SEGMENT_MAX) 212 if (nr_segments > KEXEC_SEGMENT_MAX)
220 return -EINVAL; 213 return -EINVAL;
221 214
215 return 0;
216}
217
218SYSCALL_DEFINE4(kexec_load, unsigned long, entry, unsigned long, nr_segments,
219 struct kexec_segment __user *, segments, unsigned long, flags)
220{
221 int result;
222
223 result = kexec_load_check(nr_segments, flags);
224 if (result)
225 return result;
226
227 /* Verify we are on the appropriate architecture */
228 if (((flags & KEXEC_ARCH_MASK) != KEXEC_ARCH) &&
229 ((flags & KEXEC_ARCH_MASK) != KEXEC_ARCH_DEFAULT))
230 return -EINVAL;
231
222 /* Because we write directly to the reserved memory 232 /* Because we write directly to the reserved memory
223 * region when loading crash kernels we need a mutex here to 233 * region when loading crash kernels we need a mutex here to
224 * prevent multiple crash kernels from attempting to load 234 * prevent multiple crash kernels from attempting to load
@@ -247,15 +257,16 @@ COMPAT_SYSCALL_DEFINE4(kexec_load, compat_ulong_t, entry,
247 struct kexec_segment out, __user *ksegments; 257 struct kexec_segment out, __user *ksegments;
248 unsigned long i, result; 258 unsigned long i, result;
249 259
260 result = kexec_load_check(nr_segments, flags);
261 if (result)
262 return result;
263
250 /* Don't allow clients that don't understand the native 264 /* Don't allow clients that don't understand the native
251 * architecture to do anything. 265 * architecture to do anything.
252 */ 266 */
253 if ((flags & KEXEC_ARCH_MASK) == KEXEC_ARCH_DEFAULT) 267 if ((flags & KEXEC_ARCH_MASK) == KEXEC_ARCH_DEFAULT)
254 return -EINVAL; 268 return -EINVAL;
255 269
256 if (nr_segments > KEXEC_SEGMENT_MAX)
257 return -EINVAL;
258
259 ksegments = compat_alloc_user_space(nr_segments * sizeof(out)); 270 ksegments = compat_alloc_user_space(nr_segments * sizeof(out));
260 for (i = 0; i < nr_segments; i++) { 271 for (i = 0; i < nr_segments; i++) {
261 result = copy_from_user(&in, &segments[i], sizeof(in)); 272 result = copy_from_user(&in, &segments[i], sizeof(in));
@@ -272,6 +283,21 @@ COMPAT_SYSCALL_DEFINE4(kexec_load, compat_ulong_t, entry,
272 return -EFAULT; 283 return -EFAULT;
273 } 284 }
274 285
275 return sys_kexec_load(entry, nr_segments, ksegments, flags); 286 /* Because we write directly to the reserved memory
287 * region when loading crash kernels we need a mutex here to
288 * prevent multiple crash kernels from attempting to load
289 * simultaneously, and to prevent a crash kernel from loading
290 * over the top of a in use crash kernel.
291 *
292 * KISS: always take the mutex.
293 */
294 if (!mutex_trylock(&kexec_mutex))
295 return -EBUSY;
296
297 result = do_kexec_load(entry, nr_segments, ksegments, flags);
298
299 mutex_unlock(&kexec_mutex);
300
301 return result;
276} 302}
277#endif 303#endif
diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
index 89b5f83f1969..023386338269 100644
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -556,9 +556,9 @@ static void print_lock(struct held_lock *hlock)
556 return; 556 return;
557 } 557 }
558 558
559 printk(KERN_CONT "%p", hlock->instance);
559 print_lock_name(lock_classes + class_idx - 1); 560 print_lock_name(lock_classes + class_idx - 1);
560 printk(KERN_CONT ", at: [<%p>] %pS\n", 561 printk(KERN_CONT ", at: %pS\n", (void *)hlock->acquire_ip);
561 (void *)hlock->acquire_ip, (void *)hlock->acquire_ip);
562} 562}
563 563
564static void lockdep_print_held_locks(struct task_struct *curr) 564static void lockdep_print_held_locks(struct task_struct *curr)
@@ -808,7 +808,7 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force)
808 if (verbose(class)) { 808 if (verbose(class)) {
809 graph_unlock(); 809 graph_unlock();
810 810
811 printk("\nnew class %p: %s", class->key, class->name); 811 printk("\nnew class %px: %s", class->key, class->name);
812 if (class->name_version > 1) 812 if (class->name_version > 1)
813 printk(KERN_CONT "#%d", class->name_version); 813 printk(KERN_CONT "#%d", class->name_version);
814 printk(KERN_CONT "\n"); 814 printk(KERN_CONT "\n");
@@ -1407,7 +1407,7 @@ static void print_lock_class_header(struct lock_class *class, int depth)
1407 } 1407 }
1408 printk("%*s }\n", depth, ""); 1408 printk("%*s }\n", depth, "");
1409 1409
1410 printk("%*s ... key at: [<%p>] %pS\n", 1410 printk("%*s ... key at: [<%px>] %pS\n",
1411 depth, "", class->key, class->key); 1411 depth, "", class->key, class->key);
1412} 1412}
1413 1413
@@ -2340,7 +2340,7 @@ cache_hit:
2340 2340
2341 if (very_verbose(class)) { 2341 if (very_verbose(class)) {
2342 printk("\nhash chain already cached, key: " 2342 printk("\nhash chain already cached, key: "
2343 "%016Lx tail class: [%p] %s\n", 2343 "%016Lx tail class: [%px] %s\n",
2344 (unsigned long long)chain_key, 2344 (unsigned long long)chain_key,
2345 class->key, class->name); 2345 class->key, class->name);
2346 } 2346 }
@@ -2349,7 +2349,7 @@ cache_hit:
2349 } 2349 }
2350 2350
2351 if (very_verbose(class)) { 2351 if (very_verbose(class)) {
2352 printk("\nnew hash chain, key: %016Lx tail class: [%p] %s\n", 2352 printk("\nnew hash chain, key: %016Lx tail class: [%px] %s\n",
2353 (unsigned long long)chain_key, class->key, class->name); 2353 (unsigned long long)chain_key, class->key, class->name);
2354 } 2354 }
2355 2355
@@ -2676,16 +2676,16 @@ check_usage_backwards(struct task_struct *curr, struct held_lock *this,
2676void print_irqtrace_events(struct task_struct *curr) 2676void print_irqtrace_events(struct task_struct *curr)
2677{ 2677{
2678 printk("irq event stamp: %u\n", curr->irq_events); 2678 printk("irq event stamp: %u\n", curr->irq_events);
2679 printk("hardirqs last enabled at (%u): [<%p>] %pS\n", 2679 printk("hardirqs last enabled at (%u): [<%px>] %pS\n",
2680 curr->hardirq_enable_event, (void *)curr->hardirq_enable_ip, 2680 curr->hardirq_enable_event, (void *)curr->hardirq_enable_ip,
2681 (void *)curr->hardirq_enable_ip); 2681 (void *)curr->hardirq_enable_ip);
2682 printk("hardirqs last disabled at (%u): [<%p>] %pS\n", 2682 printk("hardirqs last disabled at (%u): [<%px>] %pS\n",
2683 curr->hardirq_disable_event, (void *)curr->hardirq_disable_ip, 2683 curr->hardirq_disable_event, (void *)curr->hardirq_disable_ip,
2684 (void *)curr->hardirq_disable_ip); 2684 (void *)curr->hardirq_disable_ip);
2685 printk("softirqs last enabled at (%u): [<%p>] %pS\n", 2685 printk("softirqs last enabled at (%u): [<%px>] %pS\n",
2686 curr->softirq_enable_event, (void *)curr->softirq_enable_ip, 2686 curr->softirq_enable_event, (void *)curr->softirq_enable_ip,
2687 (void *)curr->softirq_enable_ip); 2687 (void *)curr->softirq_enable_ip);
2688 printk("softirqs last disabled at (%u): [<%p>] %pS\n", 2688 printk("softirqs last disabled at (%u): [<%px>] %pS\n",
2689 curr->softirq_disable_event, (void *)curr->softirq_disable_ip, 2689 curr->softirq_disable_event, (void *)curr->softirq_disable_ip,
2690 (void *)curr->softirq_disable_ip); 2690 (void *)curr->softirq_disable_ip);
2691} 2691}
@@ -3207,7 +3207,7 @@ static void __lockdep_init_map(struct lockdep_map *lock, const char *name,
3207 * Sanity check, the lock-class key must be persistent: 3207 * Sanity check, the lock-class key must be persistent:
3208 */ 3208 */
3209 if (!static_obj(key)) { 3209 if (!static_obj(key)) {
3210 printk("BUG: key %p not in .data!\n", key); 3210 printk("BUG: key %px not in .data!\n", key);
3211 /* 3211 /*
3212 * What it says above ^^^^^, I suggest you read it. 3212 * What it says above ^^^^^, I suggest you read it.
3213 */ 3213 */
@@ -3322,7 +3322,7 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
3322 } 3322 }
3323 atomic_inc((atomic_t *)&class->ops); 3323 atomic_inc((atomic_t *)&class->ops);
3324 if (very_verbose(class)) { 3324 if (very_verbose(class)) {
3325 printk("\nacquire class [%p] %s", class->key, class->name); 3325 printk("\nacquire class [%px] %s", class->key, class->name);
3326 if (class->name_version > 1) 3326 if (class->name_version > 1)
3327 printk(KERN_CONT "#%d", class->name_version); 3327 printk(KERN_CONT "#%d", class->name_version);
3328 printk(KERN_CONT "\n"); 3328 printk(KERN_CONT "\n");
@@ -4376,7 +4376,7 @@ print_freed_lock_bug(struct task_struct *curr, const void *mem_from,
4376 pr_warn("WARNING: held lock freed!\n"); 4376 pr_warn("WARNING: held lock freed!\n");
4377 print_kernel_ident(); 4377 print_kernel_ident();
4378 pr_warn("-------------------------\n"); 4378 pr_warn("-------------------------\n");
4379 pr_warn("%s/%d is freeing memory %p-%p, with a lock still held there!\n", 4379 pr_warn("%s/%d is freeing memory %px-%px, with a lock still held there!\n",
4380 curr->comm, task_pid_nr(curr), mem_from, mem_to-1); 4380 curr->comm, task_pid_nr(curr), mem_from, mem_to-1);
4381 print_lock(hlock); 4381 print_lock(hlock);
4382 lockdep_print_held_locks(curr); 4382 lockdep_print_held_locks(curr);
diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c
index 940633c63254..4f014be7a4b8 100644
--- a/kernel/locking/rtmutex.c
+++ b/kernel/locking/rtmutex.c
@@ -1268,8 +1268,7 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state,
1268 1268
1269 if (unlikely(ret)) { 1269 if (unlikely(ret)) {
1270 __set_current_state(TASK_RUNNING); 1270 __set_current_state(TASK_RUNNING);
1271 if (rt_mutex_has_waiters(lock)) 1271 remove_waiter(lock, &waiter);
1272 remove_waiter(lock, &waiter);
1273 rt_mutex_handle_deadlock(ret, chwalk, &waiter); 1272 rt_mutex_handle_deadlock(ret, chwalk, &waiter);
1274 } 1273 }
1275 1274
diff --git a/kernel/locking/rtmutex_common.h b/kernel/locking/rtmutex_common.h
index 68686b3ec3c1..d1d62f942be2 100644
--- a/kernel/locking/rtmutex_common.h
+++ b/kernel/locking/rtmutex_common.h
@@ -52,12 +52,13 @@ static inline int rt_mutex_has_waiters(struct rt_mutex *lock)
52static inline struct rt_mutex_waiter * 52static inline struct rt_mutex_waiter *
53rt_mutex_top_waiter(struct rt_mutex *lock) 53rt_mutex_top_waiter(struct rt_mutex *lock)
54{ 54{
55 struct rt_mutex_waiter *w; 55 struct rb_node *leftmost = rb_first_cached(&lock->waiters);
56 56 struct rt_mutex_waiter *w = NULL;
57 w = rb_entry(lock->waiters.rb_leftmost,
58 struct rt_mutex_waiter, tree_entry);
59 BUG_ON(w->lock != lock);
60 57
58 if (leftmost) {
59 w = rb_entry(leftmost, struct rt_mutex_waiter, tree_entry);
60 BUG_ON(w->lock != lock);
61 }
61 return w; 62 return w;
62} 63}
63 64
diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c
index f549c552dbf1..30465a2f2b6c 100644
--- a/kernel/locking/rwsem.c
+++ b/kernel/locking/rwsem.c
@@ -117,6 +117,7 @@ EXPORT_SYMBOL(down_write_trylock);
117void up_read(struct rw_semaphore *sem) 117void up_read(struct rw_semaphore *sem)
118{ 118{
119 rwsem_release(&sem->dep_map, 1, _RET_IP_); 119 rwsem_release(&sem->dep_map, 1, _RET_IP_);
120 DEBUG_RWSEMS_WARN_ON(sem->owner != RWSEM_READER_OWNED);
120 121
121 __up_read(sem); 122 __up_read(sem);
122} 123}
@@ -129,6 +130,7 @@ EXPORT_SYMBOL(up_read);
129void up_write(struct rw_semaphore *sem) 130void up_write(struct rw_semaphore *sem)
130{ 131{
131 rwsem_release(&sem->dep_map, 1, _RET_IP_); 132 rwsem_release(&sem->dep_map, 1, _RET_IP_);
133 DEBUG_RWSEMS_WARN_ON(sem->owner != current);
132 134
133 rwsem_clear_owner(sem); 135 rwsem_clear_owner(sem);
134 __up_write(sem); 136 __up_write(sem);
@@ -142,6 +144,7 @@ EXPORT_SYMBOL(up_write);
142void downgrade_write(struct rw_semaphore *sem) 144void downgrade_write(struct rw_semaphore *sem)
143{ 145{
144 lock_downgrade(&sem->dep_map, _RET_IP_); 146 lock_downgrade(&sem->dep_map, _RET_IP_);
147 DEBUG_RWSEMS_WARN_ON(sem->owner != current);
145 148
146 rwsem_set_reader_owned(sem); 149 rwsem_set_reader_owned(sem);
147 __downgrade_write(sem); 150 __downgrade_write(sem);
@@ -211,6 +214,7 @@ EXPORT_SYMBOL(down_write_killable_nested);
211 214
212void up_read_non_owner(struct rw_semaphore *sem) 215void up_read_non_owner(struct rw_semaphore *sem)
213{ 216{
217 DEBUG_RWSEMS_WARN_ON(sem->owner != RWSEM_READER_OWNED);
214 __up_read(sem); 218 __up_read(sem);
215} 219}
216 220
diff --git a/kernel/locking/rwsem.h b/kernel/locking/rwsem.h
index a883b8f1fdc6..a17cba8d94bb 100644
--- a/kernel/locking/rwsem.h
+++ b/kernel/locking/rwsem.h
@@ -16,6 +16,12 @@
16 */ 16 */
17#define RWSEM_READER_OWNED ((struct task_struct *)1UL) 17#define RWSEM_READER_OWNED ((struct task_struct *)1UL)
18 18
19#ifdef CONFIG_DEBUG_RWSEMS
20# define DEBUG_RWSEMS_WARN_ON(c) DEBUG_LOCKS_WARN_ON(c)
21#else
22# define DEBUG_RWSEMS_WARN_ON(c)
23#endif
24
19#ifdef CONFIG_RWSEM_SPIN_ON_OWNER 25#ifdef CONFIG_RWSEM_SPIN_ON_OWNER
20/* 26/*
21 * All writes to owner are protected by WRITE_ONCE() to make sure that 27 * All writes to owner are protected by WRITE_ONCE() to make sure that
@@ -41,7 +47,7 @@ static inline void rwsem_set_reader_owned(struct rw_semaphore *sem)
41 * do a write to the rwsem cacheline when it is really necessary 47 * do a write to the rwsem cacheline when it is really necessary
42 * to minimize cacheline contention. 48 * to minimize cacheline contention.
43 */ 49 */
44 if (sem->owner != RWSEM_READER_OWNED) 50 if (READ_ONCE(sem->owner) != RWSEM_READER_OWNED)
45 WRITE_ONCE(sem->owner, RWSEM_READER_OWNED); 51 WRITE_ONCE(sem->owner, RWSEM_READER_OWNED);
46} 52}
47 53
diff --git a/kernel/module.c b/kernel/module.c
index e42764acedb4..a6e43a5806a1 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -2181,10 +2181,6 @@ static void free_module(struct module *mod)
2181 /* Finally, free the core (containing the module structure) */ 2181 /* Finally, free the core (containing the module structure) */
2182 disable_ro_nx(&mod->core_layout); 2182 disable_ro_nx(&mod->core_layout);
2183 module_memfree(mod->core_layout.base); 2183 module_memfree(mod->core_layout.base);
2184
2185#ifdef CONFIG_MPU
2186 update_protections(current->mm);
2187#endif
2188} 2184}
2189 2185
2190void *__symbol_get(const char *symbol) 2186void *__symbol_get(const char *symbol)
diff --git a/kernel/panic.c b/kernel/panic.c
index 4b794f1d8561..9d833d913c84 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -289,7 +289,7 @@ void panic(const char *fmt, ...)
289 disabled_wait(caller); 289 disabled_wait(caller);
290 } 290 }
291#endif 291#endif
292 pr_emerg("---[ end Kernel panic - not syncing: %s\n", buf); 292 pr_emerg("---[ end Kernel panic - not syncing: %s ]---\n", buf);
293 local_irq_enable(); 293 local_irq_enable();
294 for (i = 0; ; i += PANIC_TIMER_STEP) { 294 for (i = 0; ; i += PANIC_TIMER_STEP) {
295 touch_softlockup_watchdog(); 295 touch_softlockup_watchdog();
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
index 0b53eef7d34b..93b57f026688 100644
--- a/kernel/pid_namespace.c
+++ b/kernel/pid_namespace.c
@@ -242,16 +242,16 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns)
242 242
243 /* 243 /*
244 * Reap the EXIT_ZOMBIE children we had before we ignored SIGCHLD. 244 * Reap the EXIT_ZOMBIE children we had before we ignored SIGCHLD.
245 * sys_wait4() will also block until our children traced from the 245 * kernel_wait4() will also block until our children traced from the
246 * parent namespace are detached and become EXIT_DEAD. 246 * parent namespace are detached and become EXIT_DEAD.
247 */ 247 */
248 do { 248 do {
249 clear_thread_flag(TIF_SIGPENDING); 249 clear_thread_flag(TIF_SIGPENDING);
250 rc = sys_wait4(-1, NULL, __WALL, NULL); 250 rc = kernel_wait4(-1, NULL, __WALL, NULL);
251 } while (rc != -ECHILD); 251 } while (rc != -ECHILD);
252 252
253 /* 253 /*
254 * sys_wait4() above can't reap the EXIT_DEAD children but we do not 254 * kernel_wait4() above can't reap the EXIT_DEAD children but we do not
255 * really care, we could reparent them to the global init. We could 255 * really care, we could reparent them to the global init. We could
256 * exit and reap ->child_reaper even if it is not the last thread in 256 * exit and reap ->child_reaper even if it is not the last thread in
257 * this pid_ns, free_pid(pid_allocated == 0) calls proc_cleanup_work(), 257 * this pid_ns, free_pid(pid_allocated == 0) calls proc_cleanup_work(),
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index a5c36e9c56a6..4710f1b142fc 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -701,7 +701,7 @@ int hibernate(void)
701 } 701 }
702 702
703 pr_info("Syncing filesystems ... \n"); 703 pr_info("Syncing filesystems ... \n");
704 sys_sync(); 704 ksys_sync();
705 pr_info("done.\n"); 705 pr_info("done.\n");
706 706
707 error = freeze_processes(); 707 error = freeze_processes();
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
index 0685c4499431..4c10be0f4843 100644
--- a/kernel/power/suspend.c
+++ b/kernel/power/suspend.c
@@ -560,7 +560,7 @@ static int enter_state(suspend_state_t state)
560#ifndef CONFIG_SUSPEND_SKIP_SYNC 560#ifndef CONFIG_SUSPEND_SKIP_SYNC
561 trace_suspend_resume(TPS("sync_filesystems"), 0, true); 561 trace_suspend_resume(TPS("sync_filesystems"), 0, true);
562 pr_info("Syncing filesystems ... "); 562 pr_info("Syncing filesystems ... ");
563 sys_sync(); 563 ksys_sync();
564 pr_cont("done.\n"); 564 pr_cont("done.\n");
565 trace_suspend_resume(TPS("sync_filesystems"), 0, false); 565 trace_suspend_resume(TPS("sync_filesystems"), 0, false);
566#endif 566#endif
diff --git a/kernel/power/user.c b/kernel/power/user.c
index 22df9f7ff672..75c959de4b29 100644
--- a/kernel/power/user.c
+++ b/kernel/power/user.c
@@ -224,7 +224,7 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd,
224 break; 224 break;
225 225
226 printk("Syncing filesystems ... "); 226 printk("Syncing filesystems ... ");
227 sys_sync(); 227 ksys_sync();
228 printk("done.\n"); 228 printk("done.\n");
229 229
230 error = freeze_processes(); 230 error = freeze_processes();
diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h
index 6334f2c1abd0..7a693e31184a 100644
--- a/kernel/rcu/rcu.h
+++ b/kernel/rcu/rcu.h
@@ -77,12 +77,18 @@ static inline void rcu_seq_start(unsigned long *sp)
77 WARN_ON_ONCE(rcu_seq_state(*sp) != 1); 77 WARN_ON_ONCE(rcu_seq_state(*sp) != 1);
78} 78}
79 79
80/* Compute the end-of-grace-period value for the specified sequence number. */
81static inline unsigned long rcu_seq_endval(unsigned long *sp)
82{
83 return (*sp | RCU_SEQ_STATE_MASK) + 1;
84}
85
80/* Adjust sequence number for end of update-side operation. */ 86/* Adjust sequence number for end of update-side operation. */
81static inline void rcu_seq_end(unsigned long *sp) 87static inline void rcu_seq_end(unsigned long *sp)
82{ 88{
83 smp_mb(); /* Ensure update-side operation before counter increment. */ 89 smp_mb(); /* Ensure update-side operation before counter increment. */
84 WARN_ON_ONCE(!rcu_seq_state(*sp)); 90 WARN_ON_ONCE(!rcu_seq_state(*sp));
85 WRITE_ONCE(*sp, (*sp | RCU_SEQ_STATE_MASK) + 1); 91 WRITE_ONCE(*sp, rcu_seq_endval(sp));
86} 92}
87 93
88/* Take a snapshot of the update side's sequence number. */ 94/* Take a snapshot of the update side's sequence number. */
@@ -295,9 +301,19 @@ static inline void rcu_init_levelspread(int *levelspread, const int *levelcnt)
295 * Iterate over all possible CPUs in a leaf RCU node. 301 * Iterate over all possible CPUs in a leaf RCU node.
296 */ 302 */
297#define for_each_leaf_node_possible_cpu(rnp, cpu) \ 303#define for_each_leaf_node_possible_cpu(rnp, cpu) \
298 for ((cpu) = cpumask_next(rnp->grplo - 1, cpu_possible_mask); \ 304 for ((cpu) = cpumask_next((rnp)->grplo - 1, cpu_possible_mask); \
299 cpu <= rnp->grphi; \ 305 (cpu) <= rnp->grphi; \
300 cpu = cpumask_next((cpu), cpu_possible_mask)) 306 (cpu) = cpumask_next((cpu), cpu_possible_mask))
307
308/*
309 * Iterate over all CPUs in a leaf RCU node's specified mask.
310 */
311#define rcu_find_next_bit(rnp, cpu, mask) \
312 ((rnp)->grplo + find_next_bit(&(mask), BITS_PER_LONG, (cpu)))
313#define for_each_leaf_node_cpu_mask(rnp, cpu, mask) \
314 for ((cpu) = rcu_find_next_bit((rnp), 0, (mask)); \
315 (cpu) <= rnp->grphi; \
316 (cpu) = rcu_find_next_bit((rnp), (cpu) + 1 - (rnp->grplo), (mask)))
301 317
302/* 318/*
303 * Wrappers for the rcu_node::lock acquire and release. 319 * Wrappers for the rcu_node::lock acquire and release.
@@ -337,7 +353,7 @@ do { \
337} while (0) 353} while (0)
338 354
339#define raw_spin_unlock_irqrestore_rcu_node(p, flags) \ 355#define raw_spin_unlock_irqrestore_rcu_node(p, flags) \
340 raw_spin_unlock_irqrestore(&ACCESS_PRIVATE(p, lock), flags) \ 356 raw_spin_unlock_irqrestore(&ACCESS_PRIVATE(p, lock), flags)
341 357
342#define raw_spin_trylock_rcu_node(p) \ 358#define raw_spin_trylock_rcu_node(p) \
343({ \ 359({ \
@@ -348,6 +364,9 @@ do { \
348 ___locked; \ 364 ___locked; \
349}) 365})
350 366
367#define raw_lockdep_assert_held_rcu_node(p) \
368 lockdep_assert_held(&ACCESS_PRIVATE(p, lock))
369
351#endif /* #if defined(SRCU) || !defined(TINY_RCU) */ 370#endif /* #if defined(SRCU) || !defined(TINY_RCU) */
352 371
353#ifdef CONFIG_TINY_RCU 372#ifdef CONFIG_TINY_RCU
@@ -356,24 +375,20 @@ static inline bool rcu_gp_is_normal(void) { return true; }
356static inline bool rcu_gp_is_expedited(void) { return false; } 375static inline bool rcu_gp_is_expedited(void) { return false; }
357static inline void rcu_expedite_gp(void) { } 376static inline void rcu_expedite_gp(void) { }
358static inline void rcu_unexpedite_gp(void) { } 377static inline void rcu_unexpedite_gp(void) { }
378static inline void rcu_request_urgent_qs_task(struct task_struct *t) { }
359#else /* #ifdef CONFIG_TINY_RCU */ 379#else /* #ifdef CONFIG_TINY_RCU */
360bool rcu_gp_is_normal(void); /* Internal RCU use. */ 380bool rcu_gp_is_normal(void); /* Internal RCU use. */
361bool rcu_gp_is_expedited(void); /* Internal RCU use. */ 381bool rcu_gp_is_expedited(void); /* Internal RCU use. */
362void rcu_expedite_gp(void); 382void rcu_expedite_gp(void);
363void rcu_unexpedite_gp(void); 383void rcu_unexpedite_gp(void);
364void rcupdate_announce_bootup_oddness(void); 384void rcupdate_announce_bootup_oddness(void);
385void rcu_request_urgent_qs_task(struct task_struct *t);
365#endif /* #else #ifdef CONFIG_TINY_RCU */ 386#endif /* #else #ifdef CONFIG_TINY_RCU */
366 387
367#define RCU_SCHEDULER_INACTIVE 0 388#define RCU_SCHEDULER_INACTIVE 0
368#define RCU_SCHEDULER_INIT 1 389#define RCU_SCHEDULER_INIT 1
369#define RCU_SCHEDULER_RUNNING 2 390#define RCU_SCHEDULER_RUNNING 2
370 391
371#ifdef CONFIG_TINY_RCU
372static inline void rcu_request_urgent_qs_task(struct task_struct *t) { }
373#else /* #ifdef CONFIG_TINY_RCU */
374void rcu_request_urgent_qs_task(struct task_struct *t);
375#endif /* #else #ifdef CONFIG_TINY_RCU */
376
377enum rcutorture_type { 392enum rcutorture_type {
378 RCU_FLAVOR, 393 RCU_FLAVOR,
379 RCU_BH_FLAVOR, 394 RCU_BH_FLAVOR,
@@ -470,6 +485,7 @@ void show_rcu_gp_kthreads(void);
470void rcu_force_quiescent_state(void); 485void rcu_force_quiescent_state(void);
471void rcu_bh_force_quiescent_state(void); 486void rcu_bh_force_quiescent_state(void);
472void rcu_sched_force_quiescent_state(void); 487void rcu_sched_force_quiescent_state(void);
488extern struct workqueue_struct *rcu_gp_wq;
473#endif /* #else #ifdef CONFIG_TINY_RCU */ 489#endif /* #else #ifdef CONFIG_TINY_RCU */
474 490
475#ifdef CONFIG_RCU_NOCB_CPU 491#ifdef CONFIG_RCU_NOCB_CPU
diff --git a/kernel/rcu/rcuperf.c b/kernel/rcu/rcuperf.c
index d1ebdf9868bb..777e7a6a0292 100644
--- a/kernel/rcu/rcuperf.c
+++ b/kernel/rcu/rcuperf.c
@@ -61,11 +61,30 @@ MODULE_AUTHOR("Paul E. McKenney <paulmck@linux.vnet.ibm.com>");
61#define VERBOSE_PERFOUT_ERRSTRING(s) \ 61#define VERBOSE_PERFOUT_ERRSTRING(s) \
62 do { if (verbose) pr_alert("%s" PERF_FLAG "!!! %s\n", perf_type, s); } while (0) 62 do { if (verbose) pr_alert("%s" PERF_FLAG "!!! %s\n", perf_type, s); } while (0)
63 63
64/*
65 * The intended use cases for the nreaders and nwriters module parameters
66 * are as follows:
67 *
68 * 1. Specify only the nr_cpus kernel boot parameter. This will
69 * set both nreaders and nwriters to the value specified by
70 * nr_cpus for a mixed reader/writer test.
71 *
72 * 2. Specify the nr_cpus kernel boot parameter, but set
73 * rcuperf.nreaders to zero. This will set nwriters to the
74 * value specified by nr_cpus for an update-only test.
75 *
76 * 3. Specify the nr_cpus kernel boot parameter, but set
77 * rcuperf.nwriters to zero. This will set nreaders to the
78 * value specified by nr_cpus for a read-only test.
79 *
80 * Various other use cases may of course be specified.
81 */
82
64torture_param(bool, gp_async, false, "Use asynchronous GP wait primitives"); 83torture_param(bool, gp_async, false, "Use asynchronous GP wait primitives");
65torture_param(int, gp_async_max, 1000, "Max # outstanding waits per reader"); 84torture_param(int, gp_async_max, 1000, "Max # outstanding waits per reader");
66torture_param(bool, gp_exp, false, "Use expedited GP wait primitives"); 85torture_param(bool, gp_exp, false, "Use expedited GP wait primitives");
67torture_param(int, holdoff, 10, "Holdoff time before test start (s)"); 86torture_param(int, holdoff, 10, "Holdoff time before test start (s)");
68torture_param(int, nreaders, 0, "Number of RCU reader threads"); 87torture_param(int, nreaders, -1, "Number of RCU reader threads");
69torture_param(int, nwriters, -1, "Number of RCU updater threads"); 88torture_param(int, nwriters, -1, "Number of RCU updater threads");
70torture_param(bool, shutdown, !IS_ENABLED(MODULE), 89torture_param(bool, shutdown, !IS_ENABLED(MODULE),
71 "Shutdown at end of performance tests."); 90 "Shutdown at end of performance tests.");
diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
index 308e6fdbced8..680c96d8c00f 100644
--- a/kernel/rcu/rcutorture.c
+++ b/kernel/rcu/rcutorture.c
@@ -909,34 +909,38 @@ rcu_torture_writer(void *arg)
909 int nsynctypes = 0; 909 int nsynctypes = 0;
910 910
911 VERBOSE_TOROUT_STRING("rcu_torture_writer task started"); 911 VERBOSE_TOROUT_STRING("rcu_torture_writer task started");
912 if (!can_expedite) { 912 if (!can_expedite)
913 pr_alert("%s" TORTURE_FLAG 913 pr_alert("%s" TORTURE_FLAG
914 " GP expediting controlled from boot/sysfs for %s,\n", 914 " GP expediting controlled from boot/sysfs for %s.\n",
915 torture_type, cur_ops->name); 915 torture_type, cur_ops->name);
916 pr_alert("%s" TORTURE_FLAG
917 " Disabled dynamic grace-period expediting.\n",
918 torture_type);
919 }
920 916
921 /* Initialize synctype[] array. If none set, take default. */ 917 /* Initialize synctype[] array. If none set, take default. */
922 if (!gp_cond1 && !gp_exp1 && !gp_normal1 && !gp_sync1) 918 if (!gp_cond1 && !gp_exp1 && !gp_normal1 && !gp_sync1)
923 gp_cond1 = gp_exp1 = gp_normal1 = gp_sync1 = true; 919 gp_cond1 = gp_exp1 = gp_normal1 = gp_sync1 = true;
924 if (gp_cond1 && cur_ops->get_state && cur_ops->cond_sync) 920 if (gp_cond1 && cur_ops->get_state && cur_ops->cond_sync) {
925 synctype[nsynctypes++] = RTWS_COND_GET; 921 synctype[nsynctypes++] = RTWS_COND_GET;
926 else if (gp_cond && (!cur_ops->get_state || !cur_ops->cond_sync)) 922 pr_info("%s: Testing conditional GPs.\n", __func__);
927 pr_alert("rcu_torture_writer: gp_cond without primitives.\n"); 923 } else if (gp_cond && (!cur_ops->get_state || !cur_ops->cond_sync)) {
928 if (gp_exp1 && cur_ops->exp_sync) 924 pr_alert("%s: gp_cond without primitives.\n", __func__);
925 }
926 if (gp_exp1 && cur_ops->exp_sync) {
929 synctype[nsynctypes++] = RTWS_EXP_SYNC; 927 synctype[nsynctypes++] = RTWS_EXP_SYNC;
930 else if (gp_exp && !cur_ops->exp_sync) 928 pr_info("%s: Testing expedited GPs.\n", __func__);
931 pr_alert("rcu_torture_writer: gp_exp without primitives.\n"); 929 } else if (gp_exp && !cur_ops->exp_sync) {
932 if (gp_normal1 && cur_ops->deferred_free) 930 pr_alert("%s: gp_exp without primitives.\n", __func__);
931 }
932 if (gp_normal1 && cur_ops->deferred_free) {
933 synctype[nsynctypes++] = RTWS_DEF_FREE; 933 synctype[nsynctypes++] = RTWS_DEF_FREE;
934 else if (gp_normal && !cur_ops->deferred_free) 934 pr_info("%s: Testing asynchronous GPs.\n", __func__);
935 pr_alert("rcu_torture_writer: gp_normal without primitives.\n"); 935 } else if (gp_normal && !cur_ops->deferred_free) {
936 if (gp_sync1 && cur_ops->sync) 936 pr_alert("%s: gp_normal without primitives.\n", __func__);
937 }
938 if (gp_sync1 && cur_ops->sync) {
937 synctype[nsynctypes++] = RTWS_SYNC; 939 synctype[nsynctypes++] = RTWS_SYNC;
938 else if (gp_sync && !cur_ops->sync) 940 pr_info("%s: Testing normal GPs.\n", __func__);
939 pr_alert("rcu_torture_writer: gp_sync without primitives.\n"); 941 } else if (gp_sync && !cur_ops->sync) {
942 pr_alert("%s: gp_sync without primitives.\n", __func__);
943 }
940 if (WARN_ONCE(nsynctypes == 0, 944 if (WARN_ONCE(nsynctypes == 0,
941 "rcu_torture_writer: No update-side primitives.\n")) { 945 "rcu_torture_writer: No update-side primitives.\n")) {
942 /* 946 /*
@@ -1011,6 +1015,9 @@ rcu_torture_writer(void *arg)
1011 rcu_unexpedite_gp(); 1015 rcu_unexpedite_gp();
1012 if (++expediting > 3) 1016 if (++expediting > 3)
1013 expediting = -expediting; 1017 expediting = -expediting;
1018 } else if (!can_expedite) { /* Disabled during boot, recheck. */
1019 can_expedite = !rcu_gp_is_expedited() &&
1020 !rcu_gp_is_normal();
1014 } 1021 }
1015 rcu_torture_writer_state = RTWS_STUTTER; 1022 rcu_torture_writer_state = RTWS_STUTTER;
1016 stutter_wait("rcu_torture_writer"); 1023 stutter_wait("rcu_torture_writer");
@@ -1021,6 +1028,10 @@ rcu_torture_writer(void *arg)
1021 while (can_expedite && expediting++ < 0) 1028 while (can_expedite && expediting++ < 0)
1022 rcu_unexpedite_gp(); 1029 rcu_unexpedite_gp();
1023 WARN_ON_ONCE(can_expedite && rcu_gp_is_expedited()); 1030 WARN_ON_ONCE(can_expedite && rcu_gp_is_expedited());
1031 if (!can_expedite)
1032 pr_alert("%s" TORTURE_FLAG
1033 " Dynamic grace-period expediting was disabled.\n",
1034 torture_type);
1024 rcu_torture_writer_state = RTWS_STOPPING; 1035 rcu_torture_writer_state = RTWS_STOPPING;
1025 torture_kthread_stopping("rcu_torture_writer"); 1036 torture_kthread_stopping("rcu_torture_writer");
1026 return 0; 1037 return 0;
@@ -1045,13 +1056,13 @@ rcu_torture_fakewriter(void *arg)
1045 torture_random(&rand) % (nfakewriters * 8) == 0) { 1056 torture_random(&rand) % (nfakewriters * 8) == 0) {
1046 cur_ops->cb_barrier(); 1057 cur_ops->cb_barrier();
1047 } else if (gp_normal == gp_exp) { 1058 } else if (gp_normal == gp_exp) {
1048 if (torture_random(&rand) & 0x80) 1059 if (cur_ops->sync && torture_random(&rand) & 0x80)
1049 cur_ops->sync(); 1060 cur_ops->sync();
1050 else 1061 else if (cur_ops->exp_sync)
1051 cur_ops->exp_sync(); 1062 cur_ops->exp_sync();
1052 } else if (gp_normal) { 1063 } else if (gp_normal && cur_ops->sync) {
1053 cur_ops->sync(); 1064 cur_ops->sync();
1054 } else { 1065 } else if (cur_ops->exp_sync) {
1055 cur_ops->exp_sync(); 1066 cur_ops->exp_sync();
1056 } 1067 }
1057 stutter_wait("rcu_torture_fakewriter"); 1068 stutter_wait("rcu_torture_fakewriter");
@@ -1557,11 +1568,10 @@ static int rcu_torture_barrier_init(void)
1557 atomic_set(&barrier_cbs_count, 0); 1568 atomic_set(&barrier_cbs_count, 0);
1558 atomic_set(&barrier_cbs_invoked, 0); 1569 atomic_set(&barrier_cbs_invoked, 0);
1559 barrier_cbs_tasks = 1570 barrier_cbs_tasks =
1560 kzalloc(n_barrier_cbs * sizeof(barrier_cbs_tasks[0]), 1571 kcalloc(n_barrier_cbs, sizeof(barrier_cbs_tasks[0]),
1561 GFP_KERNEL); 1572 GFP_KERNEL);
1562 barrier_cbs_wq = 1573 barrier_cbs_wq =
1563 kzalloc(n_barrier_cbs * sizeof(barrier_cbs_wq[0]), 1574 kcalloc(n_barrier_cbs, sizeof(barrier_cbs_wq[0]), GFP_KERNEL);
1564 GFP_KERNEL);
1565 if (barrier_cbs_tasks == NULL || !barrier_cbs_wq) 1575 if (barrier_cbs_tasks == NULL || !barrier_cbs_wq)
1566 return -ENOMEM; 1576 return -ENOMEM;
1567 for (i = 0; i < n_barrier_cbs; i++) { 1577 for (i = 0; i < n_barrier_cbs; i++) {
@@ -1674,7 +1684,7 @@ static void rcu_torture_err_cb(struct rcu_head *rhp)
1674 * next grace period. Unlikely, but can happen. If it 1684 * next grace period. Unlikely, but can happen. If it
1675 * does happen, the debug-objects subsystem won't have splatted. 1685 * does happen, the debug-objects subsystem won't have splatted.
1676 */ 1686 */
1677 pr_alert("rcutorture: duplicated callback was invoked.\n"); 1687 pr_alert("%s: duplicated callback was invoked.\n", KBUILD_MODNAME);
1678} 1688}
1679#endif /* #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */ 1689#endif /* #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */
1680 1690
@@ -1691,7 +1701,7 @@ static void rcu_test_debug_objects(void)
1691 1701
1692 init_rcu_head_on_stack(&rh1); 1702 init_rcu_head_on_stack(&rh1);
1693 init_rcu_head_on_stack(&rh2); 1703 init_rcu_head_on_stack(&rh2);
1694 pr_alert("rcutorture: WARN: Duplicate call_rcu() test starting.\n"); 1704 pr_alert("%s: WARN: Duplicate call_rcu() test starting.\n", KBUILD_MODNAME);
1695 1705
1696 /* Try to queue the rh2 pair of callbacks for the same grace period. */ 1706 /* Try to queue the rh2 pair of callbacks for the same grace period. */
1697 preempt_disable(); /* Prevent preemption from interrupting test. */ 1707 preempt_disable(); /* Prevent preemption from interrupting test. */
@@ -1706,11 +1716,11 @@ static void rcu_test_debug_objects(void)
1706 1716
1707 /* Wait for them all to get done so we can safely return. */ 1717 /* Wait for them all to get done so we can safely return. */
1708 rcu_barrier(); 1718 rcu_barrier();
1709 pr_alert("rcutorture: WARN: Duplicate call_rcu() test complete.\n"); 1719 pr_alert("%s: WARN: Duplicate call_rcu() test complete.\n", KBUILD_MODNAME);
1710 destroy_rcu_head_on_stack(&rh1); 1720 destroy_rcu_head_on_stack(&rh1);
1711 destroy_rcu_head_on_stack(&rh2); 1721 destroy_rcu_head_on_stack(&rh2);
1712#else /* #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */ 1722#else /* #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */
1713 pr_alert("rcutorture: !CONFIG_DEBUG_OBJECTS_RCU_HEAD, not testing duplicate call_rcu()\n"); 1723 pr_alert("%s: !CONFIG_DEBUG_OBJECTS_RCU_HEAD, not testing duplicate call_rcu()\n", KBUILD_MODNAME);
1714#endif /* #else #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */ 1724#endif /* #else #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */
1715} 1725}
1716 1726
@@ -1799,7 +1809,7 @@ rcu_torture_init(void)
1799 if (firsterr) 1809 if (firsterr)
1800 goto unwind; 1810 goto unwind;
1801 if (nfakewriters > 0) { 1811 if (nfakewriters > 0) {
1802 fakewriter_tasks = kzalloc(nfakewriters * 1812 fakewriter_tasks = kcalloc(nfakewriters,
1803 sizeof(fakewriter_tasks[0]), 1813 sizeof(fakewriter_tasks[0]),
1804 GFP_KERNEL); 1814 GFP_KERNEL);
1805 if (fakewriter_tasks == NULL) { 1815 if (fakewriter_tasks == NULL) {
@@ -1814,7 +1824,7 @@ rcu_torture_init(void)
1814 if (firsterr) 1824 if (firsterr)
1815 goto unwind; 1825 goto unwind;
1816 } 1826 }
1817 reader_tasks = kzalloc(nrealreaders * sizeof(reader_tasks[0]), 1827 reader_tasks = kcalloc(nrealreaders, sizeof(reader_tasks[0]),
1818 GFP_KERNEL); 1828 GFP_KERNEL);
1819 if (reader_tasks == NULL) { 1829 if (reader_tasks == NULL) {
1820 VERBOSE_TOROUT_ERRSTRING("out of memory"); 1830 VERBOSE_TOROUT_ERRSTRING("out of memory");
diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c
index d5cea81378cc..fb560fca9ef4 100644
--- a/kernel/rcu/srcutree.c
+++ b/kernel/rcu/srcutree.c
@@ -386,7 +386,7 @@ void cleanup_srcu_struct(struct srcu_struct *sp)
386 flush_delayed_work(&per_cpu_ptr(sp->sda, cpu)->work); 386 flush_delayed_work(&per_cpu_ptr(sp->sda, cpu)->work);
387 if (WARN_ON(rcu_seq_state(READ_ONCE(sp->srcu_gp_seq)) != SRCU_STATE_IDLE) || 387 if (WARN_ON(rcu_seq_state(READ_ONCE(sp->srcu_gp_seq)) != SRCU_STATE_IDLE) ||
388 WARN_ON(srcu_readers_active(sp))) { 388 WARN_ON(srcu_readers_active(sp))) {
389 pr_info("cleanup_srcu_struct: Active srcu_struct %p state: %d\n", sp, rcu_seq_state(READ_ONCE(sp->srcu_gp_seq))); 389 pr_info("%s: Active srcu_struct %p state: %d\n", __func__, sp, rcu_seq_state(READ_ONCE(sp->srcu_gp_seq)));
390 return; /* Caller forgot to stop doing call_srcu()? */ 390 return; /* Caller forgot to stop doing call_srcu()? */
391 } 391 }
392 free_percpu(sp->sda); 392 free_percpu(sp->sda);
@@ -439,7 +439,7 @@ static void srcu_gp_start(struct srcu_struct *sp)
439 struct srcu_data *sdp = this_cpu_ptr(sp->sda); 439 struct srcu_data *sdp = this_cpu_ptr(sp->sda);
440 int state; 440 int state;
441 441
442 lockdep_assert_held(&sp->lock); 442 lockdep_assert_held(&ACCESS_PRIVATE(sp, lock));
443 WARN_ON_ONCE(ULONG_CMP_GE(sp->srcu_gp_seq, sp->srcu_gp_seq_needed)); 443 WARN_ON_ONCE(ULONG_CMP_GE(sp->srcu_gp_seq, sp->srcu_gp_seq_needed));
444 rcu_segcblist_advance(&sdp->srcu_cblist, 444 rcu_segcblist_advance(&sdp->srcu_cblist,
445 rcu_seq_current(&sp->srcu_gp_seq)); 445 rcu_seq_current(&sp->srcu_gp_seq));
@@ -492,8 +492,7 @@ static bool srcu_queue_delayed_work_on(int cpu, struct workqueue_struct *wq,
492 */ 492 */
493static void srcu_schedule_cbs_sdp(struct srcu_data *sdp, unsigned long delay) 493static void srcu_schedule_cbs_sdp(struct srcu_data *sdp, unsigned long delay)
494{ 494{
495 srcu_queue_delayed_work_on(sdp->cpu, system_power_efficient_wq, 495 srcu_queue_delayed_work_on(sdp->cpu, rcu_gp_wq, &sdp->work, delay);
496 &sdp->work, delay);
497} 496}
498 497
499/* 498/*
@@ -527,11 +526,11 @@ static void srcu_gp_end(struct srcu_struct *sp)
527{ 526{
528 unsigned long cbdelay; 527 unsigned long cbdelay;
529 bool cbs; 528 bool cbs;
529 bool last_lvl;
530 int cpu; 530 int cpu;
531 unsigned long flags; 531 unsigned long flags;
532 unsigned long gpseq; 532 unsigned long gpseq;
533 int idx; 533 int idx;
534 int idxnext;
535 unsigned long mask; 534 unsigned long mask;
536 struct srcu_data *sdp; 535 struct srcu_data *sdp;
537 struct srcu_node *snp; 536 struct srcu_node *snp;
@@ -555,11 +554,11 @@ static void srcu_gp_end(struct srcu_struct *sp)
555 554
556 /* Initiate callback invocation as needed. */ 555 /* Initiate callback invocation as needed. */
557 idx = rcu_seq_ctr(gpseq) % ARRAY_SIZE(snp->srcu_have_cbs); 556 idx = rcu_seq_ctr(gpseq) % ARRAY_SIZE(snp->srcu_have_cbs);
558 idxnext = (idx + 1) % ARRAY_SIZE(snp->srcu_have_cbs);
559 rcu_for_each_node_breadth_first(sp, snp) { 557 rcu_for_each_node_breadth_first(sp, snp) {
560 spin_lock_irq_rcu_node(snp); 558 spin_lock_irq_rcu_node(snp);
561 cbs = false; 559 cbs = false;
562 if (snp >= sp->level[rcu_num_lvls - 1]) 560 last_lvl = snp >= sp->level[rcu_num_lvls - 1];
561 if (last_lvl)
563 cbs = snp->srcu_have_cbs[idx] == gpseq; 562 cbs = snp->srcu_have_cbs[idx] == gpseq;
564 snp->srcu_have_cbs[idx] = gpseq; 563 snp->srcu_have_cbs[idx] = gpseq;
565 rcu_seq_set_state(&snp->srcu_have_cbs[idx], 1); 564 rcu_seq_set_state(&snp->srcu_have_cbs[idx], 1);
@@ -572,13 +571,16 @@ static void srcu_gp_end(struct srcu_struct *sp)
572 srcu_schedule_cbs_snp(sp, snp, mask, cbdelay); 571 srcu_schedule_cbs_snp(sp, snp, mask, cbdelay);
573 572
574 /* Occasionally prevent srcu_data counter wrap. */ 573 /* Occasionally prevent srcu_data counter wrap. */
575 if (!(gpseq & counter_wrap_check)) 574 if (!(gpseq & counter_wrap_check) && last_lvl)
576 for (cpu = snp->grplo; cpu <= snp->grphi; cpu++) { 575 for (cpu = snp->grplo; cpu <= snp->grphi; cpu++) {
577 sdp = per_cpu_ptr(sp->sda, cpu); 576 sdp = per_cpu_ptr(sp->sda, cpu);
578 spin_lock_irqsave_rcu_node(sdp, flags); 577 spin_lock_irqsave_rcu_node(sdp, flags);
579 if (ULONG_CMP_GE(gpseq, 578 if (ULONG_CMP_GE(gpseq,
580 sdp->srcu_gp_seq_needed + 100)) 579 sdp->srcu_gp_seq_needed + 100))
581 sdp->srcu_gp_seq_needed = gpseq; 580 sdp->srcu_gp_seq_needed = gpseq;
581 if (ULONG_CMP_GE(gpseq,
582 sdp->srcu_gp_seq_needed_exp + 100))
583 sdp->srcu_gp_seq_needed_exp = gpseq;
582 spin_unlock_irqrestore_rcu_node(sdp, flags); 584 spin_unlock_irqrestore_rcu_node(sdp, flags);
583 } 585 }
584 } 586 }
@@ -593,9 +595,7 @@ static void srcu_gp_end(struct srcu_struct *sp)
593 ULONG_CMP_LT(gpseq, sp->srcu_gp_seq_needed)) { 595 ULONG_CMP_LT(gpseq, sp->srcu_gp_seq_needed)) {
594 srcu_gp_start(sp); 596 srcu_gp_start(sp);
595 spin_unlock_irq_rcu_node(sp); 597 spin_unlock_irq_rcu_node(sp);
596 /* Throttle expedited grace periods: Should be rare! */ 598 srcu_reschedule(sp, 0);
597 srcu_reschedule(sp, rcu_seq_ctr(gpseq) & 0x3ff
598 ? 0 : SRCU_INTERVAL);
599 } else { 599 } else {
600 spin_unlock_irq_rcu_node(sp); 600 spin_unlock_irq_rcu_node(sp);
601 } 601 }
@@ -626,7 +626,7 @@ static void srcu_funnel_exp_start(struct srcu_struct *sp, struct srcu_node *snp,
626 spin_unlock_irqrestore_rcu_node(snp, flags); 626 spin_unlock_irqrestore_rcu_node(snp, flags);
627 } 627 }
628 spin_lock_irqsave_rcu_node(sp, flags); 628 spin_lock_irqsave_rcu_node(sp, flags);
629 if (!ULONG_CMP_LT(sp->srcu_gp_seq_needed_exp, s)) 629 if (ULONG_CMP_LT(sp->srcu_gp_seq_needed_exp, s))
630 sp->srcu_gp_seq_needed_exp = s; 630 sp->srcu_gp_seq_needed_exp = s;
631 spin_unlock_irqrestore_rcu_node(sp, flags); 631 spin_unlock_irqrestore_rcu_node(sp, flags);
632} 632}
@@ -691,8 +691,7 @@ static void srcu_funnel_gp_start(struct srcu_struct *sp, struct srcu_data *sdp,
691 rcu_seq_state(sp->srcu_gp_seq) == SRCU_STATE_IDLE) { 691 rcu_seq_state(sp->srcu_gp_seq) == SRCU_STATE_IDLE) {
692 WARN_ON_ONCE(ULONG_CMP_GE(sp->srcu_gp_seq, sp->srcu_gp_seq_needed)); 692 WARN_ON_ONCE(ULONG_CMP_GE(sp->srcu_gp_seq, sp->srcu_gp_seq_needed));
693 srcu_gp_start(sp); 693 srcu_gp_start(sp);
694 queue_delayed_work(system_power_efficient_wq, &sp->work, 694 queue_delayed_work(rcu_gp_wq, &sp->work, srcu_get_delay(sp));
695 srcu_get_delay(sp));
696 } 695 }
697 spin_unlock_irqrestore_rcu_node(sp, flags); 696 spin_unlock_irqrestore_rcu_node(sp, flags);
698} 697}
@@ -1225,7 +1224,7 @@ static void srcu_reschedule(struct srcu_struct *sp, unsigned long delay)
1225 spin_unlock_irq_rcu_node(sp); 1224 spin_unlock_irq_rcu_node(sp);
1226 1225
1227 if (pushgp) 1226 if (pushgp)
1228 queue_delayed_work(system_power_efficient_wq, &sp->work, delay); 1227 queue_delayed_work(rcu_gp_wq, &sp->work, delay);
1229} 1228}
1230 1229
1231/* 1230/*
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 491bdf39f276..2a734692a581 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -1161,7 +1161,7 @@ static int rcu_is_cpu_rrupt_from_idle(void)
1161 */ 1161 */
1162static void rcu_gpnum_ovf(struct rcu_node *rnp, struct rcu_data *rdp) 1162static void rcu_gpnum_ovf(struct rcu_node *rnp, struct rcu_data *rdp)
1163{ 1163{
1164 lockdep_assert_held(&rnp->lock); 1164 raw_lockdep_assert_held_rcu_node(rnp);
1165 if (ULONG_CMP_LT(READ_ONCE(rdp->gpnum) + ULONG_MAX / 4, rnp->gpnum)) 1165 if (ULONG_CMP_LT(READ_ONCE(rdp->gpnum) + ULONG_MAX / 4, rnp->gpnum))
1166 WRITE_ONCE(rdp->gpwrap, true); 1166 WRITE_ONCE(rdp->gpwrap, true);
1167 if (ULONG_CMP_LT(rdp->rcu_iw_gpnum + ULONG_MAX / 4, rnp->gpnum)) 1167 if (ULONG_CMP_LT(rdp->rcu_iw_gpnum + ULONG_MAX / 4, rnp->gpnum))
@@ -1350,6 +1350,7 @@ static void rcu_check_gp_kthread_starvation(struct rcu_state *rsp)
1350 rsp->gp_kthread ? rsp->gp_kthread->state : ~0, 1350 rsp->gp_kthread ? rsp->gp_kthread->state : ~0,
1351 rsp->gp_kthread ? task_cpu(rsp->gp_kthread) : -1); 1351 rsp->gp_kthread ? task_cpu(rsp->gp_kthread) : -1);
1352 if (rsp->gp_kthread) { 1352 if (rsp->gp_kthread) {
1353 pr_err("RCU grace-period kthread stack dump:\n");
1353 sched_show_task(rsp->gp_kthread); 1354 sched_show_task(rsp->gp_kthread);
1354 wake_up_process(rsp->gp_kthread); 1355 wake_up_process(rsp->gp_kthread);
1355 } 1356 }
@@ -1628,7 +1629,7 @@ void rcu_cpu_stall_reset(void)
1628static unsigned long rcu_cbs_completed(struct rcu_state *rsp, 1629static unsigned long rcu_cbs_completed(struct rcu_state *rsp,
1629 struct rcu_node *rnp) 1630 struct rcu_node *rnp)
1630{ 1631{
1631 lockdep_assert_held(&rnp->lock); 1632 raw_lockdep_assert_held_rcu_node(rnp);
1632 1633
1633 /* 1634 /*
1634 * If RCU is idle, we just wait for the next grace period. 1635 * If RCU is idle, we just wait for the next grace period.
@@ -1675,7 +1676,7 @@ rcu_start_future_gp(struct rcu_node *rnp, struct rcu_data *rdp,
1675 bool ret = false; 1676 bool ret = false;
1676 struct rcu_node *rnp_root = rcu_get_root(rdp->rsp); 1677 struct rcu_node *rnp_root = rcu_get_root(rdp->rsp);
1677 1678
1678 lockdep_assert_held(&rnp->lock); 1679 raw_lockdep_assert_held_rcu_node(rnp);
1679 1680
1680 /* 1681 /*
1681 * Pick up grace-period number for new callbacks. If this 1682 * Pick up grace-period number for new callbacks. If this
@@ -1803,7 +1804,7 @@ static bool rcu_accelerate_cbs(struct rcu_state *rsp, struct rcu_node *rnp,
1803{ 1804{
1804 bool ret = false; 1805 bool ret = false;
1805 1806
1806 lockdep_assert_held(&rnp->lock); 1807 raw_lockdep_assert_held_rcu_node(rnp);
1807 1808
1808 /* If no pending (not yet ready to invoke) callbacks, nothing to do. */ 1809 /* If no pending (not yet ready to invoke) callbacks, nothing to do. */
1809 if (!rcu_segcblist_pend_cbs(&rdp->cblist)) 1810 if (!rcu_segcblist_pend_cbs(&rdp->cblist))
@@ -1843,7 +1844,7 @@ static bool rcu_accelerate_cbs(struct rcu_state *rsp, struct rcu_node *rnp,
1843static bool rcu_advance_cbs(struct rcu_state *rsp, struct rcu_node *rnp, 1844static bool rcu_advance_cbs(struct rcu_state *rsp, struct rcu_node *rnp,
1844 struct rcu_data *rdp) 1845 struct rcu_data *rdp)
1845{ 1846{
1846 lockdep_assert_held(&rnp->lock); 1847 raw_lockdep_assert_held_rcu_node(rnp);
1847 1848
1848 /* If no pending (not yet ready to invoke) callbacks, nothing to do. */ 1849 /* If no pending (not yet ready to invoke) callbacks, nothing to do. */
1849 if (!rcu_segcblist_pend_cbs(&rdp->cblist)) 1850 if (!rcu_segcblist_pend_cbs(&rdp->cblist))
@@ -1871,7 +1872,7 @@ static bool __note_gp_changes(struct rcu_state *rsp, struct rcu_node *rnp,
1871 bool ret; 1872 bool ret;
1872 bool need_gp; 1873 bool need_gp;
1873 1874
1874 lockdep_assert_held(&rnp->lock); 1875 raw_lockdep_assert_held_rcu_node(rnp);
1875 1876
1876 /* Handle the ends of any preceding grace periods first. */ 1877 /* Handle the ends of any preceding grace periods first. */
1877 if (rdp->completed == rnp->completed && 1878 if (rdp->completed == rnp->completed &&
@@ -2296,7 +2297,7 @@ static bool
2296rcu_start_gp_advanced(struct rcu_state *rsp, struct rcu_node *rnp, 2297rcu_start_gp_advanced(struct rcu_state *rsp, struct rcu_node *rnp,
2297 struct rcu_data *rdp) 2298 struct rcu_data *rdp)
2298{ 2299{
2299 lockdep_assert_held(&rnp->lock); 2300 raw_lockdep_assert_held_rcu_node(rnp);
2300 if (!rsp->gp_kthread || !cpu_needs_another_gp(rsp, rdp)) { 2301 if (!rsp->gp_kthread || !cpu_needs_another_gp(rsp, rdp)) {
2301 /* 2302 /*
2302 * Either we have not yet spawned the grace-period 2303 * Either we have not yet spawned the grace-period
@@ -2358,7 +2359,7 @@ static bool rcu_start_gp(struct rcu_state *rsp)
2358static void rcu_report_qs_rsp(struct rcu_state *rsp, unsigned long flags) 2359static void rcu_report_qs_rsp(struct rcu_state *rsp, unsigned long flags)
2359 __releases(rcu_get_root(rsp)->lock) 2360 __releases(rcu_get_root(rsp)->lock)
2360{ 2361{
2361 lockdep_assert_held(&rcu_get_root(rsp)->lock); 2362 raw_lockdep_assert_held_rcu_node(rcu_get_root(rsp));
2362 WARN_ON_ONCE(!rcu_gp_in_progress(rsp)); 2363 WARN_ON_ONCE(!rcu_gp_in_progress(rsp));
2363 WRITE_ONCE(rsp->gp_flags, READ_ONCE(rsp->gp_flags) | RCU_GP_FLAG_FQS); 2364 WRITE_ONCE(rsp->gp_flags, READ_ONCE(rsp->gp_flags) | RCU_GP_FLAG_FQS);
2364 raw_spin_unlock_irqrestore_rcu_node(rcu_get_root(rsp), flags); 2365 raw_spin_unlock_irqrestore_rcu_node(rcu_get_root(rsp), flags);
@@ -2383,7 +2384,7 @@ rcu_report_qs_rnp(unsigned long mask, struct rcu_state *rsp,
2383 unsigned long oldmask = 0; 2384 unsigned long oldmask = 0;
2384 struct rcu_node *rnp_c; 2385 struct rcu_node *rnp_c;
2385 2386
2386 lockdep_assert_held(&rnp->lock); 2387 raw_lockdep_assert_held_rcu_node(rnp);
2387 2388
2388 /* Walk up the rcu_node hierarchy. */ 2389 /* Walk up the rcu_node hierarchy. */
2389 for (;;) { 2390 for (;;) {
@@ -2447,7 +2448,7 @@ static void rcu_report_unblock_qs_rnp(struct rcu_state *rsp,
2447 unsigned long mask; 2448 unsigned long mask;
2448 struct rcu_node *rnp_p; 2449 struct rcu_node *rnp_p;
2449 2450
2450 lockdep_assert_held(&rnp->lock); 2451 raw_lockdep_assert_held_rcu_node(rnp);
2451 if (rcu_state_p == &rcu_sched_state || rsp != rcu_state_p || 2452 if (rcu_state_p == &rcu_sched_state || rsp != rcu_state_p ||
2452 rnp->qsmask != 0 || rcu_preempt_blocked_readers_cgp(rnp)) { 2453 rnp->qsmask != 0 || rcu_preempt_blocked_readers_cgp(rnp)) {
2453 raw_spin_unlock_irqrestore_rcu_node(rnp, flags); 2454 raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
@@ -2592,7 +2593,7 @@ static void rcu_cleanup_dead_rnp(struct rcu_node *rnp_leaf)
2592 long mask; 2593 long mask;
2593 struct rcu_node *rnp = rnp_leaf; 2594 struct rcu_node *rnp = rnp_leaf;
2594 2595
2595 lockdep_assert_held(&rnp->lock); 2596 raw_lockdep_assert_held_rcu_node(rnp);
2596 if (!IS_ENABLED(CONFIG_HOTPLUG_CPU) || 2597 if (!IS_ENABLED(CONFIG_HOTPLUG_CPU) ||
2597 rnp->qsmaskinit || rcu_preempt_has_tasks(rnp)) 2598 rnp->qsmaskinit || rcu_preempt_has_tasks(rnp))
2598 return; 2599 return;
@@ -2691,7 +2692,6 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp)
2691 /* Update counts and requeue any remaining callbacks. */ 2692 /* Update counts and requeue any remaining callbacks. */
2692 rcu_segcblist_insert_done_cbs(&rdp->cblist, &rcl); 2693 rcu_segcblist_insert_done_cbs(&rdp->cblist, &rcl);
2693 smp_mb(); /* List handling before counting for rcu_barrier(). */ 2694 smp_mb(); /* List handling before counting for rcu_barrier(). */
2694 rdp->n_cbs_invoked += count;
2695 rcu_segcblist_insert_count(&rdp->cblist, &rcl); 2695 rcu_segcblist_insert_count(&rdp->cblist, &rcl);
2696 2696
2697 /* Reinstate batch limit if we have worked down the excess. */ 2697 /* Reinstate batch limit if we have worked down the excess. */
@@ -2845,10 +2845,8 @@ static void force_quiescent_state(struct rcu_state *rsp)
2845 !raw_spin_trylock(&rnp->fqslock); 2845 !raw_spin_trylock(&rnp->fqslock);
2846 if (rnp_old != NULL) 2846 if (rnp_old != NULL)
2847 raw_spin_unlock(&rnp_old->fqslock); 2847 raw_spin_unlock(&rnp_old->fqslock);
2848 if (ret) { 2848 if (ret)
2849 rsp->n_force_qs_lh++;
2850 return; 2849 return;
2851 }
2852 rnp_old = rnp; 2850 rnp_old = rnp;
2853 } 2851 }
2854 /* rnp_old == rcu_get_root(rsp), rnp == NULL. */ 2852 /* rnp_old == rcu_get_root(rsp), rnp == NULL. */
@@ -2857,7 +2855,6 @@ static void force_quiescent_state(struct rcu_state *rsp)
2857 raw_spin_lock_irqsave_rcu_node(rnp_old, flags); 2855 raw_spin_lock_irqsave_rcu_node(rnp_old, flags);
2858 raw_spin_unlock(&rnp_old->fqslock); 2856 raw_spin_unlock(&rnp_old->fqslock);
2859 if (READ_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) { 2857 if (READ_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) {
2860 rsp->n_force_qs_lh++;
2861 raw_spin_unlock_irqrestore_rcu_node(rnp_old, flags); 2858 raw_spin_unlock_irqrestore_rcu_node(rnp_old, flags);
2862 return; /* Someone beat us to it. */ 2859 return; /* Someone beat us to it. */
2863 } 2860 }
@@ -3355,8 +3352,6 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp)
3355{ 3352{
3356 struct rcu_node *rnp = rdp->mynode; 3353 struct rcu_node *rnp = rdp->mynode;
3357 3354
3358 rdp->n_rcu_pending++;
3359
3360 /* Check for CPU stalls, if enabled. */ 3355 /* Check for CPU stalls, if enabled. */
3361 check_cpu_stall(rsp, rdp); 3356 check_cpu_stall(rsp, rdp);
3362 3357
@@ -3365,48 +3360,31 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp)
3365 return 0; 3360 return 0;
3366 3361
3367 /* Is the RCU core waiting for a quiescent state from this CPU? */ 3362 /* Is the RCU core waiting for a quiescent state from this CPU? */
3368 if (rcu_scheduler_fully_active && 3363 if (rdp->core_needs_qs && !rdp->cpu_no_qs.b.norm)
3369 rdp->core_needs_qs && rdp->cpu_no_qs.b.norm &&
3370 rdp->rcu_qs_ctr_snap == __this_cpu_read(rcu_dynticks.rcu_qs_ctr)) {
3371 rdp->n_rp_core_needs_qs++;
3372 } else if (rdp->core_needs_qs && !rdp->cpu_no_qs.b.norm) {
3373 rdp->n_rp_report_qs++;
3374 return 1; 3364 return 1;
3375 }
3376 3365
3377 /* Does this CPU have callbacks ready to invoke? */ 3366 /* Does this CPU have callbacks ready to invoke? */
3378 if (rcu_segcblist_ready_cbs(&rdp->cblist)) { 3367 if (rcu_segcblist_ready_cbs(&rdp->cblist))
3379 rdp->n_rp_cb_ready++;
3380 return 1; 3368 return 1;
3381 }
3382 3369
3383 /* Has RCU gone idle with this CPU needing another grace period? */ 3370 /* Has RCU gone idle with this CPU needing another grace period? */
3384 if (cpu_needs_another_gp(rsp, rdp)) { 3371 if (cpu_needs_another_gp(rsp, rdp))
3385 rdp->n_rp_cpu_needs_gp++;
3386 return 1; 3372 return 1;
3387 }
3388 3373
3389 /* Has another RCU grace period completed? */ 3374 /* Has another RCU grace period completed? */
3390 if (READ_ONCE(rnp->completed) != rdp->completed) { /* outside lock */ 3375 if (READ_ONCE(rnp->completed) != rdp->completed) /* outside lock */
3391 rdp->n_rp_gp_completed++;
3392 return 1; 3376 return 1;
3393 }
3394 3377
3395 /* Has a new RCU grace period started? */ 3378 /* Has a new RCU grace period started? */
3396 if (READ_ONCE(rnp->gpnum) != rdp->gpnum || 3379 if (READ_ONCE(rnp->gpnum) != rdp->gpnum ||
3397 unlikely(READ_ONCE(rdp->gpwrap))) { /* outside lock */ 3380 unlikely(READ_ONCE(rdp->gpwrap))) /* outside lock */
3398 rdp->n_rp_gp_started++;
3399 return 1; 3381 return 1;
3400 }
3401 3382
3402 /* Does this CPU need a deferred NOCB wakeup? */ 3383 /* Does this CPU need a deferred NOCB wakeup? */
3403 if (rcu_nocb_need_deferred_wakeup(rdp)) { 3384 if (rcu_nocb_need_deferred_wakeup(rdp))
3404 rdp->n_rp_nocb_defer_wakeup++;
3405 return 1; 3385 return 1;
3406 }
3407 3386
3408 /* nothing to do */ 3387 /* nothing to do */
3409 rdp->n_rp_need_nothing++;
3410 return 0; 3388 return 0;
3411} 3389}
3412 3390
@@ -3618,7 +3596,7 @@ static void rcu_init_new_rnp(struct rcu_node *rnp_leaf)
3618 long mask; 3596 long mask;
3619 struct rcu_node *rnp = rnp_leaf; 3597 struct rcu_node *rnp = rnp_leaf;
3620 3598
3621 lockdep_assert_held(&rnp->lock); 3599 raw_lockdep_assert_held_rcu_node(rnp);
3622 for (;;) { 3600 for (;;) {
3623 mask = rnp->grpmask; 3601 mask = rnp->grpmask;
3624 rnp = rnp->parent; 3602 rnp = rnp->parent;
@@ -3636,12 +3614,9 @@ static void rcu_init_new_rnp(struct rcu_node *rnp_leaf)
3636static void __init 3614static void __init
3637rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp) 3615rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp)
3638{ 3616{
3639 unsigned long flags;
3640 struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); 3617 struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
3641 struct rcu_node *rnp = rcu_get_root(rsp);
3642 3618
3643 /* Set up local state, ensuring consistent view of global state. */ 3619 /* Set up local state, ensuring consistent view of global state. */
3644 raw_spin_lock_irqsave_rcu_node(rnp, flags);
3645 rdp->grpmask = leaf_node_cpu_bit(rdp->mynode, cpu); 3620 rdp->grpmask = leaf_node_cpu_bit(rdp->mynode, cpu);
3646 rdp->dynticks = &per_cpu(rcu_dynticks, cpu); 3621 rdp->dynticks = &per_cpu(rcu_dynticks, cpu);
3647 WARN_ON_ONCE(rdp->dynticks->dynticks_nesting != 1); 3622 WARN_ON_ONCE(rdp->dynticks->dynticks_nesting != 1);
@@ -3649,7 +3624,6 @@ rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp)
3649 rdp->cpu = cpu; 3624 rdp->cpu = cpu;
3650 rdp->rsp = rsp; 3625 rdp->rsp = rsp;
3651 rcu_boot_init_nocb_percpu_data(rdp); 3626 rcu_boot_init_nocb_percpu_data(rdp);
3652 raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
3653} 3627}
3654 3628
3655/* 3629/*
@@ -4193,6 +4167,8 @@ static void __init rcu_dump_rcu_node_tree(struct rcu_state *rsp)
4193 pr_cont("\n"); 4167 pr_cont("\n");
4194} 4168}
4195 4169
4170struct workqueue_struct *rcu_gp_wq;
4171
4196void __init rcu_init(void) 4172void __init rcu_init(void)
4197{ 4173{
4198 int cpu; 4174 int cpu;
@@ -4219,6 +4195,10 @@ void __init rcu_init(void)
4219 rcu_cpu_starting(cpu); 4195 rcu_cpu_starting(cpu);
4220 rcutree_online_cpu(cpu); 4196 rcutree_online_cpu(cpu);
4221 } 4197 }
4198
4199 /* Create workqueue for expedited GPs and for Tree SRCU. */
4200 rcu_gp_wq = alloc_workqueue("rcu_gp", WQ_MEM_RECLAIM, 0);
4201 WARN_ON(!rcu_gp_wq);
4222} 4202}
4223 4203
4224#include "tree_exp.h" 4204#include "tree_exp.h"
diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index 6488a3b0e729..f491ab4f2e8e 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -146,12 +146,6 @@ struct rcu_node {
146 /* boosting for this rcu_node structure. */ 146 /* boosting for this rcu_node structure. */
147 unsigned int boost_kthread_status; 147 unsigned int boost_kthread_status;
148 /* State of boost_kthread_task for tracing. */ 148 /* State of boost_kthread_task for tracing. */
149 unsigned long n_tasks_boosted;
150 /* Total number of tasks boosted. */
151 unsigned long n_exp_boosts;
152 /* Number of tasks boosted for expedited GP. */
153 unsigned long n_normal_boosts;
154 /* Number of tasks boosted for normal GP. */
155#ifdef CONFIG_RCU_NOCB_CPU 149#ifdef CONFIG_RCU_NOCB_CPU
156 struct swait_queue_head nocb_gp_wq[2]; 150 struct swait_queue_head nocb_gp_wq[2];
157 /* Place for rcu_nocb_kthread() to wait GP. */ 151 /* Place for rcu_nocb_kthread() to wait GP. */
@@ -184,13 +178,6 @@ union rcu_noqs {
184 u16 s; /* Set of bits, aggregate OR here. */ 178 u16 s; /* Set of bits, aggregate OR here. */
185}; 179};
186 180
187/* Index values for nxttail array in struct rcu_data. */
188#define RCU_DONE_TAIL 0 /* Also RCU_WAIT head. */
189#define RCU_WAIT_TAIL 1 /* Also RCU_NEXT_READY head. */
190#define RCU_NEXT_READY_TAIL 2 /* Also RCU_NEXT head. */
191#define RCU_NEXT_TAIL 3
192#define RCU_NEXT_SIZE 4
193
194/* Per-CPU data for read-copy update. */ 181/* Per-CPU data for read-copy update. */
195struct rcu_data { 182struct rcu_data {
196 /* 1) quiescent-state and grace-period handling : */ 183 /* 1) quiescent-state and grace-period handling : */
@@ -217,8 +204,6 @@ struct rcu_data {
217 /* different grace periods. */ 204 /* different grace periods. */
218 long qlen_last_fqs_check; 205 long qlen_last_fqs_check;
219 /* qlen at last check for QS forcing */ 206 /* qlen at last check for QS forcing */
220 unsigned long n_cbs_invoked; /* count of RCU cbs invoked. */
221 unsigned long n_nocbs_invoked; /* count of no-CBs RCU cbs invoked. */
222 unsigned long n_force_qs_snap; 207 unsigned long n_force_qs_snap;
223 /* did other CPU force QS recently? */ 208 /* did other CPU force QS recently? */
224 long blimit; /* Upper limit on a processed batch */ 209 long blimit; /* Upper limit on a processed batch */
@@ -234,18 +219,7 @@ struct rcu_data {
234 /* Grace period that needs help */ 219 /* Grace period that needs help */
235 /* from cond_resched(). */ 220 /* from cond_resched(). */
236 221
237 /* 5) __rcu_pending() statistics. */ 222 /* 5) _rcu_barrier(), OOM callbacks, and expediting. */
238 unsigned long n_rcu_pending; /* rcu_pending() calls since boot. */
239 unsigned long n_rp_core_needs_qs;
240 unsigned long n_rp_report_qs;
241 unsigned long n_rp_cb_ready;
242 unsigned long n_rp_cpu_needs_gp;
243 unsigned long n_rp_gp_completed;
244 unsigned long n_rp_gp_started;
245 unsigned long n_rp_nocb_defer_wakeup;
246 unsigned long n_rp_need_nothing;
247
248 /* 6) _rcu_barrier(), OOM callbacks, and expediting. */
249 struct rcu_head barrier_head; 223 struct rcu_head barrier_head;
250#ifdef CONFIG_RCU_FAST_NO_HZ 224#ifdef CONFIG_RCU_FAST_NO_HZ
251 struct rcu_head oom_head; 225 struct rcu_head oom_head;
@@ -256,7 +230,7 @@ struct rcu_data {
256 atomic_long_t exp_workdone3; /* # done by others #3. */ 230 atomic_long_t exp_workdone3; /* # done by others #3. */
257 int exp_dynticks_snap; /* Double-check need for IPI. */ 231 int exp_dynticks_snap; /* Double-check need for IPI. */
258 232
259 /* 7) Callback offloading. */ 233 /* 6) Callback offloading. */
260#ifdef CONFIG_RCU_NOCB_CPU 234#ifdef CONFIG_RCU_NOCB_CPU
261 struct rcu_head *nocb_head; /* CBs waiting for kthread. */ 235 struct rcu_head *nocb_head; /* CBs waiting for kthread. */
262 struct rcu_head **nocb_tail; 236 struct rcu_head **nocb_tail;
@@ -283,7 +257,7 @@ struct rcu_data {
283 /* Leader CPU takes GP-end wakeups. */ 257 /* Leader CPU takes GP-end wakeups. */
284#endif /* #ifdef CONFIG_RCU_NOCB_CPU */ 258#endif /* #ifdef CONFIG_RCU_NOCB_CPU */
285 259
286 /* 8) RCU CPU stall data. */ 260 /* 7) RCU CPU stall data. */
287 unsigned int softirq_snap; /* Snapshot of softirq activity. */ 261 unsigned int softirq_snap; /* Snapshot of softirq activity. */
288 /* ->rcu_iw* fields protected by leaf rcu_node ->lock. */ 262 /* ->rcu_iw* fields protected by leaf rcu_node ->lock. */
289 struct irq_work rcu_iw; /* Check for non-irq activity. */ 263 struct irq_work rcu_iw; /* Check for non-irq activity. */
@@ -374,10 +348,6 @@ struct rcu_state {
374 /* kthreads, if configured. */ 348 /* kthreads, if configured. */
375 unsigned long n_force_qs; /* Number of calls to */ 349 unsigned long n_force_qs; /* Number of calls to */
376 /* force_quiescent_state(). */ 350 /* force_quiescent_state(). */
377 unsigned long n_force_qs_lh; /* ~Number of calls leaving */
378 /* due to lock unavailable. */
379 unsigned long n_force_qs_ngp; /* Number of calls leaving */
380 /* due to no GP active. */
381 unsigned long gp_start; /* Time at which GP started, */ 351 unsigned long gp_start; /* Time at which GP started, */
382 /* but in jiffies. */ 352 /* but in jiffies. */
383 unsigned long gp_activity; /* Time of last GP kthread */ 353 unsigned long gp_activity; /* Time of last GP kthread */
diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h
index 46d61b597731..f72eefab8543 100644
--- a/kernel/rcu/tree_exp.h
+++ b/kernel/rcu/tree_exp.h
@@ -29,6 +29,15 @@ static void rcu_exp_gp_seq_start(struct rcu_state *rsp)
29} 29}
30 30
31/* 31/*
32 * Return then value that expedited-grace-period counter will have
33 * at the end of the current grace period.
34 */
35static __maybe_unused unsigned long rcu_exp_gp_seq_endval(struct rcu_state *rsp)
36{
37 return rcu_seq_endval(&rsp->expedited_sequence);
38}
39
40/*
32 * Record the end of an expedited grace period. 41 * Record the end of an expedited grace period.
33 */ 42 */
34static void rcu_exp_gp_seq_end(struct rcu_state *rsp) 43static void rcu_exp_gp_seq_end(struct rcu_state *rsp)
@@ -366,21 +375,30 @@ static void sync_rcu_exp_select_cpus(struct rcu_state *rsp,
366 int ret; 375 int ret;
367 struct rcu_node *rnp; 376 struct rcu_node *rnp;
368 377
378 trace_rcu_exp_grace_period(rsp->name, rcu_exp_gp_seq_endval(rsp), TPS("reset"));
369 sync_exp_reset_tree(rsp); 379 sync_exp_reset_tree(rsp);
380 trace_rcu_exp_grace_period(rsp->name, rcu_exp_gp_seq_endval(rsp), TPS("select"));
370 rcu_for_each_leaf_node(rsp, rnp) { 381 rcu_for_each_leaf_node(rsp, rnp) {
371 raw_spin_lock_irqsave_rcu_node(rnp, flags); 382 raw_spin_lock_irqsave_rcu_node(rnp, flags);
372 383
373 /* Each pass checks a CPU for identity, offline, and idle. */ 384 /* Each pass checks a CPU for identity, offline, and idle. */
374 mask_ofl_test = 0; 385 mask_ofl_test = 0;
375 for_each_leaf_node_possible_cpu(rnp, cpu) { 386 for_each_leaf_node_cpu_mask(rnp, cpu, rnp->expmask) {
387 unsigned long mask = leaf_node_cpu_bit(rnp, cpu);
376 struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); 388 struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
389 struct rcu_dynticks *rdtp = per_cpu_ptr(&rcu_dynticks, cpu);
390 int snap;
377 391
378 rdp->exp_dynticks_snap =
379 rcu_dynticks_snap(rdp->dynticks);
380 if (raw_smp_processor_id() == cpu || 392 if (raw_smp_processor_id() == cpu ||
381 rcu_dynticks_in_eqs(rdp->exp_dynticks_snap) || 393 !(rnp->qsmaskinitnext & mask)) {
382 !(rnp->qsmaskinitnext & rdp->grpmask)) 394 mask_ofl_test |= mask;
383 mask_ofl_test |= rdp->grpmask; 395 } else {
396 snap = rcu_dynticks_snap(rdtp);
397 if (rcu_dynticks_in_eqs(snap))
398 mask_ofl_test |= mask;
399 else
400 rdp->exp_dynticks_snap = snap;
401 }
384 } 402 }
385 mask_ofl_ipi = rnp->expmask & ~mask_ofl_test; 403 mask_ofl_ipi = rnp->expmask & ~mask_ofl_test;
386 404
@@ -394,7 +412,7 @@ static void sync_rcu_exp_select_cpus(struct rcu_state *rsp,
394 raw_spin_unlock_irqrestore_rcu_node(rnp, flags); 412 raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
395 413
396 /* IPI the remaining CPUs for expedited quiescent state. */ 414 /* IPI the remaining CPUs for expedited quiescent state. */
397 for_each_leaf_node_possible_cpu(rnp, cpu) { 415 for_each_leaf_node_cpu_mask(rnp, cpu, rnp->expmask) {
398 unsigned long mask = leaf_node_cpu_bit(rnp, cpu); 416 unsigned long mask = leaf_node_cpu_bit(rnp, cpu);
399 struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); 417 struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
400 418
@@ -417,6 +435,7 @@ retry_ipi:
417 (rnp->expmask & mask)) { 435 (rnp->expmask & mask)) {
418 /* Online, so delay for a bit and try again. */ 436 /* Online, so delay for a bit and try again. */
419 raw_spin_unlock_irqrestore_rcu_node(rnp, flags); 437 raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
438 trace_rcu_exp_grace_period(rsp->name, rcu_exp_gp_seq_endval(rsp), TPS("selectofl"));
420 schedule_timeout_uninterruptible(1); 439 schedule_timeout_uninterruptible(1);
421 goto retry_ipi; 440 goto retry_ipi;
422 } 441 }
@@ -443,6 +462,7 @@ static void synchronize_sched_expedited_wait(struct rcu_state *rsp)
443 struct rcu_node *rnp_root = rcu_get_root(rsp); 462 struct rcu_node *rnp_root = rcu_get_root(rsp);
444 int ret; 463 int ret;
445 464
465 trace_rcu_exp_grace_period(rsp->name, rcu_exp_gp_seq_endval(rsp), TPS("startwait"));
446 jiffies_stall = rcu_jiffies_till_stall_check(); 466 jiffies_stall = rcu_jiffies_till_stall_check();
447 jiffies_start = jiffies; 467 jiffies_start = jiffies;
448 468
@@ -606,7 +626,7 @@ static void _synchronize_rcu_expedited(struct rcu_state *rsp,
606 rew.rew_rsp = rsp; 626 rew.rew_rsp = rsp;
607 rew.rew_s = s; 627 rew.rew_s = s;
608 INIT_WORK_ONSTACK(&rew.rew_work, wait_rcu_exp_gp); 628 INIT_WORK_ONSTACK(&rew.rew_work, wait_rcu_exp_gp);
609 schedule_work(&rew.rew_work); 629 queue_work(rcu_gp_wq, &rew.rew_work);
610 } 630 }
611 631
612 /* Wait for expedited grace period to complete. */ 632 /* Wait for expedited grace period to complete. */
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index fb88a028deec..84fbee4686d3 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -180,7 +180,7 @@ static void rcu_preempt_ctxt_queue(struct rcu_node *rnp, struct rcu_data *rdp)
180 (rnp->expmask & rdp->grpmask ? RCU_EXP_BLKD : 0); 180 (rnp->expmask & rdp->grpmask ? RCU_EXP_BLKD : 0);
181 struct task_struct *t = current; 181 struct task_struct *t = current;
182 182
183 lockdep_assert_held(&rnp->lock); 183 raw_lockdep_assert_held_rcu_node(rnp);
184 WARN_ON_ONCE(rdp->mynode != rnp); 184 WARN_ON_ONCE(rdp->mynode != rnp);
185 WARN_ON_ONCE(rnp->level != rcu_num_lvls - 1); 185 WARN_ON_ONCE(rnp->level != rcu_num_lvls - 1);
186 186
@@ -560,8 +560,14 @@ static void rcu_print_detail_task_stall_rnp(struct rcu_node *rnp)
560 } 560 }
561 t = list_entry(rnp->gp_tasks->prev, 561 t = list_entry(rnp->gp_tasks->prev,
562 struct task_struct, rcu_node_entry); 562 struct task_struct, rcu_node_entry);
563 list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry) 563 list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry) {
564 /*
565 * We could be printing a lot while holding a spinlock.
566 * Avoid triggering hard lockup.
567 */
568 touch_nmi_watchdog();
564 sched_show_task(t); 569 sched_show_task(t);
570 }
565 raw_spin_unlock_irqrestore_rcu_node(rnp, flags); 571 raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
566} 572}
567 573
@@ -957,14 +963,10 @@ static int rcu_boost(struct rcu_node *rnp)
957 * expedited grace period must boost all blocked tasks, including 963 * expedited grace period must boost all blocked tasks, including
958 * those blocking the pre-existing normal grace period. 964 * those blocking the pre-existing normal grace period.
959 */ 965 */
960 if (rnp->exp_tasks != NULL) { 966 if (rnp->exp_tasks != NULL)
961 tb = rnp->exp_tasks; 967 tb = rnp->exp_tasks;
962 rnp->n_exp_boosts++; 968 else
963 } else {
964 tb = rnp->boost_tasks; 969 tb = rnp->boost_tasks;
965 rnp->n_normal_boosts++;
966 }
967 rnp->n_tasks_boosted++;
968 970
969 /* 971 /*
970 * We boost task t by manufacturing an rt_mutex that appears to 972 * We boost task t by manufacturing an rt_mutex that appears to
@@ -1042,7 +1044,7 @@ static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags)
1042{ 1044{
1043 struct task_struct *t; 1045 struct task_struct *t;
1044 1046
1045 lockdep_assert_held(&rnp->lock); 1047 raw_lockdep_assert_held_rcu_node(rnp);
1046 if (!rcu_preempt_blocked_readers_cgp(rnp) && rnp->exp_tasks == NULL) { 1048 if (!rcu_preempt_blocked_readers_cgp(rnp) && rnp->exp_tasks == NULL) {
1047 raw_spin_unlock_irqrestore_rcu_node(rnp, flags); 1049 raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
1048 return; 1050 return;
@@ -1677,6 +1679,12 @@ static void print_cpu_stall_info(struct rcu_state *rsp, int cpu)
1677 char *ticks_title; 1679 char *ticks_title;
1678 unsigned long ticks_value; 1680 unsigned long ticks_value;
1679 1681
1682 /*
1683 * We could be printing a lot while holding a spinlock. Avoid
1684 * triggering hard lockup.
1685 */
1686 touch_nmi_watchdog();
1687
1680 if (rsp->gpnum == rdp->gpnum) { 1688 if (rsp->gpnum == rdp->gpnum) {
1681 ticks_title = "ticks this GP"; 1689 ticks_title = "ticks this GP";
1682 ticks_value = rdp->ticks_this_gp; 1690 ticks_value = rdp->ticks_this_gp;
@@ -2235,7 +2243,6 @@ static int rcu_nocb_kthread(void *arg)
2235 smp_mb__before_atomic(); /* _add after CB invocation. */ 2243 smp_mb__before_atomic(); /* _add after CB invocation. */
2236 atomic_long_add(-c, &rdp->nocb_q_count); 2244 atomic_long_add(-c, &rdp->nocb_q_count);
2237 atomic_long_add(-cl, &rdp->nocb_q_count_lazy); 2245 atomic_long_add(-cl, &rdp->nocb_q_count_lazy);
2238 rdp->n_nocbs_invoked += c;
2239 } 2246 }
2240 return 0; 2247 return 0;
2241} 2248}
@@ -2312,8 +2319,11 @@ void __init rcu_init_nohz(void)
2312 cpumask_and(rcu_nocb_mask, cpu_possible_mask, 2319 cpumask_and(rcu_nocb_mask, cpu_possible_mask,
2313 rcu_nocb_mask); 2320 rcu_nocb_mask);
2314 } 2321 }
2315 pr_info("\tOffload RCU callbacks from CPUs: %*pbl.\n", 2322 if (cpumask_empty(rcu_nocb_mask))
2316 cpumask_pr_args(rcu_nocb_mask)); 2323 pr_info("\tOffload RCU callbacks from CPUs: (none).\n");
2324 else
2325 pr_info("\tOffload RCU callbacks from CPUs: %*pbl.\n",
2326 cpumask_pr_args(rcu_nocb_mask));
2317 if (rcu_nocb_poll) 2327 if (rcu_nocb_poll)
2318 pr_info("\tPoll for callbacks from no-CBs CPUs.\n"); 2328 pr_info("\tPoll for callbacks from no-CBs CPUs.\n");
2319 2329
diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile
index e2f9d4feff40..d9a02b318108 100644
--- a/kernel/sched/Makefile
+++ b/kernel/sched/Makefile
@@ -17,8 +17,9 @@ CFLAGS_core.o := $(PROFILING) -fno-omit-frame-pointer
17endif 17endif
18 18
19obj-y += core.o loadavg.o clock.o cputime.o 19obj-y += core.o loadavg.o clock.o cputime.o
20obj-y += idle_task.o fair.o rt.o deadline.o 20obj-y += idle.o fair.o rt.o deadline.o
21obj-y += wait.o wait_bit.o swait.o completion.o idle.o 21obj-y += wait.o wait_bit.o swait.o completion.o
22
22obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o topology.o stop_task.o 23obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o topology.o stop_task.o
23obj-$(CONFIG_SCHED_AUTOGROUP) += autogroup.o 24obj-$(CONFIG_SCHED_AUTOGROUP) += autogroup.o
24obj-$(CONFIG_SCHEDSTATS) += stats.o 25obj-$(CONFIG_SCHEDSTATS) += stats.o
diff --git a/kernel/sched/autogroup.c b/kernel/sched/autogroup.c
index bb4b9fe026a1..6be6c575b6cd 100644
--- a/kernel/sched/autogroup.c
+++ b/kernel/sched/autogroup.c
@@ -1,10 +1,7 @@
1// SPDX-License-Identifier: GPL-2.0 1// SPDX-License-Identifier: GPL-2.0
2#include <linux/proc_fs.h> 2/*
3#include <linux/seq_file.h> 3 * Auto-group scheduling implementation:
4#include <linux/utsname.h> 4 */
5#include <linux/security.h>
6#include <linux/export.h>
7
8#include "sched.h" 5#include "sched.h"
9 6
10unsigned int __read_mostly sysctl_sched_autogroup_enabled = 1; 7unsigned int __read_mostly sysctl_sched_autogroup_enabled = 1;
@@ -168,18 +165,19 @@ autogroup_move_group(struct task_struct *p, struct autogroup *ag)
168 autogroup_kref_put(prev); 165 autogroup_kref_put(prev);
169} 166}
170 167
171/* Allocates GFP_KERNEL, cannot be called under any spinlock */ 168/* Allocates GFP_KERNEL, cannot be called under any spinlock: */
172void sched_autogroup_create_attach(struct task_struct *p) 169void sched_autogroup_create_attach(struct task_struct *p)
173{ 170{
174 struct autogroup *ag = autogroup_create(); 171 struct autogroup *ag = autogroup_create();
175 172
176 autogroup_move_group(p, ag); 173 autogroup_move_group(p, ag);
177 /* drop extra reference added by autogroup_create() */ 174
175 /* Drop extra reference added by autogroup_create(): */
178 autogroup_kref_put(ag); 176 autogroup_kref_put(ag);
179} 177}
180EXPORT_SYMBOL(sched_autogroup_create_attach); 178EXPORT_SYMBOL(sched_autogroup_create_attach);
181 179
182/* Cannot be called under siglock. Currently has no users */ 180/* Cannot be called under siglock. Currently has no users: */
183void sched_autogroup_detach(struct task_struct *p) 181void sched_autogroup_detach(struct task_struct *p)
184{ 182{
185 autogroup_move_group(p, &autogroup_default); 183 autogroup_move_group(p, &autogroup_default);
@@ -202,7 +200,6 @@ static int __init setup_autogroup(char *str)
202 200
203 return 1; 201 return 1;
204} 202}
205
206__setup("noautogroup", setup_autogroup); 203__setup("noautogroup", setup_autogroup);
207 204
208#ifdef CONFIG_PROC_FS 205#ifdef CONFIG_PROC_FS
@@ -224,7 +221,7 @@ int proc_sched_autogroup_set_nice(struct task_struct *p, int nice)
224 if (nice < 0 && !can_nice(current, nice)) 221 if (nice < 0 && !can_nice(current, nice))
225 return -EPERM; 222 return -EPERM;
226 223
227 /* this is a heavy operation taking global locks.. */ 224 /* This is a heavy operation, taking global locks.. */
228 if (!capable(CAP_SYS_ADMIN) && time_before(jiffies, next)) 225 if (!capable(CAP_SYS_ADMIN) && time_before(jiffies, next))
229 return -EAGAIN; 226 return -EAGAIN;
230 227
@@ -267,4 +264,4 @@ int autogroup_path(struct task_group *tg, char *buf, int buflen)
267 264
268 return snprintf(buf, buflen, "%s-%ld", "/autogroup", tg->autogroup->id); 265 return snprintf(buf, buflen, "%s-%ld", "/autogroup", tg->autogroup->id);
269} 266}
270#endif /* CONFIG_SCHED_DEBUG */ 267#endif
diff --git a/kernel/sched/autogroup.h b/kernel/sched/autogroup.h
index 27cd22b89824..b96419974a1f 100644
--- a/kernel/sched/autogroup.h
+++ b/kernel/sched/autogroup.h
@@ -1,15 +1,11 @@
1/* SPDX-License-Identifier: GPL-2.0 */ 1/* SPDX-License-Identifier: GPL-2.0 */
2#ifdef CONFIG_SCHED_AUTOGROUP 2#ifdef CONFIG_SCHED_AUTOGROUP
3 3
4#include <linux/kref.h>
5#include <linux/rwsem.h>
6#include <linux/sched/autogroup.h>
7
8struct autogroup { 4struct autogroup {
9 /* 5 /*
10 * reference doesn't mean how many thread attach to this 6 * Reference doesn't mean how many threads attach to this
11 * autogroup now. It just stands for the number of task 7 * autogroup now. It just stands for the number of tasks
12 * could use this autogroup. 8 * which could use this autogroup.
13 */ 9 */
14 struct kref kref; 10 struct kref kref;
15 struct task_group *tg; 11 struct task_group *tg;
@@ -56,11 +52,9 @@ autogroup_task_group(struct task_struct *p, struct task_group *tg)
56 return tg; 52 return tg;
57} 53}
58 54
59#ifdef CONFIG_SCHED_DEBUG
60static inline int autogroup_path(struct task_group *tg, char *buf, int buflen) 55static inline int autogroup_path(struct task_group *tg, char *buf, int buflen)
61{ 56{
62 return 0; 57 return 0;
63} 58}
64#endif
65 59
66#endif /* CONFIG_SCHED_AUTOGROUP */ 60#endif /* CONFIG_SCHED_AUTOGROUP */
diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c
index e086babe6c61..10c83e73837a 100644
--- a/kernel/sched/clock.c
+++ b/kernel/sched/clock.c
@@ -1,5 +1,5 @@
1/* 1/*
2 * sched_clock for unstable cpu clocks 2 * sched_clock() for unstable CPU clocks
3 * 3 *
4 * Copyright (C) 2008 Red Hat, Inc., Peter Zijlstra 4 * Copyright (C) 2008 Red Hat, Inc., Peter Zijlstra
5 * 5 *
@@ -11,7 +11,7 @@
11 * Guillaume Chazarain <guichaz@gmail.com> 11 * Guillaume Chazarain <guichaz@gmail.com>
12 * 12 *
13 * 13 *
14 * What: 14 * What this file implements:
15 * 15 *
16 * cpu_clock(i) provides a fast (execution time) high resolution 16 * cpu_clock(i) provides a fast (execution time) high resolution
17 * clock with bounded drift between CPUs. The value of cpu_clock(i) 17 * clock with bounded drift between CPUs. The value of cpu_clock(i)
@@ -26,11 +26,11 @@
26 * at 0 on boot (but people really shouldn't rely on that). 26 * at 0 on boot (but people really shouldn't rely on that).
27 * 27 *
28 * cpu_clock(i) -- can be used from any context, including NMI. 28 * cpu_clock(i) -- can be used from any context, including NMI.
29 * local_clock() -- is cpu_clock() on the current cpu. 29 * local_clock() -- is cpu_clock() on the current CPU.
30 * 30 *
31 * sched_clock_cpu(i) 31 * sched_clock_cpu(i)
32 * 32 *
33 * How: 33 * How it is implemented:
34 * 34 *
35 * The implementation either uses sched_clock() when 35 * The implementation either uses sched_clock() when
36 * !CONFIG_HAVE_UNSTABLE_SCHED_CLOCK, which means in that case the 36 * !CONFIG_HAVE_UNSTABLE_SCHED_CLOCK, which means in that case the
@@ -52,19 +52,7 @@
52 * that is otherwise invisible (TSC gets stopped). 52 * that is otherwise invisible (TSC gets stopped).
53 * 53 *
54 */ 54 */
55#include <linux/spinlock.h> 55#include "sched.h"
56#include <linux/hardirq.h>
57#include <linux/export.h>
58#include <linux/percpu.h>
59#include <linux/ktime.h>
60#include <linux/sched.h>
61#include <linux/nmi.h>
62#include <linux/sched/clock.h>
63#include <linux/static_key.h>
64#include <linux/workqueue.h>
65#include <linux/compiler.h>
66#include <linux/tick.h>
67#include <linux/init.h>
68 56
69/* 57/*
70 * Scheduler clock - returns current time in nanosec units. 58 * Scheduler clock - returns current time in nanosec units.
@@ -302,21 +290,21 @@ again:
302 * cmpxchg64 below only protects one readout. 290 * cmpxchg64 below only protects one readout.
303 * 291 *
304 * We must reread via sched_clock_local() in the retry case on 292 * We must reread via sched_clock_local() in the retry case on
305 * 32bit as an NMI could use sched_clock_local() via the 293 * 32-bit kernels as an NMI could use sched_clock_local() via the
306 * tracer and hit between the readout of 294 * tracer and hit between the readout of
307 * the low32bit and the high 32bit portion. 295 * the low 32-bit and the high 32-bit portion.
308 */ 296 */
309 this_clock = sched_clock_local(my_scd); 297 this_clock = sched_clock_local(my_scd);
310 /* 298 /*
311 * We must enforce atomic readout on 32bit, otherwise the 299 * We must enforce atomic readout on 32-bit, otherwise the
312 * update on the remote cpu can hit inbetween the readout of 300 * update on the remote CPU can hit inbetween the readout of
313 * the low32bit and the high 32bit portion. 301 * the low 32-bit and the high 32-bit portion.
314 */ 302 */
315 remote_clock = cmpxchg64(&scd->clock, 0, 0); 303 remote_clock = cmpxchg64(&scd->clock, 0, 0);
316#else 304#else
317 /* 305 /*
318 * On 64bit the read of [my]scd->clock is atomic versus the 306 * On 64-bit kernels the read of [my]scd->clock is atomic versus the
319 * update, so we can avoid the above 32bit dance. 307 * update, so we can avoid the above 32-bit dance.
320 */ 308 */
321 sched_clock_local(my_scd); 309 sched_clock_local(my_scd);
322again: 310again:
diff --git a/kernel/sched/completion.c b/kernel/sched/completion.c
index 0926aef10dad..e426b0cb9ac6 100644
--- a/kernel/sched/completion.c
+++ b/kernel/sched/completion.c
@@ -11,10 +11,7 @@
11 * typically be used for exclusion which gives rise to priority inversion. 11 * typically be used for exclusion which gives rise to priority inversion.
12 * Waiting for completion is a typically sync point, but not an exclusion point. 12 * Waiting for completion is a typically sync point, but not an exclusion point.
13 */ 13 */
14 14#include "sched.h"
15#include <linux/sched/signal.h>
16#include <linux/sched/debug.h>
17#include <linux/completion.h>
18 15
19/** 16/**
20 * complete: - signals a single thread waiting on this completion 17 * complete: - signals a single thread waiting on this completion
@@ -283,7 +280,7 @@ EXPORT_SYMBOL(wait_for_completion_killable_timeout);
283bool try_wait_for_completion(struct completion *x) 280bool try_wait_for_completion(struct completion *x)
284{ 281{
285 unsigned long flags; 282 unsigned long flags;
286 int ret = 1; 283 bool ret = true;
287 284
288 /* 285 /*
289 * Since x->done will need to be locked only 286 * Since x->done will need to be locked only
@@ -292,11 +289,11 @@ bool try_wait_for_completion(struct completion *x)
292 * return early in the blocking case. 289 * return early in the blocking case.
293 */ 290 */
294 if (!READ_ONCE(x->done)) 291 if (!READ_ONCE(x->done))
295 return 0; 292 return false;
296 293
297 spin_lock_irqsave(&x->wait.lock, flags); 294 spin_lock_irqsave(&x->wait.lock, flags);
298 if (!x->done) 295 if (!x->done)
299 ret = 0; 296 ret = false;
300 else if (x->done != UINT_MAX) 297 else if (x->done != UINT_MAX)
301 x->done--; 298 x->done--;
302 spin_unlock_irqrestore(&x->wait.lock, flags); 299 spin_unlock_irqrestore(&x->wait.lock, flags);
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index c94895bc5a2c..28b68995a417 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -5,37 +5,11 @@
5 * 5 *
6 * Copyright (C) 1991-2002 Linus Torvalds 6 * Copyright (C) 1991-2002 Linus Torvalds
7 */ 7 */
8#include <linux/sched.h> 8#include "sched.h"
9#include <linux/sched/clock.h>
10#include <uapi/linux/sched/types.h>
11#include <linux/sched/loadavg.h>
12#include <linux/sched/hotplug.h>
13#include <linux/wait_bit.h>
14#include <linux/cpuset.h>
15#include <linux/delayacct.h>
16#include <linux/init_task.h>
17#include <linux/context_tracking.h>
18#include <linux/rcupdate_wait.h>
19#include <linux/compat.h>
20
21#include <linux/blkdev.h>
22#include <linux/kprobes.h>
23#include <linux/mmu_context.h>
24#include <linux/module.h>
25#include <linux/nmi.h>
26#include <linux/prefetch.h>
27#include <linux/profile.h>
28#include <linux/security.h>
29#include <linux/syscalls.h>
30#include <linux/sched/isolation.h>
31 9
32#include <asm/switch_to.h> 10#include <asm/switch_to.h>
33#include <asm/tlb.h> 11#include <asm/tlb.h>
34#ifdef CONFIG_PARAVIRT
35#include <asm/paravirt.h>
36#endif
37 12
38#include "sched.h"
39#include "../workqueue_internal.h" 13#include "../workqueue_internal.h"
40#include "../smpboot.h" 14#include "../smpboot.h"
41 15
@@ -135,7 +109,7 @@ struct rq *task_rq_lock(struct task_struct *p, struct rq_flags *rf)
135 * [L] ->on_rq 109 * [L] ->on_rq
136 * RELEASE (rq->lock) 110 * RELEASE (rq->lock)
137 * 111 *
138 * If we observe the old cpu in task_rq_lock, the acquire of 112 * If we observe the old CPU in task_rq_lock, the acquire of
139 * the old rq->lock will fully serialize against the stores. 113 * the old rq->lock will fully serialize against the stores.
140 * 114 *
141 * If we observe the new CPU in task_rq_lock, the acquire will 115 * If we observe the new CPU in task_rq_lock, the acquire will
@@ -333,7 +307,7 @@ void hrtick_start(struct rq *rq, u64 delay)
333} 307}
334#endif /* CONFIG_SMP */ 308#endif /* CONFIG_SMP */
335 309
336static void init_rq_hrtick(struct rq *rq) 310static void hrtick_rq_init(struct rq *rq)
337{ 311{
338#ifdef CONFIG_SMP 312#ifdef CONFIG_SMP
339 rq->hrtick_csd_pending = 0; 313 rq->hrtick_csd_pending = 0;
@@ -351,7 +325,7 @@ static inline void hrtick_clear(struct rq *rq)
351{ 325{
352} 326}
353 327
354static inline void init_rq_hrtick(struct rq *rq) 328static inline void hrtick_rq_init(struct rq *rq)
355{ 329{
356} 330}
357#endif /* CONFIG_SCHED_HRTICK */ 331#endif /* CONFIG_SCHED_HRTICK */
@@ -609,7 +583,7 @@ static inline bool got_nohz_idle_kick(void)
609{ 583{
610 int cpu = smp_processor_id(); 584 int cpu = smp_processor_id();
611 585
612 if (!test_bit(NOHZ_BALANCE_KICK, nohz_flags(cpu))) 586 if (!(atomic_read(nohz_flags(cpu)) & NOHZ_KICK_MASK))
613 return false; 587 return false;
614 588
615 if (idle_cpu(cpu) && !need_resched()) 589 if (idle_cpu(cpu) && !need_resched())
@@ -619,7 +593,7 @@ static inline bool got_nohz_idle_kick(void)
619 * We can't run Idle Load Balance on this CPU for this time so we 593 * We can't run Idle Load Balance on this CPU for this time so we
620 * cancel it and clear NOHZ_BALANCE_KICK 594 * cancel it and clear NOHZ_BALANCE_KICK
621 */ 595 */
622 clear_bit(NOHZ_BALANCE_KICK, nohz_flags(cpu)); 596 atomic_andnot(NOHZ_KICK_MASK, nohz_flags(cpu));
623 return false; 597 return false;
624} 598}
625 599
@@ -1457,7 +1431,7 @@ EXPORT_SYMBOL_GPL(kick_process);
1457 * 1431 *
1458 * - cpu_active must be a subset of cpu_online 1432 * - cpu_active must be a subset of cpu_online
1459 * 1433 *
1460 * - on cpu-up we allow per-cpu kthreads on the online && !active cpu, 1434 * - on CPU-up we allow per-CPU kthreads on the online && !active CPU,
1461 * see __set_cpus_allowed_ptr(). At this point the newly online 1435 * see __set_cpus_allowed_ptr(). At this point the newly online
1462 * CPU isn't yet part of the sched domains, and balancing will not 1436 * CPU isn't yet part of the sched domains, and balancing will not
1463 * see it. 1437 * see it.
@@ -2488,17 +2462,17 @@ void wake_up_new_task(struct task_struct *p)
2488 2462
2489#ifdef CONFIG_PREEMPT_NOTIFIERS 2463#ifdef CONFIG_PREEMPT_NOTIFIERS
2490 2464
2491static struct static_key preempt_notifier_key = STATIC_KEY_INIT_FALSE; 2465static DEFINE_STATIC_KEY_FALSE(preempt_notifier_key);
2492 2466
2493void preempt_notifier_inc(void) 2467void preempt_notifier_inc(void)
2494{ 2468{
2495 static_key_slow_inc(&preempt_notifier_key); 2469 static_branch_inc(&preempt_notifier_key);
2496} 2470}
2497EXPORT_SYMBOL_GPL(preempt_notifier_inc); 2471EXPORT_SYMBOL_GPL(preempt_notifier_inc);
2498 2472
2499void preempt_notifier_dec(void) 2473void preempt_notifier_dec(void)
2500{ 2474{
2501 static_key_slow_dec(&preempt_notifier_key); 2475 static_branch_dec(&preempt_notifier_key);
2502} 2476}
2503EXPORT_SYMBOL_GPL(preempt_notifier_dec); 2477EXPORT_SYMBOL_GPL(preempt_notifier_dec);
2504 2478
@@ -2508,7 +2482,7 @@ EXPORT_SYMBOL_GPL(preempt_notifier_dec);
2508 */ 2482 */
2509void preempt_notifier_register(struct preempt_notifier *notifier) 2483void preempt_notifier_register(struct preempt_notifier *notifier)
2510{ 2484{
2511 if (!static_key_false(&preempt_notifier_key)) 2485 if (!static_branch_unlikely(&preempt_notifier_key))
2512 WARN(1, "registering preempt_notifier while notifiers disabled\n"); 2486 WARN(1, "registering preempt_notifier while notifiers disabled\n");
2513 2487
2514 hlist_add_head(&notifier->link, &current->preempt_notifiers); 2488 hlist_add_head(&notifier->link, &current->preempt_notifiers);
@@ -2537,7 +2511,7 @@ static void __fire_sched_in_preempt_notifiers(struct task_struct *curr)
2537 2511
2538static __always_inline void fire_sched_in_preempt_notifiers(struct task_struct *curr) 2512static __always_inline void fire_sched_in_preempt_notifiers(struct task_struct *curr)
2539{ 2513{
2540 if (static_key_false(&preempt_notifier_key)) 2514 if (static_branch_unlikely(&preempt_notifier_key))
2541 __fire_sched_in_preempt_notifiers(curr); 2515 __fire_sched_in_preempt_notifiers(curr);
2542} 2516}
2543 2517
@@ -2555,7 +2529,7 @@ static __always_inline void
2555fire_sched_out_preempt_notifiers(struct task_struct *curr, 2529fire_sched_out_preempt_notifiers(struct task_struct *curr,
2556 struct task_struct *next) 2530 struct task_struct *next)
2557{ 2531{
2558 if (static_key_false(&preempt_notifier_key)) 2532 if (static_branch_unlikely(&preempt_notifier_key))
2559 __fire_sched_out_preempt_notifiers(curr, next); 2533 __fire_sched_out_preempt_notifiers(curr, next);
2560} 2534}
2561 2535
@@ -2629,6 +2603,18 @@ static inline void finish_lock_switch(struct rq *rq)
2629 raw_spin_unlock_irq(&rq->lock); 2603 raw_spin_unlock_irq(&rq->lock);
2630} 2604}
2631 2605
2606/*
2607 * NOP if the arch has not defined these:
2608 */
2609
2610#ifndef prepare_arch_switch
2611# define prepare_arch_switch(next) do { } while (0)
2612#endif
2613
2614#ifndef finish_arch_post_lock_switch
2615# define finish_arch_post_lock_switch() do { } while (0)
2616#endif
2617
2632/** 2618/**
2633 * prepare_task_switch - prepare to switch tasks 2619 * prepare_task_switch - prepare to switch tasks
2634 * @rq: the runqueue preparing to switch 2620 * @rq: the runqueue preparing to switch
@@ -3037,7 +3023,7 @@ unsigned long long task_sched_runtime(struct task_struct *p)
3037 3023
3038#if defined(CONFIG_64BIT) && defined(CONFIG_SMP) 3024#if defined(CONFIG_64BIT) && defined(CONFIG_SMP)
3039 /* 3025 /*
3040 * 64-bit doesn't need locks to atomically read a 64bit value. 3026 * 64-bit doesn't need locks to atomically read a 64-bit value.
3041 * So we have a optimization chance when the task's delta_exec is 0. 3027 * So we have a optimization chance when the task's delta_exec is 0.
3042 * Reading ->on_cpu is racy, but this is ok. 3028 * Reading ->on_cpu is racy, but this is ok.
3043 * 3029 *
@@ -3096,35 +3082,99 @@ void scheduler_tick(void)
3096 rq->idle_balance = idle_cpu(cpu); 3082 rq->idle_balance = idle_cpu(cpu);
3097 trigger_load_balance(rq); 3083 trigger_load_balance(rq);
3098#endif 3084#endif
3099 rq_last_tick_reset(rq);
3100} 3085}
3101 3086
3102#ifdef CONFIG_NO_HZ_FULL 3087#ifdef CONFIG_NO_HZ_FULL
3103/** 3088
3104 * scheduler_tick_max_deferment 3089struct tick_work {
3105 * 3090 int cpu;
3106 * Keep at least one tick per second when a single 3091 struct delayed_work work;
3107 * active task is running because the scheduler doesn't 3092};
3108 * yet completely support full dynticks environment. 3093
3109 * 3094static struct tick_work __percpu *tick_work_cpu;
3110 * This makes sure that uptime, CFS vruntime, load 3095
3111 * balancing, etc... continue to move forward, even 3096static void sched_tick_remote(struct work_struct *work)
3112 * with a very low granularity.
3113 *
3114 * Return: Maximum deferment in nanoseconds.
3115 */
3116u64 scheduler_tick_max_deferment(void)
3117{ 3097{
3118 struct rq *rq = this_rq(); 3098 struct delayed_work *dwork = to_delayed_work(work);
3119 unsigned long next, now = READ_ONCE(jiffies); 3099 struct tick_work *twork = container_of(dwork, struct tick_work, work);
3100 int cpu = twork->cpu;
3101 struct rq *rq = cpu_rq(cpu);
3102 struct rq_flags rf;
3103
3104 /*
3105 * Handle the tick only if it appears the remote CPU is running in full
3106 * dynticks mode. The check is racy by nature, but missing a tick or
3107 * having one too much is no big deal because the scheduler tick updates
3108 * statistics and checks timeslices in a time-independent way, regardless
3109 * of when exactly it is running.
3110 */
3111 if (!idle_cpu(cpu) && tick_nohz_tick_stopped_cpu(cpu)) {
3112 struct task_struct *curr;
3113 u64 delta;
3120 3114
3121 next = rq->last_sched_tick + HZ; 3115 rq_lock_irq(rq, &rf);
3116 update_rq_clock(rq);
3117 curr = rq->curr;
3118 delta = rq_clock_task(rq) - curr->se.exec_start;
3122 3119
3123 if (time_before_eq(next, now)) 3120 /*
3124 return 0; 3121 * Make sure the next tick runs within a reasonable
3122 * amount of time.
3123 */
3124 WARN_ON_ONCE(delta > (u64)NSEC_PER_SEC * 3);
3125 curr->sched_class->task_tick(rq, curr, 0);
3126 rq_unlock_irq(rq, &rf);
3127 }
3125 3128
3126 return jiffies_to_nsecs(next - now); 3129 /*
3130 * Run the remote tick once per second (1Hz). This arbitrary
3131 * frequency is large enough to avoid overload but short enough
3132 * to keep scheduler internal stats reasonably up to date.
3133 */
3134 queue_delayed_work(system_unbound_wq, dwork, HZ);
3135}
3136
3137static void sched_tick_start(int cpu)
3138{
3139 struct tick_work *twork;
3140
3141 if (housekeeping_cpu(cpu, HK_FLAG_TICK))
3142 return;
3143
3144 WARN_ON_ONCE(!tick_work_cpu);
3145
3146 twork = per_cpu_ptr(tick_work_cpu, cpu);
3147 twork->cpu = cpu;
3148 INIT_DELAYED_WORK(&twork->work, sched_tick_remote);
3149 queue_delayed_work(system_unbound_wq, &twork->work, HZ);
3127} 3150}
3151
3152#ifdef CONFIG_HOTPLUG_CPU
3153static void sched_tick_stop(int cpu)
3154{
3155 struct tick_work *twork;
3156
3157 if (housekeeping_cpu(cpu, HK_FLAG_TICK))
3158 return;
3159
3160 WARN_ON_ONCE(!tick_work_cpu);
3161
3162 twork = per_cpu_ptr(tick_work_cpu, cpu);
3163 cancel_delayed_work_sync(&twork->work);
3164}
3165#endif /* CONFIG_HOTPLUG_CPU */
3166
3167int __init sched_tick_offload_init(void)
3168{
3169 tick_work_cpu = alloc_percpu(struct tick_work);
3170 BUG_ON(!tick_work_cpu);
3171
3172 return 0;
3173}
3174
3175#else /* !CONFIG_NO_HZ_FULL */
3176static inline void sched_tick_start(int cpu) { }
3177static inline void sched_tick_stop(int cpu) { }
3128#endif 3178#endif
3129 3179
3130#if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \ 3180#if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \
@@ -4892,7 +4942,7 @@ SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len,
4892 * 4942 *
4893 * Return: 0. 4943 * Return: 0.
4894 */ 4944 */
4895SYSCALL_DEFINE0(sched_yield) 4945static void do_sched_yield(void)
4896{ 4946{
4897 struct rq_flags rf; 4947 struct rq_flags rf;
4898 struct rq *rq; 4948 struct rq *rq;
@@ -4913,7 +4963,11 @@ SYSCALL_DEFINE0(sched_yield)
4913 sched_preempt_enable_no_resched(); 4963 sched_preempt_enable_no_resched();
4914 4964
4915 schedule(); 4965 schedule();
4966}
4916 4967
4968SYSCALL_DEFINE0(sched_yield)
4969{
4970 do_sched_yield();
4917 return 0; 4971 return 0;
4918} 4972}
4919 4973
@@ -4997,7 +5051,7 @@ EXPORT_SYMBOL(__cond_resched_softirq);
4997void __sched yield(void) 5051void __sched yield(void)
4998{ 5052{
4999 set_current_state(TASK_RUNNING); 5053 set_current_state(TASK_RUNNING);
5000 sys_sched_yield(); 5054 do_sched_yield();
5001} 5055}
5002EXPORT_SYMBOL(yield); 5056EXPORT_SYMBOL(yield);
5003 5057
@@ -5786,6 +5840,7 @@ int sched_cpu_starting(unsigned int cpu)
5786{ 5840{
5787 set_cpu_rq_start_time(cpu); 5841 set_cpu_rq_start_time(cpu);
5788 sched_rq_cpu_starting(cpu); 5842 sched_rq_cpu_starting(cpu);
5843 sched_tick_start(cpu);
5789 return 0; 5844 return 0;
5790} 5845}
5791 5846
@@ -5797,6 +5852,7 @@ int sched_cpu_dying(unsigned int cpu)
5797 5852
5798 /* Handle pending wakeups and then migrate everything off */ 5853 /* Handle pending wakeups and then migrate everything off */
5799 sched_ttwu_pending(); 5854 sched_ttwu_pending();
5855 sched_tick_stop(cpu);
5800 5856
5801 rq_lock_irqsave(rq, &rf); 5857 rq_lock_irqsave(rq, &rf);
5802 if (rq->rd) { 5858 if (rq->rd) {
@@ -5809,7 +5865,7 @@ int sched_cpu_dying(unsigned int cpu)
5809 5865
5810 calc_load_migrate(rq); 5866 calc_load_migrate(rq);
5811 update_max_interval(); 5867 update_max_interval();
5812 nohz_balance_exit_idle(cpu); 5868 nohz_balance_exit_idle(rq);
5813 hrtick_clear(rq); 5869 hrtick_clear(rq);
5814 return 0; 5870 return 0;
5815} 5871}
@@ -6022,13 +6078,11 @@ void __init sched_init(void)
6022 rq_attach_root(rq, &def_root_domain); 6078 rq_attach_root(rq, &def_root_domain);
6023#ifdef CONFIG_NO_HZ_COMMON 6079#ifdef CONFIG_NO_HZ_COMMON
6024 rq->last_load_update_tick = jiffies; 6080 rq->last_load_update_tick = jiffies;
6025 rq->nohz_flags = 0; 6081 rq->last_blocked_load_update_tick = jiffies;
6026#endif 6082 atomic_set(&rq->nohz_flags, 0);
6027#ifdef CONFIG_NO_HZ_FULL
6028 rq->last_sched_tick = 0;
6029#endif 6083#endif
6030#endif /* CONFIG_SMP */ 6084#endif /* CONFIG_SMP */
6031 init_rq_hrtick(rq); 6085 hrtick_rq_init(rq);
6032 atomic_set(&rq->nr_iowait, 0); 6086 atomic_set(&rq->nr_iowait, 0);
6033 } 6087 }
6034 6088
@@ -7027,3 +7081,5 @@ const u32 sched_prio_to_wmult[40] = {
7027 /* 10 */ 39045157, 49367440, 61356676, 76695844, 95443717, 7081 /* 10 */ 39045157, 49367440, 61356676, 76695844, 95443717,
7028 /* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153, 7082 /* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153,
7029}; 7083};
7084
7085#undef CREATE_TRACE_POINTS
diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c
index 44ab32a4fab6..9fbb10383434 100644
--- a/kernel/sched/cpuacct.c
+++ b/kernel/sched/cpuacct.c
@@ -1,24 +1,13 @@
1// SPDX-License-Identifier: GPL-2.0 1// SPDX-License-Identifier: GPL-2.0
2#include <linux/cgroup.h>
3#include <linux/slab.h>
4#include <linux/percpu.h>
5#include <linux/spinlock.h>
6#include <linux/cpumask.h>
7#include <linux/seq_file.h>
8#include <linux/rcupdate.h>
9#include <linux/kernel_stat.h>
10#include <linux/err.h>
11
12#include "sched.h"
13
14/* 2/*
15 * CPU accounting code for task groups. 3 * CPU accounting code for task groups.
16 * 4 *
17 * Based on the work by Paul Menage (menage@google.com) and Balbir Singh 5 * Based on the work by Paul Menage (menage@google.com) and Balbir Singh
18 * (balbir@in.ibm.com). 6 * (balbir@in.ibm.com).
19 */ 7 */
8#include "sched.h"
20 9
21/* Time spent by the tasks of the cpu accounting group executing in ... */ 10/* Time spent by the tasks of the CPU accounting group executing in ... */
22enum cpuacct_stat_index { 11enum cpuacct_stat_index {
23 CPUACCT_STAT_USER, /* ... user mode */ 12 CPUACCT_STAT_USER, /* ... user mode */
24 CPUACCT_STAT_SYSTEM, /* ... kernel mode */ 13 CPUACCT_STAT_SYSTEM, /* ... kernel mode */
@@ -35,12 +24,12 @@ struct cpuacct_usage {
35 u64 usages[CPUACCT_STAT_NSTATS]; 24 u64 usages[CPUACCT_STAT_NSTATS];
36}; 25};
37 26
38/* track cpu usage of a group of tasks and its child groups */ 27/* track CPU usage of a group of tasks and its child groups */
39struct cpuacct { 28struct cpuacct {
40 struct cgroup_subsys_state css; 29 struct cgroup_subsys_state css;
41 /* cpuusage holds pointer to a u64-type object on every cpu */ 30 /* cpuusage holds pointer to a u64-type object on every CPU */
42 struct cpuacct_usage __percpu *cpuusage; 31 struct cpuacct_usage __percpu *cpuusage;
43 struct kernel_cpustat __percpu *cpustat; 32 struct kernel_cpustat __percpu *cpustat;
44}; 33};
45 34
46static inline struct cpuacct *css_ca(struct cgroup_subsys_state *css) 35static inline struct cpuacct *css_ca(struct cgroup_subsys_state *css)
@@ -48,7 +37,7 @@ static inline struct cpuacct *css_ca(struct cgroup_subsys_state *css)
48 return css ? container_of(css, struct cpuacct, css) : NULL; 37 return css ? container_of(css, struct cpuacct, css) : NULL;
49} 38}
50 39
51/* return cpu accounting group to which this task belongs */ 40/* Return CPU accounting group to which this task belongs */
52static inline struct cpuacct *task_ca(struct task_struct *tsk) 41static inline struct cpuacct *task_ca(struct task_struct *tsk)
53{ 42{
54 return css_ca(task_css(tsk, cpuacct_cgrp_id)); 43 return css_ca(task_css(tsk, cpuacct_cgrp_id));
@@ -65,7 +54,7 @@ static struct cpuacct root_cpuacct = {
65 .cpuusage = &root_cpuacct_cpuusage, 54 .cpuusage = &root_cpuacct_cpuusage,
66}; 55};
67 56
68/* create a new cpu accounting group */ 57/* Create a new CPU accounting group */
69static struct cgroup_subsys_state * 58static struct cgroup_subsys_state *
70cpuacct_css_alloc(struct cgroup_subsys_state *parent_css) 59cpuacct_css_alloc(struct cgroup_subsys_state *parent_css)
71{ 60{
@@ -96,7 +85,7 @@ out:
96 return ERR_PTR(-ENOMEM); 85 return ERR_PTR(-ENOMEM);
97} 86}
98 87
99/* destroy an existing cpu accounting group */ 88/* Destroy an existing CPU accounting group */
100static void cpuacct_css_free(struct cgroup_subsys_state *css) 89static void cpuacct_css_free(struct cgroup_subsys_state *css)
101{ 90{
102 struct cpuacct *ca = css_ca(css); 91 struct cpuacct *ca = css_ca(css);
@@ -162,7 +151,7 @@ static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val)
162#endif 151#endif
163} 152}
164 153
165/* return total cpu usage (in nanoseconds) of a group */ 154/* Return total CPU usage (in nanoseconds) of a group */
166static u64 __cpuusage_read(struct cgroup_subsys_state *css, 155static u64 __cpuusage_read(struct cgroup_subsys_state *css,
167 enum cpuacct_stat_index index) 156 enum cpuacct_stat_index index)
168{ 157{
diff --git a/kernel/sched/cpudeadline.c b/kernel/sched/cpudeadline.c
index 8d9562d890d3..50316455ea66 100644
--- a/kernel/sched/cpudeadline.c
+++ b/kernel/sched/cpudeadline.c
@@ -10,11 +10,7 @@
10 * as published by the Free Software Foundation; version 2 10 * as published by the Free Software Foundation; version 2
11 * of the License. 11 * of the License.
12 */ 12 */
13 13#include "sched.h"
14#include <linux/gfp.h>
15#include <linux/kernel.h>
16#include <linux/slab.h>
17#include "cpudeadline.h"
18 14
19static inline int parent(int i) 15static inline int parent(int i)
20{ 16{
@@ -42,8 +38,9 @@ static void cpudl_heapify_down(struct cpudl *cp, int idx)
42 return; 38 return;
43 39
44 /* adapted from lib/prio_heap.c */ 40 /* adapted from lib/prio_heap.c */
45 while(1) { 41 while (1) {
46 u64 largest_dl; 42 u64 largest_dl;
43
47 l = left_child(idx); 44 l = left_child(idx);
48 r = right_child(idx); 45 r = right_child(idx);
49 largest = idx; 46 largest = idx;
@@ -131,6 +128,7 @@ int cpudl_find(struct cpudl *cp, struct task_struct *p,
131 return 1; 128 return 1;
132 } else { 129 } else {
133 int best_cpu = cpudl_maximum(cp); 130 int best_cpu = cpudl_maximum(cp);
131
134 WARN_ON(best_cpu != -1 && !cpu_present(best_cpu)); 132 WARN_ON(best_cpu != -1 && !cpu_present(best_cpu));
135 133
136 if (cpumask_test_cpu(best_cpu, &p->cpus_allowed) && 134 if (cpumask_test_cpu(best_cpu, &p->cpus_allowed) &&
@@ -145,9 +143,9 @@ int cpudl_find(struct cpudl *cp, struct task_struct *p,
145} 143}
146 144
147/* 145/*
148 * cpudl_clear - remove a cpu from the cpudl max-heap 146 * cpudl_clear - remove a CPU from the cpudl max-heap
149 * @cp: the cpudl max-heap context 147 * @cp: the cpudl max-heap context
150 * @cpu: the target cpu 148 * @cpu: the target CPU
151 * 149 *
152 * Notes: assumes cpu_rq(cpu)->lock is locked 150 * Notes: assumes cpu_rq(cpu)->lock is locked
153 * 151 *
@@ -186,8 +184,8 @@ void cpudl_clear(struct cpudl *cp, int cpu)
186/* 184/*
187 * cpudl_set - update the cpudl max-heap 185 * cpudl_set - update the cpudl max-heap
188 * @cp: the cpudl max-heap context 186 * @cp: the cpudl max-heap context
189 * @cpu: the target cpu 187 * @cpu: the target CPU
190 * @dl: the new earliest deadline for this cpu 188 * @dl: the new earliest deadline for this CPU
191 * 189 *
192 * Notes: assumes cpu_rq(cpu)->lock is locked 190 * Notes: assumes cpu_rq(cpu)->lock is locked
193 * 191 *
@@ -205,6 +203,7 @@ void cpudl_set(struct cpudl *cp, int cpu, u64 dl)
205 old_idx = cp->elements[cpu].idx; 203 old_idx = cp->elements[cpu].idx;
206 if (old_idx == IDX_INVALID) { 204 if (old_idx == IDX_INVALID) {
207 int new_idx = cp->size++; 205 int new_idx = cp->size++;
206
208 cp->elements[new_idx].dl = dl; 207 cp->elements[new_idx].dl = dl;
209 cp->elements[new_idx].cpu = cpu; 208 cp->elements[new_idx].cpu = cpu;
210 cp->elements[cpu].idx = new_idx; 209 cp->elements[cpu].idx = new_idx;
@@ -221,7 +220,7 @@ void cpudl_set(struct cpudl *cp, int cpu, u64 dl)
221/* 220/*
222 * cpudl_set_freecpu - Set the cpudl.free_cpus 221 * cpudl_set_freecpu - Set the cpudl.free_cpus
223 * @cp: the cpudl max-heap context 222 * @cp: the cpudl max-heap context
224 * @cpu: rd attached cpu 223 * @cpu: rd attached CPU
225 */ 224 */
226void cpudl_set_freecpu(struct cpudl *cp, int cpu) 225void cpudl_set_freecpu(struct cpudl *cp, int cpu)
227{ 226{
@@ -231,7 +230,7 @@ void cpudl_set_freecpu(struct cpudl *cp, int cpu)
231/* 230/*
232 * cpudl_clear_freecpu - Clear the cpudl.free_cpus 231 * cpudl_clear_freecpu - Clear the cpudl.free_cpus
233 * @cp: the cpudl max-heap context 232 * @cp: the cpudl max-heap context
234 * @cpu: rd attached cpu 233 * @cpu: rd attached CPU
235 */ 234 */
236void cpudl_clear_freecpu(struct cpudl *cp, int cpu) 235void cpudl_clear_freecpu(struct cpudl *cp, int cpu)
237{ 236{
diff --git a/kernel/sched/cpudeadline.h b/kernel/sched/cpudeadline.h
index b010d26e108e..0adeda93b5fb 100644
--- a/kernel/sched/cpudeadline.h
+++ b/kernel/sched/cpudeadline.h
@@ -1,35 +1,26 @@
1/* SPDX-License-Identifier: GPL-2.0 */ 1/* SPDX-License-Identifier: GPL-2.0 */
2#ifndef _LINUX_CPUDL_H
3#define _LINUX_CPUDL_H
4 2
5#include <linux/sched.h> 3#define IDX_INVALID -1
6#include <linux/sched/deadline.h>
7
8#define IDX_INVALID -1
9 4
10struct cpudl_item { 5struct cpudl_item {
11 u64 dl; 6 u64 dl;
12 int cpu; 7 int cpu;
13 int idx; 8 int idx;
14}; 9};
15 10
16struct cpudl { 11struct cpudl {
17 raw_spinlock_t lock; 12 raw_spinlock_t lock;
18 int size; 13 int size;
19 cpumask_var_t free_cpus; 14 cpumask_var_t free_cpus;
20 struct cpudl_item *elements; 15 struct cpudl_item *elements;
21}; 16};
22 17
23
24#ifdef CONFIG_SMP 18#ifdef CONFIG_SMP
25int cpudl_find(struct cpudl *cp, struct task_struct *p, 19int cpudl_find(struct cpudl *cp, struct task_struct *p, struct cpumask *later_mask);
26 struct cpumask *later_mask);
27void cpudl_set(struct cpudl *cp, int cpu, u64 dl); 20void cpudl_set(struct cpudl *cp, int cpu, u64 dl);
28void cpudl_clear(struct cpudl *cp, int cpu); 21void cpudl_clear(struct cpudl *cp, int cpu);
29int cpudl_init(struct cpudl *cp); 22int cpudl_init(struct cpudl *cp);
30void cpudl_set_freecpu(struct cpudl *cp, int cpu); 23void cpudl_set_freecpu(struct cpudl *cp, int cpu);
31void cpudl_clear_freecpu(struct cpudl *cp, int cpu); 24void cpudl_clear_freecpu(struct cpudl *cp, int cpu);
32void cpudl_cleanup(struct cpudl *cp); 25void cpudl_cleanup(struct cpudl *cp);
33#endif /* CONFIG_SMP */ 26#endif /* CONFIG_SMP */
34
35#endif /* _LINUX_CPUDL_H */
diff --git a/kernel/sched/cpufreq.c b/kernel/sched/cpufreq.c
index dbc51442ecbc..5e54cbcae673 100644
--- a/kernel/sched/cpufreq.c
+++ b/kernel/sched/cpufreq.c
@@ -8,7 +8,6 @@
8 * it under the terms of the GNU General Public License version 2 as 8 * it under the terms of the GNU General Public License version 2 as
9 * published by the Free Software Foundation. 9 * published by the Free Software Foundation.
10 */ 10 */
11
12#include "sched.h" 11#include "sched.h"
13 12
14DEFINE_PER_CPU(struct update_util_data *, cpufreq_update_util_data); 13DEFINE_PER_CPU(struct update_util_data *, cpufreq_update_util_data);
diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c
index 617c6741c525..d2c6083304b4 100644
--- a/kernel/sched/cpufreq_schedutil.c
+++ b/kernel/sched/cpufreq_schedutil.c
@@ -11,61 +11,56 @@
11 11
12#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt 12#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
13 13
14#include <linux/cpufreq.h>
15#include <linux/kthread.h>
16#include <uapi/linux/sched/types.h>
17#include <linux/slab.h>
18#include <trace/events/power.h>
19
20#include "sched.h" 14#include "sched.h"
21 15
16#include <trace/events/power.h>
17
22struct sugov_tunables { 18struct sugov_tunables {
23 struct gov_attr_set attr_set; 19 struct gov_attr_set attr_set;
24 unsigned int rate_limit_us; 20 unsigned int rate_limit_us;
25}; 21};
26 22
27struct sugov_policy { 23struct sugov_policy {
28 struct cpufreq_policy *policy; 24 struct cpufreq_policy *policy;
29 25
30 struct sugov_tunables *tunables; 26 struct sugov_tunables *tunables;
31 struct list_head tunables_hook; 27 struct list_head tunables_hook;
32 28
33 raw_spinlock_t update_lock; /* For shared policies */ 29 raw_spinlock_t update_lock; /* For shared policies */
34 u64 last_freq_update_time; 30 u64 last_freq_update_time;
35 s64 freq_update_delay_ns; 31 s64 freq_update_delay_ns;
36 unsigned int next_freq; 32 unsigned int next_freq;
37 unsigned int cached_raw_freq; 33 unsigned int cached_raw_freq;
38 34
39 /* The next fields are only needed if fast switch cannot be used. */ 35 /* The next fields are only needed if fast switch cannot be used: */
40 struct irq_work irq_work; 36 struct irq_work irq_work;
41 struct kthread_work work; 37 struct kthread_work work;
42 struct mutex work_lock; 38 struct mutex work_lock;
43 struct kthread_worker worker; 39 struct kthread_worker worker;
44 struct task_struct *thread; 40 struct task_struct *thread;
45 bool work_in_progress; 41 bool work_in_progress;
46 42
47 bool need_freq_update; 43 bool need_freq_update;
48}; 44};
49 45
50struct sugov_cpu { 46struct sugov_cpu {
51 struct update_util_data update_util; 47 struct update_util_data update_util;
52 struct sugov_policy *sg_policy; 48 struct sugov_policy *sg_policy;
53 unsigned int cpu; 49 unsigned int cpu;
54 50
55 bool iowait_boost_pending; 51 bool iowait_boost_pending;
56 unsigned int iowait_boost; 52 unsigned int iowait_boost;
57 unsigned int iowait_boost_max; 53 unsigned int iowait_boost_max;
58 u64 last_update; 54 u64 last_update;
59 55
60 /* The fields below are only needed when sharing a policy. */ 56 /* The fields below are only needed when sharing a policy: */
61 unsigned long util_cfs; 57 unsigned long util_cfs;
62 unsigned long util_dl; 58 unsigned long util_dl;
63 unsigned long max; 59 unsigned long max;
64 unsigned int flags;
65 60
66 /* The field below is for single-CPU policies only. */ 61 /* The field below is for single-CPU policies only: */
67#ifdef CONFIG_NO_HZ_COMMON 62#ifdef CONFIG_NO_HZ_COMMON
68 unsigned long saved_idle_calls; 63 unsigned long saved_idle_calls;
69#endif 64#endif
70}; 65};
71 66
@@ -79,9 +74,9 @@ static bool sugov_should_update_freq(struct sugov_policy *sg_policy, u64 time)
79 74
80 /* 75 /*
81 * Since cpufreq_update_util() is called with rq->lock held for 76 * Since cpufreq_update_util() is called with rq->lock held for
82 * the @target_cpu, our per-cpu data is fully serialized. 77 * the @target_cpu, our per-CPU data is fully serialized.
83 * 78 *
84 * However, drivers cannot in general deal with cross-cpu 79 * However, drivers cannot in general deal with cross-CPU
85 * requests, so while get_next_freq() will work, our 80 * requests, so while get_next_freq() will work, our
86 * sugov_update_commit() call may not for the fast switching platforms. 81 * sugov_update_commit() call may not for the fast switching platforms.
87 * 82 *
@@ -111,6 +106,7 @@ static bool sugov_should_update_freq(struct sugov_policy *sg_policy, u64 time)
111 } 106 }
112 107
113 delta_ns = time - sg_policy->last_freq_update_time; 108 delta_ns = time - sg_policy->last_freq_update_time;
109
114 return delta_ns >= sg_policy->freq_update_delay_ns; 110 return delta_ns >= sg_policy->freq_update_delay_ns;
115} 111}
116 112
@@ -186,17 +182,28 @@ static void sugov_get_util(struct sugov_cpu *sg_cpu)
186 182
187static unsigned long sugov_aggregate_util(struct sugov_cpu *sg_cpu) 183static unsigned long sugov_aggregate_util(struct sugov_cpu *sg_cpu)
188{ 184{
185 struct rq *rq = cpu_rq(sg_cpu->cpu);
186 unsigned long util;
187
188 if (rq->rt.rt_nr_running) {
189 util = sg_cpu->max;
190 } else {
191 util = sg_cpu->util_dl;
192 if (rq->cfs.h_nr_running)
193 util += sg_cpu->util_cfs;
194 }
195
189 /* 196 /*
190 * Ideally we would like to set util_dl as min/guaranteed freq and 197 * Ideally we would like to set util_dl as min/guaranteed freq and
191 * util_cfs + util_dl as requested freq. However, cpufreq is not yet 198 * util_cfs + util_dl as requested freq. However, cpufreq is not yet
192 * ready for such an interface. So, we only do the latter for now. 199 * ready for such an interface. So, we only do the latter for now.
193 */ 200 */
194 return min(sg_cpu->util_cfs + sg_cpu->util_dl, sg_cpu->max); 201 return min(util, sg_cpu->max);
195} 202}
196 203
197static void sugov_set_iowait_boost(struct sugov_cpu *sg_cpu, u64 time) 204static void sugov_set_iowait_boost(struct sugov_cpu *sg_cpu, u64 time, unsigned int flags)
198{ 205{
199 if (sg_cpu->flags & SCHED_CPUFREQ_IOWAIT) { 206 if (flags & SCHED_CPUFREQ_IOWAIT) {
200 if (sg_cpu->iowait_boost_pending) 207 if (sg_cpu->iowait_boost_pending)
201 return; 208 return;
202 209
@@ -260,43 +267,51 @@ static bool sugov_cpu_is_busy(struct sugov_cpu *sg_cpu)
260static inline bool sugov_cpu_is_busy(struct sugov_cpu *sg_cpu) { return false; } 267static inline bool sugov_cpu_is_busy(struct sugov_cpu *sg_cpu) { return false; }
261#endif /* CONFIG_NO_HZ_COMMON */ 268#endif /* CONFIG_NO_HZ_COMMON */
262 269
270/*
271 * Make sugov_should_update_freq() ignore the rate limit when DL
272 * has increased the utilization.
273 */
274static inline void ignore_dl_rate_limit(struct sugov_cpu *sg_cpu, struct sugov_policy *sg_policy)
275{
276 if (cpu_util_dl(cpu_rq(sg_cpu->cpu)) > sg_cpu->util_dl)
277 sg_policy->need_freq_update = true;
278}
279
263static void sugov_update_single(struct update_util_data *hook, u64 time, 280static void sugov_update_single(struct update_util_data *hook, u64 time,
264 unsigned int flags) 281 unsigned int flags)
265{ 282{
266 struct sugov_cpu *sg_cpu = container_of(hook, struct sugov_cpu, update_util); 283 struct sugov_cpu *sg_cpu = container_of(hook, struct sugov_cpu, update_util);
267 struct sugov_policy *sg_policy = sg_cpu->sg_policy; 284 struct sugov_policy *sg_policy = sg_cpu->sg_policy;
268 struct cpufreq_policy *policy = sg_policy->policy;
269 unsigned long util, max; 285 unsigned long util, max;
270 unsigned int next_f; 286 unsigned int next_f;
271 bool busy; 287 bool busy;
272 288
273 sugov_set_iowait_boost(sg_cpu, time); 289 sugov_set_iowait_boost(sg_cpu, time, flags);
274 sg_cpu->last_update = time; 290 sg_cpu->last_update = time;
275 291
292 ignore_dl_rate_limit(sg_cpu, sg_policy);
293
276 if (!sugov_should_update_freq(sg_policy, time)) 294 if (!sugov_should_update_freq(sg_policy, time))
277 return; 295 return;
278 296
279 busy = sugov_cpu_is_busy(sg_cpu); 297 busy = sugov_cpu_is_busy(sg_cpu);
280 298
281 if (flags & SCHED_CPUFREQ_RT) { 299 sugov_get_util(sg_cpu);
282 next_f = policy->cpuinfo.max_freq; 300 max = sg_cpu->max;
283 } else { 301 util = sugov_aggregate_util(sg_cpu);
284 sugov_get_util(sg_cpu); 302 sugov_iowait_boost(sg_cpu, &util, &max);
285 max = sg_cpu->max; 303 next_f = get_next_freq(sg_policy, util, max);
286 util = sugov_aggregate_util(sg_cpu); 304 /*
287 sugov_iowait_boost(sg_cpu, &util, &max); 305 * Do not reduce the frequency if the CPU has not been idle
288 next_f = get_next_freq(sg_policy, util, max); 306 * recently, as the reduction is likely to be premature then.
289 /* 307 */
290 * Do not reduce the frequency if the CPU has not been idle 308 if (busy && next_f < sg_policy->next_freq) {
291 * recently, as the reduction is likely to be premature then. 309 next_f = sg_policy->next_freq;
292 */
293 if (busy && next_f < sg_policy->next_freq) {
294 next_f = sg_policy->next_freq;
295 310
296 /* Reset cached freq as next_freq has changed */ 311 /* Reset cached freq as next_freq has changed */
297 sg_policy->cached_raw_freq = 0; 312 sg_policy->cached_raw_freq = 0;
298 }
299 } 313 }
314
300 sugov_update_commit(sg_policy, time, next_f); 315 sugov_update_commit(sg_policy, time, next_f);
301} 316}
302 317
@@ -312,6 +327,8 @@ static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu, u64 time)
312 unsigned long j_util, j_max; 327 unsigned long j_util, j_max;
313 s64 delta_ns; 328 s64 delta_ns;
314 329
330 sugov_get_util(j_sg_cpu);
331
315 /* 332 /*
316 * If the CFS CPU utilization was last updated before the 333 * If the CFS CPU utilization was last updated before the
317 * previous frequency update and the time elapsed between the 334 * previous frequency update and the time elapsed between the
@@ -325,28 +342,22 @@ static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu, u64 time)
325 if (delta_ns > TICK_NSEC) { 342 if (delta_ns > TICK_NSEC) {
326 j_sg_cpu->iowait_boost = 0; 343 j_sg_cpu->iowait_boost = 0;
327 j_sg_cpu->iowait_boost_pending = false; 344 j_sg_cpu->iowait_boost_pending = false;
328 j_sg_cpu->util_cfs = 0;
329 if (j_sg_cpu->util_dl == 0)
330 continue;
331 } 345 }
332 if (j_sg_cpu->flags & SCHED_CPUFREQ_RT)
333 return policy->cpuinfo.max_freq;
334 346
335 j_max = j_sg_cpu->max; 347 j_max = j_sg_cpu->max;
336 j_util = sugov_aggregate_util(j_sg_cpu); 348 j_util = sugov_aggregate_util(j_sg_cpu);
349 sugov_iowait_boost(j_sg_cpu, &j_util, &j_max);
337 if (j_util * max > j_max * util) { 350 if (j_util * max > j_max * util) {
338 util = j_util; 351 util = j_util;
339 max = j_max; 352 max = j_max;
340 } 353 }
341
342 sugov_iowait_boost(j_sg_cpu, &util, &max);
343 } 354 }
344 355
345 return get_next_freq(sg_policy, util, max); 356 return get_next_freq(sg_policy, util, max);
346} 357}
347 358
348static void sugov_update_shared(struct update_util_data *hook, u64 time, 359static void
349 unsigned int flags) 360sugov_update_shared(struct update_util_data *hook, u64 time, unsigned int flags)
350{ 361{
351 struct sugov_cpu *sg_cpu = container_of(hook, struct sugov_cpu, update_util); 362 struct sugov_cpu *sg_cpu = container_of(hook, struct sugov_cpu, update_util);
352 struct sugov_policy *sg_policy = sg_cpu->sg_policy; 363 struct sugov_policy *sg_policy = sg_cpu->sg_policy;
@@ -354,18 +365,13 @@ static void sugov_update_shared(struct update_util_data *hook, u64 time,
354 365
355 raw_spin_lock(&sg_policy->update_lock); 366 raw_spin_lock(&sg_policy->update_lock);
356 367
357 sugov_get_util(sg_cpu); 368 sugov_set_iowait_boost(sg_cpu, time, flags);
358 sg_cpu->flags = flags;
359
360 sugov_set_iowait_boost(sg_cpu, time);
361 sg_cpu->last_update = time; 369 sg_cpu->last_update = time;
362 370
363 if (sugov_should_update_freq(sg_policy, time)) { 371 ignore_dl_rate_limit(sg_cpu, sg_policy);
364 if (flags & SCHED_CPUFREQ_RT)
365 next_f = sg_policy->policy->cpuinfo.max_freq;
366 else
367 next_f = sugov_next_freq_shared(sg_cpu, time);
368 372
373 if (sugov_should_update_freq(sg_policy, time)) {
374 next_f = sugov_next_freq_shared(sg_cpu, time);
369 sugov_update_commit(sg_policy, time, next_f); 375 sugov_update_commit(sg_policy, time, next_f);
370 } 376 }
371 377
@@ -423,8 +429,8 @@ static ssize_t rate_limit_us_show(struct gov_attr_set *attr_set, char *buf)
423 return sprintf(buf, "%u\n", tunables->rate_limit_us); 429 return sprintf(buf, "%u\n", tunables->rate_limit_us);
424} 430}
425 431
426static ssize_t rate_limit_us_store(struct gov_attr_set *attr_set, const char *buf, 432static ssize_t
427 size_t count) 433rate_limit_us_store(struct gov_attr_set *attr_set, const char *buf, size_t count)
428{ 434{
429 struct sugov_tunables *tunables = to_sugov_tunables(attr_set); 435 struct sugov_tunables *tunables = to_sugov_tunables(attr_set);
430 struct sugov_policy *sg_policy; 436 struct sugov_policy *sg_policy;
@@ -479,11 +485,11 @@ static int sugov_kthread_create(struct sugov_policy *sg_policy)
479{ 485{
480 struct task_struct *thread; 486 struct task_struct *thread;
481 struct sched_attr attr = { 487 struct sched_attr attr = {
482 .size = sizeof(struct sched_attr), 488 .size = sizeof(struct sched_attr),
483 .sched_policy = SCHED_DEADLINE, 489 .sched_policy = SCHED_DEADLINE,
484 .sched_flags = SCHED_FLAG_SUGOV, 490 .sched_flags = SCHED_FLAG_SUGOV,
485 .sched_nice = 0, 491 .sched_nice = 0,
486 .sched_priority = 0, 492 .sched_priority = 0,
487 /* 493 /*
488 * Fake (unused) bandwidth; workaround to "fix" 494 * Fake (unused) bandwidth; workaround to "fix"
489 * priority inheritance. 495 * priority inheritance.
@@ -662,21 +668,20 @@ static int sugov_start(struct cpufreq_policy *policy)
662 struct sugov_policy *sg_policy = policy->governor_data; 668 struct sugov_policy *sg_policy = policy->governor_data;
663 unsigned int cpu; 669 unsigned int cpu;
664 670
665 sg_policy->freq_update_delay_ns = sg_policy->tunables->rate_limit_us * NSEC_PER_USEC; 671 sg_policy->freq_update_delay_ns = sg_policy->tunables->rate_limit_us * NSEC_PER_USEC;
666 sg_policy->last_freq_update_time = 0; 672 sg_policy->last_freq_update_time = 0;
667 sg_policy->next_freq = UINT_MAX; 673 sg_policy->next_freq = UINT_MAX;
668 sg_policy->work_in_progress = false; 674 sg_policy->work_in_progress = false;
669 sg_policy->need_freq_update = false; 675 sg_policy->need_freq_update = false;
670 sg_policy->cached_raw_freq = 0; 676 sg_policy->cached_raw_freq = 0;
671 677
672 for_each_cpu(cpu, policy->cpus) { 678 for_each_cpu(cpu, policy->cpus) {
673 struct sugov_cpu *sg_cpu = &per_cpu(sugov_cpu, cpu); 679 struct sugov_cpu *sg_cpu = &per_cpu(sugov_cpu, cpu);
674 680
675 memset(sg_cpu, 0, sizeof(*sg_cpu)); 681 memset(sg_cpu, 0, sizeof(*sg_cpu));
676 sg_cpu->cpu = cpu; 682 sg_cpu->cpu = cpu;
677 sg_cpu->sg_policy = sg_policy; 683 sg_cpu->sg_policy = sg_policy;
678 sg_cpu->flags = 0; 684 sg_cpu->iowait_boost_max = policy->cpuinfo.max_freq;
679 sg_cpu->iowait_boost_max = policy->cpuinfo.max_freq;
680 } 685 }
681 686
682 for_each_cpu(cpu, policy->cpus) { 687 for_each_cpu(cpu, policy->cpus) {
@@ -720,14 +725,14 @@ static void sugov_limits(struct cpufreq_policy *policy)
720} 725}
721 726
722static struct cpufreq_governor schedutil_gov = { 727static struct cpufreq_governor schedutil_gov = {
723 .name = "schedutil", 728 .name = "schedutil",
724 .owner = THIS_MODULE, 729 .owner = THIS_MODULE,
725 .dynamic_switching = true, 730 .dynamic_switching = true,
726 .init = sugov_init, 731 .init = sugov_init,
727 .exit = sugov_exit, 732 .exit = sugov_exit,
728 .start = sugov_start, 733 .start = sugov_start,
729 .stop = sugov_stop, 734 .stop = sugov_stop,
730 .limits = sugov_limits, 735 .limits = sugov_limits,
731}; 736};
732 737
733#ifdef CONFIG_CPU_FREQ_DEFAULT_GOV_SCHEDUTIL 738#ifdef CONFIG_CPU_FREQ_DEFAULT_GOV_SCHEDUTIL
diff --git a/kernel/sched/cpupri.c b/kernel/sched/cpupri.c
index 2511aba36b89..daaadf939ccb 100644
--- a/kernel/sched/cpupri.c
+++ b/kernel/sched/cpupri.c
@@ -14,7 +14,7 @@
14 * 14 *
15 * going from the lowest priority to the highest. CPUs in the INVALID state 15 * going from the lowest priority to the highest. CPUs in the INVALID state
16 * are not eligible for routing. The system maintains this state with 16 * are not eligible for routing. The system maintains this state with
17 * a 2 dimensional bitmap (the first for priority class, the second for cpus 17 * a 2 dimensional bitmap (the first for priority class, the second for CPUs
18 * in that class). Therefore a typical application without affinity 18 * in that class). Therefore a typical application without affinity
19 * restrictions can find a suitable CPU with O(1) complexity (e.g. two bit 19 * restrictions can find a suitable CPU with O(1) complexity (e.g. two bit
20 * searches). For tasks with affinity restrictions, the algorithm has a 20 * searches). For tasks with affinity restrictions, the algorithm has a
@@ -26,12 +26,7 @@
26 * as published by the Free Software Foundation; version 2 26 * as published by the Free Software Foundation; version 2
27 * of the License. 27 * of the License.
28 */ 28 */
29 29#include "sched.h"
30#include <linux/gfp.h>
31#include <linux/sched.h>
32#include <linux/sched/rt.h>
33#include <linux/slab.h>
34#include "cpupri.h"
35 30
36/* Convert between a 140 based task->prio, and our 102 based cpupri */ 31/* Convert between a 140 based task->prio, and our 102 based cpupri */
37static int convert_prio(int prio) 32static int convert_prio(int prio)
@@ -128,9 +123,9 @@ int cpupri_find(struct cpupri *cp, struct task_struct *p,
128} 123}
129 124
130/** 125/**
131 * cpupri_set - update the cpu priority setting 126 * cpupri_set - update the CPU priority setting
132 * @cp: The cpupri context 127 * @cp: The cpupri context
133 * @cpu: The target cpu 128 * @cpu: The target CPU
134 * @newpri: The priority (INVALID-RT99) to assign to this CPU 129 * @newpri: The priority (INVALID-RT99) to assign to this CPU
135 * 130 *
136 * Note: Assumes cpu_rq(cpu)->lock is locked 131 * Note: Assumes cpu_rq(cpu)->lock is locked
@@ -151,7 +146,7 @@ void cpupri_set(struct cpupri *cp, int cpu, int newpri)
151 return; 146 return;
152 147
153 /* 148 /*
154 * If the cpu was currently mapped to a different value, we 149 * If the CPU was currently mapped to a different value, we
155 * need to map it to the new value then remove the old value. 150 * need to map it to the new value then remove the old value.
156 * Note, we must add the new value first, otherwise we risk the 151 * Note, we must add the new value first, otherwise we risk the
157 * cpu being missed by the priority loop in cpupri_find. 152 * cpu being missed by the priority loop in cpupri_find.
diff --git a/kernel/sched/cpupri.h b/kernel/sched/cpupri.h
index bab050019071..7dc20a3232e7 100644
--- a/kernel/sched/cpupri.h
+++ b/kernel/sched/cpupri.h
@@ -1,32 +1,25 @@
1/* SPDX-License-Identifier: GPL-2.0 */ 1/* SPDX-License-Identifier: GPL-2.0 */
2#ifndef _LINUX_CPUPRI_H
3#define _LINUX_CPUPRI_H
4
5#include <linux/sched.h>
6 2
7#define CPUPRI_NR_PRIORITIES (MAX_RT_PRIO + 2) 3#define CPUPRI_NR_PRIORITIES (MAX_RT_PRIO + 2)
8 4
9#define CPUPRI_INVALID -1 5#define CPUPRI_INVALID -1
10#define CPUPRI_IDLE 0 6#define CPUPRI_IDLE 0
11#define CPUPRI_NORMAL 1 7#define CPUPRI_NORMAL 1
12/* values 2-101 are RT priorities 0-99 */ 8/* values 2-101 are RT priorities 0-99 */
13 9
14struct cpupri_vec { 10struct cpupri_vec {
15 atomic_t count; 11 atomic_t count;
16 cpumask_var_t mask; 12 cpumask_var_t mask;
17}; 13};
18 14
19struct cpupri { 15struct cpupri {
20 struct cpupri_vec pri_to_cpu[CPUPRI_NR_PRIORITIES]; 16 struct cpupri_vec pri_to_cpu[CPUPRI_NR_PRIORITIES];
21 int *cpu_to_pri; 17 int *cpu_to_pri;
22}; 18};
23 19
24#ifdef CONFIG_SMP 20#ifdef CONFIG_SMP
25int cpupri_find(struct cpupri *cp, 21int cpupri_find(struct cpupri *cp, struct task_struct *p, struct cpumask *lowest_mask);
26 struct task_struct *p, struct cpumask *lowest_mask);
27void cpupri_set(struct cpupri *cp, int cpu, int pri); 22void cpupri_set(struct cpupri *cp, int cpu, int pri);
28int cpupri_init(struct cpupri *cp); 23int cpupri_init(struct cpupri *cp);
29void cpupri_cleanup(struct cpupri *cp); 24void cpupri_cleanup(struct cpupri *cp);
30#endif 25#endif
31
32#endif /* _LINUX_CPUPRI_H */
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index bac6ac9a4ec7..0796f938c4f0 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -1,10 +1,6 @@
1#include <linux/export.h> 1/*
2#include <linux/sched.h> 2 * Simple CPU accounting cgroup controller
3#include <linux/tsacct_kern.h> 3 */
4#include <linux/kernel_stat.h>
5#include <linux/static_key.h>
6#include <linux/context_tracking.h>
7#include <linux/sched/cputime.h>
8#include "sched.h" 4#include "sched.h"
9 5
10#ifdef CONFIG_IRQ_TIME_ACCOUNTING 6#ifdef CONFIG_IRQ_TIME_ACCOUNTING
@@ -113,9 +109,9 @@ static inline void task_group_account_field(struct task_struct *p, int index,
113} 109}
114 110
115/* 111/*
116 * Account user cpu time to a process. 112 * Account user CPU time to a process.
117 * @p: the process that the cpu time gets accounted to 113 * @p: the process that the CPU time gets accounted to
118 * @cputime: the cpu time spent in user space since the last update 114 * @cputime: the CPU time spent in user space since the last update
119 */ 115 */
120void account_user_time(struct task_struct *p, u64 cputime) 116void account_user_time(struct task_struct *p, u64 cputime)
121{ 117{
@@ -135,9 +131,9 @@ void account_user_time(struct task_struct *p, u64 cputime)
135} 131}
136 132
137/* 133/*
138 * Account guest cpu time to a process. 134 * Account guest CPU time to a process.
139 * @p: the process that the cpu time gets accounted to 135 * @p: the process that the CPU time gets accounted to
140 * @cputime: the cpu time spent in virtual machine since the last update 136 * @cputime: the CPU time spent in virtual machine since the last update
141 */ 137 */
142void account_guest_time(struct task_struct *p, u64 cputime) 138void account_guest_time(struct task_struct *p, u64 cputime)
143{ 139{
@@ -159,9 +155,9 @@ void account_guest_time(struct task_struct *p, u64 cputime)
159} 155}
160 156
161/* 157/*
162 * Account system cpu time to a process and desired cpustat field 158 * Account system CPU time to a process and desired cpustat field
163 * @p: the process that the cpu time gets accounted to 159 * @p: the process that the CPU time gets accounted to
164 * @cputime: the cpu time spent in kernel space since the last update 160 * @cputime: the CPU time spent in kernel space since the last update
165 * @index: pointer to cpustat field that has to be updated 161 * @index: pointer to cpustat field that has to be updated
166 */ 162 */
167void account_system_index_time(struct task_struct *p, 163void account_system_index_time(struct task_struct *p,
@@ -179,10 +175,10 @@ void account_system_index_time(struct task_struct *p,
179} 175}
180 176
181/* 177/*
182 * Account system cpu time to a process. 178 * Account system CPU time to a process.
183 * @p: the process that the cpu time gets accounted to 179 * @p: the process that the CPU time gets accounted to
184 * @hardirq_offset: the offset to subtract from hardirq_count() 180 * @hardirq_offset: the offset to subtract from hardirq_count()
185 * @cputime: the cpu time spent in kernel space since the last update 181 * @cputime: the CPU time spent in kernel space since the last update
186 */ 182 */
187void account_system_time(struct task_struct *p, int hardirq_offset, u64 cputime) 183void account_system_time(struct task_struct *p, int hardirq_offset, u64 cputime)
188{ 184{
@@ -205,7 +201,7 @@ void account_system_time(struct task_struct *p, int hardirq_offset, u64 cputime)
205 201
206/* 202/*
207 * Account for involuntary wait time. 203 * Account for involuntary wait time.
208 * @cputime: the cpu time spent in involuntary wait 204 * @cputime: the CPU time spent in involuntary wait
209 */ 205 */
210void account_steal_time(u64 cputime) 206void account_steal_time(u64 cputime)
211{ 207{
@@ -216,7 +212,7 @@ void account_steal_time(u64 cputime)
216 212
217/* 213/*
218 * Account for idle time. 214 * Account for idle time.
219 * @cputime: the cpu time spent in idle wait 215 * @cputime: the CPU time spent in idle wait
220 */ 216 */
221void account_idle_time(u64 cputime) 217void account_idle_time(u64 cputime)
222{ 218{
@@ -338,7 +334,7 @@ void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times)
338#ifdef CONFIG_IRQ_TIME_ACCOUNTING 334#ifdef CONFIG_IRQ_TIME_ACCOUNTING
339/* 335/*
340 * Account a tick to a process and cpustat 336 * Account a tick to a process and cpustat
341 * @p: the process that the cpu time gets accounted to 337 * @p: the process that the CPU time gets accounted to
342 * @user_tick: is the tick from userspace 338 * @user_tick: is the tick from userspace
343 * @rq: the pointer to rq 339 * @rq: the pointer to rq
344 * 340 *
@@ -400,17 +396,16 @@ static void irqtime_account_idle_ticks(int ticks)
400 irqtime_account_process_tick(current, 0, rq, ticks); 396 irqtime_account_process_tick(current, 0, rq, ticks);
401} 397}
402#else /* CONFIG_IRQ_TIME_ACCOUNTING */ 398#else /* CONFIG_IRQ_TIME_ACCOUNTING */
403static inline void irqtime_account_idle_ticks(int ticks) {} 399static inline void irqtime_account_idle_ticks(int ticks) { }
404static inline void irqtime_account_process_tick(struct task_struct *p, int user_tick, 400static inline void irqtime_account_process_tick(struct task_struct *p, int user_tick,
405 struct rq *rq, int nr_ticks) {} 401 struct rq *rq, int nr_ticks) { }
406#endif /* CONFIG_IRQ_TIME_ACCOUNTING */ 402#endif /* CONFIG_IRQ_TIME_ACCOUNTING */
407 403
408/* 404/*
409 * Use precise platform statistics if available: 405 * Use precise platform statistics if available:
410 */ 406 */
411#ifdef CONFIG_VIRT_CPU_ACCOUNTING 407#ifdef CONFIG_VIRT_CPU_ACCOUNTING
412 408# ifndef __ARCH_HAS_VTIME_TASK_SWITCH
413#ifndef __ARCH_HAS_VTIME_TASK_SWITCH
414void vtime_common_task_switch(struct task_struct *prev) 409void vtime_common_task_switch(struct task_struct *prev)
415{ 410{
416 if (is_idle_task(prev)) 411 if (is_idle_task(prev))
@@ -421,8 +416,7 @@ void vtime_common_task_switch(struct task_struct *prev)
421 vtime_flush(prev); 416 vtime_flush(prev);
422 arch_vtime_task_switch(prev); 417 arch_vtime_task_switch(prev);
423} 418}
424#endif 419# endif
425
426#endif /* CONFIG_VIRT_CPU_ACCOUNTING */ 420#endif /* CONFIG_VIRT_CPU_ACCOUNTING */
427 421
428 422
@@ -469,10 +463,12 @@ void thread_group_cputime_adjusted(struct task_struct *p, u64 *ut, u64 *st)
469 *ut = cputime.utime; 463 *ut = cputime.utime;
470 *st = cputime.stime; 464 *st = cputime.stime;
471} 465}
472#else /* !CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */ 466
467#else /* !CONFIG_VIRT_CPU_ACCOUNTING_NATIVE: */
468
473/* 469/*
474 * Account a single tick of cpu time. 470 * Account a single tick of CPU time.
475 * @p: the process that the cpu time gets accounted to 471 * @p: the process that the CPU time gets accounted to
476 * @user_tick: indicates if the tick is a user or a system tick 472 * @user_tick: indicates if the tick is a user or a system tick
477 */ 473 */
478void account_process_tick(struct task_struct *p, int user_tick) 474void account_process_tick(struct task_struct *p, int user_tick)
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 9df09782025c..d1c7bf7c7e5b 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -17,9 +17,6 @@
17 */ 17 */
18#include "sched.h" 18#include "sched.h"
19 19
20#include <linux/slab.h>
21#include <uapi/linux/sched/types.h>
22
23struct dl_bandwidth def_dl_bandwidth; 20struct dl_bandwidth def_dl_bandwidth;
24 21
25static inline struct task_struct *dl_task_of(struct sched_dl_entity *dl_se) 22static inline struct task_struct *dl_task_of(struct sched_dl_entity *dl_se)
@@ -87,7 +84,7 @@ void __add_running_bw(u64 dl_bw, struct dl_rq *dl_rq)
87 SCHED_WARN_ON(dl_rq->running_bw < old); /* overflow */ 84 SCHED_WARN_ON(dl_rq->running_bw < old); /* overflow */
88 SCHED_WARN_ON(dl_rq->running_bw > dl_rq->this_bw); 85 SCHED_WARN_ON(dl_rq->running_bw > dl_rq->this_bw);
89 /* kick cpufreq (see the comment in kernel/sched/sched.h). */ 86 /* kick cpufreq (see the comment in kernel/sched/sched.h). */
90 cpufreq_update_util(rq_of_dl_rq(dl_rq), SCHED_CPUFREQ_DL); 87 cpufreq_update_util(rq_of_dl_rq(dl_rq), 0);
91} 88}
92 89
93static inline 90static inline
@@ -101,7 +98,7 @@ void __sub_running_bw(u64 dl_bw, struct dl_rq *dl_rq)
101 if (dl_rq->running_bw > old) 98 if (dl_rq->running_bw > old)
102 dl_rq->running_bw = 0; 99 dl_rq->running_bw = 0;
103 /* kick cpufreq (see the comment in kernel/sched/sched.h). */ 100 /* kick cpufreq (see the comment in kernel/sched/sched.h). */
104 cpufreq_update_util(rq_of_dl_rq(dl_rq), SCHED_CPUFREQ_DL); 101 cpufreq_update_util(rq_of_dl_rq(dl_rq), 0);
105} 102}
106 103
107static inline 104static inline
@@ -514,7 +511,7 @@ static DEFINE_PER_CPU(struct callback_head, dl_pull_head);
514static void push_dl_tasks(struct rq *); 511static void push_dl_tasks(struct rq *);
515static void pull_dl_task(struct rq *); 512static void pull_dl_task(struct rq *);
516 513
517static inline void queue_push_tasks(struct rq *rq) 514static inline void deadline_queue_push_tasks(struct rq *rq)
518{ 515{
519 if (!has_pushable_dl_tasks(rq)) 516 if (!has_pushable_dl_tasks(rq))
520 return; 517 return;
@@ -522,7 +519,7 @@ static inline void queue_push_tasks(struct rq *rq)
522 queue_balance_callback(rq, &per_cpu(dl_push_head, rq->cpu), push_dl_tasks); 519 queue_balance_callback(rq, &per_cpu(dl_push_head, rq->cpu), push_dl_tasks);
523} 520}
524 521
525static inline void queue_pull_task(struct rq *rq) 522static inline void deadline_queue_pull_task(struct rq *rq)
526{ 523{
527 queue_balance_callback(rq, &per_cpu(dl_pull_head, rq->cpu), pull_dl_task); 524 queue_balance_callback(rq, &per_cpu(dl_pull_head, rq->cpu), pull_dl_task);
528} 525}
@@ -539,12 +536,12 @@ static struct rq *dl_task_offline_migration(struct rq *rq, struct task_struct *p
539 536
540 /* 537 /*
541 * If we cannot preempt any rq, fall back to pick any 538 * If we cannot preempt any rq, fall back to pick any
542 * online cpu. 539 * online CPU:
543 */ 540 */
544 cpu = cpumask_any_and(cpu_active_mask, &p->cpus_allowed); 541 cpu = cpumask_any_and(cpu_active_mask, &p->cpus_allowed);
545 if (cpu >= nr_cpu_ids) { 542 if (cpu >= nr_cpu_ids) {
546 /* 543 /*
547 * Fail to find any suitable cpu. 544 * Failed to find any suitable CPU.
548 * The task will never come back! 545 * The task will never come back!
549 */ 546 */
550 BUG_ON(dl_bandwidth_enabled()); 547 BUG_ON(dl_bandwidth_enabled());
@@ -597,19 +594,18 @@ static inline void pull_dl_task(struct rq *rq)
597{ 594{
598} 595}
599 596
600static inline void queue_push_tasks(struct rq *rq) 597static inline void deadline_queue_push_tasks(struct rq *rq)
601{ 598{
602} 599}
603 600
604static inline void queue_pull_task(struct rq *rq) 601static inline void deadline_queue_pull_task(struct rq *rq)
605{ 602{
606} 603}
607#endif /* CONFIG_SMP */ 604#endif /* CONFIG_SMP */
608 605
609static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags); 606static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags);
610static void __dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags); 607static void __dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags);
611static void check_preempt_curr_dl(struct rq *rq, struct task_struct *p, 608static void check_preempt_curr_dl(struct rq *rq, struct task_struct *p, int flags);
612 int flags);
613 609
614/* 610/*
615 * We are being explicitly informed that a new instance is starting, 611 * We are being explicitly informed that a new instance is starting,
@@ -1763,7 +1759,7 @@ pick_next_task_dl(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
1763 if (hrtick_enabled(rq)) 1759 if (hrtick_enabled(rq))
1764 start_hrtick_dl(rq, p); 1760 start_hrtick_dl(rq, p);
1765 1761
1766 queue_push_tasks(rq); 1762 deadline_queue_push_tasks(rq);
1767 1763
1768 return p; 1764 return p;
1769} 1765}
@@ -1776,6 +1772,14 @@ static void put_prev_task_dl(struct rq *rq, struct task_struct *p)
1776 enqueue_pushable_dl_task(rq, p); 1772 enqueue_pushable_dl_task(rq, p);
1777} 1773}
1778 1774
1775/*
1776 * scheduler tick hitting a task of our scheduling class.
1777 *
1778 * NOTE: This function can be called remotely by the tick offload that
1779 * goes along full dynticks. Therefore no local assumption can be made
1780 * and everything must be accessed through the @rq and @curr passed in
1781 * parameters.
1782 */
1779static void task_tick_dl(struct rq *rq, struct task_struct *p, int queued) 1783static void task_tick_dl(struct rq *rq, struct task_struct *p, int queued)
1780{ 1784{
1781 update_curr_dl(rq); 1785 update_curr_dl(rq);
@@ -1865,7 +1869,7 @@ static int find_later_rq(struct task_struct *task)
1865 1869
1866 /* 1870 /*
1867 * We have to consider system topology and task affinity 1871 * We have to consider system topology and task affinity
1868 * first, then we can look for a suitable cpu. 1872 * first, then we can look for a suitable CPU.
1869 */ 1873 */
1870 if (!cpudl_find(&task_rq(task)->rd->cpudl, task, later_mask)) 1874 if (!cpudl_find(&task_rq(task)->rd->cpudl, task, later_mask))
1871 return -1; 1875 return -1;
@@ -1879,7 +1883,7 @@ static int find_later_rq(struct task_struct *task)
1879 * Now we check how well this matches with task's 1883 * Now we check how well this matches with task's
1880 * affinity and system topology. 1884 * affinity and system topology.
1881 * 1885 *
1882 * The last cpu where the task run is our first 1886 * The last CPU where the task run is our first
1883 * guess, since it is most likely cache-hot there. 1887 * guess, since it is most likely cache-hot there.
1884 */ 1888 */
1885 if (cpumask_test_cpu(cpu, later_mask)) 1889 if (cpumask_test_cpu(cpu, later_mask))
@@ -1909,9 +1913,9 @@ static int find_later_rq(struct task_struct *task)
1909 best_cpu = cpumask_first_and(later_mask, 1913 best_cpu = cpumask_first_and(later_mask,
1910 sched_domain_span(sd)); 1914 sched_domain_span(sd));
1911 /* 1915 /*
1912 * Last chance: if a cpu being in both later_mask 1916 * Last chance: if a CPU being in both later_mask
1913 * and current sd span is valid, that becomes our 1917 * and current sd span is valid, that becomes our
1914 * choice. Of course, the latest possible cpu is 1918 * choice. Of course, the latest possible CPU is
1915 * already under consideration through later_mask. 1919 * already under consideration through later_mask.
1916 */ 1920 */
1917 if (best_cpu < nr_cpu_ids) { 1921 if (best_cpu < nr_cpu_ids) {
@@ -2067,7 +2071,7 @@ retry:
2067 if (task == next_task) { 2071 if (task == next_task) {
2068 /* 2072 /*
2069 * The task is still there. We don't try 2073 * The task is still there. We don't try
2070 * again, some other cpu will pull it when ready. 2074 * again, some other CPU will pull it when ready.
2071 */ 2075 */
2072 goto out; 2076 goto out;
2073 } 2077 }
@@ -2300,12 +2304,12 @@ static void switched_from_dl(struct rq *rq, struct task_struct *p)
2300 /* 2304 /*
2301 * Since this might be the only -deadline task on the rq, 2305 * Since this might be the only -deadline task on the rq,
2302 * this is the right place to try to pull some other one 2306 * this is the right place to try to pull some other one
2303 * from an overloaded cpu, if any. 2307 * from an overloaded CPU, if any.
2304 */ 2308 */
2305 if (!task_on_rq_queued(p) || rq->dl.dl_nr_running) 2309 if (!task_on_rq_queued(p) || rq->dl.dl_nr_running)
2306 return; 2310 return;
2307 2311
2308 queue_pull_task(rq); 2312 deadline_queue_pull_task(rq);
2309} 2313}
2310 2314
2311/* 2315/*
@@ -2327,7 +2331,7 @@ static void switched_to_dl(struct rq *rq, struct task_struct *p)
2327 if (rq->curr != p) { 2331 if (rq->curr != p) {
2328#ifdef CONFIG_SMP 2332#ifdef CONFIG_SMP
2329 if (p->nr_cpus_allowed > 1 && rq->dl.overloaded) 2333 if (p->nr_cpus_allowed > 1 && rq->dl.overloaded)
2330 queue_push_tasks(rq); 2334 deadline_queue_push_tasks(rq);
2331#endif 2335#endif
2332 if (dl_task(rq->curr)) 2336 if (dl_task(rq->curr))
2333 check_preempt_curr_dl(rq, p, 0); 2337 check_preempt_curr_dl(rq, p, 0);
@@ -2352,7 +2356,7 @@ static void prio_changed_dl(struct rq *rq, struct task_struct *p,
2352 * or lowering its prio, so... 2356 * or lowering its prio, so...
2353 */ 2357 */
2354 if (!rq->dl.overloaded) 2358 if (!rq->dl.overloaded)
2355 queue_pull_task(rq); 2359 deadline_queue_pull_task(rq);
2356 2360
2357 /* 2361 /*
2358 * If we now have a earlier deadline task than p, 2362 * If we now have a earlier deadline task than p,
@@ -2626,17 +2630,17 @@ void __dl_clear_params(struct task_struct *p)
2626{ 2630{
2627 struct sched_dl_entity *dl_se = &p->dl; 2631 struct sched_dl_entity *dl_se = &p->dl;
2628 2632
2629 dl_se->dl_runtime = 0; 2633 dl_se->dl_runtime = 0;
2630 dl_se->dl_deadline = 0; 2634 dl_se->dl_deadline = 0;
2631 dl_se->dl_period = 0; 2635 dl_se->dl_period = 0;
2632 dl_se->flags = 0; 2636 dl_se->flags = 0;
2633 dl_se->dl_bw = 0; 2637 dl_se->dl_bw = 0;
2634 dl_se->dl_density = 0; 2638 dl_se->dl_density = 0;
2635 2639
2636 dl_se->dl_throttled = 0; 2640 dl_se->dl_throttled = 0;
2637 dl_se->dl_yielded = 0; 2641 dl_se->dl_yielded = 0;
2638 dl_se->dl_non_contending = 0; 2642 dl_se->dl_non_contending = 0;
2639 dl_se->dl_overrun = 0; 2643 dl_se->dl_overrun = 0;
2640} 2644}
2641 2645
2642bool dl_param_changed(struct task_struct *p, const struct sched_attr *attr) 2646bool dl_param_changed(struct task_struct *p, const struct sched_attr *attr)
@@ -2655,21 +2659,22 @@ bool dl_param_changed(struct task_struct *p, const struct sched_attr *attr)
2655#ifdef CONFIG_SMP 2659#ifdef CONFIG_SMP
2656int dl_task_can_attach(struct task_struct *p, const struct cpumask *cs_cpus_allowed) 2660int dl_task_can_attach(struct task_struct *p, const struct cpumask *cs_cpus_allowed)
2657{ 2661{
2658 unsigned int dest_cpu = cpumask_any_and(cpu_active_mask, 2662 unsigned int dest_cpu;
2659 cs_cpus_allowed);
2660 struct dl_bw *dl_b; 2663 struct dl_bw *dl_b;
2661 bool overflow; 2664 bool overflow;
2662 int cpus, ret; 2665 int cpus, ret;
2663 unsigned long flags; 2666 unsigned long flags;
2664 2667
2668 dest_cpu = cpumask_any_and(cpu_active_mask, cs_cpus_allowed);
2669
2665 rcu_read_lock_sched(); 2670 rcu_read_lock_sched();
2666 dl_b = dl_bw_of(dest_cpu); 2671 dl_b = dl_bw_of(dest_cpu);
2667 raw_spin_lock_irqsave(&dl_b->lock, flags); 2672 raw_spin_lock_irqsave(&dl_b->lock, flags);
2668 cpus = dl_bw_cpus(dest_cpu); 2673 cpus = dl_bw_cpus(dest_cpu);
2669 overflow = __dl_overflow(dl_b, cpus, 0, p->dl.dl_bw); 2674 overflow = __dl_overflow(dl_b, cpus, 0, p->dl.dl_bw);
2670 if (overflow) 2675 if (overflow) {
2671 ret = -EBUSY; 2676 ret = -EBUSY;
2672 else { 2677 } else {
2673 /* 2678 /*
2674 * We reserve space for this task in the destination 2679 * We reserve space for this task in the destination
2675 * root_domain, as we can't fail after this point. 2680 * root_domain, as we can't fail after this point.
@@ -2681,6 +2686,7 @@ int dl_task_can_attach(struct task_struct *p, const struct cpumask *cs_cpus_allo
2681 } 2686 }
2682 raw_spin_unlock_irqrestore(&dl_b->lock, flags); 2687 raw_spin_unlock_irqrestore(&dl_b->lock, flags);
2683 rcu_read_unlock_sched(); 2688 rcu_read_unlock_sched();
2689
2684 return ret; 2690 return ret;
2685} 2691}
2686 2692
@@ -2701,6 +2707,7 @@ int dl_cpuset_cpumask_can_shrink(const struct cpumask *cur,
2701 ret = 0; 2707 ret = 0;
2702 raw_spin_unlock_irqrestore(&cur_dl_b->lock, flags); 2708 raw_spin_unlock_irqrestore(&cur_dl_b->lock, flags);
2703 rcu_read_unlock_sched(); 2709 rcu_read_unlock_sched();
2710
2704 return ret; 2711 return ret;
2705} 2712}
2706 2713
@@ -2718,6 +2725,7 @@ bool dl_cpu_busy(unsigned int cpu)
2718 overflow = __dl_overflow(dl_b, cpus, 0, 0); 2725 overflow = __dl_overflow(dl_b, cpus, 0, 0);
2719 raw_spin_unlock_irqrestore(&dl_b->lock, flags); 2726 raw_spin_unlock_irqrestore(&dl_b->lock, flags);
2720 rcu_read_unlock_sched(); 2727 rcu_read_unlock_sched();
2728
2721 return overflow; 2729 return overflow;
2722} 2730}
2723#endif 2731#endif
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 72c401b3b15c..15b10e210a6b 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -1,7 +1,7 @@
1/* 1/*
2 * kernel/sched/debug.c 2 * kernel/sched/debug.c
3 * 3 *
4 * Print the CFS rbtree 4 * Print the CFS rbtree and other debugging details
5 * 5 *
6 * Copyright(C) 2007, Red Hat, Inc., Ingo Molnar 6 * Copyright(C) 2007, Red Hat, Inc., Ingo Molnar
7 * 7 *
@@ -9,16 +9,6 @@
9 * it under the terms of the GNU General Public License version 2 as 9 * it under the terms of the GNU General Public License version 2 as
10 * published by the Free Software Foundation. 10 * published by the Free Software Foundation.
11 */ 11 */
12
13#include <linux/proc_fs.h>
14#include <linux/sched/mm.h>
15#include <linux/sched/task.h>
16#include <linux/seq_file.h>
17#include <linux/kallsyms.h>
18#include <linux/utsname.h>
19#include <linux/mempolicy.h>
20#include <linux/debugfs.h>
21
22#include "sched.h" 12#include "sched.h"
23 13
24static DEFINE_SPINLOCK(sched_debug_lock); 14static DEFINE_SPINLOCK(sched_debug_lock);
@@ -274,34 +264,19 @@ sd_alloc_ctl_domain_table(struct sched_domain *sd)
274 if (table == NULL) 264 if (table == NULL)
275 return NULL; 265 return NULL;
276 266
277 set_table_entry(&table[0], "min_interval", &sd->min_interval, 267 set_table_entry(&table[0] , "min_interval", &sd->min_interval, sizeof(long), 0644, proc_doulongvec_minmax, false);
278 sizeof(long), 0644, proc_doulongvec_minmax, false); 268 set_table_entry(&table[1] , "max_interval", &sd->max_interval, sizeof(long), 0644, proc_doulongvec_minmax, false);
279 set_table_entry(&table[1], "max_interval", &sd->max_interval, 269 set_table_entry(&table[2] , "busy_idx", &sd->busy_idx, sizeof(int) , 0644, proc_dointvec_minmax, true );
280 sizeof(long), 0644, proc_doulongvec_minmax, false); 270 set_table_entry(&table[3] , "idle_idx", &sd->idle_idx, sizeof(int) , 0644, proc_dointvec_minmax, true );
281 set_table_entry(&table[2], "busy_idx", &sd->busy_idx, 271 set_table_entry(&table[4] , "newidle_idx", &sd->newidle_idx, sizeof(int) , 0644, proc_dointvec_minmax, true );
282 sizeof(int), 0644, proc_dointvec_minmax, true); 272 set_table_entry(&table[5] , "wake_idx", &sd->wake_idx, sizeof(int) , 0644, proc_dointvec_minmax, true );
283 set_table_entry(&table[3], "idle_idx", &sd->idle_idx, 273 set_table_entry(&table[6] , "forkexec_idx", &sd->forkexec_idx, sizeof(int) , 0644, proc_dointvec_minmax, true );
284 sizeof(int), 0644, proc_dointvec_minmax, true); 274 set_table_entry(&table[7] , "busy_factor", &sd->busy_factor, sizeof(int) , 0644, proc_dointvec_minmax, false);
285 set_table_entry(&table[4], "newidle_idx", &sd->newidle_idx, 275 set_table_entry(&table[8] , "imbalance_pct", &sd->imbalance_pct, sizeof(int) , 0644, proc_dointvec_minmax, false);
286 sizeof(int), 0644, proc_dointvec_minmax, true); 276 set_table_entry(&table[9] , "cache_nice_tries", &sd->cache_nice_tries, sizeof(int) , 0644, proc_dointvec_minmax, false);
287 set_table_entry(&table[5], "wake_idx", &sd->wake_idx, 277 set_table_entry(&table[10], "flags", &sd->flags, sizeof(int) , 0644, proc_dointvec_minmax, false);
288 sizeof(int), 0644, proc_dointvec_minmax, true); 278 set_table_entry(&table[11], "max_newidle_lb_cost", &sd->max_newidle_lb_cost, sizeof(long), 0644, proc_doulongvec_minmax, false);
289 set_table_entry(&table[6], "forkexec_idx", &sd->forkexec_idx, 279 set_table_entry(&table[12], "name", sd->name, CORENAME_MAX_SIZE, 0444, proc_dostring, false);
290 sizeof(int), 0644, proc_dointvec_minmax, true);
291 set_table_entry(&table[7], "busy_factor", &sd->busy_factor,
292 sizeof(int), 0644, proc_dointvec_minmax, false);
293 set_table_entry(&table[8], "imbalance_pct", &sd->imbalance_pct,
294 sizeof(int), 0644, proc_dointvec_minmax, false);
295 set_table_entry(&table[9], "cache_nice_tries",
296 &sd->cache_nice_tries,
297 sizeof(int), 0644, proc_dointvec_minmax, false);
298 set_table_entry(&table[10], "flags", &sd->flags,
299 sizeof(int), 0644, proc_dointvec_minmax, false);
300 set_table_entry(&table[11], "max_newidle_lb_cost",
301 &sd->max_newidle_lb_cost,
302 sizeof(long), 0644, proc_doulongvec_minmax, false);
303 set_table_entry(&table[12], "name", sd->name,
304 CORENAME_MAX_SIZE, 0444, proc_dostring, false);
305 /* &table[13] is terminator */ 280 /* &table[13] is terminator */
306 281
307 return table; 282 return table;
@@ -332,8 +307,8 @@ static struct ctl_table *sd_alloc_ctl_cpu_table(int cpu)
332 return table; 307 return table;
333} 308}
334 309
335static cpumask_var_t sd_sysctl_cpus; 310static cpumask_var_t sd_sysctl_cpus;
336static struct ctl_table_header *sd_sysctl_header; 311static struct ctl_table_header *sd_sysctl_header;
337 312
338void register_sched_domain_sysctl(void) 313void register_sched_domain_sysctl(void)
339{ 314{
@@ -413,14 +388,10 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group
413{ 388{
414 struct sched_entity *se = tg->se[cpu]; 389 struct sched_entity *se = tg->se[cpu];
415 390
416#define P(F) \ 391#define P(F) SEQ_printf(m, " .%-30s: %lld\n", #F, (long long)F)
417 SEQ_printf(m, " .%-30s: %lld\n", #F, (long long)F) 392#define P_SCHEDSTAT(F) SEQ_printf(m, " .%-30s: %lld\n", #F, (long long)schedstat_val(F))
418#define P_SCHEDSTAT(F) \ 393#define PN(F) SEQ_printf(m, " .%-30s: %lld.%06ld\n", #F, SPLIT_NS((long long)F))
419 SEQ_printf(m, " .%-30s: %lld\n", #F, (long long)schedstat_val(F)) 394#define PN_SCHEDSTAT(F) SEQ_printf(m, " .%-30s: %lld.%06ld\n", #F, SPLIT_NS((long long)schedstat_val(F)))
420#define PN(F) \
421 SEQ_printf(m, " .%-30s: %lld.%06ld\n", #F, SPLIT_NS((long long)F))
422#define PN_SCHEDSTAT(F) \
423 SEQ_printf(m, " .%-30s: %lld.%06ld\n", #F, SPLIT_NS((long long)schedstat_val(F)))
424 395
425 if (!se) 396 if (!se)
426 return; 397 return;
@@ -428,6 +399,7 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group
428 PN(se->exec_start); 399 PN(se->exec_start);
429 PN(se->vruntime); 400 PN(se->vruntime);
430 PN(se->sum_exec_runtime); 401 PN(se->sum_exec_runtime);
402
431 if (schedstat_enabled()) { 403 if (schedstat_enabled()) {
432 PN_SCHEDSTAT(se->statistics.wait_start); 404 PN_SCHEDSTAT(se->statistics.wait_start);
433 PN_SCHEDSTAT(se->statistics.sleep_start); 405 PN_SCHEDSTAT(se->statistics.sleep_start);
@@ -440,6 +412,7 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group
440 PN_SCHEDSTAT(se->statistics.wait_sum); 412 PN_SCHEDSTAT(se->statistics.wait_sum);
441 P_SCHEDSTAT(se->statistics.wait_count); 413 P_SCHEDSTAT(se->statistics.wait_count);
442 } 414 }
415
443 P(se->load.weight); 416 P(se->load.weight);
444 P(se->runnable_weight); 417 P(se->runnable_weight);
445#ifdef CONFIG_SMP 418#ifdef CONFIG_SMP
@@ -464,6 +437,7 @@ static char *task_group_path(struct task_group *tg)
464 return group_path; 437 return group_path;
465 438
466 cgroup_path(tg->css.cgroup, group_path, PATH_MAX); 439 cgroup_path(tg->css.cgroup, group_path, PATH_MAX);
440
467 return group_path; 441 return group_path;
468} 442}
469#endif 443#endif
@@ -569,6 +543,8 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
569 cfs_rq->avg.runnable_load_avg); 543 cfs_rq->avg.runnable_load_avg);
570 SEQ_printf(m, " .%-30s: %lu\n", "util_avg", 544 SEQ_printf(m, " .%-30s: %lu\n", "util_avg",
571 cfs_rq->avg.util_avg); 545 cfs_rq->avg.util_avg);
546 SEQ_printf(m, " .%-30s: %u\n", "util_est_enqueued",
547 cfs_rq->avg.util_est.enqueued);
572 SEQ_printf(m, " .%-30s: %ld\n", "removed.load_avg", 548 SEQ_printf(m, " .%-30s: %ld\n", "removed.load_avg",
573 cfs_rq->removed.load_avg); 549 cfs_rq->removed.load_avg);
574 SEQ_printf(m, " .%-30s: %ld\n", "removed.util_avg", 550 SEQ_printf(m, " .%-30s: %ld\n", "removed.util_avg",
@@ -804,9 +780,9 @@ void sysrq_sched_debug_show(void)
804/* 780/*
805 * This iterator needs some explanation. 781 * This iterator needs some explanation.
806 * It returns 1 for the header position. 782 * It returns 1 for the header position.
807 * This means 2 is cpu 0. 783 * This means 2 is CPU 0.
808 * In a hotplugged system some cpus, including cpu 0, may be missing so we have 784 * In a hotplugged system some CPUs, including CPU 0, may be missing so we have
809 * to use cpumask_* to iterate over the cpus. 785 * to use cpumask_* to iterate over the CPUs.
810 */ 786 */
811static void *sched_debug_start(struct seq_file *file, loff_t *offset) 787static void *sched_debug_start(struct seq_file *file, loff_t *offset)
812{ 788{
@@ -826,6 +802,7 @@ static void *sched_debug_start(struct seq_file *file, loff_t *offset)
826 802
827 if (n < nr_cpu_ids) 803 if (n < nr_cpu_ids)
828 return (void *)(unsigned long)(n + 2); 804 return (void *)(unsigned long)(n + 2);
805
829 return NULL; 806 return NULL;
830} 807}
831 808
@@ -840,10 +817,10 @@ static void sched_debug_stop(struct seq_file *file, void *data)
840} 817}
841 818
842static const struct seq_operations sched_debug_sops = { 819static const struct seq_operations sched_debug_sops = {
843 .start = sched_debug_start, 820 .start = sched_debug_start,
844 .next = sched_debug_next, 821 .next = sched_debug_next,
845 .stop = sched_debug_stop, 822 .stop = sched_debug_stop,
846 .show = sched_debug_show, 823 .show = sched_debug_show,
847}; 824};
848 825
849static int sched_debug_release(struct inode *inode, struct file *file) 826static int sched_debug_release(struct inode *inode, struct file *file)
@@ -881,14 +858,10 @@ static int __init init_sched_debug_procfs(void)
881 858
882__initcall(init_sched_debug_procfs); 859__initcall(init_sched_debug_procfs);
883 860
884#define __P(F) \ 861#define __P(F) SEQ_printf(m, "%-45s:%21Ld\n", #F, (long long)F)
885 SEQ_printf(m, "%-45s:%21Ld\n", #F, (long long)F) 862#define P(F) SEQ_printf(m, "%-45s:%21Ld\n", #F, (long long)p->F)
886#define P(F) \ 863#define __PN(F) SEQ_printf(m, "%-45s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)F))
887 SEQ_printf(m, "%-45s:%21Ld\n", #F, (long long)p->F) 864#define PN(F) SEQ_printf(m, "%-45s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)p->F))
888#define __PN(F) \
889 SEQ_printf(m, "%-45s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)F))
890#define PN(F) \
891 SEQ_printf(m, "%-45s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)p->F))
892 865
893 866
894#ifdef CONFIG_NUMA_BALANCING 867#ifdef CONFIG_NUMA_BALANCING
@@ -1023,6 +996,8 @@ void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns,
1023 P(se.avg.runnable_load_avg); 996 P(se.avg.runnable_load_avg);
1024 P(se.avg.util_avg); 997 P(se.avg.util_avg);
1025 P(se.avg.last_update_time); 998 P(se.avg.last_update_time);
999 P(se.avg.util_est.ewma);
1000 P(se.avg.util_est.enqueued);
1026#endif 1001#endif
1027 P(policy); 1002 P(policy);
1028 P(prio); 1003 P(prio);
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 5eb3ffc9be84..0951d1c58d2f 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -20,25 +20,10 @@
20 * Adaptive scheduling granularity, math enhancements by Peter Zijlstra 20 * Adaptive scheduling granularity, math enhancements by Peter Zijlstra
21 * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra 21 * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra
22 */ 22 */
23 23#include "sched.h"
24#include <linux/sched/mm.h>
25#include <linux/sched/topology.h>
26
27#include <linux/latencytop.h>
28#include <linux/cpumask.h>
29#include <linux/cpuidle.h>
30#include <linux/slab.h>
31#include <linux/profile.h>
32#include <linux/interrupt.h>
33#include <linux/mempolicy.h>
34#include <linux/migrate.h>
35#include <linux/task_work.h>
36#include <linux/sched/isolation.h>
37 24
38#include <trace/events/sched.h> 25#include <trace/events/sched.h>
39 26
40#include "sched.h"
41
42/* 27/*
43 * Targeted preemption latency for CPU-bound tasks: 28 * Targeted preemption latency for CPU-bound tasks:
44 * 29 *
@@ -103,7 +88,7 @@ const_debug unsigned int sysctl_sched_migration_cost = 500000UL;
103 88
104#ifdef CONFIG_SMP 89#ifdef CONFIG_SMP
105/* 90/*
106 * For asym packing, by default the lower numbered cpu has higher priority. 91 * For asym packing, by default the lower numbered CPU has higher priority.
107 */ 92 */
108int __weak arch_asym_cpu_priority(int cpu) 93int __weak arch_asym_cpu_priority(int cpu)
109{ 94{
@@ -787,7 +772,7 @@ void post_init_entity_util_avg(struct sched_entity *se)
787 * For !fair tasks do: 772 * For !fair tasks do:
788 * 773 *
789 update_cfs_rq_load_avg(now, cfs_rq); 774 update_cfs_rq_load_avg(now, cfs_rq);
790 attach_entity_load_avg(cfs_rq, se); 775 attach_entity_load_avg(cfs_rq, se, 0);
791 switched_from_fair(rq, p); 776 switched_from_fair(rq, p);
792 * 777 *
793 * such that the next switched_to_fair() has the 778 * such that the next switched_to_fair() has the
@@ -1181,7 +1166,7 @@ pid_t task_numa_group_id(struct task_struct *p)
1181} 1166}
1182 1167
1183/* 1168/*
1184 * The averaged statistics, shared & private, memory & cpu, 1169 * The averaged statistics, shared & private, memory & CPU,
1185 * occupy the first half of the array. The second half of the 1170 * occupy the first half of the array. The second half of the
1186 * array is for current counters, which are averaged into the 1171 * array is for current counters, which are averaged into the
1187 * first set by task_numa_placement. 1172 * first set by task_numa_placement.
@@ -1587,7 +1572,7 @@ static void task_numa_compare(struct task_numa_env *env,
1587 * be incurred if the tasks were swapped. 1572 * be incurred if the tasks were swapped.
1588 */ 1573 */
1589 if (cur) { 1574 if (cur) {
1590 /* Skip this swap candidate if cannot move to the source cpu */ 1575 /* Skip this swap candidate if cannot move to the source CPU: */
1591 if (!cpumask_test_cpu(env->src_cpu, &cur->cpus_allowed)) 1576 if (!cpumask_test_cpu(env->src_cpu, &cur->cpus_allowed))
1592 goto unlock; 1577 goto unlock;
1593 1578
@@ -1631,7 +1616,7 @@ static void task_numa_compare(struct task_numa_env *env,
1631 goto balance; 1616 goto balance;
1632 } 1617 }
1633 1618
1634 /* Balance doesn't matter much if we're running a task per cpu */ 1619 /* Balance doesn't matter much if we're running a task per CPU: */
1635 if (imp > env->best_imp && src_rq->nr_running == 1 && 1620 if (imp > env->best_imp && src_rq->nr_running == 1 &&
1636 dst_rq->nr_running == 1) 1621 dst_rq->nr_running == 1)
1637 goto assign; 1622 goto assign;
@@ -1676,7 +1661,7 @@ balance:
1676 */ 1661 */
1677 if (!cur) { 1662 if (!cur) {
1678 /* 1663 /*
1679 * select_idle_siblings() uses a per-cpu cpumask that 1664 * select_idle_siblings() uses a per-CPU cpumask that
1680 * can be used from IRQ context. 1665 * can be used from IRQ context.
1681 */ 1666 */
1682 local_irq_disable(); 1667 local_irq_disable();
@@ -1869,6 +1854,7 @@ static int task_numa_migrate(struct task_struct *p)
1869static void numa_migrate_preferred(struct task_struct *p) 1854static void numa_migrate_preferred(struct task_struct *p)
1870{ 1855{
1871 unsigned long interval = HZ; 1856 unsigned long interval = HZ;
1857 unsigned long numa_migrate_retry;
1872 1858
1873 /* This task has no NUMA fault statistics yet */ 1859 /* This task has no NUMA fault statistics yet */
1874 if (unlikely(p->numa_preferred_nid == -1 || !p->numa_faults)) 1860 if (unlikely(p->numa_preferred_nid == -1 || !p->numa_faults))
@@ -1876,7 +1862,18 @@ static void numa_migrate_preferred(struct task_struct *p)
1876 1862
1877 /* Periodically retry migrating the task to the preferred node */ 1863 /* Periodically retry migrating the task to the preferred node */
1878 interval = min(interval, msecs_to_jiffies(p->numa_scan_period) / 16); 1864 interval = min(interval, msecs_to_jiffies(p->numa_scan_period) / 16);
1879 p->numa_migrate_retry = jiffies + interval; 1865 numa_migrate_retry = jiffies + interval;
1866
1867 /*
1868 * Check that the new retry threshold is after the current one. If
1869 * the retry is in the future, it implies that wake_affine has
1870 * temporarily asked NUMA balancing to backoff from placement.
1871 */
1872 if (numa_migrate_retry > p->numa_migrate_retry)
1873 return;
1874
1875 /* Safe to try placing the task on the preferred node */
1876 p->numa_migrate_retry = numa_migrate_retry;
1880 1877
1881 /* Success if task is already running on preferred CPU */ 1878 /* Success if task is already running on preferred CPU */
1882 if (task_node(p) == p->numa_preferred_nid) 1879 if (task_node(p) == p->numa_preferred_nid)
@@ -2823,7 +2820,7 @@ void reweight_task(struct task_struct *p, int prio)
2823} 2820}
2824 2821
2825#ifdef CONFIG_FAIR_GROUP_SCHED 2822#ifdef CONFIG_FAIR_GROUP_SCHED
2826# ifdef CONFIG_SMP 2823#ifdef CONFIG_SMP
2827/* 2824/*
2828 * All this does is approximate the hierarchical proportion which includes that 2825 * All this does is approximate the hierarchical proportion which includes that
2829 * global sum we all love to hate. 2826 * global sum we all love to hate.
@@ -2974,7 +2971,7 @@ static long calc_group_runnable(struct cfs_rq *cfs_rq, long shares)
2974 2971
2975 return clamp_t(long, runnable, MIN_SHARES, shares); 2972 return clamp_t(long, runnable, MIN_SHARES, shares);
2976} 2973}
2977# endif /* CONFIG_SMP */ 2974#endif /* CONFIG_SMP */
2978 2975
2979static inline int throttled_hierarchy(struct cfs_rq *cfs_rq); 2976static inline int throttled_hierarchy(struct cfs_rq *cfs_rq);
2980 2977
@@ -3012,11 +3009,11 @@ static inline void update_cfs_group(struct sched_entity *se)
3012} 3009}
3013#endif /* CONFIG_FAIR_GROUP_SCHED */ 3010#endif /* CONFIG_FAIR_GROUP_SCHED */
3014 3011
3015static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq) 3012static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq, int flags)
3016{ 3013{
3017 struct rq *rq = rq_of(cfs_rq); 3014 struct rq *rq = rq_of(cfs_rq);
3018 3015
3019 if (&rq->cfs == cfs_rq) { 3016 if (&rq->cfs == cfs_rq || (flags & SCHED_CPUFREQ_MIGRATION)) {
3020 /* 3017 /*
3021 * There are a few boundary cases this might miss but it should 3018 * There are a few boundary cases this might miss but it should
3022 * get called often enough that that should (hopefully) not be 3019 * get called often enough that that should (hopefully) not be
@@ -3031,7 +3028,7 @@ static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq)
3031 * 3028 *
3032 * See cpu_util(). 3029 * See cpu_util().
3033 */ 3030 */
3034 cpufreq_update_util(rq, 0); 3031 cpufreq_update_util(rq, flags);
3035 } 3032 }
3036} 3033}
3037 3034
@@ -3246,6 +3243,32 @@ ___update_load_avg(struct sched_avg *sa, unsigned long load, unsigned long runna
3246} 3243}
3247 3244
3248/* 3245/*
3246 * When a task is dequeued, its estimated utilization should not be updated if
3247 * its util_avg has not been updated at least once.
3248 * This flag is used to synchronize util_avg updates with util_est updates.
3249 * We map this information into the LSB bit of the utilization saved at
3250 * dequeue time (i.e. util_est.dequeued).
3251 */
3252#define UTIL_AVG_UNCHANGED 0x1
3253
3254static inline void cfs_se_util_change(struct sched_avg *avg)
3255{
3256 unsigned int enqueued;
3257
3258 if (!sched_feat(UTIL_EST))
3259 return;
3260
3261 /* Avoid store if the flag has been already set */
3262 enqueued = avg->util_est.enqueued;
3263 if (!(enqueued & UTIL_AVG_UNCHANGED))
3264 return;
3265
3266 /* Reset flag to report util_avg has been updated */
3267 enqueued &= ~UTIL_AVG_UNCHANGED;
3268 WRITE_ONCE(avg->util_est.enqueued, enqueued);
3269}
3270
3271/*
3249 * sched_entity: 3272 * sched_entity:
3250 * 3273 *
3251 * task: 3274 * task:
@@ -3296,6 +3319,7 @@ __update_load_avg_se(u64 now, int cpu, struct cfs_rq *cfs_rq, struct sched_entit
3296 cfs_rq->curr == se)) { 3319 cfs_rq->curr == se)) {
3297 3320
3298 ___update_load_avg(&se->avg, se_weight(se), se_runnable(se)); 3321 ___update_load_avg(&se->avg, se_weight(se), se_runnable(se));
3322 cfs_se_util_change(&se->avg);
3299 return 1; 3323 return 1;
3300 } 3324 }
3301 3325
@@ -3350,7 +3374,7 @@ static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force)
3350} 3374}
3351 3375
3352/* 3376/*
3353 * Called within set_task_rq() right before setting a task's cpu. The 3377 * Called within set_task_rq() right before setting a task's CPU. The
3354 * caller only guarantees p->pi_lock is held; no other assumptions, 3378 * caller only guarantees p->pi_lock is held; no other assumptions,
3355 * including the state of rq->lock, should be made. 3379 * including the state of rq->lock, should be made.
3356 */ 3380 */
@@ -3529,7 +3553,7 @@ update_tg_cfs_runnable(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cf
3529 3553
3530 /* 3554 /*
3531 * runnable_sum can't be lower than running_sum 3555 * runnable_sum can't be lower than running_sum
3532 * As running sum is scaled with cpu capacity whereas the runnable sum 3556 * As running sum is scaled with CPU capacity whereas the runnable sum
3533 * is not we rescale running_sum 1st 3557 * is not we rescale running_sum 1st
3534 */ 3558 */
3535 running_sum = se->avg.util_sum / 3559 running_sum = se->avg.util_sum /
@@ -3689,7 +3713,7 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
3689#endif 3713#endif
3690 3714
3691 if (decayed) 3715 if (decayed)
3692 cfs_rq_util_change(cfs_rq); 3716 cfs_rq_util_change(cfs_rq, 0);
3693 3717
3694 return decayed; 3718 return decayed;
3695} 3719}
@@ -3702,7 +3726,7 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
3702 * Must call update_cfs_rq_load_avg() before this, since we rely on 3726 * Must call update_cfs_rq_load_avg() before this, since we rely on
3703 * cfs_rq->avg.last_update_time being current. 3727 * cfs_rq->avg.last_update_time being current.
3704 */ 3728 */
3705static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) 3729static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
3706{ 3730{
3707 u32 divider = LOAD_AVG_MAX - 1024 + cfs_rq->avg.period_contrib; 3731 u32 divider = LOAD_AVG_MAX - 1024 + cfs_rq->avg.period_contrib;
3708 3732
@@ -3738,7 +3762,7 @@ static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s
3738 3762
3739 add_tg_cfs_propagate(cfs_rq, se->avg.load_sum); 3763 add_tg_cfs_propagate(cfs_rq, se->avg.load_sum);
3740 3764
3741 cfs_rq_util_change(cfs_rq); 3765 cfs_rq_util_change(cfs_rq, flags);
3742} 3766}
3743 3767
3744/** 3768/**
@@ -3757,7 +3781,7 @@ static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s
3757 3781
3758 add_tg_cfs_propagate(cfs_rq, -se->avg.load_sum); 3782 add_tg_cfs_propagate(cfs_rq, -se->avg.load_sum);
3759 3783
3760 cfs_rq_util_change(cfs_rq); 3784 cfs_rq_util_change(cfs_rq, 0);
3761} 3785}
3762 3786
3763/* 3787/*
@@ -3787,7 +3811,14 @@ static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s
3787 3811
3788 if (!se->avg.last_update_time && (flags & DO_ATTACH)) { 3812 if (!se->avg.last_update_time && (flags & DO_ATTACH)) {
3789 3813
3790 attach_entity_load_avg(cfs_rq, se); 3814 /*
3815 * DO_ATTACH means we're here from enqueue_entity().
3816 * !last_update_time means we've passed through
3817 * migrate_task_rq_fair() indicating we migrated.
3818 *
3819 * IOW we're enqueueing a task on a new CPU.
3820 */
3821 attach_entity_load_avg(cfs_rq, se, SCHED_CPUFREQ_MIGRATION);
3791 update_tg_load_avg(cfs_rq, 0); 3822 update_tg_load_avg(cfs_rq, 0);
3792 3823
3793 } else if (decayed && (flags & UPDATE_TG)) 3824 } else if (decayed && (flags & UPDATE_TG))
@@ -3869,6 +3900,120 @@ static inline unsigned long cfs_rq_load_avg(struct cfs_rq *cfs_rq)
3869 3900
3870static int idle_balance(struct rq *this_rq, struct rq_flags *rf); 3901static int idle_balance(struct rq *this_rq, struct rq_flags *rf);
3871 3902
3903static inline unsigned long task_util(struct task_struct *p)
3904{
3905 return READ_ONCE(p->se.avg.util_avg);
3906}
3907
3908static inline unsigned long _task_util_est(struct task_struct *p)
3909{
3910 struct util_est ue = READ_ONCE(p->se.avg.util_est);
3911
3912 return max(ue.ewma, ue.enqueued);
3913}
3914
3915static inline unsigned long task_util_est(struct task_struct *p)
3916{
3917 return max(task_util(p), _task_util_est(p));
3918}
3919
3920static inline void util_est_enqueue(struct cfs_rq *cfs_rq,
3921 struct task_struct *p)
3922{
3923 unsigned int enqueued;
3924
3925 if (!sched_feat(UTIL_EST))
3926 return;
3927
3928 /* Update root cfs_rq's estimated utilization */
3929 enqueued = cfs_rq->avg.util_est.enqueued;
3930 enqueued += (_task_util_est(p) | UTIL_AVG_UNCHANGED);
3931 WRITE_ONCE(cfs_rq->avg.util_est.enqueued, enqueued);
3932}
3933
3934/*
3935 * Check if a (signed) value is within a specified (unsigned) margin,
3936 * based on the observation that:
3937 *
3938 * abs(x) < y := (unsigned)(x + y - 1) < (2 * y - 1)
3939 *
3940 * NOTE: this only works when value + margin < INT_MAX.
3941 */
3942static inline bool within_margin(int value, int margin)
3943{
3944 return ((unsigned int)(value + margin - 1) < (2 * margin - 1));
3945}
3946
3947static void
3948util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p, bool task_sleep)
3949{
3950 long last_ewma_diff;
3951 struct util_est ue;
3952
3953 if (!sched_feat(UTIL_EST))
3954 return;
3955
3956 /*
3957 * Update root cfs_rq's estimated utilization
3958 *
3959 * If *p is the last task then the root cfs_rq's estimated utilization
3960 * of a CPU is 0 by definition.
3961 */
3962 ue.enqueued = 0;
3963 if (cfs_rq->nr_running) {
3964 ue.enqueued = cfs_rq->avg.util_est.enqueued;
3965 ue.enqueued -= min_t(unsigned int, ue.enqueued,
3966 (_task_util_est(p) | UTIL_AVG_UNCHANGED));
3967 }
3968 WRITE_ONCE(cfs_rq->avg.util_est.enqueued, ue.enqueued);
3969
3970 /*
3971 * Skip update of task's estimated utilization when the task has not
3972 * yet completed an activation, e.g. being migrated.
3973 */
3974 if (!task_sleep)
3975 return;
3976
3977 /*
3978 * If the PELT values haven't changed since enqueue time,
3979 * skip the util_est update.
3980 */
3981 ue = p->se.avg.util_est;
3982 if (ue.enqueued & UTIL_AVG_UNCHANGED)
3983 return;
3984
3985 /*
3986 * Skip update of task's estimated utilization when its EWMA is
3987 * already ~1% close to its last activation value.
3988 */
3989 ue.enqueued = (task_util(p) | UTIL_AVG_UNCHANGED);
3990 last_ewma_diff = ue.enqueued - ue.ewma;
3991 if (within_margin(last_ewma_diff, (SCHED_CAPACITY_SCALE / 100)))
3992 return;
3993
3994 /*
3995 * Update Task's estimated utilization
3996 *
3997 * When *p completes an activation we can consolidate another sample
3998 * of the task size. This is done by storing the current PELT value
3999 * as ue.enqueued and by using this value to update the Exponential
4000 * Weighted Moving Average (EWMA):
4001 *
4002 * ewma(t) = w * task_util(p) + (1-w) * ewma(t-1)
4003 * = w * task_util(p) + ewma(t-1) - w * ewma(t-1)
4004 * = w * (task_util(p) - ewma(t-1)) + ewma(t-1)
4005 * = w * ( last_ewma_diff ) + ewma(t-1)
4006 * = w * (last_ewma_diff + ewma(t-1) / w)
4007 *
4008 * Where 'w' is the weight of new samples, which is configured to be
4009 * 0.25, thus making w=1/4 ( >>= UTIL_EST_WEIGHT_SHIFT)
4010 */
4011 ue.ewma <<= UTIL_EST_WEIGHT_SHIFT;
4012 ue.ewma += last_ewma_diff;
4013 ue.ewma >>= UTIL_EST_WEIGHT_SHIFT;
4014 WRITE_ONCE(p->se.avg.util_est, ue);
4015}
4016
3872#else /* CONFIG_SMP */ 4017#else /* CONFIG_SMP */
3873 4018
3874static inline int 4019static inline int
@@ -3883,13 +4028,13 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
3883 4028
3884static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se, int not_used1) 4029static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se, int not_used1)
3885{ 4030{
3886 cfs_rq_util_change(cfs_rq); 4031 cfs_rq_util_change(cfs_rq, 0);
3887} 4032}
3888 4033
3889static inline void remove_entity_load_avg(struct sched_entity *se) {} 4034static inline void remove_entity_load_avg(struct sched_entity *se) {}
3890 4035
3891static inline void 4036static inline void
3892attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {} 4037attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) {}
3893static inline void 4038static inline void
3894detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {} 4039detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
3895 4040
@@ -3898,6 +4043,13 @@ static inline int idle_balance(struct rq *rq, struct rq_flags *rf)
3898 return 0; 4043 return 0;
3899} 4044}
3900 4045
4046static inline void
4047util_est_enqueue(struct cfs_rq *cfs_rq, struct task_struct *p) {}
4048
4049static inline void
4050util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p,
4051 bool task_sleep) {}
4052
3901#endif /* CONFIG_SMP */ 4053#endif /* CONFIG_SMP */
3902 4054
3903static void check_spread(struct cfs_rq *cfs_rq, struct sched_entity *se) 4055static void check_spread(struct cfs_rq *cfs_rq, struct sched_entity *se)
@@ -4676,7 +4828,7 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
4676 if (!se) 4828 if (!se)
4677 add_nr_running(rq, task_delta); 4829 add_nr_running(rq, task_delta);
4678 4830
4679 /* determine whether we need to wake up potentially idle cpu */ 4831 /* Determine whether we need to wake up potentially idle CPU: */
4680 if (rq->curr == rq->idle && rq->cfs.nr_running) 4832 if (rq->curr == rq->idle && rq->cfs.nr_running)
4681 resched_curr(rq); 4833 resched_curr(rq);
4682} 4834}
@@ -5041,7 +5193,7 @@ static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
5041} 5193}
5042 5194
5043/* 5195/*
5044 * Both these cpu hotplug callbacks race against unregister_fair_sched_group() 5196 * Both these CPU hotplug callbacks race against unregister_fair_sched_group()
5045 * 5197 *
5046 * The race is harmless, since modifying bandwidth settings of unhooked group 5198 * The race is harmless, since modifying bandwidth settings of unhooked group
5047 * bits doesn't do much. 5199 * bits doesn't do much.
@@ -5086,7 +5238,7 @@ static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq)
5086 */ 5238 */
5087 cfs_rq->runtime_remaining = 1; 5239 cfs_rq->runtime_remaining = 1;
5088 /* 5240 /*
5089 * Offline rq is schedulable till cpu is completely disabled 5241 * Offline rq is schedulable till CPU is completely disabled
5090 * in take_cpu_down(), so we prevent new cfs throttling here. 5242 * in take_cpu_down(), so we prevent new cfs throttling here.
5091 */ 5243 */
5092 cfs_rq->runtime_enabled = 0; 5244 cfs_rq->runtime_enabled = 0;
@@ -5245,6 +5397,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
5245 if (!se) 5397 if (!se)
5246 add_nr_running(rq, 1); 5398 add_nr_running(rq, 1);
5247 5399
5400 util_est_enqueue(&rq->cfs, p);
5248 hrtick_update(rq); 5401 hrtick_update(rq);
5249} 5402}
5250 5403
@@ -5304,6 +5457,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
5304 if (!se) 5457 if (!se)
5305 sub_nr_running(rq, 1); 5458 sub_nr_running(rq, 1);
5306 5459
5460 util_est_dequeue(&rq->cfs, p, task_sleep);
5307 hrtick_update(rq); 5461 hrtick_update(rq);
5308} 5462}
5309 5463
@@ -5323,8 +5477,8 @@ DEFINE_PER_CPU(cpumask_var_t, select_idle_mask);
5323 * 5477 *
5324 * load' = (1 - 1/2^i) * load + (1/2^i) * cur_load 5478 * load' = (1 - 1/2^i) * load + (1/2^i) * cur_load
5325 * 5479 *
5326 * If a cpu misses updates for n ticks (as it was idle) and update gets 5480 * If a CPU misses updates for n ticks (as it was idle) and update gets
5327 * called on the n+1-th tick when cpu may be busy, then we have: 5481 * called on the n+1-th tick when CPU may be busy, then we have:
5328 * 5482 *
5329 * load_n = (1 - 1/2^i)^n * load_0 5483 * load_n = (1 - 1/2^i)^n * load_0
5330 * load_n+1 = (1 - 1/2^i) * load_n + (1/2^i) * cur_load 5484 * load_n+1 = (1 - 1/2^i) * load_n + (1/2^i) * cur_load
@@ -5379,6 +5533,15 @@ decay_load_missed(unsigned long load, unsigned long missed_updates, int idx)
5379 } 5533 }
5380 return load; 5534 return load;
5381} 5535}
5536
5537static struct {
5538 cpumask_var_t idle_cpus_mask;
5539 atomic_t nr_cpus;
5540 int has_blocked; /* Idle CPUS has blocked load */
5541 unsigned long next_balance; /* in jiffy units */
5542 unsigned long next_blocked; /* Next update of blocked load in jiffies */
5543} nohz ____cacheline_aligned;
5544
5382#endif /* CONFIG_NO_HZ_COMMON */ 5545#endif /* CONFIG_NO_HZ_COMMON */
5383 5546
5384/** 5547/**
@@ -5468,7 +5631,7 @@ static unsigned long weighted_cpuload(struct rq *rq)
5468#ifdef CONFIG_NO_HZ_COMMON 5631#ifdef CONFIG_NO_HZ_COMMON
5469/* 5632/*
5470 * There is no sane way to deal with nohz on smp when using jiffies because the 5633 * There is no sane way to deal with nohz on smp when using jiffies because the
5471 * cpu doing the jiffies update might drift wrt the cpu doing the jiffy reading 5634 * CPU doing the jiffies update might drift wrt the CPU doing the jiffy reading
5472 * causing off-by-one errors in observed deltas; {0,2} instead of {1,1}. 5635 * causing off-by-one errors in observed deltas; {0,2} instead of {1,1}.
5473 * 5636 *
5474 * Therefore we need to avoid the delta approach from the regular tick when 5637 * Therefore we need to avoid the delta approach from the regular tick when
@@ -5579,7 +5742,7 @@ void cpu_load_update_active(struct rq *this_rq)
5579} 5742}
5580 5743
5581/* 5744/*
5582 * Return a low guess at the load of a migration-source cpu weighted 5745 * Return a low guess at the load of a migration-source CPU weighted
5583 * according to the scheduling class and "nice" value. 5746 * according to the scheduling class and "nice" value.
5584 * 5747 *
5585 * We want to under-estimate the load of migration sources, to 5748 * We want to under-estimate the load of migration sources, to
@@ -5597,7 +5760,7 @@ static unsigned long source_load(int cpu, int type)
5597} 5760}
5598 5761
5599/* 5762/*
5600 * Return a high guess at the load of a migration-target cpu weighted 5763 * Return a high guess at the load of a migration-target CPU weighted
5601 * according to the scheduling class and "nice" value. 5764 * according to the scheduling class and "nice" value.
5602 */ 5765 */
5603static unsigned long target_load(int cpu, int type) 5766static unsigned long target_load(int cpu, int type)
@@ -5724,7 +5887,6 @@ wake_affine_weight(struct sched_domain *sd, struct task_struct *p,
5724 unsigned long task_load; 5887 unsigned long task_load;
5725 5888
5726 this_eff_load = target_load(this_cpu, sd->wake_idx); 5889 this_eff_load = target_load(this_cpu, sd->wake_idx);
5727 prev_eff_load = source_load(prev_cpu, sd->wake_idx);
5728 5890
5729 if (sync) { 5891 if (sync) {
5730 unsigned long current_load = task_h_load(current); 5892 unsigned long current_load = task_h_load(current);
@@ -5742,18 +5904,69 @@ wake_affine_weight(struct sched_domain *sd, struct task_struct *p,
5742 this_eff_load *= 100; 5904 this_eff_load *= 100;
5743 this_eff_load *= capacity_of(prev_cpu); 5905 this_eff_load *= capacity_of(prev_cpu);
5744 5906
5907 prev_eff_load = source_load(prev_cpu, sd->wake_idx);
5745 prev_eff_load -= task_load; 5908 prev_eff_load -= task_load;
5746 if (sched_feat(WA_BIAS)) 5909 if (sched_feat(WA_BIAS))
5747 prev_eff_load *= 100 + (sd->imbalance_pct - 100) / 2; 5910 prev_eff_load *= 100 + (sd->imbalance_pct - 100) / 2;
5748 prev_eff_load *= capacity_of(this_cpu); 5911 prev_eff_load *= capacity_of(this_cpu);
5749 5912
5750 return this_eff_load <= prev_eff_load ? this_cpu : nr_cpumask_bits; 5913 /*
5914 * If sync, adjust the weight of prev_eff_load such that if
5915 * prev_eff == this_eff that select_idle_sibling() will consider
5916 * stacking the wakee on top of the waker if no other CPU is
5917 * idle.
5918 */
5919 if (sync)
5920 prev_eff_load += 1;
5921
5922 return this_eff_load < prev_eff_load ? this_cpu : nr_cpumask_bits;
5923}
5924
5925#ifdef CONFIG_NUMA_BALANCING
5926static void
5927update_wa_numa_placement(struct task_struct *p, int prev_cpu, int target)
5928{
5929 unsigned long interval;
5930
5931 if (!static_branch_likely(&sched_numa_balancing))
5932 return;
5933
5934 /* If balancing has no preference then continue gathering data */
5935 if (p->numa_preferred_nid == -1)
5936 return;
5937
5938 /*
5939 * If the wakeup is not affecting locality then it is neutral from
5940 * the perspective of NUMA balancing so continue gathering data.
5941 */
5942 if (cpu_to_node(prev_cpu) == cpu_to_node(target))
5943 return;
5944
5945 /*
5946 * Temporarily prevent NUMA balancing trying to place waker/wakee after
5947 * wakee has been moved by wake_affine. This will potentially allow
5948 * related tasks to converge and update their data placement. The
5949 * 4 * numa_scan_period is to allow the two-pass filter to migrate
5950 * hot data to the wakers node.
5951 */
5952 interval = max(sysctl_numa_balancing_scan_delay,
5953 p->numa_scan_period << 2);
5954 p->numa_migrate_retry = jiffies + msecs_to_jiffies(interval);
5955
5956 interval = max(sysctl_numa_balancing_scan_delay,
5957 current->numa_scan_period << 2);
5958 current->numa_migrate_retry = jiffies + msecs_to_jiffies(interval);
5751} 5959}
5960#else
5961static void
5962update_wa_numa_placement(struct task_struct *p, int prev_cpu, int target)
5963{
5964}
5965#endif
5752 5966
5753static int wake_affine(struct sched_domain *sd, struct task_struct *p, 5967static int wake_affine(struct sched_domain *sd, struct task_struct *p,
5754 int prev_cpu, int sync) 5968 int this_cpu, int prev_cpu, int sync)
5755{ 5969{
5756 int this_cpu = smp_processor_id();
5757 int target = nr_cpumask_bits; 5970 int target = nr_cpumask_bits;
5758 5971
5759 if (sched_feat(WA_IDLE)) 5972 if (sched_feat(WA_IDLE))
@@ -5766,12 +5979,12 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p,
5766 if (target == nr_cpumask_bits) 5979 if (target == nr_cpumask_bits)
5767 return prev_cpu; 5980 return prev_cpu;
5768 5981
5982 update_wa_numa_placement(p, prev_cpu, target);
5769 schedstat_inc(sd->ttwu_move_affine); 5983 schedstat_inc(sd->ttwu_move_affine);
5770 schedstat_inc(p->se.statistics.nr_wakeups_affine); 5984 schedstat_inc(p->se.statistics.nr_wakeups_affine);
5771 return target; 5985 return target;
5772} 5986}
5773 5987
5774static inline unsigned long task_util(struct task_struct *p);
5775static unsigned long cpu_util_wake(int cpu, struct task_struct *p); 5988static unsigned long cpu_util_wake(int cpu, struct task_struct *p);
5776 5989
5777static unsigned long capacity_spare_wake(int cpu, struct task_struct *p) 5990static unsigned long capacity_spare_wake(int cpu, struct task_struct *p)
@@ -5826,7 +6039,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p,
5826 max_spare_cap = 0; 6039 max_spare_cap = 0;
5827 6040
5828 for_each_cpu(i, sched_group_span(group)) { 6041 for_each_cpu(i, sched_group_span(group)) {
5829 /* Bias balancing toward cpus of our domain */ 6042 /* Bias balancing toward CPUs of our domain */
5830 if (local_group) 6043 if (local_group)
5831 load = source_load(i, load_idx); 6044 load = source_load(i, load_idx);
5832 else 6045 else
@@ -5856,7 +6069,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p,
5856 if (min_runnable_load > (runnable_load + imbalance)) { 6069 if (min_runnable_load > (runnable_load + imbalance)) {
5857 /* 6070 /*
5858 * The runnable load is significantly smaller 6071 * The runnable load is significantly smaller
5859 * so we can pick this new cpu 6072 * so we can pick this new CPU:
5860 */ 6073 */
5861 min_runnable_load = runnable_load; 6074 min_runnable_load = runnable_load;
5862 min_avg_load = avg_load; 6075 min_avg_load = avg_load;
@@ -5865,7 +6078,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p,
5865 (100*min_avg_load > imbalance_scale*avg_load)) { 6078 (100*min_avg_load > imbalance_scale*avg_load)) {
5866 /* 6079 /*
5867 * The runnable loads are close so take the 6080 * The runnable loads are close so take the
5868 * blocked load into account through avg_load. 6081 * blocked load into account through avg_load:
5869 */ 6082 */
5870 min_avg_load = avg_load; 6083 min_avg_load = avg_load;
5871 idlest = group; 6084 idlest = group;
@@ -5903,6 +6116,18 @@ skip_spare:
5903 if (!idlest) 6116 if (!idlest)
5904 return NULL; 6117 return NULL;
5905 6118
6119 /*
6120 * When comparing groups across NUMA domains, it's possible for the
6121 * local domain to be very lightly loaded relative to the remote
6122 * domains but "imbalance" skews the comparison making remote CPUs
6123 * look much more favourable. When considering cross-domain, add
6124 * imbalance to the runnable load on the remote node and consider
6125 * staying local.
6126 */
6127 if ((sd->flags & SD_NUMA) &&
6128 min_runnable_load + imbalance >= this_runnable_load)
6129 return NULL;
6130
5906 if (min_runnable_load > (this_runnable_load + imbalance)) 6131 if (min_runnable_load > (this_runnable_load + imbalance))
5907 return NULL; 6132 return NULL;
5908 6133
@@ -5914,7 +6139,7 @@ skip_spare:
5914} 6139}
5915 6140
5916/* 6141/*
5917 * find_idlest_group_cpu - find the idlest cpu among the cpus in group. 6142 * find_idlest_group_cpu - find the idlest CPU among the CPUs in the group.
5918 */ 6143 */
5919static int 6144static int
5920find_idlest_group_cpu(struct sched_group *group, struct task_struct *p, int this_cpu) 6145find_idlest_group_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
@@ -5992,12 +6217,12 @@ static inline int find_idlest_cpu(struct sched_domain *sd, struct task_struct *p
5992 6217
5993 new_cpu = find_idlest_group_cpu(group, p, cpu); 6218 new_cpu = find_idlest_group_cpu(group, p, cpu);
5994 if (new_cpu == cpu) { 6219 if (new_cpu == cpu) {
5995 /* Now try balancing at a lower domain level of cpu */ 6220 /* Now try balancing at a lower domain level of 'cpu': */
5996 sd = sd->child; 6221 sd = sd->child;
5997 continue; 6222 continue;
5998 } 6223 }
5999 6224
6000 /* Now try balancing at a lower domain level of new_cpu */ 6225 /* Now try balancing at a lower domain level of 'new_cpu': */
6001 cpu = new_cpu; 6226 cpu = new_cpu;
6002 weight = sd->span_weight; 6227 weight = sd->span_weight;
6003 sd = NULL; 6228 sd = NULL;
@@ -6007,7 +6232,6 @@ static inline int find_idlest_cpu(struct sched_domain *sd, struct task_struct *p
6007 if (tmp->flags & sd_flag) 6232 if (tmp->flags & sd_flag)
6008 sd = tmp; 6233 sd = tmp;
6009 } 6234 }
6010 /* while loop will break here if sd == NULL */
6011 } 6235 }
6012 6236
6013 return new_cpu; 6237 return new_cpu;
@@ -6203,12 +6427,12 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
6203 return target; 6427 return target;
6204 6428
6205 /* 6429 /*
6206 * If the previous cpu is cache affine and idle, don't be stupid. 6430 * If the previous CPU is cache affine and idle, don't be stupid:
6207 */ 6431 */
6208 if (prev != target && cpus_share_cache(prev, target) && idle_cpu(prev)) 6432 if (prev != target && cpus_share_cache(prev, target) && idle_cpu(prev))
6209 return prev; 6433 return prev;
6210 6434
6211 /* Check a recently used CPU as a potential idle candidate */ 6435 /* Check a recently used CPU as a potential idle candidate: */
6212 recent_used_cpu = p->recent_used_cpu; 6436 recent_used_cpu = p->recent_used_cpu;
6213 if (recent_used_cpu != prev && 6437 if (recent_used_cpu != prev &&
6214 recent_used_cpu != target && 6438 recent_used_cpu != target &&
@@ -6217,7 +6441,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
6217 cpumask_test_cpu(p->recent_used_cpu, &p->cpus_allowed)) { 6441 cpumask_test_cpu(p->recent_used_cpu, &p->cpus_allowed)) {
6218 /* 6442 /*
6219 * Replace recent_used_cpu with prev as it is a potential 6443 * Replace recent_used_cpu with prev as it is a potential
6220 * candidate for the next wake. 6444 * candidate for the next wake:
6221 */ 6445 */
6222 p->recent_used_cpu = prev; 6446 p->recent_used_cpu = prev;
6223 return recent_used_cpu; 6447 return recent_used_cpu;
@@ -6242,11 +6466,13 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
6242 return target; 6466 return target;
6243} 6467}
6244 6468
6245/* 6469/**
6246 * cpu_util returns the amount of capacity of a CPU that is used by CFS 6470 * Amount of capacity of a CPU that is (estimated to be) used by CFS tasks
6247 * tasks. The unit of the return value must be the one of capacity so we can 6471 * @cpu: the CPU to get the utilization of
6248 * compare the utilization with the capacity of the CPU that is available for 6472 *
6249 * CFS task (ie cpu_capacity). 6473 * The unit of the return value must be the one of capacity so we can compare
6474 * the utilization with the capacity of the CPU that is available for CFS task
6475 * (ie cpu_capacity).
6250 * 6476 *
6251 * cfs_rq.avg.util_avg is the sum of running time of runnable tasks plus the 6477 * cfs_rq.avg.util_avg is the sum of running time of runnable tasks plus the
6252 * recent utilization of currently non-runnable tasks on a CPU. It represents 6478 * recent utilization of currently non-runnable tasks on a CPU. It represents
@@ -6257,6 +6483,14 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
6257 * current capacity (capacity_curr <= capacity_orig) of the CPU because it is 6483 * current capacity (capacity_curr <= capacity_orig) of the CPU because it is
6258 * the running time on this CPU scaled by capacity_curr. 6484 * the running time on this CPU scaled by capacity_curr.
6259 * 6485 *
6486 * The estimated utilization of a CPU is defined to be the maximum between its
6487 * cfs_rq.avg.util_avg and the sum of the estimated utilization of the tasks
6488 * currently RUNNABLE on that CPU.
6489 * This allows to properly represent the expected utilization of a CPU which
6490 * has just got a big task running since a long sleep period. At the same time
6491 * however it preserves the benefits of the "blocked utilization" in
6492 * describing the potential for other tasks waking up on the same CPU.
6493 *
6260 * Nevertheless, cfs_rq.avg.util_avg can be higher than capacity_curr or even 6494 * Nevertheless, cfs_rq.avg.util_avg can be higher than capacity_curr or even
6261 * higher than capacity_orig because of unfortunate rounding in 6495 * higher than capacity_orig because of unfortunate rounding in
6262 * cfs.avg.util_avg or just after migrating tasks and new task wakeups until 6496 * cfs.avg.util_avg or just after migrating tasks and new task wakeups until
@@ -6267,36 +6501,77 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
6267 * available capacity. We allow utilization to overshoot capacity_curr (but not 6501 * available capacity. We allow utilization to overshoot capacity_curr (but not
6268 * capacity_orig) as it useful for predicting the capacity required after task 6502 * capacity_orig) as it useful for predicting the capacity required after task
6269 * migrations (scheduler-driven DVFS). 6503 * migrations (scheduler-driven DVFS).
6504 *
6505 * Return: the (estimated) utilization for the specified CPU
6270 */ 6506 */
6271static unsigned long cpu_util(int cpu) 6507static inline unsigned long cpu_util(int cpu)
6272{ 6508{
6273 unsigned long util = cpu_rq(cpu)->cfs.avg.util_avg; 6509 struct cfs_rq *cfs_rq;
6274 unsigned long capacity = capacity_orig_of(cpu); 6510 unsigned int util;
6275 6511
6276 return (util >= capacity) ? capacity : util; 6512 cfs_rq = &cpu_rq(cpu)->cfs;
6277} 6513 util = READ_ONCE(cfs_rq->avg.util_avg);
6278 6514
6279static inline unsigned long task_util(struct task_struct *p) 6515 if (sched_feat(UTIL_EST))
6280{ 6516 util = max(util, READ_ONCE(cfs_rq->avg.util_est.enqueued));
6281 return p->se.avg.util_avg; 6517
6518 return min_t(unsigned long, util, capacity_orig_of(cpu));
6282} 6519}
6283 6520
6284/* 6521/*
6285 * cpu_util_wake: Compute cpu utilization with any contributions from 6522 * cpu_util_wake: Compute CPU utilization with any contributions from
6286 * the waking task p removed. 6523 * the waking task p removed.
6287 */ 6524 */
6288static unsigned long cpu_util_wake(int cpu, struct task_struct *p) 6525static unsigned long cpu_util_wake(int cpu, struct task_struct *p)
6289{ 6526{
6290 unsigned long util, capacity; 6527 struct cfs_rq *cfs_rq;
6528 unsigned int util;
6291 6529
6292 /* Task has no contribution or is new */ 6530 /* Task has no contribution or is new */
6293 if (cpu != task_cpu(p) || !p->se.avg.last_update_time) 6531 if (cpu != task_cpu(p) || !READ_ONCE(p->se.avg.last_update_time))
6294 return cpu_util(cpu); 6532 return cpu_util(cpu);
6295 6533
6296 capacity = capacity_orig_of(cpu); 6534 cfs_rq = &cpu_rq(cpu)->cfs;
6297 util = max_t(long, cpu_rq(cpu)->cfs.avg.util_avg - task_util(p), 0); 6535 util = READ_ONCE(cfs_rq->avg.util_avg);
6298 6536
6299 return (util >= capacity) ? capacity : util; 6537 /* Discount task's blocked util from CPU's util */
6538 util -= min_t(unsigned int, util, task_util(p));
6539
6540 /*
6541 * Covered cases:
6542 *
6543 * a) if *p is the only task sleeping on this CPU, then:
6544 * cpu_util (== task_util) > util_est (== 0)
6545 * and thus we return:
6546 * cpu_util_wake = (cpu_util - task_util) = 0
6547 *
6548 * b) if other tasks are SLEEPING on this CPU, which is now exiting
6549 * IDLE, then:
6550 * cpu_util >= task_util
6551 * cpu_util > util_est (== 0)
6552 * and thus we discount *p's blocked utilization to return:
6553 * cpu_util_wake = (cpu_util - task_util) >= 0
6554 *
6555 * c) if other tasks are RUNNABLE on that CPU and
6556 * util_est > cpu_util
6557 * then we use util_est since it returns a more restrictive
6558 * estimation of the spare capacity on that CPU, by just
6559 * considering the expected utilization of tasks already
6560 * runnable on that CPU.
6561 *
6562 * Cases a) and b) are covered by the above code, while case c) is
6563 * covered by the following code when estimated utilization is
6564 * enabled.
6565 */
6566 if (sched_feat(UTIL_EST))
6567 util = max(util, READ_ONCE(cfs_rq->avg.util_est.enqueued));
6568
6569 /*
6570 * Utilization (estimated) can exceed the CPU capacity, thus let's
6571 * clamp to the maximum CPU capacity to ensure consistency with
6572 * the cpu_util call.
6573 */
6574 return min_t(unsigned long, util, capacity_orig_of(cpu));
6300} 6575}
6301 6576
6302/* 6577/*
@@ -6328,10 +6603,10 @@ static int wake_cap(struct task_struct *p, int cpu, int prev_cpu)
6328 * that have the 'sd_flag' flag set. In practice, this is SD_BALANCE_WAKE, 6603 * that have the 'sd_flag' flag set. In practice, this is SD_BALANCE_WAKE,
6329 * SD_BALANCE_FORK, or SD_BALANCE_EXEC. 6604 * SD_BALANCE_FORK, or SD_BALANCE_EXEC.
6330 * 6605 *
6331 * Balances load by selecting the idlest cpu in the idlest group, or under 6606 * Balances load by selecting the idlest CPU in the idlest group, or under
6332 * certain conditions an idle sibling cpu if the domain has SD_WAKE_AFFINE set. 6607 * certain conditions an idle sibling CPU if the domain has SD_WAKE_AFFINE set.
6333 * 6608 *
6334 * Returns the target cpu number. 6609 * Returns the target CPU number.
6335 * 6610 *
6336 * preempt must be disabled. 6611 * preempt must be disabled.
6337 */ 6612 */
@@ -6342,7 +6617,7 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
6342 int cpu = smp_processor_id(); 6617 int cpu = smp_processor_id();
6343 int new_cpu = prev_cpu; 6618 int new_cpu = prev_cpu;
6344 int want_affine = 0; 6619 int want_affine = 0;
6345 int sync = wake_flags & WF_SYNC; 6620 int sync = (wake_flags & WF_SYNC) && !(current->flags & PF_EXITING);
6346 6621
6347 if (sd_flag & SD_BALANCE_WAKE) { 6622 if (sd_flag & SD_BALANCE_WAKE) {
6348 record_wakee(p); 6623 record_wakee(p);
@@ -6356,7 +6631,7 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
6356 break; 6631 break;
6357 6632
6358 /* 6633 /*
6359 * If both cpu and prev_cpu are part of this domain, 6634 * If both 'cpu' and 'prev_cpu' are part of this domain,
6360 * cpu is a valid SD_WAKE_AFFINE target. 6635 * cpu is a valid SD_WAKE_AFFINE target.
6361 */ 6636 */
6362 if (want_affine && (tmp->flags & SD_WAKE_AFFINE) && 6637 if (want_affine && (tmp->flags & SD_WAKE_AFFINE) &&
@@ -6376,7 +6651,7 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
6376 if (cpu == prev_cpu) 6651 if (cpu == prev_cpu)
6377 goto pick_cpu; 6652 goto pick_cpu;
6378 6653
6379 new_cpu = wake_affine(affine_sd, p, prev_cpu, sync); 6654 new_cpu = wake_affine(affine_sd, p, cpu, prev_cpu, sync);
6380 } 6655 }
6381 6656
6382 if (sd && !(sd_flag & SD_BALANCE_FORK)) { 6657 if (sd && !(sd_flag & SD_BALANCE_FORK)) {
@@ -6407,9 +6682,9 @@ pick_cpu:
6407static void detach_entity_cfs_rq(struct sched_entity *se); 6682static void detach_entity_cfs_rq(struct sched_entity *se);
6408 6683
6409/* 6684/*
6410 * Called immediately before a task is migrated to a new cpu; task_cpu(p) and 6685 * Called immediately before a task is migrated to a new CPU; task_cpu(p) and
6411 * cfs_rq_of(p) references at time of call are still valid and identify the 6686 * cfs_rq_of(p) references at time of call are still valid and identify the
6412 * previous cpu. The caller guarantees p->pi_lock or task_rq(p)->lock is held. 6687 * previous CPU. The caller guarantees p->pi_lock or task_rq(p)->lock is held.
6413 */ 6688 */
6414static void migrate_task_rq_fair(struct task_struct *p) 6689static void migrate_task_rq_fair(struct task_struct *p)
6415{ 6690{
@@ -6738,7 +7013,7 @@ simple:
6738 7013
6739 p = task_of(se); 7014 p = task_of(se);
6740 7015
6741done: __maybe_unused 7016done: __maybe_unused;
6742#ifdef CONFIG_SMP 7017#ifdef CONFIG_SMP
6743 /* 7018 /*
6744 * Move the next running task to the front of 7019 * Move the next running task to the front of
@@ -6843,17 +7118,17 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preemp
6843 * BASICS 7118 * BASICS
6844 * 7119 *
6845 * The purpose of load-balancing is to achieve the same basic fairness the 7120 * The purpose of load-balancing is to achieve the same basic fairness the
6846 * per-cpu scheduler provides, namely provide a proportional amount of compute 7121 * per-CPU scheduler provides, namely provide a proportional amount of compute
6847 * time to each task. This is expressed in the following equation: 7122 * time to each task. This is expressed in the following equation:
6848 * 7123 *
6849 * W_i,n/P_i == W_j,n/P_j for all i,j (1) 7124 * W_i,n/P_i == W_j,n/P_j for all i,j (1)
6850 * 7125 *
6851 * Where W_i,n is the n-th weight average for cpu i. The instantaneous weight 7126 * Where W_i,n is the n-th weight average for CPU i. The instantaneous weight
6852 * W_i,0 is defined as: 7127 * W_i,0 is defined as:
6853 * 7128 *
6854 * W_i,0 = \Sum_j w_i,j (2) 7129 * W_i,0 = \Sum_j w_i,j (2)
6855 * 7130 *
6856 * Where w_i,j is the weight of the j-th runnable task on cpu i. This weight 7131 * Where w_i,j is the weight of the j-th runnable task on CPU i. This weight
6857 * is derived from the nice value as per sched_prio_to_weight[]. 7132 * is derived from the nice value as per sched_prio_to_weight[].
6858 * 7133 *
6859 * The weight average is an exponential decay average of the instantaneous 7134 * The weight average is an exponential decay average of the instantaneous
@@ -6861,7 +7136,7 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preemp
6861 * 7136 *
6862 * W'_i,n = (2^n - 1) / 2^n * W_i,n + 1 / 2^n * W_i,0 (3) 7137 * W'_i,n = (2^n - 1) / 2^n * W_i,n + 1 / 2^n * W_i,0 (3)
6863 * 7138 *
6864 * C_i is the compute capacity of cpu i, typically it is the 7139 * C_i is the compute capacity of CPU i, typically it is the
6865 * fraction of 'recent' time available for SCHED_OTHER task execution. But it 7140 * fraction of 'recent' time available for SCHED_OTHER task execution. But it
6866 * can also include other factors [XXX]. 7141 * can also include other factors [XXX].
6867 * 7142 *
@@ -6882,11 +7157,11 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preemp
6882 * SCHED DOMAINS 7157 * SCHED DOMAINS
6883 * 7158 *
6884 * In order to solve the imbalance equation (4), and avoid the obvious O(n^2) 7159 * In order to solve the imbalance equation (4), and avoid the obvious O(n^2)
6885 * for all i,j solution, we create a tree of cpus that follows the hardware 7160 * for all i,j solution, we create a tree of CPUs that follows the hardware
6886 * topology where each level pairs two lower groups (or better). This results 7161 * topology where each level pairs two lower groups (or better). This results
6887 * in O(log n) layers. Furthermore we reduce the number of cpus going up the 7162 * in O(log n) layers. Furthermore we reduce the number of CPUs going up the
6888 * tree to only the first of the previous level and we decrease the frequency 7163 * tree to only the first of the previous level and we decrease the frequency
6889 * of load-balance at each level inv. proportional to the number of cpus in 7164 * of load-balance at each level inv. proportional to the number of CPUs in
6890 * the groups. 7165 * the groups.
6891 * 7166 *
6892 * This yields: 7167 * This yields:
@@ -6895,7 +7170,7 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preemp
6895 * \Sum { --- * --- * 2^i } = O(n) (5) 7170 * \Sum { --- * --- * 2^i } = O(n) (5)
6896 * i = 0 2^i 2^i 7171 * i = 0 2^i 2^i
6897 * `- size of each group 7172 * `- size of each group
6898 * | | `- number of cpus doing load-balance 7173 * | | `- number of CPUs doing load-balance
6899 * | `- freq 7174 * | `- freq
6900 * `- sum over all levels 7175 * `- sum over all levels
6901 * 7176 *
@@ -6903,7 +7178,7 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preemp
6903 * this makes (5) the runtime complexity of the balancer. 7178 * this makes (5) the runtime complexity of the balancer.
6904 * 7179 *
6905 * An important property here is that each CPU is still (indirectly) connected 7180 * An important property here is that each CPU is still (indirectly) connected
6906 * to every other cpu in at most O(log n) steps: 7181 * to every other CPU in at most O(log n) steps:
6907 * 7182 *
6908 * The adjacency matrix of the resulting graph is given by: 7183 * The adjacency matrix of the resulting graph is given by:
6909 * 7184 *
@@ -6915,7 +7190,7 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preemp
6915 * 7190 *
6916 * A^(log_2 n)_i,j != 0 for all i,j (7) 7191 * A^(log_2 n)_i,j != 0 for all i,j (7)
6917 * 7192 *
6918 * Showing there's indeed a path between every cpu in at most O(log n) steps. 7193 * Showing there's indeed a path between every CPU in at most O(log n) steps.
6919 * The task movement gives a factor of O(m), giving a convergence complexity 7194 * The task movement gives a factor of O(m), giving a convergence complexity
6920 * of: 7195 * of:
6921 * 7196 *
@@ -6925,7 +7200,7 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preemp
6925 * WORK CONSERVING 7200 * WORK CONSERVING
6926 * 7201 *
6927 * In order to avoid CPUs going idle while there's still work to do, new idle 7202 * In order to avoid CPUs going idle while there's still work to do, new idle
6928 * balancing is more aggressive and has the newly idle cpu iterate up the domain 7203 * balancing is more aggressive and has the newly idle CPU iterate up the domain
6929 * tree itself instead of relying on other CPUs to bring it work. 7204 * tree itself instead of relying on other CPUs to bring it work.
6930 * 7205 *
6931 * This adds some complexity to both (5) and (8) but it reduces the total idle 7206 * This adds some complexity to both (5) and (8) but it reduces the total idle
@@ -6946,7 +7221,7 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preemp
6946 * 7221 *
6947 * s_k,i = \Sum_j w_i,j,k and S_k = \Sum_i s_k,i (10) 7222 * s_k,i = \Sum_j w_i,j,k and S_k = \Sum_i s_k,i (10)
6948 * 7223 *
6949 * w_i,j,k is the weight of the j-th runnable task in the k-th cgroup on cpu i. 7224 * w_i,j,k is the weight of the j-th runnable task in the k-th cgroup on CPU i.
6950 * 7225 *
6951 * The big problem is S_k, its a global sum needed to compute a local (W_i) 7226 * The big problem is S_k, its a global sum needed to compute a local (W_i)
6952 * property. 7227 * property.
@@ -6963,6 +7238,8 @@ enum fbq_type { regular, remote, all };
6963#define LBF_NEED_BREAK 0x02 7238#define LBF_NEED_BREAK 0x02
6964#define LBF_DST_PINNED 0x04 7239#define LBF_DST_PINNED 0x04
6965#define LBF_SOME_PINNED 0x08 7240#define LBF_SOME_PINNED 0x08
7241#define LBF_NOHZ_STATS 0x10
7242#define LBF_NOHZ_AGAIN 0x20
6966 7243
6967struct lb_env { 7244struct lb_env {
6968 struct sched_domain *sd; 7245 struct sched_domain *sd;
@@ -7110,7 +7387,7 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
7110 env->flags |= LBF_SOME_PINNED; 7387 env->flags |= LBF_SOME_PINNED;
7111 7388
7112 /* 7389 /*
7113 * Remember if this task can be migrated to any other cpu in 7390 * Remember if this task can be migrated to any other CPU in
7114 * our sched_group. We may want to revisit it if we couldn't 7391 * our sched_group. We may want to revisit it if we couldn't
7115 * meet load balance goals by pulling other tasks on src_cpu. 7392 * meet load balance goals by pulling other tasks on src_cpu.
7116 * 7393 *
@@ -7120,7 +7397,7 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
7120 if (env->idle == CPU_NEWLY_IDLE || (env->flags & LBF_DST_PINNED)) 7397 if (env->idle == CPU_NEWLY_IDLE || (env->flags & LBF_DST_PINNED))
7121 return 0; 7398 return 0;
7122 7399
7123 /* Prevent to re-select dst_cpu via env's cpus */ 7400 /* Prevent to re-select dst_cpu via env's CPUs: */
7124 for_each_cpu_and(cpu, env->dst_grpmask, env->cpus) { 7401 for_each_cpu_and(cpu, env->dst_grpmask, env->cpus) {
7125 if (cpumask_test_cpu(cpu, &p->cpus_allowed)) { 7402 if (cpumask_test_cpu(cpu, &p->cpus_allowed)) {
7126 env->flags |= LBF_DST_PINNED; 7403 env->flags |= LBF_DST_PINNED;
@@ -7347,6 +7624,17 @@ static void attach_tasks(struct lb_env *env)
7347 rq_unlock(env->dst_rq, &rf); 7624 rq_unlock(env->dst_rq, &rf);
7348} 7625}
7349 7626
7627static inline bool cfs_rq_has_blocked(struct cfs_rq *cfs_rq)
7628{
7629 if (cfs_rq->avg.load_avg)
7630 return true;
7631
7632 if (cfs_rq->avg.util_avg)
7633 return true;
7634
7635 return false;
7636}
7637
7350#ifdef CONFIG_FAIR_GROUP_SCHED 7638#ifdef CONFIG_FAIR_GROUP_SCHED
7351 7639
7352static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq) 7640static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq)
@@ -7371,6 +7659,7 @@ static void update_blocked_averages(int cpu)
7371 struct rq *rq = cpu_rq(cpu); 7659 struct rq *rq = cpu_rq(cpu);
7372 struct cfs_rq *cfs_rq, *pos; 7660 struct cfs_rq *cfs_rq, *pos;
7373 struct rq_flags rf; 7661 struct rq_flags rf;
7662 bool done = true;
7374 7663
7375 rq_lock_irqsave(rq, &rf); 7664 rq_lock_irqsave(rq, &rf);
7376 update_rq_clock(rq); 7665 update_rq_clock(rq);
@@ -7400,7 +7689,17 @@ static void update_blocked_averages(int cpu)
7400 */ 7689 */
7401 if (cfs_rq_is_decayed(cfs_rq)) 7690 if (cfs_rq_is_decayed(cfs_rq))
7402 list_del_leaf_cfs_rq(cfs_rq); 7691 list_del_leaf_cfs_rq(cfs_rq);
7692
7693 /* Don't need periodic decay once load/util_avg are null */
7694 if (cfs_rq_has_blocked(cfs_rq))
7695 done = false;
7403 } 7696 }
7697
7698#ifdef CONFIG_NO_HZ_COMMON
7699 rq->last_blocked_load_update_tick = jiffies;
7700 if (done)
7701 rq->has_blocked_load = 0;
7702#endif
7404 rq_unlock_irqrestore(rq, &rf); 7703 rq_unlock_irqrestore(rq, &rf);
7405} 7704}
7406 7705
@@ -7460,6 +7759,11 @@ static inline void update_blocked_averages(int cpu)
7460 rq_lock_irqsave(rq, &rf); 7759 rq_lock_irqsave(rq, &rf);
7461 update_rq_clock(rq); 7760 update_rq_clock(rq);
7462 update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq); 7761 update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq);
7762#ifdef CONFIG_NO_HZ_COMMON
7763 rq->last_blocked_load_update_tick = jiffies;
7764 if (!cfs_rq_has_blocked(cfs_rq))
7765 rq->has_blocked_load = 0;
7766#endif
7463 rq_unlock_irqrestore(rq, &rf); 7767 rq_unlock_irqrestore(rq, &rf);
7464} 7768}
7465 7769
@@ -7694,8 +7998,8 @@ check_cpu_capacity(struct rq *rq, struct sched_domain *sd)
7694 * Group imbalance indicates (and tries to solve) the problem where balancing 7998 * Group imbalance indicates (and tries to solve) the problem where balancing
7695 * groups is inadequate due to ->cpus_allowed constraints. 7999 * groups is inadequate due to ->cpus_allowed constraints.
7696 * 8000 *
7697 * Imagine a situation of two groups of 4 cpus each and 4 tasks each with a 8001 * Imagine a situation of two groups of 4 CPUs each and 4 tasks each with a
7698 * cpumask covering 1 cpu of the first group and 3 cpus of the second group. 8002 * cpumask covering 1 CPU of the first group and 3 CPUs of the second group.
7699 * Something like: 8003 * Something like:
7700 * 8004 *
7701 * { 0 1 2 3 } { 4 5 6 7 } 8005 * { 0 1 2 3 } { 4 5 6 7 }
@@ -7703,7 +8007,7 @@ check_cpu_capacity(struct rq *rq, struct sched_domain *sd)
7703 * 8007 *
7704 * If we were to balance group-wise we'd place two tasks in the first group and 8008 * If we were to balance group-wise we'd place two tasks in the first group and
7705 * two tasks in the second group. Clearly this is undesired as it will overload 8009 * two tasks in the second group. Clearly this is undesired as it will overload
7706 * cpu 3 and leave one of the cpus in the second group unused. 8010 * cpu 3 and leave one of the CPUs in the second group unused.
7707 * 8011 *
7708 * The current solution to this issue is detecting the skew in the first group 8012 * The current solution to this issue is detecting the skew in the first group
7709 * by noticing the lower domain failed to reach balance and had difficulty 8013 * by noticing the lower domain failed to reach balance and had difficulty
@@ -7794,6 +8098,28 @@ group_type group_classify(struct sched_group *group,
7794 return group_other; 8098 return group_other;
7795} 8099}
7796 8100
8101static bool update_nohz_stats(struct rq *rq, bool force)
8102{
8103#ifdef CONFIG_NO_HZ_COMMON
8104 unsigned int cpu = rq->cpu;
8105
8106 if (!rq->has_blocked_load)
8107 return false;
8108
8109 if (!cpumask_test_cpu(cpu, nohz.idle_cpus_mask))
8110 return false;
8111
8112 if (!force && !time_after(jiffies, rq->last_blocked_load_update_tick))
8113 return true;
8114
8115 update_blocked_averages(cpu);
8116
8117 return rq->has_blocked_load;
8118#else
8119 return false;
8120#endif
8121}
8122
7797/** 8123/**
7798 * update_sg_lb_stats - Update sched_group's statistics for load balancing. 8124 * update_sg_lb_stats - Update sched_group's statistics for load balancing.
7799 * @env: The load balancing environment. 8125 * @env: The load balancing environment.
@@ -7816,7 +8142,10 @@ static inline void update_sg_lb_stats(struct lb_env *env,
7816 for_each_cpu_and(i, sched_group_span(group), env->cpus) { 8142 for_each_cpu_and(i, sched_group_span(group), env->cpus) {
7817 struct rq *rq = cpu_rq(i); 8143 struct rq *rq = cpu_rq(i);
7818 8144
7819 /* Bias balancing toward cpus of our domain */ 8145 if ((env->flags & LBF_NOHZ_STATS) && update_nohz_stats(rq, false))
8146 env->flags |= LBF_NOHZ_AGAIN;
8147
8148 /* Bias balancing toward CPUs of our domain: */
7820 if (local_group) 8149 if (local_group)
7821 load = target_load(i, load_idx); 8150 load = target_load(i, load_idx);
7822 else 8151 else
@@ -7902,7 +8231,7 @@ asym_packing:
7902 if (!(env->sd->flags & SD_ASYM_PACKING)) 8231 if (!(env->sd->flags & SD_ASYM_PACKING))
7903 return true; 8232 return true;
7904 8233
7905 /* No ASYM_PACKING if target cpu is already busy */ 8234 /* No ASYM_PACKING if target CPU is already busy */
7906 if (env->idle == CPU_NOT_IDLE) 8235 if (env->idle == CPU_NOT_IDLE)
7907 return true; 8236 return true;
7908 /* 8237 /*
@@ -7915,7 +8244,7 @@ asym_packing:
7915 if (!sds->busiest) 8244 if (!sds->busiest)
7916 return true; 8245 return true;
7917 8246
7918 /* Prefer to move from lowest priority cpu's work */ 8247 /* Prefer to move from lowest priority CPU's work */
7919 if (sched_asym_prefer(sds->busiest->asym_prefer_cpu, 8248 if (sched_asym_prefer(sds->busiest->asym_prefer_cpu,
7920 sg->asym_prefer_cpu)) 8249 sg->asym_prefer_cpu))
7921 return true; 8250 return true;
@@ -7971,6 +8300,11 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
7971 if (child && child->flags & SD_PREFER_SIBLING) 8300 if (child && child->flags & SD_PREFER_SIBLING)
7972 prefer_sibling = 1; 8301 prefer_sibling = 1;
7973 8302
8303#ifdef CONFIG_NO_HZ_COMMON
8304 if (env->idle == CPU_NEWLY_IDLE && READ_ONCE(nohz.has_blocked))
8305 env->flags |= LBF_NOHZ_STATS;
8306#endif
8307
7974 load_idx = get_sd_load_idx(env->sd, env->idle); 8308 load_idx = get_sd_load_idx(env->sd, env->idle);
7975 8309
7976 do { 8310 do {
@@ -8024,6 +8358,15 @@ next_group:
8024 sg = sg->next; 8358 sg = sg->next;
8025 } while (sg != env->sd->groups); 8359 } while (sg != env->sd->groups);
8026 8360
8361#ifdef CONFIG_NO_HZ_COMMON
8362 if ((env->flags & LBF_NOHZ_AGAIN) &&
8363 cpumask_subset(nohz.idle_cpus_mask, sched_domain_span(env->sd))) {
8364
8365 WRITE_ONCE(nohz.next_blocked,
8366 jiffies + msecs_to_jiffies(LOAD_AVG_PERIOD));
8367 }
8368#endif
8369
8027 if (env->sd->flags & SD_NUMA) 8370 if (env->sd->flags & SD_NUMA)
8028 env->fbq_type = fbq_classify_group(&sds->busiest_stat); 8371 env->fbq_type = fbq_classify_group(&sds->busiest_stat);
8029 8372
@@ -8168,7 +8511,7 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
8168 if (busiest->group_type == group_imbalanced) { 8511 if (busiest->group_type == group_imbalanced) {
8169 /* 8512 /*
8170 * In the group_imb case we cannot rely on group-wide averages 8513 * In the group_imb case we cannot rely on group-wide averages
8171 * to ensure cpu-load equilibrium, look at wider averages. XXX 8514 * to ensure CPU-load equilibrium, look at wider averages. XXX
8172 */ 8515 */
8173 busiest->load_per_task = 8516 busiest->load_per_task =
8174 min(busiest->load_per_task, sds->avg_load); 8517 min(busiest->load_per_task, sds->avg_load);
@@ -8187,7 +8530,7 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
8187 } 8530 }
8188 8531
8189 /* 8532 /*
8190 * If there aren't any idle cpus, avoid creating some. 8533 * If there aren't any idle CPUs, avoid creating some.
8191 */ 8534 */
8192 if (busiest->group_type == group_overloaded && 8535 if (busiest->group_type == group_overloaded &&
8193 local->group_type == group_overloaded) { 8536 local->group_type == group_overloaded) {
@@ -8201,9 +8544,9 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
8201 } 8544 }
8202 8545
8203 /* 8546 /*
8204 * We're trying to get all the cpus to the average_load, so we don't 8547 * We're trying to get all the CPUs to the average_load, so we don't
8205 * want to push ourselves above the average load, nor do we wish to 8548 * want to push ourselves above the average load, nor do we wish to
8206 * reduce the max loaded cpu below the average load. At the same time, 8549 * reduce the max loaded CPU below the average load. At the same time,
8207 * we also don't want to reduce the group load below the group 8550 * we also don't want to reduce the group load below the group
8208 * capacity. Thus we look for the minimum possible imbalance. 8551 * capacity. Thus we look for the minimum possible imbalance.
8209 */ 8552 */
@@ -8297,9 +8640,9 @@ static struct sched_group *find_busiest_group(struct lb_env *env)
8297 8640
8298 if (env->idle == CPU_IDLE) { 8641 if (env->idle == CPU_IDLE) {
8299 /* 8642 /*
8300 * This cpu is idle. If the busiest group is not overloaded 8643 * This CPU is idle. If the busiest group is not overloaded
8301 * and there is no imbalance between this and busiest group 8644 * and there is no imbalance between this and busiest group
8302 * wrt idle cpus, it is balanced. The imbalance becomes 8645 * wrt idle CPUs, it is balanced. The imbalance becomes
8303 * significant if the diff is greater than 1 otherwise we 8646 * significant if the diff is greater than 1 otherwise we
8304 * might end up to just move the imbalance on another group 8647 * might end up to just move the imbalance on another group
8305 */ 8648 */
@@ -8327,7 +8670,7 @@ out_balanced:
8327} 8670}
8328 8671
8329/* 8672/*
8330 * find_busiest_queue - find the busiest runqueue among the cpus in group. 8673 * find_busiest_queue - find the busiest runqueue among the CPUs in the group.
8331 */ 8674 */
8332static struct rq *find_busiest_queue(struct lb_env *env, 8675static struct rq *find_busiest_queue(struct lb_env *env,
8333 struct sched_group *group) 8676 struct sched_group *group)
@@ -8371,7 +8714,7 @@ static struct rq *find_busiest_queue(struct lb_env *env,
8371 8714
8372 /* 8715 /*
8373 * When comparing with imbalance, use weighted_cpuload() 8716 * When comparing with imbalance, use weighted_cpuload()
8374 * which is not scaled with the cpu capacity. 8717 * which is not scaled with the CPU capacity.
8375 */ 8718 */
8376 8719
8377 if (rq->nr_running == 1 && wl > env->imbalance && 8720 if (rq->nr_running == 1 && wl > env->imbalance &&
@@ -8379,9 +8722,9 @@ static struct rq *find_busiest_queue(struct lb_env *env,
8379 continue; 8722 continue;
8380 8723
8381 /* 8724 /*
8382 * For the load comparisons with the other cpu's, consider 8725 * For the load comparisons with the other CPU's, consider
8383 * the weighted_cpuload() scaled with the cpu capacity, so 8726 * the weighted_cpuload() scaled with the CPU capacity, so
8384 * that the load can be moved away from the cpu that is 8727 * that the load can be moved away from the CPU that is
8385 * potentially running at a lower capacity. 8728 * potentially running at a lower capacity.
8386 * 8729 *
8387 * Thus we're looking for max(wl_i / capacity_i), crosswise 8730 * Thus we're looking for max(wl_i / capacity_i), crosswise
@@ -8452,13 +8795,13 @@ static int should_we_balance(struct lb_env *env)
8452 return 0; 8795 return 0;
8453 8796
8454 /* 8797 /*
8455 * In the newly idle case, we will allow all the cpu's 8798 * In the newly idle case, we will allow all the CPUs
8456 * to do the newly idle load balance. 8799 * to do the newly idle load balance.
8457 */ 8800 */
8458 if (env->idle == CPU_NEWLY_IDLE) 8801 if (env->idle == CPU_NEWLY_IDLE)
8459 return 1; 8802 return 1;
8460 8803
8461 /* Try to find first idle cpu */ 8804 /* Try to find first idle CPU */
8462 for_each_cpu_and(cpu, group_balance_mask(sg), env->cpus) { 8805 for_each_cpu_and(cpu, group_balance_mask(sg), env->cpus) {
8463 if (!idle_cpu(cpu)) 8806 if (!idle_cpu(cpu))
8464 continue; 8807 continue;
@@ -8471,7 +8814,7 @@ static int should_we_balance(struct lb_env *env)
8471 balance_cpu = group_balance_cpu(sg); 8814 balance_cpu = group_balance_cpu(sg);
8472 8815
8473 /* 8816 /*
8474 * First idle cpu or the first cpu(busiest) in this sched group 8817 * First idle CPU or the first CPU(busiest) in this sched group
8475 * is eligible for doing load balancing at this and above domains. 8818 * is eligible for doing load balancing at this and above domains.
8476 */ 8819 */
8477 return balance_cpu == env->dst_cpu; 8820 return balance_cpu == env->dst_cpu;
@@ -8580,7 +8923,7 @@ more_balance:
8580 * Revisit (affine) tasks on src_cpu that couldn't be moved to 8923 * Revisit (affine) tasks on src_cpu that couldn't be moved to
8581 * us and move them to an alternate dst_cpu in our sched_group 8924 * us and move them to an alternate dst_cpu in our sched_group
8582 * where they can run. The upper limit on how many times we 8925 * where they can run. The upper limit on how many times we
8583 * iterate on same src_cpu is dependent on number of cpus in our 8926 * iterate on same src_cpu is dependent on number of CPUs in our
8584 * sched_group. 8927 * sched_group.
8585 * 8928 *
8586 * This changes load balance semantics a bit on who can move 8929 * This changes load balance semantics a bit on who can move
@@ -8597,7 +8940,7 @@ more_balance:
8597 */ 8940 */
8598 if ((env.flags & LBF_DST_PINNED) && env.imbalance > 0) { 8941 if ((env.flags & LBF_DST_PINNED) && env.imbalance > 0) {
8599 8942
8600 /* Prevent to re-select dst_cpu via env's cpus */ 8943 /* Prevent to re-select dst_cpu via env's CPUs */
8601 cpumask_clear_cpu(env.dst_cpu, env.cpus); 8944 cpumask_clear_cpu(env.dst_cpu, env.cpus);
8602 8945
8603 env.dst_rq = cpu_rq(env.new_dst_cpu); 8946 env.dst_rq = cpu_rq(env.new_dst_cpu);
@@ -8659,9 +9002,10 @@ more_balance:
8659 9002
8660 raw_spin_lock_irqsave(&busiest->lock, flags); 9003 raw_spin_lock_irqsave(&busiest->lock, flags);
8661 9004
8662 /* don't kick the active_load_balance_cpu_stop, 9005 /*
8663 * if the curr task on busiest cpu can't be 9006 * Don't kick the active_load_balance_cpu_stop,
8664 * moved to this_cpu 9007 * if the curr task on busiest CPU can't be
9008 * moved to this_cpu:
8665 */ 9009 */
8666 if (!cpumask_test_cpu(this_cpu, &busiest->curr->cpus_allowed)) { 9010 if (!cpumask_test_cpu(this_cpu, &busiest->curr->cpus_allowed)) {
8667 raw_spin_unlock_irqrestore(&busiest->lock, 9011 raw_spin_unlock_irqrestore(&busiest->lock,
@@ -8773,121 +9117,7 @@ update_next_balance(struct sched_domain *sd, unsigned long *next_balance)
8773} 9117}
8774 9118
8775/* 9119/*
8776 * idle_balance is called by schedule() if this_cpu is about to become 9120 * active_load_balance_cpu_stop is run by the CPU stopper. It pushes
8777 * idle. Attempts to pull tasks from other CPUs.
8778 */
8779static int idle_balance(struct rq *this_rq, struct rq_flags *rf)
8780{
8781 unsigned long next_balance = jiffies + HZ;
8782 int this_cpu = this_rq->cpu;
8783 struct sched_domain *sd;
8784 int pulled_task = 0;
8785 u64 curr_cost = 0;
8786
8787 /*
8788 * We must set idle_stamp _before_ calling idle_balance(), such that we
8789 * measure the duration of idle_balance() as idle time.
8790 */
8791 this_rq->idle_stamp = rq_clock(this_rq);
8792
8793 /*
8794 * Do not pull tasks towards !active CPUs...
8795 */
8796 if (!cpu_active(this_cpu))
8797 return 0;
8798
8799 /*
8800 * This is OK, because current is on_cpu, which avoids it being picked
8801 * for load-balance and preemption/IRQs are still disabled avoiding
8802 * further scheduler activity on it and we're being very careful to
8803 * re-start the picking loop.
8804 */
8805 rq_unpin_lock(this_rq, rf);
8806
8807 if (this_rq->avg_idle < sysctl_sched_migration_cost ||
8808 !this_rq->rd->overload) {
8809 rcu_read_lock();
8810 sd = rcu_dereference_check_sched_domain(this_rq->sd);
8811 if (sd)
8812 update_next_balance(sd, &next_balance);
8813 rcu_read_unlock();
8814
8815 goto out;
8816 }
8817
8818 raw_spin_unlock(&this_rq->lock);
8819
8820 update_blocked_averages(this_cpu);
8821 rcu_read_lock();
8822 for_each_domain(this_cpu, sd) {
8823 int continue_balancing = 1;
8824 u64 t0, domain_cost;
8825
8826 if (!(sd->flags & SD_LOAD_BALANCE))
8827 continue;
8828
8829 if (this_rq->avg_idle < curr_cost + sd->max_newidle_lb_cost) {
8830 update_next_balance(sd, &next_balance);
8831 break;
8832 }
8833
8834 if (sd->flags & SD_BALANCE_NEWIDLE) {
8835 t0 = sched_clock_cpu(this_cpu);
8836
8837 pulled_task = load_balance(this_cpu, this_rq,
8838 sd, CPU_NEWLY_IDLE,
8839 &continue_balancing);
8840
8841 domain_cost = sched_clock_cpu(this_cpu) - t0;
8842 if (domain_cost > sd->max_newidle_lb_cost)
8843 sd->max_newidle_lb_cost = domain_cost;
8844
8845 curr_cost += domain_cost;
8846 }
8847
8848 update_next_balance(sd, &next_balance);
8849
8850 /*
8851 * Stop searching for tasks to pull if there are
8852 * now runnable tasks on this rq.
8853 */
8854 if (pulled_task || this_rq->nr_running > 0)
8855 break;
8856 }
8857 rcu_read_unlock();
8858
8859 raw_spin_lock(&this_rq->lock);
8860
8861 if (curr_cost > this_rq->max_idle_balance_cost)
8862 this_rq->max_idle_balance_cost = curr_cost;
8863
8864 /*
8865 * While browsing the domains, we released the rq lock, a task could
8866 * have been enqueued in the meantime. Since we're not going idle,
8867 * pretend we pulled a task.
8868 */
8869 if (this_rq->cfs.h_nr_running && !pulled_task)
8870 pulled_task = 1;
8871
8872out:
8873 /* Move the next balance forward */
8874 if (time_after(this_rq->next_balance, next_balance))
8875 this_rq->next_balance = next_balance;
8876
8877 /* Is there a task of a high priority class? */
8878 if (this_rq->nr_running != this_rq->cfs.h_nr_running)
8879 pulled_task = -1;
8880
8881 if (pulled_task)
8882 this_rq->idle_stamp = 0;
8883
8884 rq_repin_lock(this_rq, rf);
8885
8886 return pulled_task;
8887}
8888
8889/*
8890 * active_load_balance_cpu_stop is run by cpu stopper. It pushes
8891 * running tasks off the busiest CPU onto idle CPUs. It requires at 9121 * running tasks off the busiest CPU onto idle CPUs. It requires at
8892 * least 1 task to be running on each physical CPU where possible, and 9122 * least 1 task to be running on each physical CPU where possible, and
8893 * avoids physical / logical imbalances. 9123 * avoids physical / logical imbalances.
@@ -8911,7 +9141,7 @@ static int active_load_balance_cpu_stop(void *data)
8911 if (!cpu_active(busiest_cpu) || !cpu_active(target_cpu)) 9141 if (!cpu_active(busiest_cpu) || !cpu_active(target_cpu))
8912 goto out_unlock; 9142 goto out_unlock;
8913 9143
8914 /* make sure the requested cpu hasn't gone down in the meantime */ 9144 /* Make sure the requested CPU hasn't gone down in the meantime: */
8915 if (unlikely(busiest_cpu != smp_processor_id() || 9145 if (unlikely(busiest_cpu != smp_processor_id() ||
8916 !busiest_rq->active_balance)) 9146 !busiest_rq->active_balance))
8917 goto out_unlock; 9147 goto out_unlock;
@@ -8923,7 +9153,7 @@ static int active_load_balance_cpu_stop(void *data)
8923 /* 9153 /*
8924 * This condition is "impossible", if it occurs 9154 * This condition is "impossible", if it occurs
8925 * we need to fix it. Originally reported by 9155 * we need to fix it. Originally reported by
8926 * Bjorn Helgaas on a 128-cpu setup. 9156 * Bjorn Helgaas on a 128-CPU setup.
8927 */ 9157 */
8928 BUG_ON(busiest_rq == target_rq); 9158 BUG_ON(busiest_rq == target_rq);
8929 9159
@@ -8977,141 +9207,6 @@ out_unlock:
8977 return 0; 9207 return 0;
8978} 9208}
8979 9209
8980static inline int on_null_domain(struct rq *rq)
8981{
8982 return unlikely(!rcu_dereference_sched(rq->sd));
8983}
8984
8985#ifdef CONFIG_NO_HZ_COMMON
8986/*
8987 * idle load balancing details
8988 * - When one of the busy CPUs notice that there may be an idle rebalancing
8989 * needed, they will kick the idle load balancer, which then does idle
8990 * load balancing for all the idle CPUs.
8991 */
8992static struct {
8993 cpumask_var_t idle_cpus_mask;
8994 atomic_t nr_cpus;
8995 unsigned long next_balance; /* in jiffy units */
8996} nohz ____cacheline_aligned;
8997
8998static inline int find_new_ilb(void)
8999{
9000 int ilb = cpumask_first(nohz.idle_cpus_mask);
9001
9002 if (ilb < nr_cpu_ids && idle_cpu(ilb))
9003 return ilb;
9004
9005 return nr_cpu_ids;
9006}
9007
9008/*
9009 * Kick a CPU to do the nohz balancing, if it is time for it. We pick the
9010 * nohz_load_balancer CPU (if there is one) otherwise fallback to any idle
9011 * CPU (if there is one).
9012 */
9013static void nohz_balancer_kick(void)
9014{
9015 int ilb_cpu;
9016
9017 nohz.next_balance++;
9018
9019 ilb_cpu = find_new_ilb();
9020
9021 if (ilb_cpu >= nr_cpu_ids)
9022 return;
9023
9024 if (test_and_set_bit(NOHZ_BALANCE_KICK, nohz_flags(ilb_cpu)))
9025 return;
9026 /*
9027 * Use smp_send_reschedule() instead of resched_cpu().
9028 * This way we generate a sched IPI on the target cpu which
9029 * is idle. And the softirq performing nohz idle load balance
9030 * will be run before returning from the IPI.
9031 */
9032 smp_send_reschedule(ilb_cpu);
9033 return;
9034}
9035
9036void nohz_balance_exit_idle(unsigned int cpu)
9037{
9038 if (unlikely(test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))) {
9039 /*
9040 * Completely isolated CPUs don't ever set, so we must test.
9041 */
9042 if (likely(cpumask_test_cpu(cpu, nohz.idle_cpus_mask))) {
9043 cpumask_clear_cpu(cpu, nohz.idle_cpus_mask);
9044 atomic_dec(&nohz.nr_cpus);
9045 }
9046 clear_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu));
9047 }
9048}
9049
9050static inline void set_cpu_sd_state_busy(void)
9051{
9052 struct sched_domain *sd;
9053 int cpu = smp_processor_id();
9054
9055 rcu_read_lock();
9056 sd = rcu_dereference(per_cpu(sd_llc, cpu));
9057
9058 if (!sd || !sd->nohz_idle)
9059 goto unlock;
9060 sd->nohz_idle = 0;
9061
9062 atomic_inc(&sd->shared->nr_busy_cpus);
9063unlock:
9064 rcu_read_unlock();
9065}
9066
9067void set_cpu_sd_state_idle(void)
9068{
9069 struct sched_domain *sd;
9070 int cpu = smp_processor_id();
9071
9072 rcu_read_lock();
9073 sd = rcu_dereference(per_cpu(sd_llc, cpu));
9074
9075 if (!sd || sd->nohz_idle)
9076 goto unlock;
9077 sd->nohz_idle = 1;
9078
9079 atomic_dec(&sd->shared->nr_busy_cpus);
9080unlock:
9081 rcu_read_unlock();
9082}
9083
9084/*
9085 * This routine will record that the cpu is going idle with tick stopped.
9086 * This info will be used in performing idle load balancing in the future.
9087 */
9088void nohz_balance_enter_idle(int cpu)
9089{
9090 /*
9091 * If this cpu is going down, then nothing needs to be done.
9092 */
9093 if (!cpu_active(cpu))
9094 return;
9095
9096 /* Spare idle load balancing on CPUs that don't want to be disturbed: */
9097 if (!housekeeping_cpu(cpu, HK_FLAG_SCHED))
9098 return;
9099
9100 if (test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))
9101 return;
9102
9103 /*
9104 * If we're a completely isolated CPU, we don't play.
9105 */
9106 if (on_null_domain(cpu_rq(cpu)))
9107 return;
9108
9109 cpumask_set_cpu(cpu, nohz.idle_cpus_mask);
9110 atomic_inc(&nohz.nr_cpus);
9111 set_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu));
9112}
9113#endif
9114
9115static DEFINE_SPINLOCK(balancing); 9210static DEFINE_SPINLOCK(balancing);
9116 9211
9117/* 9212/*
@@ -9141,8 +9236,6 @@ static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle)
9141 int need_serialize, need_decay = 0; 9236 int need_serialize, need_decay = 0;
9142 u64 max_cost = 0; 9237 u64 max_cost = 0;
9143 9238
9144 update_blocked_averages(cpu);
9145
9146 rcu_read_lock(); 9239 rcu_read_lock();
9147 for_each_domain(cpu, sd) { 9240 for_each_domain(cpu, sd) {
9148 /* 9241 /*
@@ -9232,68 +9325,56 @@ out:
9232 } 9325 }
9233} 9326}
9234 9327
9328static inline int on_null_domain(struct rq *rq)
9329{
9330 return unlikely(!rcu_dereference_sched(rq->sd));
9331}
9332
9235#ifdef CONFIG_NO_HZ_COMMON 9333#ifdef CONFIG_NO_HZ_COMMON
9236/* 9334/*
9237 * In CONFIG_NO_HZ_COMMON case, the idle balance kickee will do the 9335 * idle load balancing details
9238 * rebalancing for all the cpus for whom scheduler ticks are stopped. 9336 * - When one of the busy CPUs notice that there may be an idle rebalancing
9337 * needed, they will kick the idle load balancer, which then does idle
9338 * load balancing for all the idle CPUs.
9239 */ 9339 */
9240static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
9241{
9242 int this_cpu = this_rq->cpu;
9243 struct rq *rq;
9244 int balance_cpu;
9245 /* Earliest time when we have to do rebalance again */
9246 unsigned long next_balance = jiffies + 60*HZ;
9247 int update_next_balance = 0;
9248 9340
9249 if (idle != CPU_IDLE || 9341static inline int find_new_ilb(void)
9250 !test_bit(NOHZ_BALANCE_KICK, nohz_flags(this_cpu))) 9342{
9251 goto end; 9343 int ilb = cpumask_first(nohz.idle_cpus_mask);
9252 9344
9253 for_each_cpu(balance_cpu, nohz.idle_cpus_mask) { 9345 if (ilb < nr_cpu_ids && idle_cpu(ilb))
9254 if (balance_cpu == this_cpu || !idle_cpu(balance_cpu)) 9346 return ilb;
9255 continue;
9256 9347
9257 /* 9348 return nr_cpu_ids;
9258 * If this cpu gets work to do, stop the load balancing 9349}
9259 * work being done for other cpus. Next load
9260 * balancing owner will pick it up.
9261 */
9262 if (need_resched())
9263 break;
9264 9350
9265 rq = cpu_rq(balance_cpu); 9351/*
9352 * Kick a CPU to do the nohz balancing, if it is time for it. We pick the
9353 * nohz_load_balancer CPU (if there is one) otherwise fallback to any idle
9354 * CPU (if there is one).
9355 */
9356static void kick_ilb(unsigned int flags)
9357{
9358 int ilb_cpu;
9266 9359
9267 /* 9360 nohz.next_balance++;
9268 * If time for next balance is due,
9269 * do the balance.
9270 */
9271 if (time_after_eq(jiffies, rq->next_balance)) {
9272 struct rq_flags rf;
9273 9361
9274 rq_lock_irq(rq, &rf); 9362 ilb_cpu = find_new_ilb();
9275 update_rq_clock(rq);
9276 cpu_load_update_idle(rq);
9277 rq_unlock_irq(rq, &rf);
9278 9363
9279 rebalance_domains(rq, CPU_IDLE); 9364 if (ilb_cpu >= nr_cpu_ids)
9280 } 9365 return;
9281 9366
9282 if (time_after(next_balance, rq->next_balance)) { 9367 flags = atomic_fetch_or(flags, nohz_flags(ilb_cpu));
9283 next_balance = rq->next_balance; 9368 if (flags & NOHZ_KICK_MASK)
9284 update_next_balance = 1; 9369 return;
9285 }
9286 }
9287 9370
9288 /* 9371 /*
9289 * next_balance will be updated only when there is a need. 9372 * Use smp_send_reschedule() instead of resched_cpu().
9290 * When the CPU is attached to null domain for ex, it will not be 9373 * This way we generate a sched IPI on the target CPU which
9291 * updated. 9374 * is idle. And the softirq performing nohz idle load balance
9375 * will be run before returning from the IPI.
9292 */ 9376 */
9293 if (likely(update_next_balance)) 9377 smp_send_reschedule(ilb_cpu);
9294 nohz.next_balance = next_balance;
9295end:
9296 clear_bit(NOHZ_BALANCE_KICK, nohz_flags(this_cpu));
9297} 9378}
9298 9379
9299/* 9380/*
@@ -9307,36 +9388,41 @@ end:
9307 * - For SD_ASYM_PACKING, if the lower numbered cpu's in the scheduler 9388 * - For SD_ASYM_PACKING, if the lower numbered cpu's in the scheduler
9308 * domain span are idle. 9389 * domain span are idle.
9309 */ 9390 */
9310static inline bool nohz_kick_needed(struct rq *rq) 9391static void nohz_balancer_kick(struct rq *rq)
9311{ 9392{
9312 unsigned long now = jiffies; 9393 unsigned long now = jiffies;
9313 struct sched_domain_shared *sds; 9394 struct sched_domain_shared *sds;
9314 struct sched_domain *sd; 9395 struct sched_domain *sd;
9315 int nr_busy, i, cpu = rq->cpu; 9396 int nr_busy, i, cpu = rq->cpu;
9316 bool kick = false; 9397 unsigned int flags = 0;
9317 9398
9318 if (unlikely(rq->idle_balance)) 9399 if (unlikely(rq->idle_balance))
9319 return false; 9400 return;
9320 9401
9321 /* 9402 /*
9322 * We may be recently in ticked or tickless idle mode. At the first 9403 * We may be recently in ticked or tickless idle mode. At the first
9323 * busy tick after returning from idle, we will update the busy stats. 9404 * busy tick after returning from idle, we will update the busy stats.
9324 */ 9405 */
9325 set_cpu_sd_state_busy(); 9406 nohz_balance_exit_idle(rq);
9326 nohz_balance_exit_idle(cpu);
9327 9407
9328 /* 9408 /*
9329 * None are in tickless mode and hence no need for NOHZ idle load 9409 * None are in tickless mode and hence no need for NOHZ idle load
9330 * balancing. 9410 * balancing.
9331 */ 9411 */
9332 if (likely(!atomic_read(&nohz.nr_cpus))) 9412 if (likely(!atomic_read(&nohz.nr_cpus)))
9333 return false; 9413 return;
9414
9415 if (READ_ONCE(nohz.has_blocked) &&
9416 time_after(now, READ_ONCE(nohz.next_blocked)))
9417 flags = NOHZ_STATS_KICK;
9334 9418
9335 if (time_before(now, nohz.next_balance)) 9419 if (time_before(now, nohz.next_balance))
9336 return false; 9420 goto out;
9337 9421
9338 if (rq->nr_running >= 2) 9422 if (rq->nr_running >= 2) {
9339 return true; 9423 flags = NOHZ_KICK_MASK;
9424 goto out;
9425 }
9340 9426
9341 rcu_read_lock(); 9427 rcu_read_lock();
9342 sds = rcu_dereference(per_cpu(sd_llc_shared, cpu)); 9428 sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));
@@ -9347,7 +9433,7 @@ static inline bool nohz_kick_needed(struct rq *rq)
9347 */ 9433 */
9348 nr_busy = atomic_read(&sds->nr_busy_cpus); 9434 nr_busy = atomic_read(&sds->nr_busy_cpus);
9349 if (nr_busy > 1) { 9435 if (nr_busy > 1) {
9350 kick = true; 9436 flags = NOHZ_KICK_MASK;
9351 goto unlock; 9437 goto unlock;
9352 } 9438 }
9353 9439
@@ -9357,7 +9443,7 @@ static inline bool nohz_kick_needed(struct rq *rq)
9357 if (sd) { 9443 if (sd) {
9358 if ((rq->cfs.h_nr_running >= 1) && 9444 if ((rq->cfs.h_nr_running >= 1) &&
9359 check_cpu_capacity(rq, sd)) { 9445 check_cpu_capacity(rq, sd)) {
9360 kick = true; 9446 flags = NOHZ_KICK_MASK;
9361 goto unlock; 9447 goto unlock;
9362 } 9448 }
9363 } 9449 }
@@ -9370,18 +9456,421 @@ static inline bool nohz_kick_needed(struct rq *rq)
9370 continue; 9456 continue;
9371 9457
9372 if (sched_asym_prefer(i, cpu)) { 9458 if (sched_asym_prefer(i, cpu)) {
9373 kick = true; 9459 flags = NOHZ_KICK_MASK;
9374 goto unlock; 9460 goto unlock;
9375 } 9461 }
9376 } 9462 }
9377 } 9463 }
9378unlock: 9464unlock:
9379 rcu_read_unlock(); 9465 rcu_read_unlock();
9380 return kick; 9466out:
9467 if (flags)
9468 kick_ilb(flags);
9469}
9470
9471static void set_cpu_sd_state_busy(int cpu)
9472{
9473 struct sched_domain *sd;
9474
9475 rcu_read_lock();
9476 sd = rcu_dereference(per_cpu(sd_llc, cpu));
9477
9478 if (!sd || !sd->nohz_idle)
9479 goto unlock;
9480 sd->nohz_idle = 0;
9481
9482 atomic_inc(&sd->shared->nr_busy_cpus);
9483unlock:
9484 rcu_read_unlock();
9485}
9486
9487void nohz_balance_exit_idle(struct rq *rq)
9488{
9489 SCHED_WARN_ON(rq != this_rq());
9490
9491 if (likely(!rq->nohz_tick_stopped))
9492 return;
9493
9494 rq->nohz_tick_stopped = 0;
9495 cpumask_clear_cpu(rq->cpu, nohz.idle_cpus_mask);
9496 atomic_dec(&nohz.nr_cpus);
9497
9498 set_cpu_sd_state_busy(rq->cpu);
9499}
9500
9501static void set_cpu_sd_state_idle(int cpu)
9502{
9503 struct sched_domain *sd;
9504
9505 rcu_read_lock();
9506 sd = rcu_dereference(per_cpu(sd_llc, cpu));
9507
9508 if (!sd || sd->nohz_idle)
9509 goto unlock;
9510 sd->nohz_idle = 1;
9511
9512 atomic_dec(&sd->shared->nr_busy_cpus);
9513unlock:
9514 rcu_read_unlock();
9515}
9516
9517/*
9518 * This routine will record that the CPU is going idle with tick stopped.
9519 * This info will be used in performing idle load balancing in the future.
9520 */
9521void nohz_balance_enter_idle(int cpu)
9522{
9523 struct rq *rq = cpu_rq(cpu);
9524
9525 SCHED_WARN_ON(cpu != smp_processor_id());
9526
9527 /* If this CPU is going down, then nothing needs to be done: */
9528 if (!cpu_active(cpu))
9529 return;
9530
9531 /* Spare idle load balancing on CPUs that don't want to be disturbed: */
9532 if (!housekeeping_cpu(cpu, HK_FLAG_SCHED))
9533 return;
9534
9535 /*
9536 * Can be set safely without rq->lock held
9537 * If a clear happens, it will have evaluated last additions because
9538 * rq->lock is held during the check and the clear
9539 */
9540 rq->has_blocked_load = 1;
9541
9542 /*
9543 * The tick is still stopped but load could have been added in the
9544 * meantime. We set the nohz.has_blocked flag to trig a check of the
9545 * *_avg. The CPU is already part of nohz.idle_cpus_mask so the clear
9546 * of nohz.has_blocked can only happen after checking the new load
9547 */
9548 if (rq->nohz_tick_stopped)
9549 goto out;
9550
9551 /* If we're a completely isolated CPU, we don't play: */
9552 if (on_null_domain(rq))
9553 return;
9554
9555 rq->nohz_tick_stopped = 1;
9556
9557 cpumask_set_cpu(cpu, nohz.idle_cpus_mask);
9558 atomic_inc(&nohz.nr_cpus);
9559
9560 /*
9561 * Ensures that if nohz_idle_balance() fails to observe our
9562 * @idle_cpus_mask store, it must observe the @has_blocked
9563 * store.
9564 */
9565 smp_mb__after_atomic();
9566
9567 set_cpu_sd_state_idle(cpu);
9568
9569out:
9570 /*
9571 * Each time a cpu enter idle, we assume that it has blocked load and
9572 * enable the periodic update of the load of idle cpus
9573 */
9574 WRITE_ONCE(nohz.has_blocked, 1);
9575}
9576
9577/*
9578 * Internal function that runs load balance for all idle cpus. The load balance
9579 * can be a simple update of blocked load or a complete load balance with
9580 * tasks movement depending of flags.
9581 * The function returns false if the loop has stopped before running
9582 * through all idle CPUs.
9583 */
9584static bool _nohz_idle_balance(struct rq *this_rq, unsigned int flags,
9585 enum cpu_idle_type idle)
9586{
9587 /* Earliest time when we have to do rebalance again */
9588 unsigned long now = jiffies;
9589 unsigned long next_balance = now + 60*HZ;
9590 bool has_blocked_load = false;
9591 int update_next_balance = 0;
9592 int this_cpu = this_rq->cpu;
9593 int balance_cpu;
9594 int ret = false;
9595 struct rq *rq;
9596
9597 SCHED_WARN_ON((flags & NOHZ_KICK_MASK) == NOHZ_BALANCE_KICK);
9598
9599 /*
9600 * We assume there will be no idle load after this update and clear
9601 * the has_blocked flag. If a cpu enters idle in the mean time, it will
9602 * set the has_blocked flag and trig another update of idle load.
9603 * Because a cpu that becomes idle, is added to idle_cpus_mask before
9604 * setting the flag, we are sure to not clear the state and not
9605 * check the load of an idle cpu.
9606 */
9607 WRITE_ONCE(nohz.has_blocked, 0);
9608
9609 /*
9610 * Ensures that if we miss the CPU, we must see the has_blocked
9611 * store from nohz_balance_enter_idle().
9612 */
9613 smp_mb();
9614
9615 for_each_cpu(balance_cpu, nohz.idle_cpus_mask) {
9616 if (balance_cpu == this_cpu || !idle_cpu(balance_cpu))
9617 continue;
9618
9619 /*
9620 * If this CPU gets work to do, stop the load balancing
9621 * work being done for other CPUs. Next load
9622 * balancing owner will pick it up.
9623 */
9624 if (need_resched()) {
9625 has_blocked_load = true;
9626 goto abort;
9627 }
9628
9629 rq = cpu_rq(balance_cpu);
9630
9631 has_blocked_load |= update_nohz_stats(rq, true);
9632
9633 /*
9634 * If time for next balance is due,
9635 * do the balance.
9636 */
9637 if (time_after_eq(jiffies, rq->next_balance)) {
9638 struct rq_flags rf;
9639
9640 rq_lock_irqsave(rq, &rf);
9641 update_rq_clock(rq);
9642 cpu_load_update_idle(rq);
9643 rq_unlock_irqrestore(rq, &rf);
9644
9645 if (flags & NOHZ_BALANCE_KICK)
9646 rebalance_domains(rq, CPU_IDLE);
9647 }
9648
9649 if (time_after(next_balance, rq->next_balance)) {
9650 next_balance = rq->next_balance;
9651 update_next_balance = 1;
9652 }
9653 }
9654
9655 /* Newly idle CPU doesn't need an update */
9656 if (idle != CPU_NEWLY_IDLE) {
9657 update_blocked_averages(this_cpu);
9658 has_blocked_load |= this_rq->has_blocked_load;
9659 }
9660
9661 if (flags & NOHZ_BALANCE_KICK)
9662 rebalance_domains(this_rq, CPU_IDLE);
9663
9664 WRITE_ONCE(nohz.next_blocked,
9665 now + msecs_to_jiffies(LOAD_AVG_PERIOD));
9666
9667 /* The full idle balance loop has been done */
9668 ret = true;
9669
9670abort:
9671 /* There is still blocked load, enable periodic update */
9672 if (has_blocked_load)
9673 WRITE_ONCE(nohz.has_blocked, 1);
9674
9675 /*
9676 * next_balance will be updated only when there is a need.
9677 * When the CPU is attached to null domain for ex, it will not be
9678 * updated.
9679 */
9680 if (likely(update_next_balance))
9681 nohz.next_balance = next_balance;
9682
9683 return ret;
9684}
9685
9686/*
9687 * In CONFIG_NO_HZ_COMMON case, the idle balance kickee will do the
9688 * rebalancing for all the cpus for whom scheduler ticks are stopped.
9689 */
9690static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
9691{
9692 int this_cpu = this_rq->cpu;
9693 unsigned int flags;
9694
9695 if (!(atomic_read(nohz_flags(this_cpu)) & NOHZ_KICK_MASK))
9696 return false;
9697
9698 if (idle != CPU_IDLE) {
9699 atomic_andnot(NOHZ_KICK_MASK, nohz_flags(this_cpu));
9700 return false;
9701 }
9702
9703 /*
9704 * barrier, pairs with nohz_balance_enter_idle(), ensures ...
9705 */
9706 flags = atomic_fetch_andnot(NOHZ_KICK_MASK, nohz_flags(this_cpu));
9707 if (!(flags & NOHZ_KICK_MASK))
9708 return false;
9709
9710 _nohz_idle_balance(this_rq, flags, idle);
9711
9712 return true;
9713}
9714
9715static void nohz_newidle_balance(struct rq *this_rq)
9716{
9717 int this_cpu = this_rq->cpu;
9718
9719 /*
9720 * This CPU doesn't want to be disturbed by scheduler
9721 * housekeeping
9722 */
9723 if (!housekeeping_cpu(this_cpu, HK_FLAG_SCHED))
9724 return;
9725
9726 /* Will wake up very soon. No time for doing anything else*/
9727 if (this_rq->avg_idle < sysctl_sched_migration_cost)
9728 return;
9729
9730 /* Don't need to update blocked load of idle CPUs*/
9731 if (!READ_ONCE(nohz.has_blocked) ||
9732 time_before(jiffies, READ_ONCE(nohz.next_blocked)))
9733 return;
9734
9735 raw_spin_unlock(&this_rq->lock);
9736 /*
9737 * This CPU is going to be idle and blocked load of idle CPUs
9738 * need to be updated. Run the ilb locally as it is a good
9739 * candidate for ilb instead of waking up another idle CPU.
9740 * Kick an normal ilb if we failed to do the update.
9741 */
9742 if (!_nohz_idle_balance(this_rq, NOHZ_STATS_KICK, CPU_NEWLY_IDLE))
9743 kick_ilb(NOHZ_STATS_KICK);
9744 raw_spin_lock(&this_rq->lock);
9745}
9746
9747#else /* !CONFIG_NO_HZ_COMMON */
9748static inline void nohz_balancer_kick(struct rq *rq) { }
9749
9750static inline bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
9751{
9752 return false;
9753}
9754
9755static inline void nohz_newidle_balance(struct rq *this_rq) { }
9756#endif /* CONFIG_NO_HZ_COMMON */
9757
9758/*
9759 * idle_balance is called by schedule() if this_cpu is about to become
9760 * idle. Attempts to pull tasks from other CPUs.
9761 */
9762static int idle_balance(struct rq *this_rq, struct rq_flags *rf)
9763{
9764 unsigned long next_balance = jiffies + HZ;
9765 int this_cpu = this_rq->cpu;
9766 struct sched_domain *sd;
9767 int pulled_task = 0;
9768 u64 curr_cost = 0;
9769
9770 /*
9771 * We must set idle_stamp _before_ calling idle_balance(), such that we
9772 * measure the duration of idle_balance() as idle time.
9773 */
9774 this_rq->idle_stamp = rq_clock(this_rq);
9775
9776 /*
9777 * Do not pull tasks towards !active CPUs...
9778 */
9779 if (!cpu_active(this_cpu))
9780 return 0;
9781
9782 /*
9783 * This is OK, because current is on_cpu, which avoids it being picked
9784 * for load-balance and preemption/IRQs are still disabled avoiding
9785 * further scheduler activity on it and we're being very careful to
9786 * re-start the picking loop.
9787 */
9788 rq_unpin_lock(this_rq, rf);
9789
9790 if (this_rq->avg_idle < sysctl_sched_migration_cost ||
9791 !this_rq->rd->overload) {
9792
9793 rcu_read_lock();
9794 sd = rcu_dereference_check_sched_domain(this_rq->sd);
9795 if (sd)
9796 update_next_balance(sd, &next_balance);
9797 rcu_read_unlock();
9798
9799 nohz_newidle_balance(this_rq);
9800
9801 goto out;
9802 }
9803
9804 raw_spin_unlock(&this_rq->lock);
9805
9806 update_blocked_averages(this_cpu);
9807 rcu_read_lock();
9808 for_each_domain(this_cpu, sd) {
9809 int continue_balancing = 1;
9810 u64 t0, domain_cost;
9811
9812 if (!(sd->flags & SD_LOAD_BALANCE))
9813 continue;
9814
9815 if (this_rq->avg_idle < curr_cost + sd->max_newidle_lb_cost) {
9816 update_next_balance(sd, &next_balance);
9817 break;
9818 }
9819
9820 if (sd->flags & SD_BALANCE_NEWIDLE) {
9821 t0 = sched_clock_cpu(this_cpu);
9822
9823 pulled_task = load_balance(this_cpu, this_rq,
9824 sd, CPU_NEWLY_IDLE,
9825 &continue_balancing);
9826
9827 domain_cost = sched_clock_cpu(this_cpu) - t0;
9828 if (domain_cost > sd->max_newidle_lb_cost)
9829 sd->max_newidle_lb_cost = domain_cost;
9830
9831 curr_cost += domain_cost;
9832 }
9833
9834 update_next_balance(sd, &next_balance);
9835
9836 /*
9837 * Stop searching for tasks to pull if there are
9838 * now runnable tasks on this rq.
9839 */
9840 if (pulled_task || this_rq->nr_running > 0)
9841 break;
9842 }
9843 rcu_read_unlock();
9844
9845 raw_spin_lock(&this_rq->lock);
9846
9847 if (curr_cost > this_rq->max_idle_balance_cost)
9848 this_rq->max_idle_balance_cost = curr_cost;
9849
9850 /*
9851 * While browsing the domains, we released the rq lock, a task could
9852 * have been enqueued in the meantime. Since we're not going idle,
9853 * pretend we pulled a task.
9854 */
9855 if (this_rq->cfs.h_nr_running && !pulled_task)
9856 pulled_task = 1;
9857
9858out:
9859 /* Move the next balance forward */
9860 if (time_after(this_rq->next_balance, next_balance))
9861 this_rq->next_balance = next_balance;
9862
9863 /* Is there a task of a high priority class? */
9864 if (this_rq->nr_running != this_rq->cfs.h_nr_running)
9865 pulled_task = -1;
9866
9867 if (pulled_task)
9868 this_rq->idle_stamp = 0;
9869
9870 rq_repin_lock(this_rq, rf);
9871
9872 return pulled_task;
9381} 9873}
9382#else
9383static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) { }
9384#endif
9385 9874
9386/* 9875/*
9387 * run_rebalance_domains is triggered when needed from the scheduler tick. 9876 * run_rebalance_domains is triggered when needed from the scheduler tick.
@@ -9394,14 +9883,18 @@ static __latent_entropy void run_rebalance_domains(struct softirq_action *h)
9394 CPU_IDLE : CPU_NOT_IDLE; 9883 CPU_IDLE : CPU_NOT_IDLE;
9395 9884
9396 /* 9885 /*
9397 * If this cpu has a pending nohz_balance_kick, then do the 9886 * If this CPU has a pending nohz_balance_kick, then do the
9398 * balancing on behalf of the other idle cpus whose ticks are 9887 * balancing on behalf of the other idle CPUs whose ticks are
9399 * stopped. Do nohz_idle_balance *before* rebalance_domains to 9888 * stopped. Do nohz_idle_balance *before* rebalance_domains to
9400 * give the idle cpus a chance to load balance. Else we may 9889 * give the idle CPUs a chance to load balance. Else we may
9401 * load balance only within the local sched_domain hierarchy 9890 * load balance only within the local sched_domain hierarchy
9402 * and abort nohz_idle_balance altogether if we pull some load. 9891 * and abort nohz_idle_balance altogether if we pull some load.
9403 */ 9892 */
9404 nohz_idle_balance(this_rq, idle); 9893 if (nohz_idle_balance(this_rq, idle))
9894 return;
9895
9896 /* normal load balance */
9897 update_blocked_averages(this_rq->cpu);
9405 rebalance_domains(this_rq, idle); 9898 rebalance_domains(this_rq, idle);
9406} 9899}
9407 9900
@@ -9416,10 +9909,8 @@ void trigger_load_balance(struct rq *rq)
9416 9909
9417 if (time_after_eq(jiffies, rq->next_balance)) 9910 if (time_after_eq(jiffies, rq->next_balance))
9418 raise_softirq(SCHED_SOFTIRQ); 9911 raise_softirq(SCHED_SOFTIRQ);
9419#ifdef CONFIG_NO_HZ_COMMON 9912
9420 if (nohz_kick_needed(rq)) 9913 nohz_balancer_kick(rq);
9421 nohz_balancer_kick();
9422#endif
9423} 9914}
9424 9915
9425static void rq_online_fair(struct rq *rq) 9916static void rq_online_fair(struct rq *rq)
@@ -9440,7 +9931,12 @@ static void rq_offline_fair(struct rq *rq)
9440#endif /* CONFIG_SMP */ 9931#endif /* CONFIG_SMP */
9441 9932
9442/* 9933/*
9443 * scheduler tick hitting a task of our scheduling class: 9934 * scheduler tick hitting a task of our scheduling class.
9935 *
9936 * NOTE: This function can be called remotely by the tick offload that
9937 * goes along full dynticks. Therefore no local assumption can be made
9938 * and everything must be accessed through the @rq and @curr passed in
9939 * parameters.
9444 */ 9940 */
9445static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued) 9941static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
9446{ 9942{
@@ -9591,7 +10087,7 @@ static void attach_entity_cfs_rq(struct sched_entity *se)
9591 10087
9592 /* Synchronize entity with its cfs_rq */ 10088 /* Synchronize entity with its cfs_rq */
9593 update_load_avg(cfs_rq, se, sched_feat(ATTACH_AGE_LOAD) ? 0 : SKIP_AGE_LOAD); 10089 update_load_avg(cfs_rq, se, sched_feat(ATTACH_AGE_LOAD) ? 0 : SKIP_AGE_LOAD);
9594 attach_entity_load_avg(cfs_rq, se); 10090 attach_entity_load_avg(cfs_rq, se, 0);
9595 update_tg_load_avg(cfs_rq, false); 10091 update_tg_load_avg(cfs_rq, false);
9596 propagate_entity_cfs_rq(se); 10092 propagate_entity_cfs_rq(se);
9597} 10093}
@@ -9993,6 +10489,7 @@ __init void init_sched_fair_class(void)
9993 10489
9994#ifdef CONFIG_NO_HZ_COMMON 10490#ifdef CONFIG_NO_HZ_COMMON
9995 nohz.next_balance = jiffies; 10491 nohz.next_balance = jiffies;
10492 nohz.next_blocked = jiffies;
9996 zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT); 10493 zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT);
9997#endif 10494#endif
9998#endif /* SMP */ 10495#endif /* SMP */
diff --git a/kernel/sched/features.h b/kernel/sched/features.h
index 9552fd5854bf..85ae8488039c 100644
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -85,3 +85,8 @@ SCHED_FEAT(ATTACH_AGE_LOAD, true)
85SCHED_FEAT(WA_IDLE, true) 85SCHED_FEAT(WA_IDLE, true)
86SCHED_FEAT(WA_WEIGHT, true) 86SCHED_FEAT(WA_WEIGHT, true)
87SCHED_FEAT(WA_BIAS, true) 87SCHED_FEAT(WA_BIAS, true)
88
89/*
90 * UtilEstimation. Use estimated CPU utilization.
91 */
92SCHED_FEAT(UTIL_EST, true)
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index 7dae9eb8c042..2975f195e1c4 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -1,23 +1,14 @@
1/* 1/*
2 * Generic entry point for the idle threads 2 * Generic entry points for the idle threads and
3 * implementation of the idle task scheduling class.
4 *
5 * (NOTE: these are not related to SCHED_IDLE batch scheduled
6 * tasks which are handled in sched/fair.c )
3 */ 7 */
4#include <linux/sched.h> 8#include "sched.h"
5#include <linux/sched/idle.h>
6#include <linux/cpu.h>
7#include <linux/cpuidle.h>
8#include <linux/cpuhotplug.h>
9#include <linux/tick.h>
10#include <linux/mm.h>
11#include <linux/stackprotector.h>
12#include <linux/suspend.h>
13#include <linux/livepatch.h>
14
15#include <asm/tlb.h>
16 9
17#include <trace/events/power.h> 10#include <trace/events/power.h>
18 11
19#include "sched.h"
20
21/* Linker adds these: start and end of __cpuidle functions */ 12/* Linker adds these: start and end of __cpuidle functions */
22extern char __cpuidle_text_start[], __cpuidle_text_end[]; 13extern char __cpuidle_text_start[], __cpuidle_text_end[];
23 14
@@ -46,6 +37,7 @@ void cpu_idle_poll_ctrl(bool enable)
46static int __init cpu_idle_poll_setup(char *__unused) 37static int __init cpu_idle_poll_setup(char *__unused)
47{ 38{
48 cpu_idle_force_poll = 1; 39 cpu_idle_force_poll = 1;
40
49 return 1; 41 return 1;
50} 42}
51__setup("nohlt", cpu_idle_poll_setup); 43__setup("nohlt", cpu_idle_poll_setup);
@@ -53,6 +45,7 @@ __setup("nohlt", cpu_idle_poll_setup);
53static int __init cpu_idle_nopoll_setup(char *__unused) 45static int __init cpu_idle_nopoll_setup(char *__unused)
54{ 46{
55 cpu_idle_force_poll = 0; 47 cpu_idle_force_poll = 0;
48
56 return 1; 49 return 1;
57} 50}
58__setup("hlt", cpu_idle_nopoll_setup); 51__setup("hlt", cpu_idle_nopoll_setup);
@@ -64,12 +57,14 @@ static noinline int __cpuidle cpu_idle_poll(void)
64 trace_cpu_idle_rcuidle(0, smp_processor_id()); 57 trace_cpu_idle_rcuidle(0, smp_processor_id());
65 local_irq_enable(); 58 local_irq_enable();
66 stop_critical_timings(); 59 stop_critical_timings();
60
67 while (!tif_need_resched() && 61 while (!tif_need_resched() &&
68 (cpu_idle_force_poll || tick_check_broadcast_expired())) 62 (cpu_idle_force_poll || tick_check_broadcast_expired()))
69 cpu_relax(); 63 cpu_relax();
70 start_critical_timings(); 64 start_critical_timings();
71 trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, smp_processor_id()); 65 trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, smp_processor_id());
72 rcu_idle_exit(); 66 rcu_idle_exit();
67
73 return 1; 68 return 1;
74} 69}
75 70
@@ -332,8 +327,8 @@ void cpu_startup_entry(enum cpuhp_state state)
332{ 327{
333 /* 328 /*
334 * This #ifdef needs to die, but it's too late in the cycle to 329 * This #ifdef needs to die, but it's too late in the cycle to
335 * make this generic (arm and sh have never invoked the canary 330 * make this generic (ARM and SH have never invoked the canary
336 * init for the non boot cpus!). Will be fixed in 3.11 331 * init for the non boot CPUs!). Will be fixed in 3.11
337 */ 332 */
338#ifdef CONFIG_X86 333#ifdef CONFIG_X86
339 /* 334 /*
@@ -350,3 +345,116 @@ void cpu_startup_entry(enum cpuhp_state state)
350 while (1) 345 while (1)
351 do_idle(); 346 do_idle();
352} 347}
348
349/*
350 * idle-task scheduling class.
351 */
352
353#ifdef CONFIG_SMP
354static int
355select_task_rq_idle(struct task_struct *p, int cpu, int sd_flag, int flags)
356{
357 return task_cpu(p); /* IDLE tasks as never migrated */
358}
359#endif
360
361/*
362 * Idle tasks are unconditionally rescheduled:
363 */
364static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int flags)
365{
366 resched_curr(rq);
367}
368
369static struct task_struct *
370pick_next_task_idle(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
371{
372 put_prev_task(rq, prev);
373 update_idle_core(rq);
374 schedstat_inc(rq->sched_goidle);
375
376 return rq->idle;
377}
378
379/*
380 * It is not legal to sleep in the idle task - print a warning
381 * message if some code attempts to do it:
382 */
383static void
384dequeue_task_idle(struct rq *rq, struct task_struct *p, int flags)
385{
386 raw_spin_unlock_irq(&rq->lock);
387 printk(KERN_ERR "bad: scheduling from the idle thread!\n");
388 dump_stack();
389 raw_spin_lock_irq(&rq->lock);
390}
391
392static void put_prev_task_idle(struct rq *rq, struct task_struct *prev)
393{
394}
395
396/*
397 * scheduler tick hitting a task of our scheduling class.
398 *
399 * NOTE: This function can be called remotely by the tick offload that
400 * goes along full dynticks. Therefore no local assumption can be made
401 * and everything must be accessed through the @rq and @curr passed in
402 * parameters.
403 */
404static void task_tick_idle(struct rq *rq, struct task_struct *curr, int queued)
405{
406}
407
408static void set_curr_task_idle(struct rq *rq)
409{
410}
411
412static void switched_to_idle(struct rq *rq, struct task_struct *p)
413{
414 BUG();
415}
416
417static void
418prio_changed_idle(struct rq *rq, struct task_struct *p, int oldprio)
419{
420 BUG();
421}
422
423static unsigned int get_rr_interval_idle(struct rq *rq, struct task_struct *task)
424{
425 return 0;
426}
427
428static void update_curr_idle(struct rq *rq)
429{
430}
431
432/*
433 * Simple, special scheduling class for the per-CPU idle tasks:
434 */
435const struct sched_class idle_sched_class = {
436 /* .next is NULL */
437 /* no enqueue/yield_task for idle tasks */
438
439 /* dequeue is not valid, we print a debug message there: */
440 .dequeue_task = dequeue_task_idle,
441
442 .check_preempt_curr = check_preempt_curr_idle,
443
444 .pick_next_task = pick_next_task_idle,
445 .put_prev_task = put_prev_task_idle,
446
447#ifdef CONFIG_SMP
448 .select_task_rq = select_task_rq_idle,
449 .set_cpus_allowed = set_cpus_allowed_common,
450#endif
451
452 .set_curr_task = set_curr_task_idle,
453 .task_tick = task_tick_idle,
454
455 .get_rr_interval = get_rr_interval_idle,
456
457 .prio_changed = prio_changed_idle,
458 .switched_to = switched_to_idle,
459 .update_curr = update_curr_idle,
460};
diff --git a/kernel/sched/idle_task.c b/kernel/sched/idle_task.c
deleted file mode 100644
index d518664cce4f..000000000000
--- a/kernel/sched/idle_task.c
+++ /dev/null
@@ -1,110 +0,0 @@
1// SPDX-License-Identifier: GPL-2.0
2#include "sched.h"
3
4/*
5 * idle-task scheduling class.
6 *
7 * (NOTE: these are not related to SCHED_IDLE tasks which are
8 * handled in sched/fair.c)
9 */
10
11#ifdef CONFIG_SMP
12static int
13select_task_rq_idle(struct task_struct *p, int cpu, int sd_flag, int flags)
14{
15 return task_cpu(p); /* IDLE tasks as never migrated */
16}
17#endif /* CONFIG_SMP */
18
19/*
20 * Idle tasks are unconditionally rescheduled:
21 */
22static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int flags)
23{
24 resched_curr(rq);
25}
26
27static struct task_struct *
28pick_next_task_idle(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
29{
30 put_prev_task(rq, prev);
31 update_idle_core(rq);
32 schedstat_inc(rq->sched_goidle);
33 return rq->idle;
34}
35
36/*
37 * It is not legal to sleep in the idle task - print a warning
38 * message if some code attempts to do it:
39 */
40static void
41dequeue_task_idle(struct rq *rq, struct task_struct *p, int flags)
42{
43 raw_spin_unlock_irq(&rq->lock);
44 printk(KERN_ERR "bad: scheduling from the idle thread!\n");
45 dump_stack();
46 raw_spin_lock_irq(&rq->lock);
47}
48
49static void put_prev_task_idle(struct rq *rq, struct task_struct *prev)
50{
51 rq_last_tick_reset(rq);
52}
53
54static void task_tick_idle(struct rq *rq, struct task_struct *curr, int queued)
55{
56}
57
58static void set_curr_task_idle(struct rq *rq)
59{
60}
61
62static void switched_to_idle(struct rq *rq, struct task_struct *p)
63{
64 BUG();
65}
66
67static void
68prio_changed_idle(struct rq *rq, struct task_struct *p, int oldprio)
69{
70 BUG();
71}
72
73static unsigned int get_rr_interval_idle(struct rq *rq, struct task_struct *task)
74{
75 return 0;
76}
77
78static void update_curr_idle(struct rq *rq)
79{
80}
81
82/*
83 * Simple, special scheduling class for the per-CPU idle tasks:
84 */
85const struct sched_class idle_sched_class = {
86 /* .next is NULL */
87 /* no enqueue/yield_task for idle tasks */
88
89 /* dequeue is not valid, we print a debug message there: */
90 .dequeue_task = dequeue_task_idle,
91
92 .check_preempt_curr = check_preempt_curr_idle,
93
94 .pick_next_task = pick_next_task_idle,
95 .put_prev_task = put_prev_task_idle,
96
97#ifdef CONFIG_SMP
98 .select_task_rq = select_task_rq_idle,
99 .set_cpus_allowed = set_cpus_allowed_common,
100#endif
101
102 .set_curr_task = set_curr_task_idle,
103 .task_tick = task_tick_idle,
104
105 .get_rr_interval = get_rr_interval_idle,
106
107 .prio_changed = prio_changed_idle,
108 .switched_to = switched_to_idle,
109 .update_curr = update_curr_idle,
110};
diff --git a/kernel/sched/isolation.c b/kernel/sched/isolation.c
index b71b436f59f2..e6802181900f 100644
--- a/kernel/sched/isolation.c
+++ b/kernel/sched/isolation.c
@@ -3,15 +3,10 @@
3 * any CPU: unbound workqueues, timers, kthreads and any offloadable work. 3 * any CPU: unbound workqueues, timers, kthreads and any offloadable work.
4 * 4 *
5 * Copyright (C) 2017 Red Hat, Inc., Frederic Weisbecker 5 * Copyright (C) 2017 Red Hat, Inc., Frederic Weisbecker
6 * Copyright (C) 2017-2018 SUSE, Frederic Weisbecker
6 * 7 *
7 */ 8 */
8 9#include "sched.h"
9#include <linux/sched/isolation.h>
10#include <linux/tick.h>
11#include <linux/init.h>
12#include <linux/kernel.h>
13#include <linux/static_key.h>
14#include <linux/ctype.h>
15 10
16DEFINE_STATIC_KEY_FALSE(housekeeping_overriden); 11DEFINE_STATIC_KEY_FALSE(housekeeping_overriden);
17EXPORT_SYMBOL_GPL(housekeeping_overriden); 12EXPORT_SYMBOL_GPL(housekeeping_overriden);
@@ -60,6 +55,9 @@ void __init housekeeping_init(void)
60 55
61 static_branch_enable(&housekeeping_overriden); 56 static_branch_enable(&housekeeping_overriden);
62 57
58 if (housekeeping_flags & HK_FLAG_TICK)
59 sched_tick_offload_init();
60
63 /* We need at least one CPU to handle housekeeping work */ 61 /* We need at least one CPU to handle housekeeping work */
64 WARN_ON_ONCE(cpumask_empty(housekeeping_mask)); 62 WARN_ON_ONCE(cpumask_empty(housekeeping_mask));
65} 63}
@@ -119,7 +117,7 @@ static int __init housekeeping_nohz_full_setup(char *str)
119{ 117{
120 unsigned int flags; 118 unsigned int flags;
121 119
122 flags = HK_FLAG_TICK | HK_FLAG_TIMER | HK_FLAG_RCU | HK_FLAG_MISC; 120 flags = HK_FLAG_TICK | HK_FLAG_WQ | HK_FLAG_TIMER | HK_FLAG_RCU | HK_FLAG_MISC;
123 121
124 return housekeeping_setup(str, flags); 122 return housekeeping_setup(str, flags);
125} 123}
diff --git a/kernel/sched/loadavg.c b/kernel/sched/loadavg.c
index 89a989e4d758..a171c1258109 100644
--- a/kernel/sched/loadavg.c
+++ b/kernel/sched/loadavg.c
@@ -6,10 +6,6 @@
6 * figure. Its a silly number but people think its important. We go through 6 * figure. Its a silly number but people think its important. We go through
7 * great pains to make it work on big machines and tickless kernels. 7 * great pains to make it work on big machines and tickless kernels.
8 */ 8 */
9
10#include <linux/export.h>
11#include <linux/sched/loadavg.h>
12
13#include "sched.h" 9#include "sched.h"
14 10
15/* 11/*
@@ -32,29 +28,29 @@
32 * Due to a number of reasons the above turns in the mess below: 28 * Due to a number of reasons the above turns in the mess below:
33 * 29 *
34 * - for_each_possible_cpu() is prohibitively expensive on machines with 30 * - for_each_possible_cpu() is prohibitively expensive on machines with
35 * serious number of cpus, therefore we need to take a distributed approach 31 * serious number of CPUs, therefore we need to take a distributed approach
36 * to calculating nr_active. 32 * to calculating nr_active.
37 * 33 *
38 * \Sum_i x_i(t) = \Sum_i x_i(t) - x_i(t_0) | x_i(t_0) := 0 34 * \Sum_i x_i(t) = \Sum_i x_i(t) - x_i(t_0) | x_i(t_0) := 0
39 * = \Sum_i { \Sum_j=1 x_i(t_j) - x_i(t_j-1) } 35 * = \Sum_i { \Sum_j=1 x_i(t_j) - x_i(t_j-1) }
40 * 36 *
41 * So assuming nr_active := 0 when we start out -- true per definition, we 37 * So assuming nr_active := 0 when we start out -- true per definition, we
42 * can simply take per-cpu deltas and fold those into a global accumulate 38 * can simply take per-CPU deltas and fold those into a global accumulate
43 * to obtain the same result. See calc_load_fold_active(). 39 * to obtain the same result. See calc_load_fold_active().
44 * 40 *
45 * Furthermore, in order to avoid synchronizing all per-cpu delta folding 41 * Furthermore, in order to avoid synchronizing all per-CPU delta folding
46 * across the machine, we assume 10 ticks is sufficient time for every 42 * across the machine, we assume 10 ticks is sufficient time for every
47 * cpu to have completed this task. 43 * CPU to have completed this task.
48 * 44 *
49 * This places an upper-bound on the IRQ-off latency of the machine. Then 45 * This places an upper-bound on the IRQ-off latency of the machine. Then
50 * again, being late doesn't loose the delta, just wrecks the sample. 46 * again, being late doesn't loose the delta, just wrecks the sample.
51 * 47 *
52 * - cpu_rq()->nr_uninterruptible isn't accurately tracked per-cpu because 48 * - cpu_rq()->nr_uninterruptible isn't accurately tracked per-CPU because
53 * this would add another cross-cpu cacheline miss and atomic operation 49 * this would add another cross-CPU cacheline miss and atomic operation
54 * to the wakeup path. Instead we increment on whatever cpu the task ran 50 * to the wakeup path. Instead we increment on whatever CPU the task ran
55 * when it went into uninterruptible state and decrement on whatever cpu 51 * when it went into uninterruptible state and decrement on whatever CPU
56 * did the wakeup. This means that only the sum of nr_uninterruptible over 52 * did the wakeup. This means that only the sum of nr_uninterruptible over
57 * all cpus yields the correct result. 53 * all CPUs yields the correct result.
58 * 54 *
59 * This covers the NO_HZ=n code, for extra head-aches, see the comment below. 55 * This covers the NO_HZ=n code, for extra head-aches, see the comment below.
60 */ 56 */
@@ -115,11 +111,11 @@ calc_load(unsigned long load, unsigned long exp, unsigned long active)
115 * Handle NO_HZ for the global load-average. 111 * Handle NO_HZ for the global load-average.
116 * 112 *
117 * Since the above described distributed algorithm to compute the global 113 * Since the above described distributed algorithm to compute the global
118 * load-average relies on per-cpu sampling from the tick, it is affected by 114 * load-average relies on per-CPU sampling from the tick, it is affected by
119 * NO_HZ. 115 * NO_HZ.
120 * 116 *
121 * The basic idea is to fold the nr_active delta into a global NO_HZ-delta upon 117 * The basic idea is to fold the nr_active delta into a global NO_HZ-delta upon
122 * entering NO_HZ state such that we can include this as an 'extra' cpu delta 118 * entering NO_HZ state such that we can include this as an 'extra' CPU delta
123 * when we read the global state. 119 * when we read the global state.
124 * 120 *
125 * Obviously reality has to ruin such a delightfully simple scheme: 121 * Obviously reality has to ruin such a delightfully simple scheme:
@@ -146,9 +142,9 @@ calc_load(unsigned long load, unsigned long exp, unsigned long active)
146 * busy state. 142 * busy state.
147 * 143 *
148 * This is solved by pushing the window forward, and thus skipping the 144 * This is solved by pushing the window forward, and thus skipping the
149 * sample, for this cpu (effectively using the NO_HZ-delta for this cpu which 145 * sample, for this CPU (effectively using the NO_HZ-delta for this CPU which
150 * was in effect at the time the window opened). This also solves the issue 146 * was in effect at the time the window opened). This also solves the issue
151 * of having to deal with a cpu having been in NO_HZ for multiple LOAD_FREQ 147 * of having to deal with a CPU having been in NO_HZ for multiple LOAD_FREQ
152 * intervals. 148 * intervals.
153 * 149 *
154 * When making the ILB scale, we should try to pull this in as well. 150 * When making the ILB scale, we should try to pull this in as well.
@@ -299,7 +295,7 @@ calc_load_n(unsigned long load, unsigned long exp,
299} 295}
300 296
301/* 297/*
302 * NO_HZ can leave us missing all per-cpu ticks calling 298 * NO_HZ can leave us missing all per-CPU ticks calling
303 * calc_load_fold_active(), but since a NO_HZ CPU folds its delta into 299 * calc_load_fold_active(), but since a NO_HZ CPU folds its delta into
304 * calc_load_nohz per calc_load_nohz_start(), all we need to do is fold 300 * calc_load_nohz per calc_load_nohz_start(), all we need to do is fold
305 * in the pending NO_HZ delta if our NO_HZ period crossed a load cycle boundary. 301 * in the pending NO_HZ delta if our NO_HZ period crossed a load cycle boundary.
@@ -363,7 +359,7 @@ void calc_global_load(unsigned long ticks)
363 return; 359 return;
364 360
365 /* 361 /*
366 * Fold the 'old' NO_HZ-delta to include all NO_HZ cpus. 362 * Fold the 'old' NO_HZ-delta to include all NO_HZ CPUs.
367 */ 363 */
368 delta = calc_load_nohz_fold(); 364 delta = calc_load_nohz_fold();
369 if (delta) 365 if (delta)
diff --git a/kernel/sched/membarrier.c b/kernel/sched/membarrier.c
index 5d0762633639..76e0eaf4654e 100644
--- a/kernel/sched/membarrier.c
+++ b/kernel/sched/membarrier.c
@@ -13,32 +13,25 @@
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details. 14 * GNU General Public License for more details.
15 */ 15 */
16 16#include "sched.h"
17#include <linux/syscalls.h>
18#include <linux/membarrier.h>
19#include <linux/tick.h>
20#include <linux/cpumask.h>
21#include <linux/atomic.h>
22
23#include "sched.h" /* for cpu_rq(). */
24 17
25/* 18/*
26 * Bitmask made from a "or" of all commands within enum membarrier_cmd, 19 * Bitmask made from a "or" of all commands within enum membarrier_cmd,
27 * except MEMBARRIER_CMD_QUERY. 20 * except MEMBARRIER_CMD_QUERY.
28 */ 21 */
29#ifdef CONFIG_ARCH_HAS_MEMBARRIER_SYNC_CORE 22#ifdef CONFIG_ARCH_HAS_MEMBARRIER_SYNC_CORE
30#define MEMBARRIER_PRIVATE_EXPEDITED_SYNC_CORE_BITMASK \ 23#define MEMBARRIER_PRIVATE_EXPEDITED_SYNC_CORE_BITMASK \
31 (MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE \ 24 (MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE \
32 | MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_SYNC_CORE) 25 | MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_SYNC_CORE)
33#else 26#else
34#define MEMBARRIER_PRIVATE_EXPEDITED_SYNC_CORE_BITMASK 0 27#define MEMBARRIER_PRIVATE_EXPEDITED_SYNC_CORE_BITMASK 0
35#endif 28#endif
36 29
37#define MEMBARRIER_CMD_BITMASK \ 30#define MEMBARRIER_CMD_BITMASK \
38 (MEMBARRIER_CMD_GLOBAL | MEMBARRIER_CMD_GLOBAL_EXPEDITED \ 31 (MEMBARRIER_CMD_GLOBAL | MEMBARRIER_CMD_GLOBAL_EXPEDITED \
39 | MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED \ 32 | MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED \
40 | MEMBARRIER_CMD_PRIVATE_EXPEDITED \ 33 | MEMBARRIER_CMD_PRIVATE_EXPEDITED \
41 | MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED \ 34 | MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED \
42 | MEMBARRIER_PRIVATE_EXPEDITED_SYNC_CORE_BITMASK) 35 | MEMBARRIER_PRIVATE_EXPEDITED_SYNC_CORE_BITMASK)
43 36
44static void ipi_mb(void *info) 37static void ipi_mb(void *info)
@@ -85,6 +78,7 @@ static int membarrier_global_expedited(void)
85 */ 78 */
86 if (cpu == raw_smp_processor_id()) 79 if (cpu == raw_smp_processor_id())
87 continue; 80 continue;
81
88 rcu_read_lock(); 82 rcu_read_lock();
89 p = task_rcu_dereference(&cpu_rq(cpu)->curr); 83 p = task_rcu_dereference(&cpu_rq(cpu)->curr);
90 if (p && p->mm && (atomic_read(&p->mm->membarrier_state) & 84 if (p && p->mm && (atomic_read(&p->mm->membarrier_state) &
@@ -188,6 +182,7 @@ static int membarrier_private_expedited(int flags)
188 * rq->curr modification in scheduler. 182 * rq->curr modification in scheduler.
189 */ 183 */
190 smp_mb(); /* exit from system call is not a mb */ 184 smp_mb(); /* exit from system call is not a mb */
185
191 return 0; 186 return 0;
192} 187}
193 188
@@ -219,6 +214,7 @@ static int membarrier_register_global_expedited(void)
219 } 214 }
220 atomic_or(MEMBARRIER_STATE_GLOBAL_EXPEDITED_READY, 215 atomic_or(MEMBARRIER_STATE_GLOBAL_EXPEDITED_READY,
221 &mm->membarrier_state); 216 &mm->membarrier_state);
217
222 return 0; 218 return 0;
223} 219}
224 220
@@ -253,6 +249,7 @@ static int membarrier_register_private_expedited(int flags)
253 synchronize_sched(); 249 synchronize_sched();
254 } 250 }
255 atomic_or(state, &mm->membarrier_state); 251 atomic_or(state, &mm->membarrier_state);
252
256 return 0; 253 return 0;
257} 254}
258 255
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index aad49451584e..86b77987435e 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -3,12 +3,8 @@
3 * Real-Time Scheduling Class (mapped to the SCHED_FIFO and SCHED_RR 3 * Real-Time Scheduling Class (mapped to the SCHED_FIFO and SCHED_RR
4 * policies) 4 * policies)
5 */ 5 */
6
7#include "sched.h" 6#include "sched.h"
8 7
9#include <linux/slab.h>
10#include <linux/irq_work.h>
11
12int sched_rr_timeslice = RR_TIMESLICE; 8int sched_rr_timeslice = RR_TIMESLICE;
13int sysctl_sched_rr_timeslice = (MSEC_PER_SEC / HZ) * RR_TIMESLICE; 9int sysctl_sched_rr_timeslice = (MSEC_PER_SEC / HZ) * RR_TIMESLICE;
14 10
@@ -359,7 +355,7 @@ static DEFINE_PER_CPU(struct callback_head, rt_pull_head);
359static void push_rt_tasks(struct rq *); 355static void push_rt_tasks(struct rq *);
360static void pull_rt_task(struct rq *); 356static void pull_rt_task(struct rq *);
361 357
362static inline void queue_push_tasks(struct rq *rq) 358static inline void rt_queue_push_tasks(struct rq *rq)
363{ 359{
364 if (!has_pushable_tasks(rq)) 360 if (!has_pushable_tasks(rq))
365 return; 361 return;
@@ -367,7 +363,7 @@ static inline void queue_push_tasks(struct rq *rq)
367 queue_balance_callback(rq, &per_cpu(rt_push_head, rq->cpu), push_rt_tasks); 363 queue_balance_callback(rq, &per_cpu(rt_push_head, rq->cpu), push_rt_tasks);
368} 364}
369 365
370static inline void queue_pull_task(struct rq *rq) 366static inline void rt_queue_pull_task(struct rq *rq)
371{ 367{
372 queue_balance_callback(rq, &per_cpu(rt_pull_head, rq->cpu), pull_rt_task); 368 queue_balance_callback(rq, &per_cpu(rt_pull_head, rq->cpu), pull_rt_task);
373} 369}
@@ -425,7 +421,7 @@ static inline void pull_rt_task(struct rq *this_rq)
425{ 421{
426} 422}
427 423
428static inline void queue_push_tasks(struct rq *rq) 424static inline void rt_queue_push_tasks(struct rq *rq)
429{ 425{
430} 426}
431#endif /* CONFIG_SMP */ 427#endif /* CONFIG_SMP */
@@ -961,9 +957,6 @@ static void update_curr_rt(struct rq *rq)
961 if (unlikely((s64)delta_exec <= 0)) 957 if (unlikely((s64)delta_exec <= 0))
962 return; 958 return;
963 959
964 /* Kick cpufreq (see the comment in kernel/sched/sched.h). */
965 cpufreq_update_util(rq, SCHED_CPUFREQ_RT);
966
967 schedstat_set(curr->se.statistics.exec_max, 960 schedstat_set(curr->se.statistics.exec_max,
968 max(curr->se.statistics.exec_max, delta_exec)); 961 max(curr->se.statistics.exec_max, delta_exec));
969 962
@@ -1005,6 +998,9 @@ dequeue_top_rt_rq(struct rt_rq *rt_rq)
1005 998
1006 sub_nr_running(rq, rt_rq->rt_nr_running); 999 sub_nr_running(rq, rt_rq->rt_nr_running);
1007 rt_rq->rt_queued = 0; 1000 rt_rq->rt_queued = 0;
1001
1002 /* Kick cpufreq (see the comment in kernel/sched/sched.h). */
1003 cpufreq_update_util(rq, 0);
1008} 1004}
1009 1005
1010static void 1006static void
@@ -1021,6 +1017,9 @@ enqueue_top_rt_rq(struct rt_rq *rt_rq)
1021 1017
1022 add_nr_running(rq, rt_rq->rt_nr_running); 1018 add_nr_running(rq, rt_rq->rt_nr_running);
1023 rt_rq->rt_queued = 1; 1019 rt_rq->rt_queued = 1;
1020
1021 /* Kick cpufreq (see the comment in kernel/sched/sched.h). */
1022 cpufreq_update_util(rq, 0);
1024} 1023}
1025 1024
1026#if defined CONFIG_SMP 1025#if defined CONFIG_SMP
@@ -1453,9 +1452,9 @@ static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p)
1453 return; 1452 return;
1454 1453
1455 /* 1454 /*
1456 * There appears to be other cpus that can accept 1455 * There appear to be other CPUs that can accept
1457 * current and none to run 'p', so lets reschedule 1456 * the current task but none can run 'p', so lets reschedule
1458 * to try and push current away: 1457 * to try and push the current task away:
1459 */ 1458 */
1460 requeue_task_rt(rq, p, 1); 1459 requeue_task_rt(rq, p, 1);
1461 resched_curr(rq); 1460 resched_curr(rq);
@@ -1569,7 +1568,7 @@ pick_next_task_rt(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
1569 /* The running task is never eligible for pushing */ 1568 /* The running task is never eligible for pushing */
1570 dequeue_pushable_task(rq, p); 1569 dequeue_pushable_task(rq, p);
1571 1570
1572 queue_push_tasks(rq); 1571 rt_queue_push_tasks(rq);
1573 1572
1574 return p; 1573 return p;
1575} 1574}
@@ -1596,12 +1595,13 @@ static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu)
1596 if (!task_running(rq, p) && 1595 if (!task_running(rq, p) &&
1597 cpumask_test_cpu(cpu, &p->cpus_allowed)) 1596 cpumask_test_cpu(cpu, &p->cpus_allowed))
1598 return 1; 1597 return 1;
1598
1599 return 0; 1599 return 0;
1600} 1600}
1601 1601
1602/* 1602/*
1603 * Return the highest pushable rq's task, which is suitable to be executed 1603 * Return the highest pushable rq's task, which is suitable to be executed
1604 * on the cpu, NULL otherwise 1604 * on the CPU, NULL otherwise
1605 */ 1605 */
1606static struct task_struct *pick_highest_pushable_task(struct rq *rq, int cpu) 1606static struct task_struct *pick_highest_pushable_task(struct rq *rq, int cpu)
1607{ 1607{
@@ -1639,11 +1639,11 @@ static int find_lowest_rq(struct task_struct *task)
1639 return -1; /* No targets found */ 1639 return -1; /* No targets found */
1640 1640
1641 /* 1641 /*
1642 * At this point we have built a mask of cpus representing the 1642 * At this point we have built a mask of CPUs representing the
1643 * lowest priority tasks in the system. Now we want to elect 1643 * lowest priority tasks in the system. Now we want to elect
1644 * the best one based on our affinity and topology. 1644 * the best one based on our affinity and topology.
1645 * 1645 *
1646 * We prioritize the last cpu that the task executed on since 1646 * We prioritize the last CPU that the task executed on since
1647 * it is most likely cache-hot in that location. 1647 * it is most likely cache-hot in that location.
1648 */ 1648 */
1649 if (cpumask_test_cpu(cpu, lowest_mask)) 1649 if (cpumask_test_cpu(cpu, lowest_mask))
@@ -1651,7 +1651,7 @@ static int find_lowest_rq(struct task_struct *task)
1651 1651
1652 /* 1652 /*
1653 * Otherwise, we consult the sched_domains span maps to figure 1653 * Otherwise, we consult the sched_domains span maps to figure
1654 * out which cpu is logically closest to our hot cache data. 1654 * out which CPU is logically closest to our hot cache data.
1655 */ 1655 */
1656 if (!cpumask_test_cpu(this_cpu, lowest_mask)) 1656 if (!cpumask_test_cpu(this_cpu, lowest_mask))
1657 this_cpu = -1; /* Skip this_cpu opt if not among lowest */ 1657 this_cpu = -1; /* Skip this_cpu opt if not among lowest */
@@ -1692,6 +1692,7 @@ static int find_lowest_rq(struct task_struct *task)
1692 cpu = cpumask_any(lowest_mask); 1692 cpu = cpumask_any(lowest_mask);
1693 if (cpu < nr_cpu_ids) 1693 if (cpu < nr_cpu_ids)
1694 return cpu; 1694 return cpu;
1695
1695 return -1; 1696 return -1;
1696} 1697}
1697 1698
@@ -1827,7 +1828,7 @@ retry:
1827 * The task hasn't migrated, and is still the next 1828 * The task hasn't migrated, and is still the next
1828 * eligible task, but we failed to find a run-queue 1829 * eligible task, but we failed to find a run-queue
1829 * to push it to. Do not retry in this case, since 1830 * to push it to. Do not retry in this case, since
1830 * other cpus will pull from us when ready. 1831 * other CPUs will pull from us when ready.
1831 */ 1832 */
1832 goto out; 1833 goto out;
1833 } 1834 }
@@ -1919,7 +1920,7 @@ static int rto_next_cpu(struct root_domain *rd)
1919 * rt_next_cpu() will simply return the first CPU found in 1920 * rt_next_cpu() will simply return the first CPU found in
1920 * the rto_mask. 1921 * the rto_mask.
1921 * 1922 *
1922 * If rto_next_cpu() is called with rto_cpu is a valid cpu, it 1923 * If rto_next_cpu() is called with rto_cpu is a valid CPU, it
1923 * will return the next CPU found in the rto_mask. 1924 * will return the next CPU found in the rto_mask.
1924 * 1925 *
1925 * If there are no more CPUs left in the rto_mask, then a check is made 1926 * If there are no more CPUs left in the rto_mask, then a check is made
@@ -1980,7 +1981,7 @@ static void tell_cpu_to_push(struct rq *rq)
1980 raw_spin_lock(&rq->rd->rto_lock); 1981 raw_spin_lock(&rq->rd->rto_lock);
1981 1982
1982 /* 1983 /*
1983 * The rto_cpu is updated under the lock, if it has a valid cpu 1984 * The rto_cpu is updated under the lock, if it has a valid CPU
1984 * then the IPI is still running and will continue due to the 1985 * then the IPI is still running and will continue due to the
1985 * update to loop_next, and nothing needs to be done here. 1986 * update to loop_next, and nothing needs to be done here.
1986 * Otherwise it is finishing up and an ipi needs to be sent. 1987 * Otherwise it is finishing up and an ipi needs to be sent.
@@ -2105,7 +2106,7 @@ static void pull_rt_task(struct rq *this_rq)
2105 2106
2106 /* 2107 /*
2107 * There's a chance that p is higher in priority 2108 * There's a chance that p is higher in priority
2108 * than what's currently running on its cpu. 2109 * than what's currently running on its CPU.
2109 * This is just that p is wakeing up and hasn't 2110 * This is just that p is wakeing up and hasn't
2110 * had a chance to schedule. We only pull 2111 * had a chance to schedule. We only pull
2111 * p if it is lower in priority than the 2112 * p if it is lower in priority than the
@@ -2187,7 +2188,7 @@ static void switched_from_rt(struct rq *rq, struct task_struct *p)
2187 if (!task_on_rq_queued(p) || rq->rt.rt_nr_running) 2188 if (!task_on_rq_queued(p) || rq->rt.rt_nr_running)
2188 return; 2189 return;
2189 2190
2190 queue_pull_task(rq); 2191 rt_queue_pull_task(rq);
2191} 2192}
2192 2193
2193void __init init_sched_rt_class(void) 2194void __init init_sched_rt_class(void)
@@ -2218,7 +2219,7 @@ static void switched_to_rt(struct rq *rq, struct task_struct *p)
2218 if (task_on_rq_queued(p) && rq->curr != p) { 2219 if (task_on_rq_queued(p) && rq->curr != p) {
2219#ifdef CONFIG_SMP 2220#ifdef CONFIG_SMP
2220 if (p->nr_cpus_allowed > 1 && rq->rt.overloaded) 2221 if (p->nr_cpus_allowed > 1 && rq->rt.overloaded)
2221 queue_push_tasks(rq); 2222 rt_queue_push_tasks(rq);
2222#endif /* CONFIG_SMP */ 2223#endif /* CONFIG_SMP */
2223 if (p->prio < rq->curr->prio && cpu_online(cpu_of(rq))) 2224 if (p->prio < rq->curr->prio && cpu_online(cpu_of(rq)))
2224 resched_curr(rq); 2225 resched_curr(rq);
@@ -2242,7 +2243,7 @@ prio_changed_rt(struct rq *rq, struct task_struct *p, int oldprio)
2242 * may need to pull tasks to this runqueue. 2243 * may need to pull tasks to this runqueue.
2243 */ 2244 */
2244 if (oldprio < p->prio) 2245 if (oldprio < p->prio)
2245 queue_pull_task(rq); 2246 rt_queue_pull_task(rq);
2246 2247
2247 /* 2248 /*
2248 * If there's a higher priority task waiting to run 2249 * If there's a higher priority task waiting to run
@@ -2292,6 +2293,14 @@ static void watchdog(struct rq *rq, struct task_struct *p)
2292static inline void watchdog(struct rq *rq, struct task_struct *p) { } 2293static inline void watchdog(struct rq *rq, struct task_struct *p) { }
2293#endif 2294#endif
2294 2295
2296/*
2297 * scheduler tick hitting a task of our scheduling class.
2298 *
2299 * NOTE: This function can be called remotely by the tick offload that
2300 * goes along full dynticks. Therefore no local assumption can be made
2301 * and everything must be accessed through the @rq and @curr passed in
2302 * parameters.
2303 */
2295static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued) 2304static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued)
2296{ 2305{
2297 struct sched_rt_entity *rt_se = &p->rt; 2306 struct sched_rt_entity *rt_se = &p->rt;
@@ -2685,6 +2694,7 @@ int sched_rr_handler(struct ctl_table *table, int write,
2685 msecs_to_jiffies(sysctl_sched_rr_timeslice); 2694 msecs_to_jiffies(sysctl_sched_rr_timeslice);
2686 } 2695 }
2687 mutex_unlock(&mutex); 2696 mutex_unlock(&mutex);
2697
2688 return ret; 2698 return ret;
2689} 2699}
2690 2700
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index fb5fc458547f..c3deaee7a7a2 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1,39 +1,73 @@
1/* SPDX-License-Identifier: GPL-2.0 */ 1/* SPDX-License-Identifier: GPL-2.0 */
2 2/*
3 * Scheduler internal types and methods:
4 */
3#include <linux/sched.h> 5#include <linux/sched.h>
6
4#include <linux/sched/autogroup.h> 7#include <linux/sched/autogroup.h>
5#include <linux/sched/sysctl.h>
6#include <linux/sched/topology.h>
7#include <linux/sched/rt.h>
8#include <linux/sched/deadline.h>
9#include <linux/sched/clock.h> 8#include <linux/sched/clock.h>
10#include <linux/sched/wake_q.h> 9#include <linux/sched/coredump.h>
11#include <linux/sched/signal.h>
12#include <linux/sched/numa_balancing.h>
13#include <linux/sched/mm.h>
14#include <linux/sched/cpufreq.h> 10#include <linux/sched/cpufreq.h>
15#include <linux/sched/stat.h> 11#include <linux/sched/cputime.h>
16#include <linux/sched/nohz.h> 12#include <linux/sched/deadline.h>
17#include <linux/sched/debug.h> 13#include <linux/sched/debug.h>
18#include <linux/sched/hotplug.h> 14#include <linux/sched/hotplug.h>
15#include <linux/sched/idle.h>
16#include <linux/sched/init.h>
17#include <linux/sched/isolation.h>
18#include <linux/sched/jobctl.h>
19#include <linux/sched/loadavg.h>
20#include <linux/sched/mm.h>
21#include <linux/sched/nohz.h>
22#include <linux/sched/numa_balancing.h>
23#include <linux/sched/prio.h>
24#include <linux/sched/rt.h>
25#include <linux/sched/signal.h>
26#include <linux/sched/stat.h>
27#include <linux/sched/sysctl.h>
19#include <linux/sched/task.h> 28#include <linux/sched/task.h>
20#include <linux/sched/task_stack.h> 29#include <linux/sched/task_stack.h>
21#include <linux/sched/cputime.h> 30#include <linux/sched/topology.h>
22#include <linux/sched/init.h> 31#include <linux/sched/user.h>
32#include <linux/sched/wake_q.h>
33#include <linux/sched/xacct.h>
34
35#include <uapi/linux/sched/types.h>
23 36
24#include <linux/u64_stats_sync.h>
25#include <linux/kernel_stat.h>
26#include <linux/binfmts.h> 37#include <linux/binfmts.h>
27#include <linux/mutex.h> 38#include <linux/blkdev.h>
28#include <linux/spinlock.h> 39#include <linux/compat.h>
40#include <linux/context_tracking.h>
41#include <linux/cpufreq.h>
42#include <linux/cpuidle.h>
43#include <linux/cpuset.h>
44#include <linux/ctype.h>
45#include <linux/debugfs.h>
46#include <linux/delayacct.h>
47#include <linux/init_task.h>
48#include <linux/kprobes.h>
49#include <linux/kthread.h>
50#include <linux/membarrier.h>
51#include <linux/migrate.h>
52#include <linux/mmu_context.h>
53#include <linux/nmi.h>
54#include <linux/proc_fs.h>
55#include <linux/prefetch.h>
56#include <linux/profile.h>
57#include <linux/rcupdate_wait.h>
58#include <linux/security.h>
59#include <linux/stackprotector.h>
29#include <linux/stop_machine.h> 60#include <linux/stop_machine.h>
30#include <linux/irq_work.h> 61#include <linux/suspend.h>
31#include <linux/tick.h> 62#include <linux/swait.h>
32#include <linux/slab.h> 63#include <linux/syscalls.h>
33#include <linux/cgroup.h> 64#include <linux/task_work.h>
65#include <linux/tsacct_kern.h>
66
67#include <asm/tlb.h>
34 68
35#ifdef CONFIG_PARAVIRT 69#ifdef CONFIG_PARAVIRT
36#include <asm/paravirt.h> 70# include <asm/paravirt.h>
37#endif 71#endif
38 72
39#include "cpupri.h" 73#include "cpupri.h"
@@ -79,11 +113,11 @@ static inline void cpu_load_update_active(struct rq *this_rq) { }
79 * and does not change the user-interface for setting shares/weights. 113 * and does not change the user-interface for setting shares/weights.
80 * 114 *
81 * We increase resolution only if we have enough bits to allow this increased 115 * We increase resolution only if we have enough bits to allow this increased
82 * resolution (i.e. 64bit). The costs for increasing resolution when 32bit are 116 * resolution (i.e. 64-bit). The costs for increasing resolution when 32-bit
83 * pretty high and the returns do not justify the increased costs. 117 * are pretty high and the returns do not justify the increased costs.
84 * 118 *
85 * Really only required when CONFIG_FAIR_GROUP_SCHED is also set, but to 119 * Really only required when CONFIG_FAIR_GROUP_SCHED=y is also set, but to
86 * increase coverage and consistency always enable it on 64bit platforms. 120 * increase coverage and consistency always enable it on 64-bit platforms.
87 */ 121 */
88#ifdef CONFIG_64BIT 122#ifdef CONFIG_64BIT
89# define NICE_0_LOAD_SHIFT (SCHED_FIXEDPOINT_SHIFT + SCHED_FIXEDPOINT_SHIFT) 123# define NICE_0_LOAD_SHIFT (SCHED_FIXEDPOINT_SHIFT + SCHED_FIXEDPOINT_SHIFT)
@@ -111,16 +145,12 @@ static inline void cpu_load_update_active(struct rq *this_rq) { }
111 * 10 -> just above 1us 145 * 10 -> just above 1us
112 * 9 -> just above 0.5us 146 * 9 -> just above 0.5us
113 */ 147 */
114#define DL_SCALE (10) 148#define DL_SCALE 10
115 149
116/* 150/*
117 * These are the 'tuning knobs' of the scheduler: 151 * Single value that denotes runtime == period, ie unlimited time.
118 */ 152 */
119 153#define RUNTIME_INF ((u64)~0ULL)
120/*
121 * single value that denotes runtime == period, ie unlimited time.
122 */
123#define RUNTIME_INF ((u64)~0ULL)
124 154
125static inline int idle_policy(int policy) 155static inline int idle_policy(int policy)
126{ 156{
@@ -235,9 +265,9 @@ void __dl_clear_params(struct task_struct *p);
235 * control. 265 * control.
236 */ 266 */
237struct dl_bandwidth { 267struct dl_bandwidth {
238 raw_spinlock_t dl_runtime_lock; 268 raw_spinlock_t dl_runtime_lock;
239 u64 dl_runtime; 269 u64 dl_runtime;
240 u64 dl_period; 270 u64 dl_period;
241}; 271};
242 272
243static inline int dl_bandwidth_enabled(void) 273static inline int dl_bandwidth_enabled(void)
@@ -246,8 +276,9 @@ static inline int dl_bandwidth_enabled(void)
246} 276}
247 277
248struct dl_bw { 278struct dl_bw {
249 raw_spinlock_t lock; 279 raw_spinlock_t lock;
250 u64 bw, total_bw; 280 u64 bw;
281 u64 total_bw;
251}; 282};
252 283
253static inline void __dl_update(struct dl_bw *dl_b, s64 bw); 284static inline void __dl_update(struct dl_bw *dl_b, s64 bw);
@@ -273,20 +304,17 @@ bool __dl_overflow(struct dl_bw *dl_b, int cpus, u64 old_bw, u64 new_bw)
273 dl_b->bw * cpus < dl_b->total_bw - old_bw + new_bw; 304 dl_b->bw * cpus < dl_b->total_bw - old_bw + new_bw;
274} 305}
275 306
276void dl_change_utilization(struct task_struct *p, u64 new_bw); 307extern void dl_change_utilization(struct task_struct *p, u64 new_bw);
277extern void init_dl_bw(struct dl_bw *dl_b); 308extern void init_dl_bw(struct dl_bw *dl_b);
278extern int sched_dl_global_validate(void); 309extern int sched_dl_global_validate(void);
279extern void sched_dl_do_global(void); 310extern void sched_dl_do_global(void);
280extern int sched_dl_overflow(struct task_struct *p, int policy, 311extern int sched_dl_overflow(struct task_struct *p, int policy, const struct sched_attr *attr);
281 const struct sched_attr *attr);
282extern void __setparam_dl(struct task_struct *p, const struct sched_attr *attr); 312extern void __setparam_dl(struct task_struct *p, const struct sched_attr *attr);
283extern void __getparam_dl(struct task_struct *p, struct sched_attr *attr); 313extern void __getparam_dl(struct task_struct *p, struct sched_attr *attr);
284extern bool __checkparam_dl(const struct sched_attr *attr); 314extern bool __checkparam_dl(const struct sched_attr *attr);
285extern bool dl_param_changed(struct task_struct *p, const struct sched_attr *attr); 315extern bool dl_param_changed(struct task_struct *p, const struct sched_attr *attr);
286extern int dl_task_can_attach(struct task_struct *p, 316extern int dl_task_can_attach(struct task_struct *p, const struct cpumask *cs_cpus_allowed);
287 const struct cpumask *cs_cpus_allowed); 317extern int dl_cpuset_cpumask_can_shrink(const struct cpumask *cur, const struct cpumask *trial);
288extern int dl_cpuset_cpumask_can_shrink(const struct cpumask *cur,
289 const struct cpumask *trial);
290extern bool dl_cpu_busy(unsigned int cpu); 318extern bool dl_cpu_busy(unsigned int cpu);
291 319
292#ifdef CONFIG_CGROUP_SCHED 320#ifdef CONFIG_CGROUP_SCHED
@@ -300,32 +328,36 @@ extern struct list_head task_groups;
300 328
301struct cfs_bandwidth { 329struct cfs_bandwidth {
302#ifdef CONFIG_CFS_BANDWIDTH 330#ifdef CONFIG_CFS_BANDWIDTH
303 raw_spinlock_t lock; 331 raw_spinlock_t lock;
304 ktime_t period; 332 ktime_t period;
305 u64 quota, runtime; 333 u64 quota;
306 s64 hierarchical_quota; 334 u64 runtime;
307 u64 runtime_expires; 335 s64 hierarchical_quota;
308 336 u64 runtime_expires;
309 int idle, period_active; 337
310 struct hrtimer period_timer, slack_timer; 338 int idle;
311 struct list_head throttled_cfs_rq; 339 int period_active;
312 340 struct hrtimer period_timer;
313 /* statistics */ 341 struct hrtimer slack_timer;
314 int nr_periods, nr_throttled; 342 struct list_head throttled_cfs_rq;
315 u64 throttled_time; 343
344 /* Statistics: */
345 int nr_periods;
346 int nr_throttled;
347 u64 throttled_time;
316#endif 348#endif
317}; 349};
318 350
319/* task group related information */ 351/* Task group related information */
320struct task_group { 352struct task_group {
321 struct cgroup_subsys_state css; 353 struct cgroup_subsys_state css;
322 354
323#ifdef CONFIG_FAIR_GROUP_SCHED 355#ifdef CONFIG_FAIR_GROUP_SCHED
324 /* schedulable entities of this group on each cpu */ 356 /* schedulable entities of this group on each CPU */
325 struct sched_entity **se; 357 struct sched_entity **se;
326 /* runqueue "owned" by this group on each cpu */ 358 /* runqueue "owned" by this group on each CPU */
327 struct cfs_rq **cfs_rq; 359 struct cfs_rq **cfs_rq;
328 unsigned long shares; 360 unsigned long shares;
329 361
330#ifdef CONFIG_SMP 362#ifdef CONFIG_SMP
331 /* 363 /*
@@ -333,29 +365,29 @@ struct task_group {
333 * it in its own cacheline separated from the fields above which 365 * it in its own cacheline separated from the fields above which
334 * will also be accessed at each tick. 366 * will also be accessed at each tick.
335 */ 367 */
336 atomic_long_t load_avg ____cacheline_aligned; 368 atomic_long_t load_avg ____cacheline_aligned;
337#endif 369#endif
338#endif 370#endif
339 371
340#ifdef CONFIG_RT_GROUP_SCHED 372#ifdef CONFIG_RT_GROUP_SCHED
341 struct sched_rt_entity **rt_se; 373 struct sched_rt_entity **rt_se;
342 struct rt_rq **rt_rq; 374 struct rt_rq **rt_rq;
343 375
344 struct rt_bandwidth rt_bandwidth; 376 struct rt_bandwidth rt_bandwidth;
345#endif 377#endif
346 378
347 struct rcu_head rcu; 379 struct rcu_head rcu;
348 struct list_head list; 380 struct list_head list;
349 381
350 struct task_group *parent; 382 struct task_group *parent;
351 struct list_head siblings; 383 struct list_head siblings;
352 struct list_head children; 384 struct list_head children;
353 385
354#ifdef CONFIG_SCHED_AUTOGROUP 386#ifdef CONFIG_SCHED_AUTOGROUP
355 struct autogroup *autogroup; 387 struct autogroup *autogroup;
356#endif 388#endif
357 389
358 struct cfs_bandwidth cfs_bandwidth; 390 struct cfs_bandwidth cfs_bandwidth;
359}; 391};
360 392
361#ifdef CONFIG_FAIR_GROUP_SCHED 393#ifdef CONFIG_FAIR_GROUP_SCHED
@@ -369,8 +401,8 @@ struct task_group {
369 * (The default weight is 1024 - so there's no practical 401 * (The default weight is 1024 - so there's no practical
370 * limitation from this.) 402 * limitation from this.)
371 */ 403 */
372#define MIN_SHARES (1UL << 1) 404#define MIN_SHARES (1UL << 1)
373#define MAX_SHARES (1UL << 18) 405#define MAX_SHARES (1UL << 18)
374#endif 406#endif
375 407
376typedef int (*tg_visitor)(struct task_group *, void *); 408typedef int (*tg_visitor)(struct task_group *, void *);
@@ -443,35 +475,39 @@ struct cfs_bandwidth { };
443 475
444/* CFS-related fields in a runqueue */ 476/* CFS-related fields in a runqueue */
445struct cfs_rq { 477struct cfs_rq {
446 struct load_weight load; 478 struct load_weight load;
447 unsigned long runnable_weight; 479 unsigned long runnable_weight;
448 unsigned int nr_running, h_nr_running; 480 unsigned int nr_running;
481 unsigned int h_nr_running;
449 482
450 u64 exec_clock; 483 u64 exec_clock;
451 u64 min_vruntime; 484 u64 min_vruntime;
452#ifndef CONFIG_64BIT 485#ifndef CONFIG_64BIT
453 u64 min_vruntime_copy; 486 u64 min_vruntime_copy;
454#endif 487#endif
455 488
456 struct rb_root_cached tasks_timeline; 489 struct rb_root_cached tasks_timeline;
457 490
458 /* 491 /*
459 * 'curr' points to currently running entity on this cfs_rq. 492 * 'curr' points to currently running entity on this cfs_rq.
460 * It is set to NULL otherwise (i.e when none are currently running). 493 * It is set to NULL otherwise (i.e when none are currently running).
461 */ 494 */
462 struct sched_entity *curr, *next, *last, *skip; 495 struct sched_entity *curr;
496 struct sched_entity *next;
497 struct sched_entity *last;
498 struct sched_entity *skip;
463 499
464#ifdef CONFIG_SCHED_DEBUG 500#ifdef CONFIG_SCHED_DEBUG
465 unsigned int nr_spread_over; 501 unsigned int nr_spread_over;
466#endif 502#endif
467 503
468#ifdef CONFIG_SMP 504#ifdef CONFIG_SMP
469 /* 505 /*
470 * CFS load tracking 506 * CFS load tracking
471 */ 507 */
472 struct sched_avg avg; 508 struct sched_avg avg;
473#ifndef CONFIG_64BIT 509#ifndef CONFIG_64BIT
474 u64 load_last_update_time_copy; 510 u64 load_last_update_time_copy;
475#endif 511#endif
476 struct { 512 struct {
477 raw_spinlock_t lock ____cacheline_aligned; 513 raw_spinlock_t lock ____cacheline_aligned;
@@ -482,9 +518,9 @@ struct cfs_rq {
482 } removed; 518 } removed;
483 519
484#ifdef CONFIG_FAIR_GROUP_SCHED 520#ifdef CONFIG_FAIR_GROUP_SCHED
485 unsigned long tg_load_avg_contrib; 521 unsigned long tg_load_avg_contrib;
486 long propagate; 522 long propagate;
487 long prop_runnable_sum; 523 long prop_runnable_sum;
488 524
489 /* 525 /*
490 * h_load = weight * f(tg) 526 * h_load = weight * f(tg)
@@ -492,36 +528,38 @@ struct cfs_rq {
492 * Where f(tg) is the recursive weight fraction assigned to 528 * Where f(tg) is the recursive weight fraction assigned to
493 * this group. 529 * this group.
494 */ 530 */
495 unsigned long h_load; 531 unsigned long h_load;
496 u64 last_h_load_update; 532 u64 last_h_load_update;
497 struct sched_entity *h_load_next; 533 struct sched_entity *h_load_next;
498#endif /* CONFIG_FAIR_GROUP_SCHED */ 534#endif /* CONFIG_FAIR_GROUP_SCHED */
499#endif /* CONFIG_SMP */ 535#endif /* CONFIG_SMP */
500 536
501#ifdef CONFIG_FAIR_GROUP_SCHED 537#ifdef CONFIG_FAIR_GROUP_SCHED
502 struct rq *rq; /* cpu runqueue to which this cfs_rq is attached */ 538 struct rq *rq; /* CPU runqueue to which this cfs_rq is attached */
503 539
504 /* 540 /*
505 * leaf cfs_rqs are those that hold tasks (lowest schedulable entity in 541 * leaf cfs_rqs are those that hold tasks (lowest schedulable entity in
506 * a hierarchy). Non-leaf lrqs hold other higher schedulable entities 542 * a hierarchy). Non-leaf lrqs hold other higher schedulable entities
507 * (like users, containers etc.) 543 * (like users, containers etc.)
508 * 544 *
509 * leaf_cfs_rq_list ties together list of leaf cfs_rq's in a cpu. This 545 * leaf_cfs_rq_list ties together list of leaf cfs_rq's in a CPU.
510 * list is used during load balance. 546 * This list is used during load balance.
511 */ 547 */
512 int on_list; 548 int on_list;
513 struct list_head leaf_cfs_rq_list; 549 struct list_head leaf_cfs_rq_list;
514 struct task_group *tg; /* group that "owns" this runqueue */ 550 struct task_group *tg; /* group that "owns" this runqueue */
515 551
516#ifdef CONFIG_CFS_BANDWIDTH 552#ifdef CONFIG_CFS_BANDWIDTH
517 int runtime_enabled; 553 int runtime_enabled;
518 u64 runtime_expires; 554 u64 runtime_expires;
519 s64 runtime_remaining; 555 s64 runtime_remaining;
520 556
521 u64 throttled_clock, throttled_clock_task; 557 u64 throttled_clock;
522 u64 throttled_clock_task_time; 558 u64 throttled_clock_task;
523 int throttled, throttle_count; 559 u64 throttled_clock_task_time;
524 struct list_head throttled_list; 560 int throttled;
561 int throttle_count;
562 struct list_head throttled_list;
525#endif /* CONFIG_CFS_BANDWIDTH */ 563#endif /* CONFIG_CFS_BANDWIDTH */
526#endif /* CONFIG_FAIR_GROUP_SCHED */ 564#endif /* CONFIG_FAIR_GROUP_SCHED */
527}; 565};
@@ -538,45 +576,45 @@ static inline int rt_bandwidth_enabled(void)
538 576
539/* Real-Time classes' related field in a runqueue: */ 577/* Real-Time classes' related field in a runqueue: */
540struct rt_rq { 578struct rt_rq {
541 struct rt_prio_array active; 579 struct rt_prio_array active;
542 unsigned int rt_nr_running; 580 unsigned int rt_nr_running;
543 unsigned int rr_nr_running; 581 unsigned int rr_nr_running;
544#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED 582#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
545 struct { 583 struct {
546 int curr; /* highest queued rt task prio */ 584 int curr; /* highest queued rt task prio */
547#ifdef CONFIG_SMP 585#ifdef CONFIG_SMP
548 int next; /* next highest */ 586 int next; /* next highest */
549#endif 587#endif
550 } highest_prio; 588 } highest_prio;
551#endif 589#endif
552#ifdef CONFIG_SMP 590#ifdef CONFIG_SMP
553 unsigned long rt_nr_migratory; 591 unsigned long rt_nr_migratory;
554 unsigned long rt_nr_total; 592 unsigned long rt_nr_total;
555 int overloaded; 593 int overloaded;
556 struct plist_head pushable_tasks; 594 struct plist_head pushable_tasks;
557#endif /* CONFIG_SMP */ 595#endif /* CONFIG_SMP */
558 int rt_queued; 596 int rt_queued;
559 597
560 int rt_throttled; 598 int rt_throttled;
561 u64 rt_time; 599 u64 rt_time;
562 u64 rt_runtime; 600 u64 rt_runtime;
563 /* Nests inside the rq lock: */ 601 /* Nests inside the rq lock: */
564 raw_spinlock_t rt_runtime_lock; 602 raw_spinlock_t rt_runtime_lock;
565 603
566#ifdef CONFIG_RT_GROUP_SCHED 604#ifdef CONFIG_RT_GROUP_SCHED
567 unsigned long rt_nr_boosted; 605 unsigned long rt_nr_boosted;
568 606
569 struct rq *rq; 607 struct rq *rq;
570 struct task_group *tg; 608 struct task_group *tg;
571#endif 609#endif
572}; 610};
573 611
574/* Deadline class' related fields in a runqueue */ 612/* Deadline class' related fields in a runqueue */
575struct dl_rq { 613struct dl_rq {
576 /* runqueue is an rbtree, ordered by deadline */ 614 /* runqueue is an rbtree, ordered by deadline */
577 struct rb_root_cached root; 615 struct rb_root_cached root;
578 616
579 unsigned long dl_nr_running; 617 unsigned long dl_nr_running;
580 618
581#ifdef CONFIG_SMP 619#ifdef CONFIG_SMP
582 /* 620 /*
@@ -586,28 +624,28 @@ struct dl_rq {
586 * should migrate somewhere else. 624 * should migrate somewhere else.
587 */ 625 */
588 struct { 626 struct {
589 u64 curr; 627 u64 curr;
590 u64 next; 628 u64 next;
591 } earliest_dl; 629 } earliest_dl;
592 630
593 unsigned long dl_nr_migratory; 631 unsigned long dl_nr_migratory;
594 int overloaded; 632 int overloaded;
595 633
596 /* 634 /*
597 * Tasks on this rq that can be pushed away. They are kept in 635 * Tasks on this rq that can be pushed away. They are kept in
598 * an rb-tree, ordered by tasks' deadlines, with caching 636 * an rb-tree, ordered by tasks' deadlines, with caching
599 * of the leftmost (earliest deadline) element. 637 * of the leftmost (earliest deadline) element.
600 */ 638 */
601 struct rb_root_cached pushable_dl_tasks_root; 639 struct rb_root_cached pushable_dl_tasks_root;
602#else 640#else
603 struct dl_bw dl_bw; 641 struct dl_bw dl_bw;
604#endif 642#endif
605 /* 643 /*
606 * "Active utilization" for this runqueue: increased when a 644 * "Active utilization" for this runqueue: increased when a
607 * task wakes up (becomes TASK_RUNNING) and decreased when a 645 * task wakes up (becomes TASK_RUNNING) and decreased when a
608 * task blocks 646 * task blocks
609 */ 647 */
610 u64 running_bw; 648 u64 running_bw;
611 649
612 /* 650 /*
613 * Utilization of the tasks "assigned" to this runqueue (including 651 * Utilization of the tasks "assigned" to this runqueue (including
@@ -618,14 +656,14 @@ struct dl_rq {
618 * This is needed to compute the "inactive utilization" for the 656 * This is needed to compute the "inactive utilization" for the
619 * runqueue (inactive utilization = this_bw - running_bw). 657 * runqueue (inactive utilization = this_bw - running_bw).
620 */ 658 */
621 u64 this_bw; 659 u64 this_bw;
622 u64 extra_bw; 660 u64 extra_bw;
623 661
624 /* 662 /*
625 * Inverse of the fraction of CPU utilization that can be reclaimed 663 * Inverse of the fraction of CPU utilization that can be reclaimed
626 * by the GRUB algorithm. 664 * by the GRUB algorithm.
627 */ 665 */
628 u64 bw_ratio; 666 u64 bw_ratio;
629}; 667};
630 668
631#ifdef CONFIG_SMP 669#ifdef CONFIG_SMP
@@ -638,51 +676,51 @@ static inline bool sched_asym_prefer(int a, int b)
638/* 676/*
639 * We add the notion of a root-domain which will be used to define per-domain 677 * We add the notion of a root-domain which will be used to define per-domain
640 * variables. Each exclusive cpuset essentially defines an island domain by 678 * variables. Each exclusive cpuset essentially defines an island domain by
641 * fully partitioning the member cpus from any other cpuset. Whenever a new 679 * fully partitioning the member CPUs from any other cpuset. Whenever a new
642 * exclusive cpuset is created, we also create and attach a new root-domain 680 * exclusive cpuset is created, we also create and attach a new root-domain
643 * object. 681 * object.
644 * 682 *
645 */ 683 */
646struct root_domain { 684struct root_domain {
647 atomic_t refcount; 685 atomic_t refcount;
648 atomic_t rto_count; 686 atomic_t rto_count;
649 struct rcu_head rcu; 687 struct rcu_head rcu;
650 cpumask_var_t span; 688 cpumask_var_t span;
651 cpumask_var_t online; 689 cpumask_var_t online;
652 690
653 /* Indicate more than one runnable task for any CPU */ 691 /* Indicate more than one runnable task for any CPU */
654 bool overload; 692 bool overload;
655 693
656 /* 694 /*
657 * The bit corresponding to a CPU gets set here if such CPU has more 695 * The bit corresponding to a CPU gets set here if such CPU has more
658 * than one runnable -deadline task (as it is below for RT tasks). 696 * than one runnable -deadline task (as it is below for RT tasks).
659 */ 697 */
660 cpumask_var_t dlo_mask; 698 cpumask_var_t dlo_mask;
661 atomic_t dlo_count; 699 atomic_t dlo_count;
662 struct dl_bw dl_bw; 700 struct dl_bw dl_bw;
663 struct cpudl cpudl; 701 struct cpudl cpudl;
664 702
665#ifdef HAVE_RT_PUSH_IPI 703#ifdef HAVE_RT_PUSH_IPI
666 /* 704 /*
667 * For IPI pull requests, loop across the rto_mask. 705 * For IPI pull requests, loop across the rto_mask.
668 */ 706 */
669 struct irq_work rto_push_work; 707 struct irq_work rto_push_work;
670 raw_spinlock_t rto_lock; 708 raw_spinlock_t rto_lock;
671 /* These are only updated and read within rto_lock */ 709 /* These are only updated and read within rto_lock */
672 int rto_loop; 710 int rto_loop;
673 int rto_cpu; 711 int rto_cpu;
674 /* These atomics are updated outside of a lock */ 712 /* These atomics are updated outside of a lock */
675 atomic_t rto_loop_next; 713 atomic_t rto_loop_next;
676 atomic_t rto_loop_start; 714 atomic_t rto_loop_start;
677#endif 715#endif
678 /* 716 /*
679 * The "RT overload" flag: it gets set if a CPU has more than 717 * The "RT overload" flag: it gets set if a CPU has more than
680 * one runnable RT task. 718 * one runnable RT task.
681 */ 719 */
682 cpumask_var_t rto_mask; 720 cpumask_var_t rto_mask;
683 struct cpupri cpupri; 721 struct cpupri cpupri;
684 722
685 unsigned long max_cpu_capacity; 723 unsigned long max_cpu_capacity;
686}; 724};
687 725
688extern struct root_domain def_root_domain; 726extern struct root_domain def_root_domain;
@@ -708,41 +746,42 @@ extern void rto_push_irq_work_func(struct irq_work *work);
708 */ 746 */
709struct rq { 747struct rq {
710 /* runqueue lock: */ 748 /* runqueue lock: */
711 raw_spinlock_t lock; 749 raw_spinlock_t lock;
712 750
713 /* 751 /*
714 * nr_running and cpu_load should be in the same cacheline because 752 * nr_running and cpu_load should be in the same cacheline because
715 * remote CPUs use both these fields when doing load calculation. 753 * remote CPUs use both these fields when doing load calculation.
716 */ 754 */
717 unsigned int nr_running; 755 unsigned int nr_running;
718#ifdef CONFIG_NUMA_BALANCING 756#ifdef CONFIG_NUMA_BALANCING
719 unsigned int nr_numa_running; 757 unsigned int nr_numa_running;
720 unsigned int nr_preferred_running; 758 unsigned int nr_preferred_running;
721#endif 759#endif
722 #define CPU_LOAD_IDX_MAX 5 760 #define CPU_LOAD_IDX_MAX 5
723 unsigned long cpu_load[CPU_LOAD_IDX_MAX]; 761 unsigned long cpu_load[CPU_LOAD_IDX_MAX];
724#ifdef CONFIG_NO_HZ_COMMON 762#ifdef CONFIG_NO_HZ_COMMON
725#ifdef CONFIG_SMP 763#ifdef CONFIG_SMP
726 unsigned long last_load_update_tick; 764 unsigned long last_load_update_tick;
765 unsigned long last_blocked_load_update_tick;
766 unsigned int has_blocked_load;
727#endif /* CONFIG_SMP */ 767#endif /* CONFIG_SMP */
728 unsigned long nohz_flags; 768 unsigned int nohz_tick_stopped;
769 atomic_t nohz_flags;
729#endif /* CONFIG_NO_HZ_COMMON */ 770#endif /* CONFIG_NO_HZ_COMMON */
730#ifdef CONFIG_NO_HZ_FULL
731 unsigned long last_sched_tick;
732#endif
733 /* capture load from *all* tasks on this cpu: */
734 struct load_weight load;
735 unsigned long nr_load_updates;
736 u64 nr_switches;
737 771
738 struct cfs_rq cfs; 772 /* capture load from *all* tasks on this CPU: */
739 struct rt_rq rt; 773 struct load_weight load;
740 struct dl_rq dl; 774 unsigned long nr_load_updates;
775 u64 nr_switches;
776
777 struct cfs_rq cfs;
778 struct rt_rq rt;
779 struct dl_rq dl;
741 780
742#ifdef CONFIG_FAIR_GROUP_SCHED 781#ifdef CONFIG_FAIR_GROUP_SCHED
743 /* list of leaf cfs_rq on this cpu: */ 782 /* list of leaf cfs_rq on this CPU: */
744 struct list_head leaf_cfs_rq_list; 783 struct list_head leaf_cfs_rq_list;
745 struct list_head *tmp_alone_branch; 784 struct list_head *tmp_alone_branch;
746#endif /* CONFIG_FAIR_GROUP_SCHED */ 785#endif /* CONFIG_FAIR_GROUP_SCHED */
747 786
748 /* 787 /*
@@ -751,94 +790,98 @@ struct rq {
751 * one CPU and if it got migrated afterwards it may decrease 790 * one CPU and if it got migrated afterwards it may decrease
752 * it on another CPU. Always updated under the runqueue lock: 791 * it on another CPU. Always updated under the runqueue lock:
753 */ 792 */
754 unsigned long nr_uninterruptible; 793 unsigned long nr_uninterruptible;
755 794
756 struct task_struct *curr, *idle, *stop; 795 struct task_struct *curr;
757 unsigned long next_balance; 796 struct task_struct *idle;
758 struct mm_struct *prev_mm; 797 struct task_struct *stop;
798 unsigned long next_balance;
799 struct mm_struct *prev_mm;
759 800
760 unsigned int clock_update_flags; 801 unsigned int clock_update_flags;
761 u64 clock; 802 u64 clock;
762 u64 clock_task; 803 u64 clock_task;
763 804
764 atomic_t nr_iowait; 805 atomic_t nr_iowait;
765 806
766#ifdef CONFIG_SMP 807#ifdef CONFIG_SMP
767 struct root_domain *rd; 808 struct root_domain *rd;
768 struct sched_domain *sd; 809 struct sched_domain *sd;
769 810
770 unsigned long cpu_capacity; 811 unsigned long cpu_capacity;
771 unsigned long cpu_capacity_orig; 812 unsigned long cpu_capacity_orig;
772 813
773 struct callback_head *balance_callback; 814 struct callback_head *balance_callback;
815
816 unsigned char idle_balance;
774 817
775 unsigned char idle_balance;
776 /* For active balancing */ 818 /* For active balancing */
777 int active_balance; 819 int active_balance;
778 int push_cpu; 820 int push_cpu;
779 struct cpu_stop_work active_balance_work; 821 struct cpu_stop_work active_balance_work;
780 /* cpu of this runqueue: */ 822
781 int cpu; 823 /* CPU of this runqueue: */
782 int online; 824 int cpu;
825 int online;
783 826
784 struct list_head cfs_tasks; 827 struct list_head cfs_tasks;
785 828
786 u64 rt_avg; 829 u64 rt_avg;
787 u64 age_stamp; 830 u64 age_stamp;
788 u64 idle_stamp; 831 u64 idle_stamp;
789 u64 avg_idle; 832 u64 avg_idle;
790 833
791 /* This is used to determine avg_idle's max value */ 834 /* This is used to determine avg_idle's max value */
792 u64 max_idle_balance_cost; 835 u64 max_idle_balance_cost;
793#endif 836#endif
794 837
795#ifdef CONFIG_IRQ_TIME_ACCOUNTING 838#ifdef CONFIG_IRQ_TIME_ACCOUNTING
796 u64 prev_irq_time; 839 u64 prev_irq_time;
797#endif 840#endif
798#ifdef CONFIG_PARAVIRT 841#ifdef CONFIG_PARAVIRT
799 u64 prev_steal_time; 842 u64 prev_steal_time;
800#endif 843#endif
801#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING 844#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
802 u64 prev_steal_time_rq; 845 u64 prev_steal_time_rq;
803#endif 846#endif
804 847
805 /* calc_load related fields */ 848 /* calc_load related fields */
806 unsigned long calc_load_update; 849 unsigned long calc_load_update;
807 long calc_load_active; 850 long calc_load_active;
808 851
809#ifdef CONFIG_SCHED_HRTICK 852#ifdef CONFIG_SCHED_HRTICK
810#ifdef CONFIG_SMP 853#ifdef CONFIG_SMP
811 int hrtick_csd_pending; 854 int hrtick_csd_pending;
812 call_single_data_t hrtick_csd; 855 call_single_data_t hrtick_csd;
813#endif 856#endif
814 struct hrtimer hrtick_timer; 857 struct hrtimer hrtick_timer;
815#endif 858#endif
816 859
817#ifdef CONFIG_SCHEDSTATS 860#ifdef CONFIG_SCHEDSTATS
818 /* latency stats */ 861 /* latency stats */
819 struct sched_info rq_sched_info; 862 struct sched_info rq_sched_info;
820 unsigned long long rq_cpu_time; 863 unsigned long long rq_cpu_time;
821 /* could above be rq->cfs_rq.exec_clock + rq->rt_rq.rt_runtime ? */ 864 /* could above be rq->cfs_rq.exec_clock + rq->rt_rq.rt_runtime ? */
822 865
823 /* sys_sched_yield() stats */ 866 /* sys_sched_yield() stats */
824 unsigned int yld_count; 867 unsigned int yld_count;
825 868
826 /* schedule() stats */ 869 /* schedule() stats */
827 unsigned int sched_count; 870 unsigned int sched_count;
828 unsigned int sched_goidle; 871 unsigned int sched_goidle;
829 872
830 /* try_to_wake_up() stats */ 873 /* try_to_wake_up() stats */
831 unsigned int ttwu_count; 874 unsigned int ttwu_count;
832 unsigned int ttwu_local; 875 unsigned int ttwu_local;
833#endif 876#endif
834 877
835#ifdef CONFIG_SMP 878#ifdef CONFIG_SMP
836 struct llist_head wake_list; 879 struct llist_head wake_list;
837#endif 880#endif
838 881
839#ifdef CONFIG_CPU_IDLE 882#ifdef CONFIG_CPU_IDLE
840 /* Must be inspected within a rcu lock section */ 883 /* Must be inspected within a rcu lock section */
841 struct cpuidle_state *idle_state; 884 struct cpuidle_state *idle_state;
842#endif 885#endif
843}; 886};
844 887
@@ -904,9 +947,9 @@ static inline u64 __rq_clock_broken(struct rq *rq)
904 * one position though, because the next rq_unpin_lock() will shift it 947 * one position though, because the next rq_unpin_lock() will shift it
905 * back. 948 * back.
906 */ 949 */
907#define RQCF_REQ_SKIP 0x01 950#define RQCF_REQ_SKIP 0x01
908#define RQCF_ACT_SKIP 0x02 951#define RQCF_ACT_SKIP 0x02
909#define RQCF_UPDATED 0x04 952#define RQCF_UPDATED 0x04
910 953
911static inline void assert_clock_updated(struct rq *rq) 954static inline void assert_clock_updated(struct rq *rq)
912{ 955{
@@ -1059,12 +1102,12 @@ extern void sched_ttwu_pending(void);
1059 1102
1060/** 1103/**
1061 * highest_flag_domain - Return highest sched_domain containing flag. 1104 * highest_flag_domain - Return highest sched_domain containing flag.
1062 * @cpu: The cpu whose highest level of sched domain is to 1105 * @cpu: The CPU whose highest level of sched domain is to
1063 * be returned. 1106 * be returned.
1064 * @flag: The flag to check for the highest sched_domain 1107 * @flag: The flag to check for the highest sched_domain
1065 * for the given cpu. 1108 * for the given CPU.
1066 * 1109 *
1067 * Returns the highest sched_domain of a cpu which contains the given flag. 1110 * Returns the highest sched_domain of a CPU which contains the given flag.
1068 */ 1111 */
1069static inline struct sched_domain *highest_flag_domain(int cpu, int flag) 1112static inline struct sched_domain *highest_flag_domain(int cpu, int flag)
1070{ 1113{
@@ -1099,30 +1142,30 @@ DECLARE_PER_CPU(struct sched_domain *, sd_numa);
1099DECLARE_PER_CPU(struct sched_domain *, sd_asym); 1142DECLARE_PER_CPU(struct sched_domain *, sd_asym);
1100 1143
1101struct sched_group_capacity { 1144struct sched_group_capacity {
1102 atomic_t ref; 1145 atomic_t ref;
1103 /* 1146 /*
1104 * CPU capacity of this group, SCHED_CAPACITY_SCALE being max capacity 1147 * CPU capacity of this group, SCHED_CAPACITY_SCALE being max capacity
1105 * for a single CPU. 1148 * for a single CPU.
1106 */ 1149 */
1107 unsigned long capacity; 1150 unsigned long capacity;
1108 unsigned long min_capacity; /* Min per-CPU capacity in group */ 1151 unsigned long min_capacity; /* Min per-CPU capacity in group */
1109 unsigned long next_update; 1152 unsigned long next_update;
1110 int imbalance; /* XXX unrelated to capacity but shared group state */ 1153 int imbalance; /* XXX unrelated to capacity but shared group state */
1111 1154
1112#ifdef CONFIG_SCHED_DEBUG 1155#ifdef CONFIG_SCHED_DEBUG
1113 int id; 1156 int id;
1114#endif 1157#endif
1115 1158
1116 unsigned long cpumask[0]; /* balance mask */ 1159 unsigned long cpumask[0]; /* Balance mask */
1117}; 1160};
1118 1161
1119struct sched_group { 1162struct sched_group {
1120 struct sched_group *next; /* Must be a circular list */ 1163 struct sched_group *next; /* Must be a circular list */
1121 atomic_t ref; 1164 atomic_t ref;
1122 1165
1123 unsigned int group_weight; 1166 unsigned int group_weight;
1124 struct sched_group_capacity *sgc; 1167 struct sched_group_capacity *sgc;
1125 int asym_prefer_cpu; /* cpu of highest priority in group */ 1168 int asym_prefer_cpu; /* CPU of highest priority in group */
1126 1169
1127 /* 1170 /*
1128 * The CPUs this group covers. 1171 * The CPUs this group covers.
@@ -1131,7 +1174,7 @@ struct sched_group {
1131 * by attaching extra space to the end of the structure, 1174 * by attaching extra space to the end of the structure,
1132 * depending on how many CPUs the kernel has booted up with) 1175 * depending on how many CPUs the kernel has booted up with)
1133 */ 1176 */
1134 unsigned long cpumask[0]; 1177 unsigned long cpumask[0];
1135}; 1178};
1136 1179
1137static inline struct cpumask *sched_group_span(struct sched_group *sg) 1180static inline struct cpumask *sched_group_span(struct sched_group *sg)
@@ -1148,8 +1191,8 @@ static inline struct cpumask *group_balance_mask(struct sched_group *sg)
1148} 1191}
1149 1192
1150/** 1193/**
1151 * group_first_cpu - Returns the first cpu in the cpumask of a sched_group. 1194 * group_first_cpu - Returns the first CPU in the cpumask of a sched_group.
1152 * @group: The group whose first cpu is to be returned. 1195 * @group: The group whose first CPU is to be returned.
1153 */ 1196 */
1154static inline unsigned int group_first_cpu(struct sched_group *group) 1197static inline unsigned int group_first_cpu(struct sched_group *group)
1155{ 1198{
@@ -1349,19 +1392,12 @@ static inline int task_on_rq_migrating(struct task_struct *p)
1349 return p->on_rq == TASK_ON_RQ_MIGRATING; 1392 return p->on_rq == TASK_ON_RQ_MIGRATING;
1350} 1393}
1351 1394
1352#ifndef prepare_arch_switch
1353# define prepare_arch_switch(next) do { } while (0)
1354#endif
1355#ifndef finish_arch_post_lock_switch
1356# define finish_arch_post_lock_switch() do { } while (0)
1357#endif
1358
1359/* 1395/*
1360 * wake flags 1396 * wake flags
1361 */ 1397 */
1362#define WF_SYNC 0x01 /* waker goes to sleep after wakeup */ 1398#define WF_SYNC 0x01 /* Waker goes to sleep after wakeup */
1363#define WF_FORK 0x02 /* child wakeup after fork */ 1399#define WF_FORK 0x02 /* Child wakeup after fork */
1364#define WF_MIGRATED 0x4 /* internal use, task got migrated */ 1400#define WF_MIGRATED 0x4 /* Internal use, task got migrated */
1365 1401
1366/* 1402/*
1367 * To aid in avoiding the subversion of "niceness" due to uneven distribution 1403 * To aid in avoiding the subversion of "niceness" due to uneven distribution
@@ -1372,11 +1408,11 @@ static inline int task_on_rq_migrating(struct task_struct *p)
1372 * slice expiry etc. 1408 * slice expiry etc.
1373 */ 1409 */
1374 1410
1375#define WEIGHT_IDLEPRIO 3 1411#define WEIGHT_IDLEPRIO 3
1376#define WMULT_IDLEPRIO 1431655765 1412#define WMULT_IDLEPRIO 1431655765
1377 1413
1378extern const int sched_prio_to_weight[40]; 1414extern const int sched_prio_to_weight[40];
1379extern const u32 sched_prio_to_wmult[40]; 1415extern const u32 sched_prio_to_wmult[40];
1380 1416
1381/* 1417/*
1382 * {de,en}queue flags: 1418 * {de,en}queue flags:
@@ -1398,9 +1434,9 @@ extern const u32 sched_prio_to_wmult[40];
1398 */ 1434 */
1399 1435
1400#define DEQUEUE_SLEEP 0x01 1436#define DEQUEUE_SLEEP 0x01
1401#define DEQUEUE_SAVE 0x02 /* matches ENQUEUE_RESTORE */ 1437#define DEQUEUE_SAVE 0x02 /* Matches ENQUEUE_RESTORE */
1402#define DEQUEUE_MOVE 0x04 /* matches ENQUEUE_MOVE */ 1438#define DEQUEUE_MOVE 0x04 /* Matches ENQUEUE_MOVE */
1403#define DEQUEUE_NOCLOCK 0x08 /* matches ENQUEUE_NOCLOCK */ 1439#define DEQUEUE_NOCLOCK 0x08 /* Matches ENQUEUE_NOCLOCK */
1404 1440
1405#define ENQUEUE_WAKEUP 0x01 1441#define ENQUEUE_WAKEUP 0x01
1406#define ENQUEUE_RESTORE 0x02 1442#define ENQUEUE_RESTORE 0x02
@@ -1422,10 +1458,10 @@ struct sched_class {
1422 1458
1423 void (*enqueue_task) (struct rq *rq, struct task_struct *p, int flags); 1459 void (*enqueue_task) (struct rq *rq, struct task_struct *p, int flags);
1424 void (*dequeue_task) (struct rq *rq, struct task_struct *p, int flags); 1460 void (*dequeue_task) (struct rq *rq, struct task_struct *p, int flags);
1425 void (*yield_task) (struct rq *rq); 1461 void (*yield_task) (struct rq *rq);
1426 bool (*yield_to_task) (struct rq *rq, struct task_struct *p, bool preempt); 1462 bool (*yield_to_task)(struct rq *rq, struct task_struct *p, bool preempt);
1427 1463
1428 void (*check_preempt_curr) (struct rq *rq, struct task_struct *p, int flags); 1464 void (*check_preempt_curr)(struct rq *rq, struct task_struct *p, int flags);
1429 1465
1430 /* 1466 /*
1431 * It is the responsibility of the pick_next_task() method that will 1467 * It is the responsibility of the pick_next_task() method that will
@@ -1435,16 +1471,16 @@ struct sched_class {
1435 * May return RETRY_TASK when it finds a higher prio class has runnable 1471 * May return RETRY_TASK when it finds a higher prio class has runnable
1436 * tasks. 1472 * tasks.
1437 */ 1473 */
1438 struct task_struct * (*pick_next_task) (struct rq *rq, 1474 struct task_struct * (*pick_next_task)(struct rq *rq,
1439 struct task_struct *prev, 1475 struct task_struct *prev,
1440 struct rq_flags *rf); 1476 struct rq_flags *rf);
1441 void (*put_prev_task) (struct rq *rq, struct task_struct *p); 1477 void (*put_prev_task)(struct rq *rq, struct task_struct *p);
1442 1478
1443#ifdef CONFIG_SMP 1479#ifdef CONFIG_SMP
1444 int (*select_task_rq)(struct task_struct *p, int task_cpu, int sd_flag, int flags); 1480 int (*select_task_rq)(struct task_struct *p, int task_cpu, int sd_flag, int flags);
1445 void (*migrate_task_rq)(struct task_struct *p); 1481 void (*migrate_task_rq)(struct task_struct *p);
1446 1482
1447 void (*task_woken) (struct rq *this_rq, struct task_struct *task); 1483 void (*task_woken)(struct rq *this_rq, struct task_struct *task);
1448 1484
1449 void (*set_cpus_allowed)(struct task_struct *p, 1485 void (*set_cpus_allowed)(struct task_struct *p,
1450 const struct cpumask *newmask); 1486 const struct cpumask *newmask);
@@ -1453,31 +1489,31 @@ struct sched_class {
1453 void (*rq_offline)(struct rq *rq); 1489 void (*rq_offline)(struct rq *rq);
1454#endif 1490#endif
1455 1491
1456 void (*set_curr_task) (struct rq *rq); 1492 void (*set_curr_task)(struct rq *rq);
1457 void (*task_tick) (struct rq *rq, struct task_struct *p, int queued); 1493 void (*task_tick)(struct rq *rq, struct task_struct *p, int queued);
1458 void (*task_fork) (struct task_struct *p); 1494 void (*task_fork)(struct task_struct *p);
1459 void (*task_dead) (struct task_struct *p); 1495 void (*task_dead)(struct task_struct *p);
1460 1496
1461 /* 1497 /*
1462 * The switched_from() call is allowed to drop rq->lock, therefore we 1498 * The switched_from() call is allowed to drop rq->lock, therefore we
1463 * cannot assume the switched_from/switched_to pair is serliazed by 1499 * cannot assume the switched_from/switched_to pair is serliazed by
1464 * rq->lock. They are however serialized by p->pi_lock. 1500 * rq->lock. They are however serialized by p->pi_lock.
1465 */ 1501 */
1466 void (*switched_from) (struct rq *this_rq, struct task_struct *task); 1502 void (*switched_from)(struct rq *this_rq, struct task_struct *task);
1467 void (*switched_to) (struct rq *this_rq, struct task_struct *task); 1503 void (*switched_to) (struct rq *this_rq, struct task_struct *task);
1468 void (*prio_changed) (struct rq *this_rq, struct task_struct *task, 1504 void (*prio_changed) (struct rq *this_rq, struct task_struct *task,
1469 int oldprio); 1505 int oldprio);
1470 1506
1471 unsigned int (*get_rr_interval) (struct rq *rq, 1507 unsigned int (*get_rr_interval)(struct rq *rq,
1472 struct task_struct *task); 1508 struct task_struct *task);
1473 1509
1474 void (*update_curr) (struct rq *rq); 1510 void (*update_curr)(struct rq *rq);
1475 1511
1476#define TASK_SET_GROUP 0 1512#define TASK_SET_GROUP 0
1477#define TASK_MOVE_GROUP 1 1513#define TASK_MOVE_GROUP 1
1478 1514
1479#ifdef CONFIG_FAIR_GROUP_SCHED 1515#ifdef CONFIG_FAIR_GROUP_SCHED
1480 void (*task_change_group) (struct task_struct *p, int type); 1516 void (*task_change_group)(struct task_struct *p, int type);
1481#endif 1517#endif
1482}; 1518};
1483 1519
@@ -1526,6 +1562,7 @@ static inline void idle_set_state(struct rq *rq,
1526static inline struct cpuidle_state *idle_get_state(struct rq *rq) 1562static inline struct cpuidle_state *idle_get_state(struct rq *rq)
1527{ 1563{
1528 SCHED_WARN_ON(!rcu_read_lock_held()); 1564 SCHED_WARN_ON(!rcu_read_lock_held());
1565
1529 return rq->idle_state; 1566 return rq->idle_state;
1530} 1567}
1531#else 1568#else
@@ -1564,9 +1601,9 @@ extern void init_dl_task_timer(struct sched_dl_entity *dl_se);
1564extern void init_dl_inactive_task_timer(struct sched_dl_entity *dl_se); 1601extern void init_dl_inactive_task_timer(struct sched_dl_entity *dl_se);
1565extern void init_dl_rq_bw_ratio(struct dl_rq *dl_rq); 1602extern void init_dl_rq_bw_ratio(struct dl_rq *dl_rq);
1566 1603
1567#define BW_SHIFT 20 1604#define BW_SHIFT 20
1568#define BW_UNIT (1 << BW_SHIFT) 1605#define BW_UNIT (1 << BW_SHIFT)
1569#define RATIO_SHIFT 8 1606#define RATIO_SHIFT 8
1570unsigned long to_ratio(u64 period, u64 runtime); 1607unsigned long to_ratio(u64 period, u64 runtime);
1571 1608
1572extern void init_entity_runnable_average(struct sched_entity *se); 1609extern void init_entity_runnable_average(struct sched_entity *se);
@@ -1574,6 +1611,7 @@ extern void post_init_entity_util_avg(struct sched_entity *se);
1574 1611
1575#ifdef CONFIG_NO_HZ_FULL 1612#ifdef CONFIG_NO_HZ_FULL
1576extern bool sched_can_stop_tick(struct rq *rq); 1613extern bool sched_can_stop_tick(struct rq *rq);
1614extern int __init sched_tick_offload_init(void);
1577 1615
1578/* 1616/*
1579 * Tick may be needed by tasks in the runqueue depending on their policy and 1617 * Tick may be needed by tasks in the runqueue depending on their policy and
@@ -1598,6 +1636,7 @@ static inline void sched_update_tick_dependency(struct rq *rq)
1598 tick_nohz_dep_set_cpu(cpu, TICK_DEP_BIT_SCHED); 1636 tick_nohz_dep_set_cpu(cpu, TICK_DEP_BIT_SCHED);
1599} 1637}
1600#else 1638#else
1639static inline int sched_tick_offload_init(void) { return 0; }
1601static inline void sched_update_tick_dependency(struct rq *rq) { } 1640static inline void sched_update_tick_dependency(struct rq *rq) { }
1602#endif 1641#endif
1603 1642
@@ -1624,13 +1663,6 @@ static inline void sub_nr_running(struct rq *rq, unsigned count)
1624 sched_update_tick_dependency(rq); 1663 sched_update_tick_dependency(rq);
1625} 1664}
1626 1665
1627static inline void rq_last_tick_reset(struct rq *rq)
1628{
1629#ifdef CONFIG_NO_HZ_FULL
1630 rq->last_sched_tick = jiffies;
1631#endif
1632}
1633
1634extern void update_rq_clock(struct rq *rq); 1666extern void update_rq_clock(struct rq *rq);
1635 1667
1636extern void activate_task(struct rq *rq, struct task_struct *p, int flags); 1668extern void activate_task(struct rq *rq, struct task_struct *p, int flags);
@@ -1821,8 +1853,8 @@ static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest)
1821/* 1853/*
1822 * Unfair double_lock_balance: Optimizes throughput at the expense of 1854 * Unfair double_lock_balance: Optimizes throughput at the expense of
1823 * latency by eliminating extra atomic operations when the locks are 1855 * latency by eliminating extra atomic operations when the locks are
1824 * already in proper order on entry. This favors lower cpu-ids and will 1856 * already in proper order on entry. This favors lower CPU-ids and will
1825 * grant the double lock to lower cpus over higher ids under contention, 1857 * grant the double lock to lower CPUs over higher ids under contention,
1826 * regardless of entry order into the function. 1858 * regardless of entry order into the function.
1827 */ 1859 */
1828static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest) 1860static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest)
@@ -1854,7 +1886,7 @@ static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest)
1854static inline int double_lock_balance(struct rq *this_rq, struct rq *busiest) 1886static inline int double_lock_balance(struct rq *this_rq, struct rq *busiest)
1855{ 1887{
1856 if (unlikely(!irqs_disabled())) { 1888 if (unlikely(!irqs_disabled())) {
1857 /* printk() doesn't work good under rq->lock */ 1889 /* printk() doesn't work well under rq->lock */
1858 raw_spin_unlock(&this_rq->lock); 1890 raw_spin_unlock(&this_rq->lock);
1859 BUG_ON(1); 1891 BUG_ON(1);
1860 } 1892 }
@@ -2005,16 +2037,19 @@ extern void cfs_bandwidth_usage_inc(void);
2005extern void cfs_bandwidth_usage_dec(void); 2037extern void cfs_bandwidth_usage_dec(void);
2006 2038
2007#ifdef CONFIG_NO_HZ_COMMON 2039#ifdef CONFIG_NO_HZ_COMMON
2008enum rq_nohz_flag_bits { 2040#define NOHZ_BALANCE_KICK_BIT 0
2009 NOHZ_TICK_STOPPED, 2041#define NOHZ_STATS_KICK_BIT 1
2010 NOHZ_BALANCE_KICK, 2042
2011}; 2043#define NOHZ_BALANCE_KICK BIT(NOHZ_BALANCE_KICK_BIT)
2044#define NOHZ_STATS_KICK BIT(NOHZ_STATS_KICK_BIT)
2045
2046#define NOHZ_KICK_MASK (NOHZ_BALANCE_KICK | NOHZ_STATS_KICK)
2012 2047
2013#define nohz_flags(cpu) (&cpu_rq(cpu)->nohz_flags) 2048#define nohz_flags(cpu) (&cpu_rq(cpu)->nohz_flags)
2014 2049
2015extern void nohz_balance_exit_idle(unsigned int cpu); 2050extern void nohz_balance_exit_idle(struct rq *rq);
2016#else 2051#else
2017static inline void nohz_balance_exit_idle(unsigned int cpu) { } 2052static inline void nohz_balance_exit_idle(struct rq *rq) { }
2018#endif 2053#endif
2019 2054
2020 2055
@@ -2113,15 +2148,14 @@ static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) {}
2113#endif /* CONFIG_CPU_FREQ */ 2148#endif /* CONFIG_CPU_FREQ */
2114 2149
2115#ifdef arch_scale_freq_capacity 2150#ifdef arch_scale_freq_capacity
2116#ifndef arch_scale_freq_invariant 2151# ifndef arch_scale_freq_invariant
2117#define arch_scale_freq_invariant() (true) 2152# define arch_scale_freq_invariant() true
2118#endif 2153# endif
2119#else /* arch_scale_freq_capacity */ 2154#else
2120#define arch_scale_freq_invariant() (false) 2155# define arch_scale_freq_invariant() false
2121#endif 2156#endif
2122 2157
2123#ifdef CONFIG_CPU_FREQ_GOV_SCHEDUTIL 2158#ifdef CONFIG_CPU_FREQ_GOV_SCHEDUTIL
2124
2125static inline unsigned long cpu_util_dl(struct rq *rq) 2159static inline unsigned long cpu_util_dl(struct rq *rq)
2126{ 2160{
2127 return (rq->dl.running_bw * SCHED_CAPACITY_SCALE) >> BW_SHIFT; 2161 return (rq->dl.running_bw * SCHED_CAPACITY_SCALE) >> BW_SHIFT;
@@ -2129,7 +2163,13 @@ static inline unsigned long cpu_util_dl(struct rq *rq)
2129 2163
2130static inline unsigned long cpu_util_cfs(struct rq *rq) 2164static inline unsigned long cpu_util_cfs(struct rq *rq)
2131{ 2165{
2132 return rq->cfs.avg.util_avg; 2166 unsigned long util = READ_ONCE(rq->cfs.avg.util_avg);
2133} 2167
2168 if (sched_feat(UTIL_EST)) {
2169 util = max_t(unsigned long, util,
2170 READ_ONCE(rq->cfs.avg.util_est.enqueued));
2171 }
2134 2172
2173 return util;
2174}
2135#endif 2175#endif
diff --git a/kernel/sched/stats.c b/kernel/sched/stats.c
index 940b1fa1d2ce..ab112cbfd7c8 100644
--- a/kernel/sched/stats.c
+++ b/kernel/sched/stats.c
@@ -1,14 +1,13 @@
1// SPDX-License-Identifier: GPL-2.0 1// SPDX-License-Identifier: GPL-2.0
2 2/*
3#include <linux/slab.h> 3 * /proc/schedstat implementation
4#include <linux/fs.h> 4 */
5#include <linux/seq_file.h>
6#include <linux/proc_fs.h>
7
8#include "sched.h" 5#include "sched.h"
9 6
10/* 7/*
11 * bump this up when changing the output format or the meaning of an existing 8 * Current schedstat API version.
9 *
10 * Bump this up when changing the output format or the meaning of an existing
12 * format, so that tools can adapt (or abort) 11 * format, so that tools can adapt (or abort)
13 */ 12 */
14#define SCHEDSTAT_VERSION 15 13#define SCHEDSTAT_VERSION 15
@@ -78,8 +77,8 @@ static int show_schedstat(struct seq_file *seq, void *v)
78 * This itererator needs some explanation. 77 * This itererator needs some explanation.
79 * It returns 1 for the header position. 78 * It returns 1 for the header position.
80 * This means 2 is cpu 0. 79 * This means 2 is cpu 0.
81 * In a hotplugged system some cpus, including cpu 0, may be missing so we have 80 * In a hotplugged system some CPUs, including cpu 0, may be missing so we have
82 * to use cpumask_* to iterate over the cpus. 81 * to use cpumask_* to iterate over the CPUs.
83 */ 82 */
84static void *schedstat_start(struct seq_file *file, loff_t *offset) 83static void *schedstat_start(struct seq_file *file, loff_t *offset)
85{ 84{
@@ -99,12 +98,14 @@ static void *schedstat_start(struct seq_file *file, loff_t *offset)
99 98
100 if (n < nr_cpu_ids) 99 if (n < nr_cpu_ids)
101 return (void *)(unsigned long)(n + 2); 100 return (void *)(unsigned long)(n + 2);
101
102 return NULL; 102 return NULL;
103} 103}
104 104
105static void *schedstat_next(struct seq_file *file, void *data, loff_t *offset) 105static void *schedstat_next(struct seq_file *file, void *data, loff_t *offset)
106{ 106{
107 (*offset)++; 107 (*offset)++;
108
108 return schedstat_start(file, offset); 109 return schedstat_start(file, offset);
109} 110}
110 111
@@ -134,6 +135,7 @@ static const struct file_operations proc_schedstat_operations = {
134static int __init proc_schedstat_init(void) 135static int __init proc_schedstat_init(void)
135{ 136{
136 proc_create("schedstat", 0, NULL, &proc_schedstat_operations); 137 proc_create("schedstat", 0, NULL, &proc_schedstat_operations);
138
137 return 0; 139 return 0;
138} 140}
139subsys_initcall(proc_schedstat_init); 141subsys_initcall(proc_schedstat_init);
diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h
index 8e7b58de61e7..8aea199a39b4 100644
--- a/kernel/sched/stats.h
+++ b/kernel/sched/stats.h
@@ -30,35 +30,29 @@ rq_sched_info_dequeued(struct rq *rq, unsigned long long delta)
30 if (rq) 30 if (rq)
31 rq->rq_sched_info.run_delay += delta; 31 rq->rq_sched_info.run_delay += delta;
32} 32}
33#define schedstat_enabled() static_branch_unlikely(&sched_schedstats) 33#define schedstat_enabled() static_branch_unlikely(&sched_schedstats)
34#define __schedstat_inc(var) do { var++; } while (0) 34#define __schedstat_inc(var) do { var++; } while (0)
35#define schedstat_inc(var) do { if (schedstat_enabled()) { var++; } } while (0) 35#define schedstat_inc(var) do { if (schedstat_enabled()) { var++; } } while (0)
36#define __schedstat_add(var, amt) do { var += (amt); } while (0) 36#define __schedstat_add(var, amt) do { var += (amt); } while (0)
37#define schedstat_add(var, amt) do { if (schedstat_enabled()) { var += (amt); } } while (0) 37#define schedstat_add(var, amt) do { if (schedstat_enabled()) { var += (amt); } } while (0)
38#define __schedstat_set(var, val) do { var = (val); } while (0) 38#define __schedstat_set(var, val) do { var = (val); } while (0)
39#define schedstat_set(var, val) do { if (schedstat_enabled()) { var = (val); } } while (0) 39#define schedstat_set(var, val) do { if (schedstat_enabled()) { var = (val); } } while (0)
40#define schedstat_val(var) (var) 40#define schedstat_val(var) (var)
41#define schedstat_val_or_zero(var) ((schedstat_enabled()) ? (var) : 0) 41#define schedstat_val_or_zero(var) ((schedstat_enabled()) ? (var) : 0)
42 42
43#else /* !CONFIG_SCHEDSTATS */ 43#else /* !CONFIG_SCHEDSTATS: */
44static inline void 44static inline void rq_sched_info_arrive (struct rq *rq, unsigned long long delta) { }
45rq_sched_info_arrive(struct rq *rq, unsigned long long delta) 45static inline void rq_sched_info_dequeued(struct rq *rq, unsigned long long delta) { }
46{} 46static inline void rq_sched_info_depart (struct rq *rq, unsigned long long delta) { }
47static inline void 47# define schedstat_enabled() 0
48rq_sched_info_dequeued(struct rq *rq, unsigned long long delta) 48# define __schedstat_inc(var) do { } while (0)
49{} 49# define schedstat_inc(var) do { } while (0)
50static inline void 50# define __schedstat_add(var, amt) do { } while (0)
51rq_sched_info_depart(struct rq *rq, unsigned long long delta) 51# define schedstat_add(var, amt) do { } while (0)
52{} 52# define __schedstat_set(var, val) do { } while (0)
53#define schedstat_enabled() 0 53# define schedstat_set(var, val) do { } while (0)
54#define __schedstat_inc(var) do { } while (0) 54# define schedstat_val(var) 0
55#define schedstat_inc(var) do { } while (0) 55# define schedstat_val_or_zero(var) 0
56#define __schedstat_add(var, amt) do { } while (0)
57#define schedstat_add(var, amt) do { } while (0)
58#define __schedstat_set(var, val) do { } while (0)
59#define schedstat_set(var, val) do { } while (0)
60#define schedstat_val(var) 0
61#define schedstat_val_or_zero(var) 0
62#endif /* CONFIG_SCHEDSTATS */ 56#endif /* CONFIG_SCHEDSTATS */
63 57
64#ifdef CONFIG_SCHED_INFO 58#ifdef CONFIG_SCHED_INFO
@@ -69,9 +63,9 @@ static inline void sched_info_reset_dequeued(struct task_struct *t)
69 63
70/* 64/*
71 * We are interested in knowing how long it was from the *first* time a 65 * We are interested in knowing how long it was from the *first* time a
72 * task was queued to the time that it finally hit a cpu, we call this routine 66 * task was queued to the time that it finally hit a CPU, we call this routine
73 * from dequeue_task() to account for possible rq->clock skew across cpus. The 67 * from dequeue_task() to account for possible rq->clock skew across CPUs. The
74 * delta taken on each cpu would annul the skew. 68 * delta taken on each CPU would annul the skew.
75 */ 69 */
76static inline void sched_info_dequeued(struct rq *rq, struct task_struct *t) 70static inline void sched_info_dequeued(struct rq *rq, struct task_struct *t)
77{ 71{
@@ -87,7 +81,7 @@ static inline void sched_info_dequeued(struct rq *rq, struct task_struct *t)
87} 81}
88 82
89/* 83/*
90 * Called when a task finally hits the cpu. We can now calculate how 84 * Called when a task finally hits the CPU. We can now calculate how
91 * long it was waiting to run. We also note when it began so that we 85 * long it was waiting to run. We also note when it began so that we
92 * can keep stats on how long its timeslice is. 86 * can keep stats on how long its timeslice is.
93 */ 87 */
@@ -112,9 +106,10 @@ static void sched_info_arrive(struct rq *rq, struct task_struct *t)
112 */ 106 */
113static inline void sched_info_queued(struct rq *rq, struct task_struct *t) 107static inline void sched_info_queued(struct rq *rq, struct task_struct *t)
114{ 108{
115 if (unlikely(sched_info_on())) 109 if (unlikely(sched_info_on())) {
116 if (!t->sched_info.last_queued) 110 if (!t->sched_info.last_queued)
117 t->sched_info.last_queued = rq_clock(rq); 111 t->sched_info.last_queued = rq_clock(rq);
112 }
118} 113}
119 114
120/* 115/*
@@ -127,8 +122,7 @@ static inline void sched_info_queued(struct rq *rq, struct task_struct *t)
127 */ 122 */
128static inline void sched_info_depart(struct rq *rq, struct task_struct *t) 123static inline void sched_info_depart(struct rq *rq, struct task_struct *t)
129{ 124{
130 unsigned long long delta = rq_clock(rq) - 125 unsigned long long delta = rq_clock(rq) - t->sched_info.last_arrival;
131 t->sched_info.last_arrival;
132 126
133 rq_sched_info_depart(rq, delta); 127 rq_sched_info_depart(rq, delta);
134 128
@@ -142,11 +136,10 @@ static inline void sched_info_depart(struct rq *rq, struct task_struct *t)
142 * the idle task.) We are only called when prev != next. 136 * the idle task.) We are only called when prev != next.
143 */ 137 */
144static inline void 138static inline void
145__sched_info_switch(struct rq *rq, 139__sched_info_switch(struct rq *rq, struct task_struct *prev, struct task_struct *next)
146 struct task_struct *prev, struct task_struct *next)
147{ 140{
148 /* 141 /*
149 * prev now departs the cpu. It's not interesting to record 142 * prev now departs the CPU. It's not interesting to record
150 * stats about how efficient we were at scheduling the idle 143 * stats about how efficient we were at scheduling the idle
151 * process, however. 144 * process, however.
152 */ 145 */
@@ -156,18 +149,19 @@ __sched_info_switch(struct rq *rq,
156 if (next != rq->idle) 149 if (next != rq->idle)
157 sched_info_arrive(rq, next); 150 sched_info_arrive(rq, next);
158} 151}
152
159static inline void 153static inline void
160sched_info_switch(struct rq *rq, 154sched_info_switch(struct rq *rq, struct task_struct *prev, struct task_struct *next)
161 struct task_struct *prev, struct task_struct *next)
162{ 155{
163 if (unlikely(sched_info_on())) 156 if (unlikely(sched_info_on()))
164 __sched_info_switch(rq, prev, next); 157 __sched_info_switch(rq, prev, next);
165} 158}
166#else 159
167#define sched_info_queued(rq, t) do { } while (0) 160#else /* !CONFIG_SCHED_INFO: */
168#define sched_info_reset_dequeued(t) do { } while (0) 161# define sched_info_queued(rq, t) do { } while (0)
169#define sched_info_dequeued(rq, t) do { } while (0) 162# define sched_info_reset_dequeued(t) do { } while (0)
170#define sched_info_depart(rq, t) do { } while (0) 163# define sched_info_dequeued(rq, t) do { } while (0)
171#define sched_info_arrive(rq, next) do { } while (0) 164# define sched_info_depart(rq, t) do { } while (0)
172#define sched_info_switch(rq, t, next) do { } while (0) 165# define sched_info_arrive(rq, next) do { } while (0)
166# define sched_info_switch(rq, t, next) do { } while (0)
173#endif /* CONFIG_SCHED_INFO */ 167#endif /* CONFIG_SCHED_INFO */
diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c
index 210b1f2146ff..c183b790ca54 100644
--- a/kernel/sched/stop_task.c
+++ b/kernel/sched/stop_task.c
@@ -1,6 +1,4 @@
1// SPDX-License-Identifier: GPL-2.0 1// SPDX-License-Identifier: GPL-2.0
2#include "sched.h"
3
4/* 2/*
5 * stop-task scheduling class. 3 * stop-task scheduling class.
6 * 4 *
@@ -9,6 +7,7 @@
9 * 7 *
10 * See kernel/stop_machine.c 8 * See kernel/stop_machine.c
11 */ 9 */
10#include "sched.h"
12 11
13#ifdef CONFIG_SMP 12#ifdef CONFIG_SMP
14static int 13static int
@@ -75,6 +74,14 @@ static void put_prev_task_stop(struct rq *rq, struct task_struct *prev)
75 cgroup_account_cputime(curr, delta_exec); 74 cgroup_account_cputime(curr, delta_exec);
76} 75}
77 76
77/*
78 * scheduler tick hitting a task of our scheduling class.
79 *
80 * NOTE: This function can be called remotely by the tick offload that
81 * goes along full dynticks. Therefore no local assumption can be made
82 * and everything must be accessed through the @rq and @curr passed in
83 * parameters.
84 */
78static void task_tick_stop(struct rq *rq, struct task_struct *curr, int queued) 85static void task_tick_stop(struct rq *rq, struct task_struct *curr, int queued)
79{ 86{
80} 87}
diff --git a/kernel/sched/swait.c b/kernel/sched/swait.c
index 9ff1555341ed..b6fb2c3b3ff7 100644
--- a/kernel/sched/swait.c
+++ b/kernel/sched/swait.c
@@ -1,6 +1,8 @@
1// SPDX-License-Identifier: GPL-2.0 1// SPDX-License-Identifier: GPL-2.0
2#include <linux/sched/signal.h> 2/*
3#include <linux/swait.h> 3 * <linux/swait.h> (simple wait queues ) implementation:
4 */
5#include "sched.h"
4 6
5void __init_swait_queue_head(struct swait_queue_head *q, const char *name, 7void __init_swait_queue_head(struct swait_queue_head *q, const char *name,
6 struct lock_class_key *key) 8 struct lock_class_key *key)
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index 519b024f4e94..64cc564f5255 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -2,10 +2,6 @@
2/* 2/*
3 * Scheduler topology setup/handling methods 3 * Scheduler topology setup/handling methods
4 */ 4 */
5#include <linux/sched.h>
6#include <linux/mutex.h>
7#include <linux/sched/isolation.h>
8
9#include "sched.h" 5#include "sched.h"
10 6
11DEFINE_MUTEX(sched_domains_mutex); 7DEFINE_MUTEX(sched_domains_mutex);
@@ -41,8 +37,7 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
41 if (!(sd->flags & SD_LOAD_BALANCE)) { 37 if (!(sd->flags & SD_LOAD_BALANCE)) {
42 printk("does not load-balance\n"); 38 printk("does not load-balance\n");
43 if (sd->parent) 39 if (sd->parent)
44 printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain" 40 printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain has parent");
45 " has parent");
46 return -1; 41 return -1;
47 } 42 }
48 43
@@ -50,12 +45,10 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
50 cpumask_pr_args(sched_domain_span(sd)), sd->name); 45 cpumask_pr_args(sched_domain_span(sd)), sd->name);
51 46
52 if (!cpumask_test_cpu(cpu, sched_domain_span(sd))) { 47 if (!cpumask_test_cpu(cpu, sched_domain_span(sd))) {
53 printk(KERN_ERR "ERROR: domain->span does not contain " 48 printk(KERN_ERR "ERROR: domain->span does not contain CPU%d\n", cpu);
54 "CPU%d\n", cpu);
55 } 49 }
56 if (!cpumask_test_cpu(cpu, sched_group_span(group))) { 50 if (!cpumask_test_cpu(cpu, sched_group_span(group))) {
57 printk(KERN_ERR "ERROR: domain->groups does not contain" 51 printk(KERN_ERR "ERROR: domain->groups does not contain CPU%d\n", cpu);
58 " CPU%d\n", cpu);
59 } 52 }
60 53
61 printk(KERN_DEBUG "%*s groups:", level + 1, ""); 54 printk(KERN_DEBUG "%*s groups:", level + 1, "");
@@ -115,8 +108,7 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
115 108
116 if (sd->parent && 109 if (sd->parent &&
117 !cpumask_subset(groupmask, sched_domain_span(sd->parent))) 110 !cpumask_subset(groupmask, sched_domain_span(sd->parent)))
118 printk(KERN_ERR "ERROR: parent span is not a superset " 111 printk(KERN_ERR "ERROR: parent span is not a superset of domain->span\n");
119 "of domain->span\n");
120 return 0; 112 return 0;
121} 113}
122 114
@@ -595,7 +587,7 @@ int group_balance_cpu(struct sched_group *sg)
595 * are not. 587 * are not.
596 * 588 *
597 * This leads to a few particularly weird cases where the sched_domain's are 589 * This leads to a few particularly weird cases where the sched_domain's are
598 * not of the same number for each cpu. Consider: 590 * not of the same number for each CPU. Consider:
599 * 591 *
600 * NUMA-2 0-3 0-3 592 * NUMA-2 0-3 0-3
601 * groups: {0-2},{1-3} {1-3},{0-2} 593 * groups: {0-2},{1-3} {1-3},{0-2}
@@ -780,7 +772,7 @@ fail:
780 * ^ ^ ^ ^ 772 * ^ ^ ^ ^
781 * `-' `-' 773 * `-' `-'
782 * 774 *
783 * The sched_domains are per-cpu and have a two way link (parent & child) and 775 * The sched_domains are per-CPU and have a two way link (parent & child) and
784 * denote the ever growing mask of CPUs belonging to that level of topology. 776 * denote the ever growing mask of CPUs belonging to that level of topology.
785 * 777 *
786 * Each sched_domain has a circular (double) linked list of sched_group's, each 778 * Each sched_domain has a circular (double) linked list of sched_group's, each
@@ -1021,6 +1013,7 @@ __visit_domain_allocation_hell(struct s_data *d, const struct cpumask *cpu_map)
1021 d->rd = alloc_rootdomain(); 1013 d->rd = alloc_rootdomain();
1022 if (!d->rd) 1014 if (!d->rd)
1023 return sa_sd; 1015 return sa_sd;
1016
1024 return sa_rootdomain; 1017 return sa_rootdomain;
1025} 1018}
1026 1019
@@ -1047,12 +1040,14 @@ static void claim_allocations(int cpu, struct sched_domain *sd)
1047} 1040}
1048 1041
1049#ifdef CONFIG_NUMA 1042#ifdef CONFIG_NUMA
1050static int sched_domains_numa_levels;
1051enum numa_topology_type sched_numa_topology_type; 1043enum numa_topology_type sched_numa_topology_type;
1052static int *sched_domains_numa_distance; 1044
1053int sched_max_numa_distance; 1045static int sched_domains_numa_levels;
1054static struct cpumask ***sched_domains_numa_masks; 1046static int sched_domains_curr_level;
1055static int sched_domains_curr_level; 1047
1048int sched_max_numa_distance;
1049static int *sched_domains_numa_distance;
1050static struct cpumask ***sched_domains_numa_masks;
1056#endif 1051#endif
1057 1052
1058/* 1053/*
@@ -1074,11 +1069,11 @@ static int sched_domains_curr_level;
1074 * SD_ASYM_PACKING - describes SMT quirks 1069 * SD_ASYM_PACKING - describes SMT quirks
1075 */ 1070 */
1076#define TOPOLOGY_SD_FLAGS \ 1071#define TOPOLOGY_SD_FLAGS \
1077 (SD_SHARE_CPUCAPACITY | \ 1072 (SD_SHARE_CPUCAPACITY | \
1078 SD_SHARE_PKG_RESOURCES | \ 1073 SD_SHARE_PKG_RESOURCES | \
1079 SD_NUMA | \ 1074 SD_NUMA | \
1080 SD_ASYM_PACKING | \ 1075 SD_ASYM_PACKING | \
1081 SD_ASYM_CPUCAPACITY | \ 1076 SD_ASYM_CPUCAPACITY | \
1082 SD_SHARE_POWERDOMAIN) 1077 SD_SHARE_POWERDOMAIN)
1083 1078
1084static struct sched_domain * 1079static struct sched_domain *
@@ -1628,7 +1623,7 @@ static struct sched_domain *build_sched_domain(struct sched_domain_topology_leve
1628 pr_err(" the %s domain not a subset of the %s domain\n", 1623 pr_err(" the %s domain not a subset of the %s domain\n",
1629 child->name, sd->name); 1624 child->name, sd->name);
1630#endif 1625#endif
1631 /* Fixup, ensure @sd has at least @child cpus. */ 1626 /* Fixup, ensure @sd has at least @child CPUs. */
1632 cpumask_or(sched_domain_span(sd), 1627 cpumask_or(sched_domain_span(sd),
1633 sched_domain_span(sd), 1628 sched_domain_span(sd),
1634 sched_domain_span(child)); 1629 sched_domain_span(child));
@@ -1720,6 +1715,7 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att
1720 ret = 0; 1715 ret = 0;
1721error: 1716error:
1722 __free_domain_allocs(&d, alloc_state, cpu_map); 1717 __free_domain_allocs(&d, alloc_state, cpu_map);
1718
1723 return ret; 1719 return ret;
1724} 1720}
1725 1721
@@ -1824,6 +1820,7 @@ static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur,
1824 return 1; 1820 return 1;
1825 1821
1826 tmp = SD_ATTR_INIT; 1822 tmp = SD_ATTR_INIT;
1823
1827 return !memcmp(cur ? (cur + idx_cur) : &tmp, 1824 return !memcmp(cur ? (cur + idx_cur) : &tmp,
1828 new ? (new + idx_new) : &tmp, 1825 new ? (new + idx_new) : &tmp,
1829 sizeof(struct sched_domain_attr)); 1826 sizeof(struct sched_domain_attr));
@@ -1929,4 +1926,3 @@ match2:
1929 1926
1930 mutex_unlock(&sched_domains_mutex); 1927 mutex_unlock(&sched_domains_mutex);
1931} 1928}
1932
diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c
index 929ecb7d6b78..928be527477e 100644
--- a/kernel/sched/wait.c
+++ b/kernel/sched/wait.c
@@ -3,14 +3,7 @@
3 * 3 *
4 * (C) 2004 Nadia Yvette Chambers, Oracle 4 * (C) 2004 Nadia Yvette Chambers, Oracle
5 */ 5 */
6#include <linux/init.h> 6#include "sched.h"
7#include <linux/export.h>
8#include <linux/sched/signal.h>
9#include <linux/sched/debug.h>
10#include <linux/mm.h>
11#include <linux/wait.h>
12#include <linux/hash.h>
13#include <linux/kthread.h>
14 7
15void __init_waitqueue_head(struct wait_queue_head *wq_head, const char *name, struct lock_class_key *key) 8void __init_waitqueue_head(struct wait_queue_head *wq_head, const char *name, struct lock_class_key *key)
16{ 9{
@@ -107,6 +100,7 @@ static int __wake_up_common(struct wait_queue_head *wq_head, unsigned int mode,
107 break; 100 break;
108 } 101 }
109 } 102 }
103
110 return nr_exclusive; 104 return nr_exclusive;
111} 105}
112 106
@@ -317,6 +311,7 @@ int do_wait_intr(wait_queue_head_t *wq, wait_queue_entry_t *wait)
317 spin_unlock(&wq->lock); 311 spin_unlock(&wq->lock);
318 schedule(); 312 schedule();
319 spin_lock(&wq->lock); 313 spin_lock(&wq->lock);
314
320 return 0; 315 return 0;
321} 316}
322EXPORT_SYMBOL(do_wait_intr); 317EXPORT_SYMBOL(do_wait_intr);
@@ -333,6 +328,7 @@ int do_wait_intr_irq(wait_queue_head_t *wq, wait_queue_entry_t *wait)
333 spin_unlock_irq(&wq->lock); 328 spin_unlock_irq(&wq->lock);
334 schedule(); 329 schedule();
335 spin_lock_irq(&wq->lock); 330 spin_lock_irq(&wq->lock);
331
336 return 0; 332 return 0;
337} 333}
338EXPORT_SYMBOL(do_wait_intr_irq); 334EXPORT_SYMBOL(do_wait_intr_irq);
@@ -378,6 +374,7 @@ int autoremove_wake_function(struct wait_queue_entry *wq_entry, unsigned mode, i
378 374
379 if (ret) 375 if (ret)
380 list_del_init(&wq_entry->entry); 376 list_del_init(&wq_entry->entry);
377
381 return ret; 378 return ret;
382} 379}
383EXPORT_SYMBOL(autoremove_wake_function); 380EXPORT_SYMBOL(autoremove_wake_function);
diff --git a/kernel/sched/wait_bit.c b/kernel/sched/wait_bit.c
index 84cb3acd9260..c67c6d24adc2 100644
--- a/kernel/sched/wait_bit.c
+++ b/kernel/sched/wait_bit.c
@@ -1,10 +1,7 @@
1/* 1/*
2 * The implementation of the wait_bit*() and related waiting APIs: 2 * The implementation of the wait_bit*() and related waiting APIs:
3 */ 3 */
4#include <linux/wait_bit.h> 4#include "sched.h"
5#include <linux/sched/signal.h>
6#include <linux/sched/debug.h>
7#include <linux/hash.h>
8 5
9#define WAIT_TABLE_BITS 8 6#define WAIT_TABLE_BITS 8
10#define WAIT_TABLE_SIZE (1 << WAIT_TABLE_BITS) 7#define WAIT_TABLE_SIZE (1 << WAIT_TABLE_BITS)
@@ -29,8 +26,8 @@ int wake_bit_function(struct wait_queue_entry *wq_entry, unsigned mode, int sync
29 wait_bit->key.bit_nr != key->bit_nr || 26 wait_bit->key.bit_nr != key->bit_nr ||
30 test_bit(key->bit_nr, key->flags)) 27 test_bit(key->bit_nr, key->flags))
31 return 0; 28 return 0;
32 else 29
33 return autoremove_wake_function(wq_entry, mode, sync, key); 30 return autoremove_wake_function(wq_entry, mode, sync, key);
34} 31}
35EXPORT_SYMBOL(wake_bit_function); 32EXPORT_SYMBOL(wake_bit_function);
36 33
@@ -50,7 +47,9 @@ __wait_on_bit(struct wait_queue_head *wq_head, struct wait_bit_queue_entry *wbq_
50 if (test_bit(wbq_entry->key.bit_nr, wbq_entry->key.flags)) 47 if (test_bit(wbq_entry->key.bit_nr, wbq_entry->key.flags))
51 ret = (*action)(&wbq_entry->key, mode); 48 ret = (*action)(&wbq_entry->key, mode);
52 } while (test_bit(wbq_entry->key.bit_nr, wbq_entry->key.flags) && !ret); 49 } while (test_bit(wbq_entry->key.bit_nr, wbq_entry->key.flags) && !ret);
50
53 finish_wait(wq_head, &wbq_entry->wq_entry); 51 finish_wait(wq_head, &wbq_entry->wq_entry);
52
54 return ret; 53 return ret;
55} 54}
56EXPORT_SYMBOL(__wait_on_bit); 55EXPORT_SYMBOL(__wait_on_bit);
@@ -73,6 +72,7 @@ int __sched out_of_line_wait_on_bit_timeout(
73 DEFINE_WAIT_BIT(wq_entry, word, bit); 72 DEFINE_WAIT_BIT(wq_entry, word, bit);
74 73
75 wq_entry.key.timeout = jiffies + timeout; 74 wq_entry.key.timeout = jiffies + timeout;
75
76 return __wait_on_bit(wq_head, &wq_entry, action, mode); 76 return __wait_on_bit(wq_head, &wq_entry, action, mode);
77} 77}
78EXPORT_SYMBOL_GPL(out_of_line_wait_on_bit_timeout); 78EXPORT_SYMBOL_GPL(out_of_line_wait_on_bit_timeout);
@@ -120,6 +120,7 @@ EXPORT_SYMBOL(out_of_line_wait_on_bit_lock);
120void __wake_up_bit(struct wait_queue_head *wq_head, void *word, int bit) 120void __wake_up_bit(struct wait_queue_head *wq_head, void *word, int bit)
121{ 121{
122 struct wait_bit_key key = __WAIT_BIT_KEY_INITIALIZER(word, bit); 122 struct wait_bit_key key = __WAIT_BIT_KEY_INITIALIZER(word, bit);
123
123 if (waitqueue_active(wq_head)) 124 if (waitqueue_active(wq_head))
124 __wake_up(wq_head, TASK_NORMAL, 1, &key); 125 __wake_up(wq_head, TASK_NORMAL, 1, &key);
125} 126}
@@ -148,108 +149,55 @@ void wake_up_bit(void *word, int bit)
148} 149}
149EXPORT_SYMBOL(wake_up_bit); 150EXPORT_SYMBOL(wake_up_bit);
150 151
151/* 152wait_queue_head_t *__var_waitqueue(void *p)
152 * Manipulate the atomic_t address to produce a better bit waitqueue table hash
153 * index (we're keying off bit -1, but that would produce a horrible hash
154 * value).
155 */
156static inline wait_queue_head_t *atomic_t_waitqueue(atomic_t *p)
157{ 153{
158 if (BITS_PER_LONG == 64) { 154 return bit_wait_table + hash_ptr(p, WAIT_TABLE_BITS);
159 unsigned long q = (unsigned long)p;
160 return bit_waitqueue((void *)(q & ~1), q & 1);
161 }
162 return bit_waitqueue(p, 0);
163} 155}
156EXPORT_SYMBOL(__var_waitqueue);
164 157
165static int wake_atomic_t_function(struct wait_queue_entry *wq_entry, unsigned mode, int sync, 158static int
166 void *arg) 159var_wake_function(struct wait_queue_entry *wq_entry, unsigned int mode,
160 int sync, void *arg)
167{ 161{
168 struct wait_bit_key *key = arg; 162 struct wait_bit_key *key = arg;
169 struct wait_bit_queue_entry *wait_bit = container_of(wq_entry, struct wait_bit_queue_entry, wq_entry); 163 struct wait_bit_queue_entry *wbq_entry =
170 atomic_t *val = key->flags; 164 container_of(wq_entry, struct wait_bit_queue_entry, wq_entry);
171 165
172 if (wait_bit->key.flags != key->flags || 166 if (wbq_entry->key.flags != key->flags ||
173 wait_bit->key.bit_nr != key->bit_nr || 167 wbq_entry->key.bit_nr != key->bit_nr)
174 atomic_read(val) != 0)
175 return 0; 168 return 0;
176 return autoremove_wake_function(wq_entry, mode, sync, key);
177}
178 169
179/* 170 return autoremove_wake_function(wq_entry, mode, sync, key);
180 * To allow interruptible waiting and asynchronous (i.e. nonblocking) waiting,
181 * the actions of __wait_on_atomic_t() are permitted return codes. Nonzero
182 * return codes halt waiting and return.
183 */
184static __sched
185int __wait_on_atomic_t(struct wait_queue_head *wq_head, struct wait_bit_queue_entry *wbq_entry,
186 wait_atomic_t_action_f action, unsigned int mode)
187{
188 atomic_t *val;
189 int ret = 0;
190
191 do {
192 prepare_to_wait(wq_head, &wbq_entry->wq_entry, mode);
193 val = wbq_entry->key.flags;
194 if (atomic_read(val) == 0)
195 break;
196 ret = (*action)(val, mode);
197 } while (!ret && atomic_read(val) != 0);
198 finish_wait(wq_head, &wbq_entry->wq_entry);
199 return ret;
200} 171}
201 172
202#define DEFINE_WAIT_ATOMIC_T(name, p) \ 173void init_wait_var_entry(struct wait_bit_queue_entry *wbq_entry, void *var, int flags)
203 struct wait_bit_queue_entry name = { \
204 .key = __WAIT_ATOMIC_T_KEY_INITIALIZER(p), \
205 .wq_entry = { \
206 .private = current, \
207 .func = wake_atomic_t_function, \
208 .entry = \
209 LIST_HEAD_INIT((name).wq_entry.entry), \
210 }, \
211 }
212
213__sched int out_of_line_wait_on_atomic_t(atomic_t *p,
214 wait_atomic_t_action_f action,
215 unsigned int mode)
216{ 174{
217 struct wait_queue_head *wq_head = atomic_t_waitqueue(p); 175 *wbq_entry = (struct wait_bit_queue_entry){
218 DEFINE_WAIT_ATOMIC_T(wq_entry, p); 176 .key = {
219 177 .flags = (var),
220 return __wait_on_atomic_t(wq_head, &wq_entry, action, mode); 178 .bit_nr = -1,
179 },
180 .wq_entry = {
181 .private = current,
182 .func = var_wake_function,
183 .entry = LIST_HEAD_INIT(wbq_entry->wq_entry.entry),
184 },
185 };
221} 186}
222EXPORT_SYMBOL(out_of_line_wait_on_atomic_t); 187EXPORT_SYMBOL(init_wait_var_entry);
223 188
224__sched int atomic_t_wait(atomic_t *counter, unsigned int mode) 189void wake_up_var(void *var)
225{ 190{
226 schedule(); 191 __wake_up_bit(__var_waitqueue(var), var, -1);
227 if (signal_pending_state(mode, current))
228 return -EINTR;
229 return 0;
230} 192}
231EXPORT_SYMBOL(atomic_t_wait); 193EXPORT_SYMBOL(wake_up_var);
232
233/**
234 * wake_up_atomic_t - Wake up a waiter on a atomic_t
235 * @p: The atomic_t being waited on, a kernel virtual address
236 *
237 * Wake up anyone waiting for the atomic_t to go to zero.
238 *
239 * Abuse the bit-waker function and its waitqueue hash table set (the atomic_t
240 * check is done by the waiter's wake function, not the by the waker itself).
241 */
242void wake_up_atomic_t(atomic_t *p)
243{
244 __wake_up_bit(atomic_t_waitqueue(p), p, WAIT_ATOMIC_T_BIT_NR);
245}
246EXPORT_SYMBOL(wake_up_atomic_t);
247 194
248__sched int bit_wait(struct wait_bit_key *word, int mode) 195__sched int bit_wait(struct wait_bit_key *word, int mode)
249{ 196{
250 schedule(); 197 schedule();
251 if (signal_pending_state(mode, current)) 198 if (signal_pending_state(mode, current))
252 return -EINTR; 199 return -EINTR;
200
253 return 0; 201 return 0;
254} 202}
255EXPORT_SYMBOL(bit_wait); 203EXPORT_SYMBOL(bit_wait);
@@ -259,6 +207,7 @@ __sched int bit_wait_io(struct wait_bit_key *word, int mode)
259 io_schedule(); 207 io_schedule();
260 if (signal_pending_state(mode, current)) 208 if (signal_pending_state(mode, current))
261 return -EINTR; 209 return -EINTR;
210
262 return 0; 211 return 0;
263} 212}
264EXPORT_SYMBOL(bit_wait_io); 213EXPORT_SYMBOL(bit_wait_io);
@@ -266,11 +215,13 @@ EXPORT_SYMBOL(bit_wait_io);
266__sched int bit_wait_timeout(struct wait_bit_key *word, int mode) 215__sched int bit_wait_timeout(struct wait_bit_key *word, int mode)
267{ 216{
268 unsigned long now = READ_ONCE(jiffies); 217 unsigned long now = READ_ONCE(jiffies);
218
269 if (time_after_eq(now, word->timeout)) 219 if (time_after_eq(now, word->timeout))
270 return -EAGAIN; 220 return -EAGAIN;
271 schedule_timeout(word->timeout - now); 221 schedule_timeout(word->timeout - now);
272 if (signal_pending_state(mode, current)) 222 if (signal_pending_state(mode, current))
273 return -EINTR; 223 return -EINTR;
224
274 return 0; 225 return 0;
275} 226}
276EXPORT_SYMBOL_GPL(bit_wait_timeout); 227EXPORT_SYMBOL_GPL(bit_wait_timeout);
@@ -278,11 +229,13 @@ EXPORT_SYMBOL_GPL(bit_wait_timeout);
278__sched int bit_wait_io_timeout(struct wait_bit_key *word, int mode) 229__sched int bit_wait_io_timeout(struct wait_bit_key *word, int mode)
279{ 230{
280 unsigned long now = READ_ONCE(jiffies); 231 unsigned long now = READ_ONCE(jiffies);
232
281 if (time_after_eq(now, word->timeout)) 233 if (time_after_eq(now, word->timeout))
282 return -EAGAIN; 234 return -EAGAIN;
283 io_schedule_timeout(word->timeout - now); 235 io_schedule_timeout(word->timeout - now);
284 if (signal_pending_state(mode, current)) 236 if (signal_pending_state(mode, current))
285 return -EINTR; 237 return -EINTR;
238
286 return 0; 239 return 0;
287} 240}
288EXPORT_SYMBOL_GPL(bit_wait_io_timeout); 241EXPORT_SYMBOL_GPL(bit_wait_io_timeout);
diff --git a/kernel/signal.c b/kernel/signal.c
index c6e4c83dc090..f04466655238 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -3573,9 +3573,8 @@ int __save_altstack(stack_t __user *uss, unsigned long sp)
3573} 3573}
3574 3574
3575#ifdef CONFIG_COMPAT 3575#ifdef CONFIG_COMPAT
3576COMPAT_SYSCALL_DEFINE2(sigaltstack, 3576static int do_compat_sigaltstack(const compat_stack_t __user *uss_ptr,
3577 const compat_stack_t __user *, uss_ptr, 3577 compat_stack_t __user *uoss_ptr)
3578 compat_stack_t __user *, uoss_ptr)
3579{ 3578{
3580 stack_t uss, uoss; 3579 stack_t uss, uoss;
3581 int ret; 3580 int ret;
@@ -3602,9 +3601,16 @@ COMPAT_SYSCALL_DEFINE2(sigaltstack,
3602 return ret; 3601 return ret;
3603} 3602}
3604 3603
3604COMPAT_SYSCALL_DEFINE2(sigaltstack,
3605 const compat_stack_t __user *, uss_ptr,
3606 compat_stack_t __user *, uoss_ptr)
3607{
3608 return do_compat_sigaltstack(uss_ptr, uoss_ptr);
3609}
3610
3605int compat_restore_altstack(const compat_stack_t __user *uss) 3611int compat_restore_altstack(const compat_stack_t __user *uss)
3606{ 3612{
3607 int err = compat_sys_sigaltstack(uss, NULL); 3613 int err = do_compat_sigaltstack(uss, NULL);
3608 /* squash all but -EFAULT for now */ 3614 /* squash all but -EFAULT for now */
3609 return err == -EFAULT ? err : 0; 3615 return err == -EFAULT ? err : 0;
3610} 3616}
@@ -3629,11 +3635,20 @@ int __compat_save_altstack(compat_stack_t __user *uss, unsigned long sp)
3629 3635
3630/** 3636/**
3631 * sys_sigpending - examine pending signals 3637 * sys_sigpending - examine pending signals
3632 * @set: where mask of pending signal is returned 3638 * @uset: where mask of pending signal is returned
3633 */ 3639 */
3634SYSCALL_DEFINE1(sigpending, old_sigset_t __user *, set) 3640SYSCALL_DEFINE1(sigpending, old_sigset_t __user *, uset)
3635{ 3641{
3636 return sys_rt_sigpending((sigset_t __user *)set, sizeof(old_sigset_t)); 3642 sigset_t set;
3643 int err;
3644
3645 if (sizeof(old_sigset_t) > sizeof(*uset))
3646 return -EINVAL;
3647
3648 err = do_sigpending(&set);
3649 if (!err && copy_to_user(uset, &set, sizeof(old_sigset_t)))
3650 err = -EFAULT;
3651 return err;
3637} 3652}
3638 3653
3639#ifdef CONFIG_COMPAT 3654#ifdef CONFIG_COMPAT
diff --git a/kernel/sys.c b/kernel/sys.c
index f2289de20e19..ad692183dfe9 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -69,6 +69,8 @@
69#include <asm/io.h> 69#include <asm/io.h>
70#include <asm/unistd.h> 70#include <asm/unistd.h>
71 71
72#include "uid16.h"
73
72#ifndef SET_UNALIGN_CTL 74#ifndef SET_UNALIGN_CTL
73# define SET_UNALIGN_CTL(a, b) (-EINVAL) 75# define SET_UNALIGN_CTL(a, b) (-EINVAL)
74#endif 76#endif
@@ -340,7 +342,7 @@ out_unlock:
340 * operations (as far as semantic preservation is concerned). 342 * operations (as far as semantic preservation is concerned).
341 */ 343 */
342#ifdef CONFIG_MULTIUSER 344#ifdef CONFIG_MULTIUSER
343SYSCALL_DEFINE2(setregid, gid_t, rgid, gid_t, egid) 345long __sys_setregid(gid_t rgid, gid_t egid)
344{ 346{
345 struct user_namespace *ns = current_user_ns(); 347 struct user_namespace *ns = current_user_ns();
346 const struct cred *old; 348 const struct cred *old;
@@ -392,12 +394,17 @@ error:
392 return retval; 394 return retval;
393} 395}
394 396
397SYSCALL_DEFINE2(setregid, gid_t, rgid, gid_t, egid)
398{
399 return __sys_setregid(rgid, egid);
400}
401
395/* 402/*
396 * setgid() is implemented like SysV w/ SAVED_IDS 403 * setgid() is implemented like SysV w/ SAVED_IDS
397 * 404 *
398 * SMP: Same implicit races as above. 405 * SMP: Same implicit races as above.
399 */ 406 */
400SYSCALL_DEFINE1(setgid, gid_t, gid) 407long __sys_setgid(gid_t gid)
401{ 408{
402 struct user_namespace *ns = current_user_ns(); 409 struct user_namespace *ns = current_user_ns();
403 const struct cred *old; 410 const struct cred *old;
@@ -429,6 +436,11 @@ error:
429 return retval; 436 return retval;
430} 437}
431 438
439SYSCALL_DEFINE1(setgid, gid_t, gid)
440{
441 return __sys_setgid(gid);
442}
443
432/* 444/*
433 * change the user struct in a credentials set to match the new UID 445 * change the user struct in a credentials set to match the new UID
434 */ 446 */
@@ -473,7 +485,7 @@ static int set_user(struct cred *new)
473 * 100% compatible with BSD. A program which uses just setuid() will be 485 * 100% compatible with BSD. A program which uses just setuid() will be
474 * 100% compatible with POSIX with saved IDs. 486 * 100% compatible with POSIX with saved IDs.
475 */ 487 */
476SYSCALL_DEFINE2(setreuid, uid_t, ruid, uid_t, euid) 488long __sys_setreuid(uid_t ruid, uid_t euid)
477{ 489{
478 struct user_namespace *ns = current_user_ns(); 490 struct user_namespace *ns = current_user_ns();
479 const struct cred *old; 491 const struct cred *old;
@@ -533,6 +545,11 @@ error:
533 return retval; 545 return retval;
534} 546}
535 547
548SYSCALL_DEFINE2(setreuid, uid_t, ruid, uid_t, euid)
549{
550 return __sys_setreuid(ruid, euid);
551}
552
536/* 553/*
537 * setuid() is implemented like SysV with SAVED_IDS 554 * setuid() is implemented like SysV with SAVED_IDS
538 * 555 *
@@ -544,7 +561,7 @@ error:
544 * will allow a root program to temporarily drop privileges and be able to 561 * will allow a root program to temporarily drop privileges and be able to
545 * regain them by swapping the real and effective uid. 562 * regain them by swapping the real and effective uid.
546 */ 563 */
547SYSCALL_DEFINE1(setuid, uid_t, uid) 564long __sys_setuid(uid_t uid)
548{ 565{
549 struct user_namespace *ns = current_user_ns(); 566 struct user_namespace *ns = current_user_ns();
550 const struct cred *old; 567 const struct cred *old;
@@ -586,12 +603,17 @@ error:
586 return retval; 603 return retval;
587} 604}
588 605
606SYSCALL_DEFINE1(setuid, uid_t, uid)
607{
608 return __sys_setuid(uid);
609}
610
589 611
590/* 612/*
591 * This function implements a generic ability to update ruid, euid, 613 * This function implements a generic ability to update ruid, euid,
592 * and suid. This allows you to implement the 4.4 compatible seteuid(). 614 * and suid. This allows you to implement the 4.4 compatible seteuid().
593 */ 615 */
594SYSCALL_DEFINE3(setresuid, uid_t, ruid, uid_t, euid, uid_t, suid) 616long __sys_setresuid(uid_t ruid, uid_t euid, uid_t suid)
595{ 617{
596 struct user_namespace *ns = current_user_ns(); 618 struct user_namespace *ns = current_user_ns();
597 const struct cred *old; 619 const struct cred *old;
@@ -656,6 +678,11 @@ error:
656 return retval; 678 return retval;
657} 679}
658 680
681SYSCALL_DEFINE3(setresuid, uid_t, ruid, uid_t, euid, uid_t, suid)
682{
683 return __sys_setresuid(ruid, euid, suid);
684}
685
659SYSCALL_DEFINE3(getresuid, uid_t __user *, ruidp, uid_t __user *, euidp, uid_t __user *, suidp) 686SYSCALL_DEFINE3(getresuid, uid_t __user *, ruidp, uid_t __user *, euidp, uid_t __user *, suidp)
660{ 687{
661 const struct cred *cred = current_cred(); 688 const struct cred *cred = current_cred();
@@ -678,7 +705,7 @@ SYSCALL_DEFINE3(getresuid, uid_t __user *, ruidp, uid_t __user *, euidp, uid_t _
678/* 705/*
679 * Same as above, but for rgid, egid, sgid. 706 * Same as above, but for rgid, egid, sgid.
680 */ 707 */
681SYSCALL_DEFINE3(setresgid, gid_t, rgid, gid_t, egid, gid_t, sgid) 708long __sys_setresgid(gid_t rgid, gid_t egid, gid_t sgid)
682{ 709{
683 struct user_namespace *ns = current_user_ns(); 710 struct user_namespace *ns = current_user_ns();
684 const struct cred *old; 711 const struct cred *old;
@@ -730,6 +757,11 @@ error:
730 return retval; 757 return retval;
731} 758}
732 759
760SYSCALL_DEFINE3(setresgid, gid_t, rgid, gid_t, egid, gid_t, sgid)
761{
762 return __sys_setresgid(rgid, egid, sgid);
763}
764
733SYSCALL_DEFINE3(getresgid, gid_t __user *, rgidp, gid_t __user *, egidp, gid_t __user *, sgidp) 765SYSCALL_DEFINE3(getresgid, gid_t __user *, rgidp, gid_t __user *, egidp, gid_t __user *, sgidp)
734{ 766{
735 const struct cred *cred = current_cred(); 767 const struct cred *cred = current_cred();
@@ -757,7 +789,7 @@ SYSCALL_DEFINE3(getresgid, gid_t __user *, rgidp, gid_t __user *, egidp, gid_t _
757 * whatever uid it wants to). It normally shadows "euid", except when 789 * whatever uid it wants to). It normally shadows "euid", except when
758 * explicitly set by setfsuid() or for access.. 790 * explicitly set by setfsuid() or for access..
759 */ 791 */
760SYSCALL_DEFINE1(setfsuid, uid_t, uid) 792long __sys_setfsuid(uid_t uid)
761{ 793{
762 const struct cred *old; 794 const struct cred *old;
763 struct cred *new; 795 struct cred *new;
@@ -793,10 +825,15 @@ change_okay:
793 return old_fsuid; 825 return old_fsuid;
794} 826}
795 827
828SYSCALL_DEFINE1(setfsuid, uid_t, uid)
829{
830 return __sys_setfsuid(uid);
831}
832
796/* 833/*
797 * Samma på svenska.. 834 * Samma på svenska..
798 */ 835 */
799SYSCALL_DEFINE1(setfsgid, gid_t, gid) 836long __sys_setfsgid(gid_t gid)
800{ 837{
801 const struct cred *old; 838 const struct cred *old;
802 struct cred *new; 839 struct cred *new;
@@ -830,6 +867,11 @@ change_okay:
830 commit_creds(new); 867 commit_creds(new);
831 return old_fsgid; 868 return old_fsgid;
832} 869}
870
871SYSCALL_DEFINE1(setfsgid, gid_t, gid)
872{
873 return __sys_setfsgid(gid);
874}
833#endif /* CONFIG_MULTIUSER */ 875#endif /* CONFIG_MULTIUSER */
834 876
835/** 877/**
@@ -1027,7 +1069,7 @@ out:
1027 return err; 1069 return err;
1028} 1070}
1029 1071
1030SYSCALL_DEFINE1(getpgid, pid_t, pid) 1072static int do_getpgid(pid_t pid)
1031{ 1073{
1032 struct task_struct *p; 1074 struct task_struct *p;
1033 struct pid *grp; 1075 struct pid *grp;
@@ -1055,11 +1097,16 @@ out:
1055 return retval; 1097 return retval;
1056} 1098}
1057 1099
1100SYSCALL_DEFINE1(getpgid, pid_t, pid)
1101{
1102 return do_getpgid(pid);
1103}
1104
1058#ifdef __ARCH_WANT_SYS_GETPGRP 1105#ifdef __ARCH_WANT_SYS_GETPGRP
1059 1106
1060SYSCALL_DEFINE0(getpgrp) 1107SYSCALL_DEFINE0(getpgrp)
1061{ 1108{
1062 return sys_getpgid(0); 1109 return do_getpgid(0);
1063} 1110}
1064 1111
1065#endif 1112#endif
@@ -1103,7 +1150,7 @@ static void set_special_pids(struct pid *pid)
1103 change_pid(curr, PIDTYPE_PGID, pid); 1150 change_pid(curr, PIDTYPE_PGID, pid);
1104} 1151}
1105 1152
1106SYSCALL_DEFINE0(setsid) 1153int ksys_setsid(void)
1107{ 1154{
1108 struct task_struct *group_leader = current->group_leader; 1155 struct task_struct *group_leader = current->group_leader;
1109 struct pid *sid = task_pid(group_leader); 1156 struct pid *sid = task_pid(group_leader);
@@ -1136,6 +1183,11 @@ out:
1136 return err; 1183 return err;
1137} 1184}
1138 1185
1186SYSCALL_DEFINE0(setsid)
1187{
1188 return ksys_setsid();
1189}
1190
1139DECLARE_RWSEM(uts_sem); 1191DECLARE_RWSEM(uts_sem);
1140 1192
1141#ifdef COMPAT_UTS_MACHINE 1193#ifdef COMPAT_UTS_MACHINE
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index b5189762d275..6cafc008f6db 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -17,245 +17,406 @@ asmlinkage long sys_ni_syscall(void)
17 return -ENOSYS; 17 return -ENOSYS;
18} 18}
19 19
20cond_syscall(sys_quotactl); 20#define COND_SYSCALL(name) cond_syscall(sys_##name)
21cond_syscall(sys32_quotactl); 21#define COND_SYSCALL_COMPAT(name) cond_syscall(compat_sys_##name)
22cond_syscall(sys_acct); 22
23cond_syscall(sys_lookup_dcookie); 23/*
24cond_syscall(compat_sys_lookup_dcookie); 24 * This list is kept in the same order as include/uapi/asm-generic/unistd.h.
25cond_syscall(sys_swapon); 25 * Architecture specific entries go below, followed by deprecated or obsolete
26cond_syscall(sys_swapoff); 26 * system calls.
27cond_syscall(sys_kexec_load); 27 */
28cond_syscall(compat_sys_kexec_load); 28
29cond_syscall(sys_kexec_file_load); 29COND_SYSCALL(io_setup);
30cond_syscall(sys_init_module); 30COND_SYSCALL_COMPAT(io_setup);
31cond_syscall(sys_finit_module); 31COND_SYSCALL(io_destroy);
32cond_syscall(sys_delete_module); 32COND_SYSCALL(io_submit);
33cond_syscall(sys_socketpair); 33COND_SYSCALL_COMPAT(io_submit);
34cond_syscall(sys_bind); 34COND_SYSCALL(io_cancel);
35cond_syscall(sys_listen); 35COND_SYSCALL(io_getevents);
36cond_syscall(sys_accept); 36COND_SYSCALL_COMPAT(io_getevents);
37cond_syscall(sys_accept4); 37
38cond_syscall(sys_connect); 38/* fs/xattr.c */
39cond_syscall(sys_getsockname); 39
40cond_syscall(sys_getpeername); 40/* fs/dcache.c */
41cond_syscall(sys_sendto); 41
42cond_syscall(sys_send); 42/* fs/cookies.c */
43cond_syscall(sys_recvfrom); 43COND_SYSCALL(lookup_dcookie);
44cond_syscall(sys_recv); 44COND_SYSCALL_COMPAT(lookup_dcookie);
45cond_syscall(sys_socket); 45
46cond_syscall(sys_setsockopt); 46/* fs/eventfd.c */
47cond_syscall(compat_sys_setsockopt); 47COND_SYSCALL(eventfd2);
48cond_syscall(sys_getsockopt); 48
49cond_syscall(compat_sys_getsockopt); 49/* fs/eventfd.c */
50cond_syscall(sys_shutdown); 50COND_SYSCALL(epoll_create1);
51cond_syscall(sys_sendmsg); 51COND_SYSCALL(epoll_ctl);
52cond_syscall(sys_sendmmsg); 52COND_SYSCALL(epoll_pwait);
53cond_syscall(compat_sys_sendmsg); 53COND_SYSCALL_COMPAT(epoll_pwait);
54cond_syscall(compat_sys_sendmmsg); 54
55cond_syscall(sys_recvmsg); 55/* fs/fcntl.c */
56cond_syscall(sys_recvmmsg); 56
57cond_syscall(compat_sys_recvmsg); 57/* fs/inotify_user.c */
58cond_syscall(compat_sys_recv); 58COND_SYSCALL(inotify_init1);
59cond_syscall(compat_sys_recvfrom); 59COND_SYSCALL(inotify_add_watch);
60cond_syscall(compat_sys_recvmmsg); 60COND_SYSCALL(inotify_rm_watch);
61cond_syscall(sys_socketcall); 61
62cond_syscall(sys_futex); 62/* fs/ioctl.c */
63cond_syscall(compat_sys_futex); 63
64cond_syscall(sys_set_robust_list); 64/* fs/ioprio.c */
65cond_syscall(compat_sys_set_robust_list); 65COND_SYSCALL(ioprio_set);
66cond_syscall(sys_get_robust_list); 66COND_SYSCALL(ioprio_get);
67cond_syscall(compat_sys_get_robust_list); 67
68cond_syscall(sys_epoll_create); 68/* fs/locks.c */
69cond_syscall(sys_epoll_create1); 69COND_SYSCALL(flock);
70cond_syscall(sys_epoll_ctl); 70
71cond_syscall(sys_epoll_wait); 71/* fs/namei.c */
72cond_syscall(sys_epoll_pwait); 72
73cond_syscall(compat_sys_epoll_pwait); 73/* fs/namespace.c */
74cond_syscall(sys_semget); 74
75cond_syscall(sys_semop); 75/* fs/nfsctl.c */
76cond_syscall(sys_semtimedop); 76
77cond_syscall(compat_sys_semtimedop); 77/* fs/open.c */
78cond_syscall(sys_semctl); 78
79cond_syscall(compat_sys_semctl); 79/* fs/pipe.c */
80cond_syscall(sys_msgget); 80
81cond_syscall(sys_msgsnd); 81/* fs/quota.c */
82cond_syscall(compat_sys_msgsnd); 82COND_SYSCALL(quotactl);
83cond_syscall(sys_msgrcv); 83
84cond_syscall(compat_sys_msgrcv); 84/* fs/readdir.c */
85cond_syscall(sys_msgctl); 85
86cond_syscall(compat_sys_msgctl); 86/* fs/read_write.c */
87cond_syscall(sys_shmget); 87
88cond_syscall(sys_shmat); 88/* fs/sendfile.c */
89cond_syscall(compat_sys_shmat); 89
90cond_syscall(sys_shmdt); 90/* fs/select.c */
91cond_syscall(sys_shmctl); 91
92cond_syscall(compat_sys_shmctl); 92/* fs/signalfd.c */
93cond_syscall(sys_mq_open); 93COND_SYSCALL(signalfd4);
94cond_syscall(sys_mq_unlink); 94COND_SYSCALL_COMPAT(signalfd4);
95cond_syscall(sys_mq_timedsend); 95
96cond_syscall(sys_mq_timedreceive); 96/* fs/splice.c */
97cond_syscall(sys_mq_notify); 97
98cond_syscall(sys_mq_getsetattr); 98/* fs/stat.c */
99cond_syscall(compat_sys_mq_open); 99
100cond_syscall(compat_sys_mq_timedsend); 100/* fs/sync.c */
101cond_syscall(compat_sys_mq_timedreceive); 101
102cond_syscall(compat_sys_mq_notify); 102/* fs/timerfd.c */
103cond_syscall(compat_sys_mq_getsetattr); 103COND_SYSCALL(timerfd_create);
104cond_syscall(sys_mbind); 104COND_SYSCALL(timerfd_settime);
105cond_syscall(sys_get_mempolicy); 105COND_SYSCALL_COMPAT(timerfd_settime);
106cond_syscall(sys_set_mempolicy); 106COND_SYSCALL(timerfd_gettime);
107cond_syscall(compat_sys_mbind); 107COND_SYSCALL_COMPAT(timerfd_gettime);
108cond_syscall(compat_sys_get_mempolicy); 108
109cond_syscall(compat_sys_set_mempolicy); 109/* fs/utimes.c */
110cond_syscall(sys_add_key); 110
111cond_syscall(sys_request_key); 111/* kernel/acct.c */
112cond_syscall(sys_keyctl); 112COND_SYSCALL(acct);
113cond_syscall(compat_sys_keyctl); 113
114cond_syscall(compat_sys_socketcall); 114/* kernel/capability.c */
115cond_syscall(sys_inotify_init); 115COND_SYSCALL(capget);
116cond_syscall(sys_inotify_init1); 116COND_SYSCALL(capset);
117cond_syscall(sys_inotify_add_watch); 117
118cond_syscall(sys_inotify_rm_watch); 118/* kernel/exec_domain.c */
119cond_syscall(sys_migrate_pages); 119
120cond_syscall(sys_move_pages); 120/* kernel/exit.c */
121cond_syscall(sys_chown16); 121
122cond_syscall(sys_fchown16); 122/* kernel/fork.c */
123cond_syscall(sys_getegid16); 123
124cond_syscall(sys_geteuid16); 124/* kernel/futex.c */
125cond_syscall(sys_getgid16); 125COND_SYSCALL(futex);
126cond_syscall(sys_getgroups16); 126COND_SYSCALL_COMPAT(futex);
127cond_syscall(sys_getresgid16); 127COND_SYSCALL(set_robust_list);
128cond_syscall(sys_getresuid16); 128COND_SYSCALL_COMPAT(set_robust_list);
129cond_syscall(sys_getuid16); 129COND_SYSCALL(get_robust_list);
130cond_syscall(sys_lchown16); 130COND_SYSCALL_COMPAT(get_robust_list);
131cond_syscall(sys_setfsgid16); 131
132cond_syscall(sys_setfsuid16); 132/* kernel/hrtimer.c */
133cond_syscall(sys_setgid16); 133
134cond_syscall(sys_setgroups16); 134/* kernel/itimer.c */
135cond_syscall(sys_setregid16); 135
136cond_syscall(sys_setresgid16); 136/* kernel/kexec.c */
137cond_syscall(sys_setresuid16); 137COND_SYSCALL(kexec_load);
138cond_syscall(sys_setreuid16); 138COND_SYSCALL_COMPAT(kexec_load);
139cond_syscall(sys_setuid16); 139
140cond_syscall(sys_sgetmask); 140/* kernel/module.c */
141cond_syscall(sys_ssetmask); 141COND_SYSCALL(init_module);
142cond_syscall(sys_vm86old); 142COND_SYSCALL(delete_module);
143cond_syscall(sys_vm86); 143
144cond_syscall(sys_modify_ldt); 144/* kernel/posix-timers.c */
145cond_syscall(sys_ipc); 145
146cond_syscall(compat_sys_ipc); 146/* kernel/printk.c */
147cond_syscall(compat_sys_sysctl); 147COND_SYSCALL(syslog);
148cond_syscall(sys_flock); 148
149cond_syscall(sys_io_setup); 149/* kernel/ptrace.c */
150cond_syscall(sys_io_destroy); 150
151cond_syscall(sys_io_submit); 151/* kernel/sched/core.c */
152cond_syscall(sys_io_cancel); 152
153cond_syscall(sys_io_getevents); 153/* kernel/signal.c */
154cond_syscall(compat_sys_io_setup); 154
155cond_syscall(compat_sys_io_submit); 155/* kernel/sys.c */
156cond_syscall(compat_sys_io_getevents); 156COND_SYSCALL(setregid);
157cond_syscall(sys_sysfs); 157COND_SYSCALL(setgid);
158cond_syscall(sys_syslog); 158COND_SYSCALL(setreuid);
159cond_syscall(sys_process_vm_readv); 159COND_SYSCALL(setuid);
160cond_syscall(sys_process_vm_writev); 160COND_SYSCALL(setresuid);
161cond_syscall(compat_sys_process_vm_readv); 161COND_SYSCALL(getresuid);
162cond_syscall(compat_sys_process_vm_writev); 162COND_SYSCALL(setresgid);
163cond_syscall(sys_uselib); 163COND_SYSCALL(getresgid);
164cond_syscall(sys_fadvise64); 164COND_SYSCALL(setfsuid);
165cond_syscall(sys_fadvise64_64); 165COND_SYSCALL(setfsgid);
166cond_syscall(sys_madvise); 166COND_SYSCALL(setgroups);
167cond_syscall(sys_setuid); 167COND_SYSCALL(getgroups);
168cond_syscall(sys_setregid); 168
169cond_syscall(sys_setgid); 169/* kernel/time.c */
170cond_syscall(sys_setreuid); 170
171cond_syscall(sys_setresuid); 171/* kernel/timer.c */
172cond_syscall(sys_getresuid); 172
173cond_syscall(sys_setresgid); 173/* ipc/mqueue.c */
174cond_syscall(sys_getresgid); 174COND_SYSCALL(mq_open);
175cond_syscall(sys_setgroups); 175COND_SYSCALL_COMPAT(mq_open);
176cond_syscall(sys_getgroups); 176COND_SYSCALL(mq_unlink);
177cond_syscall(sys_setfsuid); 177COND_SYSCALL(mq_timedsend);
178cond_syscall(sys_setfsgid); 178COND_SYSCALL_COMPAT(mq_timedsend);
179cond_syscall(sys_capget); 179COND_SYSCALL(mq_timedreceive);
180cond_syscall(sys_capset); 180COND_SYSCALL_COMPAT(mq_timedreceive);
181cond_syscall(sys_copy_file_range); 181COND_SYSCALL(mq_notify);
182 182COND_SYSCALL_COMPAT(mq_notify);
183/* arch-specific weak syscall entries */ 183COND_SYSCALL(mq_getsetattr);
184cond_syscall(sys_pciconfig_read); 184COND_SYSCALL_COMPAT(mq_getsetattr);
185cond_syscall(sys_pciconfig_write); 185
186cond_syscall(sys_pciconfig_iobase); 186/* ipc/msg.c */
187cond_syscall(compat_sys_s390_ipc); 187COND_SYSCALL(msgget);
188cond_syscall(ppc_rtas); 188COND_SYSCALL(msgctl);
189cond_syscall(sys_spu_run); 189COND_SYSCALL_COMPAT(msgctl);
190cond_syscall(sys_spu_create); 190COND_SYSCALL(msgrcv);
191cond_syscall(sys_subpage_prot); 191COND_SYSCALL_COMPAT(msgrcv);
192cond_syscall(sys_s390_pci_mmio_read); 192COND_SYSCALL(msgsnd);
193cond_syscall(sys_s390_pci_mmio_write); 193COND_SYSCALL_COMPAT(msgsnd);
194 194
195/* mmu depending weak syscall entries */ 195/* ipc/sem.c */
196cond_syscall(sys_mprotect); 196COND_SYSCALL(semget);
197cond_syscall(sys_msync); 197COND_SYSCALL(semctl);
198cond_syscall(sys_mlock); 198COND_SYSCALL_COMPAT(semctl);
199cond_syscall(sys_munlock); 199COND_SYSCALL(semtimedop);
200cond_syscall(sys_mlockall); 200COND_SYSCALL_COMPAT(semtimedop);
201cond_syscall(sys_munlockall); 201COND_SYSCALL(semop);
202cond_syscall(sys_mlock2); 202
203cond_syscall(sys_mincore); 203/* ipc/shm.c */
204cond_syscall(sys_madvise); 204COND_SYSCALL(shmget);
205cond_syscall(sys_mremap); 205COND_SYSCALL(shmctl);
206cond_syscall(sys_remap_file_pages); 206COND_SYSCALL_COMPAT(shmctl);
207cond_syscall(compat_sys_move_pages); 207COND_SYSCALL(shmat);
208cond_syscall(compat_sys_migrate_pages); 208COND_SYSCALL_COMPAT(shmat);
209 209COND_SYSCALL(shmdt);
210/* block-layer dependent */ 210
211cond_syscall(sys_bdflush); 211/* net/socket.c */
212cond_syscall(sys_ioprio_set); 212COND_SYSCALL(socket);
213cond_syscall(sys_ioprio_get); 213COND_SYSCALL(socketpair);
214 214COND_SYSCALL(bind);
215/* New file descriptors */ 215COND_SYSCALL(listen);
216cond_syscall(sys_signalfd); 216COND_SYSCALL(accept);
217cond_syscall(sys_signalfd4); 217COND_SYSCALL(connect);
218cond_syscall(compat_sys_signalfd); 218COND_SYSCALL(getsockname);
219cond_syscall(compat_sys_signalfd4); 219COND_SYSCALL(getpeername);
220cond_syscall(sys_timerfd_create); 220COND_SYSCALL(setsockopt);
221cond_syscall(sys_timerfd_settime); 221COND_SYSCALL_COMPAT(setsockopt);
222cond_syscall(sys_timerfd_gettime); 222COND_SYSCALL(getsockopt);
223cond_syscall(compat_sys_timerfd_settime); 223COND_SYSCALL_COMPAT(getsockopt);
224cond_syscall(compat_sys_timerfd_gettime); 224COND_SYSCALL(sendto);
225cond_syscall(sys_eventfd); 225COND_SYSCALL(shutdown);
226cond_syscall(sys_eventfd2); 226COND_SYSCALL(recvfrom);
227cond_syscall(sys_memfd_create); 227COND_SYSCALL_COMPAT(recvfrom);
228cond_syscall(sys_userfaultfd); 228COND_SYSCALL(sendmsg);
229 229COND_SYSCALL_COMPAT(sendmsg);
230/* performance counters: */ 230COND_SYSCALL(recvmsg);
231cond_syscall(sys_perf_event_open); 231COND_SYSCALL_COMPAT(recvmsg);
232 232
233/* fanotify! */ 233/* mm/filemap.c */
234cond_syscall(sys_fanotify_init); 234
235cond_syscall(sys_fanotify_mark); 235/* mm/nommu.c, also with MMU */
236cond_syscall(compat_sys_fanotify_mark); 236COND_SYSCALL(mremap);
237
238/* security/keys/keyctl.c */
239COND_SYSCALL(add_key);
240COND_SYSCALL(request_key);
241COND_SYSCALL(keyctl);
242COND_SYSCALL_COMPAT(keyctl);
243
244/* arch/example/kernel/sys_example.c */
245
246/* mm/fadvise.c */
247COND_SYSCALL(fadvise64_64);
248
249/* mm/, CONFIG_MMU only */
250COND_SYSCALL(swapon);
251COND_SYSCALL(swapoff);
252COND_SYSCALL(mprotect);
253COND_SYSCALL(msync);
254COND_SYSCALL(mlock);
255COND_SYSCALL(munlock);
256COND_SYSCALL(mlockall);
257COND_SYSCALL(munlockall);
258COND_SYSCALL(mincore);
259COND_SYSCALL(madvise);
260COND_SYSCALL(remap_file_pages);
261COND_SYSCALL(mbind);
262COND_SYSCALL_COMPAT(mbind);
263COND_SYSCALL(get_mempolicy);
264COND_SYSCALL_COMPAT(get_mempolicy);
265COND_SYSCALL(set_mempolicy);
266COND_SYSCALL_COMPAT(set_mempolicy);
267COND_SYSCALL(migrate_pages);
268COND_SYSCALL_COMPAT(migrate_pages);
269COND_SYSCALL(move_pages);
270COND_SYSCALL_COMPAT(move_pages);
271
272COND_SYSCALL(perf_event_open);
273COND_SYSCALL(accept4);
274COND_SYSCALL(recvmmsg);
275COND_SYSCALL_COMPAT(recvmmsg);
276
277/*
278 * Architecture specific syscalls: see further below
279 */
280
281/* fanotify */
282COND_SYSCALL(fanotify_init);
283COND_SYSCALL(fanotify_mark);
237 284
238/* open by handle */ 285/* open by handle */
239cond_syscall(sys_name_to_handle_at); 286COND_SYSCALL(name_to_handle_at);
240cond_syscall(sys_open_by_handle_at); 287COND_SYSCALL(open_by_handle_at);
241cond_syscall(compat_sys_open_by_handle_at); 288COND_SYSCALL_COMPAT(open_by_handle_at);
289
290COND_SYSCALL(sendmmsg);
291COND_SYSCALL_COMPAT(sendmmsg);
292COND_SYSCALL(process_vm_readv);
293COND_SYSCALL_COMPAT(process_vm_readv);
294COND_SYSCALL(process_vm_writev);
295COND_SYSCALL_COMPAT(process_vm_writev);
242 296
243/* compare kernel pointers */ 297/* compare kernel pointers */
244cond_syscall(sys_kcmp); 298COND_SYSCALL(kcmp);
299
300COND_SYSCALL(finit_module);
245 301
246/* operate on Secure Computing state */ 302/* operate on Secure Computing state */
247cond_syscall(sys_seccomp); 303COND_SYSCALL(seccomp);
304
305COND_SYSCALL(memfd_create);
248 306
249/* access BPF programs and maps */ 307/* access BPF programs and maps */
250cond_syscall(sys_bpf); 308COND_SYSCALL(bpf);
251 309
252/* execveat */ 310/* execveat */
253cond_syscall(sys_execveat); 311COND_SYSCALL(execveat);
312
313COND_SYSCALL(userfaultfd);
254 314
255/* membarrier */ 315/* membarrier */
256cond_syscall(sys_membarrier); 316COND_SYSCALL(membarrier);
317
318COND_SYSCALL(mlock2);
319
320COND_SYSCALL(copy_file_range);
257 321
258/* memory protection keys */ 322/* memory protection keys */
259cond_syscall(sys_pkey_mprotect); 323COND_SYSCALL(pkey_mprotect);
260cond_syscall(sys_pkey_alloc); 324COND_SYSCALL(pkey_alloc);
261cond_syscall(sys_pkey_free); 325COND_SYSCALL(pkey_free);
326
327
328/*
329 * Architecture specific weak syscall entries.
330 */
331
332/* pciconfig: alpha, arm, arm64, ia64, sparc */
333COND_SYSCALL(pciconfig_read);
334COND_SYSCALL(pciconfig_write);
335COND_SYSCALL(pciconfig_iobase);
336
337/* sys_socketcall: arm, mips, x86, ... */
338COND_SYSCALL(socketcall);
339COND_SYSCALL_COMPAT(socketcall);
340
341/* compat syscalls for arm64, x86, ... */
342COND_SYSCALL_COMPAT(sysctl);
343COND_SYSCALL_COMPAT(fanotify_mark);
344
345/* x86 */
346COND_SYSCALL(vm86old);
347COND_SYSCALL(modify_ldt);
348COND_SYSCALL_COMPAT(quotactl32);
349COND_SYSCALL(vm86);
350COND_SYSCALL(kexec_file_load);
351
352/* s390 */
353COND_SYSCALL(s390_pci_mmio_read);
354COND_SYSCALL(s390_pci_mmio_write);
355COND_SYSCALL_COMPAT(s390_ipc);
356
357/* powerpc */
358cond_syscall(ppc_rtas);
359COND_SYSCALL(spu_run);
360COND_SYSCALL(spu_create);
361COND_SYSCALL(subpage_prot);
362
363
364/*
365 * Deprecated system calls which are still defined in
366 * include/uapi/asm-generic/unistd.h and wanted by >= 1 arch
367 */
368
369/* __ARCH_WANT_SYSCALL_NO_FLAGS */
370COND_SYSCALL(epoll_create);
371COND_SYSCALL(inotify_init);
372COND_SYSCALL(eventfd);
373COND_SYSCALL(signalfd);
374COND_SYSCALL_COMPAT(signalfd);
375
376/* __ARCH_WANT_SYSCALL_OFF_T */
377COND_SYSCALL(fadvise64);
378
379/* __ARCH_WANT_SYSCALL_DEPRECATED */
380COND_SYSCALL(epoll_wait);
381COND_SYSCALL(recv);
382COND_SYSCALL_COMPAT(recv);
383COND_SYSCALL(send);
384COND_SYSCALL(bdflush);
385COND_SYSCALL(uselib);
386
387
388/*
389 * The syscalls below are not found in include/uapi/asm-generic/unistd.h
390 */
391
392/* obsolete: SGETMASK_SYSCALL */
393COND_SYSCALL(sgetmask);
394COND_SYSCALL(ssetmask);
395
396/* obsolete: SYSFS_SYSCALL */
397COND_SYSCALL(sysfs);
398
399/* obsolete: __ARCH_WANT_SYS_IPC */
400COND_SYSCALL(ipc);
401COND_SYSCALL_COMPAT(ipc);
402
403/* obsolete: UID16 */
404COND_SYSCALL(chown16);
405COND_SYSCALL(fchown16);
406COND_SYSCALL(getegid16);
407COND_SYSCALL(geteuid16);
408COND_SYSCALL(getgid16);
409COND_SYSCALL(getgroups16);
410COND_SYSCALL(getresgid16);
411COND_SYSCALL(getresuid16);
412COND_SYSCALL(getuid16);
413COND_SYSCALL(lchown16);
414COND_SYSCALL(setfsgid16);
415COND_SYSCALL(setfsuid16);
416COND_SYSCALL(setgid16);
417COND_SYSCALL(setgroups16);
418COND_SYSCALL(setregid16);
419COND_SYSCALL(setresgid16);
420COND_SYSCALL(setresuid16);
421COND_SYSCALL(setreuid16);
422COND_SYSCALL(setuid16);
diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig
index f6b5f19223d6..78eabc41eaa6 100644
--- a/kernel/time/Kconfig
+++ b/kernel/time/Kconfig
@@ -113,16 +113,6 @@ config NO_HZ_FULL
113 113
114endchoice 114endchoice
115 115
116config NO_HZ_FULL_ALL
117 bool "Full dynticks system on all CPUs by default (except CPU 0)"
118 depends on NO_HZ_FULL
119 help
120 If the user doesn't pass the nohz_full boot option to
121 define the range of full dynticks CPUs, consider that all
122 CPUs in the system are full dynticks by default.
123 Note the boot CPU will still be kept outside the range to
124 handle the timekeeping duty.
125
126config NO_HZ 116config NO_HZ
127 bool "Old Idle dynticks config" 117 bool "Old Idle dynticks config"
128 depends on !ARCH_USES_GETTIMEOFFSET && GENERIC_CLOCKEVENTS 118 depends on !ARCH_USES_GETTIMEOFFSET && GENERIC_CLOCKEVENTS
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 29a5733eff83..5d4a0342f934 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -405,30 +405,12 @@ static int tick_nohz_cpu_down(unsigned int cpu)
405 return 0; 405 return 0;
406} 406}
407 407
408static int tick_nohz_init_all(void)
409{
410 int err = -1;
411
412#ifdef CONFIG_NO_HZ_FULL_ALL
413 if (!alloc_cpumask_var(&tick_nohz_full_mask, GFP_KERNEL)) {
414 WARN(1, "NO_HZ: Can't allocate full dynticks cpumask\n");
415 return err;
416 }
417 err = 0;
418 cpumask_setall(tick_nohz_full_mask);
419 tick_nohz_full_running = true;
420#endif
421 return err;
422}
423
424void __init tick_nohz_init(void) 408void __init tick_nohz_init(void)
425{ 409{
426 int cpu, ret; 410 int cpu, ret;
427 411
428 if (!tick_nohz_full_running) { 412 if (!tick_nohz_full_running)
429 if (tick_nohz_init_all() < 0) 413 return;
430 return;
431 }
432 414
433 /* 415 /*
434 * Full dynticks uses irq work to drive the tick rescheduling on safe 416 * Full dynticks uses irq work to drive the tick rescheduling on safe
@@ -481,11 +463,18 @@ static int __init setup_tick_nohz(char *str)
481 463
482__setup("nohz=", setup_tick_nohz); 464__setup("nohz=", setup_tick_nohz);
483 465
484int tick_nohz_tick_stopped(void) 466bool tick_nohz_tick_stopped(void)
485{ 467{
486 return __this_cpu_read(tick_cpu_sched.tick_stopped); 468 return __this_cpu_read(tick_cpu_sched.tick_stopped);
487} 469}
488 470
471bool tick_nohz_tick_stopped_cpu(int cpu)
472{
473 struct tick_sched *ts = per_cpu_ptr(&tick_cpu_sched, cpu);
474
475 return ts->tick_stopped;
476}
477
489/** 478/**
490 * tick_nohz_update_jiffies - update jiffies when idle was interrupted 479 * tick_nohz_update_jiffies - update jiffies when idle was interrupted
491 * 480 *
@@ -741,12 +730,6 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts,
741 delta = KTIME_MAX; 730 delta = KTIME_MAX;
742 } 731 }
743 732
744#ifdef CONFIG_NO_HZ_FULL
745 /* Limit the tick delta to the maximum scheduler deferment */
746 if (!ts->inidle)
747 delta = min(delta, scheduler_tick_max_deferment());
748#endif
749
750 /* Calculate the next expiry time */ 733 /* Calculate the next expiry time */
751 if (delta < (KTIME_MAX - basemono)) 734 if (delta < (KTIME_MAX - basemono))
752 expires = basemono + delta; 735 expires = basemono + delta;
@@ -953,13 +936,6 @@ void tick_nohz_idle_enter(void)
953 struct tick_sched *ts; 936 struct tick_sched *ts;
954 937
955 lockdep_assert_irqs_enabled(); 938 lockdep_assert_irqs_enabled();
956 /*
957 * Update the idle state in the scheduler domain hierarchy
958 * when tick_nohz_stop_sched_tick() is called from the idle loop.
959 * State will be updated to busy during the first busy tick after
960 * exiting idle.
961 */
962 set_cpu_sd_state_idle();
963 939
964 local_irq_disable(); 940 local_irq_disable();
965 941
diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c
index 55d6dff37daf..2c416509b834 100644
--- a/kernel/trace/trace_event_perf.c
+++ b/kernel/trace/trace_event_perf.c
@@ -8,6 +8,7 @@
8#include <linux/module.h> 8#include <linux/module.h>
9#include <linux/kprobes.h> 9#include <linux/kprobes.h>
10#include "trace.h" 10#include "trace.h"
11#include "trace_probe.h"
11 12
12static char __percpu *perf_trace_buf[PERF_NR_CONTEXTS]; 13static char __percpu *perf_trace_buf[PERF_NR_CONTEXTS];
13 14
@@ -237,6 +238,107 @@ void perf_trace_destroy(struct perf_event *p_event)
237 mutex_unlock(&event_mutex); 238 mutex_unlock(&event_mutex);
238} 239}
239 240
241#ifdef CONFIG_KPROBE_EVENTS
242int perf_kprobe_init(struct perf_event *p_event, bool is_retprobe)
243{
244 int ret;
245 char *func = NULL;
246 struct trace_event_call *tp_event;
247
248 if (p_event->attr.kprobe_func) {
249 func = kzalloc(KSYM_NAME_LEN, GFP_KERNEL);
250 if (!func)
251 return -ENOMEM;
252 ret = strncpy_from_user(
253 func, u64_to_user_ptr(p_event->attr.kprobe_func),
254 KSYM_NAME_LEN);
255 if (ret < 0)
256 goto out;
257
258 if (func[0] == '\0') {
259 kfree(func);
260 func = NULL;
261 }
262 }
263
264 tp_event = create_local_trace_kprobe(
265 func, (void *)(unsigned long)(p_event->attr.kprobe_addr),
266 p_event->attr.probe_offset, is_retprobe);
267 if (IS_ERR(tp_event)) {
268 ret = PTR_ERR(tp_event);
269 goto out;
270 }
271
272 ret = perf_trace_event_init(tp_event, p_event);
273 if (ret)
274 destroy_local_trace_kprobe(tp_event);
275out:
276 kfree(func);
277 return ret;
278}
279
280void perf_kprobe_destroy(struct perf_event *p_event)
281{
282 perf_trace_event_close(p_event);
283 perf_trace_event_unreg(p_event);
284
285 destroy_local_trace_kprobe(p_event->tp_event);
286}
287#endif /* CONFIG_KPROBE_EVENTS */
288
289#ifdef CONFIG_UPROBE_EVENTS
290int perf_uprobe_init(struct perf_event *p_event, bool is_retprobe)
291{
292 int ret;
293 char *path = NULL;
294 struct trace_event_call *tp_event;
295
296 if (!p_event->attr.uprobe_path)
297 return -EINVAL;
298 path = kzalloc(PATH_MAX, GFP_KERNEL);
299 if (!path)
300 return -ENOMEM;
301 ret = strncpy_from_user(
302 path, u64_to_user_ptr(p_event->attr.uprobe_path), PATH_MAX);
303 if (ret < 0)
304 goto out;
305 if (path[0] == '\0') {
306 ret = -EINVAL;
307 goto out;
308 }
309
310 tp_event = create_local_trace_uprobe(
311 path, p_event->attr.probe_offset, is_retprobe);
312 if (IS_ERR(tp_event)) {
313 ret = PTR_ERR(tp_event);
314 goto out;
315 }
316
317 /*
318 * local trace_uprobe need to hold event_mutex to call
319 * uprobe_buffer_enable() and uprobe_buffer_disable().
320 * event_mutex is not required for local trace_kprobes.
321 */
322 mutex_lock(&event_mutex);
323 ret = perf_trace_event_init(tp_event, p_event);
324 if (ret)
325 destroy_local_trace_uprobe(tp_event);
326 mutex_unlock(&event_mutex);
327out:
328 kfree(path);
329 return ret;
330}
331
332void perf_uprobe_destroy(struct perf_event *p_event)
333{
334 mutex_lock(&event_mutex);
335 perf_trace_event_close(p_event);
336 perf_trace_event_unreg(p_event);
337 mutex_unlock(&event_mutex);
338 destroy_local_trace_uprobe(p_event->tp_event);
339}
340#endif /* CONFIG_UPROBE_EVENTS */
341
240int perf_trace_add(struct perf_event *p_event, int flags) 342int perf_trace_add(struct perf_event *p_event, int flags)
241{ 343{
242 struct trace_event_call *tp_event = p_event->tp_event; 344 struct trace_event_call *tp_event = p_event->tp_event;
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index ae4147eaebd4..1cd3fb4d70f8 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -462,6 +462,14 @@ disable_trace_kprobe(struct trace_kprobe *tk, struct trace_event_file *file)
462 disable_kprobe(&tk->rp.kp); 462 disable_kprobe(&tk->rp.kp);
463 wait = 1; 463 wait = 1;
464 } 464 }
465
466 /*
467 * if tk is not added to any list, it must be a local trace_kprobe
468 * created with perf_event_open. We don't need to wait for these
469 * trace_kprobes
470 */
471 if (list_empty(&tk->list))
472 wait = 0;
465 out: 473 out:
466 if (wait) { 474 if (wait) {
467 /* 475 /*
@@ -1358,12 +1366,9 @@ static struct trace_event_functions kprobe_funcs = {
1358 .trace = print_kprobe_event 1366 .trace = print_kprobe_event
1359}; 1367};
1360 1368
1361static int register_kprobe_event(struct trace_kprobe *tk) 1369static inline void init_trace_event_call(struct trace_kprobe *tk,
1370 struct trace_event_call *call)
1362{ 1371{
1363 struct trace_event_call *call = &tk->tp.call;
1364 int ret;
1365
1366 /* Initialize trace_event_call */
1367 INIT_LIST_HEAD(&call->class->fields); 1372 INIT_LIST_HEAD(&call->class->fields);
1368 if (trace_kprobe_is_return(tk)) { 1373 if (trace_kprobe_is_return(tk)) {
1369 call->event.funcs = &kretprobe_funcs; 1374 call->event.funcs = &kretprobe_funcs;
@@ -1372,6 +1377,19 @@ static int register_kprobe_event(struct trace_kprobe *tk)
1372 call->event.funcs = &kprobe_funcs; 1377 call->event.funcs = &kprobe_funcs;
1373 call->class->define_fields = kprobe_event_define_fields; 1378 call->class->define_fields = kprobe_event_define_fields;
1374 } 1379 }
1380
1381 call->flags = TRACE_EVENT_FL_KPROBE;
1382 call->class->reg = kprobe_register;
1383 call->data = tk;
1384}
1385
1386static int register_kprobe_event(struct trace_kprobe *tk)
1387{
1388 struct trace_event_call *call = &tk->tp.call;
1389 int ret = 0;
1390
1391 init_trace_event_call(tk, call);
1392
1375 if (set_print_fmt(&tk->tp, trace_kprobe_is_return(tk)) < 0) 1393 if (set_print_fmt(&tk->tp, trace_kprobe_is_return(tk)) < 0)
1376 return -ENOMEM; 1394 return -ENOMEM;
1377 ret = register_trace_event(&call->event); 1395 ret = register_trace_event(&call->event);
@@ -1379,9 +1397,6 @@ static int register_kprobe_event(struct trace_kprobe *tk)
1379 kfree(call->print_fmt); 1397 kfree(call->print_fmt);
1380 return -ENODEV; 1398 return -ENODEV;
1381 } 1399 }
1382 call->flags = TRACE_EVENT_FL_KPROBE;
1383 call->class->reg = kprobe_register;
1384 call->data = tk;
1385 ret = trace_add_event_call(call); 1400 ret = trace_add_event_call(call);
1386 if (ret) { 1401 if (ret) {
1387 pr_info("Failed to register kprobe event: %s\n", 1402 pr_info("Failed to register kprobe event: %s\n",
@@ -1403,6 +1418,66 @@ static int unregister_kprobe_event(struct trace_kprobe *tk)
1403 return ret; 1418 return ret;
1404} 1419}
1405 1420
1421#ifdef CONFIG_PERF_EVENTS
1422/* create a trace_kprobe, but don't add it to global lists */
1423struct trace_event_call *
1424create_local_trace_kprobe(char *func, void *addr, unsigned long offs,
1425 bool is_return)
1426{
1427 struct trace_kprobe *tk;
1428 int ret;
1429 char *event;
1430
1431 /*
1432 * local trace_kprobes are not added to probe_list, so they are never
1433 * searched in find_trace_kprobe(). Therefore, there is no concern of
1434 * duplicated name here.
1435 */
1436 event = func ? func : "DUMMY_EVENT";
1437
1438 tk = alloc_trace_kprobe(KPROBE_EVENT_SYSTEM, event, (void *)addr, func,
1439 offs, 0 /* maxactive */, 0 /* nargs */,
1440 is_return);
1441
1442 if (IS_ERR(tk)) {
1443 pr_info("Failed to allocate trace_probe.(%d)\n",
1444 (int)PTR_ERR(tk));
1445 return ERR_CAST(tk);
1446 }
1447
1448 init_trace_event_call(tk, &tk->tp.call);
1449
1450 if (set_print_fmt(&tk->tp, trace_kprobe_is_return(tk)) < 0) {
1451 ret = -ENOMEM;
1452 goto error;
1453 }
1454
1455 ret = __register_trace_kprobe(tk);
1456 if (ret < 0)
1457 goto error;
1458
1459 return &tk->tp.call;
1460error:
1461 free_trace_kprobe(tk);
1462 return ERR_PTR(ret);
1463}
1464
1465void destroy_local_trace_kprobe(struct trace_event_call *event_call)
1466{
1467 struct trace_kprobe *tk;
1468
1469 tk = container_of(event_call, struct trace_kprobe, tp.call);
1470
1471 if (trace_probe_is_enabled(&tk->tp)) {
1472 WARN_ON(1);
1473 return;
1474 }
1475
1476 __unregister_trace_kprobe(tk);
1477 free_trace_kprobe(tk);
1478}
1479#endif /* CONFIG_PERF_EVENTS */
1480
1406/* Make a tracefs interface for controlling probe points */ 1481/* Make a tracefs interface for controlling probe points */
1407static __init int init_kprobe_trace(void) 1482static __init int init_kprobe_trace(void)
1408{ 1483{
diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h
index 6a4d3fa94042..75daff22ccea 100644
--- a/kernel/trace/trace_probe.h
+++ b/kernel/trace/trace_probe.h
@@ -416,3 +416,14 @@ store_trace_args(int ent_size, struct trace_probe *tp, struct pt_regs *regs,
416} 416}
417 417
418extern int set_print_fmt(struct trace_probe *tp, bool is_return); 418extern int set_print_fmt(struct trace_probe *tp, bool is_return);
419
420#ifdef CONFIG_PERF_EVENTS
421extern struct trace_event_call *
422create_local_trace_kprobe(char *func, void *addr, unsigned long offs,
423 bool is_return);
424extern void destroy_local_trace_kprobe(struct trace_event_call *event_call);
425
426extern struct trace_event_call *
427create_local_trace_uprobe(char *name, unsigned long offs, bool is_return);
428extern void destroy_local_trace_uprobe(struct trace_event_call *event_call);
429#endif
diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c
index 268029ae1be6..2014f4351ae0 100644
--- a/kernel/trace/trace_uprobe.c
+++ b/kernel/trace/trace_uprobe.c
@@ -1292,16 +1292,25 @@ static struct trace_event_functions uprobe_funcs = {
1292 .trace = print_uprobe_event 1292 .trace = print_uprobe_event
1293}; 1293};
1294 1294
1295static int register_uprobe_event(struct trace_uprobe *tu) 1295static inline void init_trace_event_call(struct trace_uprobe *tu,
1296 struct trace_event_call *call)
1296{ 1297{
1297 struct trace_event_call *call = &tu->tp.call;
1298 int ret;
1299
1300 /* Initialize trace_event_call */
1301 INIT_LIST_HEAD(&call->class->fields); 1298 INIT_LIST_HEAD(&call->class->fields);
1302 call->event.funcs = &uprobe_funcs; 1299 call->event.funcs = &uprobe_funcs;
1303 call->class->define_fields = uprobe_event_define_fields; 1300 call->class->define_fields = uprobe_event_define_fields;
1304 1301
1302 call->flags = TRACE_EVENT_FL_UPROBE;
1303 call->class->reg = trace_uprobe_register;
1304 call->data = tu;
1305}
1306
1307static int register_uprobe_event(struct trace_uprobe *tu)
1308{
1309 struct trace_event_call *call = &tu->tp.call;
1310 int ret = 0;
1311
1312 init_trace_event_call(tu, call);
1313
1305 if (set_print_fmt(&tu->tp, is_ret_probe(tu)) < 0) 1314 if (set_print_fmt(&tu->tp, is_ret_probe(tu)) < 0)
1306 return -ENOMEM; 1315 return -ENOMEM;
1307 1316
@@ -1311,9 +1320,6 @@ static int register_uprobe_event(struct trace_uprobe *tu)
1311 return -ENODEV; 1320 return -ENODEV;
1312 } 1321 }
1313 1322
1314 call->flags = TRACE_EVENT_FL_UPROBE;
1315 call->class->reg = trace_uprobe_register;
1316 call->data = tu;
1317 ret = trace_add_event_call(call); 1323 ret = trace_add_event_call(call);
1318 1324
1319 if (ret) { 1325 if (ret) {
@@ -1339,6 +1345,70 @@ static int unregister_uprobe_event(struct trace_uprobe *tu)
1339 return 0; 1345 return 0;
1340} 1346}
1341 1347
1348#ifdef CONFIG_PERF_EVENTS
1349struct trace_event_call *
1350create_local_trace_uprobe(char *name, unsigned long offs, bool is_return)
1351{
1352 struct trace_uprobe *tu;
1353 struct inode *inode;
1354 struct path path;
1355 int ret;
1356
1357 ret = kern_path(name, LOOKUP_FOLLOW, &path);
1358 if (ret)
1359 return ERR_PTR(ret);
1360
1361 inode = igrab(d_inode(path.dentry));
1362 path_put(&path);
1363
1364 if (!inode || !S_ISREG(inode->i_mode)) {
1365 iput(inode);
1366 return ERR_PTR(-EINVAL);
1367 }
1368
1369 /*
1370 * local trace_kprobes are not added to probe_list, so they are never
1371 * searched in find_trace_kprobe(). Therefore, there is no concern of
1372 * duplicated name "DUMMY_EVENT" here.
1373 */
1374 tu = alloc_trace_uprobe(UPROBE_EVENT_SYSTEM, "DUMMY_EVENT", 0,
1375 is_return);
1376
1377 if (IS_ERR(tu)) {
1378 pr_info("Failed to allocate trace_uprobe.(%d)\n",
1379 (int)PTR_ERR(tu));
1380 return ERR_CAST(tu);
1381 }
1382
1383 tu->offset = offs;
1384 tu->inode = inode;
1385 tu->filename = kstrdup(name, GFP_KERNEL);
1386 init_trace_event_call(tu, &tu->tp.call);
1387
1388 if (set_print_fmt(&tu->tp, is_ret_probe(tu)) < 0) {
1389 ret = -ENOMEM;
1390 goto error;
1391 }
1392
1393 return &tu->tp.call;
1394error:
1395 free_trace_uprobe(tu);
1396 return ERR_PTR(ret);
1397}
1398
1399void destroy_local_trace_uprobe(struct trace_event_call *event_call)
1400{
1401 struct trace_uprobe *tu;
1402
1403 tu = container_of(event_call, struct trace_uprobe, tp.call);
1404
1405 kfree(tu->tp.call.print_fmt);
1406 tu->tp.call.print_fmt = NULL;
1407
1408 free_trace_uprobe(tu);
1409}
1410#endif /* CONFIG_PERF_EVENTS */
1411
1342/* Make a trace interface for controling probe points */ 1412/* Make a trace interface for controling probe points */
1343static __init int init_uprobe_trace(void) 1413static __init int init_uprobe_trace(void)
1344{ 1414{
diff --git a/kernel/uid16.c b/kernel/uid16.c
index ef1da2a5f9bd..af6925d8599b 100644
--- a/kernel/uid16.c
+++ b/kernel/uid16.c
@@ -18,44 +18,46 @@
18 18
19#include <linux/uaccess.h> 19#include <linux/uaccess.h>
20 20
21#include "uid16.h"
22
21SYSCALL_DEFINE3(chown16, const char __user *, filename, old_uid_t, user, old_gid_t, group) 23SYSCALL_DEFINE3(chown16, const char __user *, filename, old_uid_t, user, old_gid_t, group)
22{ 24{
23 return sys_chown(filename, low2highuid(user), low2highgid(group)); 25 return ksys_chown(filename, low2highuid(user), low2highgid(group));
24} 26}
25 27
26SYSCALL_DEFINE3(lchown16, const char __user *, filename, old_uid_t, user, old_gid_t, group) 28SYSCALL_DEFINE3(lchown16, const char __user *, filename, old_uid_t, user, old_gid_t, group)
27{ 29{
28 return sys_lchown(filename, low2highuid(user), low2highgid(group)); 30 return ksys_lchown(filename, low2highuid(user), low2highgid(group));
29} 31}
30 32
31SYSCALL_DEFINE3(fchown16, unsigned int, fd, old_uid_t, user, old_gid_t, group) 33SYSCALL_DEFINE3(fchown16, unsigned int, fd, old_uid_t, user, old_gid_t, group)
32{ 34{
33 return sys_fchown(fd, low2highuid(user), low2highgid(group)); 35 return ksys_fchown(fd, low2highuid(user), low2highgid(group));
34} 36}
35 37
36SYSCALL_DEFINE2(setregid16, old_gid_t, rgid, old_gid_t, egid) 38SYSCALL_DEFINE2(setregid16, old_gid_t, rgid, old_gid_t, egid)
37{ 39{
38 return sys_setregid(low2highgid(rgid), low2highgid(egid)); 40 return __sys_setregid(low2highgid(rgid), low2highgid(egid));
39} 41}
40 42
41SYSCALL_DEFINE1(setgid16, old_gid_t, gid) 43SYSCALL_DEFINE1(setgid16, old_gid_t, gid)
42{ 44{
43 return sys_setgid(low2highgid(gid)); 45 return __sys_setgid(low2highgid(gid));
44} 46}
45 47
46SYSCALL_DEFINE2(setreuid16, old_uid_t, ruid, old_uid_t, euid) 48SYSCALL_DEFINE2(setreuid16, old_uid_t, ruid, old_uid_t, euid)
47{ 49{
48 return sys_setreuid(low2highuid(ruid), low2highuid(euid)); 50 return __sys_setreuid(low2highuid(ruid), low2highuid(euid));
49} 51}
50 52
51SYSCALL_DEFINE1(setuid16, old_uid_t, uid) 53SYSCALL_DEFINE1(setuid16, old_uid_t, uid)
52{ 54{
53 return sys_setuid(low2highuid(uid)); 55 return __sys_setuid(low2highuid(uid));
54} 56}
55 57
56SYSCALL_DEFINE3(setresuid16, old_uid_t, ruid, old_uid_t, euid, old_uid_t, suid) 58SYSCALL_DEFINE3(setresuid16, old_uid_t, ruid, old_uid_t, euid, old_uid_t, suid)
57{ 59{
58 return sys_setresuid(low2highuid(ruid), low2highuid(euid), 60 return __sys_setresuid(low2highuid(ruid), low2highuid(euid),
59 low2highuid(suid)); 61 low2highuid(suid));
60} 62}
61 63
@@ -78,11 +80,10 @@ SYSCALL_DEFINE3(getresuid16, old_uid_t __user *, ruidp, old_uid_t __user *, euid
78 80
79SYSCALL_DEFINE3(setresgid16, old_gid_t, rgid, old_gid_t, egid, old_gid_t, sgid) 81SYSCALL_DEFINE3(setresgid16, old_gid_t, rgid, old_gid_t, egid, old_gid_t, sgid)
80{ 82{
81 return sys_setresgid(low2highgid(rgid), low2highgid(egid), 83 return __sys_setresgid(low2highgid(rgid), low2highgid(egid),
82 low2highgid(sgid)); 84 low2highgid(sgid));
83} 85}
84 86
85
86SYSCALL_DEFINE3(getresgid16, old_gid_t __user *, rgidp, old_gid_t __user *, egidp, old_gid_t __user *, sgidp) 87SYSCALL_DEFINE3(getresgid16, old_gid_t __user *, rgidp, old_gid_t __user *, egidp, old_gid_t __user *, sgidp)
87{ 88{
88 const struct cred *cred = current_cred(); 89 const struct cred *cred = current_cred();
@@ -102,12 +103,12 @@ SYSCALL_DEFINE3(getresgid16, old_gid_t __user *, rgidp, old_gid_t __user *, egid
102 103
103SYSCALL_DEFINE1(setfsuid16, old_uid_t, uid) 104SYSCALL_DEFINE1(setfsuid16, old_uid_t, uid)
104{ 105{
105 return sys_setfsuid(low2highuid(uid)); 106 return __sys_setfsuid(low2highuid(uid));
106} 107}
107 108
108SYSCALL_DEFINE1(setfsgid16, old_gid_t, gid) 109SYSCALL_DEFINE1(setfsgid16, old_gid_t, gid)
109{ 110{
110 return sys_setfsgid(low2highgid(gid)); 111 return __sys_setfsgid(low2highgid(gid));
111} 112}
112 113
113static int groups16_to_user(old_gid_t __user *grouplist, 114static int groups16_to_user(old_gid_t __user *grouplist,
diff --git a/kernel/uid16.h b/kernel/uid16.h
new file mode 100644
index 000000000000..cdca040f7602
--- /dev/null
+++ b/kernel/uid16.h
@@ -0,0 +1,14 @@
1/* SPDX-License-Identifier: GPL-2.0 */
2#ifndef LINUX_UID16_H
3#define LINUX_UID16_H
4
5long __sys_setuid(uid_t uid);
6long __sys_setgid(gid_t gid);
7long __sys_setreuid(uid_t ruid, uid_t euid);
8long __sys_setregid(gid_t rgid, gid_t egid);
9long __sys_setresuid(uid_t ruid, uid_t euid, uid_t suid);
10long __sys_setresgid(gid_t rgid, gid_t egid, gid_t sgid);
11long __sys_setfsuid(uid_t uid);
12long __sys_setfsgid(gid_t gid);
13
14#endif /* LINUX_UID16_H */
diff --git a/kernel/umh.c b/kernel/umh.c
index 18e5fa4b0e71..f76b3ff876cf 100644
--- a/kernel/umh.c
+++ b/kernel/umh.c
@@ -118,7 +118,7 @@ static void call_usermodehelper_exec_sync(struct subprocess_info *sub_info)
118{ 118{
119 pid_t pid; 119 pid_t pid;
120 120
121 /* If SIGCLD is ignored sys_wait4 won't populate the status. */ 121 /* If SIGCLD is ignored kernel_wait4 won't populate the status. */
122 kernel_sigaction(SIGCHLD, SIG_DFL); 122 kernel_sigaction(SIGCHLD, SIG_DFL);
123 pid = kernel_thread(call_usermodehelper_exec_async, sub_info, SIGCHLD); 123 pid = kernel_thread(call_usermodehelper_exec_async, sub_info, SIGCHLD);
124 if (pid < 0) { 124 if (pid < 0) {
@@ -135,7 +135,7 @@ static void call_usermodehelper_exec_sync(struct subprocess_info *sub_info)
135 * 135 *
136 * Thus the __user pointer cast is valid here. 136 * Thus the __user pointer cast is valid here.
137 */ 137 */
138 sys_wait4(pid, (int __user *)&ret, 0, NULL); 138 kernel_wait4(pid, (int __user *)&ret, 0, NULL);
139 139
140 /* 140 /*
141 * If ret is 0, either call_usermodehelper_exec_async failed and 141 * If ret is 0, either call_usermodehelper_exec_async failed and
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 6ec6ba65127b..254e636a3d6b 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -5573,12 +5573,13 @@ static void __init wq_numa_init(void)
5573int __init workqueue_init_early(void) 5573int __init workqueue_init_early(void)
5574{ 5574{
5575 int std_nice[NR_STD_WORKER_POOLS] = { 0, HIGHPRI_NICE_LEVEL }; 5575 int std_nice[NR_STD_WORKER_POOLS] = { 0, HIGHPRI_NICE_LEVEL };
5576 int hk_flags = HK_FLAG_DOMAIN | HK_FLAG_WQ;
5576 int i, cpu; 5577 int i, cpu;
5577 5578
5578 WARN_ON(__alignof__(struct pool_workqueue) < __alignof__(long long)); 5579 WARN_ON(__alignof__(struct pool_workqueue) < __alignof__(long long));
5579 5580
5580 BUG_ON(!alloc_cpumask_var(&wq_unbound_cpumask, GFP_KERNEL)); 5581 BUG_ON(!alloc_cpumask_var(&wq_unbound_cpumask, GFP_KERNEL));
5581 cpumask_copy(wq_unbound_cpumask, housekeeping_cpumask(HK_FLAG_DOMAIN)); 5582 cpumask_copy(wq_unbound_cpumask, housekeeping_cpumask(hk_flags));
5582 5583
5583 pwq_cache = KMEM_CACHE(pool_workqueue, SLAB_PANIC); 5584 pwq_cache = KMEM_CACHE(pool_workqueue, SLAB_PANIC);
5584 5585