diff options
author | Ingo Molnar <mingo@kernel.org> | 2018-04-05 03:20:34 -0400 |
---|---|---|
committer | Ingo Molnar <mingo@kernel.org> | 2018-04-05 03:20:34 -0400 |
commit | ea2a6af517714c52a1209795a03e863e96b460bb (patch) | |
tree | 3bd443bc9b23ceeaf3743eaf2d6d35ec63c620c9 /kernel | |
parent | 1b5d43cfb69759d8ef8d30469cea31d0c037aed5 (diff) | |
parent | 642e7fd23353e22290e3d51719fcb658dc252342 (diff) |
Merge branch 'linus' into sched/urgent, to pick up fixes and updates
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Diffstat (limited to 'kernel')
71 files changed, 3840 insertions, 2396 deletions
diff --git a/kernel/compat.c b/kernel/compat.c index 3f5fa8902e7d..6d21894806b4 100644 --- a/kernel/compat.c +++ b/kernel/compat.c | |||
@@ -488,61 +488,6 @@ get_compat_sigset(sigset_t *set, const compat_sigset_t __user *compat) | |||
488 | } | 488 | } |
489 | EXPORT_SYMBOL_GPL(get_compat_sigset); | 489 | EXPORT_SYMBOL_GPL(get_compat_sigset); |
490 | 490 | ||
491 | #ifdef CONFIG_NUMA | ||
492 | COMPAT_SYSCALL_DEFINE6(move_pages, pid_t, pid, compat_ulong_t, nr_pages, | ||
493 | compat_uptr_t __user *, pages32, | ||
494 | const int __user *, nodes, | ||
495 | int __user *, status, | ||
496 | int, flags) | ||
497 | { | ||
498 | const void __user * __user *pages; | ||
499 | int i; | ||
500 | |||
501 | pages = compat_alloc_user_space(nr_pages * sizeof(void *)); | ||
502 | for (i = 0; i < nr_pages; i++) { | ||
503 | compat_uptr_t p; | ||
504 | |||
505 | if (get_user(p, pages32 + i) || | ||
506 | put_user(compat_ptr(p), pages + i)) | ||
507 | return -EFAULT; | ||
508 | } | ||
509 | return sys_move_pages(pid, nr_pages, pages, nodes, status, flags); | ||
510 | } | ||
511 | |||
512 | COMPAT_SYSCALL_DEFINE4(migrate_pages, compat_pid_t, pid, | ||
513 | compat_ulong_t, maxnode, | ||
514 | const compat_ulong_t __user *, old_nodes, | ||
515 | const compat_ulong_t __user *, new_nodes) | ||
516 | { | ||
517 | unsigned long __user *old = NULL; | ||
518 | unsigned long __user *new = NULL; | ||
519 | nodemask_t tmp_mask; | ||
520 | unsigned long nr_bits; | ||
521 | unsigned long size; | ||
522 | |||
523 | nr_bits = min_t(unsigned long, maxnode - 1, MAX_NUMNODES); | ||
524 | size = ALIGN(nr_bits, BITS_PER_LONG) / 8; | ||
525 | if (old_nodes) { | ||
526 | if (compat_get_bitmap(nodes_addr(tmp_mask), old_nodes, nr_bits)) | ||
527 | return -EFAULT; | ||
528 | old = compat_alloc_user_space(new_nodes ? size * 2 : size); | ||
529 | if (new_nodes) | ||
530 | new = old + size / sizeof(unsigned long); | ||
531 | if (copy_to_user(old, nodes_addr(tmp_mask), size)) | ||
532 | return -EFAULT; | ||
533 | } | ||
534 | if (new_nodes) { | ||
535 | if (compat_get_bitmap(nodes_addr(tmp_mask), new_nodes, nr_bits)) | ||
536 | return -EFAULT; | ||
537 | if (new == NULL) | ||
538 | new = compat_alloc_user_space(size); | ||
539 | if (copy_to_user(new, nodes_addr(tmp_mask), size)) | ||
540 | return -EFAULT; | ||
541 | } | ||
542 | return sys_migrate_pages(pid, nr_bits + 1, old, new); | ||
543 | } | ||
544 | #endif | ||
545 | |||
546 | /* | 491 | /* |
547 | * Allocate user-space memory for the duration of a single system call, | 492 | * Allocate user-space memory for the duration of a single system call, |
548 | * in order to marshall parameters inside a compat thunk. | 493 | * in order to marshall parameters inside a compat thunk. |
diff --git a/kernel/cpu.c b/kernel/cpu.c index 53f7dc65f9a3..0db8938fbb23 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c | |||
@@ -124,24 +124,11 @@ struct cpuhp_step { | |||
124 | }; | 124 | }; |
125 | 125 | ||
126 | static DEFINE_MUTEX(cpuhp_state_mutex); | 126 | static DEFINE_MUTEX(cpuhp_state_mutex); |
127 | static struct cpuhp_step cpuhp_bp_states[]; | 127 | static struct cpuhp_step cpuhp_hp_states[]; |
128 | static struct cpuhp_step cpuhp_ap_states[]; | ||
129 | |||
130 | static bool cpuhp_is_ap_state(enum cpuhp_state state) | ||
131 | { | ||
132 | /* | ||
133 | * The extra check for CPUHP_TEARDOWN_CPU is only for documentation | ||
134 | * purposes as that state is handled explicitly in cpu_down. | ||
135 | */ | ||
136 | return state > CPUHP_BRINGUP_CPU && state != CPUHP_TEARDOWN_CPU; | ||
137 | } | ||
138 | 128 | ||
139 | static struct cpuhp_step *cpuhp_get_step(enum cpuhp_state state) | 129 | static struct cpuhp_step *cpuhp_get_step(enum cpuhp_state state) |
140 | { | 130 | { |
141 | struct cpuhp_step *sp; | 131 | return cpuhp_hp_states + state; |
142 | |||
143 | sp = cpuhp_is_ap_state(state) ? cpuhp_ap_states : cpuhp_bp_states; | ||
144 | return sp + state; | ||
145 | } | 132 | } |
146 | 133 | ||
147 | /** | 134 | /** |
@@ -239,6 +226,15 @@ err: | |||
239 | } | 226 | } |
240 | 227 | ||
241 | #ifdef CONFIG_SMP | 228 | #ifdef CONFIG_SMP |
229 | static bool cpuhp_is_ap_state(enum cpuhp_state state) | ||
230 | { | ||
231 | /* | ||
232 | * The extra check for CPUHP_TEARDOWN_CPU is only for documentation | ||
233 | * purposes as that state is handled explicitly in cpu_down. | ||
234 | */ | ||
235 | return state > CPUHP_BRINGUP_CPU && state != CPUHP_TEARDOWN_CPU; | ||
236 | } | ||
237 | |||
242 | static inline void wait_for_ap_thread(struct cpuhp_cpu_state *st, bool bringup) | 238 | static inline void wait_for_ap_thread(struct cpuhp_cpu_state *st, bool bringup) |
243 | { | 239 | { |
244 | struct completion *done = bringup ? &st->done_up : &st->done_down; | 240 | struct completion *done = bringup ? &st->done_up : &st->done_down; |
@@ -1224,7 +1220,7 @@ int __boot_cpu_id; | |||
1224 | #endif /* CONFIG_SMP */ | 1220 | #endif /* CONFIG_SMP */ |
1225 | 1221 | ||
1226 | /* Boot processor state steps */ | 1222 | /* Boot processor state steps */ |
1227 | static struct cpuhp_step cpuhp_bp_states[] = { | 1223 | static struct cpuhp_step cpuhp_hp_states[] = { |
1228 | [CPUHP_OFFLINE] = { | 1224 | [CPUHP_OFFLINE] = { |
1229 | .name = "offline", | 1225 | .name = "offline", |
1230 | .startup.single = NULL, | 1226 | .startup.single = NULL, |
@@ -1289,24 +1285,6 @@ static struct cpuhp_step cpuhp_bp_states[] = { | |||
1289 | .teardown.single = NULL, | 1285 | .teardown.single = NULL, |
1290 | .cant_stop = true, | 1286 | .cant_stop = true, |
1291 | }, | 1287 | }, |
1292 | /* | ||
1293 | * Handled on controll processor until the plugged processor manages | ||
1294 | * this itself. | ||
1295 | */ | ||
1296 | [CPUHP_TEARDOWN_CPU] = { | ||
1297 | .name = "cpu:teardown", | ||
1298 | .startup.single = NULL, | ||
1299 | .teardown.single = takedown_cpu, | ||
1300 | .cant_stop = true, | ||
1301 | }, | ||
1302 | #else | ||
1303 | [CPUHP_BRINGUP_CPU] = { }, | ||
1304 | #endif | ||
1305 | }; | ||
1306 | |||
1307 | /* Application processor state steps */ | ||
1308 | static struct cpuhp_step cpuhp_ap_states[] = { | ||
1309 | #ifdef CONFIG_SMP | ||
1310 | /* Final state before CPU kills itself */ | 1288 | /* Final state before CPU kills itself */ |
1311 | [CPUHP_AP_IDLE_DEAD] = { | 1289 | [CPUHP_AP_IDLE_DEAD] = { |
1312 | .name = "idle:dead", | 1290 | .name = "idle:dead", |
@@ -1340,6 +1318,16 @@ static struct cpuhp_step cpuhp_ap_states[] = { | |||
1340 | [CPUHP_AP_ONLINE] = { | 1318 | [CPUHP_AP_ONLINE] = { |
1341 | .name = "ap:online", | 1319 | .name = "ap:online", |
1342 | }, | 1320 | }, |
1321 | /* | ||
1322 | * Handled on controll processor until the plugged processor manages | ||
1323 | * this itself. | ||
1324 | */ | ||
1325 | [CPUHP_TEARDOWN_CPU] = { | ||
1326 | .name = "cpu:teardown", | ||
1327 | .startup.single = NULL, | ||
1328 | .teardown.single = takedown_cpu, | ||
1329 | .cant_stop = true, | ||
1330 | }, | ||
1343 | /* Handle smpboot threads park/unpark */ | 1331 | /* Handle smpboot threads park/unpark */ |
1344 | [CPUHP_AP_SMPBOOT_THREADS] = { | 1332 | [CPUHP_AP_SMPBOOT_THREADS] = { |
1345 | .name = "smpboot/threads:online", | 1333 | .name = "smpboot/threads:online", |
@@ -1408,11 +1396,11 @@ static int cpuhp_reserve_state(enum cpuhp_state state) | |||
1408 | 1396 | ||
1409 | switch (state) { | 1397 | switch (state) { |
1410 | case CPUHP_AP_ONLINE_DYN: | 1398 | case CPUHP_AP_ONLINE_DYN: |
1411 | step = cpuhp_ap_states + CPUHP_AP_ONLINE_DYN; | 1399 | step = cpuhp_hp_states + CPUHP_AP_ONLINE_DYN; |
1412 | end = CPUHP_AP_ONLINE_DYN_END; | 1400 | end = CPUHP_AP_ONLINE_DYN_END; |
1413 | break; | 1401 | break; |
1414 | case CPUHP_BP_PREPARE_DYN: | 1402 | case CPUHP_BP_PREPARE_DYN: |
1415 | step = cpuhp_bp_states + CPUHP_BP_PREPARE_DYN; | 1403 | step = cpuhp_hp_states + CPUHP_BP_PREPARE_DYN; |
1416 | end = CPUHP_BP_PREPARE_DYN_END; | 1404 | end = CPUHP_BP_PREPARE_DYN_END; |
1417 | break; | 1405 | break; |
1418 | default: | 1406 | default: |
diff --git a/kernel/events/core.c b/kernel/events/core.c index 709a55b9ad97..fc1c330c6bd6 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c | |||
@@ -430,7 +430,7 @@ static void update_perf_cpu_limits(void) | |||
430 | WRITE_ONCE(perf_sample_allowed_ns, tmp); | 430 | WRITE_ONCE(perf_sample_allowed_ns, tmp); |
431 | } | 431 | } |
432 | 432 | ||
433 | static int perf_rotate_context(struct perf_cpu_context *cpuctx); | 433 | static bool perf_rotate_context(struct perf_cpu_context *cpuctx); |
434 | 434 | ||
435 | int perf_proc_update_handler(struct ctl_table *table, int write, | 435 | int perf_proc_update_handler(struct ctl_table *table, int write, |
436 | void __user *buffer, size_t *lenp, | 436 | void __user *buffer, size_t *lenp, |
@@ -643,7 +643,7 @@ static void perf_event_update_sibling_time(struct perf_event *leader) | |||
643 | { | 643 | { |
644 | struct perf_event *sibling; | 644 | struct perf_event *sibling; |
645 | 645 | ||
646 | list_for_each_entry(sibling, &leader->sibling_list, group_entry) | 646 | for_each_sibling_event(sibling, leader) |
647 | perf_event_update_time(sibling); | 647 | perf_event_update_time(sibling); |
648 | } | 648 | } |
649 | 649 | ||
@@ -948,27 +948,39 @@ list_update_cgroup_event(struct perf_event *event, | |||
948 | if (!is_cgroup_event(event)) | 948 | if (!is_cgroup_event(event)) |
949 | return; | 949 | return; |
950 | 950 | ||
951 | if (add && ctx->nr_cgroups++) | ||
952 | return; | ||
953 | else if (!add && --ctx->nr_cgroups) | ||
954 | return; | ||
955 | /* | 951 | /* |
956 | * Because cgroup events are always per-cpu events, | 952 | * Because cgroup events are always per-cpu events, |
957 | * this will always be called from the right CPU. | 953 | * this will always be called from the right CPU. |
958 | */ | 954 | */ |
959 | cpuctx = __get_cpu_context(ctx); | 955 | cpuctx = __get_cpu_context(ctx); |
960 | cpuctx_entry = &cpuctx->cgrp_cpuctx_entry; | 956 | |
961 | /* cpuctx->cgrp is NULL unless a cgroup event is active in this CPU .*/ | 957 | /* |
962 | if (add) { | 958 | * Since setting cpuctx->cgrp is conditional on the current @cgrp |
959 | * matching the event's cgroup, we must do this for every new event, | ||
960 | * because if the first would mismatch, the second would not try again | ||
961 | * and we would leave cpuctx->cgrp unset. | ||
962 | */ | ||
963 | if (add && !cpuctx->cgrp) { | ||
963 | struct perf_cgroup *cgrp = perf_cgroup_from_task(current, ctx); | 964 | struct perf_cgroup *cgrp = perf_cgroup_from_task(current, ctx); |
964 | 965 | ||
965 | list_add(cpuctx_entry, this_cpu_ptr(&cgrp_cpuctx_list)); | ||
966 | if (cgroup_is_descendant(cgrp->css.cgroup, event->cgrp->css.cgroup)) | 966 | if (cgroup_is_descendant(cgrp->css.cgroup, event->cgrp->css.cgroup)) |
967 | cpuctx->cgrp = cgrp; | 967 | cpuctx->cgrp = cgrp; |
968 | } else { | ||
969 | list_del(cpuctx_entry); | ||
970 | cpuctx->cgrp = NULL; | ||
971 | } | 968 | } |
969 | |||
970 | if (add && ctx->nr_cgroups++) | ||
971 | return; | ||
972 | else if (!add && --ctx->nr_cgroups) | ||
973 | return; | ||
974 | |||
975 | /* no cgroup running */ | ||
976 | if (!add) | ||
977 | cpuctx->cgrp = NULL; | ||
978 | |||
979 | cpuctx_entry = &cpuctx->cgrp_cpuctx_entry; | ||
980 | if (add) | ||
981 | list_add(cpuctx_entry, this_cpu_ptr(&cgrp_cpuctx_list)); | ||
982 | else | ||
983 | list_del(cpuctx_entry); | ||
972 | } | 984 | } |
973 | 985 | ||
974 | #else /* !CONFIG_CGROUP_PERF */ | 986 | #else /* !CONFIG_CGROUP_PERF */ |
@@ -1052,7 +1064,7 @@ list_update_cgroup_event(struct perf_event *event, | |||
1052 | static enum hrtimer_restart perf_mux_hrtimer_handler(struct hrtimer *hr) | 1064 | static enum hrtimer_restart perf_mux_hrtimer_handler(struct hrtimer *hr) |
1053 | { | 1065 | { |
1054 | struct perf_cpu_context *cpuctx; | 1066 | struct perf_cpu_context *cpuctx; |
1055 | int rotations = 0; | 1067 | bool rotations; |
1056 | 1068 | ||
1057 | lockdep_assert_irqs_disabled(); | 1069 | lockdep_assert_irqs_disabled(); |
1058 | 1070 | ||
@@ -1471,8 +1483,21 @@ static enum event_type_t get_event_type(struct perf_event *event) | |||
1471 | return event_type; | 1483 | return event_type; |
1472 | } | 1484 | } |
1473 | 1485 | ||
1474 | static struct list_head * | 1486 | /* |
1475 | ctx_group_list(struct perf_event *event, struct perf_event_context *ctx) | 1487 | * Helper function to initialize event group nodes. |
1488 | */ | ||
1489 | static void init_event_group(struct perf_event *event) | ||
1490 | { | ||
1491 | RB_CLEAR_NODE(&event->group_node); | ||
1492 | event->group_index = 0; | ||
1493 | } | ||
1494 | |||
1495 | /* | ||
1496 | * Extract pinned or flexible groups from the context | ||
1497 | * based on event attrs bits. | ||
1498 | */ | ||
1499 | static struct perf_event_groups * | ||
1500 | get_event_groups(struct perf_event *event, struct perf_event_context *ctx) | ||
1476 | { | 1501 | { |
1477 | if (event->attr.pinned) | 1502 | if (event->attr.pinned) |
1478 | return &ctx->pinned_groups; | 1503 | return &ctx->pinned_groups; |
@@ -1481,6 +1506,156 @@ ctx_group_list(struct perf_event *event, struct perf_event_context *ctx) | |||
1481 | } | 1506 | } |
1482 | 1507 | ||
1483 | /* | 1508 | /* |
1509 | * Helper function to initializes perf_event_group trees. | ||
1510 | */ | ||
1511 | static void perf_event_groups_init(struct perf_event_groups *groups) | ||
1512 | { | ||
1513 | groups->tree = RB_ROOT; | ||
1514 | groups->index = 0; | ||
1515 | } | ||
1516 | |||
1517 | /* | ||
1518 | * Compare function for event groups; | ||
1519 | * | ||
1520 | * Implements complex key that first sorts by CPU and then by virtual index | ||
1521 | * which provides ordering when rotating groups for the same CPU. | ||
1522 | */ | ||
1523 | static bool | ||
1524 | perf_event_groups_less(struct perf_event *left, struct perf_event *right) | ||
1525 | { | ||
1526 | if (left->cpu < right->cpu) | ||
1527 | return true; | ||
1528 | if (left->cpu > right->cpu) | ||
1529 | return false; | ||
1530 | |||
1531 | if (left->group_index < right->group_index) | ||
1532 | return true; | ||
1533 | if (left->group_index > right->group_index) | ||
1534 | return false; | ||
1535 | |||
1536 | return false; | ||
1537 | } | ||
1538 | |||
1539 | /* | ||
1540 | * Insert @event into @groups' tree; using {@event->cpu, ++@groups->index} for | ||
1541 | * key (see perf_event_groups_less). This places it last inside the CPU | ||
1542 | * subtree. | ||
1543 | */ | ||
1544 | static void | ||
1545 | perf_event_groups_insert(struct perf_event_groups *groups, | ||
1546 | struct perf_event *event) | ||
1547 | { | ||
1548 | struct perf_event *node_event; | ||
1549 | struct rb_node *parent; | ||
1550 | struct rb_node **node; | ||
1551 | |||
1552 | event->group_index = ++groups->index; | ||
1553 | |||
1554 | node = &groups->tree.rb_node; | ||
1555 | parent = *node; | ||
1556 | |||
1557 | while (*node) { | ||
1558 | parent = *node; | ||
1559 | node_event = container_of(*node, struct perf_event, group_node); | ||
1560 | |||
1561 | if (perf_event_groups_less(event, node_event)) | ||
1562 | node = &parent->rb_left; | ||
1563 | else | ||
1564 | node = &parent->rb_right; | ||
1565 | } | ||
1566 | |||
1567 | rb_link_node(&event->group_node, parent, node); | ||
1568 | rb_insert_color(&event->group_node, &groups->tree); | ||
1569 | } | ||
1570 | |||
1571 | /* | ||
1572 | * Helper function to insert event into the pinned or flexible groups. | ||
1573 | */ | ||
1574 | static void | ||
1575 | add_event_to_groups(struct perf_event *event, struct perf_event_context *ctx) | ||
1576 | { | ||
1577 | struct perf_event_groups *groups; | ||
1578 | |||
1579 | groups = get_event_groups(event, ctx); | ||
1580 | perf_event_groups_insert(groups, event); | ||
1581 | } | ||
1582 | |||
1583 | /* | ||
1584 | * Delete a group from a tree. | ||
1585 | */ | ||
1586 | static void | ||
1587 | perf_event_groups_delete(struct perf_event_groups *groups, | ||
1588 | struct perf_event *event) | ||
1589 | { | ||
1590 | WARN_ON_ONCE(RB_EMPTY_NODE(&event->group_node) || | ||
1591 | RB_EMPTY_ROOT(&groups->tree)); | ||
1592 | |||
1593 | rb_erase(&event->group_node, &groups->tree); | ||
1594 | init_event_group(event); | ||
1595 | } | ||
1596 | |||
1597 | /* | ||
1598 | * Helper function to delete event from its groups. | ||
1599 | */ | ||
1600 | static void | ||
1601 | del_event_from_groups(struct perf_event *event, struct perf_event_context *ctx) | ||
1602 | { | ||
1603 | struct perf_event_groups *groups; | ||
1604 | |||
1605 | groups = get_event_groups(event, ctx); | ||
1606 | perf_event_groups_delete(groups, event); | ||
1607 | } | ||
1608 | |||
1609 | /* | ||
1610 | * Get the leftmost event in the @cpu subtree. | ||
1611 | */ | ||
1612 | static struct perf_event * | ||
1613 | perf_event_groups_first(struct perf_event_groups *groups, int cpu) | ||
1614 | { | ||
1615 | struct perf_event *node_event = NULL, *match = NULL; | ||
1616 | struct rb_node *node = groups->tree.rb_node; | ||
1617 | |||
1618 | while (node) { | ||
1619 | node_event = container_of(node, struct perf_event, group_node); | ||
1620 | |||
1621 | if (cpu < node_event->cpu) { | ||
1622 | node = node->rb_left; | ||
1623 | } else if (cpu > node_event->cpu) { | ||
1624 | node = node->rb_right; | ||
1625 | } else { | ||
1626 | match = node_event; | ||
1627 | node = node->rb_left; | ||
1628 | } | ||
1629 | } | ||
1630 | |||
1631 | return match; | ||
1632 | } | ||
1633 | |||
1634 | /* | ||
1635 | * Like rb_entry_next_safe() for the @cpu subtree. | ||
1636 | */ | ||
1637 | static struct perf_event * | ||
1638 | perf_event_groups_next(struct perf_event *event) | ||
1639 | { | ||
1640 | struct perf_event *next; | ||
1641 | |||
1642 | next = rb_entry_safe(rb_next(&event->group_node), typeof(*event), group_node); | ||
1643 | if (next && next->cpu == event->cpu) | ||
1644 | return next; | ||
1645 | |||
1646 | return NULL; | ||
1647 | } | ||
1648 | |||
1649 | /* | ||
1650 | * Iterate through the whole groups tree. | ||
1651 | */ | ||
1652 | #define perf_event_groups_for_each(event, groups) \ | ||
1653 | for (event = rb_entry_safe(rb_first(&((groups)->tree)), \ | ||
1654 | typeof(*event), group_node); event; \ | ||
1655 | event = rb_entry_safe(rb_next(&event->group_node), \ | ||
1656 | typeof(*event), group_node)) | ||
1657 | |||
1658 | /* | ||
1484 | * Add a event from the lists for its context. | 1659 | * Add a event from the lists for its context. |
1485 | * Must be called with ctx->mutex and ctx->lock held. | 1660 | * Must be called with ctx->mutex and ctx->lock held. |
1486 | */ | 1661 | */ |
@@ -1500,12 +1675,8 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx) | |||
1500 | * perf_group_detach can, at all times, locate all siblings. | 1675 | * perf_group_detach can, at all times, locate all siblings. |
1501 | */ | 1676 | */ |
1502 | if (event->group_leader == event) { | 1677 | if (event->group_leader == event) { |
1503 | struct list_head *list; | ||
1504 | |||
1505 | event->group_caps = event->event_caps; | 1678 | event->group_caps = event->event_caps; |
1506 | 1679 | add_event_to_groups(event, ctx); | |
1507 | list = ctx_group_list(event, ctx); | ||
1508 | list_add_tail(&event->group_entry, list); | ||
1509 | } | 1680 | } |
1510 | 1681 | ||
1511 | list_update_cgroup_event(event, ctx, true); | 1682 | list_update_cgroup_event(event, ctx, true); |
@@ -1663,12 +1834,12 @@ static void perf_group_attach(struct perf_event *event) | |||
1663 | 1834 | ||
1664 | group_leader->group_caps &= event->event_caps; | 1835 | group_leader->group_caps &= event->event_caps; |
1665 | 1836 | ||
1666 | list_add_tail(&event->group_entry, &group_leader->sibling_list); | 1837 | list_add_tail(&event->sibling_list, &group_leader->sibling_list); |
1667 | group_leader->nr_siblings++; | 1838 | group_leader->nr_siblings++; |
1668 | 1839 | ||
1669 | perf_event__header_size(group_leader); | 1840 | perf_event__header_size(group_leader); |
1670 | 1841 | ||
1671 | list_for_each_entry(pos, &group_leader->sibling_list, group_entry) | 1842 | for_each_sibling_event(pos, group_leader) |
1672 | perf_event__header_size(pos); | 1843 | perf_event__header_size(pos); |
1673 | } | 1844 | } |
1674 | 1845 | ||
@@ -1699,7 +1870,7 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx) | |||
1699 | list_del_rcu(&event->event_entry); | 1870 | list_del_rcu(&event->event_entry); |
1700 | 1871 | ||
1701 | if (event->group_leader == event) | 1872 | if (event->group_leader == event) |
1702 | list_del_init(&event->group_entry); | 1873 | del_event_from_groups(event, ctx); |
1703 | 1874 | ||
1704 | /* | 1875 | /* |
1705 | * If event was in error state, then keep it | 1876 | * If event was in error state, then keep it |
@@ -1717,9 +1888,9 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx) | |||
1717 | static void perf_group_detach(struct perf_event *event) | 1888 | static void perf_group_detach(struct perf_event *event) |
1718 | { | 1889 | { |
1719 | struct perf_event *sibling, *tmp; | 1890 | struct perf_event *sibling, *tmp; |
1720 | struct list_head *list = NULL; | 1891 | struct perf_event_context *ctx = event->ctx; |
1721 | 1892 | ||
1722 | lockdep_assert_held(&event->ctx->lock); | 1893 | lockdep_assert_held(&ctx->lock); |
1723 | 1894 | ||
1724 | /* | 1895 | /* |
1725 | * We can have double detach due to exit/hot-unplug + close. | 1896 | * We can have double detach due to exit/hot-unplug + close. |
@@ -1733,34 +1904,42 @@ static void perf_group_detach(struct perf_event *event) | |||
1733 | * If this is a sibling, remove it from its group. | 1904 | * If this is a sibling, remove it from its group. |
1734 | */ | 1905 | */ |
1735 | if (event->group_leader != event) { | 1906 | if (event->group_leader != event) { |
1736 | list_del_init(&event->group_entry); | 1907 | list_del_init(&event->sibling_list); |
1737 | event->group_leader->nr_siblings--; | 1908 | event->group_leader->nr_siblings--; |
1738 | goto out; | 1909 | goto out; |
1739 | } | 1910 | } |
1740 | 1911 | ||
1741 | if (!list_empty(&event->group_entry)) | ||
1742 | list = &event->group_entry; | ||
1743 | |||
1744 | /* | 1912 | /* |
1745 | * If this was a group event with sibling events then | 1913 | * If this was a group event with sibling events then |
1746 | * upgrade the siblings to singleton events by adding them | 1914 | * upgrade the siblings to singleton events by adding them |
1747 | * to whatever list we are on. | 1915 | * to whatever list we are on. |
1748 | */ | 1916 | */ |
1749 | list_for_each_entry_safe(sibling, tmp, &event->sibling_list, group_entry) { | 1917 | list_for_each_entry_safe(sibling, tmp, &event->sibling_list, sibling_list) { |
1750 | if (list) | 1918 | |
1751 | list_move_tail(&sibling->group_entry, list); | ||
1752 | sibling->group_leader = sibling; | 1919 | sibling->group_leader = sibling; |
1920 | list_del_init(&sibling->sibling_list); | ||
1753 | 1921 | ||
1754 | /* Inherit group flags from the previous leader */ | 1922 | /* Inherit group flags from the previous leader */ |
1755 | sibling->group_caps = event->group_caps; | 1923 | sibling->group_caps = event->group_caps; |
1756 | 1924 | ||
1925 | if (!RB_EMPTY_NODE(&event->group_node)) { | ||
1926 | add_event_to_groups(sibling, event->ctx); | ||
1927 | |||
1928 | if (sibling->state == PERF_EVENT_STATE_ACTIVE) { | ||
1929 | struct list_head *list = sibling->attr.pinned ? | ||
1930 | &ctx->pinned_active : &ctx->flexible_active; | ||
1931 | |||
1932 | list_add_tail(&sibling->active_list, list); | ||
1933 | } | ||
1934 | } | ||
1935 | |||
1757 | WARN_ON_ONCE(sibling->ctx != event->ctx); | 1936 | WARN_ON_ONCE(sibling->ctx != event->ctx); |
1758 | } | 1937 | } |
1759 | 1938 | ||
1760 | out: | 1939 | out: |
1761 | perf_event__header_size(event->group_leader); | 1940 | perf_event__header_size(event->group_leader); |
1762 | 1941 | ||
1763 | list_for_each_entry(tmp, &event->group_leader->sibling_list, group_entry) | 1942 | for_each_sibling_event(tmp, event->group_leader) |
1764 | perf_event__header_size(tmp); | 1943 | perf_event__header_size(tmp); |
1765 | } | 1944 | } |
1766 | 1945 | ||
@@ -1783,13 +1962,13 @@ static inline int __pmu_filter_match(struct perf_event *event) | |||
1783 | */ | 1962 | */ |
1784 | static inline int pmu_filter_match(struct perf_event *event) | 1963 | static inline int pmu_filter_match(struct perf_event *event) |
1785 | { | 1964 | { |
1786 | struct perf_event *child; | 1965 | struct perf_event *sibling; |
1787 | 1966 | ||
1788 | if (!__pmu_filter_match(event)) | 1967 | if (!__pmu_filter_match(event)) |
1789 | return 0; | 1968 | return 0; |
1790 | 1969 | ||
1791 | list_for_each_entry(child, &event->sibling_list, group_entry) { | 1970 | for_each_sibling_event(sibling, event) { |
1792 | if (!__pmu_filter_match(child)) | 1971 | if (!__pmu_filter_match(sibling)) |
1793 | return 0; | 1972 | return 0; |
1794 | } | 1973 | } |
1795 | 1974 | ||
@@ -1816,6 +1995,13 @@ event_sched_out(struct perf_event *event, | |||
1816 | if (event->state != PERF_EVENT_STATE_ACTIVE) | 1995 | if (event->state != PERF_EVENT_STATE_ACTIVE) |
1817 | return; | 1996 | return; |
1818 | 1997 | ||
1998 | /* | ||
1999 | * Asymmetry; we only schedule events _IN_ through ctx_sched_in(), but | ||
2000 | * we can schedule events _OUT_ individually through things like | ||
2001 | * __perf_remove_from_context(). | ||
2002 | */ | ||
2003 | list_del_init(&event->active_list); | ||
2004 | |||
1819 | perf_pmu_disable(event->pmu); | 2005 | perf_pmu_disable(event->pmu); |
1820 | 2006 | ||
1821 | event->pmu->del(event, 0); | 2007 | event->pmu->del(event, 0); |
@@ -1856,7 +2042,7 @@ group_sched_out(struct perf_event *group_event, | |||
1856 | /* | 2042 | /* |
1857 | * Schedule out siblings (if any): | 2043 | * Schedule out siblings (if any): |
1858 | */ | 2044 | */ |
1859 | list_for_each_entry(event, &group_event->sibling_list, group_entry) | 2045 | for_each_sibling_event(event, group_event) |
1860 | event_sched_out(event, cpuctx, ctx); | 2046 | event_sched_out(event, cpuctx, ctx); |
1861 | 2047 | ||
1862 | perf_pmu_enable(ctx->pmu); | 2048 | perf_pmu_enable(ctx->pmu); |
@@ -2135,7 +2321,7 @@ group_sched_in(struct perf_event *group_event, | |||
2135 | /* | 2321 | /* |
2136 | * Schedule in siblings as one group (if any): | 2322 | * Schedule in siblings as one group (if any): |
2137 | */ | 2323 | */ |
2138 | list_for_each_entry(event, &group_event->sibling_list, group_entry) { | 2324 | for_each_sibling_event(event, group_event) { |
2139 | if (event_sched_in(event, cpuctx, ctx)) { | 2325 | if (event_sched_in(event, cpuctx, ctx)) { |
2140 | partial_group = event; | 2326 | partial_group = event; |
2141 | goto group_error; | 2327 | goto group_error; |
@@ -2151,7 +2337,7 @@ group_error: | |||
2151 | * partial group before returning: | 2337 | * partial group before returning: |
2152 | * The events up to the failed event are scheduled out normally. | 2338 | * The events up to the failed event are scheduled out normally. |
2153 | */ | 2339 | */ |
2154 | list_for_each_entry(event, &group_event->sibling_list, group_entry) { | 2340 | for_each_sibling_event(event, group_event) { |
2155 | if (event == partial_group) | 2341 | if (event == partial_group) |
2156 | break; | 2342 | break; |
2157 | 2343 | ||
@@ -2328,6 +2514,18 @@ static int __perf_install_in_context(void *info) | |||
2328 | raw_spin_lock(&task_ctx->lock); | 2514 | raw_spin_lock(&task_ctx->lock); |
2329 | } | 2515 | } |
2330 | 2516 | ||
2517 | #ifdef CONFIG_CGROUP_PERF | ||
2518 | if (is_cgroup_event(event)) { | ||
2519 | /* | ||
2520 | * If the current cgroup doesn't match the event's | ||
2521 | * cgroup, we should not try to schedule it. | ||
2522 | */ | ||
2523 | struct perf_cgroup *cgrp = perf_cgroup_from_task(current, ctx); | ||
2524 | reprogram = cgroup_is_descendant(cgrp->css.cgroup, | ||
2525 | event->cgrp->css.cgroup); | ||
2526 | } | ||
2527 | #endif | ||
2528 | |||
2331 | if (reprogram) { | 2529 | if (reprogram) { |
2332 | ctx_sched_out(ctx, cpuctx, EVENT_TIME); | 2530 | ctx_sched_out(ctx, cpuctx, EVENT_TIME); |
2333 | add_event_to_ctx(event, ctx); | 2531 | add_event_to_ctx(event, ctx); |
@@ -2661,12 +2859,47 @@ int perf_event_refresh(struct perf_event *event, int refresh) | |||
2661 | } | 2859 | } |
2662 | EXPORT_SYMBOL_GPL(perf_event_refresh); | 2860 | EXPORT_SYMBOL_GPL(perf_event_refresh); |
2663 | 2861 | ||
2862 | static int perf_event_modify_breakpoint(struct perf_event *bp, | ||
2863 | struct perf_event_attr *attr) | ||
2864 | { | ||
2865 | int err; | ||
2866 | |||
2867 | _perf_event_disable(bp); | ||
2868 | |||
2869 | err = modify_user_hw_breakpoint_check(bp, attr, true); | ||
2870 | if (err) { | ||
2871 | if (!bp->attr.disabled) | ||
2872 | _perf_event_enable(bp); | ||
2873 | |||
2874 | return err; | ||
2875 | } | ||
2876 | |||
2877 | if (!attr->disabled) | ||
2878 | _perf_event_enable(bp); | ||
2879 | return 0; | ||
2880 | } | ||
2881 | |||
2882 | static int perf_event_modify_attr(struct perf_event *event, | ||
2883 | struct perf_event_attr *attr) | ||
2884 | { | ||
2885 | if (event->attr.type != attr->type) | ||
2886 | return -EINVAL; | ||
2887 | |||
2888 | switch (event->attr.type) { | ||
2889 | case PERF_TYPE_BREAKPOINT: | ||
2890 | return perf_event_modify_breakpoint(event, attr); | ||
2891 | default: | ||
2892 | /* Place holder for future additions. */ | ||
2893 | return -EOPNOTSUPP; | ||
2894 | } | ||
2895 | } | ||
2896 | |||
2664 | static void ctx_sched_out(struct perf_event_context *ctx, | 2897 | static void ctx_sched_out(struct perf_event_context *ctx, |
2665 | struct perf_cpu_context *cpuctx, | 2898 | struct perf_cpu_context *cpuctx, |
2666 | enum event_type_t event_type) | 2899 | enum event_type_t event_type) |
2667 | { | 2900 | { |
2901 | struct perf_event *event, *tmp; | ||
2668 | int is_active = ctx->is_active; | 2902 | int is_active = ctx->is_active; |
2669 | struct perf_event *event; | ||
2670 | 2903 | ||
2671 | lockdep_assert_held(&ctx->lock); | 2904 | lockdep_assert_held(&ctx->lock); |
2672 | 2905 | ||
@@ -2713,12 +2946,12 @@ static void ctx_sched_out(struct perf_event_context *ctx, | |||
2713 | 2946 | ||
2714 | perf_pmu_disable(ctx->pmu); | 2947 | perf_pmu_disable(ctx->pmu); |
2715 | if (is_active & EVENT_PINNED) { | 2948 | if (is_active & EVENT_PINNED) { |
2716 | list_for_each_entry(event, &ctx->pinned_groups, group_entry) | 2949 | list_for_each_entry_safe(event, tmp, &ctx->pinned_active, active_list) |
2717 | group_sched_out(event, cpuctx, ctx); | 2950 | group_sched_out(event, cpuctx, ctx); |
2718 | } | 2951 | } |
2719 | 2952 | ||
2720 | if (is_active & EVENT_FLEXIBLE) { | 2953 | if (is_active & EVENT_FLEXIBLE) { |
2721 | list_for_each_entry(event, &ctx->flexible_groups, group_entry) | 2954 | list_for_each_entry_safe(event, tmp, &ctx->flexible_active, active_list) |
2722 | group_sched_out(event, cpuctx, ctx); | 2955 | group_sched_out(event, cpuctx, ctx); |
2723 | } | 2956 | } |
2724 | perf_pmu_enable(ctx->pmu); | 2957 | perf_pmu_enable(ctx->pmu); |
@@ -3005,53 +3238,116 @@ static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx, | |||
3005 | ctx_sched_out(&cpuctx->ctx, cpuctx, event_type); | 3238 | ctx_sched_out(&cpuctx->ctx, cpuctx, event_type); |
3006 | } | 3239 | } |
3007 | 3240 | ||
3008 | static void | 3241 | static int visit_groups_merge(struct perf_event_groups *groups, int cpu, |
3009 | ctx_pinned_sched_in(struct perf_event_context *ctx, | 3242 | int (*func)(struct perf_event *, void *), void *data) |
3010 | struct perf_cpu_context *cpuctx) | ||
3011 | { | 3243 | { |
3012 | struct perf_event *event; | 3244 | struct perf_event **evt, *evt1, *evt2; |
3245 | int ret; | ||
3013 | 3246 | ||
3014 | list_for_each_entry(event, &ctx->pinned_groups, group_entry) { | 3247 | evt1 = perf_event_groups_first(groups, -1); |
3015 | if (event->state <= PERF_EVENT_STATE_OFF) | 3248 | evt2 = perf_event_groups_first(groups, cpu); |
3016 | continue; | 3249 | |
3017 | if (!event_filter_match(event)) | 3250 | while (evt1 || evt2) { |
3018 | continue; | 3251 | if (evt1 && evt2) { |
3252 | if (evt1->group_index < evt2->group_index) | ||
3253 | evt = &evt1; | ||
3254 | else | ||
3255 | evt = &evt2; | ||
3256 | } else if (evt1) { | ||
3257 | evt = &evt1; | ||
3258 | } else { | ||
3259 | evt = &evt2; | ||
3260 | } | ||
3019 | 3261 | ||
3020 | if (group_can_go_on(event, cpuctx, 1)) | 3262 | ret = func(*evt, data); |
3021 | group_sched_in(event, cpuctx, ctx); | 3263 | if (ret) |
3264 | return ret; | ||
3022 | 3265 | ||
3023 | /* | 3266 | *evt = perf_event_groups_next(*evt); |
3024 | * If this pinned group hasn't been scheduled, | ||
3025 | * put it in error state. | ||
3026 | */ | ||
3027 | if (event->state == PERF_EVENT_STATE_INACTIVE) | ||
3028 | perf_event_set_state(event, PERF_EVENT_STATE_ERROR); | ||
3029 | } | 3267 | } |
3268 | |||
3269 | return 0; | ||
3270 | } | ||
3271 | |||
3272 | struct sched_in_data { | ||
3273 | struct perf_event_context *ctx; | ||
3274 | struct perf_cpu_context *cpuctx; | ||
3275 | int can_add_hw; | ||
3276 | }; | ||
3277 | |||
3278 | static int pinned_sched_in(struct perf_event *event, void *data) | ||
3279 | { | ||
3280 | struct sched_in_data *sid = data; | ||
3281 | |||
3282 | if (event->state <= PERF_EVENT_STATE_OFF) | ||
3283 | return 0; | ||
3284 | |||
3285 | if (!event_filter_match(event)) | ||
3286 | return 0; | ||
3287 | |||
3288 | if (group_can_go_on(event, sid->cpuctx, sid->can_add_hw)) { | ||
3289 | if (!group_sched_in(event, sid->cpuctx, sid->ctx)) | ||
3290 | list_add_tail(&event->active_list, &sid->ctx->pinned_active); | ||
3291 | } | ||
3292 | |||
3293 | /* | ||
3294 | * If this pinned group hasn't been scheduled, | ||
3295 | * put it in error state. | ||
3296 | */ | ||
3297 | if (event->state == PERF_EVENT_STATE_INACTIVE) | ||
3298 | perf_event_set_state(event, PERF_EVENT_STATE_ERROR); | ||
3299 | |||
3300 | return 0; | ||
3301 | } | ||
3302 | |||
3303 | static int flexible_sched_in(struct perf_event *event, void *data) | ||
3304 | { | ||
3305 | struct sched_in_data *sid = data; | ||
3306 | |||
3307 | if (event->state <= PERF_EVENT_STATE_OFF) | ||
3308 | return 0; | ||
3309 | |||
3310 | if (!event_filter_match(event)) | ||
3311 | return 0; | ||
3312 | |||
3313 | if (group_can_go_on(event, sid->cpuctx, sid->can_add_hw)) { | ||
3314 | if (!group_sched_in(event, sid->cpuctx, sid->ctx)) | ||
3315 | list_add_tail(&event->active_list, &sid->ctx->flexible_active); | ||
3316 | else | ||
3317 | sid->can_add_hw = 0; | ||
3318 | } | ||
3319 | |||
3320 | return 0; | ||
3321 | } | ||
3322 | |||
3323 | static void | ||
3324 | ctx_pinned_sched_in(struct perf_event_context *ctx, | ||
3325 | struct perf_cpu_context *cpuctx) | ||
3326 | { | ||
3327 | struct sched_in_data sid = { | ||
3328 | .ctx = ctx, | ||
3329 | .cpuctx = cpuctx, | ||
3330 | .can_add_hw = 1, | ||
3331 | }; | ||
3332 | |||
3333 | visit_groups_merge(&ctx->pinned_groups, | ||
3334 | smp_processor_id(), | ||
3335 | pinned_sched_in, &sid); | ||
3030 | } | 3336 | } |
3031 | 3337 | ||
3032 | static void | 3338 | static void |
3033 | ctx_flexible_sched_in(struct perf_event_context *ctx, | 3339 | ctx_flexible_sched_in(struct perf_event_context *ctx, |
3034 | struct perf_cpu_context *cpuctx) | 3340 | struct perf_cpu_context *cpuctx) |
3035 | { | 3341 | { |
3036 | struct perf_event *event; | 3342 | struct sched_in_data sid = { |
3037 | int can_add_hw = 1; | 3343 | .ctx = ctx, |
3038 | 3344 | .cpuctx = cpuctx, | |
3039 | list_for_each_entry(event, &ctx->flexible_groups, group_entry) { | 3345 | .can_add_hw = 1, |
3040 | /* Ignore events in OFF or ERROR state */ | 3346 | }; |
3041 | if (event->state <= PERF_EVENT_STATE_OFF) | ||
3042 | continue; | ||
3043 | /* | ||
3044 | * Listen to the 'cpu' scheduling filter constraint | ||
3045 | * of events: | ||
3046 | */ | ||
3047 | if (!event_filter_match(event)) | ||
3048 | continue; | ||
3049 | 3347 | ||
3050 | if (group_can_go_on(event, cpuctx, can_add_hw)) { | 3348 | visit_groups_merge(&ctx->flexible_groups, |
3051 | if (group_sched_in(event, cpuctx, ctx)) | 3349 | smp_processor_id(), |
3052 | can_add_hw = 0; | 3350 | flexible_sched_in, &sid); |
3053 | } | ||
3054 | } | ||
3055 | } | 3351 | } |
3056 | 3352 | ||
3057 | static void | 3353 | static void |
@@ -3132,7 +3428,7 @@ static void perf_event_context_sched_in(struct perf_event_context *ctx, | |||
3132 | * However, if task's ctx is not carrying any pinned | 3428 | * However, if task's ctx is not carrying any pinned |
3133 | * events, no need to flip the cpuctx's events around. | 3429 | * events, no need to flip the cpuctx's events around. |
3134 | */ | 3430 | */ |
3135 | if (!list_empty(&ctx->pinned_groups)) | 3431 | if (!RB_EMPTY_ROOT(&ctx->pinned_groups.tree)) |
3136 | cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE); | 3432 | cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE); |
3137 | perf_event_sched_in(cpuctx, ctx, task); | 3433 | perf_event_sched_in(cpuctx, ctx, task); |
3138 | perf_pmu_enable(ctx->pmu); | 3434 | perf_pmu_enable(ctx->pmu); |
@@ -3361,55 +3657,81 @@ static void perf_adjust_freq_unthr_context(struct perf_event_context *ctx, | |||
3361 | } | 3657 | } |
3362 | 3658 | ||
3363 | /* | 3659 | /* |
3364 | * Round-robin a context's events: | 3660 | * Move @event to the tail of the @ctx's eligible events. |
3365 | */ | 3661 | */ |
3366 | static void rotate_ctx(struct perf_event_context *ctx) | 3662 | static void rotate_ctx(struct perf_event_context *ctx, struct perf_event *event) |
3367 | { | 3663 | { |
3368 | /* | 3664 | /* |
3369 | * Rotate the first entry last of non-pinned groups. Rotation might be | 3665 | * Rotate the first entry last of non-pinned groups. Rotation might be |
3370 | * disabled by the inheritance code. | 3666 | * disabled by the inheritance code. |
3371 | */ | 3667 | */ |
3372 | if (!ctx->rotate_disable) | 3668 | if (ctx->rotate_disable) |
3373 | list_rotate_left(&ctx->flexible_groups); | 3669 | return; |
3670 | |||
3671 | perf_event_groups_delete(&ctx->flexible_groups, event); | ||
3672 | perf_event_groups_insert(&ctx->flexible_groups, event); | ||
3374 | } | 3673 | } |
3375 | 3674 | ||
3376 | static int perf_rotate_context(struct perf_cpu_context *cpuctx) | 3675 | static inline struct perf_event * |
3676 | ctx_first_active(struct perf_event_context *ctx) | ||
3377 | { | 3677 | { |
3678 | return list_first_entry_or_null(&ctx->flexible_active, | ||
3679 | struct perf_event, active_list); | ||
3680 | } | ||
3681 | |||
3682 | static bool perf_rotate_context(struct perf_cpu_context *cpuctx) | ||
3683 | { | ||
3684 | struct perf_event *cpu_event = NULL, *task_event = NULL; | ||
3685 | bool cpu_rotate = false, task_rotate = false; | ||
3378 | struct perf_event_context *ctx = NULL; | 3686 | struct perf_event_context *ctx = NULL; |
3379 | int rotate = 0; | 3687 | |
3688 | /* | ||
3689 | * Since we run this from IRQ context, nobody can install new | ||
3690 | * events, thus the event count values are stable. | ||
3691 | */ | ||
3380 | 3692 | ||
3381 | if (cpuctx->ctx.nr_events) { | 3693 | if (cpuctx->ctx.nr_events) { |
3382 | if (cpuctx->ctx.nr_events != cpuctx->ctx.nr_active) | 3694 | if (cpuctx->ctx.nr_events != cpuctx->ctx.nr_active) |
3383 | rotate = 1; | 3695 | cpu_rotate = true; |
3384 | } | 3696 | } |
3385 | 3697 | ||
3386 | ctx = cpuctx->task_ctx; | 3698 | ctx = cpuctx->task_ctx; |
3387 | if (ctx && ctx->nr_events) { | 3699 | if (ctx && ctx->nr_events) { |
3388 | if (ctx->nr_events != ctx->nr_active) | 3700 | if (ctx->nr_events != ctx->nr_active) |
3389 | rotate = 1; | 3701 | task_rotate = true; |
3390 | } | 3702 | } |
3391 | 3703 | ||
3392 | if (!rotate) | 3704 | if (!(cpu_rotate || task_rotate)) |
3393 | goto done; | 3705 | return false; |
3394 | 3706 | ||
3395 | perf_ctx_lock(cpuctx, cpuctx->task_ctx); | 3707 | perf_ctx_lock(cpuctx, cpuctx->task_ctx); |
3396 | perf_pmu_disable(cpuctx->ctx.pmu); | 3708 | perf_pmu_disable(cpuctx->ctx.pmu); |
3397 | 3709 | ||
3398 | cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE); | 3710 | if (task_rotate) |
3399 | if (ctx) | 3711 | task_event = ctx_first_active(ctx); |
3712 | if (cpu_rotate) | ||
3713 | cpu_event = ctx_first_active(&cpuctx->ctx); | ||
3714 | |||
3715 | /* | ||
3716 | * As per the order given at ctx_resched() first 'pop' task flexible | ||
3717 | * and then, if needed CPU flexible. | ||
3718 | */ | ||
3719 | if (task_event || (ctx && cpu_event)) | ||
3400 | ctx_sched_out(ctx, cpuctx, EVENT_FLEXIBLE); | 3720 | ctx_sched_out(ctx, cpuctx, EVENT_FLEXIBLE); |
3721 | if (cpu_event) | ||
3722 | cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE); | ||
3401 | 3723 | ||
3402 | rotate_ctx(&cpuctx->ctx); | 3724 | if (task_event) |
3403 | if (ctx) | 3725 | rotate_ctx(ctx, task_event); |
3404 | rotate_ctx(ctx); | 3726 | if (cpu_event) |
3727 | rotate_ctx(&cpuctx->ctx, cpu_event); | ||
3405 | 3728 | ||
3406 | perf_event_sched_in(cpuctx, ctx, current); | 3729 | perf_event_sched_in(cpuctx, ctx, current); |
3407 | 3730 | ||
3408 | perf_pmu_enable(cpuctx->ctx.pmu); | 3731 | perf_pmu_enable(cpuctx->ctx.pmu); |
3409 | perf_ctx_unlock(cpuctx, cpuctx->task_ctx); | 3732 | perf_ctx_unlock(cpuctx, cpuctx->task_ctx); |
3410 | done: | ||
3411 | 3733 | ||
3412 | return rotate; | 3734 | return true; |
3413 | } | 3735 | } |
3414 | 3736 | ||
3415 | void perf_event_task_tick(void) | 3737 | void perf_event_task_tick(void) |
@@ -3554,7 +3876,7 @@ static void __perf_event_read(void *info) | |||
3554 | 3876 | ||
3555 | pmu->read(event); | 3877 | pmu->read(event); |
3556 | 3878 | ||
3557 | list_for_each_entry(sub, &event->sibling_list, group_entry) { | 3879 | for_each_sibling_event(sub, event) { |
3558 | if (sub->state == PERF_EVENT_STATE_ACTIVE) { | 3880 | if (sub->state == PERF_EVENT_STATE_ACTIVE) { |
3559 | /* | 3881 | /* |
3560 | * Use sibling's PMU rather than @event's since | 3882 | * Use sibling's PMU rather than @event's since |
@@ -3728,9 +4050,11 @@ static void __perf_event_init_context(struct perf_event_context *ctx) | |||
3728 | raw_spin_lock_init(&ctx->lock); | 4050 | raw_spin_lock_init(&ctx->lock); |
3729 | mutex_init(&ctx->mutex); | 4051 | mutex_init(&ctx->mutex); |
3730 | INIT_LIST_HEAD(&ctx->active_ctx_list); | 4052 | INIT_LIST_HEAD(&ctx->active_ctx_list); |
3731 | INIT_LIST_HEAD(&ctx->pinned_groups); | 4053 | perf_event_groups_init(&ctx->pinned_groups); |
3732 | INIT_LIST_HEAD(&ctx->flexible_groups); | 4054 | perf_event_groups_init(&ctx->flexible_groups); |
3733 | INIT_LIST_HEAD(&ctx->event_list); | 4055 | INIT_LIST_HEAD(&ctx->event_list); |
4056 | INIT_LIST_HEAD(&ctx->pinned_active); | ||
4057 | INIT_LIST_HEAD(&ctx->flexible_active); | ||
3734 | atomic_set(&ctx->refcount, 1); | 4058 | atomic_set(&ctx->refcount, 1); |
3735 | } | 4059 | } |
3736 | 4060 | ||
@@ -4400,7 +4724,7 @@ static int __perf_read_group_add(struct perf_event *leader, | |||
4400 | if (read_format & PERF_FORMAT_ID) | 4724 | if (read_format & PERF_FORMAT_ID) |
4401 | values[n++] = primary_event_id(leader); | 4725 | values[n++] = primary_event_id(leader); |
4402 | 4726 | ||
4403 | list_for_each_entry(sub, &leader->sibling_list, group_entry) { | 4727 | for_each_sibling_event(sub, leader) { |
4404 | values[n++] += perf_event_count(sub); | 4728 | values[n++] += perf_event_count(sub); |
4405 | if (read_format & PERF_FORMAT_ID) | 4729 | if (read_format & PERF_FORMAT_ID) |
4406 | values[n++] = primary_event_id(sub); | 4730 | values[n++] = primary_event_id(sub); |
@@ -4594,7 +4918,7 @@ static void perf_event_for_each(struct perf_event *event, | |||
4594 | event = event->group_leader; | 4918 | event = event->group_leader; |
4595 | 4919 | ||
4596 | perf_event_for_each_child(event, func); | 4920 | perf_event_for_each_child(event, func); |
4597 | list_for_each_entry(sibling, &event->sibling_list, group_entry) | 4921 | for_each_sibling_event(sibling, event) |
4598 | perf_event_for_each_child(sibling, func); | 4922 | perf_event_for_each_child(sibling, func); |
4599 | } | 4923 | } |
4600 | 4924 | ||
@@ -4676,6 +5000,8 @@ static int perf_event_set_output(struct perf_event *event, | |||
4676 | struct perf_event *output_event); | 5000 | struct perf_event *output_event); |
4677 | static int perf_event_set_filter(struct perf_event *event, void __user *arg); | 5001 | static int perf_event_set_filter(struct perf_event *event, void __user *arg); |
4678 | static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd); | 5002 | static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd); |
5003 | static int perf_copy_attr(struct perf_event_attr __user *uattr, | ||
5004 | struct perf_event_attr *attr); | ||
4679 | 5005 | ||
4680 | static long _perf_ioctl(struct perf_event *event, unsigned int cmd, unsigned long arg) | 5006 | static long _perf_ioctl(struct perf_event *event, unsigned int cmd, unsigned long arg) |
4681 | { | 5007 | { |
@@ -4748,6 +5074,17 @@ static long _perf_ioctl(struct perf_event *event, unsigned int cmd, unsigned lon | |||
4748 | 5074 | ||
4749 | case PERF_EVENT_IOC_QUERY_BPF: | 5075 | case PERF_EVENT_IOC_QUERY_BPF: |
4750 | return perf_event_query_prog_array(event, (void __user *)arg); | 5076 | return perf_event_query_prog_array(event, (void __user *)arg); |
5077 | |||
5078 | case PERF_EVENT_IOC_MODIFY_ATTRIBUTES: { | ||
5079 | struct perf_event_attr new_attr; | ||
5080 | int err = perf_copy_attr((struct perf_event_attr __user *)arg, | ||
5081 | &new_attr); | ||
5082 | |||
5083 | if (err) | ||
5084 | return err; | ||
5085 | |||
5086 | return perf_event_modify_attr(event, &new_attr); | ||
5087 | } | ||
4751 | default: | 5088 | default: |
4752 | return -ENOTTY; | 5089 | return -ENOTTY; |
4753 | } | 5090 | } |
@@ -5743,7 +6080,8 @@ static void perf_output_read_group(struct perf_output_handle *handle, | |||
5743 | if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) | 6080 | if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) |
5744 | values[n++] = running; | 6081 | values[n++] = running; |
5745 | 6082 | ||
5746 | if (leader != event) | 6083 | if ((leader != event) && |
6084 | (leader->state == PERF_EVENT_STATE_ACTIVE)) | ||
5747 | leader->pmu->read(leader); | 6085 | leader->pmu->read(leader); |
5748 | 6086 | ||
5749 | values[n++] = perf_event_count(leader); | 6087 | values[n++] = perf_event_count(leader); |
@@ -5752,7 +6090,7 @@ static void perf_output_read_group(struct perf_output_handle *handle, | |||
5752 | 6090 | ||
5753 | __output_copy(handle, values, n * sizeof(u64)); | 6091 | __output_copy(handle, values, n * sizeof(u64)); |
5754 | 6092 | ||
5755 | list_for_each_entry(sub, &leader->sibling_list, group_entry) { | 6093 | for_each_sibling_event(sub, leader) { |
5756 | n = 0; | 6094 | n = 0; |
5757 | 6095 | ||
5758 | if ((sub != event) && | 6096 | if ((sub != event) && |
@@ -8009,9 +8347,119 @@ static struct pmu perf_tracepoint = { | |||
8009 | .read = perf_swevent_read, | 8347 | .read = perf_swevent_read, |
8010 | }; | 8348 | }; |
8011 | 8349 | ||
8350 | #if defined(CONFIG_KPROBE_EVENTS) || defined(CONFIG_UPROBE_EVENTS) | ||
8351 | /* | ||
8352 | * Flags in config, used by dynamic PMU kprobe and uprobe | ||
8353 | * The flags should match following PMU_FORMAT_ATTR(). | ||
8354 | * | ||
8355 | * PERF_PROBE_CONFIG_IS_RETPROBE if set, create kretprobe/uretprobe | ||
8356 | * if not set, create kprobe/uprobe | ||
8357 | */ | ||
8358 | enum perf_probe_config { | ||
8359 | PERF_PROBE_CONFIG_IS_RETPROBE = 1U << 0, /* [k,u]retprobe */ | ||
8360 | }; | ||
8361 | |||
8362 | PMU_FORMAT_ATTR(retprobe, "config:0"); | ||
8363 | |||
8364 | static struct attribute *probe_attrs[] = { | ||
8365 | &format_attr_retprobe.attr, | ||
8366 | NULL, | ||
8367 | }; | ||
8368 | |||
8369 | static struct attribute_group probe_format_group = { | ||
8370 | .name = "format", | ||
8371 | .attrs = probe_attrs, | ||
8372 | }; | ||
8373 | |||
8374 | static const struct attribute_group *probe_attr_groups[] = { | ||
8375 | &probe_format_group, | ||
8376 | NULL, | ||
8377 | }; | ||
8378 | #endif | ||
8379 | |||
8380 | #ifdef CONFIG_KPROBE_EVENTS | ||
8381 | static int perf_kprobe_event_init(struct perf_event *event); | ||
8382 | static struct pmu perf_kprobe = { | ||
8383 | .task_ctx_nr = perf_sw_context, | ||
8384 | .event_init = perf_kprobe_event_init, | ||
8385 | .add = perf_trace_add, | ||
8386 | .del = perf_trace_del, | ||
8387 | .start = perf_swevent_start, | ||
8388 | .stop = perf_swevent_stop, | ||
8389 | .read = perf_swevent_read, | ||
8390 | .attr_groups = probe_attr_groups, | ||
8391 | }; | ||
8392 | |||
8393 | static int perf_kprobe_event_init(struct perf_event *event) | ||
8394 | { | ||
8395 | int err; | ||
8396 | bool is_retprobe; | ||
8397 | |||
8398 | if (event->attr.type != perf_kprobe.type) | ||
8399 | return -ENOENT; | ||
8400 | /* | ||
8401 | * no branch sampling for probe events | ||
8402 | */ | ||
8403 | if (has_branch_stack(event)) | ||
8404 | return -EOPNOTSUPP; | ||
8405 | |||
8406 | is_retprobe = event->attr.config & PERF_PROBE_CONFIG_IS_RETPROBE; | ||
8407 | err = perf_kprobe_init(event, is_retprobe); | ||
8408 | if (err) | ||
8409 | return err; | ||
8410 | |||
8411 | event->destroy = perf_kprobe_destroy; | ||
8412 | |||
8413 | return 0; | ||
8414 | } | ||
8415 | #endif /* CONFIG_KPROBE_EVENTS */ | ||
8416 | |||
8417 | #ifdef CONFIG_UPROBE_EVENTS | ||
8418 | static int perf_uprobe_event_init(struct perf_event *event); | ||
8419 | static struct pmu perf_uprobe = { | ||
8420 | .task_ctx_nr = perf_sw_context, | ||
8421 | .event_init = perf_uprobe_event_init, | ||
8422 | .add = perf_trace_add, | ||
8423 | .del = perf_trace_del, | ||
8424 | .start = perf_swevent_start, | ||
8425 | .stop = perf_swevent_stop, | ||
8426 | .read = perf_swevent_read, | ||
8427 | .attr_groups = probe_attr_groups, | ||
8428 | }; | ||
8429 | |||
8430 | static int perf_uprobe_event_init(struct perf_event *event) | ||
8431 | { | ||
8432 | int err; | ||
8433 | bool is_retprobe; | ||
8434 | |||
8435 | if (event->attr.type != perf_uprobe.type) | ||
8436 | return -ENOENT; | ||
8437 | /* | ||
8438 | * no branch sampling for probe events | ||
8439 | */ | ||
8440 | if (has_branch_stack(event)) | ||
8441 | return -EOPNOTSUPP; | ||
8442 | |||
8443 | is_retprobe = event->attr.config & PERF_PROBE_CONFIG_IS_RETPROBE; | ||
8444 | err = perf_uprobe_init(event, is_retprobe); | ||
8445 | if (err) | ||
8446 | return err; | ||
8447 | |||
8448 | event->destroy = perf_uprobe_destroy; | ||
8449 | |||
8450 | return 0; | ||
8451 | } | ||
8452 | #endif /* CONFIG_UPROBE_EVENTS */ | ||
8453 | |||
8012 | static inline void perf_tp_register(void) | 8454 | static inline void perf_tp_register(void) |
8013 | { | 8455 | { |
8014 | perf_pmu_register(&perf_tracepoint, "tracepoint", PERF_TYPE_TRACEPOINT); | 8456 | perf_pmu_register(&perf_tracepoint, "tracepoint", PERF_TYPE_TRACEPOINT); |
8457 | #ifdef CONFIG_KPROBE_EVENTS | ||
8458 | perf_pmu_register(&perf_kprobe, "kprobe", -1); | ||
8459 | #endif | ||
8460 | #ifdef CONFIG_UPROBE_EVENTS | ||
8461 | perf_pmu_register(&perf_uprobe, "uprobe", -1); | ||
8462 | #endif | ||
8015 | } | 8463 | } |
8016 | 8464 | ||
8017 | static void perf_event_free_filter(struct perf_event *event) | 8465 | static void perf_event_free_filter(struct perf_event *event) |
@@ -8088,13 +8536,32 @@ static void perf_event_free_bpf_handler(struct perf_event *event) | |||
8088 | } | 8536 | } |
8089 | #endif | 8537 | #endif |
8090 | 8538 | ||
8539 | /* | ||
8540 | * returns true if the event is a tracepoint, or a kprobe/uprobe created | ||
8541 | * with perf_event_open() | ||
8542 | */ | ||
8543 | static inline bool perf_event_is_tracing(struct perf_event *event) | ||
8544 | { | ||
8545 | if (event->pmu == &perf_tracepoint) | ||
8546 | return true; | ||
8547 | #ifdef CONFIG_KPROBE_EVENTS | ||
8548 | if (event->pmu == &perf_kprobe) | ||
8549 | return true; | ||
8550 | #endif | ||
8551 | #ifdef CONFIG_UPROBE_EVENTS | ||
8552 | if (event->pmu == &perf_uprobe) | ||
8553 | return true; | ||
8554 | #endif | ||
8555 | return false; | ||
8556 | } | ||
8557 | |||
8091 | static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd) | 8558 | static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd) |
8092 | { | 8559 | { |
8093 | bool is_kprobe, is_tracepoint, is_syscall_tp; | 8560 | bool is_kprobe, is_tracepoint, is_syscall_tp; |
8094 | struct bpf_prog *prog; | 8561 | struct bpf_prog *prog; |
8095 | int ret; | 8562 | int ret; |
8096 | 8563 | ||
8097 | if (event->attr.type != PERF_TYPE_TRACEPOINT) | 8564 | if (!perf_event_is_tracing(event)) |
8098 | return perf_event_set_bpf_handler(event, prog_fd); | 8565 | return perf_event_set_bpf_handler(event, prog_fd); |
8099 | 8566 | ||
8100 | is_kprobe = event->tp_event->flags & TRACE_EVENT_FL_UKPROBE; | 8567 | is_kprobe = event->tp_event->flags & TRACE_EVENT_FL_UKPROBE; |
@@ -8140,7 +8607,7 @@ static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd) | |||
8140 | 8607 | ||
8141 | static void perf_event_free_bpf_prog(struct perf_event *event) | 8608 | static void perf_event_free_bpf_prog(struct perf_event *event) |
8142 | { | 8609 | { |
8143 | if (event->attr.type != PERF_TYPE_TRACEPOINT) { | 8610 | if (!perf_event_is_tracing(event)) { |
8144 | perf_event_free_bpf_handler(event); | 8611 | perf_event_free_bpf_handler(event); |
8145 | return; | 8612 | return; |
8146 | } | 8613 | } |
@@ -8336,7 +8803,8 @@ restart: | |||
8336 | * * for kernel addresses: <start address>[/<size>] | 8803 | * * for kernel addresses: <start address>[/<size>] |
8337 | * * for object files: <start address>[/<size>]@</path/to/object/file> | 8804 | * * for object files: <start address>[/<size>]@</path/to/object/file> |
8338 | * | 8805 | * |
8339 | * if <size> is not specified, the range is treated as a single address. | 8806 | * if <size> is not specified or is zero, the range is treated as a single |
8807 | * address; not valid for ACTION=="filter". | ||
8340 | */ | 8808 | */ |
8341 | enum { | 8809 | enum { |
8342 | IF_ACT_NONE = -1, | 8810 | IF_ACT_NONE = -1, |
@@ -8386,6 +8854,11 @@ perf_event_parse_addr_filter(struct perf_event *event, char *fstr, | |||
8386 | return -ENOMEM; | 8854 | return -ENOMEM; |
8387 | 8855 | ||
8388 | while ((start = strsep(&fstr, " ,\n")) != NULL) { | 8856 | while ((start = strsep(&fstr, " ,\n")) != NULL) { |
8857 | static const enum perf_addr_filter_action_t actions[] = { | ||
8858 | [IF_ACT_FILTER] = PERF_ADDR_FILTER_ACTION_FILTER, | ||
8859 | [IF_ACT_START] = PERF_ADDR_FILTER_ACTION_START, | ||
8860 | [IF_ACT_STOP] = PERF_ADDR_FILTER_ACTION_STOP, | ||
8861 | }; | ||
8389 | ret = -EINVAL; | 8862 | ret = -EINVAL; |
8390 | 8863 | ||
8391 | if (!*start) | 8864 | if (!*start) |
@@ -8402,12 +8875,11 @@ perf_event_parse_addr_filter(struct perf_event *event, char *fstr, | |||
8402 | switch (token) { | 8875 | switch (token) { |
8403 | case IF_ACT_FILTER: | 8876 | case IF_ACT_FILTER: |
8404 | case IF_ACT_START: | 8877 | case IF_ACT_START: |
8405 | filter->filter = 1; | ||
8406 | |||
8407 | case IF_ACT_STOP: | 8878 | case IF_ACT_STOP: |
8408 | if (state != IF_STATE_ACTION) | 8879 | if (state != IF_STATE_ACTION) |
8409 | goto fail; | 8880 | goto fail; |
8410 | 8881 | ||
8882 | filter->action = actions[token]; | ||
8411 | state = IF_STATE_SOURCE; | 8883 | state = IF_STATE_SOURCE; |
8412 | break; | 8884 | break; |
8413 | 8885 | ||
@@ -8420,15 +8892,12 @@ perf_event_parse_addr_filter(struct perf_event *event, char *fstr, | |||
8420 | if (state != IF_STATE_SOURCE) | 8892 | if (state != IF_STATE_SOURCE) |
8421 | goto fail; | 8893 | goto fail; |
8422 | 8894 | ||
8423 | if (token == IF_SRC_FILE || token == IF_SRC_KERNEL) | ||
8424 | filter->range = 1; | ||
8425 | |||
8426 | *args[0].to = 0; | 8895 | *args[0].to = 0; |
8427 | ret = kstrtoul(args[0].from, 0, &filter->offset); | 8896 | ret = kstrtoul(args[0].from, 0, &filter->offset); |
8428 | if (ret) | 8897 | if (ret) |
8429 | goto fail; | 8898 | goto fail; |
8430 | 8899 | ||
8431 | if (filter->range) { | 8900 | if (token == IF_SRC_KERNEL || token == IF_SRC_FILE) { |
8432 | *args[1].to = 0; | 8901 | *args[1].to = 0; |
8433 | ret = kstrtoul(args[1].from, 0, &filter->size); | 8902 | ret = kstrtoul(args[1].from, 0, &filter->size); |
8434 | if (ret) | 8903 | if (ret) |
@@ -8436,7 +8905,7 @@ perf_event_parse_addr_filter(struct perf_event *event, char *fstr, | |||
8436 | } | 8905 | } |
8437 | 8906 | ||
8438 | if (token == IF_SRC_FILE || token == IF_SRC_FILEADDR) { | 8907 | if (token == IF_SRC_FILE || token == IF_SRC_FILEADDR) { |
8439 | int fpos = filter->range ? 2 : 1; | 8908 | int fpos = token == IF_SRC_FILE ? 2 : 1; |
8440 | 8909 | ||
8441 | filename = match_strdup(&args[fpos]); | 8910 | filename = match_strdup(&args[fpos]); |
8442 | if (!filename) { | 8911 | if (!filename) { |
@@ -8462,6 +8931,14 @@ perf_event_parse_addr_filter(struct perf_event *event, char *fstr, | |||
8462 | if (kernel && event->attr.exclude_kernel) | 8931 | if (kernel && event->attr.exclude_kernel) |
8463 | goto fail; | 8932 | goto fail; |
8464 | 8933 | ||
8934 | /* | ||
8935 | * ACTION "filter" must have a non-zero length region | ||
8936 | * specified. | ||
8937 | */ | ||
8938 | if (filter->action == PERF_ADDR_FILTER_ACTION_FILTER && | ||
8939 | !filter->size) | ||
8940 | goto fail; | ||
8941 | |||
8465 | if (!kernel) { | 8942 | if (!kernel) { |
8466 | if (!filename) | 8943 | if (!filename) |
8467 | goto fail; | 8944 | goto fail; |
@@ -8559,47 +9036,36 @@ fail_clear_files: | |||
8559 | return ret; | 9036 | return ret; |
8560 | } | 9037 | } |
8561 | 9038 | ||
8562 | static int | ||
8563 | perf_tracepoint_set_filter(struct perf_event *event, char *filter_str) | ||
8564 | { | ||
8565 | struct perf_event_context *ctx = event->ctx; | ||
8566 | int ret; | ||
8567 | |||
8568 | /* | ||
8569 | * Beware, here be dragons!! | ||
8570 | * | ||
8571 | * the tracepoint muck will deadlock against ctx->mutex, but the tracepoint | ||
8572 | * stuff does not actually need it. So temporarily drop ctx->mutex. As per | ||
8573 | * perf_event_ctx_lock() we already have a reference on ctx. | ||
8574 | * | ||
8575 | * This can result in event getting moved to a different ctx, but that | ||
8576 | * does not affect the tracepoint state. | ||
8577 | */ | ||
8578 | mutex_unlock(&ctx->mutex); | ||
8579 | ret = ftrace_profile_set_filter(event, event->attr.config, filter_str); | ||
8580 | mutex_lock(&ctx->mutex); | ||
8581 | |||
8582 | return ret; | ||
8583 | } | ||
8584 | |||
8585 | static int perf_event_set_filter(struct perf_event *event, void __user *arg) | 9039 | static int perf_event_set_filter(struct perf_event *event, void __user *arg) |
8586 | { | 9040 | { |
8587 | char *filter_str; | ||
8588 | int ret = -EINVAL; | 9041 | int ret = -EINVAL; |
8589 | 9042 | char *filter_str; | |
8590 | if ((event->attr.type != PERF_TYPE_TRACEPOINT || | ||
8591 | !IS_ENABLED(CONFIG_EVENT_TRACING)) && | ||
8592 | !has_addr_filter(event)) | ||
8593 | return -EINVAL; | ||
8594 | 9043 | ||
8595 | filter_str = strndup_user(arg, PAGE_SIZE); | 9044 | filter_str = strndup_user(arg, PAGE_SIZE); |
8596 | if (IS_ERR(filter_str)) | 9045 | if (IS_ERR(filter_str)) |
8597 | return PTR_ERR(filter_str); | 9046 | return PTR_ERR(filter_str); |
8598 | 9047 | ||
8599 | if (IS_ENABLED(CONFIG_EVENT_TRACING) && | 9048 | #ifdef CONFIG_EVENT_TRACING |
8600 | event->attr.type == PERF_TYPE_TRACEPOINT) | 9049 | if (perf_event_is_tracing(event)) { |
8601 | ret = perf_tracepoint_set_filter(event, filter_str); | 9050 | struct perf_event_context *ctx = event->ctx; |
8602 | else if (has_addr_filter(event)) | 9051 | |
9052 | /* | ||
9053 | * Beware, here be dragons!! | ||
9054 | * | ||
9055 | * the tracepoint muck will deadlock against ctx->mutex, but | ||
9056 | * the tracepoint stuff does not actually need it. So | ||
9057 | * temporarily drop ctx->mutex. As per perf_event_ctx_lock() we | ||
9058 | * already have a reference on ctx. | ||
9059 | * | ||
9060 | * This can result in event getting moved to a different ctx, | ||
9061 | * but that does not affect the tracepoint state. | ||
9062 | */ | ||
9063 | mutex_unlock(&ctx->mutex); | ||
9064 | ret = ftrace_profile_set_filter(event, event->attr.config, filter_str); | ||
9065 | mutex_lock(&ctx->mutex); | ||
9066 | } else | ||
9067 | #endif | ||
9068 | if (has_addr_filter(event)) | ||
8603 | ret = perf_event_set_addr_filter(event, filter_str); | 9069 | ret = perf_event_set_addr_filter(event, filter_str); |
8604 | 9070 | ||
8605 | kfree(filter_str); | 9071 | kfree(filter_str); |
@@ -9452,9 +9918,10 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, | |||
9452 | mutex_init(&event->child_mutex); | 9918 | mutex_init(&event->child_mutex); |
9453 | INIT_LIST_HEAD(&event->child_list); | 9919 | INIT_LIST_HEAD(&event->child_list); |
9454 | 9920 | ||
9455 | INIT_LIST_HEAD(&event->group_entry); | ||
9456 | INIT_LIST_HEAD(&event->event_entry); | 9921 | INIT_LIST_HEAD(&event->event_entry); |
9457 | INIT_LIST_HEAD(&event->sibling_list); | 9922 | INIT_LIST_HEAD(&event->sibling_list); |
9923 | INIT_LIST_HEAD(&event->active_list); | ||
9924 | init_event_group(event); | ||
9458 | INIT_LIST_HEAD(&event->rb_entry); | 9925 | INIT_LIST_HEAD(&event->rb_entry); |
9459 | INIT_LIST_HEAD(&event->active_entry); | 9926 | INIT_LIST_HEAD(&event->active_entry); |
9460 | INIT_LIST_HEAD(&event->addr_filters.list); | 9927 | INIT_LIST_HEAD(&event->addr_filters.list); |
@@ -9729,6 +10196,9 @@ static int perf_copy_attr(struct perf_event_attr __user *uattr, | |||
9729 | ret = -EINVAL; | 10196 | ret = -EINVAL; |
9730 | } | 10197 | } |
9731 | 10198 | ||
10199 | if (!attr->sample_max_stack) | ||
10200 | attr->sample_max_stack = sysctl_perf_event_max_stack; | ||
10201 | |||
9732 | if (attr->sample_type & PERF_SAMPLE_REGS_INTR) | 10202 | if (attr->sample_type & PERF_SAMPLE_REGS_INTR) |
9733 | ret = perf_reg_validate(attr->sample_regs_intr); | 10203 | ret = perf_reg_validate(attr->sample_regs_intr); |
9734 | out: | 10204 | out: |
@@ -9942,9 +10412,6 @@ SYSCALL_DEFINE5(perf_event_open, | |||
9942 | perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN)) | 10412 | perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN)) |
9943 | return -EACCES; | 10413 | return -EACCES; |
9944 | 10414 | ||
9945 | if (!attr.sample_max_stack) | ||
9946 | attr.sample_max_stack = sysctl_perf_event_max_stack; | ||
9947 | |||
9948 | /* | 10415 | /* |
9949 | * In cgroup mode, the pid argument is used to pass the fd | 10416 | * In cgroup mode, the pid argument is used to pass the fd |
9950 | * opened to the cgroup directory in cgroupfs. The cpu argument | 10417 | * opened to the cgroup directory in cgroupfs. The cpu argument |
@@ -10218,8 +10685,7 @@ SYSCALL_DEFINE5(perf_event_open, | |||
10218 | perf_remove_from_context(group_leader, 0); | 10685 | perf_remove_from_context(group_leader, 0); |
10219 | put_ctx(gctx); | 10686 | put_ctx(gctx); |
10220 | 10687 | ||
10221 | list_for_each_entry(sibling, &group_leader->sibling_list, | 10688 | for_each_sibling_event(sibling, group_leader) { |
10222 | group_entry) { | ||
10223 | perf_remove_from_context(sibling, 0); | 10689 | perf_remove_from_context(sibling, 0); |
10224 | put_ctx(gctx); | 10690 | put_ctx(gctx); |
10225 | } | 10691 | } |
@@ -10240,8 +10706,7 @@ SYSCALL_DEFINE5(perf_event_open, | |||
10240 | * By installing siblings first we NO-OP because they're not | 10706 | * By installing siblings first we NO-OP because they're not |
10241 | * reachable through the group lists. | 10707 | * reachable through the group lists. |
10242 | */ | 10708 | */ |
10243 | list_for_each_entry(sibling, &group_leader->sibling_list, | 10709 | for_each_sibling_event(sibling, group_leader) { |
10244 | group_entry) { | ||
10245 | perf_event__state_init(sibling); | 10710 | perf_event__state_init(sibling); |
10246 | perf_install_in_context(ctx, sibling, sibling->cpu); | 10711 | perf_install_in_context(ctx, sibling, sibling->cpu); |
10247 | get_ctx(ctx); | 10712 | get_ctx(ctx); |
@@ -10880,7 +11345,7 @@ static int inherit_group(struct perf_event *parent_event, | |||
10880 | * case inherit_event() will create individual events, similar to what | 11345 | * case inherit_event() will create individual events, similar to what |
10881 | * perf_group_detach() would do anyway. | 11346 | * perf_group_detach() would do anyway. |
10882 | */ | 11347 | */ |
10883 | list_for_each_entry(sub, &parent_event->sibling_list, group_entry) { | 11348 | for_each_sibling_event(sub, parent_event) { |
10884 | child_ctr = inherit_event(sub, parent, parent_ctx, | 11349 | child_ctr = inherit_event(sub, parent, parent_ctx, |
10885 | child, leader, child_ctx); | 11350 | child, leader, child_ctx); |
10886 | if (IS_ERR(child_ctr)) | 11351 | if (IS_ERR(child_ctr)) |
@@ -10979,7 +11444,7 @@ static int perf_event_init_context(struct task_struct *child, int ctxn) | |||
10979 | * We dont have to disable NMIs - we are only looking at | 11444 | * We dont have to disable NMIs - we are only looking at |
10980 | * the list, not manipulating it: | 11445 | * the list, not manipulating it: |
10981 | */ | 11446 | */ |
10982 | list_for_each_entry(event, &parent_ctx->pinned_groups, group_entry) { | 11447 | perf_event_groups_for_each(event, &parent_ctx->pinned_groups) { |
10983 | ret = inherit_task_group(event, parent, parent_ctx, | 11448 | ret = inherit_task_group(event, parent, parent_ctx, |
10984 | child, ctxn, &inherited_all); | 11449 | child, ctxn, &inherited_all); |
10985 | if (ret) | 11450 | if (ret) |
@@ -10995,7 +11460,7 @@ static int perf_event_init_context(struct task_struct *child, int ctxn) | |||
10995 | parent_ctx->rotate_disable = 1; | 11460 | parent_ctx->rotate_disable = 1; |
10996 | raw_spin_unlock_irqrestore(&parent_ctx->lock, flags); | 11461 | raw_spin_unlock_irqrestore(&parent_ctx->lock, flags); |
10997 | 11462 | ||
10998 | list_for_each_entry(event, &parent_ctx->flexible_groups, group_entry) { | 11463 | perf_event_groups_for_each(event, &parent_ctx->flexible_groups) { |
10999 | ret = inherit_task_group(event, parent, parent_ctx, | 11464 | ret = inherit_task_group(event, parent, parent_ctx, |
11000 | child, ctxn, &inherited_all); | 11465 | child, ctxn, &inherited_all); |
11001 | if (ret) | 11466 | if (ret) |
diff --git a/kernel/events/hw_breakpoint.c b/kernel/events/hw_breakpoint.c index 3f8cb1e14588..6e28d2866be5 100644 --- a/kernel/events/hw_breakpoint.c +++ b/kernel/events/hw_breakpoint.c | |||
@@ -44,6 +44,7 @@ | |||
44 | #include <linux/list.h> | 44 | #include <linux/list.h> |
45 | #include <linux/cpu.h> | 45 | #include <linux/cpu.h> |
46 | #include <linux/smp.h> | 46 | #include <linux/smp.h> |
47 | #include <linux/bug.h> | ||
47 | 48 | ||
48 | #include <linux/hw_breakpoint.h> | 49 | #include <linux/hw_breakpoint.h> |
49 | /* | 50 | /* |
@@ -85,9 +86,9 @@ __weak int hw_breakpoint_weight(struct perf_event *bp) | |||
85 | return 1; | 86 | return 1; |
86 | } | 87 | } |
87 | 88 | ||
88 | static inline enum bp_type_idx find_slot_idx(struct perf_event *bp) | 89 | static inline enum bp_type_idx find_slot_idx(u64 bp_type) |
89 | { | 90 | { |
90 | if (bp->attr.bp_type & HW_BREAKPOINT_RW) | 91 | if (bp_type & HW_BREAKPOINT_RW) |
91 | return TYPE_DATA; | 92 | return TYPE_DATA; |
92 | 93 | ||
93 | return TYPE_INST; | 94 | return TYPE_INST; |
@@ -122,7 +123,7 @@ static int task_bp_pinned(int cpu, struct perf_event *bp, enum bp_type_idx type) | |||
122 | 123 | ||
123 | list_for_each_entry(iter, &bp_task_head, hw.bp_list) { | 124 | list_for_each_entry(iter, &bp_task_head, hw.bp_list) { |
124 | if (iter->hw.target == tsk && | 125 | if (iter->hw.target == tsk && |
125 | find_slot_idx(iter) == type && | 126 | find_slot_idx(iter->attr.bp_type) == type && |
126 | (iter->cpu < 0 || cpu == iter->cpu)) | 127 | (iter->cpu < 0 || cpu == iter->cpu)) |
127 | count += hw_breakpoint_weight(iter); | 128 | count += hw_breakpoint_weight(iter); |
128 | } | 129 | } |
@@ -277,7 +278,7 @@ __weak void arch_unregister_hw_breakpoint(struct perf_event *bp) | |||
277 | * ((per_cpu(info->flexible, *) > 1) + max(per_cpu(info->cpu_pinned, *)) | 278 | * ((per_cpu(info->flexible, *) > 1) + max(per_cpu(info->cpu_pinned, *)) |
278 | * + max(per_cpu(info->tsk_pinned, *))) < HBP_NUM | 279 | * + max(per_cpu(info->tsk_pinned, *))) < HBP_NUM |
279 | */ | 280 | */ |
280 | static int __reserve_bp_slot(struct perf_event *bp) | 281 | static int __reserve_bp_slot(struct perf_event *bp, u64 bp_type) |
281 | { | 282 | { |
282 | struct bp_busy_slots slots = {0}; | 283 | struct bp_busy_slots slots = {0}; |
283 | enum bp_type_idx type; | 284 | enum bp_type_idx type; |
@@ -288,11 +289,11 @@ static int __reserve_bp_slot(struct perf_event *bp) | |||
288 | return -ENOMEM; | 289 | return -ENOMEM; |
289 | 290 | ||
290 | /* Basic checks */ | 291 | /* Basic checks */ |
291 | if (bp->attr.bp_type == HW_BREAKPOINT_EMPTY || | 292 | if (bp_type == HW_BREAKPOINT_EMPTY || |
292 | bp->attr.bp_type == HW_BREAKPOINT_INVALID) | 293 | bp_type == HW_BREAKPOINT_INVALID) |
293 | return -EINVAL; | 294 | return -EINVAL; |
294 | 295 | ||
295 | type = find_slot_idx(bp); | 296 | type = find_slot_idx(bp_type); |
296 | weight = hw_breakpoint_weight(bp); | 297 | weight = hw_breakpoint_weight(bp); |
297 | 298 | ||
298 | fetch_bp_busy_slots(&slots, bp, type); | 299 | fetch_bp_busy_slots(&slots, bp, type); |
@@ -317,19 +318,19 @@ int reserve_bp_slot(struct perf_event *bp) | |||
317 | 318 | ||
318 | mutex_lock(&nr_bp_mutex); | 319 | mutex_lock(&nr_bp_mutex); |
319 | 320 | ||
320 | ret = __reserve_bp_slot(bp); | 321 | ret = __reserve_bp_slot(bp, bp->attr.bp_type); |
321 | 322 | ||
322 | mutex_unlock(&nr_bp_mutex); | 323 | mutex_unlock(&nr_bp_mutex); |
323 | 324 | ||
324 | return ret; | 325 | return ret; |
325 | } | 326 | } |
326 | 327 | ||
327 | static void __release_bp_slot(struct perf_event *bp) | 328 | static void __release_bp_slot(struct perf_event *bp, u64 bp_type) |
328 | { | 329 | { |
329 | enum bp_type_idx type; | 330 | enum bp_type_idx type; |
330 | int weight; | 331 | int weight; |
331 | 332 | ||
332 | type = find_slot_idx(bp); | 333 | type = find_slot_idx(bp_type); |
333 | weight = hw_breakpoint_weight(bp); | 334 | weight = hw_breakpoint_weight(bp); |
334 | toggle_bp_slot(bp, false, type, weight); | 335 | toggle_bp_slot(bp, false, type, weight); |
335 | } | 336 | } |
@@ -339,11 +340,43 @@ void release_bp_slot(struct perf_event *bp) | |||
339 | mutex_lock(&nr_bp_mutex); | 340 | mutex_lock(&nr_bp_mutex); |
340 | 341 | ||
341 | arch_unregister_hw_breakpoint(bp); | 342 | arch_unregister_hw_breakpoint(bp); |
342 | __release_bp_slot(bp); | 343 | __release_bp_slot(bp, bp->attr.bp_type); |
343 | 344 | ||
344 | mutex_unlock(&nr_bp_mutex); | 345 | mutex_unlock(&nr_bp_mutex); |
345 | } | 346 | } |
346 | 347 | ||
348 | static int __modify_bp_slot(struct perf_event *bp, u64 old_type) | ||
349 | { | ||
350 | int err; | ||
351 | |||
352 | __release_bp_slot(bp, old_type); | ||
353 | |||
354 | err = __reserve_bp_slot(bp, bp->attr.bp_type); | ||
355 | if (err) { | ||
356 | /* | ||
357 | * Reserve the old_type slot back in case | ||
358 | * there's no space for the new type. | ||
359 | * | ||
360 | * This must succeed, because we just released | ||
361 | * the old_type slot in the __release_bp_slot | ||
362 | * call above. If not, something is broken. | ||
363 | */ | ||
364 | WARN_ON(__reserve_bp_slot(bp, old_type)); | ||
365 | } | ||
366 | |||
367 | return err; | ||
368 | } | ||
369 | |||
370 | static int modify_bp_slot(struct perf_event *bp, u64 old_type) | ||
371 | { | ||
372 | int ret; | ||
373 | |||
374 | mutex_lock(&nr_bp_mutex); | ||
375 | ret = __modify_bp_slot(bp, old_type); | ||
376 | mutex_unlock(&nr_bp_mutex); | ||
377 | return ret; | ||
378 | } | ||
379 | |||
347 | /* | 380 | /* |
348 | * Allow the kernel debugger to reserve breakpoint slots without | 381 | * Allow the kernel debugger to reserve breakpoint slots without |
349 | * taking a lock using the dbg_* variant of for the reserve and | 382 | * taking a lock using the dbg_* variant of for the reserve and |
@@ -354,7 +387,7 @@ int dbg_reserve_bp_slot(struct perf_event *bp) | |||
354 | if (mutex_is_locked(&nr_bp_mutex)) | 387 | if (mutex_is_locked(&nr_bp_mutex)) |
355 | return -1; | 388 | return -1; |
356 | 389 | ||
357 | return __reserve_bp_slot(bp); | 390 | return __reserve_bp_slot(bp, bp->attr.bp_type); |
358 | } | 391 | } |
359 | 392 | ||
360 | int dbg_release_bp_slot(struct perf_event *bp) | 393 | int dbg_release_bp_slot(struct perf_event *bp) |
@@ -362,7 +395,7 @@ int dbg_release_bp_slot(struct perf_event *bp) | |||
362 | if (mutex_is_locked(&nr_bp_mutex)) | 395 | if (mutex_is_locked(&nr_bp_mutex)) |
363 | return -1; | 396 | return -1; |
364 | 397 | ||
365 | __release_bp_slot(bp); | 398 | __release_bp_slot(bp, bp->attr.bp_type); |
366 | 399 | ||
367 | return 0; | 400 | return 0; |
368 | } | 401 | } |
@@ -423,20 +456,45 @@ register_user_hw_breakpoint(struct perf_event_attr *attr, | |||
423 | } | 456 | } |
424 | EXPORT_SYMBOL_GPL(register_user_hw_breakpoint); | 457 | EXPORT_SYMBOL_GPL(register_user_hw_breakpoint); |
425 | 458 | ||
459 | int | ||
460 | modify_user_hw_breakpoint_check(struct perf_event *bp, struct perf_event_attr *attr, | ||
461 | bool check) | ||
462 | { | ||
463 | u64 old_addr = bp->attr.bp_addr; | ||
464 | u64 old_len = bp->attr.bp_len; | ||
465 | int old_type = bp->attr.bp_type; | ||
466 | bool modify = attr->bp_type != old_type; | ||
467 | int err = 0; | ||
468 | |||
469 | bp->attr.bp_addr = attr->bp_addr; | ||
470 | bp->attr.bp_type = attr->bp_type; | ||
471 | bp->attr.bp_len = attr->bp_len; | ||
472 | |||
473 | if (check && memcmp(&bp->attr, attr, sizeof(*attr))) | ||
474 | return -EINVAL; | ||
475 | |||
476 | err = validate_hw_breakpoint(bp); | ||
477 | if (!err && modify) | ||
478 | err = modify_bp_slot(bp, old_type); | ||
479 | |||
480 | if (err) { | ||
481 | bp->attr.bp_addr = old_addr; | ||
482 | bp->attr.bp_type = old_type; | ||
483 | bp->attr.bp_len = old_len; | ||
484 | return err; | ||
485 | } | ||
486 | |||
487 | bp->attr.disabled = attr->disabled; | ||
488 | return 0; | ||
489 | } | ||
490 | |||
426 | /** | 491 | /** |
427 | * modify_user_hw_breakpoint - modify a user-space hardware breakpoint | 492 | * modify_user_hw_breakpoint - modify a user-space hardware breakpoint |
428 | * @bp: the breakpoint structure to modify | 493 | * @bp: the breakpoint structure to modify |
429 | * @attr: new breakpoint attributes | 494 | * @attr: new breakpoint attributes |
430 | * @triggered: callback to trigger when we hit the breakpoint | ||
431 | * @tsk: pointer to 'task_struct' of the process to which the address belongs | ||
432 | */ | 495 | */ |
433 | int modify_user_hw_breakpoint(struct perf_event *bp, struct perf_event_attr *attr) | 496 | int modify_user_hw_breakpoint(struct perf_event *bp, struct perf_event_attr *attr) |
434 | { | 497 | { |
435 | u64 old_addr = bp->attr.bp_addr; | ||
436 | u64 old_len = bp->attr.bp_len; | ||
437 | int old_type = bp->attr.bp_type; | ||
438 | int err = 0; | ||
439 | |||
440 | /* | 498 | /* |
441 | * modify_user_hw_breakpoint can be invoked with IRQs disabled and hence it | 499 | * modify_user_hw_breakpoint can be invoked with IRQs disabled and hence it |
442 | * will not be possible to raise IPIs that invoke __perf_event_disable. | 500 | * will not be possible to raise IPIs that invoke __perf_event_disable. |
@@ -448,30 +506,14 @@ int modify_user_hw_breakpoint(struct perf_event *bp, struct perf_event_attr *att | |||
448 | else | 506 | else |
449 | perf_event_disable(bp); | 507 | perf_event_disable(bp); |
450 | 508 | ||
451 | bp->attr.bp_addr = attr->bp_addr; | 509 | if (!attr->disabled) { |
452 | bp->attr.bp_type = attr->bp_type; | 510 | int err = modify_user_hw_breakpoint_check(bp, attr, false); |
453 | bp->attr.bp_len = attr->bp_len; | ||
454 | |||
455 | if (attr->disabled) | ||
456 | goto end; | ||
457 | 511 | ||
458 | err = validate_hw_breakpoint(bp); | 512 | if (err) |
459 | if (!err) | 513 | return err; |
460 | perf_event_enable(bp); | 514 | perf_event_enable(bp); |
461 | 515 | bp->attr.disabled = 0; | |
462 | if (err) { | ||
463 | bp->attr.bp_addr = old_addr; | ||
464 | bp->attr.bp_type = old_type; | ||
465 | bp->attr.bp_len = old_len; | ||
466 | if (!bp->attr.disabled) | ||
467 | perf_event_enable(bp); | ||
468 | |||
469 | return err; | ||
470 | } | 516 | } |
471 | |||
472 | end: | ||
473 | bp->attr.disabled = attr->disabled; | ||
474 | |||
475 | return 0; | 517 | return 0; |
476 | } | 518 | } |
477 | EXPORT_SYMBOL_GPL(modify_user_hw_breakpoint); | 519 | EXPORT_SYMBOL_GPL(modify_user_hw_breakpoint); |
diff --git a/kernel/exit.c b/kernel/exit.c index 995453d9fb55..c3c7ac560114 100644 --- a/kernel/exit.c +++ b/kernel/exit.c | |||
@@ -1691,7 +1691,7 @@ SYSCALL_DEFINE4(wait4, pid_t, upid, int __user *, stat_addr, | |||
1691 | */ | 1691 | */ |
1692 | SYSCALL_DEFINE3(waitpid, pid_t, pid, int __user *, stat_addr, int, options) | 1692 | SYSCALL_DEFINE3(waitpid, pid_t, pid, int __user *, stat_addr, int, options) |
1693 | { | 1693 | { |
1694 | return sys_wait4(pid, stat_addr, options, NULL); | 1694 | return kernel_wait4(pid, stat_addr, options, NULL); |
1695 | } | 1695 | } |
1696 | 1696 | ||
1697 | #endif | 1697 | #endif |
diff --git a/kernel/fork.c b/kernel/fork.c index e5d9d405ae4e..f71b67dc156d 100644 --- a/kernel/fork.c +++ b/kernel/fork.c | |||
@@ -1198,8 +1198,8 @@ void mm_release(struct task_struct *tsk, struct mm_struct *mm) | |||
1198 | * not set up a proper pointer then tough luck. | 1198 | * not set up a proper pointer then tough luck. |
1199 | */ | 1199 | */ |
1200 | put_user(0, tsk->clear_child_tid); | 1200 | put_user(0, tsk->clear_child_tid); |
1201 | sys_futex(tsk->clear_child_tid, FUTEX_WAKE, | 1201 | do_futex(tsk->clear_child_tid, FUTEX_WAKE, |
1202 | 1, NULL, NULL, 0); | 1202 | 1, NULL, NULL, 0, 0); |
1203 | } | 1203 | } |
1204 | tsk->clear_child_tid = NULL; | 1204 | tsk->clear_child_tid = NULL; |
1205 | } | 1205 | } |
@@ -2354,7 +2354,7 @@ static int unshare_fd(unsigned long unshare_flags, struct files_struct **new_fdp | |||
2354 | * constructed. Here we are modifying the current, active, | 2354 | * constructed. Here we are modifying the current, active, |
2355 | * task_struct. | 2355 | * task_struct. |
2356 | */ | 2356 | */ |
2357 | SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags) | 2357 | int ksys_unshare(unsigned long unshare_flags) |
2358 | { | 2358 | { |
2359 | struct fs_struct *fs, *new_fs = NULL; | 2359 | struct fs_struct *fs, *new_fs = NULL; |
2360 | struct files_struct *fd, *new_fd = NULL; | 2360 | struct files_struct *fd, *new_fd = NULL; |
@@ -2470,6 +2470,11 @@ bad_unshare_out: | |||
2470 | return err; | 2470 | return err; |
2471 | } | 2471 | } |
2472 | 2472 | ||
2473 | SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags) | ||
2474 | { | ||
2475 | return ksys_unshare(unshare_flags); | ||
2476 | } | ||
2477 | |||
2473 | /* | 2478 | /* |
2474 | * Helper to unshare the files of the current task. | 2479 | * Helper to unshare the files of the current task. |
2475 | * We don't want to expose copy_files internals to | 2480 | * We don't want to expose copy_files internals to |
diff --git a/kernel/kexec.c b/kernel/kexec.c index e62ec4dc6620..aed8fb2564b3 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c | |||
@@ -192,11 +192,9 @@ out: | |||
192 | * that to happen you need to do that yourself. | 192 | * that to happen you need to do that yourself. |
193 | */ | 193 | */ |
194 | 194 | ||
195 | SYSCALL_DEFINE4(kexec_load, unsigned long, entry, unsigned long, nr_segments, | 195 | static inline int kexec_load_check(unsigned long nr_segments, |
196 | struct kexec_segment __user *, segments, unsigned long, flags) | 196 | unsigned long flags) |
197 | { | 197 | { |
198 | int result; | ||
199 | |||
200 | /* We only trust the superuser with rebooting the system. */ | 198 | /* We only trust the superuser with rebooting the system. */ |
201 | if (!capable(CAP_SYS_BOOT) || kexec_load_disabled) | 199 | if (!capable(CAP_SYS_BOOT) || kexec_load_disabled) |
202 | return -EPERM; | 200 | return -EPERM; |
@@ -208,17 +206,29 @@ SYSCALL_DEFINE4(kexec_load, unsigned long, entry, unsigned long, nr_segments, | |||
208 | if ((flags & KEXEC_FLAGS) != (flags & ~KEXEC_ARCH_MASK)) | 206 | if ((flags & KEXEC_FLAGS) != (flags & ~KEXEC_ARCH_MASK)) |
209 | return -EINVAL; | 207 | return -EINVAL; |
210 | 208 | ||
211 | /* Verify we are on the appropriate architecture */ | ||
212 | if (((flags & KEXEC_ARCH_MASK) != KEXEC_ARCH) && | ||
213 | ((flags & KEXEC_ARCH_MASK) != KEXEC_ARCH_DEFAULT)) | ||
214 | return -EINVAL; | ||
215 | |||
216 | /* Put an artificial cap on the number | 209 | /* Put an artificial cap on the number |
217 | * of segments passed to kexec_load. | 210 | * of segments passed to kexec_load. |
218 | */ | 211 | */ |
219 | if (nr_segments > KEXEC_SEGMENT_MAX) | 212 | if (nr_segments > KEXEC_SEGMENT_MAX) |
220 | return -EINVAL; | 213 | return -EINVAL; |
221 | 214 | ||
215 | return 0; | ||
216 | } | ||
217 | |||
218 | SYSCALL_DEFINE4(kexec_load, unsigned long, entry, unsigned long, nr_segments, | ||
219 | struct kexec_segment __user *, segments, unsigned long, flags) | ||
220 | { | ||
221 | int result; | ||
222 | |||
223 | result = kexec_load_check(nr_segments, flags); | ||
224 | if (result) | ||
225 | return result; | ||
226 | |||
227 | /* Verify we are on the appropriate architecture */ | ||
228 | if (((flags & KEXEC_ARCH_MASK) != KEXEC_ARCH) && | ||
229 | ((flags & KEXEC_ARCH_MASK) != KEXEC_ARCH_DEFAULT)) | ||
230 | return -EINVAL; | ||
231 | |||
222 | /* Because we write directly to the reserved memory | 232 | /* Because we write directly to the reserved memory |
223 | * region when loading crash kernels we need a mutex here to | 233 | * region when loading crash kernels we need a mutex here to |
224 | * prevent multiple crash kernels from attempting to load | 234 | * prevent multiple crash kernels from attempting to load |
@@ -247,15 +257,16 @@ COMPAT_SYSCALL_DEFINE4(kexec_load, compat_ulong_t, entry, | |||
247 | struct kexec_segment out, __user *ksegments; | 257 | struct kexec_segment out, __user *ksegments; |
248 | unsigned long i, result; | 258 | unsigned long i, result; |
249 | 259 | ||
260 | result = kexec_load_check(nr_segments, flags); | ||
261 | if (result) | ||
262 | return result; | ||
263 | |||
250 | /* Don't allow clients that don't understand the native | 264 | /* Don't allow clients that don't understand the native |
251 | * architecture to do anything. | 265 | * architecture to do anything. |
252 | */ | 266 | */ |
253 | if ((flags & KEXEC_ARCH_MASK) == KEXEC_ARCH_DEFAULT) | 267 | if ((flags & KEXEC_ARCH_MASK) == KEXEC_ARCH_DEFAULT) |
254 | return -EINVAL; | 268 | return -EINVAL; |
255 | 269 | ||
256 | if (nr_segments > KEXEC_SEGMENT_MAX) | ||
257 | return -EINVAL; | ||
258 | |||
259 | ksegments = compat_alloc_user_space(nr_segments * sizeof(out)); | 270 | ksegments = compat_alloc_user_space(nr_segments * sizeof(out)); |
260 | for (i = 0; i < nr_segments; i++) { | 271 | for (i = 0; i < nr_segments; i++) { |
261 | result = copy_from_user(&in, &segments[i], sizeof(in)); | 272 | result = copy_from_user(&in, &segments[i], sizeof(in)); |
@@ -272,6 +283,21 @@ COMPAT_SYSCALL_DEFINE4(kexec_load, compat_ulong_t, entry, | |||
272 | return -EFAULT; | 283 | return -EFAULT; |
273 | } | 284 | } |
274 | 285 | ||
275 | return sys_kexec_load(entry, nr_segments, ksegments, flags); | 286 | /* Because we write directly to the reserved memory |
287 | * region when loading crash kernels we need a mutex here to | ||
288 | * prevent multiple crash kernels from attempting to load | ||
289 | * simultaneously, and to prevent a crash kernel from loading | ||
290 | * over the top of a in use crash kernel. | ||
291 | * | ||
292 | * KISS: always take the mutex. | ||
293 | */ | ||
294 | if (!mutex_trylock(&kexec_mutex)) | ||
295 | return -EBUSY; | ||
296 | |||
297 | result = do_kexec_load(entry, nr_segments, ksegments, flags); | ||
298 | |||
299 | mutex_unlock(&kexec_mutex); | ||
300 | |||
301 | return result; | ||
276 | } | 302 | } |
277 | #endif | 303 | #endif |
diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 89b5f83f1969..023386338269 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c | |||
@@ -556,9 +556,9 @@ static void print_lock(struct held_lock *hlock) | |||
556 | return; | 556 | return; |
557 | } | 557 | } |
558 | 558 | ||
559 | printk(KERN_CONT "%p", hlock->instance); | ||
559 | print_lock_name(lock_classes + class_idx - 1); | 560 | print_lock_name(lock_classes + class_idx - 1); |
560 | printk(KERN_CONT ", at: [<%p>] %pS\n", | 561 | printk(KERN_CONT ", at: %pS\n", (void *)hlock->acquire_ip); |
561 | (void *)hlock->acquire_ip, (void *)hlock->acquire_ip); | ||
562 | } | 562 | } |
563 | 563 | ||
564 | static void lockdep_print_held_locks(struct task_struct *curr) | 564 | static void lockdep_print_held_locks(struct task_struct *curr) |
@@ -808,7 +808,7 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force) | |||
808 | if (verbose(class)) { | 808 | if (verbose(class)) { |
809 | graph_unlock(); | 809 | graph_unlock(); |
810 | 810 | ||
811 | printk("\nnew class %p: %s", class->key, class->name); | 811 | printk("\nnew class %px: %s", class->key, class->name); |
812 | if (class->name_version > 1) | 812 | if (class->name_version > 1) |
813 | printk(KERN_CONT "#%d", class->name_version); | 813 | printk(KERN_CONT "#%d", class->name_version); |
814 | printk(KERN_CONT "\n"); | 814 | printk(KERN_CONT "\n"); |
@@ -1407,7 +1407,7 @@ static void print_lock_class_header(struct lock_class *class, int depth) | |||
1407 | } | 1407 | } |
1408 | printk("%*s }\n", depth, ""); | 1408 | printk("%*s }\n", depth, ""); |
1409 | 1409 | ||
1410 | printk("%*s ... key at: [<%p>] %pS\n", | 1410 | printk("%*s ... key at: [<%px>] %pS\n", |
1411 | depth, "", class->key, class->key); | 1411 | depth, "", class->key, class->key); |
1412 | } | 1412 | } |
1413 | 1413 | ||
@@ -2340,7 +2340,7 @@ cache_hit: | |||
2340 | 2340 | ||
2341 | if (very_verbose(class)) { | 2341 | if (very_verbose(class)) { |
2342 | printk("\nhash chain already cached, key: " | 2342 | printk("\nhash chain already cached, key: " |
2343 | "%016Lx tail class: [%p] %s\n", | 2343 | "%016Lx tail class: [%px] %s\n", |
2344 | (unsigned long long)chain_key, | 2344 | (unsigned long long)chain_key, |
2345 | class->key, class->name); | 2345 | class->key, class->name); |
2346 | } | 2346 | } |
@@ -2349,7 +2349,7 @@ cache_hit: | |||
2349 | } | 2349 | } |
2350 | 2350 | ||
2351 | if (very_verbose(class)) { | 2351 | if (very_verbose(class)) { |
2352 | printk("\nnew hash chain, key: %016Lx tail class: [%p] %s\n", | 2352 | printk("\nnew hash chain, key: %016Lx tail class: [%px] %s\n", |
2353 | (unsigned long long)chain_key, class->key, class->name); | 2353 | (unsigned long long)chain_key, class->key, class->name); |
2354 | } | 2354 | } |
2355 | 2355 | ||
@@ -2676,16 +2676,16 @@ check_usage_backwards(struct task_struct *curr, struct held_lock *this, | |||
2676 | void print_irqtrace_events(struct task_struct *curr) | 2676 | void print_irqtrace_events(struct task_struct *curr) |
2677 | { | 2677 | { |
2678 | printk("irq event stamp: %u\n", curr->irq_events); | 2678 | printk("irq event stamp: %u\n", curr->irq_events); |
2679 | printk("hardirqs last enabled at (%u): [<%p>] %pS\n", | 2679 | printk("hardirqs last enabled at (%u): [<%px>] %pS\n", |
2680 | curr->hardirq_enable_event, (void *)curr->hardirq_enable_ip, | 2680 | curr->hardirq_enable_event, (void *)curr->hardirq_enable_ip, |
2681 | (void *)curr->hardirq_enable_ip); | 2681 | (void *)curr->hardirq_enable_ip); |
2682 | printk("hardirqs last disabled at (%u): [<%p>] %pS\n", | 2682 | printk("hardirqs last disabled at (%u): [<%px>] %pS\n", |
2683 | curr->hardirq_disable_event, (void *)curr->hardirq_disable_ip, | 2683 | curr->hardirq_disable_event, (void *)curr->hardirq_disable_ip, |
2684 | (void *)curr->hardirq_disable_ip); | 2684 | (void *)curr->hardirq_disable_ip); |
2685 | printk("softirqs last enabled at (%u): [<%p>] %pS\n", | 2685 | printk("softirqs last enabled at (%u): [<%px>] %pS\n", |
2686 | curr->softirq_enable_event, (void *)curr->softirq_enable_ip, | 2686 | curr->softirq_enable_event, (void *)curr->softirq_enable_ip, |
2687 | (void *)curr->softirq_enable_ip); | 2687 | (void *)curr->softirq_enable_ip); |
2688 | printk("softirqs last disabled at (%u): [<%p>] %pS\n", | 2688 | printk("softirqs last disabled at (%u): [<%px>] %pS\n", |
2689 | curr->softirq_disable_event, (void *)curr->softirq_disable_ip, | 2689 | curr->softirq_disable_event, (void *)curr->softirq_disable_ip, |
2690 | (void *)curr->softirq_disable_ip); | 2690 | (void *)curr->softirq_disable_ip); |
2691 | } | 2691 | } |
@@ -3207,7 +3207,7 @@ static void __lockdep_init_map(struct lockdep_map *lock, const char *name, | |||
3207 | * Sanity check, the lock-class key must be persistent: | 3207 | * Sanity check, the lock-class key must be persistent: |
3208 | */ | 3208 | */ |
3209 | if (!static_obj(key)) { | 3209 | if (!static_obj(key)) { |
3210 | printk("BUG: key %p not in .data!\n", key); | 3210 | printk("BUG: key %px not in .data!\n", key); |
3211 | /* | 3211 | /* |
3212 | * What it says above ^^^^^, I suggest you read it. | 3212 | * What it says above ^^^^^, I suggest you read it. |
3213 | */ | 3213 | */ |
@@ -3322,7 +3322,7 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, | |||
3322 | } | 3322 | } |
3323 | atomic_inc((atomic_t *)&class->ops); | 3323 | atomic_inc((atomic_t *)&class->ops); |
3324 | if (very_verbose(class)) { | 3324 | if (very_verbose(class)) { |
3325 | printk("\nacquire class [%p] %s", class->key, class->name); | 3325 | printk("\nacquire class [%px] %s", class->key, class->name); |
3326 | if (class->name_version > 1) | 3326 | if (class->name_version > 1) |
3327 | printk(KERN_CONT "#%d", class->name_version); | 3327 | printk(KERN_CONT "#%d", class->name_version); |
3328 | printk(KERN_CONT "\n"); | 3328 | printk(KERN_CONT "\n"); |
@@ -4376,7 +4376,7 @@ print_freed_lock_bug(struct task_struct *curr, const void *mem_from, | |||
4376 | pr_warn("WARNING: held lock freed!\n"); | 4376 | pr_warn("WARNING: held lock freed!\n"); |
4377 | print_kernel_ident(); | 4377 | print_kernel_ident(); |
4378 | pr_warn("-------------------------\n"); | 4378 | pr_warn("-------------------------\n"); |
4379 | pr_warn("%s/%d is freeing memory %p-%p, with a lock still held there!\n", | 4379 | pr_warn("%s/%d is freeing memory %px-%px, with a lock still held there!\n", |
4380 | curr->comm, task_pid_nr(curr), mem_from, mem_to-1); | 4380 | curr->comm, task_pid_nr(curr), mem_from, mem_to-1); |
4381 | print_lock(hlock); | 4381 | print_lock(hlock); |
4382 | lockdep_print_held_locks(curr); | 4382 | lockdep_print_held_locks(curr); |
diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c index 940633c63254..4f014be7a4b8 100644 --- a/kernel/locking/rtmutex.c +++ b/kernel/locking/rtmutex.c | |||
@@ -1268,8 +1268,7 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state, | |||
1268 | 1268 | ||
1269 | if (unlikely(ret)) { | 1269 | if (unlikely(ret)) { |
1270 | __set_current_state(TASK_RUNNING); | 1270 | __set_current_state(TASK_RUNNING); |
1271 | if (rt_mutex_has_waiters(lock)) | 1271 | remove_waiter(lock, &waiter); |
1272 | remove_waiter(lock, &waiter); | ||
1273 | rt_mutex_handle_deadlock(ret, chwalk, &waiter); | 1272 | rt_mutex_handle_deadlock(ret, chwalk, &waiter); |
1274 | } | 1273 | } |
1275 | 1274 | ||
diff --git a/kernel/locking/rtmutex_common.h b/kernel/locking/rtmutex_common.h index 68686b3ec3c1..d1d62f942be2 100644 --- a/kernel/locking/rtmutex_common.h +++ b/kernel/locking/rtmutex_common.h | |||
@@ -52,12 +52,13 @@ static inline int rt_mutex_has_waiters(struct rt_mutex *lock) | |||
52 | static inline struct rt_mutex_waiter * | 52 | static inline struct rt_mutex_waiter * |
53 | rt_mutex_top_waiter(struct rt_mutex *lock) | 53 | rt_mutex_top_waiter(struct rt_mutex *lock) |
54 | { | 54 | { |
55 | struct rt_mutex_waiter *w; | 55 | struct rb_node *leftmost = rb_first_cached(&lock->waiters); |
56 | 56 | struct rt_mutex_waiter *w = NULL; | |
57 | w = rb_entry(lock->waiters.rb_leftmost, | ||
58 | struct rt_mutex_waiter, tree_entry); | ||
59 | BUG_ON(w->lock != lock); | ||
60 | 57 | ||
58 | if (leftmost) { | ||
59 | w = rb_entry(leftmost, struct rt_mutex_waiter, tree_entry); | ||
60 | BUG_ON(w->lock != lock); | ||
61 | } | ||
61 | return w; | 62 | return w; |
62 | } | 63 | } |
63 | 64 | ||
diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c index f549c552dbf1..30465a2f2b6c 100644 --- a/kernel/locking/rwsem.c +++ b/kernel/locking/rwsem.c | |||
@@ -117,6 +117,7 @@ EXPORT_SYMBOL(down_write_trylock); | |||
117 | void up_read(struct rw_semaphore *sem) | 117 | void up_read(struct rw_semaphore *sem) |
118 | { | 118 | { |
119 | rwsem_release(&sem->dep_map, 1, _RET_IP_); | 119 | rwsem_release(&sem->dep_map, 1, _RET_IP_); |
120 | DEBUG_RWSEMS_WARN_ON(sem->owner != RWSEM_READER_OWNED); | ||
120 | 121 | ||
121 | __up_read(sem); | 122 | __up_read(sem); |
122 | } | 123 | } |
@@ -129,6 +130,7 @@ EXPORT_SYMBOL(up_read); | |||
129 | void up_write(struct rw_semaphore *sem) | 130 | void up_write(struct rw_semaphore *sem) |
130 | { | 131 | { |
131 | rwsem_release(&sem->dep_map, 1, _RET_IP_); | 132 | rwsem_release(&sem->dep_map, 1, _RET_IP_); |
133 | DEBUG_RWSEMS_WARN_ON(sem->owner != current); | ||
132 | 134 | ||
133 | rwsem_clear_owner(sem); | 135 | rwsem_clear_owner(sem); |
134 | __up_write(sem); | 136 | __up_write(sem); |
@@ -142,6 +144,7 @@ EXPORT_SYMBOL(up_write); | |||
142 | void downgrade_write(struct rw_semaphore *sem) | 144 | void downgrade_write(struct rw_semaphore *sem) |
143 | { | 145 | { |
144 | lock_downgrade(&sem->dep_map, _RET_IP_); | 146 | lock_downgrade(&sem->dep_map, _RET_IP_); |
147 | DEBUG_RWSEMS_WARN_ON(sem->owner != current); | ||
145 | 148 | ||
146 | rwsem_set_reader_owned(sem); | 149 | rwsem_set_reader_owned(sem); |
147 | __downgrade_write(sem); | 150 | __downgrade_write(sem); |
@@ -211,6 +214,7 @@ EXPORT_SYMBOL(down_write_killable_nested); | |||
211 | 214 | ||
212 | void up_read_non_owner(struct rw_semaphore *sem) | 215 | void up_read_non_owner(struct rw_semaphore *sem) |
213 | { | 216 | { |
217 | DEBUG_RWSEMS_WARN_ON(sem->owner != RWSEM_READER_OWNED); | ||
214 | __up_read(sem); | 218 | __up_read(sem); |
215 | } | 219 | } |
216 | 220 | ||
diff --git a/kernel/locking/rwsem.h b/kernel/locking/rwsem.h index a883b8f1fdc6..a17cba8d94bb 100644 --- a/kernel/locking/rwsem.h +++ b/kernel/locking/rwsem.h | |||
@@ -16,6 +16,12 @@ | |||
16 | */ | 16 | */ |
17 | #define RWSEM_READER_OWNED ((struct task_struct *)1UL) | 17 | #define RWSEM_READER_OWNED ((struct task_struct *)1UL) |
18 | 18 | ||
19 | #ifdef CONFIG_DEBUG_RWSEMS | ||
20 | # define DEBUG_RWSEMS_WARN_ON(c) DEBUG_LOCKS_WARN_ON(c) | ||
21 | #else | ||
22 | # define DEBUG_RWSEMS_WARN_ON(c) | ||
23 | #endif | ||
24 | |||
19 | #ifdef CONFIG_RWSEM_SPIN_ON_OWNER | 25 | #ifdef CONFIG_RWSEM_SPIN_ON_OWNER |
20 | /* | 26 | /* |
21 | * All writes to owner are protected by WRITE_ONCE() to make sure that | 27 | * All writes to owner are protected by WRITE_ONCE() to make sure that |
@@ -41,7 +47,7 @@ static inline void rwsem_set_reader_owned(struct rw_semaphore *sem) | |||
41 | * do a write to the rwsem cacheline when it is really necessary | 47 | * do a write to the rwsem cacheline when it is really necessary |
42 | * to minimize cacheline contention. | 48 | * to minimize cacheline contention. |
43 | */ | 49 | */ |
44 | if (sem->owner != RWSEM_READER_OWNED) | 50 | if (READ_ONCE(sem->owner) != RWSEM_READER_OWNED) |
45 | WRITE_ONCE(sem->owner, RWSEM_READER_OWNED); | 51 | WRITE_ONCE(sem->owner, RWSEM_READER_OWNED); |
46 | } | 52 | } |
47 | 53 | ||
diff --git a/kernel/module.c b/kernel/module.c index e42764acedb4..a6e43a5806a1 100644 --- a/kernel/module.c +++ b/kernel/module.c | |||
@@ -2181,10 +2181,6 @@ static void free_module(struct module *mod) | |||
2181 | /* Finally, free the core (containing the module structure) */ | 2181 | /* Finally, free the core (containing the module structure) */ |
2182 | disable_ro_nx(&mod->core_layout); | 2182 | disable_ro_nx(&mod->core_layout); |
2183 | module_memfree(mod->core_layout.base); | 2183 | module_memfree(mod->core_layout.base); |
2184 | |||
2185 | #ifdef CONFIG_MPU | ||
2186 | update_protections(current->mm); | ||
2187 | #endif | ||
2188 | } | 2184 | } |
2189 | 2185 | ||
2190 | void *__symbol_get(const char *symbol) | 2186 | void *__symbol_get(const char *symbol) |
diff --git a/kernel/panic.c b/kernel/panic.c index 4b794f1d8561..9d833d913c84 100644 --- a/kernel/panic.c +++ b/kernel/panic.c | |||
@@ -289,7 +289,7 @@ void panic(const char *fmt, ...) | |||
289 | disabled_wait(caller); | 289 | disabled_wait(caller); |
290 | } | 290 | } |
291 | #endif | 291 | #endif |
292 | pr_emerg("---[ end Kernel panic - not syncing: %s\n", buf); | 292 | pr_emerg("---[ end Kernel panic - not syncing: %s ]---\n", buf); |
293 | local_irq_enable(); | 293 | local_irq_enable(); |
294 | for (i = 0; ; i += PANIC_TIMER_STEP) { | 294 | for (i = 0; ; i += PANIC_TIMER_STEP) { |
295 | touch_softlockup_watchdog(); | 295 | touch_softlockup_watchdog(); |
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index 0b53eef7d34b..93b57f026688 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c | |||
@@ -242,16 +242,16 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns) | |||
242 | 242 | ||
243 | /* | 243 | /* |
244 | * Reap the EXIT_ZOMBIE children we had before we ignored SIGCHLD. | 244 | * Reap the EXIT_ZOMBIE children we had before we ignored SIGCHLD. |
245 | * sys_wait4() will also block until our children traced from the | 245 | * kernel_wait4() will also block until our children traced from the |
246 | * parent namespace are detached and become EXIT_DEAD. | 246 | * parent namespace are detached and become EXIT_DEAD. |
247 | */ | 247 | */ |
248 | do { | 248 | do { |
249 | clear_thread_flag(TIF_SIGPENDING); | 249 | clear_thread_flag(TIF_SIGPENDING); |
250 | rc = sys_wait4(-1, NULL, __WALL, NULL); | 250 | rc = kernel_wait4(-1, NULL, __WALL, NULL); |
251 | } while (rc != -ECHILD); | 251 | } while (rc != -ECHILD); |
252 | 252 | ||
253 | /* | 253 | /* |
254 | * sys_wait4() above can't reap the EXIT_DEAD children but we do not | 254 | * kernel_wait4() above can't reap the EXIT_DEAD children but we do not |
255 | * really care, we could reparent them to the global init. We could | 255 | * really care, we could reparent them to the global init. We could |
256 | * exit and reap ->child_reaper even if it is not the last thread in | 256 | * exit and reap ->child_reaper even if it is not the last thread in |
257 | * this pid_ns, free_pid(pid_allocated == 0) calls proc_cleanup_work(), | 257 | * this pid_ns, free_pid(pid_allocated == 0) calls proc_cleanup_work(), |
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index a5c36e9c56a6..4710f1b142fc 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c | |||
@@ -701,7 +701,7 @@ int hibernate(void) | |||
701 | } | 701 | } |
702 | 702 | ||
703 | pr_info("Syncing filesystems ... \n"); | 703 | pr_info("Syncing filesystems ... \n"); |
704 | sys_sync(); | 704 | ksys_sync(); |
705 | pr_info("done.\n"); | 705 | pr_info("done.\n"); |
706 | 706 | ||
707 | error = freeze_processes(); | 707 | error = freeze_processes(); |
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index 0685c4499431..4c10be0f4843 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c | |||
@@ -560,7 +560,7 @@ static int enter_state(suspend_state_t state) | |||
560 | #ifndef CONFIG_SUSPEND_SKIP_SYNC | 560 | #ifndef CONFIG_SUSPEND_SKIP_SYNC |
561 | trace_suspend_resume(TPS("sync_filesystems"), 0, true); | 561 | trace_suspend_resume(TPS("sync_filesystems"), 0, true); |
562 | pr_info("Syncing filesystems ... "); | 562 | pr_info("Syncing filesystems ... "); |
563 | sys_sync(); | 563 | ksys_sync(); |
564 | pr_cont("done.\n"); | 564 | pr_cont("done.\n"); |
565 | trace_suspend_resume(TPS("sync_filesystems"), 0, false); | 565 | trace_suspend_resume(TPS("sync_filesystems"), 0, false); |
566 | #endif | 566 | #endif |
diff --git a/kernel/power/user.c b/kernel/power/user.c index 22df9f7ff672..75c959de4b29 100644 --- a/kernel/power/user.c +++ b/kernel/power/user.c | |||
@@ -224,7 +224,7 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd, | |||
224 | break; | 224 | break; |
225 | 225 | ||
226 | printk("Syncing filesystems ... "); | 226 | printk("Syncing filesystems ... "); |
227 | sys_sync(); | 227 | ksys_sync(); |
228 | printk("done.\n"); | 228 | printk("done.\n"); |
229 | 229 | ||
230 | error = freeze_processes(); | 230 | error = freeze_processes(); |
diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h index 6334f2c1abd0..7a693e31184a 100644 --- a/kernel/rcu/rcu.h +++ b/kernel/rcu/rcu.h | |||
@@ -77,12 +77,18 @@ static inline void rcu_seq_start(unsigned long *sp) | |||
77 | WARN_ON_ONCE(rcu_seq_state(*sp) != 1); | 77 | WARN_ON_ONCE(rcu_seq_state(*sp) != 1); |
78 | } | 78 | } |
79 | 79 | ||
80 | /* Compute the end-of-grace-period value for the specified sequence number. */ | ||
81 | static inline unsigned long rcu_seq_endval(unsigned long *sp) | ||
82 | { | ||
83 | return (*sp | RCU_SEQ_STATE_MASK) + 1; | ||
84 | } | ||
85 | |||
80 | /* Adjust sequence number for end of update-side operation. */ | 86 | /* Adjust sequence number for end of update-side operation. */ |
81 | static inline void rcu_seq_end(unsigned long *sp) | 87 | static inline void rcu_seq_end(unsigned long *sp) |
82 | { | 88 | { |
83 | smp_mb(); /* Ensure update-side operation before counter increment. */ | 89 | smp_mb(); /* Ensure update-side operation before counter increment. */ |
84 | WARN_ON_ONCE(!rcu_seq_state(*sp)); | 90 | WARN_ON_ONCE(!rcu_seq_state(*sp)); |
85 | WRITE_ONCE(*sp, (*sp | RCU_SEQ_STATE_MASK) + 1); | 91 | WRITE_ONCE(*sp, rcu_seq_endval(sp)); |
86 | } | 92 | } |
87 | 93 | ||
88 | /* Take a snapshot of the update side's sequence number. */ | 94 | /* Take a snapshot of the update side's sequence number. */ |
@@ -295,9 +301,19 @@ static inline void rcu_init_levelspread(int *levelspread, const int *levelcnt) | |||
295 | * Iterate over all possible CPUs in a leaf RCU node. | 301 | * Iterate over all possible CPUs in a leaf RCU node. |
296 | */ | 302 | */ |
297 | #define for_each_leaf_node_possible_cpu(rnp, cpu) \ | 303 | #define for_each_leaf_node_possible_cpu(rnp, cpu) \ |
298 | for ((cpu) = cpumask_next(rnp->grplo - 1, cpu_possible_mask); \ | 304 | for ((cpu) = cpumask_next((rnp)->grplo - 1, cpu_possible_mask); \ |
299 | cpu <= rnp->grphi; \ | 305 | (cpu) <= rnp->grphi; \ |
300 | cpu = cpumask_next((cpu), cpu_possible_mask)) | 306 | (cpu) = cpumask_next((cpu), cpu_possible_mask)) |
307 | |||
308 | /* | ||
309 | * Iterate over all CPUs in a leaf RCU node's specified mask. | ||
310 | */ | ||
311 | #define rcu_find_next_bit(rnp, cpu, mask) \ | ||
312 | ((rnp)->grplo + find_next_bit(&(mask), BITS_PER_LONG, (cpu))) | ||
313 | #define for_each_leaf_node_cpu_mask(rnp, cpu, mask) \ | ||
314 | for ((cpu) = rcu_find_next_bit((rnp), 0, (mask)); \ | ||
315 | (cpu) <= rnp->grphi; \ | ||
316 | (cpu) = rcu_find_next_bit((rnp), (cpu) + 1 - (rnp->grplo), (mask))) | ||
301 | 317 | ||
302 | /* | 318 | /* |
303 | * Wrappers for the rcu_node::lock acquire and release. | 319 | * Wrappers for the rcu_node::lock acquire and release. |
@@ -337,7 +353,7 @@ do { \ | |||
337 | } while (0) | 353 | } while (0) |
338 | 354 | ||
339 | #define raw_spin_unlock_irqrestore_rcu_node(p, flags) \ | 355 | #define raw_spin_unlock_irqrestore_rcu_node(p, flags) \ |
340 | raw_spin_unlock_irqrestore(&ACCESS_PRIVATE(p, lock), flags) \ | 356 | raw_spin_unlock_irqrestore(&ACCESS_PRIVATE(p, lock), flags) |
341 | 357 | ||
342 | #define raw_spin_trylock_rcu_node(p) \ | 358 | #define raw_spin_trylock_rcu_node(p) \ |
343 | ({ \ | 359 | ({ \ |
@@ -348,6 +364,9 @@ do { \ | |||
348 | ___locked; \ | 364 | ___locked; \ |
349 | }) | 365 | }) |
350 | 366 | ||
367 | #define raw_lockdep_assert_held_rcu_node(p) \ | ||
368 | lockdep_assert_held(&ACCESS_PRIVATE(p, lock)) | ||
369 | |||
351 | #endif /* #if defined(SRCU) || !defined(TINY_RCU) */ | 370 | #endif /* #if defined(SRCU) || !defined(TINY_RCU) */ |
352 | 371 | ||
353 | #ifdef CONFIG_TINY_RCU | 372 | #ifdef CONFIG_TINY_RCU |
@@ -356,24 +375,20 @@ static inline bool rcu_gp_is_normal(void) { return true; } | |||
356 | static inline bool rcu_gp_is_expedited(void) { return false; } | 375 | static inline bool rcu_gp_is_expedited(void) { return false; } |
357 | static inline void rcu_expedite_gp(void) { } | 376 | static inline void rcu_expedite_gp(void) { } |
358 | static inline void rcu_unexpedite_gp(void) { } | 377 | static inline void rcu_unexpedite_gp(void) { } |
378 | static inline void rcu_request_urgent_qs_task(struct task_struct *t) { } | ||
359 | #else /* #ifdef CONFIG_TINY_RCU */ | 379 | #else /* #ifdef CONFIG_TINY_RCU */ |
360 | bool rcu_gp_is_normal(void); /* Internal RCU use. */ | 380 | bool rcu_gp_is_normal(void); /* Internal RCU use. */ |
361 | bool rcu_gp_is_expedited(void); /* Internal RCU use. */ | 381 | bool rcu_gp_is_expedited(void); /* Internal RCU use. */ |
362 | void rcu_expedite_gp(void); | 382 | void rcu_expedite_gp(void); |
363 | void rcu_unexpedite_gp(void); | 383 | void rcu_unexpedite_gp(void); |
364 | void rcupdate_announce_bootup_oddness(void); | 384 | void rcupdate_announce_bootup_oddness(void); |
385 | void rcu_request_urgent_qs_task(struct task_struct *t); | ||
365 | #endif /* #else #ifdef CONFIG_TINY_RCU */ | 386 | #endif /* #else #ifdef CONFIG_TINY_RCU */ |
366 | 387 | ||
367 | #define RCU_SCHEDULER_INACTIVE 0 | 388 | #define RCU_SCHEDULER_INACTIVE 0 |
368 | #define RCU_SCHEDULER_INIT 1 | 389 | #define RCU_SCHEDULER_INIT 1 |
369 | #define RCU_SCHEDULER_RUNNING 2 | 390 | #define RCU_SCHEDULER_RUNNING 2 |
370 | 391 | ||
371 | #ifdef CONFIG_TINY_RCU | ||
372 | static inline void rcu_request_urgent_qs_task(struct task_struct *t) { } | ||
373 | #else /* #ifdef CONFIG_TINY_RCU */ | ||
374 | void rcu_request_urgent_qs_task(struct task_struct *t); | ||
375 | #endif /* #else #ifdef CONFIG_TINY_RCU */ | ||
376 | |||
377 | enum rcutorture_type { | 392 | enum rcutorture_type { |
378 | RCU_FLAVOR, | 393 | RCU_FLAVOR, |
379 | RCU_BH_FLAVOR, | 394 | RCU_BH_FLAVOR, |
@@ -470,6 +485,7 @@ void show_rcu_gp_kthreads(void); | |||
470 | void rcu_force_quiescent_state(void); | 485 | void rcu_force_quiescent_state(void); |
471 | void rcu_bh_force_quiescent_state(void); | 486 | void rcu_bh_force_quiescent_state(void); |
472 | void rcu_sched_force_quiescent_state(void); | 487 | void rcu_sched_force_quiescent_state(void); |
488 | extern struct workqueue_struct *rcu_gp_wq; | ||
473 | #endif /* #else #ifdef CONFIG_TINY_RCU */ | 489 | #endif /* #else #ifdef CONFIG_TINY_RCU */ |
474 | 490 | ||
475 | #ifdef CONFIG_RCU_NOCB_CPU | 491 | #ifdef CONFIG_RCU_NOCB_CPU |
diff --git a/kernel/rcu/rcuperf.c b/kernel/rcu/rcuperf.c index d1ebdf9868bb..777e7a6a0292 100644 --- a/kernel/rcu/rcuperf.c +++ b/kernel/rcu/rcuperf.c | |||
@@ -61,11 +61,30 @@ MODULE_AUTHOR("Paul E. McKenney <paulmck@linux.vnet.ibm.com>"); | |||
61 | #define VERBOSE_PERFOUT_ERRSTRING(s) \ | 61 | #define VERBOSE_PERFOUT_ERRSTRING(s) \ |
62 | do { if (verbose) pr_alert("%s" PERF_FLAG "!!! %s\n", perf_type, s); } while (0) | 62 | do { if (verbose) pr_alert("%s" PERF_FLAG "!!! %s\n", perf_type, s); } while (0) |
63 | 63 | ||
64 | /* | ||
65 | * The intended use cases for the nreaders and nwriters module parameters | ||
66 | * are as follows: | ||
67 | * | ||
68 | * 1. Specify only the nr_cpus kernel boot parameter. This will | ||
69 | * set both nreaders and nwriters to the value specified by | ||
70 | * nr_cpus for a mixed reader/writer test. | ||
71 | * | ||
72 | * 2. Specify the nr_cpus kernel boot parameter, but set | ||
73 | * rcuperf.nreaders to zero. This will set nwriters to the | ||
74 | * value specified by nr_cpus for an update-only test. | ||
75 | * | ||
76 | * 3. Specify the nr_cpus kernel boot parameter, but set | ||
77 | * rcuperf.nwriters to zero. This will set nreaders to the | ||
78 | * value specified by nr_cpus for a read-only test. | ||
79 | * | ||
80 | * Various other use cases may of course be specified. | ||
81 | */ | ||
82 | |||
64 | torture_param(bool, gp_async, false, "Use asynchronous GP wait primitives"); | 83 | torture_param(bool, gp_async, false, "Use asynchronous GP wait primitives"); |
65 | torture_param(int, gp_async_max, 1000, "Max # outstanding waits per reader"); | 84 | torture_param(int, gp_async_max, 1000, "Max # outstanding waits per reader"); |
66 | torture_param(bool, gp_exp, false, "Use expedited GP wait primitives"); | 85 | torture_param(bool, gp_exp, false, "Use expedited GP wait primitives"); |
67 | torture_param(int, holdoff, 10, "Holdoff time before test start (s)"); | 86 | torture_param(int, holdoff, 10, "Holdoff time before test start (s)"); |
68 | torture_param(int, nreaders, 0, "Number of RCU reader threads"); | 87 | torture_param(int, nreaders, -1, "Number of RCU reader threads"); |
69 | torture_param(int, nwriters, -1, "Number of RCU updater threads"); | 88 | torture_param(int, nwriters, -1, "Number of RCU updater threads"); |
70 | torture_param(bool, shutdown, !IS_ENABLED(MODULE), | 89 | torture_param(bool, shutdown, !IS_ENABLED(MODULE), |
71 | "Shutdown at end of performance tests."); | 90 | "Shutdown at end of performance tests."); |
diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 308e6fdbced8..680c96d8c00f 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c | |||
@@ -909,34 +909,38 @@ rcu_torture_writer(void *arg) | |||
909 | int nsynctypes = 0; | 909 | int nsynctypes = 0; |
910 | 910 | ||
911 | VERBOSE_TOROUT_STRING("rcu_torture_writer task started"); | 911 | VERBOSE_TOROUT_STRING("rcu_torture_writer task started"); |
912 | if (!can_expedite) { | 912 | if (!can_expedite) |
913 | pr_alert("%s" TORTURE_FLAG | 913 | pr_alert("%s" TORTURE_FLAG |
914 | " GP expediting controlled from boot/sysfs for %s,\n", | 914 | " GP expediting controlled from boot/sysfs for %s.\n", |
915 | torture_type, cur_ops->name); | 915 | torture_type, cur_ops->name); |
916 | pr_alert("%s" TORTURE_FLAG | ||
917 | " Disabled dynamic grace-period expediting.\n", | ||
918 | torture_type); | ||
919 | } | ||
920 | 916 | ||
921 | /* Initialize synctype[] array. If none set, take default. */ | 917 | /* Initialize synctype[] array. If none set, take default. */ |
922 | if (!gp_cond1 && !gp_exp1 && !gp_normal1 && !gp_sync1) | 918 | if (!gp_cond1 && !gp_exp1 && !gp_normal1 && !gp_sync1) |
923 | gp_cond1 = gp_exp1 = gp_normal1 = gp_sync1 = true; | 919 | gp_cond1 = gp_exp1 = gp_normal1 = gp_sync1 = true; |
924 | if (gp_cond1 && cur_ops->get_state && cur_ops->cond_sync) | 920 | if (gp_cond1 && cur_ops->get_state && cur_ops->cond_sync) { |
925 | synctype[nsynctypes++] = RTWS_COND_GET; | 921 | synctype[nsynctypes++] = RTWS_COND_GET; |
926 | else if (gp_cond && (!cur_ops->get_state || !cur_ops->cond_sync)) | 922 | pr_info("%s: Testing conditional GPs.\n", __func__); |
927 | pr_alert("rcu_torture_writer: gp_cond without primitives.\n"); | 923 | } else if (gp_cond && (!cur_ops->get_state || !cur_ops->cond_sync)) { |
928 | if (gp_exp1 && cur_ops->exp_sync) | 924 | pr_alert("%s: gp_cond without primitives.\n", __func__); |
925 | } | ||
926 | if (gp_exp1 && cur_ops->exp_sync) { | ||
929 | synctype[nsynctypes++] = RTWS_EXP_SYNC; | 927 | synctype[nsynctypes++] = RTWS_EXP_SYNC; |
930 | else if (gp_exp && !cur_ops->exp_sync) | 928 | pr_info("%s: Testing expedited GPs.\n", __func__); |
931 | pr_alert("rcu_torture_writer: gp_exp without primitives.\n"); | 929 | } else if (gp_exp && !cur_ops->exp_sync) { |
932 | if (gp_normal1 && cur_ops->deferred_free) | 930 | pr_alert("%s: gp_exp without primitives.\n", __func__); |
931 | } | ||
932 | if (gp_normal1 && cur_ops->deferred_free) { | ||
933 | synctype[nsynctypes++] = RTWS_DEF_FREE; | 933 | synctype[nsynctypes++] = RTWS_DEF_FREE; |
934 | else if (gp_normal && !cur_ops->deferred_free) | 934 | pr_info("%s: Testing asynchronous GPs.\n", __func__); |
935 | pr_alert("rcu_torture_writer: gp_normal without primitives.\n"); | 935 | } else if (gp_normal && !cur_ops->deferred_free) { |
936 | if (gp_sync1 && cur_ops->sync) | 936 | pr_alert("%s: gp_normal without primitives.\n", __func__); |
937 | } | ||
938 | if (gp_sync1 && cur_ops->sync) { | ||
937 | synctype[nsynctypes++] = RTWS_SYNC; | 939 | synctype[nsynctypes++] = RTWS_SYNC; |
938 | else if (gp_sync && !cur_ops->sync) | 940 | pr_info("%s: Testing normal GPs.\n", __func__); |
939 | pr_alert("rcu_torture_writer: gp_sync without primitives.\n"); | 941 | } else if (gp_sync && !cur_ops->sync) { |
942 | pr_alert("%s: gp_sync without primitives.\n", __func__); | ||
943 | } | ||
940 | if (WARN_ONCE(nsynctypes == 0, | 944 | if (WARN_ONCE(nsynctypes == 0, |
941 | "rcu_torture_writer: No update-side primitives.\n")) { | 945 | "rcu_torture_writer: No update-side primitives.\n")) { |
942 | /* | 946 | /* |
@@ -1011,6 +1015,9 @@ rcu_torture_writer(void *arg) | |||
1011 | rcu_unexpedite_gp(); | 1015 | rcu_unexpedite_gp(); |
1012 | if (++expediting > 3) | 1016 | if (++expediting > 3) |
1013 | expediting = -expediting; | 1017 | expediting = -expediting; |
1018 | } else if (!can_expedite) { /* Disabled during boot, recheck. */ | ||
1019 | can_expedite = !rcu_gp_is_expedited() && | ||
1020 | !rcu_gp_is_normal(); | ||
1014 | } | 1021 | } |
1015 | rcu_torture_writer_state = RTWS_STUTTER; | 1022 | rcu_torture_writer_state = RTWS_STUTTER; |
1016 | stutter_wait("rcu_torture_writer"); | 1023 | stutter_wait("rcu_torture_writer"); |
@@ -1021,6 +1028,10 @@ rcu_torture_writer(void *arg) | |||
1021 | while (can_expedite && expediting++ < 0) | 1028 | while (can_expedite && expediting++ < 0) |
1022 | rcu_unexpedite_gp(); | 1029 | rcu_unexpedite_gp(); |
1023 | WARN_ON_ONCE(can_expedite && rcu_gp_is_expedited()); | 1030 | WARN_ON_ONCE(can_expedite && rcu_gp_is_expedited()); |
1031 | if (!can_expedite) | ||
1032 | pr_alert("%s" TORTURE_FLAG | ||
1033 | " Dynamic grace-period expediting was disabled.\n", | ||
1034 | torture_type); | ||
1024 | rcu_torture_writer_state = RTWS_STOPPING; | 1035 | rcu_torture_writer_state = RTWS_STOPPING; |
1025 | torture_kthread_stopping("rcu_torture_writer"); | 1036 | torture_kthread_stopping("rcu_torture_writer"); |
1026 | return 0; | 1037 | return 0; |
@@ -1045,13 +1056,13 @@ rcu_torture_fakewriter(void *arg) | |||
1045 | torture_random(&rand) % (nfakewriters * 8) == 0) { | 1056 | torture_random(&rand) % (nfakewriters * 8) == 0) { |
1046 | cur_ops->cb_barrier(); | 1057 | cur_ops->cb_barrier(); |
1047 | } else if (gp_normal == gp_exp) { | 1058 | } else if (gp_normal == gp_exp) { |
1048 | if (torture_random(&rand) & 0x80) | 1059 | if (cur_ops->sync && torture_random(&rand) & 0x80) |
1049 | cur_ops->sync(); | 1060 | cur_ops->sync(); |
1050 | else | 1061 | else if (cur_ops->exp_sync) |
1051 | cur_ops->exp_sync(); | 1062 | cur_ops->exp_sync(); |
1052 | } else if (gp_normal) { | 1063 | } else if (gp_normal && cur_ops->sync) { |
1053 | cur_ops->sync(); | 1064 | cur_ops->sync(); |
1054 | } else { | 1065 | } else if (cur_ops->exp_sync) { |
1055 | cur_ops->exp_sync(); | 1066 | cur_ops->exp_sync(); |
1056 | } | 1067 | } |
1057 | stutter_wait("rcu_torture_fakewriter"); | 1068 | stutter_wait("rcu_torture_fakewriter"); |
@@ -1557,11 +1568,10 @@ static int rcu_torture_barrier_init(void) | |||
1557 | atomic_set(&barrier_cbs_count, 0); | 1568 | atomic_set(&barrier_cbs_count, 0); |
1558 | atomic_set(&barrier_cbs_invoked, 0); | 1569 | atomic_set(&barrier_cbs_invoked, 0); |
1559 | barrier_cbs_tasks = | 1570 | barrier_cbs_tasks = |
1560 | kzalloc(n_barrier_cbs * sizeof(barrier_cbs_tasks[0]), | 1571 | kcalloc(n_barrier_cbs, sizeof(barrier_cbs_tasks[0]), |
1561 | GFP_KERNEL); | 1572 | GFP_KERNEL); |
1562 | barrier_cbs_wq = | 1573 | barrier_cbs_wq = |
1563 | kzalloc(n_barrier_cbs * sizeof(barrier_cbs_wq[0]), | 1574 | kcalloc(n_barrier_cbs, sizeof(barrier_cbs_wq[0]), GFP_KERNEL); |
1564 | GFP_KERNEL); | ||
1565 | if (barrier_cbs_tasks == NULL || !barrier_cbs_wq) | 1575 | if (barrier_cbs_tasks == NULL || !barrier_cbs_wq) |
1566 | return -ENOMEM; | 1576 | return -ENOMEM; |
1567 | for (i = 0; i < n_barrier_cbs; i++) { | 1577 | for (i = 0; i < n_barrier_cbs; i++) { |
@@ -1674,7 +1684,7 @@ static void rcu_torture_err_cb(struct rcu_head *rhp) | |||
1674 | * next grace period. Unlikely, but can happen. If it | 1684 | * next grace period. Unlikely, but can happen. If it |
1675 | * does happen, the debug-objects subsystem won't have splatted. | 1685 | * does happen, the debug-objects subsystem won't have splatted. |
1676 | */ | 1686 | */ |
1677 | pr_alert("rcutorture: duplicated callback was invoked.\n"); | 1687 | pr_alert("%s: duplicated callback was invoked.\n", KBUILD_MODNAME); |
1678 | } | 1688 | } |
1679 | #endif /* #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */ | 1689 | #endif /* #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */ |
1680 | 1690 | ||
@@ -1691,7 +1701,7 @@ static void rcu_test_debug_objects(void) | |||
1691 | 1701 | ||
1692 | init_rcu_head_on_stack(&rh1); | 1702 | init_rcu_head_on_stack(&rh1); |
1693 | init_rcu_head_on_stack(&rh2); | 1703 | init_rcu_head_on_stack(&rh2); |
1694 | pr_alert("rcutorture: WARN: Duplicate call_rcu() test starting.\n"); | 1704 | pr_alert("%s: WARN: Duplicate call_rcu() test starting.\n", KBUILD_MODNAME); |
1695 | 1705 | ||
1696 | /* Try to queue the rh2 pair of callbacks for the same grace period. */ | 1706 | /* Try to queue the rh2 pair of callbacks for the same grace period. */ |
1697 | preempt_disable(); /* Prevent preemption from interrupting test. */ | 1707 | preempt_disable(); /* Prevent preemption from interrupting test. */ |
@@ -1706,11 +1716,11 @@ static void rcu_test_debug_objects(void) | |||
1706 | 1716 | ||
1707 | /* Wait for them all to get done so we can safely return. */ | 1717 | /* Wait for them all to get done so we can safely return. */ |
1708 | rcu_barrier(); | 1718 | rcu_barrier(); |
1709 | pr_alert("rcutorture: WARN: Duplicate call_rcu() test complete.\n"); | 1719 | pr_alert("%s: WARN: Duplicate call_rcu() test complete.\n", KBUILD_MODNAME); |
1710 | destroy_rcu_head_on_stack(&rh1); | 1720 | destroy_rcu_head_on_stack(&rh1); |
1711 | destroy_rcu_head_on_stack(&rh2); | 1721 | destroy_rcu_head_on_stack(&rh2); |
1712 | #else /* #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */ | 1722 | #else /* #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */ |
1713 | pr_alert("rcutorture: !CONFIG_DEBUG_OBJECTS_RCU_HEAD, not testing duplicate call_rcu()\n"); | 1723 | pr_alert("%s: !CONFIG_DEBUG_OBJECTS_RCU_HEAD, not testing duplicate call_rcu()\n", KBUILD_MODNAME); |
1714 | #endif /* #else #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */ | 1724 | #endif /* #else #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */ |
1715 | } | 1725 | } |
1716 | 1726 | ||
@@ -1799,7 +1809,7 @@ rcu_torture_init(void) | |||
1799 | if (firsterr) | 1809 | if (firsterr) |
1800 | goto unwind; | 1810 | goto unwind; |
1801 | if (nfakewriters > 0) { | 1811 | if (nfakewriters > 0) { |
1802 | fakewriter_tasks = kzalloc(nfakewriters * | 1812 | fakewriter_tasks = kcalloc(nfakewriters, |
1803 | sizeof(fakewriter_tasks[0]), | 1813 | sizeof(fakewriter_tasks[0]), |
1804 | GFP_KERNEL); | 1814 | GFP_KERNEL); |
1805 | if (fakewriter_tasks == NULL) { | 1815 | if (fakewriter_tasks == NULL) { |
@@ -1814,7 +1824,7 @@ rcu_torture_init(void) | |||
1814 | if (firsterr) | 1824 | if (firsterr) |
1815 | goto unwind; | 1825 | goto unwind; |
1816 | } | 1826 | } |
1817 | reader_tasks = kzalloc(nrealreaders * sizeof(reader_tasks[0]), | 1827 | reader_tasks = kcalloc(nrealreaders, sizeof(reader_tasks[0]), |
1818 | GFP_KERNEL); | 1828 | GFP_KERNEL); |
1819 | if (reader_tasks == NULL) { | 1829 | if (reader_tasks == NULL) { |
1820 | VERBOSE_TOROUT_ERRSTRING("out of memory"); | 1830 | VERBOSE_TOROUT_ERRSTRING("out of memory"); |
diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index d5cea81378cc..fb560fca9ef4 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c | |||
@@ -386,7 +386,7 @@ void cleanup_srcu_struct(struct srcu_struct *sp) | |||
386 | flush_delayed_work(&per_cpu_ptr(sp->sda, cpu)->work); | 386 | flush_delayed_work(&per_cpu_ptr(sp->sda, cpu)->work); |
387 | if (WARN_ON(rcu_seq_state(READ_ONCE(sp->srcu_gp_seq)) != SRCU_STATE_IDLE) || | 387 | if (WARN_ON(rcu_seq_state(READ_ONCE(sp->srcu_gp_seq)) != SRCU_STATE_IDLE) || |
388 | WARN_ON(srcu_readers_active(sp))) { | 388 | WARN_ON(srcu_readers_active(sp))) { |
389 | pr_info("cleanup_srcu_struct: Active srcu_struct %p state: %d\n", sp, rcu_seq_state(READ_ONCE(sp->srcu_gp_seq))); | 389 | pr_info("%s: Active srcu_struct %p state: %d\n", __func__, sp, rcu_seq_state(READ_ONCE(sp->srcu_gp_seq))); |
390 | return; /* Caller forgot to stop doing call_srcu()? */ | 390 | return; /* Caller forgot to stop doing call_srcu()? */ |
391 | } | 391 | } |
392 | free_percpu(sp->sda); | 392 | free_percpu(sp->sda); |
@@ -439,7 +439,7 @@ static void srcu_gp_start(struct srcu_struct *sp) | |||
439 | struct srcu_data *sdp = this_cpu_ptr(sp->sda); | 439 | struct srcu_data *sdp = this_cpu_ptr(sp->sda); |
440 | int state; | 440 | int state; |
441 | 441 | ||
442 | lockdep_assert_held(&sp->lock); | 442 | lockdep_assert_held(&ACCESS_PRIVATE(sp, lock)); |
443 | WARN_ON_ONCE(ULONG_CMP_GE(sp->srcu_gp_seq, sp->srcu_gp_seq_needed)); | 443 | WARN_ON_ONCE(ULONG_CMP_GE(sp->srcu_gp_seq, sp->srcu_gp_seq_needed)); |
444 | rcu_segcblist_advance(&sdp->srcu_cblist, | 444 | rcu_segcblist_advance(&sdp->srcu_cblist, |
445 | rcu_seq_current(&sp->srcu_gp_seq)); | 445 | rcu_seq_current(&sp->srcu_gp_seq)); |
@@ -492,8 +492,7 @@ static bool srcu_queue_delayed_work_on(int cpu, struct workqueue_struct *wq, | |||
492 | */ | 492 | */ |
493 | static void srcu_schedule_cbs_sdp(struct srcu_data *sdp, unsigned long delay) | 493 | static void srcu_schedule_cbs_sdp(struct srcu_data *sdp, unsigned long delay) |
494 | { | 494 | { |
495 | srcu_queue_delayed_work_on(sdp->cpu, system_power_efficient_wq, | 495 | srcu_queue_delayed_work_on(sdp->cpu, rcu_gp_wq, &sdp->work, delay); |
496 | &sdp->work, delay); | ||
497 | } | 496 | } |
498 | 497 | ||
499 | /* | 498 | /* |
@@ -527,11 +526,11 @@ static void srcu_gp_end(struct srcu_struct *sp) | |||
527 | { | 526 | { |
528 | unsigned long cbdelay; | 527 | unsigned long cbdelay; |
529 | bool cbs; | 528 | bool cbs; |
529 | bool last_lvl; | ||
530 | int cpu; | 530 | int cpu; |
531 | unsigned long flags; | 531 | unsigned long flags; |
532 | unsigned long gpseq; | 532 | unsigned long gpseq; |
533 | int idx; | 533 | int idx; |
534 | int idxnext; | ||
535 | unsigned long mask; | 534 | unsigned long mask; |
536 | struct srcu_data *sdp; | 535 | struct srcu_data *sdp; |
537 | struct srcu_node *snp; | 536 | struct srcu_node *snp; |
@@ -555,11 +554,11 @@ static void srcu_gp_end(struct srcu_struct *sp) | |||
555 | 554 | ||
556 | /* Initiate callback invocation as needed. */ | 555 | /* Initiate callback invocation as needed. */ |
557 | idx = rcu_seq_ctr(gpseq) % ARRAY_SIZE(snp->srcu_have_cbs); | 556 | idx = rcu_seq_ctr(gpseq) % ARRAY_SIZE(snp->srcu_have_cbs); |
558 | idxnext = (idx + 1) % ARRAY_SIZE(snp->srcu_have_cbs); | ||
559 | rcu_for_each_node_breadth_first(sp, snp) { | 557 | rcu_for_each_node_breadth_first(sp, snp) { |
560 | spin_lock_irq_rcu_node(snp); | 558 | spin_lock_irq_rcu_node(snp); |
561 | cbs = false; | 559 | cbs = false; |
562 | if (snp >= sp->level[rcu_num_lvls - 1]) | 560 | last_lvl = snp >= sp->level[rcu_num_lvls - 1]; |
561 | if (last_lvl) | ||
563 | cbs = snp->srcu_have_cbs[idx] == gpseq; | 562 | cbs = snp->srcu_have_cbs[idx] == gpseq; |
564 | snp->srcu_have_cbs[idx] = gpseq; | 563 | snp->srcu_have_cbs[idx] = gpseq; |
565 | rcu_seq_set_state(&snp->srcu_have_cbs[idx], 1); | 564 | rcu_seq_set_state(&snp->srcu_have_cbs[idx], 1); |
@@ -572,13 +571,16 @@ static void srcu_gp_end(struct srcu_struct *sp) | |||
572 | srcu_schedule_cbs_snp(sp, snp, mask, cbdelay); | 571 | srcu_schedule_cbs_snp(sp, snp, mask, cbdelay); |
573 | 572 | ||
574 | /* Occasionally prevent srcu_data counter wrap. */ | 573 | /* Occasionally prevent srcu_data counter wrap. */ |
575 | if (!(gpseq & counter_wrap_check)) | 574 | if (!(gpseq & counter_wrap_check) && last_lvl) |
576 | for (cpu = snp->grplo; cpu <= snp->grphi; cpu++) { | 575 | for (cpu = snp->grplo; cpu <= snp->grphi; cpu++) { |
577 | sdp = per_cpu_ptr(sp->sda, cpu); | 576 | sdp = per_cpu_ptr(sp->sda, cpu); |
578 | spin_lock_irqsave_rcu_node(sdp, flags); | 577 | spin_lock_irqsave_rcu_node(sdp, flags); |
579 | if (ULONG_CMP_GE(gpseq, | 578 | if (ULONG_CMP_GE(gpseq, |
580 | sdp->srcu_gp_seq_needed + 100)) | 579 | sdp->srcu_gp_seq_needed + 100)) |
581 | sdp->srcu_gp_seq_needed = gpseq; | 580 | sdp->srcu_gp_seq_needed = gpseq; |
581 | if (ULONG_CMP_GE(gpseq, | ||
582 | sdp->srcu_gp_seq_needed_exp + 100)) | ||
583 | sdp->srcu_gp_seq_needed_exp = gpseq; | ||
582 | spin_unlock_irqrestore_rcu_node(sdp, flags); | 584 | spin_unlock_irqrestore_rcu_node(sdp, flags); |
583 | } | 585 | } |
584 | } | 586 | } |
@@ -593,9 +595,7 @@ static void srcu_gp_end(struct srcu_struct *sp) | |||
593 | ULONG_CMP_LT(gpseq, sp->srcu_gp_seq_needed)) { | 595 | ULONG_CMP_LT(gpseq, sp->srcu_gp_seq_needed)) { |
594 | srcu_gp_start(sp); | 596 | srcu_gp_start(sp); |
595 | spin_unlock_irq_rcu_node(sp); | 597 | spin_unlock_irq_rcu_node(sp); |
596 | /* Throttle expedited grace periods: Should be rare! */ | 598 | srcu_reschedule(sp, 0); |
597 | srcu_reschedule(sp, rcu_seq_ctr(gpseq) & 0x3ff | ||
598 | ? 0 : SRCU_INTERVAL); | ||
599 | } else { | 599 | } else { |
600 | spin_unlock_irq_rcu_node(sp); | 600 | spin_unlock_irq_rcu_node(sp); |
601 | } | 601 | } |
@@ -626,7 +626,7 @@ static void srcu_funnel_exp_start(struct srcu_struct *sp, struct srcu_node *snp, | |||
626 | spin_unlock_irqrestore_rcu_node(snp, flags); | 626 | spin_unlock_irqrestore_rcu_node(snp, flags); |
627 | } | 627 | } |
628 | spin_lock_irqsave_rcu_node(sp, flags); | 628 | spin_lock_irqsave_rcu_node(sp, flags); |
629 | if (!ULONG_CMP_LT(sp->srcu_gp_seq_needed_exp, s)) | 629 | if (ULONG_CMP_LT(sp->srcu_gp_seq_needed_exp, s)) |
630 | sp->srcu_gp_seq_needed_exp = s; | 630 | sp->srcu_gp_seq_needed_exp = s; |
631 | spin_unlock_irqrestore_rcu_node(sp, flags); | 631 | spin_unlock_irqrestore_rcu_node(sp, flags); |
632 | } | 632 | } |
@@ -691,8 +691,7 @@ static void srcu_funnel_gp_start(struct srcu_struct *sp, struct srcu_data *sdp, | |||
691 | rcu_seq_state(sp->srcu_gp_seq) == SRCU_STATE_IDLE) { | 691 | rcu_seq_state(sp->srcu_gp_seq) == SRCU_STATE_IDLE) { |
692 | WARN_ON_ONCE(ULONG_CMP_GE(sp->srcu_gp_seq, sp->srcu_gp_seq_needed)); | 692 | WARN_ON_ONCE(ULONG_CMP_GE(sp->srcu_gp_seq, sp->srcu_gp_seq_needed)); |
693 | srcu_gp_start(sp); | 693 | srcu_gp_start(sp); |
694 | queue_delayed_work(system_power_efficient_wq, &sp->work, | 694 | queue_delayed_work(rcu_gp_wq, &sp->work, srcu_get_delay(sp)); |
695 | srcu_get_delay(sp)); | ||
696 | } | 695 | } |
697 | spin_unlock_irqrestore_rcu_node(sp, flags); | 696 | spin_unlock_irqrestore_rcu_node(sp, flags); |
698 | } | 697 | } |
@@ -1225,7 +1224,7 @@ static void srcu_reschedule(struct srcu_struct *sp, unsigned long delay) | |||
1225 | spin_unlock_irq_rcu_node(sp); | 1224 | spin_unlock_irq_rcu_node(sp); |
1226 | 1225 | ||
1227 | if (pushgp) | 1226 | if (pushgp) |
1228 | queue_delayed_work(system_power_efficient_wq, &sp->work, delay); | 1227 | queue_delayed_work(rcu_gp_wq, &sp->work, delay); |
1229 | } | 1228 | } |
1230 | 1229 | ||
1231 | /* | 1230 | /* |
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 491bdf39f276..2a734692a581 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c | |||
@@ -1161,7 +1161,7 @@ static int rcu_is_cpu_rrupt_from_idle(void) | |||
1161 | */ | 1161 | */ |
1162 | static void rcu_gpnum_ovf(struct rcu_node *rnp, struct rcu_data *rdp) | 1162 | static void rcu_gpnum_ovf(struct rcu_node *rnp, struct rcu_data *rdp) |
1163 | { | 1163 | { |
1164 | lockdep_assert_held(&rnp->lock); | 1164 | raw_lockdep_assert_held_rcu_node(rnp); |
1165 | if (ULONG_CMP_LT(READ_ONCE(rdp->gpnum) + ULONG_MAX / 4, rnp->gpnum)) | 1165 | if (ULONG_CMP_LT(READ_ONCE(rdp->gpnum) + ULONG_MAX / 4, rnp->gpnum)) |
1166 | WRITE_ONCE(rdp->gpwrap, true); | 1166 | WRITE_ONCE(rdp->gpwrap, true); |
1167 | if (ULONG_CMP_LT(rdp->rcu_iw_gpnum + ULONG_MAX / 4, rnp->gpnum)) | 1167 | if (ULONG_CMP_LT(rdp->rcu_iw_gpnum + ULONG_MAX / 4, rnp->gpnum)) |
@@ -1350,6 +1350,7 @@ static void rcu_check_gp_kthread_starvation(struct rcu_state *rsp) | |||
1350 | rsp->gp_kthread ? rsp->gp_kthread->state : ~0, | 1350 | rsp->gp_kthread ? rsp->gp_kthread->state : ~0, |
1351 | rsp->gp_kthread ? task_cpu(rsp->gp_kthread) : -1); | 1351 | rsp->gp_kthread ? task_cpu(rsp->gp_kthread) : -1); |
1352 | if (rsp->gp_kthread) { | 1352 | if (rsp->gp_kthread) { |
1353 | pr_err("RCU grace-period kthread stack dump:\n"); | ||
1353 | sched_show_task(rsp->gp_kthread); | 1354 | sched_show_task(rsp->gp_kthread); |
1354 | wake_up_process(rsp->gp_kthread); | 1355 | wake_up_process(rsp->gp_kthread); |
1355 | } | 1356 | } |
@@ -1628,7 +1629,7 @@ void rcu_cpu_stall_reset(void) | |||
1628 | static unsigned long rcu_cbs_completed(struct rcu_state *rsp, | 1629 | static unsigned long rcu_cbs_completed(struct rcu_state *rsp, |
1629 | struct rcu_node *rnp) | 1630 | struct rcu_node *rnp) |
1630 | { | 1631 | { |
1631 | lockdep_assert_held(&rnp->lock); | 1632 | raw_lockdep_assert_held_rcu_node(rnp); |
1632 | 1633 | ||
1633 | /* | 1634 | /* |
1634 | * If RCU is idle, we just wait for the next grace period. | 1635 | * If RCU is idle, we just wait for the next grace period. |
@@ -1675,7 +1676,7 @@ rcu_start_future_gp(struct rcu_node *rnp, struct rcu_data *rdp, | |||
1675 | bool ret = false; | 1676 | bool ret = false; |
1676 | struct rcu_node *rnp_root = rcu_get_root(rdp->rsp); | 1677 | struct rcu_node *rnp_root = rcu_get_root(rdp->rsp); |
1677 | 1678 | ||
1678 | lockdep_assert_held(&rnp->lock); | 1679 | raw_lockdep_assert_held_rcu_node(rnp); |
1679 | 1680 | ||
1680 | /* | 1681 | /* |
1681 | * Pick up grace-period number for new callbacks. If this | 1682 | * Pick up grace-period number for new callbacks. If this |
@@ -1803,7 +1804,7 @@ static bool rcu_accelerate_cbs(struct rcu_state *rsp, struct rcu_node *rnp, | |||
1803 | { | 1804 | { |
1804 | bool ret = false; | 1805 | bool ret = false; |
1805 | 1806 | ||
1806 | lockdep_assert_held(&rnp->lock); | 1807 | raw_lockdep_assert_held_rcu_node(rnp); |
1807 | 1808 | ||
1808 | /* If no pending (not yet ready to invoke) callbacks, nothing to do. */ | 1809 | /* If no pending (not yet ready to invoke) callbacks, nothing to do. */ |
1809 | if (!rcu_segcblist_pend_cbs(&rdp->cblist)) | 1810 | if (!rcu_segcblist_pend_cbs(&rdp->cblist)) |
@@ -1843,7 +1844,7 @@ static bool rcu_accelerate_cbs(struct rcu_state *rsp, struct rcu_node *rnp, | |||
1843 | static bool rcu_advance_cbs(struct rcu_state *rsp, struct rcu_node *rnp, | 1844 | static bool rcu_advance_cbs(struct rcu_state *rsp, struct rcu_node *rnp, |
1844 | struct rcu_data *rdp) | 1845 | struct rcu_data *rdp) |
1845 | { | 1846 | { |
1846 | lockdep_assert_held(&rnp->lock); | 1847 | raw_lockdep_assert_held_rcu_node(rnp); |
1847 | 1848 | ||
1848 | /* If no pending (not yet ready to invoke) callbacks, nothing to do. */ | 1849 | /* If no pending (not yet ready to invoke) callbacks, nothing to do. */ |
1849 | if (!rcu_segcblist_pend_cbs(&rdp->cblist)) | 1850 | if (!rcu_segcblist_pend_cbs(&rdp->cblist)) |
@@ -1871,7 +1872,7 @@ static bool __note_gp_changes(struct rcu_state *rsp, struct rcu_node *rnp, | |||
1871 | bool ret; | 1872 | bool ret; |
1872 | bool need_gp; | 1873 | bool need_gp; |
1873 | 1874 | ||
1874 | lockdep_assert_held(&rnp->lock); | 1875 | raw_lockdep_assert_held_rcu_node(rnp); |
1875 | 1876 | ||
1876 | /* Handle the ends of any preceding grace periods first. */ | 1877 | /* Handle the ends of any preceding grace periods first. */ |
1877 | if (rdp->completed == rnp->completed && | 1878 | if (rdp->completed == rnp->completed && |
@@ -2296,7 +2297,7 @@ static bool | |||
2296 | rcu_start_gp_advanced(struct rcu_state *rsp, struct rcu_node *rnp, | 2297 | rcu_start_gp_advanced(struct rcu_state *rsp, struct rcu_node *rnp, |
2297 | struct rcu_data *rdp) | 2298 | struct rcu_data *rdp) |
2298 | { | 2299 | { |
2299 | lockdep_assert_held(&rnp->lock); | 2300 | raw_lockdep_assert_held_rcu_node(rnp); |
2300 | if (!rsp->gp_kthread || !cpu_needs_another_gp(rsp, rdp)) { | 2301 | if (!rsp->gp_kthread || !cpu_needs_another_gp(rsp, rdp)) { |
2301 | /* | 2302 | /* |
2302 | * Either we have not yet spawned the grace-period | 2303 | * Either we have not yet spawned the grace-period |
@@ -2358,7 +2359,7 @@ static bool rcu_start_gp(struct rcu_state *rsp) | |||
2358 | static void rcu_report_qs_rsp(struct rcu_state *rsp, unsigned long flags) | 2359 | static void rcu_report_qs_rsp(struct rcu_state *rsp, unsigned long flags) |
2359 | __releases(rcu_get_root(rsp)->lock) | 2360 | __releases(rcu_get_root(rsp)->lock) |
2360 | { | 2361 | { |
2361 | lockdep_assert_held(&rcu_get_root(rsp)->lock); | 2362 | raw_lockdep_assert_held_rcu_node(rcu_get_root(rsp)); |
2362 | WARN_ON_ONCE(!rcu_gp_in_progress(rsp)); | 2363 | WARN_ON_ONCE(!rcu_gp_in_progress(rsp)); |
2363 | WRITE_ONCE(rsp->gp_flags, READ_ONCE(rsp->gp_flags) | RCU_GP_FLAG_FQS); | 2364 | WRITE_ONCE(rsp->gp_flags, READ_ONCE(rsp->gp_flags) | RCU_GP_FLAG_FQS); |
2364 | raw_spin_unlock_irqrestore_rcu_node(rcu_get_root(rsp), flags); | 2365 | raw_spin_unlock_irqrestore_rcu_node(rcu_get_root(rsp), flags); |
@@ -2383,7 +2384,7 @@ rcu_report_qs_rnp(unsigned long mask, struct rcu_state *rsp, | |||
2383 | unsigned long oldmask = 0; | 2384 | unsigned long oldmask = 0; |
2384 | struct rcu_node *rnp_c; | 2385 | struct rcu_node *rnp_c; |
2385 | 2386 | ||
2386 | lockdep_assert_held(&rnp->lock); | 2387 | raw_lockdep_assert_held_rcu_node(rnp); |
2387 | 2388 | ||
2388 | /* Walk up the rcu_node hierarchy. */ | 2389 | /* Walk up the rcu_node hierarchy. */ |
2389 | for (;;) { | 2390 | for (;;) { |
@@ -2447,7 +2448,7 @@ static void rcu_report_unblock_qs_rnp(struct rcu_state *rsp, | |||
2447 | unsigned long mask; | 2448 | unsigned long mask; |
2448 | struct rcu_node *rnp_p; | 2449 | struct rcu_node *rnp_p; |
2449 | 2450 | ||
2450 | lockdep_assert_held(&rnp->lock); | 2451 | raw_lockdep_assert_held_rcu_node(rnp); |
2451 | if (rcu_state_p == &rcu_sched_state || rsp != rcu_state_p || | 2452 | if (rcu_state_p == &rcu_sched_state || rsp != rcu_state_p || |
2452 | rnp->qsmask != 0 || rcu_preempt_blocked_readers_cgp(rnp)) { | 2453 | rnp->qsmask != 0 || rcu_preempt_blocked_readers_cgp(rnp)) { |
2453 | raw_spin_unlock_irqrestore_rcu_node(rnp, flags); | 2454 | raw_spin_unlock_irqrestore_rcu_node(rnp, flags); |
@@ -2592,7 +2593,7 @@ static void rcu_cleanup_dead_rnp(struct rcu_node *rnp_leaf) | |||
2592 | long mask; | 2593 | long mask; |
2593 | struct rcu_node *rnp = rnp_leaf; | 2594 | struct rcu_node *rnp = rnp_leaf; |
2594 | 2595 | ||
2595 | lockdep_assert_held(&rnp->lock); | 2596 | raw_lockdep_assert_held_rcu_node(rnp); |
2596 | if (!IS_ENABLED(CONFIG_HOTPLUG_CPU) || | 2597 | if (!IS_ENABLED(CONFIG_HOTPLUG_CPU) || |
2597 | rnp->qsmaskinit || rcu_preempt_has_tasks(rnp)) | 2598 | rnp->qsmaskinit || rcu_preempt_has_tasks(rnp)) |
2598 | return; | 2599 | return; |
@@ -2691,7 +2692,6 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp) | |||
2691 | /* Update counts and requeue any remaining callbacks. */ | 2692 | /* Update counts and requeue any remaining callbacks. */ |
2692 | rcu_segcblist_insert_done_cbs(&rdp->cblist, &rcl); | 2693 | rcu_segcblist_insert_done_cbs(&rdp->cblist, &rcl); |
2693 | smp_mb(); /* List handling before counting for rcu_barrier(). */ | 2694 | smp_mb(); /* List handling before counting for rcu_barrier(). */ |
2694 | rdp->n_cbs_invoked += count; | ||
2695 | rcu_segcblist_insert_count(&rdp->cblist, &rcl); | 2695 | rcu_segcblist_insert_count(&rdp->cblist, &rcl); |
2696 | 2696 | ||
2697 | /* Reinstate batch limit if we have worked down the excess. */ | 2697 | /* Reinstate batch limit if we have worked down the excess. */ |
@@ -2845,10 +2845,8 @@ static void force_quiescent_state(struct rcu_state *rsp) | |||
2845 | !raw_spin_trylock(&rnp->fqslock); | 2845 | !raw_spin_trylock(&rnp->fqslock); |
2846 | if (rnp_old != NULL) | 2846 | if (rnp_old != NULL) |
2847 | raw_spin_unlock(&rnp_old->fqslock); | 2847 | raw_spin_unlock(&rnp_old->fqslock); |
2848 | if (ret) { | 2848 | if (ret) |
2849 | rsp->n_force_qs_lh++; | ||
2850 | return; | 2849 | return; |
2851 | } | ||
2852 | rnp_old = rnp; | 2850 | rnp_old = rnp; |
2853 | } | 2851 | } |
2854 | /* rnp_old == rcu_get_root(rsp), rnp == NULL. */ | 2852 | /* rnp_old == rcu_get_root(rsp), rnp == NULL. */ |
@@ -2857,7 +2855,6 @@ static void force_quiescent_state(struct rcu_state *rsp) | |||
2857 | raw_spin_lock_irqsave_rcu_node(rnp_old, flags); | 2855 | raw_spin_lock_irqsave_rcu_node(rnp_old, flags); |
2858 | raw_spin_unlock(&rnp_old->fqslock); | 2856 | raw_spin_unlock(&rnp_old->fqslock); |
2859 | if (READ_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) { | 2857 | if (READ_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) { |
2860 | rsp->n_force_qs_lh++; | ||
2861 | raw_spin_unlock_irqrestore_rcu_node(rnp_old, flags); | 2858 | raw_spin_unlock_irqrestore_rcu_node(rnp_old, flags); |
2862 | return; /* Someone beat us to it. */ | 2859 | return; /* Someone beat us to it. */ |
2863 | } | 2860 | } |
@@ -3355,8 +3352,6 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp) | |||
3355 | { | 3352 | { |
3356 | struct rcu_node *rnp = rdp->mynode; | 3353 | struct rcu_node *rnp = rdp->mynode; |
3357 | 3354 | ||
3358 | rdp->n_rcu_pending++; | ||
3359 | |||
3360 | /* Check for CPU stalls, if enabled. */ | 3355 | /* Check for CPU stalls, if enabled. */ |
3361 | check_cpu_stall(rsp, rdp); | 3356 | check_cpu_stall(rsp, rdp); |
3362 | 3357 | ||
@@ -3365,48 +3360,31 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp) | |||
3365 | return 0; | 3360 | return 0; |
3366 | 3361 | ||
3367 | /* Is the RCU core waiting for a quiescent state from this CPU? */ | 3362 | /* Is the RCU core waiting for a quiescent state from this CPU? */ |
3368 | if (rcu_scheduler_fully_active && | 3363 | if (rdp->core_needs_qs && !rdp->cpu_no_qs.b.norm) |
3369 | rdp->core_needs_qs && rdp->cpu_no_qs.b.norm && | ||
3370 | rdp->rcu_qs_ctr_snap == __this_cpu_read(rcu_dynticks.rcu_qs_ctr)) { | ||
3371 | rdp->n_rp_core_needs_qs++; | ||
3372 | } else if (rdp->core_needs_qs && !rdp->cpu_no_qs.b.norm) { | ||
3373 | rdp->n_rp_report_qs++; | ||
3374 | return 1; | 3364 | return 1; |
3375 | } | ||
3376 | 3365 | ||
3377 | /* Does this CPU have callbacks ready to invoke? */ | 3366 | /* Does this CPU have callbacks ready to invoke? */ |
3378 | if (rcu_segcblist_ready_cbs(&rdp->cblist)) { | 3367 | if (rcu_segcblist_ready_cbs(&rdp->cblist)) |
3379 | rdp->n_rp_cb_ready++; | ||
3380 | return 1; | 3368 | return 1; |
3381 | } | ||
3382 | 3369 | ||
3383 | /* Has RCU gone idle with this CPU needing another grace period? */ | 3370 | /* Has RCU gone idle with this CPU needing another grace period? */ |
3384 | if (cpu_needs_another_gp(rsp, rdp)) { | 3371 | if (cpu_needs_another_gp(rsp, rdp)) |
3385 | rdp->n_rp_cpu_needs_gp++; | ||
3386 | return 1; | 3372 | return 1; |
3387 | } | ||
3388 | 3373 | ||
3389 | /* Has another RCU grace period completed? */ | 3374 | /* Has another RCU grace period completed? */ |
3390 | if (READ_ONCE(rnp->completed) != rdp->completed) { /* outside lock */ | 3375 | if (READ_ONCE(rnp->completed) != rdp->completed) /* outside lock */ |
3391 | rdp->n_rp_gp_completed++; | ||
3392 | return 1; | 3376 | return 1; |
3393 | } | ||
3394 | 3377 | ||
3395 | /* Has a new RCU grace period started? */ | 3378 | /* Has a new RCU grace period started? */ |
3396 | if (READ_ONCE(rnp->gpnum) != rdp->gpnum || | 3379 | if (READ_ONCE(rnp->gpnum) != rdp->gpnum || |
3397 | unlikely(READ_ONCE(rdp->gpwrap))) { /* outside lock */ | 3380 | unlikely(READ_ONCE(rdp->gpwrap))) /* outside lock */ |
3398 | rdp->n_rp_gp_started++; | ||
3399 | return 1; | 3381 | return 1; |
3400 | } | ||
3401 | 3382 | ||
3402 | /* Does this CPU need a deferred NOCB wakeup? */ | 3383 | /* Does this CPU need a deferred NOCB wakeup? */ |
3403 | if (rcu_nocb_need_deferred_wakeup(rdp)) { | 3384 | if (rcu_nocb_need_deferred_wakeup(rdp)) |
3404 | rdp->n_rp_nocb_defer_wakeup++; | ||
3405 | return 1; | 3385 | return 1; |
3406 | } | ||
3407 | 3386 | ||
3408 | /* nothing to do */ | 3387 | /* nothing to do */ |
3409 | rdp->n_rp_need_nothing++; | ||
3410 | return 0; | 3388 | return 0; |
3411 | } | 3389 | } |
3412 | 3390 | ||
@@ -3618,7 +3596,7 @@ static void rcu_init_new_rnp(struct rcu_node *rnp_leaf) | |||
3618 | long mask; | 3596 | long mask; |
3619 | struct rcu_node *rnp = rnp_leaf; | 3597 | struct rcu_node *rnp = rnp_leaf; |
3620 | 3598 | ||
3621 | lockdep_assert_held(&rnp->lock); | 3599 | raw_lockdep_assert_held_rcu_node(rnp); |
3622 | for (;;) { | 3600 | for (;;) { |
3623 | mask = rnp->grpmask; | 3601 | mask = rnp->grpmask; |
3624 | rnp = rnp->parent; | 3602 | rnp = rnp->parent; |
@@ -3636,12 +3614,9 @@ static void rcu_init_new_rnp(struct rcu_node *rnp_leaf) | |||
3636 | static void __init | 3614 | static void __init |
3637 | rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp) | 3615 | rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp) |
3638 | { | 3616 | { |
3639 | unsigned long flags; | ||
3640 | struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); | 3617 | struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); |
3641 | struct rcu_node *rnp = rcu_get_root(rsp); | ||
3642 | 3618 | ||
3643 | /* Set up local state, ensuring consistent view of global state. */ | 3619 | /* Set up local state, ensuring consistent view of global state. */ |
3644 | raw_spin_lock_irqsave_rcu_node(rnp, flags); | ||
3645 | rdp->grpmask = leaf_node_cpu_bit(rdp->mynode, cpu); | 3620 | rdp->grpmask = leaf_node_cpu_bit(rdp->mynode, cpu); |
3646 | rdp->dynticks = &per_cpu(rcu_dynticks, cpu); | 3621 | rdp->dynticks = &per_cpu(rcu_dynticks, cpu); |
3647 | WARN_ON_ONCE(rdp->dynticks->dynticks_nesting != 1); | 3622 | WARN_ON_ONCE(rdp->dynticks->dynticks_nesting != 1); |
@@ -3649,7 +3624,6 @@ rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp) | |||
3649 | rdp->cpu = cpu; | 3624 | rdp->cpu = cpu; |
3650 | rdp->rsp = rsp; | 3625 | rdp->rsp = rsp; |
3651 | rcu_boot_init_nocb_percpu_data(rdp); | 3626 | rcu_boot_init_nocb_percpu_data(rdp); |
3652 | raw_spin_unlock_irqrestore_rcu_node(rnp, flags); | ||
3653 | } | 3627 | } |
3654 | 3628 | ||
3655 | /* | 3629 | /* |
@@ -4193,6 +4167,8 @@ static void __init rcu_dump_rcu_node_tree(struct rcu_state *rsp) | |||
4193 | pr_cont("\n"); | 4167 | pr_cont("\n"); |
4194 | } | 4168 | } |
4195 | 4169 | ||
4170 | struct workqueue_struct *rcu_gp_wq; | ||
4171 | |||
4196 | void __init rcu_init(void) | 4172 | void __init rcu_init(void) |
4197 | { | 4173 | { |
4198 | int cpu; | 4174 | int cpu; |
@@ -4219,6 +4195,10 @@ void __init rcu_init(void) | |||
4219 | rcu_cpu_starting(cpu); | 4195 | rcu_cpu_starting(cpu); |
4220 | rcutree_online_cpu(cpu); | 4196 | rcutree_online_cpu(cpu); |
4221 | } | 4197 | } |
4198 | |||
4199 | /* Create workqueue for expedited GPs and for Tree SRCU. */ | ||
4200 | rcu_gp_wq = alloc_workqueue("rcu_gp", WQ_MEM_RECLAIM, 0); | ||
4201 | WARN_ON(!rcu_gp_wq); | ||
4222 | } | 4202 | } |
4223 | 4203 | ||
4224 | #include "tree_exp.h" | 4204 | #include "tree_exp.h" |
diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 6488a3b0e729..f491ab4f2e8e 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h | |||
@@ -146,12 +146,6 @@ struct rcu_node { | |||
146 | /* boosting for this rcu_node structure. */ | 146 | /* boosting for this rcu_node structure. */ |
147 | unsigned int boost_kthread_status; | 147 | unsigned int boost_kthread_status; |
148 | /* State of boost_kthread_task for tracing. */ | 148 | /* State of boost_kthread_task for tracing. */ |
149 | unsigned long n_tasks_boosted; | ||
150 | /* Total number of tasks boosted. */ | ||
151 | unsigned long n_exp_boosts; | ||
152 | /* Number of tasks boosted for expedited GP. */ | ||
153 | unsigned long n_normal_boosts; | ||
154 | /* Number of tasks boosted for normal GP. */ | ||
155 | #ifdef CONFIG_RCU_NOCB_CPU | 149 | #ifdef CONFIG_RCU_NOCB_CPU |
156 | struct swait_queue_head nocb_gp_wq[2]; | 150 | struct swait_queue_head nocb_gp_wq[2]; |
157 | /* Place for rcu_nocb_kthread() to wait GP. */ | 151 | /* Place for rcu_nocb_kthread() to wait GP. */ |
@@ -184,13 +178,6 @@ union rcu_noqs { | |||
184 | u16 s; /* Set of bits, aggregate OR here. */ | 178 | u16 s; /* Set of bits, aggregate OR here. */ |
185 | }; | 179 | }; |
186 | 180 | ||
187 | /* Index values for nxttail array in struct rcu_data. */ | ||
188 | #define RCU_DONE_TAIL 0 /* Also RCU_WAIT head. */ | ||
189 | #define RCU_WAIT_TAIL 1 /* Also RCU_NEXT_READY head. */ | ||
190 | #define RCU_NEXT_READY_TAIL 2 /* Also RCU_NEXT head. */ | ||
191 | #define RCU_NEXT_TAIL 3 | ||
192 | #define RCU_NEXT_SIZE 4 | ||
193 | |||
194 | /* Per-CPU data for read-copy update. */ | 181 | /* Per-CPU data for read-copy update. */ |
195 | struct rcu_data { | 182 | struct rcu_data { |
196 | /* 1) quiescent-state and grace-period handling : */ | 183 | /* 1) quiescent-state and grace-period handling : */ |
@@ -217,8 +204,6 @@ struct rcu_data { | |||
217 | /* different grace periods. */ | 204 | /* different grace periods. */ |
218 | long qlen_last_fqs_check; | 205 | long qlen_last_fqs_check; |
219 | /* qlen at last check for QS forcing */ | 206 | /* qlen at last check for QS forcing */ |
220 | unsigned long n_cbs_invoked; /* count of RCU cbs invoked. */ | ||
221 | unsigned long n_nocbs_invoked; /* count of no-CBs RCU cbs invoked. */ | ||
222 | unsigned long n_force_qs_snap; | 207 | unsigned long n_force_qs_snap; |
223 | /* did other CPU force QS recently? */ | 208 | /* did other CPU force QS recently? */ |
224 | long blimit; /* Upper limit on a processed batch */ | 209 | long blimit; /* Upper limit on a processed batch */ |
@@ -234,18 +219,7 @@ struct rcu_data { | |||
234 | /* Grace period that needs help */ | 219 | /* Grace period that needs help */ |
235 | /* from cond_resched(). */ | 220 | /* from cond_resched(). */ |
236 | 221 | ||
237 | /* 5) __rcu_pending() statistics. */ | 222 | /* 5) _rcu_barrier(), OOM callbacks, and expediting. */ |
238 | unsigned long n_rcu_pending; /* rcu_pending() calls since boot. */ | ||
239 | unsigned long n_rp_core_needs_qs; | ||
240 | unsigned long n_rp_report_qs; | ||
241 | unsigned long n_rp_cb_ready; | ||
242 | unsigned long n_rp_cpu_needs_gp; | ||
243 | unsigned long n_rp_gp_completed; | ||
244 | unsigned long n_rp_gp_started; | ||
245 | unsigned long n_rp_nocb_defer_wakeup; | ||
246 | unsigned long n_rp_need_nothing; | ||
247 | |||
248 | /* 6) _rcu_barrier(), OOM callbacks, and expediting. */ | ||
249 | struct rcu_head barrier_head; | 223 | struct rcu_head barrier_head; |
250 | #ifdef CONFIG_RCU_FAST_NO_HZ | 224 | #ifdef CONFIG_RCU_FAST_NO_HZ |
251 | struct rcu_head oom_head; | 225 | struct rcu_head oom_head; |
@@ -256,7 +230,7 @@ struct rcu_data { | |||
256 | atomic_long_t exp_workdone3; /* # done by others #3. */ | 230 | atomic_long_t exp_workdone3; /* # done by others #3. */ |
257 | int exp_dynticks_snap; /* Double-check need for IPI. */ | 231 | int exp_dynticks_snap; /* Double-check need for IPI. */ |
258 | 232 | ||
259 | /* 7) Callback offloading. */ | 233 | /* 6) Callback offloading. */ |
260 | #ifdef CONFIG_RCU_NOCB_CPU | 234 | #ifdef CONFIG_RCU_NOCB_CPU |
261 | struct rcu_head *nocb_head; /* CBs waiting for kthread. */ | 235 | struct rcu_head *nocb_head; /* CBs waiting for kthread. */ |
262 | struct rcu_head **nocb_tail; | 236 | struct rcu_head **nocb_tail; |
@@ -283,7 +257,7 @@ struct rcu_data { | |||
283 | /* Leader CPU takes GP-end wakeups. */ | 257 | /* Leader CPU takes GP-end wakeups. */ |
284 | #endif /* #ifdef CONFIG_RCU_NOCB_CPU */ | 258 | #endif /* #ifdef CONFIG_RCU_NOCB_CPU */ |
285 | 259 | ||
286 | /* 8) RCU CPU stall data. */ | 260 | /* 7) RCU CPU stall data. */ |
287 | unsigned int softirq_snap; /* Snapshot of softirq activity. */ | 261 | unsigned int softirq_snap; /* Snapshot of softirq activity. */ |
288 | /* ->rcu_iw* fields protected by leaf rcu_node ->lock. */ | 262 | /* ->rcu_iw* fields protected by leaf rcu_node ->lock. */ |
289 | struct irq_work rcu_iw; /* Check for non-irq activity. */ | 263 | struct irq_work rcu_iw; /* Check for non-irq activity. */ |
@@ -374,10 +348,6 @@ struct rcu_state { | |||
374 | /* kthreads, if configured. */ | 348 | /* kthreads, if configured. */ |
375 | unsigned long n_force_qs; /* Number of calls to */ | 349 | unsigned long n_force_qs; /* Number of calls to */ |
376 | /* force_quiescent_state(). */ | 350 | /* force_quiescent_state(). */ |
377 | unsigned long n_force_qs_lh; /* ~Number of calls leaving */ | ||
378 | /* due to lock unavailable. */ | ||
379 | unsigned long n_force_qs_ngp; /* Number of calls leaving */ | ||
380 | /* due to no GP active. */ | ||
381 | unsigned long gp_start; /* Time at which GP started, */ | 351 | unsigned long gp_start; /* Time at which GP started, */ |
382 | /* but in jiffies. */ | 352 | /* but in jiffies. */ |
383 | unsigned long gp_activity; /* Time of last GP kthread */ | 353 | unsigned long gp_activity; /* Time of last GP kthread */ |
diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h index 46d61b597731..f72eefab8543 100644 --- a/kernel/rcu/tree_exp.h +++ b/kernel/rcu/tree_exp.h | |||
@@ -29,6 +29,15 @@ static void rcu_exp_gp_seq_start(struct rcu_state *rsp) | |||
29 | } | 29 | } |
30 | 30 | ||
31 | /* | 31 | /* |
32 | * Return the value that expedited-grace-period counter will have | ||
33 | * at the end of the current grace period. | ||
34 | */ | ||
35 | static __maybe_unused unsigned long rcu_exp_gp_seq_endval(struct rcu_state *rsp) | ||
36 | { | ||
37 | return rcu_seq_endval(&rsp->expedited_sequence); | ||
38 | } | ||
39 | |||
40 | /* | ||
32 | * Record the end of an expedited grace period. | 41 | * Record the end of an expedited grace period. |
33 | */ | 42 | */ |
34 | static void rcu_exp_gp_seq_end(struct rcu_state *rsp) | 43 | static void rcu_exp_gp_seq_end(struct rcu_state *rsp) |
@@ -366,21 +375,30 @@ static void sync_rcu_exp_select_cpus(struct rcu_state *rsp, | |||
366 | int ret; | 375 | int ret; |
367 | struct rcu_node *rnp; | 376 | struct rcu_node *rnp; |
368 | 377 | ||
378 | trace_rcu_exp_grace_period(rsp->name, rcu_exp_gp_seq_endval(rsp), TPS("reset")); | ||
369 | sync_exp_reset_tree(rsp); | 379 | sync_exp_reset_tree(rsp); |
380 | trace_rcu_exp_grace_period(rsp->name, rcu_exp_gp_seq_endval(rsp), TPS("select")); | ||
370 | rcu_for_each_leaf_node(rsp, rnp) { | 381 | rcu_for_each_leaf_node(rsp, rnp) { |
371 | raw_spin_lock_irqsave_rcu_node(rnp, flags); | 382 | raw_spin_lock_irqsave_rcu_node(rnp, flags); |
372 | 383 | ||
373 | /* Each pass checks a CPU for identity, offline, and idle. */ | 384 | /* Each pass checks a CPU for identity, offline, and idle. */ |
374 | mask_ofl_test = 0; | 385 | mask_ofl_test = 0; |
375 | for_each_leaf_node_possible_cpu(rnp, cpu) { | 386 | for_each_leaf_node_cpu_mask(rnp, cpu, rnp->expmask) { |
387 | unsigned long mask = leaf_node_cpu_bit(rnp, cpu); | ||
376 | struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); | 388 | struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); |
389 | struct rcu_dynticks *rdtp = per_cpu_ptr(&rcu_dynticks, cpu); | ||
390 | int snap; | ||
377 | 391 | ||
378 | rdp->exp_dynticks_snap = | ||
379 | rcu_dynticks_snap(rdp->dynticks); | ||
380 | if (raw_smp_processor_id() == cpu || | 392 | if (raw_smp_processor_id() == cpu || |
381 | rcu_dynticks_in_eqs(rdp->exp_dynticks_snap) || | 393 | !(rnp->qsmaskinitnext & mask)) { |
382 | !(rnp->qsmaskinitnext & rdp->grpmask)) | 394 | mask_ofl_test |= mask; |
383 | mask_ofl_test |= rdp->grpmask; | 395 | } else { |
396 | snap = rcu_dynticks_snap(rdtp); | ||
397 | if (rcu_dynticks_in_eqs(snap)) | ||
398 | mask_ofl_test |= mask; | ||
399 | else | ||
400 | rdp->exp_dynticks_snap = snap; | ||
401 | } | ||
384 | } | 402 | } |
385 | mask_ofl_ipi = rnp->expmask & ~mask_ofl_test; | 403 | mask_ofl_ipi = rnp->expmask & ~mask_ofl_test; |
386 | 404 | ||
@@ -394,7 +412,7 @@ static void sync_rcu_exp_select_cpus(struct rcu_state *rsp, | |||
394 | raw_spin_unlock_irqrestore_rcu_node(rnp, flags); | 412 | raw_spin_unlock_irqrestore_rcu_node(rnp, flags); |
395 | 413 | ||
396 | /* IPI the remaining CPUs for expedited quiescent state. */ | 414 | /* IPI the remaining CPUs for expedited quiescent state. */ |
397 | for_each_leaf_node_possible_cpu(rnp, cpu) { | 415 | for_each_leaf_node_cpu_mask(rnp, cpu, rnp->expmask) { |
398 | unsigned long mask = leaf_node_cpu_bit(rnp, cpu); | 416 | unsigned long mask = leaf_node_cpu_bit(rnp, cpu); |
399 | struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); | 417 | struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); |
400 | 418 | ||
@@ -417,6 +435,7 @@ retry_ipi: | |||
417 | (rnp->expmask & mask)) { | 435 | (rnp->expmask & mask)) { |
418 | /* Online, so delay for a bit and try again. */ | 436 | /* Online, so delay for a bit and try again. */ |
419 | raw_spin_unlock_irqrestore_rcu_node(rnp, flags); | 437 | raw_spin_unlock_irqrestore_rcu_node(rnp, flags); |
438 | trace_rcu_exp_grace_period(rsp->name, rcu_exp_gp_seq_endval(rsp), TPS("selectofl")); | ||
420 | schedule_timeout_uninterruptible(1); | 439 | schedule_timeout_uninterruptible(1); |
421 | goto retry_ipi; | 440 | goto retry_ipi; |
422 | } | 441 | } |
@@ -443,6 +462,7 @@ static void synchronize_sched_expedited_wait(struct rcu_state *rsp) | |||
443 | struct rcu_node *rnp_root = rcu_get_root(rsp); | 462 | struct rcu_node *rnp_root = rcu_get_root(rsp); |
444 | int ret; | 463 | int ret; |
445 | 464 | ||
465 | trace_rcu_exp_grace_period(rsp->name, rcu_exp_gp_seq_endval(rsp), TPS("startwait")); | ||
446 | jiffies_stall = rcu_jiffies_till_stall_check(); | 466 | jiffies_stall = rcu_jiffies_till_stall_check(); |
447 | jiffies_start = jiffies; | 467 | jiffies_start = jiffies; |
448 | 468 | ||
@@ -606,7 +626,7 @@ static void _synchronize_rcu_expedited(struct rcu_state *rsp, | |||
606 | rew.rew_rsp = rsp; | 626 | rew.rew_rsp = rsp; |
607 | rew.rew_s = s; | 627 | rew.rew_s = s; |
608 | INIT_WORK_ONSTACK(&rew.rew_work, wait_rcu_exp_gp); | 628 | INIT_WORK_ONSTACK(&rew.rew_work, wait_rcu_exp_gp); |
609 | schedule_work(&rew.rew_work); | 629 | queue_work(rcu_gp_wq, &rew.rew_work); |
610 | } | 630 | } |
611 | 631 | ||
612 | /* Wait for expedited grace period to complete. */ | 632 | /* Wait for expedited grace period to complete. */ |
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index fb88a028deec..84fbee4686d3 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h | |||
@@ -180,7 +180,7 @@ static void rcu_preempt_ctxt_queue(struct rcu_node *rnp, struct rcu_data *rdp) | |||
180 | (rnp->expmask & rdp->grpmask ? RCU_EXP_BLKD : 0); | 180 | (rnp->expmask & rdp->grpmask ? RCU_EXP_BLKD : 0); |
181 | struct task_struct *t = current; | 181 | struct task_struct *t = current; |
182 | 182 | ||
183 | lockdep_assert_held(&rnp->lock); | 183 | raw_lockdep_assert_held_rcu_node(rnp); |
184 | WARN_ON_ONCE(rdp->mynode != rnp); | 184 | WARN_ON_ONCE(rdp->mynode != rnp); |
185 | WARN_ON_ONCE(rnp->level != rcu_num_lvls - 1); | 185 | WARN_ON_ONCE(rnp->level != rcu_num_lvls - 1); |
186 | 186 | ||
@@ -560,8 +560,14 @@ static void rcu_print_detail_task_stall_rnp(struct rcu_node *rnp) | |||
560 | } | 560 | } |
561 | t = list_entry(rnp->gp_tasks->prev, | 561 | t = list_entry(rnp->gp_tasks->prev, |
562 | struct task_struct, rcu_node_entry); | 562 | struct task_struct, rcu_node_entry); |
563 | list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry) | 563 | list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry) { |
564 | /* | ||
565 | * We could be printing a lot while holding a spinlock. | ||
566 | * Avoid triggering hard lockup. | ||
567 | */ | ||
568 | touch_nmi_watchdog(); | ||
564 | sched_show_task(t); | 569 | sched_show_task(t); |
570 | } | ||
565 | raw_spin_unlock_irqrestore_rcu_node(rnp, flags); | 571 | raw_spin_unlock_irqrestore_rcu_node(rnp, flags); |
566 | } | 572 | } |
567 | 573 | ||
@@ -957,14 +963,10 @@ static int rcu_boost(struct rcu_node *rnp) | |||
957 | * expedited grace period must boost all blocked tasks, including | 963 | * expedited grace period must boost all blocked tasks, including |
958 | * those blocking the pre-existing normal grace period. | 964 | * those blocking the pre-existing normal grace period. |
959 | */ | 965 | */ |
960 | if (rnp->exp_tasks != NULL) { | 966 | if (rnp->exp_tasks != NULL) |
961 | tb = rnp->exp_tasks; | 967 | tb = rnp->exp_tasks; |
962 | rnp->n_exp_boosts++; | 968 | else |
963 | } else { | ||
964 | tb = rnp->boost_tasks; | 969 | tb = rnp->boost_tasks; |
965 | rnp->n_normal_boosts++; | ||
966 | } | ||
967 | rnp->n_tasks_boosted++; | ||
968 | 970 | ||
969 | /* | 971 | /* |
970 | * We boost task t by manufacturing an rt_mutex that appears to | 972 | * We boost task t by manufacturing an rt_mutex that appears to |
@@ -1042,7 +1044,7 @@ static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags) | |||
1042 | { | 1044 | { |
1043 | struct task_struct *t; | 1045 | struct task_struct *t; |
1044 | 1046 | ||
1045 | lockdep_assert_held(&rnp->lock); | 1047 | raw_lockdep_assert_held_rcu_node(rnp); |
1046 | if (!rcu_preempt_blocked_readers_cgp(rnp) && rnp->exp_tasks == NULL) { | 1048 | if (!rcu_preempt_blocked_readers_cgp(rnp) && rnp->exp_tasks == NULL) { |
1047 | raw_spin_unlock_irqrestore_rcu_node(rnp, flags); | 1049 | raw_spin_unlock_irqrestore_rcu_node(rnp, flags); |
1048 | return; | 1050 | return; |
@@ -1677,6 +1679,12 @@ static void print_cpu_stall_info(struct rcu_state *rsp, int cpu) | |||
1677 | char *ticks_title; | 1679 | char *ticks_title; |
1678 | unsigned long ticks_value; | 1680 | unsigned long ticks_value; |
1679 | 1681 | ||
1682 | /* | ||
1683 | * We could be printing a lot while holding a spinlock. Avoid | ||
1684 | * triggering hard lockup. | ||
1685 | */ | ||
1686 | touch_nmi_watchdog(); | ||
1687 | |||
1680 | if (rsp->gpnum == rdp->gpnum) { | 1688 | if (rsp->gpnum == rdp->gpnum) { |
1681 | ticks_title = "ticks this GP"; | 1689 | ticks_title = "ticks this GP"; |
1682 | ticks_value = rdp->ticks_this_gp; | 1690 | ticks_value = rdp->ticks_this_gp; |
@@ -2235,7 +2243,6 @@ static int rcu_nocb_kthread(void *arg) | |||
2235 | smp_mb__before_atomic(); /* _add after CB invocation. */ | 2243 | smp_mb__before_atomic(); /* _add after CB invocation. */ |
2236 | atomic_long_add(-c, &rdp->nocb_q_count); | 2244 | atomic_long_add(-c, &rdp->nocb_q_count); |
2237 | atomic_long_add(-cl, &rdp->nocb_q_count_lazy); | 2245 | atomic_long_add(-cl, &rdp->nocb_q_count_lazy); |
2238 | rdp->n_nocbs_invoked += c; | ||
2239 | } | 2246 | } |
2240 | return 0; | 2247 | return 0; |
2241 | } | 2248 | } |
@@ -2312,8 +2319,11 @@ void __init rcu_init_nohz(void) | |||
2312 | cpumask_and(rcu_nocb_mask, cpu_possible_mask, | 2319 | cpumask_and(rcu_nocb_mask, cpu_possible_mask, |
2313 | rcu_nocb_mask); | 2320 | rcu_nocb_mask); |
2314 | } | 2321 | } |
2315 | pr_info("\tOffload RCU callbacks from CPUs: %*pbl.\n", | 2322 | if (cpumask_empty(rcu_nocb_mask)) |
2316 | cpumask_pr_args(rcu_nocb_mask)); | 2323 | pr_info("\tOffload RCU callbacks from CPUs: (none).\n"); |
2324 | else | ||
2325 | pr_info("\tOffload RCU callbacks from CPUs: %*pbl.\n", | ||
2326 | cpumask_pr_args(rcu_nocb_mask)); | ||
2317 | if (rcu_nocb_poll) | 2327 | if (rcu_nocb_poll) |
2318 | pr_info("\tPoll for callbacks from no-CBs CPUs.\n"); | 2328 | pr_info("\tPoll for callbacks from no-CBs CPUs.\n"); |
2319 | 2329 | ||
diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile index e2f9d4feff40..d9a02b318108 100644 --- a/kernel/sched/Makefile +++ b/kernel/sched/Makefile | |||
@@ -17,8 +17,9 @@ CFLAGS_core.o := $(PROFILING) -fno-omit-frame-pointer | |||
17 | endif | 17 | endif |
18 | 18 | ||
19 | obj-y += core.o loadavg.o clock.o cputime.o | 19 | obj-y += core.o loadavg.o clock.o cputime.o |
20 | obj-y += idle_task.o fair.o rt.o deadline.o | 20 | obj-y += idle.o fair.o rt.o deadline.o |
21 | obj-y += wait.o wait_bit.o swait.o completion.o idle.o | 21 | obj-y += wait.o wait_bit.o swait.o completion.o |
22 | |||
22 | obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o topology.o stop_task.o | 23 | obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o topology.o stop_task.o |
23 | obj-$(CONFIG_SCHED_AUTOGROUP) += autogroup.o | 24 | obj-$(CONFIG_SCHED_AUTOGROUP) += autogroup.o |
24 | obj-$(CONFIG_SCHEDSTATS) += stats.o | 25 | obj-$(CONFIG_SCHEDSTATS) += stats.o |
diff --git a/kernel/sched/autogroup.c b/kernel/sched/autogroup.c index bb4b9fe026a1..6be6c575b6cd 100644 --- a/kernel/sched/autogroup.c +++ b/kernel/sched/autogroup.c | |||
@@ -1,10 +1,7 @@ | |||
1 | // SPDX-License-Identifier: GPL-2.0 | 1 | // SPDX-License-Identifier: GPL-2.0 |
2 | #include <linux/proc_fs.h> | 2 | /* |
3 | #include <linux/seq_file.h> | 3 | * Auto-group scheduling implementation: |
4 | #include <linux/utsname.h> | 4 | */ |
5 | #include <linux/security.h> | ||
6 | #include <linux/export.h> | ||
7 | |||
8 | #include "sched.h" | 5 | #include "sched.h" |
9 | 6 | ||
10 | unsigned int __read_mostly sysctl_sched_autogroup_enabled = 1; | 7 | unsigned int __read_mostly sysctl_sched_autogroup_enabled = 1; |
@@ -168,18 +165,19 @@ autogroup_move_group(struct task_struct *p, struct autogroup *ag) | |||
168 | autogroup_kref_put(prev); | 165 | autogroup_kref_put(prev); |
169 | } | 166 | } |
170 | 167 | ||
171 | /* Allocates GFP_KERNEL, cannot be called under any spinlock */ | 168 | /* Allocates GFP_KERNEL, cannot be called under any spinlock: */ |
172 | void sched_autogroup_create_attach(struct task_struct *p) | 169 | void sched_autogroup_create_attach(struct task_struct *p) |
173 | { | 170 | { |
174 | struct autogroup *ag = autogroup_create(); | 171 | struct autogroup *ag = autogroup_create(); |
175 | 172 | ||
176 | autogroup_move_group(p, ag); | 173 | autogroup_move_group(p, ag); |
177 | /* drop extra reference added by autogroup_create() */ | 174 | |
175 | /* Drop extra reference added by autogroup_create(): */ | ||
178 | autogroup_kref_put(ag); | 176 | autogroup_kref_put(ag); |
179 | } | 177 | } |
180 | EXPORT_SYMBOL(sched_autogroup_create_attach); | 178 | EXPORT_SYMBOL(sched_autogroup_create_attach); |
181 | 179 | ||
182 | /* Cannot be called under siglock. Currently has no users */ | 180 | /* Cannot be called under siglock. Currently has no users: */ |
183 | void sched_autogroup_detach(struct task_struct *p) | 181 | void sched_autogroup_detach(struct task_struct *p) |
184 | { | 182 | { |
185 | autogroup_move_group(p, &autogroup_default); | 183 | autogroup_move_group(p, &autogroup_default); |
@@ -202,7 +200,6 @@ static int __init setup_autogroup(char *str) | |||
202 | 200 | ||
203 | return 1; | 201 | return 1; |
204 | } | 202 | } |
205 | |||
206 | __setup("noautogroup", setup_autogroup); | 203 | __setup("noautogroup", setup_autogroup); |
207 | 204 | ||
208 | #ifdef CONFIG_PROC_FS | 205 | #ifdef CONFIG_PROC_FS |
@@ -224,7 +221,7 @@ int proc_sched_autogroup_set_nice(struct task_struct *p, int nice) | |||
224 | if (nice < 0 && !can_nice(current, nice)) | 221 | if (nice < 0 && !can_nice(current, nice)) |
225 | return -EPERM; | 222 | return -EPERM; |
226 | 223 | ||
227 | /* this is a heavy operation taking global locks.. */ | 224 | /* This is a heavy operation, taking global locks.. */ |
228 | if (!capable(CAP_SYS_ADMIN) && time_before(jiffies, next)) | 225 | if (!capable(CAP_SYS_ADMIN) && time_before(jiffies, next)) |
229 | return -EAGAIN; | 226 | return -EAGAIN; |
230 | 227 | ||
@@ -267,4 +264,4 @@ int autogroup_path(struct task_group *tg, char *buf, int buflen) | |||
267 | 264 | ||
268 | return snprintf(buf, buflen, "%s-%ld", "/autogroup", tg->autogroup->id); | 265 | return snprintf(buf, buflen, "%s-%ld", "/autogroup", tg->autogroup->id); |
269 | } | 266 | } |
270 | #endif /* CONFIG_SCHED_DEBUG */ | 267 | #endif |
diff --git a/kernel/sched/autogroup.h b/kernel/sched/autogroup.h index 27cd22b89824..b96419974a1f 100644 --- a/kernel/sched/autogroup.h +++ b/kernel/sched/autogroup.h | |||
@@ -1,15 +1,11 @@ | |||
1 | /* SPDX-License-Identifier: GPL-2.0 */ | 1 | /* SPDX-License-Identifier: GPL-2.0 */ |
2 | #ifdef CONFIG_SCHED_AUTOGROUP | 2 | #ifdef CONFIG_SCHED_AUTOGROUP |
3 | 3 | ||
4 | #include <linux/kref.h> | ||
5 | #include <linux/rwsem.h> | ||
6 | #include <linux/sched/autogroup.h> | ||
7 | |||
8 | struct autogroup { | 4 | struct autogroup { |
9 | /* | 5 | /* |
10 | * reference doesn't mean how many thread attach to this | 6 | * Reference doesn't mean how many threads attach to this |
11 | * autogroup now. It just stands for the number of task | 7 | * autogroup now. It just stands for the number of tasks |
12 | * could use this autogroup. | 8 | * which could use this autogroup. |
13 | */ | 9 | */ |
14 | struct kref kref; | 10 | struct kref kref; |
15 | struct task_group *tg; | 11 | struct task_group *tg; |
@@ -56,11 +52,9 @@ autogroup_task_group(struct task_struct *p, struct task_group *tg) | |||
56 | return tg; | 52 | return tg; |
57 | } | 53 | } |
58 | 54 | ||
59 | #ifdef CONFIG_SCHED_DEBUG | ||
60 | static inline int autogroup_path(struct task_group *tg, char *buf, int buflen) | 55 | static inline int autogroup_path(struct task_group *tg, char *buf, int buflen) |
61 | { | 56 | { |
62 | return 0; | 57 | return 0; |
63 | } | 58 | } |
64 | #endif | ||
65 | 59 | ||
66 | #endif /* CONFIG_SCHED_AUTOGROUP */ | 60 | #endif /* CONFIG_SCHED_AUTOGROUP */ |
diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c index e086babe6c61..10c83e73837a 100644 --- a/kernel/sched/clock.c +++ b/kernel/sched/clock.c | |||
@@ -1,5 +1,5 @@ | |||
1 | /* | 1 | /* |
2 | * sched_clock for unstable cpu clocks | 2 | * sched_clock() for unstable CPU clocks |
3 | * | 3 | * |
4 | * Copyright (C) 2008 Red Hat, Inc., Peter Zijlstra | 4 | * Copyright (C) 2008 Red Hat, Inc., Peter Zijlstra |
5 | * | 5 | * |
@@ -11,7 +11,7 @@ | |||
11 | * Guillaume Chazarain <guichaz@gmail.com> | 11 | * Guillaume Chazarain <guichaz@gmail.com> |
12 | * | 12 | * |
13 | * | 13 | * |
14 | * What: | 14 | * What this file implements: |
15 | * | 15 | * |
16 | * cpu_clock(i) provides a fast (execution time) high resolution | 16 | * cpu_clock(i) provides a fast (execution time) high resolution |
17 | * clock with bounded drift between CPUs. The value of cpu_clock(i) | 17 | * clock with bounded drift between CPUs. The value of cpu_clock(i) |
@@ -26,11 +26,11 @@ | |||
26 | * at 0 on boot (but people really shouldn't rely on that). | 26 | * at 0 on boot (but people really shouldn't rely on that). |
27 | * | 27 | * |
28 | * cpu_clock(i) -- can be used from any context, including NMI. | 28 | * cpu_clock(i) -- can be used from any context, including NMI. |
29 | * local_clock() -- is cpu_clock() on the current cpu. | 29 | * local_clock() -- is cpu_clock() on the current CPU. |
30 | * | 30 | * |
31 | * sched_clock_cpu(i) | 31 | * sched_clock_cpu(i) |
32 | * | 32 | * |
33 | * How: | 33 | * How it is implemented: |
34 | * | 34 | * |
35 | * The implementation either uses sched_clock() when | 35 | * The implementation either uses sched_clock() when |
36 | * !CONFIG_HAVE_UNSTABLE_SCHED_CLOCK, which means in that case the | 36 | * !CONFIG_HAVE_UNSTABLE_SCHED_CLOCK, which means in that case the |
@@ -52,19 +52,7 @@ | |||
52 | * that is otherwise invisible (TSC gets stopped). | 52 | * that is otherwise invisible (TSC gets stopped). |
53 | * | 53 | * |
54 | */ | 54 | */ |
55 | #include <linux/spinlock.h> | 55 | #include "sched.h" |
56 | #include <linux/hardirq.h> | ||
57 | #include <linux/export.h> | ||
58 | #include <linux/percpu.h> | ||
59 | #include <linux/ktime.h> | ||
60 | #include <linux/sched.h> | ||
61 | #include <linux/nmi.h> | ||
62 | #include <linux/sched/clock.h> | ||
63 | #include <linux/static_key.h> | ||
64 | #include <linux/workqueue.h> | ||
65 | #include <linux/compiler.h> | ||
66 | #include <linux/tick.h> | ||
67 | #include <linux/init.h> | ||
68 | 56 | ||
69 | /* | 57 | /* |
70 | * Scheduler clock - returns current time in nanosec units. | 58 | * Scheduler clock - returns current time in nanosec units. |
@@ -302,21 +290,21 @@ again: | |||
302 | * cmpxchg64 below only protects one readout. | 290 | * cmpxchg64 below only protects one readout. |
303 | * | 291 | * |
304 | * We must reread via sched_clock_local() in the retry case on | 292 | * We must reread via sched_clock_local() in the retry case on |
305 | * 32bit as an NMI could use sched_clock_local() via the | 293 | * 32-bit kernels as an NMI could use sched_clock_local() via the |
306 | * tracer and hit between the readout of | 294 | * tracer and hit between the readout of |
307 | * the low32bit and the high 32bit portion. | 295 | * the low 32-bit and the high 32-bit portion. |
308 | */ | 296 | */ |
309 | this_clock = sched_clock_local(my_scd); | 297 | this_clock = sched_clock_local(my_scd); |
310 | /* | 298 | /* |
311 | * We must enforce atomic readout on 32bit, otherwise the | 299 | * We must enforce atomic readout on 32-bit, otherwise the |
312 | * update on the remote cpu can hit inbetween the readout of | 300 | * update on the remote CPU can hit inbetween the readout of |
313 | * the low32bit and the high 32bit portion. | 301 | * the low 32-bit and the high 32-bit portion. |
314 | */ | 302 | */ |
315 | remote_clock = cmpxchg64(&scd->clock, 0, 0); | 303 | remote_clock = cmpxchg64(&scd->clock, 0, 0); |
316 | #else | 304 | #else |
317 | /* | 305 | /* |
318 | * On 64bit the read of [my]scd->clock is atomic versus the | 306 | * On 64-bit kernels the read of [my]scd->clock is atomic versus the |
319 | * update, so we can avoid the above 32bit dance. | 307 | * update, so we can avoid the above 32-bit dance. |
320 | */ | 308 | */ |
321 | sched_clock_local(my_scd); | 309 | sched_clock_local(my_scd); |
322 | again: | 310 | again: |
diff --git a/kernel/sched/completion.c b/kernel/sched/completion.c index 0926aef10dad..e426b0cb9ac6 100644 --- a/kernel/sched/completion.c +++ b/kernel/sched/completion.c | |||
@@ -11,10 +11,7 @@ | |||
11 | * typically be used for exclusion which gives rise to priority inversion. | 11 | * typically be used for exclusion which gives rise to priority inversion. |
12 | * Waiting for completion is a typically sync point, but not an exclusion point. | 12 | * Waiting for completion is a typically sync point, but not an exclusion point. |
13 | */ | 13 | */ |
14 | 14 | #include "sched.h" | |
15 | #include <linux/sched/signal.h> | ||
16 | #include <linux/sched/debug.h> | ||
17 | #include <linux/completion.h> | ||
18 | 15 | ||
19 | /** | 16 | /** |
20 | * complete: - signals a single thread waiting on this completion | 17 | * complete: - signals a single thread waiting on this completion |
@@ -283,7 +280,7 @@ EXPORT_SYMBOL(wait_for_completion_killable_timeout); | |||
283 | bool try_wait_for_completion(struct completion *x) | 280 | bool try_wait_for_completion(struct completion *x) |
284 | { | 281 | { |
285 | unsigned long flags; | 282 | unsigned long flags; |
286 | int ret = 1; | 283 | bool ret = true; |
287 | 284 | ||
288 | /* | 285 | /* |
289 | * Since x->done will need to be locked only | 286 | * Since x->done will need to be locked only |
@@ -292,11 +289,11 @@ bool try_wait_for_completion(struct completion *x) | |||
292 | * return early in the blocking case. | 289 | * return early in the blocking case. |
293 | */ | 290 | */ |
294 | if (!READ_ONCE(x->done)) | 291 | if (!READ_ONCE(x->done)) |
295 | return 0; | 292 | return false; |
296 | 293 | ||
297 | spin_lock_irqsave(&x->wait.lock, flags); | 294 | spin_lock_irqsave(&x->wait.lock, flags); |
298 | if (!x->done) | 295 | if (!x->done) |
299 | ret = 0; | 296 | ret = false; |
300 | else if (x->done != UINT_MAX) | 297 | else if (x->done != UINT_MAX) |
301 | x->done--; | 298 | x->done--; |
302 | spin_unlock_irqrestore(&x->wait.lock, flags); | 299 | spin_unlock_irqrestore(&x->wait.lock, flags); |
diff --git a/kernel/sched/core.c b/kernel/sched/core.c index c94895bc5a2c..28b68995a417 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c | |||
@@ -5,37 +5,11 @@ | |||
5 | * | 5 | * |
6 | * Copyright (C) 1991-2002 Linus Torvalds | 6 | * Copyright (C) 1991-2002 Linus Torvalds |
7 | */ | 7 | */ |
8 | #include <linux/sched.h> | 8 | #include "sched.h" |
9 | #include <linux/sched/clock.h> | ||
10 | #include <uapi/linux/sched/types.h> | ||
11 | #include <linux/sched/loadavg.h> | ||
12 | #include <linux/sched/hotplug.h> | ||
13 | #include <linux/wait_bit.h> | ||
14 | #include <linux/cpuset.h> | ||
15 | #include <linux/delayacct.h> | ||
16 | #include <linux/init_task.h> | ||
17 | #include <linux/context_tracking.h> | ||
18 | #include <linux/rcupdate_wait.h> | ||
19 | #include <linux/compat.h> | ||
20 | |||
21 | #include <linux/blkdev.h> | ||
22 | #include <linux/kprobes.h> | ||
23 | #include <linux/mmu_context.h> | ||
24 | #include <linux/module.h> | ||
25 | #include <linux/nmi.h> | ||
26 | #include <linux/prefetch.h> | ||
27 | #include <linux/profile.h> | ||
28 | #include <linux/security.h> | ||
29 | #include <linux/syscalls.h> | ||
30 | #include <linux/sched/isolation.h> | ||
31 | 9 | ||
32 | #include <asm/switch_to.h> | 10 | #include <asm/switch_to.h> |
33 | #include <asm/tlb.h> | 11 | #include <asm/tlb.h> |
34 | #ifdef CONFIG_PARAVIRT | ||
35 | #include <asm/paravirt.h> | ||
36 | #endif | ||
37 | 12 | ||
38 | #include "sched.h" | ||
39 | #include "../workqueue_internal.h" | 13 | #include "../workqueue_internal.h" |
40 | #include "../smpboot.h" | 14 | #include "../smpboot.h" |
41 | 15 | ||
@@ -135,7 +109,7 @@ struct rq *task_rq_lock(struct task_struct *p, struct rq_flags *rf) | |||
135 | * [L] ->on_rq | 109 | * [L] ->on_rq |
136 | * RELEASE (rq->lock) | 110 | * RELEASE (rq->lock) |
137 | * | 111 | * |
138 | * If we observe the old cpu in task_rq_lock, the acquire of | 112 | * If we observe the old CPU in task_rq_lock, the acquire of |
139 | * the old rq->lock will fully serialize against the stores. | 113 | * the old rq->lock will fully serialize against the stores. |
140 | * | 114 | * |
141 | * If we observe the new CPU in task_rq_lock, the acquire will | 115 | * If we observe the new CPU in task_rq_lock, the acquire will |
@@ -333,7 +307,7 @@ void hrtick_start(struct rq *rq, u64 delay) | |||
333 | } | 307 | } |
334 | #endif /* CONFIG_SMP */ | 308 | #endif /* CONFIG_SMP */ |
335 | 309 | ||
336 | static void init_rq_hrtick(struct rq *rq) | 310 | static void hrtick_rq_init(struct rq *rq) |
337 | { | 311 | { |
338 | #ifdef CONFIG_SMP | 312 | #ifdef CONFIG_SMP |
339 | rq->hrtick_csd_pending = 0; | 313 | rq->hrtick_csd_pending = 0; |
@@ -351,7 +325,7 @@ static inline void hrtick_clear(struct rq *rq) | |||
351 | { | 325 | { |
352 | } | 326 | } |
353 | 327 | ||
354 | static inline void init_rq_hrtick(struct rq *rq) | 328 | static inline void hrtick_rq_init(struct rq *rq) |
355 | { | 329 | { |
356 | } | 330 | } |
357 | #endif /* CONFIG_SCHED_HRTICK */ | 331 | #endif /* CONFIG_SCHED_HRTICK */ |
@@ -609,7 +583,7 @@ static inline bool got_nohz_idle_kick(void) | |||
609 | { | 583 | { |
610 | int cpu = smp_processor_id(); | 584 | int cpu = smp_processor_id(); |
611 | 585 | ||
612 | if (!test_bit(NOHZ_BALANCE_KICK, nohz_flags(cpu))) | 586 | if (!(atomic_read(nohz_flags(cpu)) & NOHZ_KICK_MASK)) |
613 | return false; | 587 | return false; |
614 | 588 | ||
615 | if (idle_cpu(cpu) && !need_resched()) | 589 | if (idle_cpu(cpu) && !need_resched()) |
@@ -619,7 +593,7 @@ static inline bool got_nohz_idle_kick(void) | |||
619 | * We can't run Idle Load Balance on this CPU for this time so we | 593 | * We can't run Idle Load Balance on this CPU for this time so we |
620 | * cancel it and clear NOHZ_BALANCE_KICK | 594 | * cancel it and clear NOHZ_BALANCE_KICK |
621 | */ | 595 | */ |
622 | clear_bit(NOHZ_BALANCE_KICK, nohz_flags(cpu)); | 596 | atomic_andnot(NOHZ_KICK_MASK, nohz_flags(cpu)); |
623 | return false; | 597 | return false; |
624 | } | 598 | } |
625 | 599 | ||
@@ -1457,7 +1431,7 @@ EXPORT_SYMBOL_GPL(kick_process); | |||
1457 | * | 1431 | * |
1458 | * - cpu_active must be a subset of cpu_online | 1432 | * - cpu_active must be a subset of cpu_online |
1459 | * | 1433 | * |
1460 | * - on cpu-up we allow per-cpu kthreads on the online && !active cpu, | 1434 | * - on CPU-up we allow per-CPU kthreads on the online && !active CPU, |
1461 | * see __set_cpus_allowed_ptr(). At this point the newly online | 1435 | * see __set_cpus_allowed_ptr(). At this point the newly online |
1462 | * CPU isn't yet part of the sched domains, and balancing will not | 1436 | * CPU isn't yet part of the sched domains, and balancing will not |
1463 | * see it. | 1437 | * see it. |
@@ -2488,17 +2462,17 @@ void wake_up_new_task(struct task_struct *p) | |||
2488 | 2462 | ||
2489 | #ifdef CONFIG_PREEMPT_NOTIFIERS | 2463 | #ifdef CONFIG_PREEMPT_NOTIFIERS |
2490 | 2464 | ||
2491 | static struct static_key preempt_notifier_key = STATIC_KEY_INIT_FALSE; | 2465 | static DEFINE_STATIC_KEY_FALSE(preempt_notifier_key); |
2492 | 2466 | ||
2493 | void preempt_notifier_inc(void) | 2467 | void preempt_notifier_inc(void) |
2494 | { | 2468 | { |
2495 | static_key_slow_inc(&preempt_notifier_key); | 2469 | static_branch_inc(&preempt_notifier_key); |
2496 | } | 2470 | } |
2497 | EXPORT_SYMBOL_GPL(preempt_notifier_inc); | 2471 | EXPORT_SYMBOL_GPL(preempt_notifier_inc); |
2498 | 2472 | ||
2499 | void preempt_notifier_dec(void) | 2473 | void preempt_notifier_dec(void) |
2500 | { | 2474 | { |
2501 | static_key_slow_dec(&preempt_notifier_key); | 2475 | static_branch_dec(&preempt_notifier_key); |
2502 | } | 2476 | } |
2503 | EXPORT_SYMBOL_GPL(preempt_notifier_dec); | 2477 | EXPORT_SYMBOL_GPL(preempt_notifier_dec); |
2504 | 2478 | ||
@@ -2508,7 +2482,7 @@ EXPORT_SYMBOL_GPL(preempt_notifier_dec); | |||
2508 | */ | 2482 | */ |
2509 | void preempt_notifier_register(struct preempt_notifier *notifier) | 2483 | void preempt_notifier_register(struct preempt_notifier *notifier) |
2510 | { | 2484 | { |
2511 | if (!static_key_false(&preempt_notifier_key)) | 2485 | if (!static_branch_unlikely(&preempt_notifier_key)) |
2512 | WARN(1, "registering preempt_notifier while notifiers disabled\n"); | 2486 | WARN(1, "registering preempt_notifier while notifiers disabled\n"); |
2513 | 2487 | ||
2514 | hlist_add_head(&notifier->link, &current->preempt_notifiers); | 2488 | hlist_add_head(&notifier->link, &current->preempt_notifiers); |
@@ -2537,7 +2511,7 @@ static void __fire_sched_in_preempt_notifiers(struct task_struct *curr) | |||
2537 | 2511 | ||
2538 | static __always_inline void fire_sched_in_preempt_notifiers(struct task_struct *curr) | 2512 | static __always_inline void fire_sched_in_preempt_notifiers(struct task_struct *curr) |
2539 | { | 2513 | { |
2540 | if (static_key_false(&preempt_notifier_key)) | 2514 | if (static_branch_unlikely(&preempt_notifier_key)) |
2541 | __fire_sched_in_preempt_notifiers(curr); | 2515 | __fire_sched_in_preempt_notifiers(curr); |
2542 | } | 2516 | } |
2543 | 2517 | ||
@@ -2555,7 +2529,7 @@ static __always_inline void | |||
2555 | fire_sched_out_preempt_notifiers(struct task_struct *curr, | 2529 | fire_sched_out_preempt_notifiers(struct task_struct *curr, |
2556 | struct task_struct *next) | 2530 | struct task_struct *next) |
2557 | { | 2531 | { |
2558 | if (static_key_false(&preempt_notifier_key)) | 2532 | if (static_branch_unlikely(&preempt_notifier_key)) |
2559 | __fire_sched_out_preempt_notifiers(curr, next); | 2533 | __fire_sched_out_preempt_notifiers(curr, next); |
2560 | } | 2534 | } |
2561 | 2535 | ||
@@ -2629,6 +2603,18 @@ static inline void finish_lock_switch(struct rq *rq) | |||
2629 | raw_spin_unlock_irq(&rq->lock); | 2603 | raw_spin_unlock_irq(&rq->lock); |
2630 | } | 2604 | } |
2631 | 2605 | ||
2606 | /* | ||
2607 | * NOP if the arch has not defined these: | ||
2608 | */ | ||
2609 | |||
2610 | #ifndef prepare_arch_switch | ||
2611 | # define prepare_arch_switch(next) do { } while (0) | ||
2612 | #endif | ||
2613 | |||
2614 | #ifndef finish_arch_post_lock_switch | ||
2615 | # define finish_arch_post_lock_switch() do { } while (0) | ||
2616 | #endif | ||
2617 | |||
2632 | /** | 2618 | /** |
2633 | * prepare_task_switch - prepare to switch tasks | 2619 | * prepare_task_switch - prepare to switch tasks |
2634 | * @rq: the runqueue preparing to switch | 2620 | * @rq: the runqueue preparing to switch |
@@ -3037,7 +3023,7 @@ unsigned long long task_sched_runtime(struct task_struct *p) | |||
3037 | 3023 | ||
3038 | #if defined(CONFIG_64BIT) && defined(CONFIG_SMP) | 3024 | #if defined(CONFIG_64BIT) && defined(CONFIG_SMP) |
3039 | /* | 3025 | /* |
3040 | * 64-bit doesn't need locks to atomically read a 64bit value. | 3026 | * 64-bit doesn't need locks to atomically read a 64-bit value. |
3041 | * So we have a optimization chance when the task's delta_exec is 0. | 3027 | * So we have a optimization chance when the task's delta_exec is 0. |
3042 | * Reading ->on_cpu is racy, but this is ok. | 3028 | * Reading ->on_cpu is racy, but this is ok. |
3043 | * | 3029 | * |
@@ -3096,35 +3082,99 @@ void scheduler_tick(void) | |||
3096 | rq->idle_balance = idle_cpu(cpu); | 3082 | rq->idle_balance = idle_cpu(cpu); |
3097 | trigger_load_balance(rq); | 3083 | trigger_load_balance(rq); |
3098 | #endif | 3084 | #endif |
3099 | rq_last_tick_reset(rq); | ||
3100 | } | 3085 | } |
3101 | 3086 | ||
3102 | #ifdef CONFIG_NO_HZ_FULL | 3087 | #ifdef CONFIG_NO_HZ_FULL |
3103 | /** | 3088 | |
3104 | * scheduler_tick_max_deferment | 3089 | struct tick_work { |
3105 | * | 3090 | int cpu; |
3106 | * Keep at least one tick per second when a single | 3091 | struct delayed_work work; |
3107 | * active task is running because the scheduler doesn't | 3092 | }; |
3108 | * yet completely support full dynticks environment. | 3093 | |
3109 | * | 3094 | static struct tick_work __percpu *tick_work_cpu; |
3110 | * This makes sure that uptime, CFS vruntime, load | 3095 | |
3111 | * balancing, etc... continue to move forward, even | 3096 | static void sched_tick_remote(struct work_struct *work) |
3112 | * with a very low granularity. | ||
3113 | * | ||
3114 | * Return: Maximum deferment in nanoseconds. | ||
3115 | */ | ||
3116 | u64 scheduler_tick_max_deferment(void) | ||
3117 | { | 3097 | { |
3118 | struct rq *rq = this_rq(); | 3098 | struct delayed_work *dwork = to_delayed_work(work); |
3119 | unsigned long next, now = READ_ONCE(jiffies); | 3099 | struct tick_work *twork = container_of(dwork, struct tick_work, work); |
3100 | int cpu = twork->cpu; | ||
3101 | struct rq *rq = cpu_rq(cpu); | ||
3102 | struct rq_flags rf; | ||
3103 | |||
3104 | /* | ||
3105 | * Handle the tick only if it appears the remote CPU is running in full | ||
3106 | * dynticks mode. The check is racy by nature, but missing a tick or | ||
3107 | * having one too much is no big deal because the scheduler tick updates | ||
3108 | * statistics and checks timeslices in a time-independent way, regardless | ||
3109 | * of when exactly it is running. | ||
3110 | */ | ||
3111 | if (!idle_cpu(cpu) && tick_nohz_tick_stopped_cpu(cpu)) { | ||
3112 | struct task_struct *curr; | ||
3113 | u64 delta; | ||
3120 | 3114 | ||
3121 | next = rq->last_sched_tick + HZ; | 3115 | rq_lock_irq(rq, &rf); |
3116 | update_rq_clock(rq); | ||
3117 | curr = rq->curr; | ||
3118 | delta = rq_clock_task(rq) - curr->se.exec_start; | ||
3122 | 3119 | ||
3123 | if (time_before_eq(next, now)) | 3120 | /* |
3124 | return 0; | 3121 | * Make sure the next tick runs within a reasonable |
3122 | * amount of time. | ||
3123 | */ | ||
3124 | WARN_ON_ONCE(delta > (u64)NSEC_PER_SEC * 3); | ||
3125 | curr->sched_class->task_tick(rq, curr, 0); | ||
3126 | rq_unlock_irq(rq, &rf); | ||
3127 | } | ||
3125 | 3128 | ||
3126 | return jiffies_to_nsecs(next - now); | 3129 | /* |
3130 | * Run the remote tick once per second (1Hz). This arbitrary | ||
3131 | * frequency is large enough to avoid overload but short enough | ||
3132 | * to keep scheduler internal stats reasonably up to date. | ||
3133 | */ | ||
3134 | queue_delayed_work(system_unbound_wq, dwork, HZ); | ||
3135 | } | ||
3136 | |||
3137 | static void sched_tick_start(int cpu) | ||
3138 | { | ||
3139 | struct tick_work *twork; | ||
3140 | |||
3141 | if (housekeeping_cpu(cpu, HK_FLAG_TICK)) | ||
3142 | return; | ||
3143 | |||
3144 | WARN_ON_ONCE(!tick_work_cpu); | ||
3145 | |||
3146 | twork = per_cpu_ptr(tick_work_cpu, cpu); | ||
3147 | twork->cpu = cpu; | ||
3148 | INIT_DELAYED_WORK(&twork->work, sched_tick_remote); | ||
3149 | queue_delayed_work(system_unbound_wq, &twork->work, HZ); | ||
3127 | } | 3150 | } |
3151 | |||
3152 | #ifdef CONFIG_HOTPLUG_CPU | ||
3153 | static void sched_tick_stop(int cpu) | ||
3154 | { | ||
3155 | struct tick_work *twork; | ||
3156 | |||
3157 | if (housekeeping_cpu(cpu, HK_FLAG_TICK)) | ||
3158 | return; | ||
3159 | |||
3160 | WARN_ON_ONCE(!tick_work_cpu); | ||
3161 | |||
3162 | twork = per_cpu_ptr(tick_work_cpu, cpu); | ||
3163 | cancel_delayed_work_sync(&twork->work); | ||
3164 | } | ||
3165 | #endif /* CONFIG_HOTPLUG_CPU */ | ||
3166 | |||
3167 | int __init sched_tick_offload_init(void) | ||
3168 | { | ||
3169 | tick_work_cpu = alloc_percpu(struct tick_work); | ||
3170 | BUG_ON(!tick_work_cpu); | ||
3171 | |||
3172 | return 0; | ||
3173 | } | ||
3174 | |||
3175 | #else /* !CONFIG_NO_HZ_FULL */ | ||
3176 | static inline void sched_tick_start(int cpu) { } | ||
3177 | static inline void sched_tick_stop(int cpu) { } | ||
3128 | #endif | 3178 | #endif |
3129 | 3179 | ||
3130 | #if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \ | 3180 | #if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \ |
@@ -4892,7 +4942,7 @@ SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len, | |||
4892 | * | 4942 | * |
4893 | * Return: 0. | 4943 | * Return: 0. |
4894 | */ | 4944 | */ |
4895 | SYSCALL_DEFINE0(sched_yield) | 4945 | static void do_sched_yield(void) |
4896 | { | 4946 | { |
4897 | struct rq_flags rf; | 4947 | struct rq_flags rf; |
4898 | struct rq *rq; | 4948 | struct rq *rq; |
@@ -4913,7 +4963,11 @@ SYSCALL_DEFINE0(sched_yield) | |||
4913 | sched_preempt_enable_no_resched(); | 4963 | sched_preempt_enable_no_resched(); |
4914 | 4964 | ||
4915 | schedule(); | 4965 | schedule(); |
4966 | } | ||
4916 | 4967 | ||
4968 | SYSCALL_DEFINE0(sched_yield) | ||
4969 | { | ||
4970 | do_sched_yield(); | ||
4917 | return 0; | 4971 | return 0; |
4918 | } | 4972 | } |
4919 | 4973 | ||
@@ -4997,7 +5051,7 @@ EXPORT_SYMBOL(__cond_resched_softirq); | |||
4997 | void __sched yield(void) | 5051 | void __sched yield(void) |
4998 | { | 5052 | { |
4999 | set_current_state(TASK_RUNNING); | 5053 | set_current_state(TASK_RUNNING); |
5000 | sys_sched_yield(); | 5054 | do_sched_yield(); |
5001 | } | 5055 | } |
5002 | EXPORT_SYMBOL(yield); | 5056 | EXPORT_SYMBOL(yield); |
5003 | 5057 | ||
@@ -5786,6 +5840,7 @@ int sched_cpu_starting(unsigned int cpu) | |||
5786 | { | 5840 | { |
5787 | set_cpu_rq_start_time(cpu); | 5841 | set_cpu_rq_start_time(cpu); |
5788 | sched_rq_cpu_starting(cpu); | 5842 | sched_rq_cpu_starting(cpu); |
5843 | sched_tick_start(cpu); | ||
5789 | return 0; | 5844 | return 0; |
5790 | } | 5845 | } |
5791 | 5846 | ||
@@ -5797,6 +5852,7 @@ int sched_cpu_dying(unsigned int cpu) | |||
5797 | 5852 | ||
5798 | /* Handle pending wakeups and then migrate everything off */ | 5853 | /* Handle pending wakeups and then migrate everything off */ |
5799 | sched_ttwu_pending(); | 5854 | sched_ttwu_pending(); |
5855 | sched_tick_stop(cpu); | ||
5800 | 5856 | ||
5801 | rq_lock_irqsave(rq, &rf); | 5857 | rq_lock_irqsave(rq, &rf); |
5802 | if (rq->rd) { | 5858 | if (rq->rd) { |
@@ -5809,7 +5865,7 @@ int sched_cpu_dying(unsigned int cpu) | |||
5809 | 5865 | ||
5810 | calc_load_migrate(rq); | 5866 | calc_load_migrate(rq); |
5811 | update_max_interval(); | 5867 | update_max_interval(); |
5812 | nohz_balance_exit_idle(cpu); | 5868 | nohz_balance_exit_idle(rq); |
5813 | hrtick_clear(rq); | 5869 | hrtick_clear(rq); |
5814 | return 0; | 5870 | return 0; |
5815 | } | 5871 | } |
@@ -6022,13 +6078,11 @@ void __init sched_init(void) | |||
6022 | rq_attach_root(rq, &def_root_domain); | 6078 | rq_attach_root(rq, &def_root_domain); |
6023 | #ifdef CONFIG_NO_HZ_COMMON | 6079 | #ifdef CONFIG_NO_HZ_COMMON |
6024 | rq->last_load_update_tick = jiffies; | 6080 | rq->last_load_update_tick = jiffies; |
6025 | rq->nohz_flags = 0; | 6081 | rq->last_blocked_load_update_tick = jiffies; |
6026 | #endif | 6082 | atomic_set(&rq->nohz_flags, 0); |
6027 | #ifdef CONFIG_NO_HZ_FULL | ||
6028 | rq->last_sched_tick = 0; | ||
6029 | #endif | 6083 | #endif |
6030 | #endif /* CONFIG_SMP */ | 6084 | #endif /* CONFIG_SMP */ |
6031 | init_rq_hrtick(rq); | 6085 | hrtick_rq_init(rq); |
6032 | atomic_set(&rq->nr_iowait, 0); | 6086 | atomic_set(&rq->nr_iowait, 0); |
6033 | } | 6087 | } |
6034 | 6088 | ||
@@ -7027,3 +7081,5 @@ const u32 sched_prio_to_wmult[40] = { | |||
7027 | /* 10 */ 39045157, 49367440, 61356676, 76695844, 95443717, | 7081 | /* 10 */ 39045157, 49367440, 61356676, 76695844, 95443717, |
7028 | /* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153, | 7082 | /* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153, |
7029 | }; | 7083 | }; |
7084 | |||
7085 | #undef CREATE_TRACE_POINTS | ||
diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c index 44ab32a4fab6..9fbb10383434 100644 --- a/kernel/sched/cpuacct.c +++ b/kernel/sched/cpuacct.c | |||
@@ -1,24 +1,13 @@ | |||
1 | // SPDX-License-Identifier: GPL-2.0 | 1 | // SPDX-License-Identifier: GPL-2.0 |
2 | #include <linux/cgroup.h> | ||
3 | #include <linux/slab.h> | ||
4 | #include <linux/percpu.h> | ||
5 | #include <linux/spinlock.h> | ||
6 | #include <linux/cpumask.h> | ||
7 | #include <linux/seq_file.h> | ||
8 | #include <linux/rcupdate.h> | ||
9 | #include <linux/kernel_stat.h> | ||
10 | #include <linux/err.h> | ||
11 | |||
12 | #include "sched.h" | ||
13 | |||
14 | /* | 2 | /* |
15 | * CPU accounting code for task groups. | 3 | * CPU accounting code for task groups. |
16 | * | 4 | * |
17 | * Based on the work by Paul Menage (menage@google.com) and Balbir Singh | 5 | * Based on the work by Paul Menage (menage@google.com) and Balbir Singh |
18 | * (balbir@in.ibm.com). | 6 | * (balbir@in.ibm.com). |
19 | */ | 7 | */ |
8 | #include "sched.h" | ||
20 | 9 | ||
21 | /* Time spent by the tasks of the cpu accounting group executing in ... */ | 10 | /* Time spent by the tasks of the CPU accounting group executing in ... */ |
22 | enum cpuacct_stat_index { | 11 | enum cpuacct_stat_index { |
23 | CPUACCT_STAT_USER, /* ... user mode */ | 12 | CPUACCT_STAT_USER, /* ... user mode */ |
24 | CPUACCT_STAT_SYSTEM, /* ... kernel mode */ | 13 | CPUACCT_STAT_SYSTEM, /* ... kernel mode */ |
@@ -35,12 +24,12 @@ struct cpuacct_usage { | |||
35 | u64 usages[CPUACCT_STAT_NSTATS]; | 24 | u64 usages[CPUACCT_STAT_NSTATS]; |
36 | }; | 25 | }; |
37 | 26 | ||
38 | /* track cpu usage of a group of tasks and its child groups */ | 27 | /* track CPU usage of a group of tasks and its child groups */ |
39 | struct cpuacct { | 28 | struct cpuacct { |
40 | struct cgroup_subsys_state css; | 29 | struct cgroup_subsys_state css; |
41 | /* cpuusage holds pointer to a u64-type object on every cpu */ | 30 | /* cpuusage holds pointer to a u64-type object on every CPU */ |
42 | struct cpuacct_usage __percpu *cpuusage; | 31 | struct cpuacct_usage __percpu *cpuusage; |
43 | struct kernel_cpustat __percpu *cpustat; | 32 | struct kernel_cpustat __percpu *cpustat; |
44 | }; | 33 | }; |
45 | 34 | ||
46 | static inline struct cpuacct *css_ca(struct cgroup_subsys_state *css) | 35 | static inline struct cpuacct *css_ca(struct cgroup_subsys_state *css) |
@@ -48,7 +37,7 @@ static inline struct cpuacct *css_ca(struct cgroup_subsys_state *css) | |||
48 | return css ? container_of(css, struct cpuacct, css) : NULL; | 37 | return css ? container_of(css, struct cpuacct, css) : NULL; |
49 | } | 38 | } |
50 | 39 | ||
51 | /* return cpu accounting group to which this task belongs */ | 40 | /* Return CPU accounting group to which this task belongs */ |
52 | static inline struct cpuacct *task_ca(struct task_struct *tsk) | 41 | static inline struct cpuacct *task_ca(struct task_struct *tsk) |
53 | { | 42 | { |
54 | return css_ca(task_css(tsk, cpuacct_cgrp_id)); | 43 | return css_ca(task_css(tsk, cpuacct_cgrp_id)); |
@@ -65,7 +54,7 @@ static struct cpuacct root_cpuacct = { | |||
65 | .cpuusage = &root_cpuacct_cpuusage, | 54 | .cpuusage = &root_cpuacct_cpuusage, |
66 | }; | 55 | }; |
67 | 56 | ||
68 | /* create a new cpu accounting group */ | 57 | /* Create a new CPU accounting group */ |
69 | static struct cgroup_subsys_state * | 58 | static struct cgroup_subsys_state * |
70 | cpuacct_css_alloc(struct cgroup_subsys_state *parent_css) | 59 | cpuacct_css_alloc(struct cgroup_subsys_state *parent_css) |
71 | { | 60 | { |
@@ -96,7 +85,7 @@ out: | |||
96 | return ERR_PTR(-ENOMEM); | 85 | return ERR_PTR(-ENOMEM); |
97 | } | 86 | } |
98 | 87 | ||
99 | /* destroy an existing cpu accounting group */ | 88 | /* Destroy an existing CPU accounting group */ |
100 | static void cpuacct_css_free(struct cgroup_subsys_state *css) | 89 | static void cpuacct_css_free(struct cgroup_subsys_state *css) |
101 | { | 90 | { |
102 | struct cpuacct *ca = css_ca(css); | 91 | struct cpuacct *ca = css_ca(css); |
@@ -162,7 +151,7 @@ static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val) | |||
162 | #endif | 151 | #endif |
163 | } | 152 | } |
164 | 153 | ||
165 | /* return total cpu usage (in nanoseconds) of a group */ | 154 | /* Return total CPU usage (in nanoseconds) of a group */ |
166 | static u64 __cpuusage_read(struct cgroup_subsys_state *css, | 155 | static u64 __cpuusage_read(struct cgroup_subsys_state *css, |
167 | enum cpuacct_stat_index index) | 156 | enum cpuacct_stat_index index) |
168 | { | 157 | { |
diff --git a/kernel/sched/cpudeadline.c b/kernel/sched/cpudeadline.c index 8d9562d890d3..50316455ea66 100644 --- a/kernel/sched/cpudeadline.c +++ b/kernel/sched/cpudeadline.c | |||
@@ -10,11 +10,7 @@ | |||
10 | * as published by the Free Software Foundation; version 2 | 10 | * as published by the Free Software Foundation; version 2 |
11 | * of the License. | 11 | * of the License. |
12 | */ | 12 | */ |
13 | 13 | #include "sched.h" | |
14 | #include <linux/gfp.h> | ||
15 | #include <linux/kernel.h> | ||
16 | #include <linux/slab.h> | ||
17 | #include "cpudeadline.h" | ||
18 | 14 | ||
19 | static inline int parent(int i) | 15 | static inline int parent(int i) |
20 | { | 16 | { |
@@ -42,8 +38,9 @@ static void cpudl_heapify_down(struct cpudl *cp, int idx) | |||
42 | return; | 38 | return; |
43 | 39 | ||
44 | /* adapted from lib/prio_heap.c */ | 40 | /* adapted from lib/prio_heap.c */ |
45 | while(1) { | 41 | while (1) { |
46 | u64 largest_dl; | 42 | u64 largest_dl; |
43 | |||
47 | l = left_child(idx); | 44 | l = left_child(idx); |
48 | r = right_child(idx); | 45 | r = right_child(idx); |
49 | largest = idx; | 46 | largest = idx; |
@@ -131,6 +128,7 @@ int cpudl_find(struct cpudl *cp, struct task_struct *p, | |||
131 | return 1; | 128 | return 1; |
132 | } else { | 129 | } else { |
133 | int best_cpu = cpudl_maximum(cp); | 130 | int best_cpu = cpudl_maximum(cp); |
131 | |||
134 | WARN_ON(best_cpu != -1 && !cpu_present(best_cpu)); | 132 | WARN_ON(best_cpu != -1 && !cpu_present(best_cpu)); |
135 | 133 | ||
136 | if (cpumask_test_cpu(best_cpu, &p->cpus_allowed) && | 134 | if (cpumask_test_cpu(best_cpu, &p->cpus_allowed) && |
@@ -145,9 +143,9 @@ int cpudl_find(struct cpudl *cp, struct task_struct *p, | |||
145 | } | 143 | } |
146 | 144 | ||
147 | /* | 145 | /* |
148 | * cpudl_clear - remove a cpu from the cpudl max-heap | 146 | * cpudl_clear - remove a CPU from the cpudl max-heap |
149 | * @cp: the cpudl max-heap context | 147 | * @cp: the cpudl max-heap context |
150 | * @cpu: the target cpu | 148 | * @cpu: the target CPU |
151 | * | 149 | * |
152 | * Notes: assumes cpu_rq(cpu)->lock is locked | 150 | * Notes: assumes cpu_rq(cpu)->lock is locked |
153 | * | 151 | * |
@@ -186,8 +184,8 @@ void cpudl_clear(struct cpudl *cp, int cpu) | |||
186 | /* | 184 | /* |
187 | * cpudl_set - update the cpudl max-heap | 185 | * cpudl_set - update the cpudl max-heap |
188 | * @cp: the cpudl max-heap context | 186 | * @cp: the cpudl max-heap context |
189 | * @cpu: the target cpu | 187 | * @cpu: the target CPU |
190 | * @dl: the new earliest deadline for this cpu | 188 | * @dl: the new earliest deadline for this CPU |
191 | * | 189 | * |
192 | * Notes: assumes cpu_rq(cpu)->lock is locked | 190 | * Notes: assumes cpu_rq(cpu)->lock is locked |
193 | * | 191 | * |
@@ -205,6 +203,7 @@ void cpudl_set(struct cpudl *cp, int cpu, u64 dl) | |||
205 | old_idx = cp->elements[cpu].idx; | 203 | old_idx = cp->elements[cpu].idx; |
206 | if (old_idx == IDX_INVALID) { | 204 | if (old_idx == IDX_INVALID) { |
207 | int new_idx = cp->size++; | 205 | int new_idx = cp->size++; |
206 | |||
208 | cp->elements[new_idx].dl = dl; | 207 | cp->elements[new_idx].dl = dl; |
209 | cp->elements[new_idx].cpu = cpu; | 208 | cp->elements[new_idx].cpu = cpu; |
210 | cp->elements[cpu].idx = new_idx; | 209 | cp->elements[cpu].idx = new_idx; |
@@ -221,7 +220,7 @@ void cpudl_set(struct cpudl *cp, int cpu, u64 dl) | |||
221 | /* | 220 | /* |
222 | * cpudl_set_freecpu - Set the cpudl.free_cpus | 221 | * cpudl_set_freecpu - Set the cpudl.free_cpus |
223 | * @cp: the cpudl max-heap context | 222 | * @cp: the cpudl max-heap context |
224 | * @cpu: rd attached cpu | 223 | * @cpu: rd attached CPU |
225 | */ | 224 | */ |
226 | void cpudl_set_freecpu(struct cpudl *cp, int cpu) | 225 | void cpudl_set_freecpu(struct cpudl *cp, int cpu) |
227 | { | 226 | { |
@@ -231,7 +230,7 @@ void cpudl_set_freecpu(struct cpudl *cp, int cpu) | |||
231 | /* | 230 | /* |
232 | * cpudl_clear_freecpu - Clear the cpudl.free_cpus | 231 | * cpudl_clear_freecpu - Clear the cpudl.free_cpus |
233 | * @cp: the cpudl max-heap context | 232 | * @cp: the cpudl max-heap context |
234 | * @cpu: rd attached cpu | 233 | * @cpu: rd attached CPU |
235 | */ | 234 | */ |
236 | void cpudl_clear_freecpu(struct cpudl *cp, int cpu) | 235 | void cpudl_clear_freecpu(struct cpudl *cp, int cpu) |
237 | { | 236 | { |
diff --git a/kernel/sched/cpudeadline.h b/kernel/sched/cpudeadline.h index b010d26e108e..0adeda93b5fb 100644 --- a/kernel/sched/cpudeadline.h +++ b/kernel/sched/cpudeadline.h | |||
@@ -1,35 +1,26 @@ | |||
1 | /* SPDX-License-Identifier: GPL-2.0 */ | 1 | /* SPDX-License-Identifier: GPL-2.0 */ |
2 | #ifndef _LINUX_CPUDL_H | ||
3 | #define _LINUX_CPUDL_H | ||
4 | 2 | ||
5 | #include <linux/sched.h> | 3 | #define IDX_INVALID -1 |
6 | #include <linux/sched/deadline.h> | ||
7 | |||
8 | #define IDX_INVALID -1 | ||
9 | 4 | ||
10 | struct cpudl_item { | 5 | struct cpudl_item { |
11 | u64 dl; | 6 | u64 dl; |
12 | int cpu; | 7 | int cpu; |
13 | int idx; | 8 | int idx; |
14 | }; | 9 | }; |
15 | 10 | ||
16 | struct cpudl { | 11 | struct cpudl { |
17 | raw_spinlock_t lock; | 12 | raw_spinlock_t lock; |
18 | int size; | 13 | int size; |
19 | cpumask_var_t free_cpus; | 14 | cpumask_var_t free_cpus; |
20 | struct cpudl_item *elements; | 15 | struct cpudl_item *elements; |
21 | }; | 16 | }; |
22 | 17 | ||
23 | |||
24 | #ifdef CONFIG_SMP | 18 | #ifdef CONFIG_SMP |
25 | int cpudl_find(struct cpudl *cp, struct task_struct *p, | 19 | int cpudl_find(struct cpudl *cp, struct task_struct *p, struct cpumask *later_mask); |
26 | struct cpumask *later_mask); | ||
27 | void cpudl_set(struct cpudl *cp, int cpu, u64 dl); | 20 | void cpudl_set(struct cpudl *cp, int cpu, u64 dl); |
28 | void cpudl_clear(struct cpudl *cp, int cpu); | 21 | void cpudl_clear(struct cpudl *cp, int cpu); |
29 | int cpudl_init(struct cpudl *cp); | 22 | int cpudl_init(struct cpudl *cp); |
30 | void cpudl_set_freecpu(struct cpudl *cp, int cpu); | 23 | void cpudl_set_freecpu(struct cpudl *cp, int cpu); |
31 | void cpudl_clear_freecpu(struct cpudl *cp, int cpu); | 24 | void cpudl_clear_freecpu(struct cpudl *cp, int cpu); |
32 | void cpudl_cleanup(struct cpudl *cp); | 25 | void cpudl_cleanup(struct cpudl *cp); |
33 | #endif /* CONFIG_SMP */ | 26 | #endif /* CONFIG_SMP */ |
34 | |||
35 | #endif /* _LINUX_CPUDL_H */ | ||
diff --git a/kernel/sched/cpufreq.c b/kernel/sched/cpufreq.c index dbc51442ecbc..5e54cbcae673 100644 --- a/kernel/sched/cpufreq.c +++ b/kernel/sched/cpufreq.c | |||
@@ -8,7 +8,6 @@ | |||
8 | * it under the terms of the GNU General Public License version 2 as | 8 | * it under the terms of the GNU General Public License version 2 as |
9 | * published by the Free Software Foundation. | 9 | * published by the Free Software Foundation. |
10 | */ | 10 | */ |
11 | |||
12 | #include "sched.h" | 11 | #include "sched.h" |
13 | 12 | ||
14 | DEFINE_PER_CPU(struct update_util_data *, cpufreq_update_util_data); | 13 | DEFINE_PER_CPU(struct update_util_data *, cpufreq_update_util_data); |
diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index 617c6741c525..d2c6083304b4 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c | |||
@@ -11,61 +11,56 @@ | |||
11 | 11 | ||
12 | #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt | 12 | #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt |
13 | 13 | ||
14 | #include <linux/cpufreq.h> | ||
15 | #include <linux/kthread.h> | ||
16 | #include <uapi/linux/sched/types.h> | ||
17 | #include <linux/slab.h> | ||
18 | #include <trace/events/power.h> | ||
19 | |||
20 | #include "sched.h" | 14 | #include "sched.h" |
21 | 15 | ||
16 | #include <trace/events/power.h> | ||
17 | |||
22 | struct sugov_tunables { | 18 | struct sugov_tunables { |
23 | struct gov_attr_set attr_set; | 19 | struct gov_attr_set attr_set; |
24 | unsigned int rate_limit_us; | 20 | unsigned int rate_limit_us; |
25 | }; | 21 | }; |
26 | 22 | ||
27 | struct sugov_policy { | 23 | struct sugov_policy { |
28 | struct cpufreq_policy *policy; | 24 | struct cpufreq_policy *policy; |
29 | 25 | ||
30 | struct sugov_tunables *tunables; | 26 | struct sugov_tunables *tunables; |
31 | struct list_head tunables_hook; | 27 | struct list_head tunables_hook; |
32 | 28 | ||
33 | raw_spinlock_t update_lock; /* For shared policies */ | 29 | raw_spinlock_t update_lock; /* For shared policies */ |
34 | u64 last_freq_update_time; | 30 | u64 last_freq_update_time; |
35 | s64 freq_update_delay_ns; | 31 | s64 freq_update_delay_ns; |
36 | unsigned int next_freq; | 32 | unsigned int next_freq; |
37 | unsigned int cached_raw_freq; | 33 | unsigned int cached_raw_freq; |
38 | 34 | ||
39 | /* The next fields are only needed if fast switch cannot be used. */ | 35 | /* The next fields are only needed if fast switch cannot be used: */ |
40 | struct irq_work irq_work; | 36 | struct irq_work irq_work; |
41 | struct kthread_work work; | 37 | struct kthread_work work; |
42 | struct mutex work_lock; | 38 | struct mutex work_lock; |
43 | struct kthread_worker worker; | 39 | struct kthread_worker worker; |
44 | struct task_struct *thread; | 40 | struct task_struct *thread; |
45 | bool work_in_progress; | 41 | bool work_in_progress; |
46 | 42 | ||
47 | bool need_freq_update; | 43 | bool need_freq_update; |
48 | }; | 44 | }; |
49 | 45 | ||
50 | struct sugov_cpu { | 46 | struct sugov_cpu { |
51 | struct update_util_data update_util; | 47 | struct update_util_data update_util; |
52 | struct sugov_policy *sg_policy; | 48 | struct sugov_policy *sg_policy; |
53 | unsigned int cpu; | 49 | unsigned int cpu; |
54 | 50 | ||
55 | bool iowait_boost_pending; | 51 | bool iowait_boost_pending; |
56 | unsigned int iowait_boost; | 52 | unsigned int iowait_boost; |
57 | unsigned int iowait_boost_max; | 53 | unsigned int iowait_boost_max; |
58 | u64 last_update; | 54 | u64 last_update; |
59 | 55 | ||
60 | /* The fields below are only needed when sharing a policy. */ | 56 | /* The fields below are only needed when sharing a policy: */ |
61 | unsigned long util_cfs; | 57 | unsigned long util_cfs; |
62 | unsigned long util_dl; | 58 | unsigned long util_dl; |
63 | unsigned long max; | 59 | unsigned long max; |
64 | unsigned int flags; | ||
65 | 60 | ||
66 | /* The field below is for single-CPU policies only. */ | 61 | /* The field below is for single-CPU policies only: */ |
67 | #ifdef CONFIG_NO_HZ_COMMON | 62 | #ifdef CONFIG_NO_HZ_COMMON |
68 | unsigned long saved_idle_calls; | 63 | unsigned long saved_idle_calls; |
69 | #endif | 64 | #endif |
70 | }; | 65 | }; |
71 | 66 | ||
@@ -79,9 +74,9 @@ static bool sugov_should_update_freq(struct sugov_policy *sg_policy, u64 time) | |||
79 | 74 | ||
80 | /* | 75 | /* |
81 | * Since cpufreq_update_util() is called with rq->lock held for | 76 | * Since cpufreq_update_util() is called with rq->lock held for |
82 | * the @target_cpu, our per-cpu data is fully serialized. | 77 | * the @target_cpu, our per-CPU data is fully serialized. |
83 | * | 78 | * |
84 | * However, drivers cannot in general deal with cross-cpu | 79 | * However, drivers cannot in general deal with cross-CPU |
85 | * requests, so while get_next_freq() will work, our | 80 | * requests, so while get_next_freq() will work, our |
86 | * sugov_update_commit() call may not for the fast switching platforms. | 81 | * sugov_update_commit() call may not for the fast switching platforms. |
87 | * | 82 | * |
@@ -111,6 +106,7 @@ static bool sugov_should_update_freq(struct sugov_policy *sg_policy, u64 time) | |||
111 | } | 106 | } |
112 | 107 | ||
113 | delta_ns = time - sg_policy->last_freq_update_time; | 108 | delta_ns = time - sg_policy->last_freq_update_time; |
109 | |||
114 | return delta_ns >= sg_policy->freq_update_delay_ns; | 110 | return delta_ns >= sg_policy->freq_update_delay_ns; |
115 | } | 111 | } |
116 | 112 | ||
@@ -186,17 +182,28 @@ static void sugov_get_util(struct sugov_cpu *sg_cpu) | |||
186 | 182 | ||
187 | static unsigned long sugov_aggregate_util(struct sugov_cpu *sg_cpu) | 183 | static unsigned long sugov_aggregate_util(struct sugov_cpu *sg_cpu) |
188 | { | 184 | { |
185 | struct rq *rq = cpu_rq(sg_cpu->cpu); | ||
186 | unsigned long util; | ||
187 | |||
188 | if (rq->rt.rt_nr_running) { | ||
189 | util = sg_cpu->max; | ||
190 | } else { | ||
191 | util = sg_cpu->util_dl; | ||
192 | if (rq->cfs.h_nr_running) | ||
193 | util += sg_cpu->util_cfs; | ||
194 | } | ||
195 | |||
189 | /* | 196 | /* |
190 | * Ideally we would like to set util_dl as min/guaranteed freq and | 197 | * Ideally we would like to set util_dl as min/guaranteed freq and |
191 | * util_cfs + util_dl as requested freq. However, cpufreq is not yet | 198 | * util_cfs + util_dl as requested freq. However, cpufreq is not yet |
192 | * ready for such an interface. So, we only do the latter for now. | 199 | * ready for such an interface. So, we only do the latter for now. |
193 | */ | 200 | */ |
194 | return min(sg_cpu->util_cfs + sg_cpu->util_dl, sg_cpu->max); | 201 | return min(util, sg_cpu->max); |
195 | } | 202 | } |
196 | 203 | ||
197 | static void sugov_set_iowait_boost(struct sugov_cpu *sg_cpu, u64 time) | 204 | static void sugov_set_iowait_boost(struct sugov_cpu *sg_cpu, u64 time, unsigned int flags) |
198 | { | 205 | { |
199 | if (sg_cpu->flags & SCHED_CPUFREQ_IOWAIT) { | 206 | if (flags & SCHED_CPUFREQ_IOWAIT) { |
200 | if (sg_cpu->iowait_boost_pending) | 207 | if (sg_cpu->iowait_boost_pending) |
201 | return; | 208 | return; |
202 | 209 | ||
@@ -260,43 +267,51 @@ static bool sugov_cpu_is_busy(struct sugov_cpu *sg_cpu) | |||
260 | static inline bool sugov_cpu_is_busy(struct sugov_cpu *sg_cpu) { return false; } | 267 | static inline bool sugov_cpu_is_busy(struct sugov_cpu *sg_cpu) { return false; } |
261 | #endif /* CONFIG_NO_HZ_COMMON */ | 268 | #endif /* CONFIG_NO_HZ_COMMON */ |
262 | 269 | ||
270 | /* | ||
271 | * Make sugov_should_update_freq() ignore the rate limit when DL | ||
272 | * has increased the utilization. | ||
273 | */ | ||
274 | static inline void ignore_dl_rate_limit(struct sugov_cpu *sg_cpu, struct sugov_policy *sg_policy) | ||
275 | { | ||
276 | if (cpu_util_dl(cpu_rq(sg_cpu->cpu)) > sg_cpu->util_dl) | ||
277 | sg_policy->need_freq_update = true; | ||
278 | } | ||
279 | |||
263 | static void sugov_update_single(struct update_util_data *hook, u64 time, | 280 | static void sugov_update_single(struct update_util_data *hook, u64 time, |
264 | unsigned int flags) | 281 | unsigned int flags) |
265 | { | 282 | { |
266 | struct sugov_cpu *sg_cpu = container_of(hook, struct sugov_cpu, update_util); | 283 | struct sugov_cpu *sg_cpu = container_of(hook, struct sugov_cpu, update_util); |
267 | struct sugov_policy *sg_policy = sg_cpu->sg_policy; | 284 | struct sugov_policy *sg_policy = sg_cpu->sg_policy; |
268 | struct cpufreq_policy *policy = sg_policy->policy; | ||
269 | unsigned long util, max; | 285 | unsigned long util, max; |
270 | unsigned int next_f; | 286 | unsigned int next_f; |
271 | bool busy; | 287 | bool busy; |
272 | 288 | ||
273 | sugov_set_iowait_boost(sg_cpu, time); | 289 | sugov_set_iowait_boost(sg_cpu, time, flags); |
274 | sg_cpu->last_update = time; | 290 | sg_cpu->last_update = time; |
275 | 291 | ||
292 | ignore_dl_rate_limit(sg_cpu, sg_policy); | ||
293 | |||
276 | if (!sugov_should_update_freq(sg_policy, time)) | 294 | if (!sugov_should_update_freq(sg_policy, time)) |
277 | return; | 295 | return; |
278 | 296 | ||
279 | busy = sugov_cpu_is_busy(sg_cpu); | 297 | busy = sugov_cpu_is_busy(sg_cpu); |
280 | 298 | ||
281 | if (flags & SCHED_CPUFREQ_RT) { | 299 | sugov_get_util(sg_cpu); |
282 | next_f = policy->cpuinfo.max_freq; | 300 | max = sg_cpu->max; |
283 | } else { | 301 | util = sugov_aggregate_util(sg_cpu); |
284 | sugov_get_util(sg_cpu); | 302 | sugov_iowait_boost(sg_cpu, &util, &max); |
285 | max = sg_cpu->max; | 303 | next_f = get_next_freq(sg_policy, util, max); |
286 | util = sugov_aggregate_util(sg_cpu); | 304 | /* |
287 | sugov_iowait_boost(sg_cpu, &util, &max); | 305 | * Do not reduce the frequency if the CPU has not been idle |
288 | next_f = get_next_freq(sg_policy, util, max); | 306 | * recently, as the reduction is likely to be premature then. |
289 | /* | 307 | */ |
290 | * Do not reduce the frequency if the CPU has not been idle | 308 | if (busy && next_f < sg_policy->next_freq) { |
291 | * recently, as the reduction is likely to be premature then. | 309 | next_f = sg_policy->next_freq; |
292 | */ | ||
293 | if (busy && next_f < sg_policy->next_freq) { | ||
294 | next_f = sg_policy->next_freq; | ||
295 | 310 | ||
296 | /* Reset cached freq as next_freq has changed */ | 311 | /* Reset cached freq as next_freq has changed */ |
297 | sg_policy->cached_raw_freq = 0; | 312 | sg_policy->cached_raw_freq = 0; |
298 | } | ||
299 | } | 313 | } |
314 | |||
300 | sugov_update_commit(sg_policy, time, next_f); | 315 | sugov_update_commit(sg_policy, time, next_f); |
301 | } | 316 | } |
302 | 317 | ||
@@ -312,6 +327,8 @@ static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu, u64 time) | |||
312 | unsigned long j_util, j_max; | 327 | unsigned long j_util, j_max; |
313 | s64 delta_ns; | 328 | s64 delta_ns; |
314 | 329 | ||
330 | sugov_get_util(j_sg_cpu); | ||
331 | |||
315 | /* | 332 | /* |
316 | * If the CFS CPU utilization was last updated before the | 333 | * If the CFS CPU utilization was last updated before the |
317 | * previous frequency update and the time elapsed between the | 334 | * previous frequency update and the time elapsed between the |
@@ -325,28 +342,22 @@ static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu, u64 time) | |||
325 | if (delta_ns > TICK_NSEC) { | 342 | if (delta_ns > TICK_NSEC) { |
326 | j_sg_cpu->iowait_boost = 0; | 343 | j_sg_cpu->iowait_boost = 0; |
327 | j_sg_cpu->iowait_boost_pending = false; | 344 | j_sg_cpu->iowait_boost_pending = false; |
328 | j_sg_cpu->util_cfs = 0; | ||
329 | if (j_sg_cpu->util_dl == 0) | ||
330 | continue; | ||
331 | } | 345 | } |
332 | if (j_sg_cpu->flags & SCHED_CPUFREQ_RT) | ||
333 | return policy->cpuinfo.max_freq; | ||
334 | 346 | ||
335 | j_max = j_sg_cpu->max; | 347 | j_max = j_sg_cpu->max; |
336 | j_util = sugov_aggregate_util(j_sg_cpu); | 348 | j_util = sugov_aggregate_util(j_sg_cpu); |
349 | sugov_iowait_boost(j_sg_cpu, &j_util, &j_max); | ||
337 | if (j_util * max > j_max * util) { | 350 | if (j_util * max > j_max * util) { |
338 | util = j_util; | 351 | util = j_util; |
339 | max = j_max; | 352 | max = j_max; |
340 | } | 353 | } |
341 | |||
342 | sugov_iowait_boost(j_sg_cpu, &util, &max); | ||
343 | } | 354 | } |
344 | 355 | ||
345 | return get_next_freq(sg_policy, util, max); | 356 | return get_next_freq(sg_policy, util, max); |
346 | } | 357 | } |
347 | 358 | ||
348 | static void sugov_update_shared(struct update_util_data *hook, u64 time, | 359 | static void |
349 | unsigned int flags) | 360 | sugov_update_shared(struct update_util_data *hook, u64 time, unsigned int flags) |
350 | { | 361 | { |
351 | struct sugov_cpu *sg_cpu = container_of(hook, struct sugov_cpu, update_util); | 362 | struct sugov_cpu *sg_cpu = container_of(hook, struct sugov_cpu, update_util); |
352 | struct sugov_policy *sg_policy = sg_cpu->sg_policy; | 363 | struct sugov_policy *sg_policy = sg_cpu->sg_policy; |
@@ -354,18 +365,13 @@ static void sugov_update_shared(struct update_util_data *hook, u64 time, | |||
354 | 365 | ||
355 | raw_spin_lock(&sg_policy->update_lock); | 366 | raw_spin_lock(&sg_policy->update_lock); |
356 | 367 | ||
357 | sugov_get_util(sg_cpu); | 368 | sugov_set_iowait_boost(sg_cpu, time, flags); |
358 | sg_cpu->flags = flags; | ||
359 | |||
360 | sugov_set_iowait_boost(sg_cpu, time); | ||
361 | sg_cpu->last_update = time; | 369 | sg_cpu->last_update = time; |
362 | 370 | ||
363 | if (sugov_should_update_freq(sg_policy, time)) { | 371 | ignore_dl_rate_limit(sg_cpu, sg_policy); |
364 | if (flags & SCHED_CPUFREQ_RT) | ||
365 | next_f = sg_policy->policy->cpuinfo.max_freq; | ||
366 | else | ||
367 | next_f = sugov_next_freq_shared(sg_cpu, time); | ||
368 | 372 | ||
373 | if (sugov_should_update_freq(sg_policy, time)) { | ||
374 | next_f = sugov_next_freq_shared(sg_cpu, time); | ||
369 | sugov_update_commit(sg_policy, time, next_f); | 375 | sugov_update_commit(sg_policy, time, next_f); |
370 | } | 376 | } |
371 | 377 | ||
@@ -423,8 +429,8 @@ static ssize_t rate_limit_us_show(struct gov_attr_set *attr_set, char *buf) | |||
423 | return sprintf(buf, "%u\n", tunables->rate_limit_us); | 429 | return sprintf(buf, "%u\n", tunables->rate_limit_us); |
424 | } | 430 | } |
425 | 431 | ||
426 | static ssize_t rate_limit_us_store(struct gov_attr_set *attr_set, const char *buf, | 432 | static ssize_t |
427 | size_t count) | 433 | rate_limit_us_store(struct gov_attr_set *attr_set, const char *buf, size_t count) |
428 | { | 434 | { |
429 | struct sugov_tunables *tunables = to_sugov_tunables(attr_set); | 435 | struct sugov_tunables *tunables = to_sugov_tunables(attr_set); |
430 | struct sugov_policy *sg_policy; | 436 | struct sugov_policy *sg_policy; |
@@ -479,11 +485,11 @@ static int sugov_kthread_create(struct sugov_policy *sg_policy) | |||
479 | { | 485 | { |
480 | struct task_struct *thread; | 486 | struct task_struct *thread; |
481 | struct sched_attr attr = { | 487 | struct sched_attr attr = { |
482 | .size = sizeof(struct sched_attr), | 488 | .size = sizeof(struct sched_attr), |
483 | .sched_policy = SCHED_DEADLINE, | 489 | .sched_policy = SCHED_DEADLINE, |
484 | .sched_flags = SCHED_FLAG_SUGOV, | 490 | .sched_flags = SCHED_FLAG_SUGOV, |
485 | .sched_nice = 0, | 491 | .sched_nice = 0, |
486 | .sched_priority = 0, | 492 | .sched_priority = 0, |
487 | /* | 493 | /* |
488 | * Fake (unused) bandwidth; workaround to "fix" | 494 | * Fake (unused) bandwidth; workaround to "fix" |
489 | * priority inheritance. | 495 | * priority inheritance. |
@@ -662,21 +668,20 @@ static int sugov_start(struct cpufreq_policy *policy) | |||
662 | struct sugov_policy *sg_policy = policy->governor_data; | 668 | struct sugov_policy *sg_policy = policy->governor_data; |
663 | unsigned int cpu; | 669 | unsigned int cpu; |
664 | 670 | ||
665 | sg_policy->freq_update_delay_ns = sg_policy->tunables->rate_limit_us * NSEC_PER_USEC; | 671 | sg_policy->freq_update_delay_ns = sg_policy->tunables->rate_limit_us * NSEC_PER_USEC; |
666 | sg_policy->last_freq_update_time = 0; | 672 | sg_policy->last_freq_update_time = 0; |
667 | sg_policy->next_freq = UINT_MAX; | 673 | sg_policy->next_freq = UINT_MAX; |
668 | sg_policy->work_in_progress = false; | 674 | sg_policy->work_in_progress = false; |
669 | sg_policy->need_freq_update = false; | 675 | sg_policy->need_freq_update = false; |
670 | sg_policy->cached_raw_freq = 0; | 676 | sg_policy->cached_raw_freq = 0; |
671 | 677 | ||
672 | for_each_cpu(cpu, policy->cpus) { | 678 | for_each_cpu(cpu, policy->cpus) { |
673 | struct sugov_cpu *sg_cpu = &per_cpu(sugov_cpu, cpu); | 679 | struct sugov_cpu *sg_cpu = &per_cpu(sugov_cpu, cpu); |
674 | 680 | ||
675 | memset(sg_cpu, 0, sizeof(*sg_cpu)); | 681 | memset(sg_cpu, 0, sizeof(*sg_cpu)); |
676 | sg_cpu->cpu = cpu; | 682 | sg_cpu->cpu = cpu; |
677 | sg_cpu->sg_policy = sg_policy; | 683 | sg_cpu->sg_policy = sg_policy; |
678 | sg_cpu->flags = 0; | 684 | sg_cpu->iowait_boost_max = policy->cpuinfo.max_freq; |
679 | sg_cpu->iowait_boost_max = policy->cpuinfo.max_freq; | ||
680 | } | 685 | } |
681 | 686 | ||
682 | for_each_cpu(cpu, policy->cpus) { | 687 | for_each_cpu(cpu, policy->cpus) { |
@@ -720,14 +725,14 @@ static void sugov_limits(struct cpufreq_policy *policy) | |||
720 | } | 725 | } |
721 | 726 | ||
722 | static struct cpufreq_governor schedutil_gov = { | 727 | static struct cpufreq_governor schedutil_gov = { |
723 | .name = "schedutil", | 728 | .name = "schedutil", |
724 | .owner = THIS_MODULE, | 729 | .owner = THIS_MODULE, |
725 | .dynamic_switching = true, | 730 | .dynamic_switching = true, |
726 | .init = sugov_init, | 731 | .init = sugov_init, |
727 | .exit = sugov_exit, | 732 | .exit = sugov_exit, |
728 | .start = sugov_start, | 733 | .start = sugov_start, |
729 | .stop = sugov_stop, | 734 | .stop = sugov_stop, |
730 | .limits = sugov_limits, | 735 | .limits = sugov_limits, |
731 | }; | 736 | }; |
732 | 737 | ||
733 | #ifdef CONFIG_CPU_FREQ_DEFAULT_GOV_SCHEDUTIL | 738 | #ifdef CONFIG_CPU_FREQ_DEFAULT_GOV_SCHEDUTIL |
diff --git a/kernel/sched/cpupri.c b/kernel/sched/cpupri.c index 2511aba36b89..daaadf939ccb 100644 --- a/kernel/sched/cpupri.c +++ b/kernel/sched/cpupri.c | |||
@@ -14,7 +14,7 @@ | |||
14 | * | 14 | * |
15 | * going from the lowest priority to the highest. CPUs in the INVALID state | 15 | * going from the lowest priority to the highest. CPUs in the INVALID state |
16 | * are not eligible for routing. The system maintains this state with | 16 | * are not eligible for routing. The system maintains this state with |
17 | * a 2 dimensional bitmap (the first for priority class, the second for cpus | 17 | * a 2 dimensional bitmap (the first for priority class, the second for CPUs |
18 | * in that class). Therefore a typical application without affinity | 18 | * in that class). Therefore a typical application without affinity |
19 | * restrictions can find a suitable CPU with O(1) complexity (e.g. two bit | 19 | * restrictions can find a suitable CPU with O(1) complexity (e.g. two bit |
20 | * searches). For tasks with affinity restrictions, the algorithm has a | 20 | * searches). For tasks with affinity restrictions, the algorithm has a |
@@ -26,12 +26,7 @@ | |||
26 | * as published by the Free Software Foundation; version 2 | 26 | * as published by the Free Software Foundation; version 2 |
27 | * of the License. | 27 | * of the License. |
28 | */ | 28 | */ |
29 | 29 | #include "sched.h" | |
30 | #include <linux/gfp.h> | ||
31 | #include <linux/sched.h> | ||
32 | #include <linux/sched/rt.h> | ||
33 | #include <linux/slab.h> | ||
34 | #include "cpupri.h" | ||
35 | 30 | ||
36 | /* Convert between a 140 based task->prio, and our 102 based cpupri */ | 31 | /* Convert between a 140 based task->prio, and our 102 based cpupri */ |
37 | static int convert_prio(int prio) | 32 | static int convert_prio(int prio) |
@@ -128,9 +123,9 @@ int cpupri_find(struct cpupri *cp, struct task_struct *p, | |||
128 | } | 123 | } |
129 | 124 | ||
130 | /** | 125 | /** |
131 | * cpupri_set - update the cpu priority setting | 126 | * cpupri_set - update the CPU priority setting |
132 | * @cp: The cpupri context | 127 | * @cp: The cpupri context |
133 | * @cpu: The target cpu | 128 | * @cpu: The target CPU |
134 | * @newpri: The priority (INVALID-RT99) to assign to this CPU | 129 | * @newpri: The priority (INVALID-RT99) to assign to this CPU |
135 | * | 130 | * |
136 | * Note: Assumes cpu_rq(cpu)->lock is locked | 131 | * Note: Assumes cpu_rq(cpu)->lock is locked |
@@ -151,7 +146,7 @@ void cpupri_set(struct cpupri *cp, int cpu, int newpri) | |||
151 | return; | 146 | return; |
152 | 147 | ||
153 | /* | 148 | /* |
154 | * If the cpu was currently mapped to a different value, we | 149 | * If the CPU was currently mapped to a different value, we |
155 | * need to map it to the new value then remove the old value. | 150 | * need to map it to the new value then remove the old value. |
156 | * Note, we must add the new value first, otherwise we risk the | 151 | * Note, we must add the new value first, otherwise we risk the |
157 | * cpu being missed by the priority loop in cpupri_find. | 152 | * cpu being missed by the priority loop in cpupri_find. |
diff --git a/kernel/sched/cpupri.h b/kernel/sched/cpupri.h index bab050019071..7dc20a3232e7 100644 --- a/kernel/sched/cpupri.h +++ b/kernel/sched/cpupri.h | |||
@@ -1,32 +1,25 @@ | |||
1 | /* SPDX-License-Identifier: GPL-2.0 */ | 1 | /* SPDX-License-Identifier: GPL-2.0 */ |
2 | #ifndef _LINUX_CPUPRI_H | ||
3 | #define _LINUX_CPUPRI_H | ||
4 | |||
5 | #include <linux/sched.h> | ||
6 | 2 | ||
7 | #define CPUPRI_NR_PRIORITIES (MAX_RT_PRIO + 2) | 3 | #define CPUPRI_NR_PRIORITIES (MAX_RT_PRIO + 2) |
8 | 4 | ||
9 | #define CPUPRI_INVALID -1 | 5 | #define CPUPRI_INVALID -1 |
10 | #define CPUPRI_IDLE 0 | 6 | #define CPUPRI_IDLE 0 |
11 | #define CPUPRI_NORMAL 1 | 7 | #define CPUPRI_NORMAL 1 |
12 | /* values 2-101 are RT priorities 0-99 */ | 8 | /* values 2-101 are RT priorities 0-99 */ |
13 | 9 | ||
14 | struct cpupri_vec { | 10 | struct cpupri_vec { |
15 | atomic_t count; | 11 | atomic_t count; |
16 | cpumask_var_t mask; | 12 | cpumask_var_t mask; |
17 | }; | 13 | }; |
18 | 14 | ||
19 | struct cpupri { | 15 | struct cpupri { |
20 | struct cpupri_vec pri_to_cpu[CPUPRI_NR_PRIORITIES]; | 16 | struct cpupri_vec pri_to_cpu[CPUPRI_NR_PRIORITIES]; |
21 | int *cpu_to_pri; | 17 | int *cpu_to_pri; |
22 | }; | 18 | }; |
23 | 19 | ||
24 | #ifdef CONFIG_SMP | 20 | #ifdef CONFIG_SMP |
25 | int cpupri_find(struct cpupri *cp, | 21 | int cpupri_find(struct cpupri *cp, struct task_struct *p, struct cpumask *lowest_mask); |
26 | struct task_struct *p, struct cpumask *lowest_mask); | ||
27 | void cpupri_set(struct cpupri *cp, int cpu, int pri); | 22 | void cpupri_set(struct cpupri *cp, int cpu, int pri); |
28 | int cpupri_init(struct cpupri *cp); | 23 | int cpupri_init(struct cpupri *cp); |
29 | void cpupri_cleanup(struct cpupri *cp); | 24 | void cpupri_cleanup(struct cpupri *cp); |
30 | #endif | 25 | #endif |
31 | |||
32 | #endif /* _LINUX_CPUPRI_H */ | ||
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index bac6ac9a4ec7..0796f938c4f0 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c | |||
@@ -1,10 +1,6 @@ | |||
1 | #include <linux/export.h> | 1 | /* |
2 | #include <linux/sched.h> | 2 | * Simple CPU accounting cgroup controller |
3 | #include <linux/tsacct_kern.h> | 3 | */ |
4 | #include <linux/kernel_stat.h> | ||
5 | #include <linux/static_key.h> | ||
6 | #include <linux/context_tracking.h> | ||
7 | #include <linux/sched/cputime.h> | ||
8 | #include "sched.h" | 4 | #include "sched.h" |
9 | 5 | ||
10 | #ifdef CONFIG_IRQ_TIME_ACCOUNTING | 6 | #ifdef CONFIG_IRQ_TIME_ACCOUNTING |
@@ -113,9 +109,9 @@ static inline void task_group_account_field(struct task_struct *p, int index, | |||
113 | } | 109 | } |
114 | 110 | ||
115 | /* | 111 | /* |
116 | * Account user cpu time to a process. | 112 | * Account user CPU time to a process. |
117 | * @p: the process that the cpu time gets accounted to | 113 | * @p: the process that the CPU time gets accounted to |
118 | * @cputime: the cpu time spent in user space since the last update | 114 | * @cputime: the CPU time spent in user space since the last update |
119 | */ | 115 | */ |
120 | void account_user_time(struct task_struct *p, u64 cputime) | 116 | void account_user_time(struct task_struct *p, u64 cputime) |
121 | { | 117 | { |
@@ -135,9 +131,9 @@ void account_user_time(struct task_struct *p, u64 cputime) | |||
135 | } | 131 | } |
136 | 132 | ||
137 | /* | 133 | /* |
138 | * Account guest cpu time to a process. | 134 | * Account guest CPU time to a process. |
139 | * @p: the process that the cpu time gets accounted to | 135 | * @p: the process that the CPU time gets accounted to |
140 | * @cputime: the cpu time spent in virtual machine since the last update | 136 | * @cputime: the CPU time spent in virtual machine since the last update |
141 | */ | 137 | */ |
142 | void account_guest_time(struct task_struct *p, u64 cputime) | 138 | void account_guest_time(struct task_struct *p, u64 cputime) |
143 | { | 139 | { |
@@ -159,9 +155,9 @@ void account_guest_time(struct task_struct *p, u64 cputime) | |||
159 | } | 155 | } |
160 | 156 | ||
161 | /* | 157 | /* |
162 | * Account system cpu time to a process and desired cpustat field | 158 | * Account system CPU time to a process and desired cpustat field |
163 | * @p: the process that the cpu time gets accounted to | 159 | * @p: the process that the CPU time gets accounted to |
164 | * @cputime: the cpu time spent in kernel space since the last update | 160 | * @cputime: the CPU time spent in kernel space since the last update |
165 | * @index: pointer to cpustat field that has to be updated | 161 | * @index: pointer to cpustat field that has to be updated |
166 | */ | 162 | */ |
167 | void account_system_index_time(struct task_struct *p, | 163 | void account_system_index_time(struct task_struct *p, |
@@ -179,10 +175,10 @@ void account_system_index_time(struct task_struct *p, | |||
179 | } | 175 | } |
180 | 176 | ||
181 | /* | 177 | /* |
182 | * Account system cpu time to a process. | 178 | * Account system CPU time to a process. |
183 | * @p: the process that the cpu time gets accounted to | 179 | * @p: the process that the CPU time gets accounted to |
184 | * @hardirq_offset: the offset to subtract from hardirq_count() | 180 | * @hardirq_offset: the offset to subtract from hardirq_count() |
185 | * @cputime: the cpu time spent in kernel space since the last update | 181 | * @cputime: the CPU time spent in kernel space since the last update |
186 | */ | 182 | */ |
187 | void account_system_time(struct task_struct *p, int hardirq_offset, u64 cputime) | 183 | void account_system_time(struct task_struct *p, int hardirq_offset, u64 cputime) |
188 | { | 184 | { |
@@ -205,7 +201,7 @@ void account_system_time(struct task_struct *p, int hardirq_offset, u64 cputime) | |||
205 | 201 | ||
206 | /* | 202 | /* |
207 | * Account for involuntary wait time. | 203 | * Account for involuntary wait time. |
208 | * @cputime: the cpu time spent in involuntary wait | 204 | * @cputime: the CPU time spent in involuntary wait |
209 | */ | 205 | */ |
210 | void account_steal_time(u64 cputime) | 206 | void account_steal_time(u64 cputime) |
211 | { | 207 | { |
@@ -216,7 +212,7 @@ void account_steal_time(u64 cputime) | |||
216 | 212 | ||
217 | /* | 213 | /* |
218 | * Account for idle time. | 214 | * Account for idle time. |
219 | * @cputime: the cpu time spent in idle wait | 215 | * @cputime: the CPU time spent in idle wait |
220 | */ | 216 | */ |
221 | void account_idle_time(u64 cputime) | 217 | void account_idle_time(u64 cputime) |
222 | { | 218 | { |
@@ -338,7 +334,7 @@ void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times) | |||
338 | #ifdef CONFIG_IRQ_TIME_ACCOUNTING | 334 | #ifdef CONFIG_IRQ_TIME_ACCOUNTING |
339 | /* | 335 | /* |
340 | * Account a tick to a process and cpustat | 336 | * Account a tick to a process and cpustat |
341 | * @p: the process that the cpu time gets accounted to | 337 | * @p: the process that the CPU time gets accounted to |
342 | * @user_tick: is the tick from userspace | 338 | * @user_tick: is the tick from userspace |
343 | * @rq: the pointer to rq | 339 | * @rq: the pointer to rq |
344 | * | 340 | * |
@@ -400,17 +396,16 @@ static void irqtime_account_idle_ticks(int ticks) | |||
400 | irqtime_account_process_tick(current, 0, rq, ticks); | 396 | irqtime_account_process_tick(current, 0, rq, ticks); |
401 | } | 397 | } |
402 | #else /* CONFIG_IRQ_TIME_ACCOUNTING */ | 398 | #else /* CONFIG_IRQ_TIME_ACCOUNTING */ |
403 | static inline void irqtime_account_idle_ticks(int ticks) {} | 399 | static inline void irqtime_account_idle_ticks(int ticks) { } |
404 | static inline void irqtime_account_process_tick(struct task_struct *p, int user_tick, | 400 | static inline void irqtime_account_process_tick(struct task_struct *p, int user_tick, |
405 | struct rq *rq, int nr_ticks) {} | 401 | struct rq *rq, int nr_ticks) { } |
406 | #endif /* CONFIG_IRQ_TIME_ACCOUNTING */ | 402 | #endif /* CONFIG_IRQ_TIME_ACCOUNTING */ |
407 | 403 | ||
408 | /* | 404 | /* |
409 | * Use precise platform statistics if available: | 405 | * Use precise platform statistics if available: |
410 | */ | 406 | */ |
411 | #ifdef CONFIG_VIRT_CPU_ACCOUNTING | 407 | #ifdef CONFIG_VIRT_CPU_ACCOUNTING |
412 | 408 | # ifndef __ARCH_HAS_VTIME_TASK_SWITCH | |
413 | #ifndef __ARCH_HAS_VTIME_TASK_SWITCH | ||
414 | void vtime_common_task_switch(struct task_struct *prev) | 409 | void vtime_common_task_switch(struct task_struct *prev) |
415 | { | 410 | { |
416 | if (is_idle_task(prev)) | 411 | if (is_idle_task(prev)) |
@@ -421,8 +416,7 @@ void vtime_common_task_switch(struct task_struct *prev) | |||
421 | vtime_flush(prev); | 416 | vtime_flush(prev); |
422 | arch_vtime_task_switch(prev); | 417 | arch_vtime_task_switch(prev); |
423 | } | 418 | } |
424 | #endif | 419 | # endif |
425 | |||
426 | #endif /* CONFIG_VIRT_CPU_ACCOUNTING */ | 420 | #endif /* CONFIG_VIRT_CPU_ACCOUNTING */ |
427 | 421 | ||
428 | 422 | ||
@@ -469,10 +463,12 @@ void thread_group_cputime_adjusted(struct task_struct *p, u64 *ut, u64 *st) | |||
469 | *ut = cputime.utime; | 463 | *ut = cputime.utime; |
470 | *st = cputime.stime; | 464 | *st = cputime.stime; |
471 | } | 465 | } |
472 | #else /* !CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */ | 466 | |
467 | #else /* !CONFIG_VIRT_CPU_ACCOUNTING_NATIVE: */ | ||
468 | |||
473 | /* | 469 | /* |
474 | * Account a single tick of cpu time. | 470 | * Account a single tick of CPU time. |
475 | * @p: the process that the cpu time gets accounted to | 471 | * @p: the process that the CPU time gets accounted to |
476 | * @user_tick: indicates if the tick is a user or a system tick | 472 | * @user_tick: indicates if the tick is a user or a system tick |
477 | */ | 473 | */ |
478 | void account_process_tick(struct task_struct *p, int user_tick) | 474 | void account_process_tick(struct task_struct *p, int user_tick) |
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 9df09782025c..d1c7bf7c7e5b 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c | |||
@@ -17,9 +17,6 @@ | |||
17 | */ | 17 | */ |
18 | #include "sched.h" | 18 | #include "sched.h" |
19 | 19 | ||
20 | #include <linux/slab.h> | ||
21 | #include <uapi/linux/sched/types.h> | ||
22 | |||
23 | struct dl_bandwidth def_dl_bandwidth; | 20 | struct dl_bandwidth def_dl_bandwidth; |
24 | 21 | ||
25 | static inline struct task_struct *dl_task_of(struct sched_dl_entity *dl_se) | 22 | static inline struct task_struct *dl_task_of(struct sched_dl_entity *dl_se) |
@@ -87,7 +84,7 @@ void __add_running_bw(u64 dl_bw, struct dl_rq *dl_rq) | |||
87 | SCHED_WARN_ON(dl_rq->running_bw < old); /* overflow */ | 84 | SCHED_WARN_ON(dl_rq->running_bw < old); /* overflow */ |
88 | SCHED_WARN_ON(dl_rq->running_bw > dl_rq->this_bw); | 85 | SCHED_WARN_ON(dl_rq->running_bw > dl_rq->this_bw); |
89 | /* kick cpufreq (see the comment in kernel/sched/sched.h). */ | 86 | /* kick cpufreq (see the comment in kernel/sched/sched.h). */ |
90 | cpufreq_update_util(rq_of_dl_rq(dl_rq), SCHED_CPUFREQ_DL); | 87 | cpufreq_update_util(rq_of_dl_rq(dl_rq), 0); |
91 | } | 88 | } |
92 | 89 | ||
93 | static inline | 90 | static inline |
@@ -101,7 +98,7 @@ void __sub_running_bw(u64 dl_bw, struct dl_rq *dl_rq) | |||
101 | if (dl_rq->running_bw > old) | 98 | if (dl_rq->running_bw > old) |
102 | dl_rq->running_bw = 0; | 99 | dl_rq->running_bw = 0; |
103 | /* kick cpufreq (see the comment in kernel/sched/sched.h). */ | 100 | /* kick cpufreq (see the comment in kernel/sched/sched.h). */ |
104 | cpufreq_update_util(rq_of_dl_rq(dl_rq), SCHED_CPUFREQ_DL); | 101 | cpufreq_update_util(rq_of_dl_rq(dl_rq), 0); |
105 | } | 102 | } |
106 | 103 | ||
107 | static inline | 104 | static inline |
@@ -514,7 +511,7 @@ static DEFINE_PER_CPU(struct callback_head, dl_pull_head); | |||
514 | static void push_dl_tasks(struct rq *); | 511 | static void push_dl_tasks(struct rq *); |
515 | static void pull_dl_task(struct rq *); | 512 | static void pull_dl_task(struct rq *); |
516 | 513 | ||
517 | static inline void queue_push_tasks(struct rq *rq) | 514 | static inline void deadline_queue_push_tasks(struct rq *rq) |
518 | { | 515 | { |
519 | if (!has_pushable_dl_tasks(rq)) | 516 | if (!has_pushable_dl_tasks(rq)) |
520 | return; | 517 | return; |
@@ -522,7 +519,7 @@ static inline void queue_push_tasks(struct rq *rq) | |||
522 | queue_balance_callback(rq, &per_cpu(dl_push_head, rq->cpu), push_dl_tasks); | 519 | queue_balance_callback(rq, &per_cpu(dl_push_head, rq->cpu), push_dl_tasks); |
523 | } | 520 | } |
524 | 521 | ||
525 | static inline void queue_pull_task(struct rq *rq) | 522 | static inline void deadline_queue_pull_task(struct rq *rq) |
526 | { | 523 | { |
527 | queue_balance_callback(rq, &per_cpu(dl_pull_head, rq->cpu), pull_dl_task); | 524 | queue_balance_callback(rq, &per_cpu(dl_pull_head, rq->cpu), pull_dl_task); |
528 | } | 525 | } |
@@ -539,12 +536,12 @@ static struct rq *dl_task_offline_migration(struct rq *rq, struct task_struct *p | |||
539 | 536 | ||
540 | /* | 537 | /* |
541 | * If we cannot preempt any rq, fall back to pick any | 538 | * If we cannot preempt any rq, fall back to pick any |
542 | * online cpu. | 539 | * online CPU: |
543 | */ | 540 | */ |
544 | cpu = cpumask_any_and(cpu_active_mask, &p->cpus_allowed); | 541 | cpu = cpumask_any_and(cpu_active_mask, &p->cpus_allowed); |
545 | if (cpu >= nr_cpu_ids) { | 542 | if (cpu >= nr_cpu_ids) { |
546 | /* | 543 | /* |
547 | * Fail to find any suitable cpu. | 544 | * Failed to find any suitable CPU. |
548 | * The task will never come back! | 545 | * The task will never come back! |
549 | */ | 546 | */ |
550 | BUG_ON(dl_bandwidth_enabled()); | 547 | BUG_ON(dl_bandwidth_enabled()); |
@@ -597,19 +594,18 @@ static inline void pull_dl_task(struct rq *rq) | |||
597 | { | 594 | { |
598 | } | 595 | } |
599 | 596 | ||
600 | static inline void queue_push_tasks(struct rq *rq) | 597 | static inline void deadline_queue_push_tasks(struct rq *rq) |
601 | { | 598 | { |
602 | } | 599 | } |
603 | 600 | ||
604 | static inline void queue_pull_task(struct rq *rq) | 601 | static inline void deadline_queue_pull_task(struct rq *rq) |
605 | { | 602 | { |
606 | } | 603 | } |
607 | #endif /* CONFIG_SMP */ | 604 | #endif /* CONFIG_SMP */ |
608 | 605 | ||
609 | static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags); | 606 | static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags); |
610 | static void __dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags); | 607 | static void __dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags); |
611 | static void check_preempt_curr_dl(struct rq *rq, struct task_struct *p, | 608 | static void check_preempt_curr_dl(struct rq *rq, struct task_struct *p, int flags); |
612 | int flags); | ||
613 | 609 | ||
614 | /* | 610 | /* |
615 | * We are being explicitly informed that a new instance is starting, | 611 | * We are being explicitly informed that a new instance is starting, |
@@ -1763,7 +1759,7 @@ pick_next_task_dl(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) | |||
1763 | if (hrtick_enabled(rq)) | 1759 | if (hrtick_enabled(rq)) |
1764 | start_hrtick_dl(rq, p); | 1760 | start_hrtick_dl(rq, p); |
1765 | 1761 | ||
1766 | queue_push_tasks(rq); | 1762 | deadline_queue_push_tasks(rq); |
1767 | 1763 | ||
1768 | return p; | 1764 | return p; |
1769 | } | 1765 | } |
@@ -1776,6 +1772,14 @@ static void put_prev_task_dl(struct rq *rq, struct task_struct *p) | |||
1776 | enqueue_pushable_dl_task(rq, p); | 1772 | enqueue_pushable_dl_task(rq, p); |
1777 | } | 1773 | } |
1778 | 1774 | ||
1775 | /* | ||
1776 | * scheduler tick hitting a task of our scheduling class. | ||
1777 | * | ||
1778 | * NOTE: This function can be called remotely by the tick offload that | ||
1779 | * goes along full dynticks. Therefore no local assumption can be made | ||
1780 | * and everything must be accessed through the @rq and @curr passed in | ||
1781 | * parameters. | ||
1782 | */ | ||
1779 | static void task_tick_dl(struct rq *rq, struct task_struct *p, int queued) | 1783 | static void task_tick_dl(struct rq *rq, struct task_struct *p, int queued) |
1780 | { | 1784 | { |
1781 | update_curr_dl(rq); | 1785 | update_curr_dl(rq); |
@@ -1865,7 +1869,7 @@ static int find_later_rq(struct task_struct *task) | |||
1865 | 1869 | ||
1866 | /* | 1870 | /* |
1867 | * We have to consider system topology and task affinity | 1871 | * We have to consider system topology and task affinity |
1868 | * first, then we can look for a suitable cpu. | 1872 | * first, then we can look for a suitable CPU. |
1869 | */ | 1873 | */ |
1870 | if (!cpudl_find(&task_rq(task)->rd->cpudl, task, later_mask)) | 1874 | if (!cpudl_find(&task_rq(task)->rd->cpudl, task, later_mask)) |
1871 | return -1; | 1875 | return -1; |
@@ -1879,7 +1883,7 @@ static int find_later_rq(struct task_struct *task) | |||
1879 | * Now we check how well this matches with task's | 1883 | * Now we check how well this matches with task's |
1880 | * affinity and system topology. | 1884 | * affinity and system topology. |
1881 | * | 1885 | * |
1882 | * The last cpu where the task run is our first | 1886 | * The last CPU where the task run is our first |
1883 | * guess, since it is most likely cache-hot there. | 1887 | * guess, since it is most likely cache-hot there. |
1884 | */ | 1888 | */ |
1885 | if (cpumask_test_cpu(cpu, later_mask)) | 1889 | if (cpumask_test_cpu(cpu, later_mask)) |
@@ -1909,9 +1913,9 @@ static int find_later_rq(struct task_struct *task) | |||
1909 | best_cpu = cpumask_first_and(later_mask, | 1913 | best_cpu = cpumask_first_and(later_mask, |
1910 | sched_domain_span(sd)); | 1914 | sched_domain_span(sd)); |
1911 | /* | 1915 | /* |
1912 | * Last chance: if a cpu being in both later_mask | 1916 | * Last chance: if a CPU being in both later_mask |
1913 | * and current sd span is valid, that becomes our | 1917 | * and current sd span is valid, that becomes our |
1914 | * choice. Of course, the latest possible cpu is | 1918 | * choice. Of course, the latest possible CPU is |
1915 | * already under consideration through later_mask. | 1919 | * already under consideration through later_mask. |
1916 | */ | 1920 | */ |
1917 | if (best_cpu < nr_cpu_ids) { | 1921 | if (best_cpu < nr_cpu_ids) { |
@@ -2067,7 +2071,7 @@ retry: | |||
2067 | if (task == next_task) { | 2071 | if (task == next_task) { |
2068 | /* | 2072 | /* |
2069 | * The task is still there. We don't try | 2073 | * The task is still there. We don't try |
2070 | * again, some other cpu will pull it when ready. | 2074 | * again, some other CPU will pull it when ready. |
2071 | */ | 2075 | */ |
2072 | goto out; | 2076 | goto out; |
2073 | } | 2077 | } |
@@ -2300,12 +2304,12 @@ static void switched_from_dl(struct rq *rq, struct task_struct *p) | |||
2300 | /* | 2304 | /* |
2301 | * Since this might be the only -deadline task on the rq, | 2305 | * Since this might be the only -deadline task on the rq, |
2302 | * this is the right place to try to pull some other one | 2306 | * this is the right place to try to pull some other one |
2303 | * from an overloaded cpu, if any. | 2307 | * from an overloaded CPU, if any. |
2304 | */ | 2308 | */ |
2305 | if (!task_on_rq_queued(p) || rq->dl.dl_nr_running) | 2309 | if (!task_on_rq_queued(p) || rq->dl.dl_nr_running) |
2306 | return; | 2310 | return; |
2307 | 2311 | ||
2308 | queue_pull_task(rq); | 2312 | deadline_queue_pull_task(rq); |
2309 | } | 2313 | } |
2310 | 2314 | ||
2311 | /* | 2315 | /* |
@@ -2327,7 +2331,7 @@ static void switched_to_dl(struct rq *rq, struct task_struct *p) | |||
2327 | if (rq->curr != p) { | 2331 | if (rq->curr != p) { |
2328 | #ifdef CONFIG_SMP | 2332 | #ifdef CONFIG_SMP |
2329 | if (p->nr_cpus_allowed > 1 && rq->dl.overloaded) | 2333 | if (p->nr_cpus_allowed > 1 && rq->dl.overloaded) |
2330 | queue_push_tasks(rq); | 2334 | deadline_queue_push_tasks(rq); |
2331 | #endif | 2335 | #endif |
2332 | if (dl_task(rq->curr)) | 2336 | if (dl_task(rq->curr)) |
2333 | check_preempt_curr_dl(rq, p, 0); | 2337 | check_preempt_curr_dl(rq, p, 0); |
@@ -2352,7 +2356,7 @@ static void prio_changed_dl(struct rq *rq, struct task_struct *p, | |||
2352 | * or lowering its prio, so... | 2356 | * or lowering its prio, so... |
2353 | */ | 2357 | */ |
2354 | if (!rq->dl.overloaded) | 2358 | if (!rq->dl.overloaded) |
2355 | queue_pull_task(rq); | 2359 | deadline_queue_pull_task(rq); |
2356 | 2360 | ||
2357 | /* | 2361 | /* |
2358 | * If we now have a earlier deadline task than p, | 2362 | * If we now have a earlier deadline task than p, |
@@ -2626,17 +2630,17 @@ void __dl_clear_params(struct task_struct *p) | |||
2626 | { | 2630 | { |
2627 | struct sched_dl_entity *dl_se = &p->dl; | 2631 | struct sched_dl_entity *dl_se = &p->dl; |
2628 | 2632 | ||
2629 | dl_se->dl_runtime = 0; | 2633 | dl_se->dl_runtime = 0; |
2630 | dl_se->dl_deadline = 0; | 2634 | dl_se->dl_deadline = 0; |
2631 | dl_se->dl_period = 0; | 2635 | dl_se->dl_period = 0; |
2632 | dl_se->flags = 0; | 2636 | dl_se->flags = 0; |
2633 | dl_se->dl_bw = 0; | 2637 | dl_se->dl_bw = 0; |
2634 | dl_se->dl_density = 0; | 2638 | dl_se->dl_density = 0; |
2635 | 2639 | ||
2636 | dl_se->dl_throttled = 0; | 2640 | dl_se->dl_throttled = 0; |
2637 | dl_se->dl_yielded = 0; | 2641 | dl_se->dl_yielded = 0; |
2638 | dl_se->dl_non_contending = 0; | 2642 | dl_se->dl_non_contending = 0; |
2639 | dl_se->dl_overrun = 0; | 2643 | dl_se->dl_overrun = 0; |
2640 | } | 2644 | } |
2641 | 2645 | ||
2642 | bool dl_param_changed(struct task_struct *p, const struct sched_attr *attr) | 2646 | bool dl_param_changed(struct task_struct *p, const struct sched_attr *attr) |
@@ -2655,21 +2659,22 @@ bool dl_param_changed(struct task_struct *p, const struct sched_attr *attr) | |||
2655 | #ifdef CONFIG_SMP | 2659 | #ifdef CONFIG_SMP |
2656 | int dl_task_can_attach(struct task_struct *p, const struct cpumask *cs_cpus_allowed) | 2660 | int dl_task_can_attach(struct task_struct *p, const struct cpumask *cs_cpus_allowed) |
2657 | { | 2661 | { |
2658 | unsigned int dest_cpu = cpumask_any_and(cpu_active_mask, | 2662 | unsigned int dest_cpu; |
2659 | cs_cpus_allowed); | ||
2660 | struct dl_bw *dl_b; | 2663 | struct dl_bw *dl_b; |
2661 | bool overflow; | 2664 | bool overflow; |
2662 | int cpus, ret; | 2665 | int cpus, ret; |
2663 | unsigned long flags; | 2666 | unsigned long flags; |
2664 | 2667 | ||
2668 | dest_cpu = cpumask_any_and(cpu_active_mask, cs_cpus_allowed); | ||
2669 | |||
2665 | rcu_read_lock_sched(); | 2670 | rcu_read_lock_sched(); |
2666 | dl_b = dl_bw_of(dest_cpu); | 2671 | dl_b = dl_bw_of(dest_cpu); |
2667 | raw_spin_lock_irqsave(&dl_b->lock, flags); | 2672 | raw_spin_lock_irqsave(&dl_b->lock, flags); |
2668 | cpus = dl_bw_cpus(dest_cpu); | 2673 | cpus = dl_bw_cpus(dest_cpu); |
2669 | overflow = __dl_overflow(dl_b, cpus, 0, p->dl.dl_bw); | 2674 | overflow = __dl_overflow(dl_b, cpus, 0, p->dl.dl_bw); |
2670 | if (overflow) | 2675 | if (overflow) { |
2671 | ret = -EBUSY; | 2676 | ret = -EBUSY; |
2672 | else { | 2677 | } else { |
2673 | /* | 2678 | /* |
2674 | * We reserve space for this task in the destination | 2679 | * We reserve space for this task in the destination |
2675 | * root_domain, as we can't fail after this point. | 2680 | * root_domain, as we can't fail after this point. |
@@ -2681,6 +2686,7 @@ int dl_task_can_attach(struct task_struct *p, const struct cpumask *cs_cpus_allo | |||
2681 | } | 2686 | } |
2682 | raw_spin_unlock_irqrestore(&dl_b->lock, flags); | 2687 | raw_spin_unlock_irqrestore(&dl_b->lock, flags); |
2683 | rcu_read_unlock_sched(); | 2688 | rcu_read_unlock_sched(); |
2689 | |||
2684 | return ret; | 2690 | return ret; |
2685 | } | 2691 | } |
2686 | 2692 | ||
@@ -2701,6 +2707,7 @@ int dl_cpuset_cpumask_can_shrink(const struct cpumask *cur, | |||
2701 | ret = 0; | 2707 | ret = 0; |
2702 | raw_spin_unlock_irqrestore(&cur_dl_b->lock, flags); | 2708 | raw_spin_unlock_irqrestore(&cur_dl_b->lock, flags); |
2703 | rcu_read_unlock_sched(); | 2709 | rcu_read_unlock_sched(); |
2710 | |||
2704 | return ret; | 2711 | return ret; |
2705 | } | 2712 | } |
2706 | 2713 | ||
@@ -2718,6 +2725,7 @@ bool dl_cpu_busy(unsigned int cpu) | |||
2718 | overflow = __dl_overflow(dl_b, cpus, 0, 0); | 2725 | overflow = __dl_overflow(dl_b, cpus, 0, 0); |
2719 | raw_spin_unlock_irqrestore(&dl_b->lock, flags); | 2726 | raw_spin_unlock_irqrestore(&dl_b->lock, flags); |
2720 | rcu_read_unlock_sched(); | 2727 | rcu_read_unlock_sched(); |
2728 | |||
2721 | return overflow; | 2729 | return overflow; |
2722 | } | 2730 | } |
2723 | #endif | 2731 | #endif |
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 72c401b3b15c..15b10e210a6b 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c | |||
@@ -1,7 +1,7 @@ | |||
1 | /* | 1 | /* |
2 | * kernel/sched/debug.c | 2 | * kernel/sched/debug.c |
3 | * | 3 | * |
4 | * Print the CFS rbtree | 4 | * Print the CFS rbtree and other debugging details |
5 | * | 5 | * |
6 | * Copyright(C) 2007, Red Hat, Inc., Ingo Molnar | 6 | * Copyright(C) 2007, Red Hat, Inc., Ingo Molnar |
7 | * | 7 | * |
@@ -9,16 +9,6 @@ | |||
9 | * it under the terms of the GNU General Public License version 2 as | 9 | * it under the terms of the GNU General Public License version 2 as |
10 | * published by the Free Software Foundation. | 10 | * published by the Free Software Foundation. |
11 | */ | 11 | */ |
12 | |||
13 | #include <linux/proc_fs.h> | ||
14 | #include <linux/sched/mm.h> | ||
15 | #include <linux/sched/task.h> | ||
16 | #include <linux/seq_file.h> | ||
17 | #include <linux/kallsyms.h> | ||
18 | #include <linux/utsname.h> | ||
19 | #include <linux/mempolicy.h> | ||
20 | #include <linux/debugfs.h> | ||
21 | |||
22 | #include "sched.h" | 12 | #include "sched.h" |
23 | 13 | ||
24 | static DEFINE_SPINLOCK(sched_debug_lock); | 14 | static DEFINE_SPINLOCK(sched_debug_lock); |
@@ -274,34 +264,19 @@ sd_alloc_ctl_domain_table(struct sched_domain *sd) | |||
274 | if (table == NULL) | 264 | if (table == NULL) |
275 | return NULL; | 265 | return NULL; |
276 | 266 | ||
277 | set_table_entry(&table[0], "min_interval", &sd->min_interval, | 267 | set_table_entry(&table[0] , "min_interval", &sd->min_interval, sizeof(long), 0644, proc_doulongvec_minmax, false); |
278 | sizeof(long), 0644, proc_doulongvec_minmax, false); | 268 | set_table_entry(&table[1] , "max_interval", &sd->max_interval, sizeof(long), 0644, proc_doulongvec_minmax, false); |
279 | set_table_entry(&table[1], "max_interval", &sd->max_interval, | 269 | set_table_entry(&table[2] , "busy_idx", &sd->busy_idx, sizeof(int) , 0644, proc_dointvec_minmax, true ); |
280 | sizeof(long), 0644, proc_doulongvec_minmax, false); | 270 | set_table_entry(&table[3] , "idle_idx", &sd->idle_idx, sizeof(int) , 0644, proc_dointvec_minmax, true ); |
281 | set_table_entry(&table[2], "busy_idx", &sd->busy_idx, | 271 | set_table_entry(&table[4] , "newidle_idx", &sd->newidle_idx, sizeof(int) , 0644, proc_dointvec_minmax, true ); |
282 | sizeof(int), 0644, proc_dointvec_minmax, true); | 272 | set_table_entry(&table[5] , "wake_idx", &sd->wake_idx, sizeof(int) , 0644, proc_dointvec_minmax, true ); |
283 | set_table_entry(&table[3], "idle_idx", &sd->idle_idx, | 273 | set_table_entry(&table[6] , "forkexec_idx", &sd->forkexec_idx, sizeof(int) , 0644, proc_dointvec_minmax, true ); |
284 | sizeof(int), 0644, proc_dointvec_minmax, true); | 274 | set_table_entry(&table[7] , "busy_factor", &sd->busy_factor, sizeof(int) , 0644, proc_dointvec_minmax, false); |
285 | set_table_entry(&table[4], "newidle_idx", &sd->newidle_idx, | 275 | set_table_entry(&table[8] , "imbalance_pct", &sd->imbalance_pct, sizeof(int) , 0644, proc_dointvec_minmax, false); |
286 | sizeof(int), 0644, proc_dointvec_minmax, true); | 276 | set_table_entry(&table[9] , "cache_nice_tries", &sd->cache_nice_tries, sizeof(int) , 0644, proc_dointvec_minmax, false); |
287 | set_table_entry(&table[5], "wake_idx", &sd->wake_idx, | 277 | set_table_entry(&table[10], "flags", &sd->flags, sizeof(int) , 0644, proc_dointvec_minmax, false); |
288 | sizeof(int), 0644, proc_dointvec_minmax, true); | 278 | set_table_entry(&table[11], "max_newidle_lb_cost", &sd->max_newidle_lb_cost, sizeof(long), 0644, proc_doulongvec_minmax, false); |
289 | set_table_entry(&table[6], "forkexec_idx", &sd->forkexec_idx, | 279 | set_table_entry(&table[12], "name", sd->name, CORENAME_MAX_SIZE, 0444, proc_dostring, false); |
290 | sizeof(int), 0644, proc_dointvec_minmax, true); | ||
291 | set_table_entry(&table[7], "busy_factor", &sd->busy_factor, | ||
292 | sizeof(int), 0644, proc_dointvec_minmax, false); | ||
293 | set_table_entry(&table[8], "imbalance_pct", &sd->imbalance_pct, | ||
294 | sizeof(int), 0644, proc_dointvec_minmax, false); | ||
295 | set_table_entry(&table[9], "cache_nice_tries", | ||
296 | &sd->cache_nice_tries, | ||
297 | sizeof(int), 0644, proc_dointvec_minmax, false); | ||
298 | set_table_entry(&table[10], "flags", &sd->flags, | ||
299 | sizeof(int), 0644, proc_dointvec_minmax, false); | ||
300 | set_table_entry(&table[11], "max_newidle_lb_cost", | ||
301 | &sd->max_newidle_lb_cost, | ||
302 | sizeof(long), 0644, proc_doulongvec_minmax, false); | ||
303 | set_table_entry(&table[12], "name", sd->name, | ||
304 | CORENAME_MAX_SIZE, 0444, proc_dostring, false); | ||
305 | /* &table[13] is terminator */ | 280 | /* &table[13] is terminator */ |
306 | 281 | ||
307 | return table; | 282 | return table; |
@@ -332,8 +307,8 @@ static struct ctl_table *sd_alloc_ctl_cpu_table(int cpu) | |||
332 | return table; | 307 | return table; |
333 | } | 308 | } |
334 | 309 | ||
335 | static cpumask_var_t sd_sysctl_cpus; | 310 | static cpumask_var_t sd_sysctl_cpus; |
336 | static struct ctl_table_header *sd_sysctl_header; | 311 | static struct ctl_table_header *sd_sysctl_header; |
337 | 312 | ||
338 | void register_sched_domain_sysctl(void) | 313 | void register_sched_domain_sysctl(void) |
339 | { | 314 | { |
@@ -413,14 +388,10 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group | |||
413 | { | 388 | { |
414 | struct sched_entity *se = tg->se[cpu]; | 389 | struct sched_entity *se = tg->se[cpu]; |
415 | 390 | ||
416 | #define P(F) \ | 391 | #define P(F) SEQ_printf(m, " .%-30s: %lld\n", #F, (long long)F) |
417 | SEQ_printf(m, " .%-30s: %lld\n", #F, (long long)F) | 392 | #define P_SCHEDSTAT(F) SEQ_printf(m, " .%-30s: %lld\n", #F, (long long)schedstat_val(F)) |
418 | #define P_SCHEDSTAT(F) \ | 393 | #define PN(F) SEQ_printf(m, " .%-30s: %lld.%06ld\n", #F, SPLIT_NS((long long)F)) |
419 | SEQ_printf(m, " .%-30s: %lld\n", #F, (long long)schedstat_val(F)) | 394 | #define PN_SCHEDSTAT(F) SEQ_printf(m, " .%-30s: %lld.%06ld\n", #F, SPLIT_NS((long long)schedstat_val(F))) |
420 | #define PN(F) \ | ||
421 | SEQ_printf(m, " .%-30s: %lld.%06ld\n", #F, SPLIT_NS((long long)F)) | ||
422 | #define PN_SCHEDSTAT(F) \ | ||
423 | SEQ_printf(m, " .%-30s: %lld.%06ld\n", #F, SPLIT_NS((long long)schedstat_val(F))) | ||
424 | 395 | ||
425 | if (!se) | 396 | if (!se) |
426 | return; | 397 | return; |
@@ -428,6 +399,7 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group | |||
428 | PN(se->exec_start); | 399 | PN(se->exec_start); |
429 | PN(se->vruntime); | 400 | PN(se->vruntime); |
430 | PN(se->sum_exec_runtime); | 401 | PN(se->sum_exec_runtime); |
402 | |||
431 | if (schedstat_enabled()) { | 403 | if (schedstat_enabled()) { |
432 | PN_SCHEDSTAT(se->statistics.wait_start); | 404 | PN_SCHEDSTAT(se->statistics.wait_start); |
433 | PN_SCHEDSTAT(se->statistics.sleep_start); | 405 | PN_SCHEDSTAT(se->statistics.sleep_start); |
@@ -440,6 +412,7 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group | |||
440 | PN_SCHEDSTAT(se->statistics.wait_sum); | 412 | PN_SCHEDSTAT(se->statistics.wait_sum); |
441 | P_SCHEDSTAT(se->statistics.wait_count); | 413 | P_SCHEDSTAT(se->statistics.wait_count); |
442 | } | 414 | } |
415 | |||
443 | P(se->load.weight); | 416 | P(se->load.weight); |
444 | P(se->runnable_weight); | 417 | P(se->runnable_weight); |
445 | #ifdef CONFIG_SMP | 418 | #ifdef CONFIG_SMP |
@@ -464,6 +437,7 @@ static char *task_group_path(struct task_group *tg) | |||
464 | return group_path; | 437 | return group_path; |
465 | 438 | ||
466 | cgroup_path(tg->css.cgroup, group_path, PATH_MAX); | 439 | cgroup_path(tg->css.cgroup, group_path, PATH_MAX); |
440 | |||
467 | return group_path; | 441 | return group_path; |
468 | } | 442 | } |
469 | #endif | 443 | #endif |
@@ -569,6 +543,8 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) | |||
569 | cfs_rq->avg.runnable_load_avg); | 543 | cfs_rq->avg.runnable_load_avg); |
570 | SEQ_printf(m, " .%-30s: %lu\n", "util_avg", | 544 | SEQ_printf(m, " .%-30s: %lu\n", "util_avg", |
571 | cfs_rq->avg.util_avg); | 545 | cfs_rq->avg.util_avg); |
546 | SEQ_printf(m, " .%-30s: %u\n", "util_est_enqueued", | ||
547 | cfs_rq->avg.util_est.enqueued); | ||
572 | SEQ_printf(m, " .%-30s: %ld\n", "removed.load_avg", | 548 | SEQ_printf(m, " .%-30s: %ld\n", "removed.load_avg", |
573 | cfs_rq->removed.load_avg); | 549 | cfs_rq->removed.load_avg); |
574 | SEQ_printf(m, " .%-30s: %ld\n", "removed.util_avg", | 550 | SEQ_printf(m, " .%-30s: %ld\n", "removed.util_avg", |
@@ -804,9 +780,9 @@ void sysrq_sched_debug_show(void) | |||
804 | /* | 780 | /* |
805 | * This itererator needs some explanation. | 781 | * This itererator needs some explanation. |
806 | * It returns 1 for the header position. | 782 | * It returns 1 for the header position. |
807 | * This means 2 is cpu 0. | 783 | * This means 2 is CPU 0. |
808 | * In a hotplugged system some cpus, including cpu 0, may be missing so we have | 784 | * In a hotplugged system some CPUs, including CPU 0, may be missing so we have |
809 | * to use cpumask_* to iterate over the cpus. | 785 | * to use cpumask_* to iterate over the CPUs. |
810 | */ | 786 | */ |
811 | static void *sched_debug_start(struct seq_file *file, loff_t *offset) | 787 | static void *sched_debug_start(struct seq_file *file, loff_t *offset) |
812 | { | 788 | { |
@@ -826,6 +802,7 @@ static void *sched_debug_start(struct seq_file *file, loff_t *offset) | |||
826 | 802 | ||
827 | if (n < nr_cpu_ids) | 803 | if (n < nr_cpu_ids) |
828 | return (void *)(unsigned long)(n + 2); | 804 | return (void *)(unsigned long)(n + 2); |
805 | |||
829 | return NULL; | 806 | return NULL; |
830 | } | 807 | } |
831 | 808 | ||
@@ -840,10 +817,10 @@ static void sched_debug_stop(struct seq_file *file, void *data) | |||
840 | } | 817 | } |
841 | 818 | ||
842 | static const struct seq_operations sched_debug_sops = { | 819 | static const struct seq_operations sched_debug_sops = { |
843 | .start = sched_debug_start, | 820 | .start = sched_debug_start, |
844 | .next = sched_debug_next, | 821 | .next = sched_debug_next, |
845 | .stop = sched_debug_stop, | 822 | .stop = sched_debug_stop, |
846 | .show = sched_debug_show, | 823 | .show = sched_debug_show, |
847 | }; | 824 | }; |
848 | 825 | ||
849 | static int sched_debug_release(struct inode *inode, struct file *file) | 826 | static int sched_debug_release(struct inode *inode, struct file *file) |
@@ -881,14 +858,10 @@ static int __init init_sched_debug_procfs(void) | |||
881 | 858 | ||
882 | __initcall(init_sched_debug_procfs); | 859 | __initcall(init_sched_debug_procfs); |
883 | 860 | ||
884 | #define __P(F) \ | 861 | #define __P(F) SEQ_printf(m, "%-45s:%21Ld\n", #F, (long long)F) |
885 | SEQ_printf(m, "%-45s:%21Ld\n", #F, (long long)F) | 862 | #define P(F) SEQ_printf(m, "%-45s:%21Ld\n", #F, (long long)p->F) |
886 | #define P(F) \ | 863 | #define __PN(F) SEQ_printf(m, "%-45s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)F)) |
887 | SEQ_printf(m, "%-45s:%21Ld\n", #F, (long long)p->F) | 864 | #define PN(F) SEQ_printf(m, "%-45s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)p->F)) |
888 | #define __PN(F) \ | ||
889 | SEQ_printf(m, "%-45s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)F)) | ||
890 | #define PN(F) \ | ||
891 | SEQ_printf(m, "%-45s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)p->F)) | ||
892 | 865 | ||
893 | 866 | ||
894 | #ifdef CONFIG_NUMA_BALANCING | 867 | #ifdef CONFIG_NUMA_BALANCING |
@@ -1023,6 +996,8 @@ void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns, | |||
1023 | P(se.avg.runnable_load_avg); | 996 | P(se.avg.runnable_load_avg); |
1024 | P(se.avg.util_avg); | 997 | P(se.avg.util_avg); |
1025 | P(se.avg.last_update_time); | 998 | P(se.avg.last_update_time); |
999 | P(se.avg.util_est.ewma); | ||
1000 | P(se.avg.util_est.enqueued); | ||
1026 | #endif | 1001 | #endif |
1027 | P(policy); | 1002 | P(policy); |
1028 | P(prio); | 1003 | P(prio); |
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 5eb3ffc9be84..0951d1c58d2f 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c | |||
@@ -20,25 +20,10 @@ | |||
20 | * Adaptive scheduling granularity, math enhancements by Peter Zijlstra | 20 | * Adaptive scheduling granularity, math enhancements by Peter Zijlstra |
21 | * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra | 21 | * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra |
22 | */ | 22 | */ |
23 | 23 | #include "sched.h" | |
24 | #include <linux/sched/mm.h> | ||
25 | #include <linux/sched/topology.h> | ||
26 | |||
27 | #include <linux/latencytop.h> | ||
28 | #include <linux/cpumask.h> | ||
29 | #include <linux/cpuidle.h> | ||
30 | #include <linux/slab.h> | ||
31 | #include <linux/profile.h> | ||
32 | #include <linux/interrupt.h> | ||
33 | #include <linux/mempolicy.h> | ||
34 | #include <linux/migrate.h> | ||
35 | #include <linux/task_work.h> | ||
36 | #include <linux/sched/isolation.h> | ||
37 | 24 | ||
38 | #include <trace/events/sched.h> | 25 | #include <trace/events/sched.h> |
39 | 26 | ||
40 | #include "sched.h" | ||
41 | |||
42 | /* | 27 | /* |
43 | * Targeted preemption latency for CPU-bound tasks: | 28 | * Targeted preemption latency for CPU-bound tasks: |
44 | * | 29 | * |
@@ -103,7 +88,7 @@ const_debug unsigned int sysctl_sched_migration_cost = 500000UL; | |||
103 | 88 | ||
104 | #ifdef CONFIG_SMP | 89 | #ifdef CONFIG_SMP |
105 | /* | 90 | /* |
106 | * For asym packing, by default the lower numbered cpu has higher priority. | 91 | * For asym packing, by default the lower numbered CPU has higher priority. |
107 | */ | 92 | */ |
108 | int __weak arch_asym_cpu_priority(int cpu) | 93 | int __weak arch_asym_cpu_priority(int cpu) |
109 | { | 94 | { |
@@ -787,7 +772,7 @@ void post_init_entity_util_avg(struct sched_entity *se) | |||
787 | * For !fair tasks do: | 772 | * For !fair tasks do: |
788 | * | 773 | * |
789 | update_cfs_rq_load_avg(now, cfs_rq); | 774 | update_cfs_rq_load_avg(now, cfs_rq); |
790 | attach_entity_load_avg(cfs_rq, se); | 775 | attach_entity_load_avg(cfs_rq, se, 0); |
791 | switched_from_fair(rq, p); | 776 | switched_from_fair(rq, p); |
792 | * | 777 | * |
793 | * such that the next switched_to_fair() has the | 778 | * such that the next switched_to_fair() has the |
@@ -1181,7 +1166,7 @@ pid_t task_numa_group_id(struct task_struct *p) | |||
1181 | } | 1166 | } |
1182 | 1167 | ||
1183 | /* | 1168 | /* |
1184 | * The averaged statistics, shared & private, memory & cpu, | 1169 | * The averaged statistics, shared & private, memory & CPU, |
1185 | * occupy the first half of the array. The second half of the | 1170 | * occupy the first half of the array. The second half of the |
1186 | * array is for current counters, which are averaged into the | 1171 | * array is for current counters, which are averaged into the |
1187 | * first set by task_numa_placement. | 1172 | * first set by task_numa_placement. |
@@ -1587,7 +1572,7 @@ static void task_numa_compare(struct task_numa_env *env, | |||
1587 | * be incurred if the tasks were swapped. | 1572 | * be incurred if the tasks were swapped. |
1588 | */ | 1573 | */ |
1589 | if (cur) { | 1574 | if (cur) { |
1590 | /* Skip this swap candidate if cannot move to the source cpu */ | 1575 | /* Skip this swap candidate if cannot move to the source CPU: */ |
1591 | if (!cpumask_test_cpu(env->src_cpu, &cur->cpus_allowed)) | 1576 | if (!cpumask_test_cpu(env->src_cpu, &cur->cpus_allowed)) |
1592 | goto unlock; | 1577 | goto unlock; |
1593 | 1578 | ||
@@ -1631,7 +1616,7 @@ static void task_numa_compare(struct task_numa_env *env, | |||
1631 | goto balance; | 1616 | goto balance; |
1632 | } | 1617 | } |
1633 | 1618 | ||
1634 | /* Balance doesn't matter much if we're running a task per cpu */ | 1619 | /* Balance doesn't matter much if we're running a task per CPU: */ |
1635 | if (imp > env->best_imp && src_rq->nr_running == 1 && | 1620 | if (imp > env->best_imp && src_rq->nr_running == 1 && |
1636 | dst_rq->nr_running == 1) | 1621 | dst_rq->nr_running == 1) |
1637 | goto assign; | 1622 | goto assign; |
@@ -1676,7 +1661,7 @@ balance: | |||
1676 | */ | 1661 | */ |
1677 | if (!cur) { | 1662 | if (!cur) { |
1678 | /* | 1663 | /* |
1679 | * select_idle_siblings() uses an per-cpu cpumask that | 1664 | * select_idle_siblings() uses an per-CPU cpumask that |
1680 | * can be used from IRQ context. | 1665 | * can be used from IRQ context. |
1681 | */ | 1666 | */ |
1682 | local_irq_disable(); | 1667 | local_irq_disable(); |
@@ -1869,6 +1854,7 @@ static int task_numa_migrate(struct task_struct *p) | |||
1869 | static void numa_migrate_preferred(struct task_struct *p) | 1854 | static void numa_migrate_preferred(struct task_struct *p) |
1870 | { | 1855 | { |
1871 | unsigned long interval = HZ; | 1856 | unsigned long interval = HZ; |
1857 | unsigned long numa_migrate_retry; | ||
1872 | 1858 | ||
1873 | /* This task has no NUMA fault statistics yet */ | 1859 | /* This task has no NUMA fault statistics yet */ |
1874 | if (unlikely(p->numa_preferred_nid == -1 || !p->numa_faults)) | 1860 | if (unlikely(p->numa_preferred_nid == -1 || !p->numa_faults)) |
@@ -1876,7 +1862,18 @@ static void numa_migrate_preferred(struct task_struct *p) | |||
1876 | 1862 | ||
1877 | /* Periodically retry migrating the task to the preferred node */ | 1863 | /* Periodically retry migrating the task to the preferred node */ |
1878 | interval = min(interval, msecs_to_jiffies(p->numa_scan_period) / 16); | 1864 | interval = min(interval, msecs_to_jiffies(p->numa_scan_period) / 16); |
1879 | p->numa_migrate_retry = jiffies + interval; | 1865 | numa_migrate_retry = jiffies + interval; |
1866 | |||
1867 | /* | ||
1868 | * Check that the new retry threshold is after the current one. If | ||
1869 | * the retry is in the future, it implies that wake_affine has | ||
1870 | * temporarily asked NUMA balancing to backoff from placement. | ||
1871 | */ | ||
1872 | if (numa_migrate_retry > p->numa_migrate_retry) | ||
1873 | return; | ||
1874 | |||
1875 | /* Safe to try placing the task on the preferred node */ | ||
1876 | p->numa_migrate_retry = numa_migrate_retry; | ||
1880 | 1877 | ||
1881 | /* Success if task is already running on preferred CPU */ | 1878 | /* Success if task is already running on preferred CPU */ |
1882 | if (task_node(p) == p->numa_preferred_nid) | 1879 | if (task_node(p) == p->numa_preferred_nid) |
@@ -2823,7 +2820,7 @@ void reweight_task(struct task_struct *p, int prio) | |||
2823 | } | 2820 | } |
2824 | 2821 | ||
2825 | #ifdef CONFIG_FAIR_GROUP_SCHED | 2822 | #ifdef CONFIG_FAIR_GROUP_SCHED |
2826 | # ifdef CONFIG_SMP | 2823 | #ifdef CONFIG_SMP |
2827 | /* | 2824 | /* |
2828 | * All this does is approximate the hierarchical proportion which includes that | 2825 | * All this does is approximate the hierarchical proportion which includes that |
2829 | * global sum we all love to hate. | 2826 | * global sum we all love to hate. |
@@ -2974,7 +2971,7 @@ static long calc_group_runnable(struct cfs_rq *cfs_rq, long shares) | |||
2974 | 2971 | ||
2975 | return clamp_t(long, runnable, MIN_SHARES, shares); | 2972 | return clamp_t(long, runnable, MIN_SHARES, shares); |
2976 | } | 2973 | } |
2977 | # endif /* CONFIG_SMP */ | 2974 | #endif /* CONFIG_SMP */ |
2978 | 2975 | ||
2979 | static inline int throttled_hierarchy(struct cfs_rq *cfs_rq); | 2976 | static inline int throttled_hierarchy(struct cfs_rq *cfs_rq); |
2980 | 2977 | ||
@@ -3012,11 +3009,11 @@ static inline void update_cfs_group(struct sched_entity *se) | |||
3012 | } | 3009 | } |
3013 | #endif /* CONFIG_FAIR_GROUP_SCHED */ | 3010 | #endif /* CONFIG_FAIR_GROUP_SCHED */ |
3014 | 3011 | ||
3015 | static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq) | 3012 | static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq, int flags) |
3016 | { | 3013 | { |
3017 | struct rq *rq = rq_of(cfs_rq); | 3014 | struct rq *rq = rq_of(cfs_rq); |
3018 | 3015 | ||
3019 | if (&rq->cfs == cfs_rq) { | 3016 | if (&rq->cfs == cfs_rq || (flags & SCHED_CPUFREQ_MIGRATION)) { |
3020 | /* | 3017 | /* |
3021 | * There are a few boundary cases this might miss but it should | 3018 | * There are a few boundary cases this might miss but it should |
3022 | * get called often enough that that should (hopefully) not be | 3019 | * get called often enough that that should (hopefully) not be |
@@ -3031,7 +3028,7 @@ static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq) | |||
3031 | * | 3028 | * |
3032 | * See cpu_util(). | 3029 | * See cpu_util(). |
3033 | */ | 3030 | */ |
3034 | cpufreq_update_util(rq, 0); | 3031 | cpufreq_update_util(rq, flags); |
3035 | } | 3032 | } |
3036 | } | 3033 | } |
3037 | 3034 | ||
@@ -3246,6 +3243,32 @@ ___update_load_avg(struct sched_avg *sa, unsigned long load, unsigned long runna | |||
3246 | } | 3243 | } |
3247 | 3244 | ||
3248 | /* | 3245 | /* |
3246 | * When a task is dequeued, its estimated utilization should not be updated if | ||
3247 | * its util_avg has not been updated at least once. | ||
3248 | * This flag is used to synchronize util_avg updates with util_est updates. | ||
3249 | * We map this information into the LSB bit of the utilization saved at | ||
3250 | * dequeue time (i.e. util_est.enqueued). | ||
3251 | */ | ||
3252 | #define UTIL_AVG_UNCHANGED 0x1 | ||
3253 | |||
3254 | static inline void cfs_se_util_change(struct sched_avg *avg) | ||
3255 | { | ||
3256 | unsigned int enqueued; | ||
3257 | |||
3258 | if (!sched_feat(UTIL_EST)) | ||
3259 | return; | ||
3260 | |||
3261 | /* Avoid store if the flag has been already reset */ | ||
3262 | enqueued = avg->util_est.enqueued; | ||
3263 | if (!(enqueued & UTIL_AVG_UNCHANGED)) | ||
3264 | return; | ||
3265 | |||
3266 | /* Reset flag to report util_avg has been updated */ | ||
3267 | enqueued &= ~UTIL_AVG_UNCHANGED; | ||
3268 | WRITE_ONCE(avg->util_est.enqueued, enqueued); | ||
3269 | } | ||
3270 | |||
3271 | /* | ||
3249 | * sched_entity: | 3272 | * sched_entity: |
3250 | * | 3273 | * |
3251 | * task: | 3274 | * task: |
@@ -3296,6 +3319,7 @@ __update_load_avg_se(u64 now, int cpu, struct cfs_rq *cfs_rq, struct sched_entit | |||
3296 | cfs_rq->curr == se)) { | 3319 | cfs_rq->curr == se)) { |
3297 | 3320 | ||
3298 | ___update_load_avg(&se->avg, se_weight(se), se_runnable(se)); | 3321 | ___update_load_avg(&se->avg, se_weight(se), se_runnable(se)); |
3322 | cfs_se_util_change(&se->avg); | ||
3299 | return 1; | 3323 | return 1; |
3300 | } | 3324 | } |
3301 | 3325 | ||
@@ -3350,7 +3374,7 @@ static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) | |||
3350 | } | 3374 | } |
3351 | 3375 | ||
3352 | /* | 3376 | /* |
3353 | * Called within set_task_rq() right before setting a task's cpu. The | 3377 | * Called within set_task_rq() right before setting a task's CPU. The |
3354 | * caller only guarantees p->pi_lock is held; no other assumptions, | 3378 | * caller only guarantees p->pi_lock is held; no other assumptions, |
3355 | * including the state of rq->lock, should be made. | 3379 | * including the state of rq->lock, should be made. |
3356 | */ | 3380 | */ |
@@ -3529,7 +3553,7 @@ update_tg_cfs_runnable(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cf | |||
3529 | 3553 | ||
3530 | /* | 3554 | /* |
3531 | * runnable_sum can't be lower than running_sum | 3555 | * runnable_sum can't be lower than running_sum |
3532 | * As running sum is scale with cpu capacity wehreas the runnable sum | 3556 | * As running sum is scale with CPU capacity wehreas the runnable sum |
3533 | * is not we rescale running_sum 1st | 3557 | * is not we rescale running_sum 1st |
3534 | */ | 3558 | */ |
3535 | running_sum = se->avg.util_sum / | 3559 | running_sum = se->avg.util_sum / |
@@ -3689,7 +3713,7 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq) | |||
3689 | #endif | 3713 | #endif |
3690 | 3714 | ||
3691 | if (decayed) | 3715 | if (decayed) |
3692 | cfs_rq_util_change(cfs_rq); | 3716 | cfs_rq_util_change(cfs_rq, 0); |
3693 | 3717 | ||
3694 | return decayed; | 3718 | return decayed; |
3695 | } | 3719 | } |
@@ -3702,7 +3726,7 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq) | |||
3702 | * Must call update_cfs_rq_load_avg() before this, since we rely on | 3726 | * Must call update_cfs_rq_load_avg() before this, since we rely on |
3703 | * cfs_rq->avg.last_update_time being current. | 3727 | * cfs_rq->avg.last_update_time being current. |
3704 | */ | 3728 | */ |
3705 | static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) | 3729 | static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) |
3706 | { | 3730 | { |
3707 | u32 divider = LOAD_AVG_MAX - 1024 + cfs_rq->avg.period_contrib; | 3731 | u32 divider = LOAD_AVG_MAX - 1024 + cfs_rq->avg.period_contrib; |
3708 | 3732 | ||
@@ -3738,7 +3762,7 @@ static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s | |||
3738 | 3762 | ||
3739 | add_tg_cfs_propagate(cfs_rq, se->avg.load_sum); | 3763 | add_tg_cfs_propagate(cfs_rq, se->avg.load_sum); |
3740 | 3764 | ||
3741 | cfs_rq_util_change(cfs_rq); | 3765 | cfs_rq_util_change(cfs_rq, flags); |
3742 | } | 3766 | } |
3743 | 3767 | ||
3744 | /** | 3768 | /** |
@@ -3757,7 +3781,7 @@ static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s | |||
3757 | 3781 | ||
3758 | add_tg_cfs_propagate(cfs_rq, -se->avg.load_sum); | 3782 | add_tg_cfs_propagate(cfs_rq, -se->avg.load_sum); |
3759 | 3783 | ||
3760 | cfs_rq_util_change(cfs_rq); | 3784 | cfs_rq_util_change(cfs_rq, 0); |
3761 | } | 3785 | } |
3762 | 3786 | ||
3763 | /* | 3787 | /* |
@@ -3787,7 +3811,14 @@ static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s | |||
3787 | 3811 | ||
3788 | if (!se->avg.last_update_time && (flags & DO_ATTACH)) { | 3812 | if (!se->avg.last_update_time && (flags & DO_ATTACH)) { |
3789 | 3813 | ||
3790 | attach_entity_load_avg(cfs_rq, se); | 3814 | /* |
3815 | * DO_ATTACH means we're here from enqueue_entity(). | ||
3816 | * !last_update_time means we've passed through | ||
3817 | * migrate_task_rq_fair() indicating we migrated. | ||
3818 | * | ||
3819 | * IOW we're enqueueing a task on a new CPU. | ||
3820 | */ | ||
3821 | attach_entity_load_avg(cfs_rq, se, SCHED_CPUFREQ_MIGRATION); | ||
3791 | update_tg_load_avg(cfs_rq, 0); | 3822 | update_tg_load_avg(cfs_rq, 0); |
3792 | 3823 | ||
3793 | } else if (decayed && (flags & UPDATE_TG)) | 3824 | } else if (decayed && (flags & UPDATE_TG)) |
@@ -3869,6 +3900,120 @@ static inline unsigned long cfs_rq_load_avg(struct cfs_rq *cfs_rq) | |||
3869 | 3900 | ||
3870 | static int idle_balance(struct rq *this_rq, struct rq_flags *rf); | 3901 | static int idle_balance(struct rq *this_rq, struct rq_flags *rf); |
3871 | 3902 | ||
3903 | static inline unsigned long task_util(struct task_struct *p) | ||
3904 | { | ||
3905 | return READ_ONCE(p->se.avg.util_avg); | ||
3906 | } | ||
3907 | |||
3908 | static inline unsigned long _task_util_est(struct task_struct *p) | ||
3909 | { | ||
3910 | struct util_est ue = READ_ONCE(p->se.avg.util_est); | ||
3911 | |||
3912 | return max(ue.ewma, ue.enqueued); | ||
3913 | } | ||
3914 | |||
3915 | static inline unsigned long task_util_est(struct task_struct *p) | ||
3916 | { | ||
3917 | return max(task_util(p), _task_util_est(p)); | ||
3918 | } | ||
3919 | |||
3920 | static inline void util_est_enqueue(struct cfs_rq *cfs_rq, | ||
3921 | struct task_struct *p) | ||
3922 | { | ||
3923 | unsigned int enqueued; | ||
3924 | |||
3925 | if (!sched_feat(UTIL_EST)) | ||
3926 | return; | ||
3927 | |||
3928 | /* Update root cfs_rq's estimated utilization */ | ||
3929 | enqueued = cfs_rq->avg.util_est.enqueued; | ||
3930 | enqueued += (_task_util_est(p) | UTIL_AVG_UNCHANGED); | ||
3931 | WRITE_ONCE(cfs_rq->avg.util_est.enqueued, enqueued); | ||
3932 | } | ||
3933 | |||
3934 | /* | ||
3935 | * Check if a (signed) value is within a specified (unsigned) margin, | ||
3936 | * based on the observation that: | ||
3937 | * | ||
3938 | * abs(x) < y := (unsigned)(x + y - 1) < (2 * y - 1) | ||
3939 | * | ||
3940 | * NOTE: this only works when value + margin < INT_MAX. | ||
3941 | */ | ||
3942 | static inline bool within_margin(int value, int margin) | ||
3943 | { | ||
3944 | return ((unsigned int)(value + margin - 1) < (2 * margin - 1)); | ||
3945 | } | ||
3946 | |||
3947 | static void | ||
3948 | util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p, bool task_sleep) | ||
3949 | { | ||
3950 | long last_ewma_diff; | ||
3951 | struct util_est ue; | ||
3952 | |||
3953 | if (!sched_feat(UTIL_EST)) | ||
3954 | return; | ||
3955 | |||
3956 | /* | ||
3957 | * Update root cfs_rq's estimated utilization | ||
3958 | * | ||
3959 | * If *p is the last task then the root cfs_rq's estimated utilization | ||
3960 | * of a CPU is 0 by definition. | ||
3961 | */ | ||
3962 | ue.enqueued = 0; | ||
3963 | if (cfs_rq->nr_running) { | ||
3964 | ue.enqueued = cfs_rq->avg.util_est.enqueued; | ||
3965 | ue.enqueued -= min_t(unsigned int, ue.enqueued, | ||
3966 | (_task_util_est(p) | UTIL_AVG_UNCHANGED)); | ||
3967 | } | ||
3968 | WRITE_ONCE(cfs_rq->avg.util_est.enqueued, ue.enqueued); | ||
3969 | |||
3970 | /* | ||
3971 | * Skip update of task's estimated utilization when the task has not | ||
3972 | * yet completed an activation, e.g. being migrated. | ||
3973 | */ | ||
3974 | if (!task_sleep) | ||
3975 | return; | ||
3976 | |||
3977 | /* | ||
3978 | * If the PELT values haven't changed since enqueue time, | ||
3979 | * skip the util_est update. | ||
3980 | */ | ||
3981 | ue = p->se.avg.util_est; | ||
3982 | if (ue.enqueued & UTIL_AVG_UNCHANGED) | ||
3983 | return; | ||
3984 | |||
3985 | /* | ||
3986 | * Skip update of task's estimated utilization when its EWMA is | ||
3987 | * already ~1% close to its last activation value. | ||
3988 | */ | ||
3989 | ue.enqueued = (task_util(p) | UTIL_AVG_UNCHANGED); | ||
3990 | last_ewma_diff = ue.enqueued - ue.ewma; | ||
3991 | if (within_margin(last_ewma_diff, (SCHED_CAPACITY_SCALE / 100))) | ||
3992 | return; | ||
3993 | |||
3994 | /* | ||
3995 | * Update Task's estimated utilization | ||
3996 | * | ||
3997 | * When *p completes an activation we can consolidate another sample | ||
3998 | * of the task size. This is done by storing the current PELT value | ||
3999 | * as ue.enqueued and by using this value to update the Exponential | ||
4000 | * Weighted Moving Average (EWMA): | ||
4001 | * | ||
4002 | * ewma(t) = w * task_util(p) + (1-w) * ewma(t-1) | ||
4003 | * = w * task_util(p) + ewma(t-1) - w * ewma(t-1) | ||
4004 | * = w * (task_util(p) - ewma(t-1)) + ewma(t-1) | ||
4005 | * = w * ( last_ewma_diff ) + ewma(t-1) | ||
4006 | * = w * (last_ewma_diff + ewma(t-1) / w) | ||
4007 | * | ||
4008 | * Where 'w' is the weight of new samples, which is configured to be | ||
4009 | * 0.25, thus making w=1/4 ( >>= UTIL_EST_WEIGHT_SHIFT) | ||
4010 | */ | ||
4011 | ue.ewma <<= UTIL_EST_WEIGHT_SHIFT; | ||
4012 | ue.ewma += last_ewma_diff; | ||
4013 | ue.ewma >>= UTIL_EST_WEIGHT_SHIFT; | ||
4014 | WRITE_ONCE(p->se.avg.util_est, ue); | ||
4015 | } | ||
4016 | |||
3872 | #else /* CONFIG_SMP */ | 4017 | #else /* CONFIG_SMP */ |
3873 | 4018 | ||
3874 | static inline int | 4019 | static inline int |
@@ -3883,13 +4028,13 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq) | |||
3883 | 4028 | ||
3884 | static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se, int not_used1) | 4029 | static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se, int not_used1) |
3885 | { | 4030 | { |
3886 | cfs_rq_util_change(cfs_rq); | 4031 | cfs_rq_util_change(cfs_rq, 0); |
3887 | } | 4032 | } |
3888 | 4033 | ||
3889 | static inline void remove_entity_load_avg(struct sched_entity *se) {} | 4034 | static inline void remove_entity_load_avg(struct sched_entity *se) {} |
3890 | 4035 | ||
3891 | static inline void | 4036 | static inline void |
3892 | attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {} | 4037 | attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) {} |
3893 | static inline void | 4038 | static inline void |
3894 | detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {} | 4039 | detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {} |
3895 | 4040 | ||
@@ -3898,6 +4043,13 @@ static inline int idle_balance(struct rq *rq, struct rq_flags *rf) | |||
3898 | return 0; | 4043 | return 0; |
3899 | } | 4044 | } |
3900 | 4045 | ||
4046 | static inline void | ||
4047 | util_est_enqueue(struct cfs_rq *cfs_rq, struct task_struct *p) {} | ||
4048 | |||
4049 | static inline void | ||
4050 | util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p, | ||
4051 | bool task_sleep) {} | ||
4052 | |||
3901 | #endif /* CONFIG_SMP */ | 4053 | #endif /* CONFIG_SMP */ |
3902 | 4054 | ||
3903 | static void check_spread(struct cfs_rq *cfs_rq, struct sched_entity *se) | 4055 | static void check_spread(struct cfs_rq *cfs_rq, struct sched_entity *se) |
@@ -4676,7 +4828,7 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) | |||
4676 | if (!se) | 4828 | if (!se) |
4677 | add_nr_running(rq, task_delta); | 4829 | add_nr_running(rq, task_delta); |
4678 | 4830 | ||
4679 | /* determine whether we need to wake up potentially idle cpu */ | 4831 | /* Determine whether we need to wake up potentially idle CPU: */ |
4680 | if (rq->curr == rq->idle && rq->cfs.nr_running) | 4832 | if (rq->curr == rq->idle && rq->cfs.nr_running) |
4681 | resched_curr(rq); | 4833 | resched_curr(rq); |
4682 | } | 4834 | } |
@@ -5041,7 +5193,7 @@ static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) | |||
5041 | } | 5193 | } |
5042 | 5194 | ||
5043 | /* | 5195 | /* |
5044 | * Both these cpu hotplug callbacks race against unregister_fair_sched_group() | 5196 | * Both these CPU hotplug callbacks race against unregister_fair_sched_group() |
5045 | * | 5197 | * |
5046 | * The race is harmless, since modifying bandwidth settings of unhooked group | 5198 | * The race is harmless, since modifying bandwidth settings of unhooked group |
5047 | * bits doesn't do much. | 5199 | * bits doesn't do much. |
@@ -5086,7 +5238,7 @@ static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq) | |||
5086 | */ | 5238 | */ |
5087 | cfs_rq->runtime_remaining = 1; | 5239 | cfs_rq->runtime_remaining = 1; |
5088 | /* | 5240 | /* |
5089 | * Offline rq is schedulable till cpu is completely disabled | 5241 | * Offline rq is schedulable till CPU is completely disabled |
5090 | * in take_cpu_down(), so we prevent new cfs throttling here. | 5242 | * in take_cpu_down(), so we prevent new cfs throttling here. |
5091 | */ | 5243 | */ |
5092 | cfs_rq->runtime_enabled = 0; | 5244 | cfs_rq->runtime_enabled = 0; |
@@ -5245,6 +5397,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) | |||
5245 | if (!se) | 5397 | if (!se) |
5246 | add_nr_running(rq, 1); | 5398 | add_nr_running(rq, 1); |
5247 | 5399 | ||
5400 | util_est_enqueue(&rq->cfs, p); | ||
5248 | hrtick_update(rq); | 5401 | hrtick_update(rq); |
5249 | } | 5402 | } |
5250 | 5403 | ||
@@ -5304,6 +5457,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) | |||
5304 | if (!se) | 5457 | if (!se) |
5305 | sub_nr_running(rq, 1); | 5458 | sub_nr_running(rq, 1); |
5306 | 5459 | ||
5460 | util_est_dequeue(&rq->cfs, p, task_sleep); | ||
5307 | hrtick_update(rq); | 5461 | hrtick_update(rq); |
5308 | } | 5462 | } |
5309 | 5463 | ||
@@ -5323,8 +5477,8 @@ DEFINE_PER_CPU(cpumask_var_t, select_idle_mask); | |||
5323 | * | 5477 | * |
5324 | * load' = (1 - 1/2^i) * load + (1/2^i) * cur_load | 5478 | * load' = (1 - 1/2^i) * load + (1/2^i) * cur_load |
5325 | * | 5479 | * |
5326 | * If a cpu misses updates for n ticks (as it was idle) and update gets | 5480 | * If a CPU misses updates for n ticks (as it was idle) and update gets |
5327 | * called on the n+1-th tick when cpu may be busy, then we have: | 5481 | * called on the n+1-th tick when CPU may be busy, then we have: |
5328 | * | 5482 | * |
5329 | * load_n = (1 - 1/2^i)^n * load_0 | 5483 | * load_n = (1 - 1/2^i)^n * load_0 |
5330 | * load_n+1 = (1 - 1/2^i) * load_n + (1/2^i) * cur_load | 5484 | * load_n+1 = (1 - 1/2^i) * load_n + (1/2^i) * cur_load |
@@ -5379,6 +5533,15 @@ decay_load_missed(unsigned long load, unsigned long missed_updates, int idx) | |||
5379 | } | 5533 | } |
5380 | return load; | 5534 | return load; |
5381 | } | 5535 | } |
5536 | |||
5537 | static struct { | ||
5538 | cpumask_var_t idle_cpus_mask; | ||
5539 | atomic_t nr_cpus; | ||
5540 | int has_blocked; /* Idle CPUs have blocked load */ | ||
5541 | unsigned long next_balance; /* in jiffy units */ | ||
5542 | unsigned long next_blocked; /* Next update of blocked load in jiffies */ | ||
5543 | } nohz ____cacheline_aligned; | ||
5544 | |||
5382 | #endif /* CONFIG_NO_HZ_COMMON */ | 5545 | #endif /* CONFIG_NO_HZ_COMMON */ |
5383 | 5546 | ||
5384 | /** | 5547 | /** |
@@ -5468,7 +5631,7 @@ static unsigned long weighted_cpuload(struct rq *rq) | |||
5468 | #ifdef CONFIG_NO_HZ_COMMON | 5631 | #ifdef CONFIG_NO_HZ_COMMON |
5469 | /* | 5632 | /* |
5470 | * There is no sane way to deal with nohz on smp when using jiffies because the | 5633 | * There is no sane way to deal with nohz on smp when using jiffies because the |
5471 | * cpu doing the jiffies update might drift wrt the cpu doing the jiffy reading | 5634 | * CPU doing the jiffies update might drift wrt the CPU doing the jiffy reading |
5472 | * causing off-by-one errors in observed deltas; {0,2} instead of {1,1}. | 5635 | * causing off-by-one errors in observed deltas; {0,2} instead of {1,1}. |
5473 | * | 5636 | * |
5474 | * Therefore we need to avoid the delta approach from the regular tick when | 5637 | * Therefore we need to avoid the delta approach from the regular tick when |
@@ -5579,7 +5742,7 @@ void cpu_load_update_active(struct rq *this_rq) | |||
5579 | } | 5742 | } |
5580 | 5743 | ||
5581 | /* | 5744 | /* |
5582 | * Return a low guess at the load of a migration-source cpu weighted | 5745 | * Return a low guess at the load of a migration-source CPU weighted |
5583 | * according to the scheduling class and "nice" value. | 5746 | * according to the scheduling class and "nice" value. |
5584 | * | 5747 | * |
5585 | * We want to under-estimate the load of migration sources, to | 5748 | * We want to under-estimate the load of migration sources, to |
@@ -5597,7 +5760,7 @@ static unsigned long source_load(int cpu, int type) | |||
5597 | } | 5760 | } |
5598 | 5761 | ||
5599 | /* | 5762 | /* |
5600 | * Return a high guess at the load of a migration-target cpu weighted | 5763 | * Return a high guess at the load of a migration-target CPU weighted |
5601 | * according to the scheduling class and "nice" value. | 5764 | * according to the scheduling class and "nice" value. |
5602 | */ | 5765 | */ |
5603 | static unsigned long target_load(int cpu, int type) | 5766 | static unsigned long target_load(int cpu, int type) |
@@ -5724,7 +5887,6 @@ wake_affine_weight(struct sched_domain *sd, struct task_struct *p, | |||
5724 | unsigned long task_load; | 5887 | unsigned long task_load; |
5725 | 5888 | ||
5726 | this_eff_load = target_load(this_cpu, sd->wake_idx); | 5889 | this_eff_load = target_load(this_cpu, sd->wake_idx); |
5727 | prev_eff_load = source_load(prev_cpu, sd->wake_idx); | ||
5728 | 5890 | ||
5729 | if (sync) { | 5891 | if (sync) { |
5730 | unsigned long current_load = task_h_load(current); | 5892 | unsigned long current_load = task_h_load(current); |
@@ -5742,18 +5904,69 @@ wake_affine_weight(struct sched_domain *sd, struct task_struct *p, | |||
5742 | this_eff_load *= 100; | 5904 | this_eff_load *= 100; |
5743 | this_eff_load *= capacity_of(prev_cpu); | 5905 | this_eff_load *= capacity_of(prev_cpu); |
5744 | 5906 | ||
5907 | prev_eff_load = source_load(prev_cpu, sd->wake_idx); | ||
5745 | prev_eff_load -= task_load; | 5908 | prev_eff_load -= task_load; |
5746 | if (sched_feat(WA_BIAS)) | 5909 | if (sched_feat(WA_BIAS)) |
5747 | prev_eff_load *= 100 + (sd->imbalance_pct - 100) / 2; | 5910 | prev_eff_load *= 100 + (sd->imbalance_pct - 100) / 2; |
5748 | prev_eff_load *= capacity_of(this_cpu); | 5911 | prev_eff_load *= capacity_of(this_cpu); |
5749 | 5912 | ||
5750 | return this_eff_load <= prev_eff_load ? this_cpu : nr_cpumask_bits; | 5913 | /* |
5914 | * If sync, adjust the weight of prev_eff_load such that if | ||
5915 | * prev_eff == this_eff that select_idle_sibling() will consider | ||
5916 | * stacking the wakee on top of the waker if no other CPU is | ||
5917 | * idle. | ||
5918 | */ | ||
5919 | if (sync) | ||
5920 | prev_eff_load += 1; | ||
5921 | |||
5922 | return this_eff_load < prev_eff_load ? this_cpu : nr_cpumask_bits; | ||
5923 | } | ||
5924 | |||
5925 | #ifdef CONFIG_NUMA_BALANCING | ||
5926 | static void | ||
5927 | update_wa_numa_placement(struct task_struct *p, int prev_cpu, int target) | ||
5928 | { | ||
5929 | unsigned long interval; | ||
5930 | |||
5931 | if (!static_branch_likely(&sched_numa_balancing)) | ||
5932 | return; | ||
5933 | |||
5934 | /* If balancing has no preference then continue gathering data */ | ||
5935 | if (p->numa_preferred_nid == -1) | ||
5936 | return; | ||
5937 | |||
5938 | /* | ||
5939 | * If the wakeup is not affecting locality then it is neutral from | ||
5940 | * the perspective of NUMA balancing so continue gathering data. | ||
5941 | */ | ||
5942 | if (cpu_to_node(prev_cpu) == cpu_to_node(target)) | ||
5943 | return; | ||
5944 | |||
5945 | /* | ||
5946 | * Temporarily prevent NUMA balancing trying to place waker/wakee after | ||
5947 | * wakee has been moved by wake_affine. This will potentially allow | ||
5948 | * related tasks to converge and update their data placement. The | ||
5949 | * 4 * numa_scan_period is to allow the two-pass filter to migrate | ||
5950 | * hot data to the wakers node. | ||
5951 | */ | ||
5952 | interval = max(sysctl_numa_balancing_scan_delay, | ||
5953 | p->numa_scan_period << 2); | ||
5954 | p->numa_migrate_retry = jiffies + msecs_to_jiffies(interval); | ||
5955 | |||
5956 | interval = max(sysctl_numa_balancing_scan_delay, | ||
5957 | current->numa_scan_period << 2); | ||
5958 | current->numa_migrate_retry = jiffies + msecs_to_jiffies(interval); | ||
5751 | } | 5959 | } |
5960 | #else | ||
5961 | static void | ||
5962 | update_wa_numa_placement(struct task_struct *p, int prev_cpu, int target) | ||
5963 | { | ||
5964 | } | ||
5965 | #endif | ||
5752 | 5966 | ||
5753 | static int wake_affine(struct sched_domain *sd, struct task_struct *p, | 5967 | static int wake_affine(struct sched_domain *sd, struct task_struct *p, |
5754 | int prev_cpu, int sync) | 5968 | int this_cpu, int prev_cpu, int sync) |
5755 | { | 5969 | { |
5756 | int this_cpu = smp_processor_id(); | ||
5757 | int target = nr_cpumask_bits; | 5970 | int target = nr_cpumask_bits; |
5758 | 5971 | ||
5759 | if (sched_feat(WA_IDLE)) | 5972 | if (sched_feat(WA_IDLE)) |
@@ -5766,12 +5979,12 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, | |||
5766 | if (target == nr_cpumask_bits) | 5979 | if (target == nr_cpumask_bits) |
5767 | return prev_cpu; | 5980 | return prev_cpu; |
5768 | 5981 | ||
5982 | update_wa_numa_placement(p, prev_cpu, target); | ||
5769 | schedstat_inc(sd->ttwu_move_affine); | 5983 | schedstat_inc(sd->ttwu_move_affine); |
5770 | schedstat_inc(p->se.statistics.nr_wakeups_affine); | 5984 | schedstat_inc(p->se.statistics.nr_wakeups_affine); |
5771 | return target; | 5985 | return target; |
5772 | } | 5986 | } |
5773 | 5987 | ||
5774 | static inline unsigned long task_util(struct task_struct *p); | ||
5775 | static unsigned long cpu_util_wake(int cpu, struct task_struct *p); | 5988 | static unsigned long cpu_util_wake(int cpu, struct task_struct *p); |
5776 | 5989 | ||
5777 | static unsigned long capacity_spare_wake(int cpu, struct task_struct *p) | 5990 | static unsigned long capacity_spare_wake(int cpu, struct task_struct *p) |
@@ -5826,7 +6039,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, | |||
5826 | max_spare_cap = 0; | 6039 | max_spare_cap = 0; |
5827 | 6040 | ||
5828 | for_each_cpu(i, sched_group_span(group)) { | 6041 | for_each_cpu(i, sched_group_span(group)) { |
5829 | /* Bias balancing toward cpus of our domain */ | 6042 | /* Bias balancing toward CPUs of our domain */ |
5830 | if (local_group) | 6043 | if (local_group) |
5831 | load = source_load(i, load_idx); | 6044 | load = source_load(i, load_idx); |
5832 | else | 6045 | else |
@@ -5856,7 +6069,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, | |||
5856 | if (min_runnable_load > (runnable_load + imbalance)) { | 6069 | if (min_runnable_load > (runnable_load + imbalance)) { |
5857 | /* | 6070 | /* |
5858 | * The runnable load is significantly smaller | 6071 | * The runnable load is significantly smaller |
5859 | * so we can pick this new cpu | 6072 | * so we can pick this new CPU: |
5860 | */ | 6073 | */ |
5861 | min_runnable_load = runnable_load; | 6074 | min_runnable_load = runnable_load; |
5862 | min_avg_load = avg_load; | 6075 | min_avg_load = avg_load; |
@@ -5865,7 +6078,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, | |||
5865 | (100*min_avg_load > imbalance_scale*avg_load)) { | 6078 | (100*min_avg_load > imbalance_scale*avg_load)) { |
5866 | /* | 6079 | /* |
5867 | * The runnable loads are close so take the | 6080 | * The runnable loads are close so take the |
5868 | * blocked load into account through avg_load. | 6081 | * blocked load into account through avg_load: |
5869 | */ | 6082 | */ |
5870 | min_avg_load = avg_load; | 6083 | min_avg_load = avg_load; |
5871 | idlest = group; | 6084 | idlest = group; |
@@ -5903,6 +6116,18 @@ skip_spare: | |||
5903 | if (!idlest) | 6116 | if (!idlest) |
5904 | return NULL; | 6117 | return NULL; |
5905 | 6118 | ||
6119 | /* | ||
6120 | * When comparing groups across NUMA domains, it's possible for the | ||
6121 | * local domain to be very lightly loaded relative to the remote | ||
6122 | * domains but "imbalance" skews the comparison making remote CPUs | ||
6123 | * look much more favourable. When considering cross-domain, add | ||
6124 | * imbalance to the runnable load on the remote node and consider | ||
6125 | * staying local. | ||
6126 | */ | ||
6127 | if ((sd->flags & SD_NUMA) && | ||
6128 | min_runnable_load + imbalance >= this_runnable_load) | ||
6129 | return NULL; | ||
6130 | |||
5906 | if (min_runnable_load > (this_runnable_load + imbalance)) | 6131 | if (min_runnable_load > (this_runnable_load + imbalance)) |
5907 | return NULL; | 6132 | return NULL; |
5908 | 6133 | ||
@@ -5914,7 +6139,7 @@ skip_spare: | |||
5914 | } | 6139 | } |
5915 | 6140 | ||
5916 | /* | 6141 | /* |
5917 | * find_idlest_group_cpu - find the idlest cpu among the cpus in group. | 6142 | * find_idlest_group_cpu - find the idlest CPU among the CPUs in the group. |
5918 | */ | 6143 | */ |
5919 | static int | 6144 | static int |
5920 | find_idlest_group_cpu(struct sched_group *group, struct task_struct *p, int this_cpu) | 6145 | find_idlest_group_cpu(struct sched_group *group, struct task_struct *p, int this_cpu) |
@@ -5992,12 +6217,12 @@ static inline int find_idlest_cpu(struct sched_domain *sd, struct task_struct *p | |||
5992 | 6217 | ||
5993 | new_cpu = find_idlest_group_cpu(group, p, cpu); | 6218 | new_cpu = find_idlest_group_cpu(group, p, cpu); |
5994 | if (new_cpu == cpu) { | 6219 | if (new_cpu == cpu) { |
5995 | /* Now try balancing at a lower domain level of cpu */ | 6220 | /* Now try balancing at a lower domain level of 'cpu': */ |
5996 | sd = sd->child; | 6221 | sd = sd->child; |
5997 | continue; | 6222 | continue; |
5998 | } | 6223 | } |
5999 | 6224 | ||
6000 | /* Now try balancing at a lower domain level of new_cpu */ | 6225 | /* Now try balancing at a lower domain level of 'new_cpu': */ |
6001 | cpu = new_cpu; | 6226 | cpu = new_cpu; |
6002 | weight = sd->span_weight; | 6227 | weight = sd->span_weight; |
6003 | sd = NULL; | 6228 | sd = NULL; |
@@ -6007,7 +6232,6 @@ static inline int find_idlest_cpu(struct sched_domain *sd, struct task_struct *p | |||
6007 | if (tmp->flags & sd_flag) | 6232 | if (tmp->flags & sd_flag) |
6008 | sd = tmp; | 6233 | sd = tmp; |
6009 | } | 6234 | } |
6010 | /* while loop will break here if sd == NULL */ | ||
6011 | } | 6235 | } |
6012 | 6236 | ||
6013 | return new_cpu; | 6237 | return new_cpu; |
@@ -6203,12 +6427,12 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) | |||
6203 | return target; | 6427 | return target; |
6204 | 6428 | ||
6205 | /* | 6429 | /* |
6206 | * If the previous cpu is cache affine and idle, don't be stupid. | 6430 | * If the previous CPU is cache affine and idle, don't be stupid: |
6207 | */ | 6431 | */ |
6208 | if (prev != target && cpus_share_cache(prev, target) && idle_cpu(prev)) | 6432 | if (prev != target && cpus_share_cache(prev, target) && idle_cpu(prev)) |
6209 | return prev; | 6433 | return prev; |
6210 | 6434 | ||
6211 | /* Check a recently used CPU as a potential idle candidate */ | 6435 | /* Check a recently used CPU as a potential idle candidate: */ |
6212 | recent_used_cpu = p->recent_used_cpu; | 6436 | recent_used_cpu = p->recent_used_cpu; |
6213 | if (recent_used_cpu != prev && | 6437 | if (recent_used_cpu != prev && |
6214 | recent_used_cpu != target && | 6438 | recent_used_cpu != target && |
@@ -6217,7 +6441,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) | |||
6217 | cpumask_test_cpu(p->recent_used_cpu, &p->cpus_allowed)) { | 6441 | cpumask_test_cpu(p->recent_used_cpu, &p->cpus_allowed)) { |
6218 | /* | 6442 | /* |
6219 | * Replace recent_used_cpu with prev as it is a potential | 6443 | * Replace recent_used_cpu with prev as it is a potential |
6220 | * candidate for the next wake. | 6444 | * candidate for the next wake: |
6221 | */ | 6445 | */ |
6222 | p->recent_used_cpu = prev; | 6446 | p->recent_used_cpu = prev; |
6223 | return recent_used_cpu; | 6447 | return recent_used_cpu; |
@@ -6242,11 +6466,13 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) | |||
6242 | return target; | 6466 | return target; |
6243 | } | 6467 | } |
6244 | 6468 | ||
6245 | /* | 6469 | /** |
6246 | * cpu_util returns the amount of capacity of a CPU that is used by CFS | 6470 | * Amount of capacity of a CPU that is (estimated to be) used by CFS tasks |
6247 | * tasks. The unit of the return value must be the one of capacity so we can | 6471 | * @cpu: the CPU to get the utilization of |
6248 | * compare the utilization with the capacity of the CPU that is available for | 6472 | * |
6249 | * CFS task (ie cpu_capacity). | 6473 | * The unit of the return value must be the one of capacity so we can compare |
6474 | * the utilization with the capacity of the CPU that is available for CFS task | ||
6475 | * (ie cpu_capacity). | ||
6250 | * | 6476 | * |
6251 | * cfs_rq.avg.util_avg is the sum of running time of runnable tasks plus the | 6477 | * cfs_rq.avg.util_avg is the sum of running time of runnable tasks plus the |
6252 | * recent utilization of currently non-runnable tasks on a CPU. It represents | 6478 | * recent utilization of currently non-runnable tasks on a CPU. It represents |
@@ -6257,6 +6483,14 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) | |||
6257 | * current capacity (capacity_curr <= capacity_orig) of the CPU because it is | 6483 | * current capacity (capacity_curr <= capacity_orig) of the CPU because it is |
6258 | * the running time on this CPU scaled by capacity_curr. | 6484 | * the running time on this CPU scaled by capacity_curr. |
6259 | * | 6485 | * |
6486 | * The estimated utilization of a CPU is defined to be the maximum between its | ||
6487 | * cfs_rq.avg.util_avg and the sum of the estimated utilization of the tasks | ||
6488 | * currently RUNNABLE on that CPU. | ||
6489 | * This allows to properly represent the expected utilization of a CPU which | ||
6490 | * has just got a big task running since a long sleep period. At the same time | ||
6491 | * however it preserves the benefits of the "blocked utilization" in | ||
6492 | * describing the potential for other tasks waking up on the same CPU. | ||
6493 | * | ||
6260 | * Nevertheless, cfs_rq.avg.util_avg can be higher than capacity_curr or even | 6494 | * Nevertheless, cfs_rq.avg.util_avg can be higher than capacity_curr or even |
6261 | * higher than capacity_orig because of unfortunate rounding in | 6495 | * higher than capacity_orig because of unfortunate rounding in |
6262 | * cfs.avg.util_avg or just after migrating tasks and new task wakeups until | 6496 | * cfs.avg.util_avg or just after migrating tasks and new task wakeups until |
@@ -6267,36 +6501,77 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) | |||
6267 | * available capacity. We allow utilization to overshoot capacity_curr (but not | 6501 | * available capacity. We allow utilization to overshoot capacity_curr (but not |
6268 | * capacity_orig) as it is useful for predicting the capacity required after task | 6502 | * capacity_orig) as it is useful for predicting the capacity required after task |
6269 | * migrations (scheduler-driven DVFS). | 6503 | * migrations (scheduler-driven DVFS). |
6504 | * | ||
6505 | * Return: the (estimated) utilization for the specified CPU | ||
6270 | */ | 6506 | */ |
6271 | static unsigned long cpu_util(int cpu) | 6507 | static inline unsigned long cpu_util(int cpu) |
6272 | { | 6508 | { |
6273 | unsigned long util = cpu_rq(cpu)->cfs.avg.util_avg; | 6509 | struct cfs_rq *cfs_rq; |
6274 | unsigned long capacity = capacity_orig_of(cpu); | 6510 | unsigned int util; |
6275 | 6511 | ||
6276 | return (util >= capacity) ? capacity : util; | 6512 | cfs_rq = &cpu_rq(cpu)->cfs; |
6277 | } | 6513 | util = READ_ONCE(cfs_rq->avg.util_avg); |
6278 | 6514 | ||
6279 | static inline unsigned long task_util(struct task_struct *p) | 6515 | if (sched_feat(UTIL_EST)) |
6280 | { | 6516 | util = max(util, READ_ONCE(cfs_rq->avg.util_est.enqueued)); |
6281 | return p->se.avg.util_avg; | 6517 | |
6518 | return min_t(unsigned long, util, capacity_orig_of(cpu)); | ||
6282 | } | 6519 | } |
6283 | 6520 | ||
6284 | /* | 6521 | /* |
6285 | * cpu_util_wake: Compute cpu utilization with any contributions from | 6522 | * cpu_util_wake: Compute CPU utilization with any contributions from |
6286 | * the waking task p removed. | 6523 | * the waking task p removed. |
6287 | */ | 6524 | */ |
6288 | static unsigned long cpu_util_wake(int cpu, struct task_struct *p) | 6525 | static unsigned long cpu_util_wake(int cpu, struct task_struct *p) |
6289 | { | 6526 | { |
6290 | unsigned long util, capacity; | 6527 | struct cfs_rq *cfs_rq; |
6528 | unsigned int util; | ||
6291 | 6529 | ||
6292 | /* Task has no contribution or is new */ | 6530 | /* Task has no contribution or is new */ |
6293 | if (cpu != task_cpu(p) || !p->se.avg.last_update_time) | 6531 | if (cpu != task_cpu(p) || !READ_ONCE(p->se.avg.last_update_time)) |
6294 | return cpu_util(cpu); | 6532 | return cpu_util(cpu); |
6295 | 6533 | ||
6296 | capacity = capacity_orig_of(cpu); | 6534 | cfs_rq = &cpu_rq(cpu)->cfs; |
6297 | util = max_t(long, cpu_rq(cpu)->cfs.avg.util_avg - task_util(p), 0); | 6535 | util = READ_ONCE(cfs_rq->avg.util_avg); |
6298 | 6536 | ||
6299 | return (util >= capacity) ? capacity : util; | 6537 | /* Discount task's blocked util from CPU's util */ |
6538 | util -= min_t(unsigned int, util, task_util(p)); | ||
6539 | |||
6540 | /* | ||
6541 | * Covered cases: | ||
6542 | * | ||
6543 | * a) if *p is the only task sleeping on this CPU, then: | ||
6544 | * cpu_util (== task_util) > util_est (== 0) | ||
6545 | * and thus we return: | ||
6546 | * cpu_util_wake = (cpu_util - task_util) = 0 | ||
6547 | * | ||
6548 | * b) if other tasks are SLEEPING on this CPU, which is now exiting | ||
6549 | * IDLE, then: | ||
6550 | * cpu_util >= task_util | ||
6551 | * cpu_util > util_est (== 0) | ||
6552 | * and thus we discount *p's blocked utilization to return: | ||
6553 | * cpu_util_wake = (cpu_util - task_util) >= 0 | ||
6554 | * | ||
6555 | * c) if other tasks are RUNNABLE on that CPU and | ||
6556 | * util_est > cpu_util | ||
6557 | * then we use util_est since it returns a more restrictive | ||
6558 | * estimation of the spare capacity on that CPU, by just | ||
6559 | * considering the expected utilization of tasks already | ||
6560 | * runnable on that CPU. | ||
6561 | * | ||
6562 | * Cases a) and b) are covered by the above code, while case c) is | ||
6563 | * covered by the following code when estimated utilization is | ||
6564 | * enabled. | ||
6565 | */ | ||
6566 | if (sched_feat(UTIL_EST)) | ||
6567 | util = max(util, READ_ONCE(cfs_rq->avg.util_est.enqueued)); | ||
6568 | |||
6569 | /* | ||
6570 | * Utilization (estimated) can exceed the CPU capacity, thus let's | ||
6571 | * clamp to the maximum CPU capacity to ensure consistency with | ||
6572 | * the cpu_util call. | ||
6573 | */ | ||
6574 | return min_t(unsigned long, util, capacity_orig_of(cpu)); | ||
6300 | } | 6575 | } |
6301 | 6576 | ||
6302 | /* | 6577 | /* |
@@ -6328,10 +6603,10 @@ static int wake_cap(struct task_struct *p, int cpu, int prev_cpu) | |||
6328 | * that have the 'sd_flag' flag set. In practice, this is SD_BALANCE_WAKE, | 6603 | * that have the 'sd_flag' flag set. In practice, this is SD_BALANCE_WAKE, |
6329 | * SD_BALANCE_FORK, or SD_BALANCE_EXEC. | 6604 | * SD_BALANCE_FORK, or SD_BALANCE_EXEC. |
6330 | * | 6605 | * |
6331 | * Balances load by selecting the idlest cpu in the idlest group, or under | 6606 | * Balances load by selecting the idlest CPU in the idlest group, or under |
6332 | * certain conditions an idle sibling cpu if the domain has SD_WAKE_AFFINE set. | 6607 | * certain conditions an idle sibling CPU if the domain has SD_WAKE_AFFINE set. |
6333 | * | 6608 | * |
6334 | * Returns the target cpu number. | 6609 | * Returns the target CPU number. |
6335 | * | 6610 | * |
6336 | * preempt must be disabled. | 6611 | * preempt must be disabled. |
6337 | */ | 6612 | */ |
@@ -6342,7 +6617,7 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f | |||
6342 | int cpu = smp_processor_id(); | 6617 | int cpu = smp_processor_id(); |
6343 | int new_cpu = prev_cpu; | 6618 | int new_cpu = prev_cpu; |
6344 | int want_affine = 0; | 6619 | int want_affine = 0; |
6345 | int sync = wake_flags & WF_SYNC; | 6620 | int sync = (wake_flags & WF_SYNC) && !(current->flags & PF_EXITING); |
6346 | 6621 | ||
6347 | if (sd_flag & SD_BALANCE_WAKE) { | 6622 | if (sd_flag & SD_BALANCE_WAKE) { |
6348 | record_wakee(p); | 6623 | record_wakee(p); |
@@ -6356,7 +6631,7 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f | |||
6356 | break; | 6631 | break; |
6357 | 6632 | ||
6358 | /* | 6633 | /* |
6359 | * If both cpu and prev_cpu are part of this domain, | 6634 | * If both 'cpu' and 'prev_cpu' are part of this domain, |
6360 | * cpu is a valid SD_WAKE_AFFINE target. | 6635 | * cpu is a valid SD_WAKE_AFFINE target. |
6361 | */ | 6636 | */ |
6362 | if (want_affine && (tmp->flags & SD_WAKE_AFFINE) && | 6637 | if (want_affine && (tmp->flags & SD_WAKE_AFFINE) && |
@@ -6376,7 +6651,7 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f | |||
6376 | if (cpu == prev_cpu) | 6651 | if (cpu == prev_cpu) |
6377 | goto pick_cpu; | 6652 | goto pick_cpu; |
6378 | 6653 | ||
6379 | new_cpu = wake_affine(affine_sd, p, prev_cpu, sync); | 6654 | new_cpu = wake_affine(affine_sd, p, cpu, prev_cpu, sync); |
6380 | } | 6655 | } |
6381 | 6656 | ||
6382 | if (sd && !(sd_flag & SD_BALANCE_FORK)) { | 6657 | if (sd && !(sd_flag & SD_BALANCE_FORK)) { |
@@ -6407,9 +6682,9 @@ pick_cpu: | |||
6407 | static void detach_entity_cfs_rq(struct sched_entity *se); | 6682 | static void detach_entity_cfs_rq(struct sched_entity *se); |
6408 | 6683 | ||
6409 | /* | 6684 | /* |
6410 | * Called immediately before a task is migrated to a new cpu; task_cpu(p) and | 6685 | * Called immediately before a task is migrated to a new CPU; task_cpu(p) and |
6411 | * cfs_rq_of(p) references at time of call are still valid and identify the | 6686 | * cfs_rq_of(p) references at time of call are still valid and identify the |
6412 | * previous cpu. The caller guarantees p->pi_lock or task_rq(p)->lock is held. | 6687 | * previous CPU. The caller guarantees p->pi_lock or task_rq(p)->lock is held. |
6413 | */ | 6688 | */ |
6414 | static void migrate_task_rq_fair(struct task_struct *p) | 6689 | static void migrate_task_rq_fair(struct task_struct *p) |
6415 | { | 6690 | { |
@@ -6738,7 +7013,7 @@ simple: | |||
6738 | 7013 | ||
6739 | p = task_of(se); | 7014 | p = task_of(se); |
6740 | 7015 | ||
6741 | done: __maybe_unused | 7016 | done: __maybe_unused; |
6742 | #ifdef CONFIG_SMP | 7017 | #ifdef CONFIG_SMP |
6743 | /* | 7018 | /* |
6744 | * Move the next running task to the front of | 7019 | * Move the next running task to the front of |
@@ -6843,17 +7118,17 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preemp | |||
6843 | * BASICS | 7118 | * BASICS |
6844 | * | 7119 | * |
6845 | * The purpose of load-balancing is to achieve the same basic fairness the | 7120 | * The purpose of load-balancing is to achieve the same basic fairness the |
6846 | * per-cpu scheduler provides, namely provide a proportional amount of compute | 7121 | * per-CPU scheduler provides, namely provide a proportional amount of compute |
6847 | * time to each task. This is expressed in the following equation: | 7122 | * time to each task. This is expressed in the following equation: |
6848 | * | 7123 | * |
6849 | * W_i,n/P_i == W_j,n/P_j for all i,j (1) | 7124 | * W_i,n/P_i == W_j,n/P_j for all i,j (1) |
6850 | * | 7125 | * |
6851 | * Where W_i,n is the n-th weight average for cpu i. The instantaneous weight | 7126 | * Where W_i,n is the n-th weight average for CPU i. The instantaneous weight |
6852 | * W_i,0 is defined as: | 7127 | * W_i,0 is defined as: |
6853 | * | 7128 | * |
6854 | * W_i,0 = \Sum_j w_i,j (2) | 7129 | * W_i,0 = \Sum_j w_i,j (2) |
6855 | * | 7130 | * |
6856 | * Where w_i,j is the weight of the j-th runnable task on cpu i. This weight | 7131 | * Where w_i,j is the weight of the j-th runnable task on CPU i. This weight |
6857 | * is derived from the nice value as per sched_prio_to_weight[]. | 7132 | * is derived from the nice value as per sched_prio_to_weight[]. |
6858 | * | 7133 | * |
6859 | * The weight average is an exponential decay average of the instantaneous | 7134 | * The weight average is an exponential decay average of the instantaneous |
@@ -6861,7 +7136,7 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preemp | |||
6861 | * | 7136 | * |
6862 | * W'_i,n = (2^n - 1) / 2^n * W_i,n + 1 / 2^n * W_i,0 (3) | 7137 | * W'_i,n = (2^n - 1) / 2^n * W_i,n + 1 / 2^n * W_i,0 (3) |
6863 | * | 7138 | * |
6864 | * C_i is the compute capacity of cpu i, typically it is the | 7139 | * C_i is the compute capacity of CPU i, typically it is the |
6865 | * fraction of 'recent' time available for SCHED_OTHER task execution. But it | 7140 | * fraction of 'recent' time available for SCHED_OTHER task execution. But it |
6866 | * can also include other factors [XXX]. | 7141 | * can also include other factors [XXX]. |
6867 | * | 7142 | * |
@@ -6882,11 +7157,11 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preemp | |||
6882 | * SCHED DOMAINS | 7157 | * SCHED DOMAINS |
6883 | * | 7158 | * |
6884 | * In order to solve the imbalance equation (4), and avoid the obvious O(n^2) | 7159 | * In order to solve the imbalance equation (4), and avoid the obvious O(n^2) |
6885 | * for all i,j solution, we create a tree of cpus that follows the hardware | 7160 | * for all i,j solution, we create a tree of CPUs that follows the hardware |
6886 | * topology where each level pairs two lower groups (or better). This results | 7161 | * topology where each level pairs two lower groups (or better). This results |
6887 | * in O(log n) layers. Furthermore we reduce the number of cpus going up the | 7162 | * in O(log n) layers. Furthermore we reduce the number of CPUs going up the |
6888 | * tree to only the first of the previous level and we decrease the frequency | 7163 | * tree to only the first of the previous level and we decrease the frequency |
6889 | * of load-balance at each level inv. proportional to the number of cpus in | 7164 | * of load-balance at each level inv. proportional to the number of CPUs in |
6890 | * the groups. | 7165 | * the groups. |
6891 | * | 7166 | * |
6892 | * This yields: | 7167 | * This yields: |
@@ -6895,7 +7170,7 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preemp | |||
6895 | * \Sum { --- * --- * 2^i } = O(n) (5) | 7170 | * \Sum { --- * --- * 2^i } = O(n) (5) |
6896 | * i = 0 2^i 2^i | 7171 | * i = 0 2^i 2^i |
6897 | * `- size of each group | 7172 | * `- size of each group |
6898 | * | | `- number of cpus doing load-balance | 7173 | * | | `- number of CPUs doing load-balance |
6899 | * | `- freq | 7174 | * | `- freq |
6900 | * `- sum over all levels | 7175 | * `- sum over all levels |
6901 | * | 7176 | * |
@@ -6903,7 +7178,7 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preemp | |||
6903 | * this makes (5) the runtime complexity of the balancer. | 7178 | * this makes (5) the runtime complexity of the balancer. |
6904 | * | 7179 | * |
6905 | * An important property here is that each CPU is still (indirectly) connected | 7180 | * An important property here is that each CPU is still (indirectly) connected |
6906 | * to every other cpu in at most O(log n) steps: | 7181 | * to every other CPU in at most O(log n) steps: |
6907 | * | 7182 | * |
6908 | * The adjacency matrix of the resulting graph is given by: | 7183 | * The adjacency matrix of the resulting graph is given by: |
6909 | * | 7184 | * |
@@ -6915,7 +7190,7 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preemp | |||
6915 | * | 7190 | * |
6916 | * A^(log_2 n)_i,j != 0 for all i,j (7) | 7191 | * A^(log_2 n)_i,j != 0 for all i,j (7) |
6917 | * | 7192 | * |
6918 | * Showing there's indeed a path between every cpu in at most O(log n) steps. | 7193 | * Showing there's indeed a path between every CPU in at most O(log n) steps. |
6919 | * The task movement gives a factor of O(m), giving a convergence complexity | 7194 | * The task movement gives a factor of O(m), giving a convergence complexity |
6920 | * of: | 7195 | * of: |
6921 | * | 7196 | * |
@@ -6925,7 +7200,7 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preemp | |||
6925 | * WORK CONSERVING | 7200 | * WORK CONSERVING |
6926 | * | 7201 | * |
6927 | * In order to avoid CPUs going idle while there's still work to do, new idle | 7202 | * In order to avoid CPUs going idle while there's still work to do, new idle |
6928 | * balancing is more aggressive and has the newly idle cpu iterate up the domain | 7203 | * balancing is more aggressive and has the newly idle CPU iterate up the domain |
6929 | * tree itself instead of relying on other CPUs to bring it work. | 7204 | * tree itself instead of relying on other CPUs to bring it work. |
6930 | * | 7205 | * |
6931 | * This adds some complexity to both (5) and (8) but it reduces the total idle | 7206 | * This adds some complexity to both (5) and (8) but it reduces the total idle |
@@ -6946,7 +7221,7 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preemp | |||
6946 | * | 7221 | * |
6947 | * s_k,i = \Sum_j w_i,j,k and S_k = \Sum_i s_k,i (10) | 7222 | * s_k,i = \Sum_j w_i,j,k and S_k = \Sum_i s_k,i (10) |
6948 | * | 7223 | * |
6949 | * w_i,j,k is the weight of the j-th runnable task in the k-th cgroup on cpu i. | 7224 | * w_i,j,k is the weight of the j-th runnable task in the k-th cgroup on CPU i. |
6950 | * | 7225 | * |
6951 | * The big problem is S_k, it's a global sum needed to compute a local (W_i) | 7226 | * The big problem is S_k, it's a global sum needed to compute a local (W_i) |
6952 | * property. | 7227 | * property. |
@@ -6963,6 +7238,8 @@ enum fbq_type { regular, remote, all }; | |||
6963 | #define LBF_NEED_BREAK 0x02 | 7238 | #define LBF_NEED_BREAK 0x02 |
6964 | #define LBF_DST_PINNED 0x04 | 7239 | #define LBF_DST_PINNED 0x04 |
6965 | #define LBF_SOME_PINNED 0x08 | 7240 | #define LBF_SOME_PINNED 0x08 |
7241 | #define LBF_NOHZ_STATS 0x10 | ||
7242 | #define LBF_NOHZ_AGAIN 0x20 | ||
6966 | 7243 | ||
6967 | struct lb_env { | 7244 | struct lb_env { |
6968 | struct sched_domain *sd; | 7245 | struct sched_domain *sd; |
@@ -7110,7 +7387,7 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) | |||
7110 | env->flags |= LBF_SOME_PINNED; | 7387 | env->flags |= LBF_SOME_PINNED; |
7111 | 7388 | ||
7112 | /* | 7389 | /* |
7113 | * Remember if this task can be migrated to any other cpu in | 7390 | * Remember if this task can be migrated to any other CPU in |
7114 | * our sched_group. We may want to revisit it if we couldn't | 7391 | * our sched_group. We may want to revisit it if we couldn't |
7115 | * meet load balance goals by pulling other tasks on src_cpu. | 7392 | * meet load balance goals by pulling other tasks on src_cpu. |
7116 | * | 7393 | * |
@@ -7120,7 +7397,7 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) | |||
7120 | if (env->idle == CPU_NEWLY_IDLE || (env->flags & LBF_DST_PINNED)) | 7397 | if (env->idle == CPU_NEWLY_IDLE || (env->flags & LBF_DST_PINNED)) |
7121 | return 0; | 7398 | return 0; |
7122 | 7399 | ||
7123 | /* Prevent to re-select dst_cpu via env's cpus */ | 7400 | /* Prevent to re-select dst_cpu via env's CPUs: */ |
7124 | for_each_cpu_and(cpu, env->dst_grpmask, env->cpus) { | 7401 | for_each_cpu_and(cpu, env->dst_grpmask, env->cpus) { |
7125 | if (cpumask_test_cpu(cpu, &p->cpus_allowed)) { | 7402 | if (cpumask_test_cpu(cpu, &p->cpus_allowed)) { |
7126 | env->flags |= LBF_DST_PINNED; | 7403 | env->flags |= LBF_DST_PINNED; |
@@ -7347,6 +7624,17 @@ static void attach_tasks(struct lb_env *env) | |||
7347 | rq_unlock(env->dst_rq, &rf); | 7624 | rq_unlock(env->dst_rq, &rf); |
7348 | } | 7625 | } |
7349 | 7626 | ||
7627 | static inline bool cfs_rq_has_blocked(struct cfs_rq *cfs_rq) | ||
7628 | { | ||
7629 | if (cfs_rq->avg.load_avg) | ||
7630 | return true; | ||
7631 | |||
7632 | if (cfs_rq->avg.util_avg) | ||
7633 | return true; | ||
7634 | |||
7635 | return false; | ||
7636 | } | ||
7637 | |||
7350 | #ifdef CONFIG_FAIR_GROUP_SCHED | 7638 | #ifdef CONFIG_FAIR_GROUP_SCHED |
7351 | 7639 | ||
7352 | static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq) | 7640 | static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq) |
@@ -7371,6 +7659,7 @@ static void update_blocked_averages(int cpu) | |||
7371 | struct rq *rq = cpu_rq(cpu); | 7659 | struct rq *rq = cpu_rq(cpu); |
7372 | struct cfs_rq *cfs_rq, *pos; | 7660 | struct cfs_rq *cfs_rq, *pos; |
7373 | struct rq_flags rf; | 7661 | struct rq_flags rf; |
7662 | bool done = true; | ||
7374 | 7663 | ||
7375 | rq_lock_irqsave(rq, &rf); | 7664 | rq_lock_irqsave(rq, &rf); |
7376 | update_rq_clock(rq); | 7665 | update_rq_clock(rq); |
@@ -7400,7 +7689,17 @@ static void update_blocked_averages(int cpu) | |||
7400 | */ | 7689 | */ |
7401 | if (cfs_rq_is_decayed(cfs_rq)) | 7690 | if (cfs_rq_is_decayed(cfs_rq)) |
7402 | list_del_leaf_cfs_rq(cfs_rq); | 7691 | list_del_leaf_cfs_rq(cfs_rq); |
7692 | |||
7693 | /* Don't need periodic decay once load/util_avg are null */ | ||
7694 | if (cfs_rq_has_blocked(cfs_rq)) | ||
7695 | done = false; | ||
7403 | } | 7696 | } |
7697 | |||
7698 | #ifdef CONFIG_NO_HZ_COMMON | ||
7699 | rq->last_blocked_load_update_tick = jiffies; | ||
7700 | if (done) | ||
7701 | rq->has_blocked_load = 0; | ||
7702 | #endif | ||
7404 | rq_unlock_irqrestore(rq, &rf); | 7703 | rq_unlock_irqrestore(rq, &rf); |
7405 | } | 7704 | } |
7406 | 7705 | ||
@@ -7460,6 +7759,11 @@ static inline void update_blocked_averages(int cpu) | |||
7460 | rq_lock_irqsave(rq, &rf); | 7759 | rq_lock_irqsave(rq, &rf); |
7461 | update_rq_clock(rq); | 7760 | update_rq_clock(rq); |
7462 | update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq); | 7761 | update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq); |
7762 | #ifdef CONFIG_NO_HZ_COMMON | ||
7763 | rq->last_blocked_load_update_tick = jiffies; | ||
7764 | if (!cfs_rq_has_blocked(cfs_rq)) | ||
7765 | rq->has_blocked_load = 0; | ||
7766 | #endif | ||
7463 | rq_unlock_irqrestore(rq, &rf); | 7767 | rq_unlock_irqrestore(rq, &rf); |
7464 | } | 7768 | } |
7465 | 7769 | ||
@@ -7694,8 +7998,8 @@ check_cpu_capacity(struct rq *rq, struct sched_domain *sd) | |||
7694 | * Group imbalance indicates (and tries to solve) the problem where balancing | 7998 | * Group imbalance indicates (and tries to solve) the problem where balancing |
7695 | * groups is inadequate due to ->cpus_allowed constraints. | 7999 | * groups is inadequate due to ->cpus_allowed constraints. |
7696 | * | 8000 | * |
7697 | * Imagine a situation of two groups of 4 cpus each and 4 tasks each with a | 8001 | * Imagine a situation of two groups of 4 CPUs each and 4 tasks each with a |
7698 | * cpumask covering 1 cpu of the first group and 3 cpus of the second group. | 8002 | * cpumask covering 1 CPU of the first group and 3 CPUs of the second group. |
7699 | * Something like: | 8003 | * Something like: |
7700 | * | 8004 | * |
7701 | * { 0 1 2 3 } { 4 5 6 7 } | 8005 | * { 0 1 2 3 } { 4 5 6 7 } |
@@ -7703,7 +8007,7 @@ check_cpu_capacity(struct rq *rq, struct sched_domain *sd) | |||
7703 | * | 8007 | * |
7704 | * If we were to balance group-wise we'd place two tasks in the first group and | 8008 | * If we were to balance group-wise we'd place two tasks in the first group and |
7705 | * two tasks in the second group. Clearly this is undesired as it will overload | 8009 | * two tasks in the second group. Clearly this is undesired as it will overload |
7706 | * cpu 3 and leave one of the cpus in the second group unused. | 8010 | * cpu 3 and leave one of the CPUs in the second group unused. |
7707 | * | 8011 | * |
7708 | * The current solution to this issue is detecting the skew in the first group | 8012 | * The current solution to this issue is detecting the skew in the first group |
7709 | * by noticing the lower domain failed to reach balance and had difficulty | 8013 | * by noticing the lower domain failed to reach balance and had difficulty |
@@ -7794,6 +8098,28 @@ group_type group_classify(struct sched_group *group, | |||
7794 | return group_other; | 8098 | return group_other; |
7795 | } | 8099 | } |
7796 | 8100 | ||
8101 | static bool update_nohz_stats(struct rq *rq, bool force) | ||
8102 | { | ||
8103 | #ifdef CONFIG_NO_HZ_COMMON | ||
8104 | unsigned int cpu = rq->cpu; | ||
8105 | |||
8106 | if (!rq->has_blocked_load) | ||
8107 | return false; | ||
8108 | |||
8109 | if (!cpumask_test_cpu(cpu, nohz.idle_cpus_mask)) | ||
8110 | return false; | ||
8111 | |||
8112 | if (!force && !time_after(jiffies, rq->last_blocked_load_update_tick)) | ||
8113 | return true; | ||
8114 | |||
8115 | update_blocked_averages(cpu); | ||
8116 | |||
8117 | return rq->has_blocked_load; | ||
8118 | #else | ||
8119 | return false; | ||
8120 | #endif | ||
8121 | } | ||
8122 | |||
7797 | /** | 8123 | /** |
7798 | * update_sg_lb_stats - Update sched_group's statistics for load balancing. | 8124 | * update_sg_lb_stats - Update sched_group's statistics for load balancing. |
7799 | * @env: The load balancing environment. | 8125 | * @env: The load balancing environment. |
@@ -7816,7 +8142,10 @@ static inline void update_sg_lb_stats(struct lb_env *env, | |||
7816 | for_each_cpu_and(i, sched_group_span(group), env->cpus) { | 8142 | for_each_cpu_and(i, sched_group_span(group), env->cpus) { |
7817 | struct rq *rq = cpu_rq(i); | 8143 | struct rq *rq = cpu_rq(i); |
7818 | 8144 | ||
7819 | /* Bias balancing toward cpus of our domain */ | 8145 | if ((env->flags & LBF_NOHZ_STATS) && update_nohz_stats(rq, false)) |
8146 | env->flags |= LBF_NOHZ_AGAIN; | ||
8147 | |||
8148 | /* Bias balancing toward CPUs of our domain: */ | ||
7820 | if (local_group) | 8149 | if (local_group) |
7821 | load = target_load(i, load_idx); | 8150 | load = target_load(i, load_idx); |
7822 | else | 8151 | else |
@@ -7902,7 +8231,7 @@ asym_packing: | |||
7902 | if (!(env->sd->flags & SD_ASYM_PACKING)) | 8231 | if (!(env->sd->flags & SD_ASYM_PACKING)) |
7903 | return true; | 8232 | return true; |
7904 | 8233 | ||
7905 | /* No ASYM_PACKING if target cpu is already busy */ | 8234 | /* No ASYM_PACKING if target CPU is already busy */ |
7906 | if (env->idle == CPU_NOT_IDLE) | 8235 | if (env->idle == CPU_NOT_IDLE) |
7907 | return true; | 8236 | return true; |
7908 | /* | 8237 | /* |
@@ -7915,7 +8244,7 @@ asym_packing: | |||
7915 | if (!sds->busiest) | 8244 | if (!sds->busiest) |
7916 | return true; | 8245 | return true; |
7917 | 8246 | ||
7918 | /* Prefer to move from lowest priority cpu's work */ | 8247 | /* Prefer to move from lowest priority CPU's work */ |
7919 | if (sched_asym_prefer(sds->busiest->asym_prefer_cpu, | 8248 | if (sched_asym_prefer(sds->busiest->asym_prefer_cpu, |
7920 | sg->asym_prefer_cpu)) | 8249 | sg->asym_prefer_cpu)) |
7921 | return true; | 8250 | return true; |
@@ -7971,6 +8300,11 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd | |||
7971 | if (child && child->flags & SD_PREFER_SIBLING) | 8300 | if (child && child->flags & SD_PREFER_SIBLING) |
7972 | prefer_sibling = 1; | 8301 | prefer_sibling = 1; |
7973 | 8302 | ||
8303 | #ifdef CONFIG_NO_HZ_COMMON | ||
8304 | if (env->idle == CPU_NEWLY_IDLE && READ_ONCE(nohz.has_blocked)) | ||
8305 | env->flags |= LBF_NOHZ_STATS; | ||
8306 | #endif | ||
8307 | |||
7974 | load_idx = get_sd_load_idx(env->sd, env->idle); | 8308 | load_idx = get_sd_load_idx(env->sd, env->idle); |
7975 | 8309 | ||
7976 | do { | 8310 | do { |
@@ -8024,6 +8358,15 @@ next_group: | |||
8024 | sg = sg->next; | 8358 | sg = sg->next; |
8025 | } while (sg != env->sd->groups); | 8359 | } while (sg != env->sd->groups); |
8026 | 8360 | ||
8361 | #ifdef CONFIG_NO_HZ_COMMON | ||
8362 | if ((env->flags & LBF_NOHZ_AGAIN) && | ||
8363 | cpumask_subset(nohz.idle_cpus_mask, sched_domain_span(env->sd))) { | ||
8364 | |||
8365 | WRITE_ONCE(nohz.next_blocked, | ||
8366 | jiffies + msecs_to_jiffies(LOAD_AVG_PERIOD)); | ||
8367 | } | ||
8368 | #endif | ||
8369 | |||
8027 | if (env->sd->flags & SD_NUMA) | 8370 | if (env->sd->flags & SD_NUMA) |
8028 | env->fbq_type = fbq_classify_group(&sds->busiest_stat); | 8371 | env->fbq_type = fbq_classify_group(&sds->busiest_stat); |
8029 | 8372 | ||
@@ -8168,7 +8511,7 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s | |||
8168 | if (busiest->group_type == group_imbalanced) { | 8511 | if (busiest->group_type == group_imbalanced) { |
8169 | /* | 8512 | /* |
8170 | * In the group_imb case we cannot rely on group-wide averages | 8513 | * In the group_imb case we cannot rely on group-wide averages |
8171 | * to ensure cpu-load equilibrium, look at wider averages. XXX | 8514 | * to ensure CPU-load equilibrium, look at wider averages. XXX |
8172 | */ | 8515 | */ |
8173 | busiest->load_per_task = | 8516 | busiest->load_per_task = |
8174 | min(busiest->load_per_task, sds->avg_load); | 8517 | min(busiest->load_per_task, sds->avg_load); |
@@ -8187,7 +8530,7 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s | |||
8187 | } | 8530 | } |
8188 | 8531 | ||
8189 | /* | 8532 | /* |
8190 | * If there aren't any idle cpus, avoid creating some. | 8533 | * If there aren't any idle CPUs, avoid creating some. |
8191 | */ | 8534 | */ |
8192 | if (busiest->group_type == group_overloaded && | 8535 | if (busiest->group_type == group_overloaded && |
8193 | local->group_type == group_overloaded) { | 8536 | local->group_type == group_overloaded) { |
@@ -8201,9 +8544,9 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s | |||
8201 | } | 8544 | } |
8202 | 8545 | ||
8203 | /* | 8546 | /* |
8204 | * We're trying to get all the cpus to the average_load, so we don't | 8547 | * We're trying to get all the CPUs to the average_load, so we don't |
8205 | * want to push ourselves above the average load, nor do we wish to | 8548 | * want to push ourselves above the average load, nor do we wish to |
8206 | * reduce the max loaded cpu below the average load. At the same time, | 8549 | * reduce the max loaded CPU below the average load. At the same time, |
8207 | * we also don't want to reduce the group load below the group | 8550 | * we also don't want to reduce the group load below the group |
8208 | * capacity. Thus we look for the minimum possible imbalance. | 8551 | * capacity. Thus we look for the minimum possible imbalance. |
8209 | */ | 8552 | */ |
@@ -8297,9 +8640,9 @@ static struct sched_group *find_busiest_group(struct lb_env *env) | |||
8297 | 8640 | ||
8298 | if (env->idle == CPU_IDLE) { | 8641 | if (env->idle == CPU_IDLE) { |
8299 | /* | 8642 | /* |
8300 | * This cpu is idle. If the busiest group is not overloaded | 8643 | * This CPU is idle. If the busiest group is not overloaded |
8301 | * and there is no imbalance between this and busiest group | 8644 | * and there is no imbalance between this and busiest group |
8302 | * wrt idle cpus, it is balanced. The imbalance becomes | 8645 | * wrt idle CPUs, it is balanced. The imbalance becomes |
8303 | * significant if the diff is greater than 1 otherwise we | 8646 | * significant if the diff is greater than 1 otherwise we |
8304 | * might end up to just move the imbalance on another group | 8647 | * might end up to just move the imbalance on another group |
8305 | */ | 8648 | */ |
@@ -8327,7 +8670,7 @@ out_balanced: | |||
8327 | } | 8670 | } |
8328 | 8671 | ||
8329 | /* | 8672 | /* |
8330 | * find_busiest_queue - find the busiest runqueue among the cpus in group. | 8673 | * find_busiest_queue - find the busiest runqueue among the CPUs in the group. |
8331 | */ | 8674 | */ |
8332 | static struct rq *find_busiest_queue(struct lb_env *env, | 8675 | static struct rq *find_busiest_queue(struct lb_env *env, |
8333 | struct sched_group *group) | 8676 | struct sched_group *group) |
@@ -8371,7 +8714,7 @@ static struct rq *find_busiest_queue(struct lb_env *env, | |||
8371 | 8714 | ||
8372 | /* | 8715 | /* |
8373 | * When comparing with imbalance, use weighted_cpuload() | 8716 | * When comparing with imbalance, use weighted_cpuload() |
8374 | * which is not scaled with the cpu capacity. | 8717 | * which is not scaled with the CPU capacity. |
8375 | */ | 8718 | */ |
8376 | 8719 | ||
8377 | if (rq->nr_running == 1 && wl > env->imbalance && | 8720 | if (rq->nr_running == 1 && wl > env->imbalance && |
@@ -8379,9 +8722,9 @@ static struct rq *find_busiest_queue(struct lb_env *env, | |||
8379 | continue; | 8722 | continue; |
8380 | 8723 | ||
8381 | /* | 8724 | /* |
8382 | * For the load comparisons with the other cpu's, consider | 8725 | * For the load comparisons with the other CPU's, consider |
8383 | * the weighted_cpuload() scaled with the cpu capacity, so | 8726 | * the weighted_cpuload() scaled with the CPU capacity, so |
8384 | * that the load can be moved away from the cpu that is | 8727 | * that the load can be moved away from the CPU that is |
8385 | * potentially running at a lower capacity. | 8728 | * potentially running at a lower capacity. |
8386 | * | 8729 | * |
8387 | * Thus we're looking for max(wl_i / capacity_i), crosswise | 8730 | * Thus we're looking for max(wl_i / capacity_i), crosswise |
@@ -8452,13 +8795,13 @@ static int should_we_balance(struct lb_env *env) | |||
8452 | return 0; | 8795 | return 0; |
8453 | 8796 | ||
8454 | /* | 8797 | /* |
8455 | * In the newly idle case, we will allow all the cpu's | 8798 | * In the newly idle case, we will allow all the CPUs |
8456 | * to do the newly idle load balance. | 8799 | * to do the newly idle load balance. |
8457 | */ | 8800 | */ |
8458 | if (env->idle == CPU_NEWLY_IDLE) | 8801 | if (env->idle == CPU_NEWLY_IDLE) |
8459 | return 1; | 8802 | return 1; |
8460 | 8803 | ||
8461 | /* Try to find first idle cpu */ | 8804 | /* Try to find first idle CPU */ |
8462 | for_each_cpu_and(cpu, group_balance_mask(sg), env->cpus) { | 8805 | for_each_cpu_and(cpu, group_balance_mask(sg), env->cpus) { |
8463 | if (!idle_cpu(cpu)) | 8806 | if (!idle_cpu(cpu)) |
8464 | continue; | 8807 | continue; |
@@ -8471,7 +8814,7 @@ static int should_we_balance(struct lb_env *env) | |||
8471 | balance_cpu = group_balance_cpu(sg); | 8814 | balance_cpu = group_balance_cpu(sg); |
8472 | 8815 | ||
8473 | /* | 8816 | /* |
8474 | * First idle cpu or the first cpu(busiest) in this sched group | 8817 | * First idle CPU or the first CPU(busiest) in this sched group |
8475 | * is eligible for doing load balancing at this and above domains. | 8818 | * is eligible for doing load balancing at this and above domains. |
8476 | */ | 8819 | */ |
8477 | return balance_cpu == env->dst_cpu; | 8820 | return balance_cpu == env->dst_cpu; |
@@ -8580,7 +8923,7 @@ more_balance: | |||
8580 | * Revisit (affine) tasks on src_cpu that couldn't be moved to | 8923 | * Revisit (affine) tasks on src_cpu that couldn't be moved to |
8581 | * us and move them to an alternate dst_cpu in our sched_group | 8924 | * us and move them to an alternate dst_cpu in our sched_group |
8582 | * where they can run. The upper limit on how many times we | 8925 | * where they can run. The upper limit on how many times we |
8583 | * iterate on same src_cpu is dependent on number of cpus in our | 8926 | * iterate on same src_cpu is dependent on number of CPUs in our |
8584 | * sched_group. | 8927 | * sched_group. |
8585 | * | 8928 | * |
8586 | * This changes load balance semantics a bit on who can move | 8929 | * This changes load balance semantics a bit on who can move |
@@ -8597,7 +8940,7 @@ more_balance: | |||
8597 | */ | 8940 | */ |
8598 | if ((env.flags & LBF_DST_PINNED) && env.imbalance > 0) { | 8941 | if ((env.flags & LBF_DST_PINNED) && env.imbalance > 0) { |
8599 | 8942 | ||
8600 | /* Prevent to re-select dst_cpu via env's cpus */ | 8943 | /* Prevent to re-select dst_cpu via env's CPUs */ |
8601 | cpumask_clear_cpu(env.dst_cpu, env.cpus); | 8944 | cpumask_clear_cpu(env.dst_cpu, env.cpus); |
8602 | 8945 | ||
8603 | env.dst_rq = cpu_rq(env.new_dst_cpu); | 8946 | env.dst_rq = cpu_rq(env.new_dst_cpu); |
@@ -8659,9 +9002,10 @@ more_balance: | |||
8659 | 9002 | ||
8660 | raw_spin_lock_irqsave(&busiest->lock, flags); | 9003 | raw_spin_lock_irqsave(&busiest->lock, flags); |
8661 | 9004 | ||
8662 | /* don't kick the active_load_balance_cpu_stop, | 9005 | /* |
8663 | * if the curr task on busiest cpu can't be | 9006 | * Don't kick the active_load_balance_cpu_stop, |
8664 | * moved to this_cpu | 9007 | * if the curr task on busiest CPU can't be |
9008 | * moved to this_cpu: | ||
8665 | */ | 9009 | */ |
8666 | if (!cpumask_test_cpu(this_cpu, &busiest->curr->cpus_allowed)) { | 9010 | if (!cpumask_test_cpu(this_cpu, &busiest->curr->cpus_allowed)) { |
8667 | raw_spin_unlock_irqrestore(&busiest->lock, | 9011 | raw_spin_unlock_irqrestore(&busiest->lock, |
@@ -8773,121 +9117,7 @@ update_next_balance(struct sched_domain *sd, unsigned long *next_balance) | |||
8773 | } | 9117 | } |
8774 | 9118 | ||
8775 | /* | 9119 | /* |
8776 | * idle_balance is called by schedule() if this_cpu is about to become | 9120 | * active_load_balance_cpu_stop is run by the CPU stopper. It pushes |
8777 | * idle. Attempts to pull tasks from other CPUs. | ||
8778 | */ | ||
8779 | static int idle_balance(struct rq *this_rq, struct rq_flags *rf) | ||
8780 | { | ||
8781 | unsigned long next_balance = jiffies + HZ; | ||
8782 | int this_cpu = this_rq->cpu; | ||
8783 | struct sched_domain *sd; | ||
8784 | int pulled_task = 0; | ||
8785 | u64 curr_cost = 0; | ||
8786 | |||
8787 | /* | ||
8788 | * We must set idle_stamp _before_ calling idle_balance(), such that we | ||
8789 | * measure the duration of idle_balance() as idle time. | ||
8790 | */ | ||
8791 | this_rq->idle_stamp = rq_clock(this_rq); | ||
8792 | |||
8793 | /* | ||
8794 | * Do not pull tasks towards !active CPUs... | ||
8795 | */ | ||
8796 | if (!cpu_active(this_cpu)) | ||
8797 | return 0; | ||
8798 | |||
8799 | /* | ||
8800 | * This is OK, because current is on_cpu, which avoids it being picked | ||
8801 | * for load-balance and preemption/IRQs are still disabled avoiding | ||
8802 | * further scheduler activity on it and we're being very careful to | ||
8803 | * re-start the picking loop. | ||
8804 | */ | ||
8805 | rq_unpin_lock(this_rq, rf); | ||
8806 | |||
8807 | if (this_rq->avg_idle < sysctl_sched_migration_cost || | ||
8808 | !this_rq->rd->overload) { | ||
8809 | rcu_read_lock(); | ||
8810 | sd = rcu_dereference_check_sched_domain(this_rq->sd); | ||
8811 | if (sd) | ||
8812 | update_next_balance(sd, &next_balance); | ||
8813 | rcu_read_unlock(); | ||
8814 | |||
8815 | goto out; | ||
8816 | } | ||
8817 | |||
8818 | raw_spin_unlock(&this_rq->lock); | ||
8819 | |||
8820 | update_blocked_averages(this_cpu); | ||
8821 | rcu_read_lock(); | ||
8822 | for_each_domain(this_cpu, sd) { | ||
8823 | int continue_balancing = 1; | ||
8824 | u64 t0, domain_cost; | ||
8825 | |||
8826 | if (!(sd->flags & SD_LOAD_BALANCE)) | ||
8827 | continue; | ||
8828 | |||
8829 | if (this_rq->avg_idle < curr_cost + sd->max_newidle_lb_cost) { | ||
8830 | update_next_balance(sd, &next_balance); | ||
8831 | break; | ||
8832 | } | ||
8833 | |||
8834 | if (sd->flags & SD_BALANCE_NEWIDLE) { | ||
8835 | t0 = sched_clock_cpu(this_cpu); | ||
8836 | |||
8837 | pulled_task = load_balance(this_cpu, this_rq, | ||
8838 | sd, CPU_NEWLY_IDLE, | ||
8839 | &continue_balancing); | ||
8840 | |||
8841 | domain_cost = sched_clock_cpu(this_cpu) - t0; | ||
8842 | if (domain_cost > sd->max_newidle_lb_cost) | ||
8843 | sd->max_newidle_lb_cost = domain_cost; | ||
8844 | |||
8845 | curr_cost += domain_cost; | ||
8846 | } | ||
8847 | |||
8848 | update_next_balance(sd, &next_balance); | ||
8849 | |||
8850 | /* | ||
8851 | * Stop searching for tasks to pull if there are | ||
8852 | * now runnable tasks on this rq. | ||
8853 | */ | ||
8854 | if (pulled_task || this_rq->nr_running > 0) | ||
8855 | break; | ||
8856 | } | ||
8857 | rcu_read_unlock(); | ||
8858 | |||
8859 | raw_spin_lock(&this_rq->lock); | ||
8860 | |||
8861 | if (curr_cost > this_rq->max_idle_balance_cost) | ||
8862 | this_rq->max_idle_balance_cost = curr_cost; | ||
8863 | |||
8864 | /* | ||
8865 | * While browsing the domains, we released the rq lock, a task could | ||
8866 | * have been enqueued in the meantime. Since we're not going idle, | ||
8867 | * pretend we pulled a task. | ||
8868 | */ | ||
8869 | if (this_rq->cfs.h_nr_running && !pulled_task) | ||
8870 | pulled_task = 1; | ||
8871 | |||
8872 | out: | ||
8873 | /* Move the next balance forward */ | ||
8874 | if (time_after(this_rq->next_balance, next_balance)) | ||
8875 | this_rq->next_balance = next_balance; | ||
8876 | |||
8877 | /* Is there a task of a high priority class? */ | ||
8878 | if (this_rq->nr_running != this_rq->cfs.h_nr_running) | ||
8879 | pulled_task = -1; | ||
8880 | |||
8881 | if (pulled_task) | ||
8882 | this_rq->idle_stamp = 0; | ||
8883 | |||
8884 | rq_repin_lock(this_rq, rf); | ||
8885 | |||
8886 | return pulled_task; | ||
8887 | } | ||
8888 | |||
8889 | /* | ||
8890 | * active_load_balance_cpu_stop is run by cpu stopper. It pushes | ||
8891 | * running tasks off the busiest CPU onto idle CPUs. It requires at | 9121 | * running tasks off the busiest CPU onto idle CPUs. It requires at |
8892 | * least 1 task to be running on each physical CPU where possible, and | 9122 | * least 1 task to be running on each physical CPU where possible, and |
8893 | * avoids physical / logical imbalances. | 9123 | * avoids physical / logical imbalances. |
@@ -8911,7 +9141,7 @@ static int active_load_balance_cpu_stop(void *data) | |||
8911 | if (!cpu_active(busiest_cpu) || !cpu_active(target_cpu)) | 9141 | if (!cpu_active(busiest_cpu) || !cpu_active(target_cpu)) |
8912 | goto out_unlock; | 9142 | goto out_unlock; |
8913 | 9143 | ||
8914 | /* make sure the requested cpu hasn't gone down in the meantime */ | 9144 | /* Make sure the requested CPU hasn't gone down in the meantime: */ |
8915 | if (unlikely(busiest_cpu != smp_processor_id() || | 9145 | if (unlikely(busiest_cpu != smp_processor_id() || |
8916 | !busiest_rq->active_balance)) | 9146 | !busiest_rq->active_balance)) |
8917 | goto out_unlock; | 9147 | goto out_unlock; |
@@ -8923,7 +9153,7 @@ static int active_load_balance_cpu_stop(void *data) | |||
8923 | /* | 9153 | /* |
8924 | * This condition is "impossible", if it occurs | 9154 | * This condition is "impossible", if it occurs |
8925 | * we need to fix it. Originally reported by | 9155 | * we need to fix it. Originally reported by |
8926 | * Bjorn Helgaas on a 128-cpu setup. | 9156 | * Bjorn Helgaas on a 128-CPU setup. |
8927 | */ | 9157 | */ |
8928 | BUG_ON(busiest_rq == target_rq); | 9158 | BUG_ON(busiest_rq == target_rq); |
8929 | 9159 | ||
@@ -8977,141 +9207,6 @@ out_unlock: | |||
8977 | return 0; | 9207 | return 0; |
8978 | } | 9208 | } |
8979 | 9209 | ||
8980 | static inline int on_null_domain(struct rq *rq) | ||
8981 | { | ||
8982 | return unlikely(!rcu_dereference_sched(rq->sd)); | ||
8983 | } | ||
8984 | |||
8985 | #ifdef CONFIG_NO_HZ_COMMON | ||
8986 | /* | ||
8987 | * idle load balancing details | ||
8988 | * - When one of the busy CPUs notice that there may be an idle rebalancing | ||
8989 | * needed, they will kick the idle load balancer, which then does idle | ||
8990 | * load balancing for all the idle CPUs. | ||
8991 | */ | ||
8992 | static struct { | ||
8993 | cpumask_var_t idle_cpus_mask; | ||
8994 | atomic_t nr_cpus; | ||
8995 | unsigned long next_balance; /* in jiffy units */ | ||
8996 | } nohz ____cacheline_aligned; | ||
8997 | |||
8998 | static inline int find_new_ilb(void) | ||
8999 | { | ||
9000 | int ilb = cpumask_first(nohz.idle_cpus_mask); | ||
9001 | |||
9002 | if (ilb < nr_cpu_ids && idle_cpu(ilb)) | ||
9003 | return ilb; | ||
9004 | |||
9005 | return nr_cpu_ids; | ||
9006 | } | ||
9007 | |||
9008 | /* | ||
9009 | * Kick a CPU to do the nohz balancing, if it is time for it. We pick the | ||
9010 | * nohz_load_balancer CPU (if there is one) otherwise fallback to any idle | ||
9011 | * CPU (if there is one). | ||
9012 | */ | ||
9013 | static void nohz_balancer_kick(void) | ||
9014 | { | ||
9015 | int ilb_cpu; | ||
9016 | |||
9017 | nohz.next_balance++; | ||
9018 | |||
9019 | ilb_cpu = find_new_ilb(); | ||
9020 | |||
9021 | if (ilb_cpu >= nr_cpu_ids) | ||
9022 | return; | ||
9023 | |||
9024 | if (test_and_set_bit(NOHZ_BALANCE_KICK, nohz_flags(ilb_cpu))) | ||
9025 | return; | ||
9026 | /* | ||
9027 | * Use smp_send_reschedule() instead of resched_cpu(). | ||
9028 | * This way we generate a sched IPI on the target cpu which | ||
9029 | * is idle. And the softirq performing nohz idle load balance | ||
9030 | * will be run before returning from the IPI. | ||
9031 | */ | ||
9032 | smp_send_reschedule(ilb_cpu); | ||
9033 | return; | ||
9034 | } | ||
9035 | |||
9036 | void nohz_balance_exit_idle(unsigned int cpu) | ||
9037 | { | ||
9038 | if (unlikely(test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))) { | ||
9039 | /* | ||
9040 | * Completely isolated CPUs don't ever set, so we must test. | ||
9041 | */ | ||
9042 | if (likely(cpumask_test_cpu(cpu, nohz.idle_cpus_mask))) { | ||
9043 | cpumask_clear_cpu(cpu, nohz.idle_cpus_mask); | ||
9044 | atomic_dec(&nohz.nr_cpus); | ||
9045 | } | ||
9046 | clear_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)); | ||
9047 | } | ||
9048 | } | ||
9049 | |||
9050 | static inline void set_cpu_sd_state_busy(void) | ||
9051 | { | ||
9052 | struct sched_domain *sd; | ||
9053 | int cpu = smp_processor_id(); | ||
9054 | |||
9055 | rcu_read_lock(); | ||
9056 | sd = rcu_dereference(per_cpu(sd_llc, cpu)); | ||
9057 | |||
9058 | if (!sd || !sd->nohz_idle) | ||
9059 | goto unlock; | ||
9060 | sd->nohz_idle = 0; | ||
9061 | |||
9062 | atomic_inc(&sd->shared->nr_busy_cpus); | ||
9063 | unlock: | ||
9064 | rcu_read_unlock(); | ||
9065 | } | ||
9066 | |||
9067 | void set_cpu_sd_state_idle(void) | ||
9068 | { | ||
9069 | struct sched_domain *sd; | ||
9070 | int cpu = smp_processor_id(); | ||
9071 | |||
9072 | rcu_read_lock(); | ||
9073 | sd = rcu_dereference(per_cpu(sd_llc, cpu)); | ||
9074 | |||
9075 | if (!sd || sd->nohz_idle) | ||
9076 | goto unlock; | ||
9077 | sd->nohz_idle = 1; | ||
9078 | |||
9079 | atomic_dec(&sd->shared->nr_busy_cpus); | ||
9080 | unlock: | ||
9081 | rcu_read_unlock(); | ||
9082 | } | ||
9083 | |||
9084 | /* | ||
9085 | * This routine will record that the cpu is going idle with tick stopped. | ||
9086 | * This info will be used in performing idle load balancing in the future. | ||
9087 | */ | ||
9088 | void nohz_balance_enter_idle(int cpu) | ||
9089 | { | ||
9090 | /* | ||
9091 | * If this cpu is going down, then nothing needs to be done. | ||
9092 | */ | ||
9093 | if (!cpu_active(cpu)) | ||
9094 | return; | ||
9095 | |||
9096 | /* Spare idle load balancing on CPUs that don't want to be disturbed: */ | ||
9097 | if (!housekeeping_cpu(cpu, HK_FLAG_SCHED)) | ||
9098 | return; | ||
9099 | |||
9100 | if (test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu))) | ||
9101 | return; | ||
9102 | |||
9103 | /* | ||
9104 | * If we're a completely isolated CPU, we don't play. | ||
9105 | */ | ||
9106 | if (on_null_domain(cpu_rq(cpu))) | ||
9107 | return; | ||
9108 | |||
9109 | cpumask_set_cpu(cpu, nohz.idle_cpus_mask); | ||
9110 | atomic_inc(&nohz.nr_cpus); | ||
9111 | set_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)); | ||
9112 | } | ||
9113 | #endif | ||
9114 | |||
9115 | static DEFINE_SPINLOCK(balancing); | 9210 | static DEFINE_SPINLOCK(balancing); |
9116 | 9211 | ||
9117 | /* | 9212 | /* |
@@ -9141,8 +9236,6 @@ static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle) | |||
9141 | int need_serialize, need_decay = 0; | 9236 | int need_serialize, need_decay = 0; |
9142 | u64 max_cost = 0; | 9237 | u64 max_cost = 0; |
9143 | 9238 | ||
9144 | update_blocked_averages(cpu); | ||
9145 | |||
9146 | rcu_read_lock(); | 9239 | rcu_read_lock(); |
9147 | for_each_domain(cpu, sd) { | 9240 | for_each_domain(cpu, sd) { |
9148 | /* | 9241 | /* |
@@ -9232,68 +9325,56 @@ out: | |||
9232 | } | 9325 | } |
9233 | } | 9326 | } |
9234 | 9327 | ||
9328 | static inline int on_null_domain(struct rq *rq) | ||
9329 | { | ||
9330 | return unlikely(!rcu_dereference_sched(rq->sd)); | ||
9331 | } | ||
9332 | |||
9235 | #ifdef CONFIG_NO_HZ_COMMON | 9333 | #ifdef CONFIG_NO_HZ_COMMON |
9236 | /* | 9334 | /* |
9237 | * In CONFIG_NO_HZ_COMMON case, the idle balance kickee will do the | 9335 | * idle load balancing details |
9238 | * rebalancing for all the cpus for whom scheduler ticks are stopped. | 9336 | * - When one of the busy CPUs notice that there may be an idle rebalancing |
9337 | * needed, they will kick the idle load balancer, which then does idle | ||
9338 | * load balancing for all the idle CPUs. | ||
9239 | */ | 9339 | */ |
9240 | static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) | ||
9241 | { | ||
9242 | int this_cpu = this_rq->cpu; | ||
9243 | struct rq *rq; | ||
9244 | int balance_cpu; | ||
9245 | /* Earliest time when we have to do rebalance again */ | ||
9246 | unsigned long next_balance = jiffies + 60*HZ; | ||
9247 | int update_next_balance = 0; | ||
9248 | 9340 | ||
9249 | if (idle != CPU_IDLE || | 9341 | static inline int find_new_ilb(void) |
9250 | !test_bit(NOHZ_BALANCE_KICK, nohz_flags(this_cpu))) | 9342 | { |
9251 | goto end; | 9343 | int ilb = cpumask_first(nohz.idle_cpus_mask); |
9252 | 9344 | ||
9253 | for_each_cpu(balance_cpu, nohz.idle_cpus_mask) { | 9345 | if (ilb < nr_cpu_ids && idle_cpu(ilb)) |
9254 | if (balance_cpu == this_cpu || !idle_cpu(balance_cpu)) | 9346 | return ilb; |
9255 | continue; | ||
9256 | 9347 | ||
9257 | /* | 9348 | return nr_cpu_ids; |
9258 | * If this cpu gets work to do, stop the load balancing | 9349 | } |
9259 | * work being done for other cpus. Next load | ||
9260 | * balancing owner will pick it up. | ||
9261 | */ | ||
9262 | if (need_resched()) | ||
9263 | break; | ||
9264 | 9350 | ||
9265 | rq = cpu_rq(balance_cpu); | 9351 | /* |
9352 | * Kick a CPU to do the nohz balancing, if it is time for it. We pick the | ||
9353 | * nohz_load_balancer CPU (if there is one) otherwise fallback to any idle | ||
9354 | * CPU (if there is one). | ||
9355 | */ | ||
9356 | static void kick_ilb(unsigned int flags) | ||
9357 | { | ||
9358 | int ilb_cpu; | ||
9266 | 9359 | ||
9267 | /* | 9360 | nohz.next_balance++; |
9268 | * If time for next balance is due, | ||
9269 | * do the balance. | ||
9270 | */ | ||
9271 | if (time_after_eq(jiffies, rq->next_balance)) { | ||
9272 | struct rq_flags rf; | ||
9273 | 9361 | ||
9274 | rq_lock_irq(rq, &rf); | 9362 | ilb_cpu = find_new_ilb(); |
9275 | update_rq_clock(rq); | ||
9276 | cpu_load_update_idle(rq); | ||
9277 | rq_unlock_irq(rq, &rf); | ||
9278 | 9363 | ||
9279 | rebalance_domains(rq, CPU_IDLE); | 9364 | if (ilb_cpu >= nr_cpu_ids) |
9280 | } | 9365 | return; |
9281 | 9366 | ||
9282 | if (time_after(next_balance, rq->next_balance)) { | 9367 | flags = atomic_fetch_or(flags, nohz_flags(ilb_cpu)); |
9283 | next_balance = rq->next_balance; | 9368 | if (flags & NOHZ_KICK_MASK) |
9284 | update_next_balance = 1; | 9369 | return; |
9285 | } | ||
9286 | } | ||
9287 | 9370 | ||
9288 | /* | 9371 | /* |
9289 | * next_balance will be updated only when there is a need. | 9372 | * Use smp_send_reschedule() instead of resched_cpu(). |
9290 | * When the CPU is attached to null domain for ex, it will not be | 9373 | * This way we generate a sched IPI on the target CPU which |
9291 | * updated. | 9374 | * is idle. And the softirq performing nohz idle load balance |
9375 | * will be run before returning from the IPI. | ||
9292 | */ | 9376 | */ |
9293 | if (likely(update_next_balance)) | 9377 | smp_send_reschedule(ilb_cpu); |
9294 | nohz.next_balance = next_balance; | ||
9295 | end: | ||
9296 | clear_bit(NOHZ_BALANCE_KICK, nohz_flags(this_cpu)); | ||
9297 | } | 9378 | } |
9298 | 9379 | ||
9299 | /* | 9380 | /* |
@@ -9307,36 +9388,41 @@ end: | |||
9307 | * - For SD_ASYM_PACKING, if the lower numbered cpu's in the scheduler | 9388 | * - For SD_ASYM_PACKING, if the lower numbered cpu's in the scheduler |
9308 | * domain span are idle. | 9389 | * domain span are idle. |
9309 | */ | 9390 | */ |
9310 | static inline bool nohz_kick_needed(struct rq *rq) | 9391 | static void nohz_balancer_kick(struct rq *rq) |
9311 | { | 9392 | { |
9312 | unsigned long now = jiffies; | 9393 | unsigned long now = jiffies; |
9313 | struct sched_domain_shared *sds; | 9394 | struct sched_domain_shared *sds; |
9314 | struct sched_domain *sd; | 9395 | struct sched_domain *sd; |
9315 | int nr_busy, i, cpu = rq->cpu; | 9396 | int nr_busy, i, cpu = rq->cpu; |
9316 | bool kick = false; | 9397 | unsigned int flags = 0; |
9317 | 9398 | ||
9318 | if (unlikely(rq->idle_balance)) | 9399 | if (unlikely(rq->idle_balance)) |
9319 | return false; | 9400 | return; |
9320 | 9401 | ||
9321 | /* | 9402 | /* |
9322 | * We may be recently in ticked or tickless idle mode. At the first | 9403 | * We may be recently in ticked or tickless idle mode. At the first |
9323 | * busy tick after returning from idle, we will update the busy stats. | 9404 | * busy tick after returning from idle, we will update the busy stats. |
9324 | */ | 9405 | */ |
9325 | set_cpu_sd_state_busy(); | 9406 | nohz_balance_exit_idle(rq); |
9326 | nohz_balance_exit_idle(cpu); | ||
9327 | 9407 | ||
9328 | /* | 9408 | /* |
9329 | * None are in tickless mode and hence no need for NOHZ idle load | 9409 | * None are in tickless mode and hence no need for NOHZ idle load |
9330 | * balancing. | 9410 | * balancing. |
9331 | */ | 9411 | */ |
9332 | if (likely(!atomic_read(&nohz.nr_cpus))) | 9412 | if (likely(!atomic_read(&nohz.nr_cpus))) |
9333 | return false; | 9413 | return; |
9414 | |||
9415 | if (READ_ONCE(nohz.has_blocked) && | ||
9416 | time_after(now, READ_ONCE(nohz.next_blocked))) | ||
9417 | flags = NOHZ_STATS_KICK; | ||
9334 | 9418 | ||
9335 | if (time_before(now, nohz.next_balance)) | 9419 | if (time_before(now, nohz.next_balance)) |
9336 | return false; | 9420 | goto out; |
9337 | 9421 | ||
9338 | if (rq->nr_running >= 2) | 9422 | if (rq->nr_running >= 2) { |
9339 | return true; | 9423 | flags = NOHZ_KICK_MASK; |
9424 | goto out; | ||
9425 | } | ||
9340 | 9426 | ||
9341 | rcu_read_lock(); | 9427 | rcu_read_lock(); |
9342 | sds = rcu_dereference(per_cpu(sd_llc_shared, cpu)); | 9428 | sds = rcu_dereference(per_cpu(sd_llc_shared, cpu)); |
@@ -9347,7 +9433,7 @@ static inline bool nohz_kick_needed(struct rq *rq) | |||
9347 | */ | 9433 | */ |
9348 | nr_busy = atomic_read(&sds->nr_busy_cpus); | 9434 | nr_busy = atomic_read(&sds->nr_busy_cpus); |
9349 | if (nr_busy > 1) { | 9435 | if (nr_busy > 1) { |
9350 | kick = true; | 9436 | flags = NOHZ_KICK_MASK; |
9351 | goto unlock; | 9437 | goto unlock; |
9352 | } | 9438 | } |
9353 | 9439 | ||
@@ -9357,7 +9443,7 @@ static inline bool nohz_kick_needed(struct rq *rq) | |||
9357 | if (sd) { | 9443 | if (sd) { |
9358 | if ((rq->cfs.h_nr_running >= 1) && | 9444 | if ((rq->cfs.h_nr_running >= 1) && |
9359 | check_cpu_capacity(rq, sd)) { | 9445 | check_cpu_capacity(rq, sd)) { |
9360 | kick = true; | 9446 | flags = NOHZ_KICK_MASK; |
9361 | goto unlock; | 9447 | goto unlock; |
9362 | } | 9448 | } |
9363 | } | 9449 | } |
@@ -9370,18 +9456,421 @@ static inline bool nohz_kick_needed(struct rq *rq) | |||
9370 | continue; | 9456 | continue; |
9371 | 9457 | ||
9372 | if (sched_asym_prefer(i, cpu)) { | 9458 | if (sched_asym_prefer(i, cpu)) { |
9373 | kick = true; | 9459 | flags = NOHZ_KICK_MASK; |
9374 | goto unlock; | 9460 | goto unlock; |
9375 | } | 9461 | } |
9376 | } | 9462 | } |
9377 | } | 9463 | } |
9378 | unlock: | 9464 | unlock: |
9379 | rcu_read_unlock(); | 9465 | rcu_read_unlock(); |
9380 | return kick; | 9466 | out: |
9467 | if (flags) | ||
9468 | kick_ilb(flags); | ||
9469 | } | ||
9470 | |||
9471 | static void set_cpu_sd_state_busy(int cpu) | ||
9472 | { | ||
9473 | struct sched_domain *sd; | ||
9474 | |||
9475 | rcu_read_lock(); | ||
9476 | sd = rcu_dereference(per_cpu(sd_llc, cpu)); | ||
9477 | |||
9478 | if (!sd || !sd->nohz_idle) | ||
9479 | goto unlock; | ||
9480 | sd->nohz_idle = 0; | ||
9481 | |||
9482 | atomic_inc(&sd->shared->nr_busy_cpus); | ||
9483 | unlock: | ||
9484 | rcu_read_unlock(); | ||
9485 | } | ||
9486 | |||
9487 | void nohz_balance_exit_idle(struct rq *rq) | ||
9488 | { | ||
9489 | SCHED_WARN_ON(rq != this_rq()); | ||
9490 | |||
9491 | if (likely(!rq->nohz_tick_stopped)) | ||
9492 | return; | ||
9493 | |||
9494 | rq->nohz_tick_stopped = 0; | ||
9495 | cpumask_clear_cpu(rq->cpu, nohz.idle_cpus_mask); | ||
9496 | atomic_dec(&nohz.nr_cpus); | ||
9497 | |||
9498 | set_cpu_sd_state_busy(rq->cpu); | ||
9499 | } | ||
9500 | |||
9501 | static void set_cpu_sd_state_idle(int cpu) | ||
9502 | { | ||
9503 | struct sched_domain *sd; | ||
9504 | |||
9505 | rcu_read_lock(); | ||
9506 | sd = rcu_dereference(per_cpu(sd_llc, cpu)); | ||
9507 | |||
9508 | if (!sd || sd->nohz_idle) | ||
9509 | goto unlock; | ||
9510 | sd->nohz_idle = 1; | ||
9511 | |||
9512 | atomic_dec(&sd->shared->nr_busy_cpus); | ||
9513 | unlock: | ||
9514 | rcu_read_unlock(); | ||
9515 | } | ||
9516 | |||
9517 | /* | ||
9518 | * This routine will record that the CPU is going idle with tick stopped. | ||
9519 | * This info will be used in performing idle load balancing in the future. | ||
9520 | */ | ||
9521 | void nohz_balance_enter_idle(int cpu) | ||
9522 | { | ||
9523 | struct rq *rq = cpu_rq(cpu); | ||
9524 | |||
9525 | SCHED_WARN_ON(cpu != smp_processor_id()); | ||
9526 | |||
9527 | /* If this CPU is going down, then nothing needs to be done: */ | ||
9528 | if (!cpu_active(cpu)) | ||
9529 | return; | ||
9530 | |||
9531 | /* Spare idle load balancing on CPUs that don't want to be disturbed: */ | ||
9532 | if (!housekeeping_cpu(cpu, HK_FLAG_SCHED)) | ||
9533 | return; | ||
9534 | |||
9535 | /* | ||
9536 | * Can be set safely without rq->lock held | ||
9537 | * If a clear happens, it will have evaluated last additions because | ||
9538 | * rq->lock is held during the check and the clear | ||
9539 | */ | ||
9540 | rq->has_blocked_load = 1; | ||
9541 | |||
9542 | /* | ||
9543 | * The tick is still stopped but load could have been added in the | ||
9544 | * meantime. We set the nohz.has_blocked flag to trigger a check of the | ||
9545 | * *_avg. The CPU is already part of nohz.idle_cpus_mask so the clear | ||
9546 | * of nohz.has_blocked can only happen after checking the new load | ||
9547 | */ | ||
9548 | if (rq->nohz_tick_stopped) | ||
9549 | goto out; | ||
9550 | |||
9551 | /* If we're a completely isolated CPU, we don't play: */ | ||
9552 | if (on_null_domain(rq)) | ||
9553 | return; | ||
9554 | |||
9555 | rq->nohz_tick_stopped = 1; | ||
9556 | |||
9557 | cpumask_set_cpu(cpu, nohz.idle_cpus_mask); | ||
9558 | atomic_inc(&nohz.nr_cpus); | ||
9559 | |||
9560 | /* | ||
9561 | * Ensures that if nohz_idle_balance() fails to observe our | ||
9562 | * @idle_cpus_mask store, it must observe the @has_blocked | ||
9563 | * store. | ||
9564 | */ | ||
9565 | smp_mb__after_atomic(); | ||
9566 | |||
9567 | set_cpu_sd_state_idle(cpu); | ||
9568 | |||
9569 | out: | ||
9570 | /* | ||
9571 | * Each time a cpu enters idle, we assume that it has blocked load and | ||
9572 | * enable the periodic update of the load of idle cpus | ||
9573 | */ | ||
9574 | WRITE_ONCE(nohz.has_blocked, 1); | ||
9575 | } | ||
9576 | |||
9577 | /* | ||
9578 | * Internal function that runs load balance for all idle cpus. The load balance | ||
9579 | * can be a simple update of blocked load or a complete load balance with | ||
9580 | * task movement depending on flags. | ||
9581 | * The function returns false if the loop has stopped before running | ||
9582 | * through all idle CPUs. | ||
9583 | */ | ||
9584 | static bool _nohz_idle_balance(struct rq *this_rq, unsigned int flags, | ||
9585 | enum cpu_idle_type idle) | ||
9586 | { | ||
9587 | /* Earliest time when we have to do rebalance again */ | ||
9588 | unsigned long now = jiffies; | ||
9589 | unsigned long next_balance = now + 60*HZ; | ||
9590 | bool has_blocked_load = false; | ||
9591 | int update_next_balance = 0; | ||
9592 | int this_cpu = this_rq->cpu; | ||
9593 | int balance_cpu; | ||
9594 | int ret = false; | ||
9595 | struct rq *rq; | ||
9596 | |||
9597 | SCHED_WARN_ON((flags & NOHZ_KICK_MASK) == NOHZ_BALANCE_KICK); | ||
9598 | |||
9599 | /* | ||
9600 | * We assume there will be no idle load after this update and clear | ||
9601 | * the has_blocked flag. If a cpu enters idle in the mean time, it will | ||
9602 | * set the has_blocked flag and trigger another update of idle load. | ||
9603 | * Because a cpu that becomes idle, is added to idle_cpus_mask before | ||
9604 | * setting the flag, we are sure to not clear the state and not | ||
9605 | * check the load of an idle cpu. | ||
9606 | */ | ||
9607 | WRITE_ONCE(nohz.has_blocked, 0); | ||
9608 | |||
9609 | /* | ||
9610 | * Ensures that if we miss the CPU, we must see the has_blocked | ||
9611 | * store from nohz_balance_enter_idle(). | ||
9612 | */ | ||
9613 | smp_mb(); | ||
9614 | |||
9615 | for_each_cpu(balance_cpu, nohz.idle_cpus_mask) { | ||
9616 | if (balance_cpu == this_cpu || !idle_cpu(balance_cpu)) | ||
9617 | continue; | ||
9618 | |||
9619 | /* | ||
9620 | * If this CPU gets work to do, stop the load balancing | ||
9621 | * work being done for other CPUs. Next load | ||
9622 | * balancing owner will pick it up. | ||
9623 | */ | ||
9624 | if (need_resched()) { | ||
9625 | has_blocked_load = true; | ||
9626 | goto abort; | ||
9627 | } | ||
9628 | |||
9629 | rq = cpu_rq(balance_cpu); | ||
9630 | |||
9631 | has_blocked_load |= update_nohz_stats(rq, true); | ||
9632 | |||
9633 | /* | ||
9634 | * If time for next balance is due, | ||
9635 | * do the balance. | ||
9636 | */ | ||
9637 | if (time_after_eq(jiffies, rq->next_balance)) { | ||
9638 | struct rq_flags rf; | ||
9639 | |||
9640 | rq_lock_irqsave(rq, &rf); | ||
9641 | update_rq_clock(rq); | ||
9642 | cpu_load_update_idle(rq); | ||
9643 | rq_unlock_irqrestore(rq, &rf); | ||
9644 | |||
9645 | if (flags & NOHZ_BALANCE_KICK) | ||
9646 | rebalance_domains(rq, CPU_IDLE); | ||
9647 | } | ||
9648 | |||
9649 | if (time_after(next_balance, rq->next_balance)) { | ||
9650 | next_balance = rq->next_balance; | ||
9651 | update_next_balance = 1; | ||
9652 | } | ||
9653 | } | ||
9654 | |||
9655 | /* Newly idle CPU doesn't need an update */ | ||
9656 | if (idle != CPU_NEWLY_IDLE) { | ||
9657 | update_blocked_averages(this_cpu); | ||
9658 | has_blocked_load |= this_rq->has_blocked_load; | ||
9659 | } | ||
9660 | |||
9661 | if (flags & NOHZ_BALANCE_KICK) | ||
9662 | rebalance_domains(this_rq, CPU_IDLE); | ||
9663 | |||
9664 | WRITE_ONCE(nohz.next_blocked, | ||
9665 | now + msecs_to_jiffies(LOAD_AVG_PERIOD)); | ||
9666 | |||
9667 | /* The full idle balance loop has been done */ | ||
9668 | ret = true; | ||
9669 | |||
9670 | abort: | ||
9671 | /* There is still blocked load, enable periodic update */ | ||
9672 | if (has_blocked_load) | ||
9673 | WRITE_ONCE(nohz.has_blocked, 1); | ||
9674 | |||
9675 | /* | ||
9676 | * next_balance will be updated only when there is a need. | ||
9677 | * When the CPU is attached to a null domain, for example, it will not be | ||
9678 | * updated. | ||
9679 | */ | ||
9680 | if (likely(update_next_balance)) | ||
9681 | nohz.next_balance = next_balance; | ||
9682 | |||
9683 | return ret; | ||
9684 | } | ||
9685 | |||
9686 | /* | ||
9687 | * In CONFIG_NO_HZ_COMMON case, the idle balance kickee will do the | ||
9688 | * rebalancing for all the cpus for whom scheduler ticks are stopped. | ||
9689 | */ | ||
9690 | static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) | ||
9691 | { | ||
9692 | int this_cpu = this_rq->cpu; | ||
9693 | unsigned int flags; | ||
9694 | |||
9695 | if (!(atomic_read(nohz_flags(this_cpu)) & NOHZ_KICK_MASK)) | ||
9696 | return false; | ||
9697 | |||
9698 | if (idle != CPU_IDLE) { | ||
9699 | atomic_andnot(NOHZ_KICK_MASK, nohz_flags(this_cpu)); | ||
9700 | return false; | ||
9701 | } | ||
9702 | |||
9703 | /* | ||
9704 | * barrier, pairs with nohz_balance_enter_idle(), ensures ... | ||
9705 | */ | ||
9706 | flags = atomic_fetch_andnot(NOHZ_KICK_MASK, nohz_flags(this_cpu)); | ||
9707 | if (!(flags & NOHZ_KICK_MASK)) | ||
9708 | return false; | ||
9709 | |||
9710 | _nohz_idle_balance(this_rq, flags, idle); | ||
9711 | |||
9712 | return true; | ||
9713 | } | ||
9714 | |||
9715 | static void nohz_newidle_balance(struct rq *this_rq) | ||
9716 | { | ||
9717 | int this_cpu = this_rq->cpu; | ||
9718 | |||
9719 | /* | ||
9720 | * This CPU doesn't want to be disturbed by scheduler | ||
9721 | * housekeeping | ||
9722 | */ | ||
9723 | if (!housekeeping_cpu(this_cpu, HK_FLAG_SCHED)) | ||
9724 | return; | ||
9725 | |||
9726 | /* Will wake up very soon. No time for doing anything else*/ | ||
9727 | if (this_rq->avg_idle < sysctl_sched_migration_cost) | ||
9728 | return; | ||
9729 | |||
9730 | /* Don't need to update blocked load of idle CPUs*/ | ||
9731 | if (!READ_ONCE(nohz.has_blocked) || | ||
9732 | time_before(jiffies, READ_ONCE(nohz.next_blocked))) | ||
9733 | return; | ||
9734 | |||
9735 | raw_spin_unlock(&this_rq->lock); | ||
9736 | /* | ||
9737 | * This CPU is going to be idle and blocked load of idle CPUs | ||
9738 | * need to be updated. Run the ilb locally as it is a good | ||
9739 | * candidate for ilb instead of waking up another idle CPU. | ||
9740 | * Kick a normal ilb if we failed to do the update. | ||
9741 | */ | ||
9742 | if (!_nohz_idle_balance(this_rq, NOHZ_STATS_KICK, CPU_NEWLY_IDLE)) | ||
9743 | kick_ilb(NOHZ_STATS_KICK); | ||
9744 | raw_spin_lock(&this_rq->lock); | ||
9745 | } | ||
9746 | |||
9747 | #else /* !CONFIG_NO_HZ_COMMON */ | ||
9748 | static inline void nohz_balancer_kick(struct rq *rq) { } | ||
9749 | |||
9750 | static inline bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) | ||
9751 | { | ||
9752 | return false; | ||
9753 | } | ||
9754 | |||
9755 | static inline void nohz_newidle_balance(struct rq *this_rq) { } | ||
9756 | #endif /* CONFIG_NO_HZ_COMMON */ | ||
9757 | |||
9758 | /* | ||
9759 | * idle_balance is called by schedule() if this_cpu is about to become | ||
9760 | * idle. Attempts to pull tasks from other CPUs. | ||
9761 | */ | ||
9762 | static int idle_balance(struct rq *this_rq, struct rq_flags *rf) | ||
9763 | { | ||
9764 | unsigned long next_balance = jiffies + HZ; | ||
9765 | int this_cpu = this_rq->cpu; | ||
9766 | struct sched_domain *sd; | ||
9767 | int pulled_task = 0; | ||
9768 | u64 curr_cost = 0; | ||
9769 | |||
9770 | /* | ||
9771 | * We must set idle_stamp _before_ calling idle_balance(), such that we | ||
9772 | * measure the duration of idle_balance() as idle time. | ||
9773 | */ | ||
9774 | this_rq->idle_stamp = rq_clock(this_rq); | ||
9775 | |||
9776 | /* | ||
9777 | * Do not pull tasks towards !active CPUs... | ||
9778 | */ | ||
9779 | if (!cpu_active(this_cpu)) | ||
9780 | return 0; | ||
9781 | |||
9782 | /* | ||
9783 | * This is OK, because current is on_cpu, which avoids it being picked | ||
9784 | * for load-balance and preemption/IRQs are still disabled avoiding | ||
9785 | * further scheduler activity on it and we're being very careful to | ||
9786 | * re-start the picking loop. | ||
9787 | */ | ||
9788 | rq_unpin_lock(this_rq, rf); | ||
9789 | |||
9790 | if (this_rq->avg_idle < sysctl_sched_migration_cost || | ||
9791 | !this_rq->rd->overload) { | ||
9792 | |||
9793 | rcu_read_lock(); | ||
9794 | sd = rcu_dereference_check_sched_domain(this_rq->sd); | ||
9795 | if (sd) | ||
9796 | update_next_balance(sd, &next_balance); | ||
9797 | rcu_read_unlock(); | ||
9798 | |||
9799 | nohz_newidle_balance(this_rq); | ||
9800 | |||
9801 | goto out; | ||
9802 | } | ||
9803 | |||
9804 | raw_spin_unlock(&this_rq->lock); | ||
9805 | |||
9806 | update_blocked_averages(this_cpu); | ||
9807 | rcu_read_lock(); | ||
9808 | for_each_domain(this_cpu, sd) { | ||
9809 | int continue_balancing = 1; | ||
9810 | u64 t0, domain_cost; | ||
9811 | |||
9812 | if (!(sd->flags & SD_LOAD_BALANCE)) | ||
9813 | continue; | ||
9814 | |||
9815 | if (this_rq->avg_idle < curr_cost + sd->max_newidle_lb_cost) { | ||
9816 | update_next_balance(sd, &next_balance); | ||
9817 | break; | ||
9818 | } | ||
9819 | |||
9820 | if (sd->flags & SD_BALANCE_NEWIDLE) { | ||
9821 | t0 = sched_clock_cpu(this_cpu); | ||
9822 | |||
9823 | pulled_task = load_balance(this_cpu, this_rq, | ||
9824 | sd, CPU_NEWLY_IDLE, | ||
9825 | &continue_balancing); | ||
9826 | |||
9827 | domain_cost = sched_clock_cpu(this_cpu) - t0; | ||
9828 | if (domain_cost > sd->max_newidle_lb_cost) | ||
9829 | sd->max_newidle_lb_cost = domain_cost; | ||
9830 | |||
9831 | curr_cost += domain_cost; | ||
9832 | } | ||
9833 | |||
9834 | update_next_balance(sd, &next_balance); | ||
9835 | |||
9836 | /* | ||
9837 | * Stop searching for tasks to pull if there are | ||
9838 | * now runnable tasks on this rq. | ||
9839 | */ | ||
9840 | if (pulled_task || this_rq->nr_running > 0) | ||
9841 | break; | ||
9842 | } | ||
9843 | rcu_read_unlock(); | ||
9844 | |||
9845 | raw_spin_lock(&this_rq->lock); | ||
9846 | |||
9847 | if (curr_cost > this_rq->max_idle_balance_cost) | ||
9848 | this_rq->max_idle_balance_cost = curr_cost; | ||
9849 | |||
9850 | /* | ||
9851 | * While browsing the domains, we released the rq lock, a task could | ||
9852 | * have been enqueued in the meantime. Since we're not going idle, | ||
9853 | * pretend we pulled a task. | ||
9854 | */ | ||
9855 | if (this_rq->cfs.h_nr_running && !pulled_task) | ||
9856 | pulled_task = 1; | ||
9857 | |||
9858 | out: | ||
9859 | /* Move the next balance forward */ | ||
9860 | if (time_after(this_rq->next_balance, next_balance)) | ||
9861 | this_rq->next_balance = next_balance; | ||
9862 | |||
9863 | /* Is there a task of a high priority class? */ | ||
9864 | if (this_rq->nr_running != this_rq->cfs.h_nr_running) | ||
9865 | pulled_task = -1; | ||
9866 | |||
9867 | if (pulled_task) | ||
9868 | this_rq->idle_stamp = 0; | ||
9869 | |||
9870 | rq_repin_lock(this_rq, rf); | ||
9871 | |||
9872 | return pulled_task; | ||
9381 | } | 9873 | } |
9382 | #else | ||
9383 | static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) { } | ||
9384 | #endif | ||
9385 | 9874 | ||
9386 | /* | 9875 | /* |
9387 | * run_rebalance_domains is triggered when needed from the scheduler tick. | 9876 | * run_rebalance_domains is triggered when needed from the scheduler tick. |
@@ -9394,14 +9883,18 @@ static __latent_entropy void run_rebalance_domains(struct softirq_action *h) | |||
9394 | CPU_IDLE : CPU_NOT_IDLE; | 9883 | CPU_IDLE : CPU_NOT_IDLE; |
9395 | 9884 | ||
9396 | /* | 9885 | /* |
9397 | * If this cpu has a pending nohz_balance_kick, then do the | 9886 | * If this CPU has a pending nohz_balance_kick, then do the |
9398 | * balancing on behalf of the other idle cpus whose ticks are | 9887 | * balancing on behalf of the other idle CPUs whose ticks are |
9399 | * stopped. Do nohz_idle_balance *before* rebalance_domains to | 9888 | * stopped. Do nohz_idle_balance *before* rebalance_domains to |
9400 | * give the idle cpus a chance to load balance. Else we may | 9889 | * give the idle CPUs a chance to load balance. Else we may |
9401 | * load balance only within the local sched_domain hierarchy | 9890 | * load balance only within the local sched_domain hierarchy |
9402 | * and abort nohz_idle_balance altogether if we pull some load. | 9891 | * and abort nohz_idle_balance altogether if we pull some load. |
9403 | */ | 9892 | */ |
9404 | nohz_idle_balance(this_rq, idle); | 9893 | if (nohz_idle_balance(this_rq, idle)) |
9894 | return; | ||
9895 | |||
9896 | /* normal load balance */ | ||
9897 | update_blocked_averages(this_rq->cpu); | ||
9405 | rebalance_domains(this_rq, idle); | 9898 | rebalance_domains(this_rq, idle); |
9406 | } | 9899 | } |
9407 | 9900 | ||
@@ -9416,10 +9909,8 @@ void trigger_load_balance(struct rq *rq) | |||
9416 | 9909 | ||
9417 | if (time_after_eq(jiffies, rq->next_balance)) | 9910 | if (time_after_eq(jiffies, rq->next_balance)) |
9418 | raise_softirq(SCHED_SOFTIRQ); | 9911 | raise_softirq(SCHED_SOFTIRQ); |
9419 | #ifdef CONFIG_NO_HZ_COMMON | 9912 | |
9420 | if (nohz_kick_needed(rq)) | 9913 | nohz_balancer_kick(rq); |
9421 | nohz_balancer_kick(); | ||
9422 | #endif | ||
9423 | } | 9914 | } |
9424 | 9915 | ||
9425 | static void rq_online_fair(struct rq *rq) | 9916 | static void rq_online_fair(struct rq *rq) |
@@ -9440,7 +9931,12 @@ static void rq_offline_fair(struct rq *rq) | |||
9440 | #endif /* CONFIG_SMP */ | 9931 | #endif /* CONFIG_SMP */ |
9441 | 9932 | ||
9442 | /* | 9933 | /* |
9443 | * scheduler tick hitting a task of our scheduling class: | 9934 | * scheduler tick hitting a task of our scheduling class. |
9935 | * | ||
9936 | * NOTE: This function can be called remotely by the tick offload that | ||
9937 | * goes along full dynticks. Therefore no local assumption can be made | ||
9938 | * and everything must be accessed through the @rq and @curr passed in | ||
9939 | * parameters. | ||
9444 | */ | 9940 | */ |
9445 | static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued) | 9941 | static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued) |
9446 | { | 9942 | { |
@@ -9591,7 +10087,7 @@ static void attach_entity_cfs_rq(struct sched_entity *se) | |||
9591 | 10087 | ||
9592 | /* Synchronize entity with its cfs_rq */ | 10088 | /* Synchronize entity with its cfs_rq */ |
9593 | update_load_avg(cfs_rq, se, sched_feat(ATTACH_AGE_LOAD) ? 0 : SKIP_AGE_LOAD); | 10089 | update_load_avg(cfs_rq, se, sched_feat(ATTACH_AGE_LOAD) ? 0 : SKIP_AGE_LOAD); |
9594 | attach_entity_load_avg(cfs_rq, se); | 10090 | attach_entity_load_avg(cfs_rq, se, 0); |
9595 | update_tg_load_avg(cfs_rq, false); | 10091 | update_tg_load_avg(cfs_rq, false); |
9596 | propagate_entity_cfs_rq(se); | 10092 | propagate_entity_cfs_rq(se); |
9597 | } | 10093 | } |
@@ -9993,6 +10489,7 @@ __init void init_sched_fair_class(void) | |||
9993 | 10489 | ||
9994 | #ifdef CONFIG_NO_HZ_COMMON | 10490 | #ifdef CONFIG_NO_HZ_COMMON |
9995 | nohz.next_balance = jiffies; | 10491 | nohz.next_balance = jiffies; |
10492 | nohz.next_blocked = jiffies; | ||
9996 | zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT); | 10493 | zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT); |
9997 | #endif | 10494 | #endif |
9998 | #endif /* SMP */ | 10495 | #endif /* SMP */ |
diff --git a/kernel/sched/features.h b/kernel/sched/features.h index 9552fd5854bf..85ae8488039c 100644 --- a/kernel/sched/features.h +++ b/kernel/sched/features.h | |||
@@ -85,3 +85,8 @@ SCHED_FEAT(ATTACH_AGE_LOAD, true) | |||
85 | SCHED_FEAT(WA_IDLE, true) | 85 | SCHED_FEAT(WA_IDLE, true) |
86 | SCHED_FEAT(WA_WEIGHT, true) | 86 | SCHED_FEAT(WA_WEIGHT, true) |
87 | SCHED_FEAT(WA_BIAS, true) | 87 | SCHED_FEAT(WA_BIAS, true) |
88 | |||
89 | /* | ||
90 | * UtilEstimation. Use estimated CPU utilization. | ||
91 | */ | ||
92 | SCHED_FEAT(UTIL_EST, true) | ||
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index 7dae9eb8c042..2975f195e1c4 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c | |||
@@ -1,23 +1,14 @@ | |||
1 | /* | 1 | /* |
2 | * Generic entry point for the idle threads | 2 | * Generic entry points for the idle threads and |
3 | * implementation of the idle task scheduling class. | ||
4 | * | ||
5 | * (NOTE: these are not related to SCHED_IDLE batch scheduled | ||
6 | * tasks which are handled in sched/fair.c ) | ||
3 | */ | 7 | */ |
4 | #include <linux/sched.h> | 8 | #include "sched.h" |
5 | #include <linux/sched/idle.h> | ||
6 | #include <linux/cpu.h> | ||
7 | #include <linux/cpuidle.h> | ||
8 | #include <linux/cpuhotplug.h> | ||
9 | #include <linux/tick.h> | ||
10 | #include <linux/mm.h> | ||
11 | #include <linux/stackprotector.h> | ||
12 | #include <linux/suspend.h> | ||
13 | #include <linux/livepatch.h> | ||
14 | |||
15 | #include <asm/tlb.h> | ||
16 | 9 | ||
17 | #include <trace/events/power.h> | 10 | #include <trace/events/power.h> |
18 | 11 | ||
19 | #include "sched.h" | ||
20 | |||
21 | /* Linker adds these: start and end of __cpuidle functions */ | 12 | /* Linker adds these: start and end of __cpuidle functions */ |
22 | extern char __cpuidle_text_start[], __cpuidle_text_end[]; | 13 | extern char __cpuidle_text_start[], __cpuidle_text_end[]; |
23 | 14 | ||
@@ -46,6 +37,7 @@ void cpu_idle_poll_ctrl(bool enable) | |||
46 | static int __init cpu_idle_poll_setup(char *__unused) | 37 | static int __init cpu_idle_poll_setup(char *__unused) |
47 | { | 38 | { |
48 | cpu_idle_force_poll = 1; | 39 | cpu_idle_force_poll = 1; |
40 | |||
49 | return 1; | 41 | return 1; |
50 | } | 42 | } |
51 | __setup("nohlt", cpu_idle_poll_setup); | 43 | __setup("nohlt", cpu_idle_poll_setup); |
@@ -53,6 +45,7 @@ __setup("nohlt", cpu_idle_poll_setup); | |||
53 | static int __init cpu_idle_nopoll_setup(char *__unused) | 45 | static int __init cpu_idle_nopoll_setup(char *__unused) |
54 | { | 46 | { |
55 | cpu_idle_force_poll = 0; | 47 | cpu_idle_force_poll = 0; |
48 | |||
56 | return 1; | 49 | return 1; |
57 | } | 50 | } |
58 | __setup("hlt", cpu_idle_nopoll_setup); | 51 | __setup("hlt", cpu_idle_nopoll_setup); |
@@ -64,12 +57,14 @@ static noinline int __cpuidle cpu_idle_poll(void) | |||
64 | trace_cpu_idle_rcuidle(0, smp_processor_id()); | 57 | trace_cpu_idle_rcuidle(0, smp_processor_id()); |
65 | local_irq_enable(); | 58 | local_irq_enable(); |
66 | stop_critical_timings(); | 59 | stop_critical_timings(); |
60 | |||
67 | while (!tif_need_resched() && | 61 | while (!tif_need_resched() && |
68 | (cpu_idle_force_poll || tick_check_broadcast_expired())) | 62 | (cpu_idle_force_poll || tick_check_broadcast_expired())) |
69 | cpu_relax(); | 63 | cpu_relax(); |
70 | start_critical_timings(); | 64 | start_critical_timings(); |
71 | trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, smp_processor_id()); | 65 | trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, smp_processor_id()); |
72 | rcu_idle_exit(); | 66 | rcu_idle_exit(); |
67 | |||
73 | return 1; | 68 | return 1; |
74 | } | 69 | } |
75 | 70 | ||
@@ -332,8 +327,8 @@ void cpu_startup_entry(enum cpuhp_state state) | |||
332 | { | 327 | { |
333 | /* | 328 | /* |
334 | * This #ifdef needs to die, but it's too late in the cycle to | 329 | * This #ifdef needs to die, but it's too late in the cycle to |
335 | * make this generic (arm and sh have never invoked the canary | 330 | * make this generic (ARM and SH have never invoked the canary |
336 | * init for the non boot cpus!). Will be fixed in 3.11 | 331 | * init for the non boot CPUs!). Will be fixed in 3.11 |
337 | */ | 332 | */ |
338 | #ifdef CONFIG_X86 | 333 | #ifdef CONFIG_X86 |
339 | /* | 334 | /* |
@@ -350,3 +345,116 @@ void cpu_startup_entry(enum cpuhp_state state) | |||
350 | while (1) | 345 | while (1) |
351 | do_idle(); | 346 | do_idle(); |
352 | } | 347 | } |
348 | |||
349 | /* | ||
350 | * idle-task scheduling class. | ||
351 | */ | ||
352 | |||
353 | #ifdef CONFIG_SMP | ||
354 | static int | ||
355 | select_task_rq_idle(struct task_struct *p, int cpu, int sd_flag, int flags) | ||
356 | { | ||
357 | return task_cpu(p); /* IDLE tasks are never migrated */ | ||
358 | } | ||
359 | #endif | ||
360 | |||
361 | /* | ||
362 | * Idle tasks are unconditionally rescheduled: | ||
363 | */ | ||
364 | static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int flags) | ||
365 | { | ||
366 | resched_curr(rq); | ||
367 | } | ||
368 | |||
369 | static struct task_struct * | ||
370 | pick_next_task_idle(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) | ||
371 | { | ||
372 | put_prev_task(rq, prev); | ||
373 | update_idle_core(rq); | ||
374 | schedstat_inc(rq->sched_goidle); | ||
375 | |||
376 | return rq->idle; | ||
377 | } | ||
378 | |||
379 | /* | ||
380 | * It is not legal to sleep in the idle task - print a warning | ||
381 | * message if some code attempts to do it: | ||
382 | */ | ||
383 | static void | ||
384 | dequeue_task_idle(struct rq *rq, struct task_struct *p, int flags) | ||
385 | { | ||
386 | raw_spin_unlock_irq(&rq->lock); | ||
387 | printk(KERN_ERR "bad: scheduling from the idle thread!\n"); | ||
388 | dump_stack(); | ||
389 | raw_spin_lock_irq(&rq->lock); | ||
390 | } | ||
391 | |||
392 | static void put_prev_task_idle(struct rq *rq, struct task_struct *prev) | ||
393 | { | ||
394 | } | ||
395 | |||
396 | /* | ||
397 | * scheduler tick hitting a task of our scheduling class. | ||
398 | * | ||
399 | * NOTE: This function can be called remotely by the tick offload that | ||
400 | * goes along full dynticks. Therefore no local assumption can be made | ||
401 | * and everything must be accessed through the @rq and @curr passed in | ||
402 | * parameters. | ||
403 | */ | ||
404 | static void task_tick_idle(struct rq *rq, struct task_struct *curr, int queued) | ||
405 | { | ||
406 | } | ||
407 | |||
408 | static void set_curr_task_idle(struct rq *rq) | ||
409 | { | ||
410 | } | ||
411 | |||
412 | static void switched_to_idle(struct rq *rq, struct task_struct *p) | ||
413 | { | ||
414 | BUG(); | ||
415 | } | ||
416 | |||
417 | static void | ||
418 | prio_changed_idle(struct rq *rq, struct task_struct *p, int oldprio) | ||
419 | { | ||
420 | BUG(); | ||
421 | } | ||
422 | |||
423 | static unsigned int get_rr_interval_idle(struct rq *rq, struct task_struct *task) | ||
424 | { | ||
425 | return 0; | ||
426 | } | ||
427 | |||
428 | static void update_curr_idle(struct rq *rq) | ||
429 | { | ||
430 | } | ||
431 | |||
432 | /* | ||
433 | * Simple, special scheduling class for the per-CPU idle tasks: | ||
434 | */ | ||
435 | const struct sched_class idle_sched_class = { | ||
436 | /* .next is NULL */ | ||
437 | /* no enqueue/yield_task for idle tasks */ | ||
438 | |||
439 | /* dequeue is not valid, we print a debug message there: */ | ||
440 | .dequeue_task = dequeue_task_idle, | ||
441 | |||
442 | .check_preempt_curr = check_preempt_curr_idle, | ||
443 | |||
444 | .pick_next_task = pick_next_task_idle, | ||
445 | .put_prev_task = put_prev_task_idle, | ||
446 | |||
447 | #ifdef CONFIG_SMP | ||
448 | .select_task_rq = select_task_rq_idle, | ||
449 | .set_cpus_allowed = set_cpus_allowed_common, | ||
450 | #endif | ||
451 | |||
452 | .set_curr_task = set_curr_task_idle, | ||
453 | .task_tick = task_tick_idle, | ||
454 | |||
455 | .get_rr_interval = get_rr_interval_idle, | ||
456 | |||
457 | .prio_changed = prio_changed_idle, | ||
458 | .switched_to = switched_to_idle, | ||
459 | .update_curr = update_curr_idle, | ||
460 | }; | ||
diff --git a/kernel/sched/idle_task.c b/kernel/sched/idle_task.c deleted file mode 100644 index d518664cce4f..000000000000 --- a/kernel/sched/idle_task.c +++ /dev/null | |||
@@ -1,110 +0,0 @@ | |||
1 | // SPDX-License-Identifier: GPL-2.0 | ||
2 | #include "sched.h" | ||
3 | |||
4 | /* | ||
5 | * idle-task scheduling class. | ||
6 | * | ||
7 | * (NOTE: these are not related to SCHED_IDLE tasks which are | ||
8 | * handled in sched/fair.c) | ||
9 | */ | ||
10 | |||
11 | #ifdef CONFIG_SMP | ||
12 | static int | ||
13 | select_task_rq_idle(struct task_struct *p, int cpu, int sd_flag, int flags) | ||
14 | { | ||
15 | return task_cpu(p); /* IDLE tasks as never migrated */ | ||
16 | } | ||
17 | #endif /* CONFIG_SMP */ | ||
18 | |||
19 | /* | ||
20 | * Idle tasks are unconditionally rescheduled: | ||
21 | */ | ||
22 | static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int flags) | ||
23 | { | ||
24 | resched_curr(rq); | ||
25 | } | ||
26 | |||
27 | static struct task_struct * | ||
28 | pick_next_task_idle(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) | ||
29 | { | ||
30 | put_prev_task(rq, prev); | ||
31 | update_idle_core(rq); | ||
32 | schedstat_inc(rq->sched_goidle); | ||
33 | return rq->idle; | ||
34 | } | ||
35 | |||
36 | /* | ||
37 | * It is not legal to sleep in the idle task - print a warning | ||
38 | * message if some code attempts to do it: | ||
39 | */ | ||
40 | static void | ||
41 | dequeue_task_idle(struct rq *rq, struct task_struct *p, int flags) | ||
42 | { | ||
43 | raw_spin_unlock_irq(&rq->lock); | ||
44 | printk(KERN_ERR "bad: scheduling from the idle thread!\n"); | ||
45 | dump_stack(); | ||
46 | raw_spin_lock_irq(&rq->lock); | ||
47 | } | ||
48 | |||
49 | static void put_prev_task_idle(struct rq *rq, struct task_struct *prev) | ||
50 | { | ||
51 | rq_last_tick_reset(rq); | ||
52 | } | ||
53 | |||
54 | static void task_tick_idle(struct rq *rq, struct task_struct *curr, int queued) | ||
55 | { | ||
56 | } | ||
57 | |||
58 | static void set_curr_task_idle(struct rq *rq) | ||
59 | { | ||
60 | } | ||
61 | |||
62 | static void switched_to_idle(struct rq *rq, struct task_struct *p) | ||
63 | { | ||
64 | BUG(); | ||
65 | } | ||
66 | |||
67 | static void | ||
68 | prio_changed_idle(struct rq *rq, struct task_struct *p, int oldprio) | ||
69 | { | ||
70 | BUG(); | ||
71 | } | ||
72 | |||
73 | static unsigned int get_rr_interval_idle(struct rq *rq, struct task_struct *task) | ||
74 | { | ||
75 | return 0; | ||
76 | } | ||
77 | |||
78 | static void update_curr_idle(struct rq *rq) | ||
79 | { | ||
80 | } | ||
81 | |||
82 | /* | ||
83 | * Simple, special scheduling class for the per-CPU idle tasks: | ||
84 | */ | ||
85 | const struct sched_class idle_sched_class = { | ||
86 | /* .next is NULL */ | ||
87 | /* no enqueue/yield_task for idle tasks */ | ||
88 | |||
89 | /* dequeue is not valid, we print a debug message there: */ | ||
90 | .dequeue_task = dequeue_task_idle, | ||
91 | |||
92 | .check_preempt_curr = check_preempt_curr_idle, | ||
93 | |||
94 | .pick_next_task = pick_next_task_idle, | ||
95 | .put_prev_task = put_prev_task_idle, | ||
96 | |||
97 | #ifdef CONFIG_SMP | ||
98 | .select_task_rq = select_task_rq_idle, | ||
99 | .set_cpus_allowed = set_cpus_allowed_common, | ||
100 | #endif | ||
101 | |||
102 | .set_curr_task = set_curr_task_idle, | ||
103 | .task_tick = task_tick_idle, | ||
104 | |||
105 | .get_rr_interval = get_rr_interval_idle, | ||
106 | |||
107 | .prio_changed = prio_changed_idle, | ||
108 | .switched_to = switched_to_idle, | ||
109 | .update_curr = update_curr_idle, | ||
110 | }; | ||
diff --git a/kernel/sched/isolation.c b/kernel/sched/isolation.c index b71b436f59f2..e6802181900f 100644 --- a/kernel/sched/isolation.c +++ b/kernel/sched/isolation.c | |||
@@ -3,15 +3,10 @@ | |||
3 | * any CPU: unbound workqueues, timers, kthreads and any offloadable work. | 3 | * any CPU: unbound workqueues, timers, kthreads and any offloadable work. |
4 | * | 4 | * |
5 | * Copyright (C) 2017 Red Hat, Inc., Frederic Weisbecker | 5 | * Copyright (C) 2017 Red Hat, Inc., Frederic Weisbecker |
6 | * Copyright (C) 2017-2018 SUSE, Frederic Weisbecker | ||
6 | * | 7 | * |
7 | */ | 8 | */ |
8 | 9 | #include "sched.h" | |
9 | #include <linux/sched/isolation.h> | ||
10 | #include <linux/tick.h> | ||
11 | #include <linux/init.h> | ||
12 | #include <linux/kernel.h> | ||
13 | #include <linux/static_key.h> | ||
14 | #include <linux/ctype.h> | ||
15 | 10 | ||
16 | DEFINE_STATIC_KEY_FALSE(housekeeping_overriden); | 11 | DEFINE_STATIC_KEY_FALSE(housekeeping_overriden); |
17 | EXPORT_SYMBOL_GPL(housekeeping_overriden); | 12 | EXPORT_SYMBOL_GPL(housekeeping_overriden); |
@@ -60,6 +55,9 @@ void __init housekeeping_init(void) | |||
60 | 55 | ||
61 | static_branch_enable(&housekeeping_overriden); | 56 | static_branch_enable(&housekeeping_overriden); |
62 | 57 | ||
58 | if (housekeeping_flags & HK_FLAG_TICK) | ||
59 | sched_tick_offload_init(); | ||
60 | |||
63 | /* We need at least one CPU to handle housekeeping work */ | 61 | /* We need at least one CPU to handle housekeeping work */ |
64 | WARN_ON_ONCE(cpumask_empty(housekeeping_mask)); | 62 | WARN_ON_ONCE(cpumask_empty(housekeeping_mask)); |
65 | } | 63 | } |
@@ -119,7 +117,7 @@ static int __init housekeeping_nohz_full_setup(char *str) | |||
119 | { | 117 | { |
120 | unsigned int flags; | 118 | unsigned int flags; |
121 | 119 | ||
122 | flags = HK_FLAG_TICK | HK_FLAG_TIMER | HK_FLAG_RCU | HK_FLAG_MISC; | 120 | flags = HK_FLAG_TICK | HK_FLAG_WQ | HK_FLAG_TIMER | HK_FLAG_RCU | HK_FLAG_MISC; |
123 | 121 | ||
124 | return housekeeping_setup(str, flags); | 122 | return housekeeping_setup(str, flags); |
125 | } | 123 | } |
diff --git a/kernel/sched/loadavg.c b/kernel/sched/loadavg.c index 89a989e4d758..a171c1258109 100644 --- a/kernel/sched/loadavg.c +++ b/kernel/sched/loadavg.c | |||
@@ -6,10 +6,6 @@ | |||
6 | * figure. It's a silly number but people think it's important. We go through | 6 | * figure. It's a silly number but people think it's important. We go through |
7 | * great pains to make it work on big machines and tickless kernels. | 7 | * great pains to make it work on big machines and tickless kernels. |
8 | */ | 8 | */ |
9 | |||
10 | #include <linux/export.h> | ||
11 | #include <linux/sched/loadavg.h> | ||
12 | |||
13 | #include "sched.h" | 9 | #include "sched.h" |
14 | 10 | ||
15 | /* | 11 | /* |
@@ -32,29 +28,29 @@ | |||
32 | * Due to a number of reasons the above turns into the mess below: | 28 | * Due to a number of reasons the above turns into the mess below: |
33 | * | 29 | * |
34 | * - for_each_possible_cpu() is prohibitively expensive on machines with | 30 | * - for_each_possible_cpu() is prohibitively expensive on machines with |
35 | * serious number of cpus, therefore we need to take a distributed approach | 31 | * serious number of CPUs, therefore we need to take a distributed approach |
36 | * to calculating nr_active. | 32 | * to calculating nr_active. |
37 | * | 33 | * |
38 | * \Sum_i x_i(t) = \Sum_i x_i(t) - x_i(t_0) | x_i(t_0) := 0 | 34 | * \Sum_i x_i(t) = \Sum_i x_i(t) - x_i(t_0) | x_i(t_0) := 0 |
39 | * = \Sum_i { \Sum_j=1 x_i(t_j) - x_i(t_j-1) } | 35 | * = \Sum_i { \Sum_j=1 x_i(t_j) - x_i(t_j-1) } |
40 | * | 36 | * |
41 | * So assuming nr_active := 0 when we start out -- true per definition, we | 37 | * So assuming nr_active := 0 when we start out -- true per definition, we |
42 | * can simply take per-cpu deltas and fold those into a global accumulate | 38 | * can simply take per-CPU deltas and fold those into a global accumulate |
43 | * to obtain the same result. See calc_load_fold_active(). | 39 | * to obtain the same result. See calc_load_fold_active(). |
44 | * | 40 | * |
45 | * Furthermore, in order to avoid synchronizing all per-cpu delta folding | 41 | * Furthermore, in order to avoid synchronizing all per-CPU delta folding |
46 | * across the machine, we assume 10 ticks is sufficient time for every | 42 | * across the machine, we assume 10 ticks is sufficient time for every |
47 | * cpu to have completed this task. | 43 | * CPU to have completed this task. |
48 | * | 44 | * |
49 | * This places an upper-bound on the IRQ-off latency of the machine. Then | 45 | * This places an upper-bound on the IRQ-off latency of the machine. Then |
50 | * again, being late doesn't lose the delta, just wrecks the sample. | 46 | * again, being late doesn't lose the delta, just wrecks the sample. |
51 | * | 47 | * |
52 | * - cpu_rq()->nr_uninterruptible isn't accurately tracked per-cpu because | 48 | * - cpu_rq()->nr_uninterruptible isn't accurately tracked per-CPU because |
53 | * this would add another cross-cpu cacheline miss and atomic operation | 49 | * this would add another cross-CPU cacheline miss and atomic operation |
54 | * to the wakeup path. Instead we increment on whatever cpu the task ran | 50 | * to the wakeup path. Instead we increment on whatever CPU the task ran |
55 | * when it went into uninterruptible state and decrement on whatever cpu | 51 | * when it went into uninterruptible state and decrement on whatever CPU |
56 | * did the wakeup. This means that only the sum of nr_uninterruptible over | 52 | * did the wakeup. This means that only the sum of nr_uninterruptible over |
57 | * all cpus yields the correct result. | 53 | * all CPUs yields the correct result. |
58 | * | 54 | * |
59 | * This covers the NO_HZ=n code, for extra head-aches, see the comment below. | 55 | * This covers the NO_HZ=n code, for extra head-aches, see the comment below. |
60 | */ | 56 | */ |
@@ -115,11 +111,11 @@ calc_load(unsigned long load, unsigned long exp, unsigned long active) | |||
115 | * Handle NO_HZ for the global load-average. | 111 | * Handle NO_HZ for the global load-average. |
116 | * | 112 | * |
117 | * Since the above described distributed algorithm to compute the global | 113 | * Since the above described distributed algorithm to compute the global |
118 | * load-average relies on per-cpu sampling from the tick, it is affected by | 114 | * load-average relies on per-CPU sampling from the tick, it is affected by |
119 | * NO_HZ. | 115 | * NO_HZ. |
120 | * | 116 | * |
121 | * The basic idea is to fold the nr_active delta into a global NO_HZ-delta upon | 117 | * The basic idea is to fold the nr_active delta into a global NO_HZ-delta upon |
122 | * entering NO_HZ state such that we can include this as an 'extra' cpu delta | 118 | * entering NO_HZ state such that we can include this as an 'extra' CPU delta |
123 | * when we read the global state. | 119 | * when we read the global state. |
124 | * | 120 | * |
125 | * Obviously reality has to ruin such a delightfully simple scheme: | 121 | * Obviously reality has to ruin such a delightfully simple scheme: |
@@ -146,9 +142,9 @@ calc_load(unsigned long load, unsigned long exp, unsigned long active) | |||
146 | * busy state. | 142 | * busy state. |
147 | * | 143 | * |
148 | * This is solved by pushing the window forward, and thus skipping the | 144 | * This is solved by pushing the window forward, and thus skipping the |
149 | * sample, for this cpu (effectively using the NO_HZ-delta for this cpu which | 145 | * sample, for this CPU (effectively using the NO_HZ-delta for this CPU which |
150 | * was in effect at the time the window opened). This also solves the issue | 146 | * was in effect at the time the window opened). This also solves the issue |
151 | * of having to deal with a cpu having been in NO_HZ for multiple LOAD_FREQ | 147 | * of having to deal with a CPU having been in NO_HZ for multiple LOAD_FREQ |
152 | * intervals. | 148 | * intervals. |
153 | * | 149 | * |
154 | * When making the ILB scale, we should try to pull this in as well. | 150 | * When making the ILB scale, we should try to pull this in as well. |
@@ -299,7 +295,7 @@ calc_load_n(unsigned long load, unsigned long exp, | |||
299 | } | 295 | } |
300 | 296 | ||
301 | /* | 297 | /* |
302 | * NO_HZ can leave us missing all per-cpu ticks calling | 298 | * NO_HZ can leave us missing all per-CPU ticks calling |
303 | * calc_load_fold_active(), but since a NO_HZ CPU folds its delta into | 299 | * calc_load_fold_active(), but since a NO_HZ CPU folds its delta into |
304 | * calc_load_nohz per calc_load_nohz_start(), all we need to do is fold | 300 | * calc_load_nohz per calc_load_nohz_start(), all we need to do is fold |
305 | * in the pending NO_HZ delta if our NO_HZ period crossed a load cycle boundary. | 301 | * in the pending NO_HZ delta if our NO_HZ period crossed a load cycle boundary. |
@@ -363,7 +359,7 @@ void calc_global_load(unsigned long ticks) | |||
363 | return; | 359 | return; |
364 | 360 | ||
365 | /* | 361 | /* |
366 | * Fold the 'old' NO_HZ-delta to include all NO_HZ cpus. | 362 | * Fold the 'old' NO_HZ-delta to include all NO_HZ CPUs. |
367 | */ | 363 | */ |
368 | delta = calc_load_nohz_fold(); | 364 | delta = calc_load_nohz_fold(); |
369 | if (delta) | 365 | if (delta) |
diff --git a/kernel/sched/membarrier.c b/kernel/sched/membarrier.c index 5d0762633639..76e0eaf4654e 100644 --- a/kernel/sched/membarrier.c +++ b/kernel/sched/membarrier.c | |||
@@ -13,32 +13,25 @@ | |||
13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | 13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
14 | * GNU General Public License for more details. | 14 | * GNU General Public License for more details. |
15 | */ | 15 | */ |
16 | 16 | #include "sched.h" | |
17 | #include <linux/syscalls.h> | ||
18 | #include <linux/membarrier.h> | ||
19 | #include <linux/tick.h> | ||
20 | #include <linux/cpumask.h> | ||
21 | #include <linux/atomic.h> | ||
22 | |||
23 | #include "sched.h" /* for cpu_rq(). */ | ||
24 | 17 | ||
25 | /* | 18 | /* |
26 | * Bitmask made from an "or" of all commands within enum membarrier_cmd, | 19 | * Bitmask made from an "or" of all commands within enum membarrier_cmd, |
27 | * except MEMBARRIER_CMD_QUERY. | 20 | * except MEMBARRIER_CMD_QUERY. |
28 | */ | 21 | */ |
29 | #ifdef CONFIG_ARCH_HAS_MEMBARRIER_SYNC_CORE | 22 | #ifdef CONFIG_ARCH_HAS_MEMBARRIER_SYNC_CORE |
30 | #define MEMBARRIER_PRIVATE_EXPEDITED_SYNC_CORE_BITMASK \ | 23 | #define MEMBARRIER_PRIVATE_EXPEDITED_SYNC_CORE_BITMASK \ |
31 | (MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE \ | 24 | (MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE \ |
32 | | MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_SYNC_CORE) | 25 | | MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_SYNC_CORE) |
33 | #else | 26 | #else |
34 | #define MEMBARRIER_PRIVATE_EXPEDITED_SYNC_CORE_BITMASK 0 | 27 | #define MEMBARRIER_PRIVATE_EXPEDITED_SYNC_CORE_BITMASK 0 |
35 | #endif | 28 | #endif |
36 | 29 | ||
37 | #define MEMBARRIER_CMD_BITMASK \ | 30 | #define MEMBARRIER_CMD_BITMASK \ |
38 | (MEMBARRIER_CMD_GLOBAL | MEMBARRIER_CMD_GLOBAL_EXPEDITED \ | 31 | (MEMBARRIER_CMD_GLOBAL | MEMBARRIER_CMD_GLOBAL_EXPEDITED \ |
39 | | MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED \ | 32 | | MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED \ |
40 | | MEMBARRIER_CMD_PRIVATE_EXPEDITED \ | 33 | | MEMBARRIER_CMD_PRIVATE_EXPEDITED \ |
41 | | MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED \ | 34 | | MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED \ |
42 | | MEMBARRIER_PRIVATE_EXPEDITED_SYNC_CORE_BITMASK) | 35 | | MEMBARRIER_PRIVATE_EXPEDITED_SYNC_CORE_BITMASK) |
43 | 36 | ||
44 | static void ipi_mb(void *info) | 37 | static void ipi_mb(void *info) |
@@ -85,6 +78,7 @@ static int membarrier_global_expedited(void) | |||
85 | */ | 78 | */ |
86 | if (cpu == raw_smp_processor_id()) | 79 | if (cpu == raw_smp_processor_id()) |
87 | continue; | 80 | continue; |
81 | |||
88 | rcu_read_lock(); | 82 | rcu_read_lock(); |
89 | p = task_rcu_dereference(&cpu_rq(cpu)->curr); | 83 | p = task_rcu_dereference(&cpu_rq(cpu)->curr); |
90 | if (p && p->mm && (atomic_read(&p->mm->membarrier_state) & | 84 | if (p && p->mm && (atomic_read(&p->mm->membarrier_state) & |
@@ -188,6 +182,7 @@ static int membarrier_private_expedited(int flags) | |||
188 | * rq->curr modification in scheduler. | 182 | * rq->curr modification in scheduler. |
189 | */ | 183 | */ |
190 | smp_mb(); /* exit from system call is not a mb */ | 184 | smp_mb(); /* exit from system call is not a mb */ |
185 | |||
191 | return 0; | 186 | return 0; |
192 | } | 187 | } |
193 | 188 | ||
@@ -219,6 +214,7 @@ static int membarrier_register_global_expedited(void) | |||
219 | } | 214 | } |
220 | atomic_or(MEMBARRIER_STATE_GLOBAL_EXPEDITED_READY, | 215 | atomic_or(MEMBARRIER_STATE_GLOBAL_EXPEDITED_READY, |
221 | &mm->membarrier_state); | 216 | &mm->membarrier_state); |
217 | |||
222 | return 0; | 218 | return 0; |
223 | } | 219 | } |
224 | 220 | ||
@@ -253,6 +249,7 @@ static int membarrier_register_private_expedited(int flags) | |||
253 | synchronize_sched(); | 249 | synchronize_sched(); |
254 | } | 250 | } |
255 | atomic_or(state, &mm->membarrier_state); | 251 | atomic_or(state, &mm->membarrier_state); |
252 | |||
256 | return 0; | 253 | return 0; |
257 | } | 254 | } |
258 | 255 | ||
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index aad49451584e..86b77987435e 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c | |||
@@ -3,12 +3,8 @@ | |||
3 | * Real-Time Scheduling Class (mapped to the SCHED_FIFO and SCHED_RR | 3 | * Real-Time Scheduling Class (mapped to the SCHED_FIFO and SCHED_RR |
4 | * policies) | 4 | * policies) |
5 | */ | 5 | */ |
6 | |||
7 | #include "sched.h" | 6 | #include "sched.h" |
8 | 7 | ||
9 | #include <linux/slab.h> | ||
10 | #include <linux/irq_work.h> | ||
11 | |||
12 | int sched_rr_timeslice = RR_TIMESLICE; | 8 | int sched_rr_timeslice = RR_TIMESLICE; |
13 | int sysctl_sched_rr_timeslice = (MSEC_PER_SEC / HZ) * RR_TIMESLICE; | 9 | int sysctl_sched_rr_timeslice = (MSEC_PER_SEC / HZ) * RR_TIMESLICE; |
14 | 10 | ||
@@ -359,7 +355,7 @@ static DEFINE_PER_CPU(struct callback_head, rt_pull_head); | |||
359 | static void push_rt_tasks(struct rq *); | 355 | static void push_rt_tasks(struct rq *); |
360 | static void pull_rt_task(struct rq *); | 356 | static void pull_rt_task(struct rq *); |
361 | 357 | ||
362 | static inline void queue_push_tasks(struct rq *rq) | 358 | static inline void rt_queue_push_tasks(struct rq *rq) |
363 | { | 359 | { |
364 | if (!has_pushable_tasks(rq)) | 360 | if (!has_pushable_tasks(rq)) |
365 | return; | 361 | return; |
@@ -367,7 +363,7 @@ static inline void queue_push_tasks(struct rq *rq) | |||
367 | queue_balance_callback(rq, &per_cpu(rt_push_head, rq->cpu), push_rt_tasks); | 363 | queue_balance_callback(rq, &per_cpu(rt_push_head, rq->cpu), push_rt_tasks); |
368 | } | 364 | } |
369 | 365 | ||
370 | static inline void queue_pull_task(struct rq *rq) | 366 | static inline void rt_queue_pull_task(struct rq *rq) |
371 | { | 367 | { |
372 | queue_balance_callback(rq, &per_cpu(rt_pull_head, rq->cpu), pull_rt_task); | 368 | queue_balance_callback(rq, &per_cpu(rt_pull_head, rq->cpu), pull_rt_task); |
373 | } | 369 | } |
@@ -425,7 +421,7 @@ static inline void pull_rt_task(struct rq *this_rq) | |||
425 | { | 421 | { |
426 | } | 422 | } |
427 | 423 | ||
428 | static inline void queue_push_tasks(struct rq *rq) | 424 | static inline void rt_queue_push_tasks(struct rq *rq) |
429 | { | 425 | { |
430 | } | 426 | } |
431 | #endif /* CONFIG_SMP */ | 427 | #endif /* CONFIG_SMP */ |
@@ -961,9 +957,6 @@ static void update_curr_rt(struct rq *rq) | |||
961 | if (unlikely((s64)delta_exec <= 0)) | 957 | if (unlikely((s64)delta_exec <= 0)) |
962 | return; | 958 | return; |
963 | 959 | ||
964 | /* Kick cpufreq (see the comment in kernel/sched/sched.h). */ | ||
965 | cpufreq_update_util(rq, SCHED_CPUFREQ_RT); | ||
966 | |||
967 | schedstat_set(curr->se.statistics.exec_max, | 960 | schedstat_set(curr->se.statistics.exec_max, |
968 | max(curr->se.statistics.exec_max, delta_exec)); | 961 | max(curr->se.statistics.exec_max, delta_exec)); |
969 | 962 | ||
@@ -1005,6 +998,9 @@ dequeue_top_rt_rq(struct rt_rq *rt_rq) | |||
1005 | 998 | ||
1006 | sub_nr_running(rq, rt_rq->rt_nr_running); | 999 | sub_nr_running(rq, rt_rq->rt_nr_running); |
1007 | rt_rq->rt_queued = 0; | 1000 | rt_rq->rt_queued = 0; |
1001 | |||
1002 | /* Kick cpufreq (see the comment in kernel/sched/sched.h). */ | ||
1003 | cpufreq_update_util(rq, 0); | ||
1008 | } | 1004 | } |
1009 | 1005 | ||
1010 | static void | 1006 | static void |
@@ -1021,6 +1017,9 @@ enqueue_top_rt_rq(struct rt_rq *rt_rq) | |||
1021 | 1017 | ||
1022 | add_nr_running(rq, rt_rq->rt_nr_running); | 1018 | add_nr_running(rq, rt_rq->rt_nr_running); |
1023 | rt_rq->rt_queued = 1; | 1019 | rt_rq->rt_queued = 1; |
1020 | |||
1021 | /* Kick cpufreq (see the comment in kernel/sched/sched.h). */ | ||
1022 | cpufreq_update_util(rq, 0); | ||
1024 | } | 1023 | } |
1025 | 1024 | ||
1026 | #if defined CONFIG_SMP | 1025 | #if defined CONFIG_SMP |
@@ -1453,9 +1452,9 @@ static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p) | |||
1453 | return; | 1452 | return; |
1454 | 1453 | ||
1455 | /* | 1454 | /* |
1456 | * There appears to be other cpus that can accept | 1455 | * There appear to be other CPUs that can accept |
1457 | * current and none to run 'p', so let's reschedule | 1456 | * the current task but none can run 'p', so let's reschedule |
1458 | * to try and push current away: | 1457 | * to try and push the current task away: |
1459 | */ | 1458 | */ |
1460 | requeue_task_rt(rq, p, 1); | 1459 | requeue_task_rt(rq, p, 1); |
1461 | resched_curr(rq); | 1460 | resched_curr(rq); |
@@ -1569,7 +1568,7 @@ pick_next_task_rt(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) | |||
1569 | /* The running task is never eligible for pushing */ | 1568 | /* The running task is never eligible for pushing */ |
1570 | dequeue_pushable_task(rq, p); | 1569 | dequeue_pushable_task(rq, p); |
1571 | 1570 | ||
1572 | queue_push_tasks(rq); | 1571 | rt_queue_push_tasks(rq); |
1573 | 1572 | ||
1574 | return p; | 1573 | return p; |
1575 | } | 1574 | } |
@@ -1596,12 +1595,13 @@ static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu) | |||
1596 | if (!task_running(rq, p) && | 1595 | if (!task_running(rq, p) && |
1597 | cpumask_test_cpu(cpu, &p->cpus_allowed)) | 1596 | cpumask_test_cpu(cpu, &p->cpus_allowed)) |
1598 | return 1; | 1597 | return 1; |
1598 | |||
1599 | return 0; | 1599 | return 0; |
1600 | } | 1600 | } |
1601 | 1601 | ||
1602 | /* | 1602 | /* |
1603 | * Return the highest pushable rq's task, which is suitable to be executed | 1603 | * Return the highest pushable rq's task, which is suitable to be executed |
1604 | * on the cpu, NULL otherwise | 1604 | * on the CPU, NULL otherwise |
1605 | */ | 1605 | */ |
1606 | static struct task_struct *pick_highest_pushable_task(struct rq *rq, int cpu) | 1606 | static struct task_struct *pick_highest_pushable_task(struct rq *rq, int cpu) |
1607 | { | 1607 | { |
@@ -1639,11 +1639,11 @@ static int find_lowest_rq(struct task_struct *task) | |||
1639 | return -1; /* No targets found */ | 1639 | return -1; /* No targets found */ |
1640 | 1640 | ||
1641 | /* | 1641 | /* |
1642 | * At this point we have built a mask of cpus representing the | 1642 | * At this point we have built a mask of CPUs representing the |
1643 | * lowest priority tasks in the system. Now we want to elect | 1643 | * lowest priority tasks in the system. Now we want to elect |
1644 | * the best one based on our affinity and topology. | 1644 | * the best one based on our affinity and topology. |
1645 | * | 1645 | * |
1646 | * We prioritize the last cpu that the task executed on since | 1646 | * We prioritize the last CPU that the task executed on since |
1647 | * it is most likely cache-hot in that location. | 1647 | * it is most likely cache-hot in that location. |
1648 | */ | 1648 | */ |
1649 | if (cpumask_test_cpu(cpu, lowest_mask)) | 1649 | if (cpumask_test_cpu(cpu, lowest_mask)) |
@@ -1651,7 +1651,7 @@ static int find_lowest_rq(struct task_struct *task) | |||
1651 | 1651 | ||
1652 | /* | 1652 | /* |
1653 | * Otherwise, we consult the sched_domains span maps to figure | 1653 | * Otherwise, we consult the sched_domains span maps to figure |
1654 | * out which cpu is logically closest to our hot cache data. | 1654 | * out which CPU is logically closest to our hot cache data. |
1655 | */ | 1655 | */ |
1656 | if (!cpumask_test_cpu(this_cpu, lowest_mask)) | 1656 | if (!cpumask_test_cpu(this_cpu, lowest_mask)) |
1657 | this_cpu = -1; /* Skip this_cpu opt if not among lowest */ | 1657 | this_cpu = -1; /* Skip this_cpu opt if not among lowest */ |
@@ -1692,6 +1692,7 @@ static int find_lowest_rq(struct task_struct *task) | |||
1692 | cpu = cpumask_any(lowest_mask); | 1692 | cpu = cpumask_any(lowest_mask); |
1693 | if (cpu < nr_cpu_ids) | 1693 | if (cpu < nr_cpu_ids) |
1694 | return cpu; | 1694 | return cpu; |
1695 | |||
1695 | return -1; | 1696 | return -1; |
1696 | } | 1697 | } |
1697 | 1698 | ||
@@ -1827,7 +1828,7 @@ retry: | |||
1827 | * The task hasn't migrated, and is still the next | 1828 | * The task hasn't migrated, and is still the next |
1828 | * eligible task, but we failed to find a run-queue | 1829 | * eligible task, but we failed to find a run-queue |
1829 | * to push it to. Do not retry in this case, since | 1830 | * to push it to. Do not retry in this case, since |
1830 | * other cpus will pull from us when ready. | 1831 | * other CPUs will pull from us when ready. |
1831 | */ | 1832 | */ |
1832 | goto out; | 1833 | goto out; |
1833 | } | 1834 | } |
@@ -1919,7 +1920,7 @@ static int rto_next_cpu(struct root_domain *rd) | |||
1919 | * rto_next_cpu() will simply return the first CPU found in | 1920 | * rto_next_cpu() will simply return the first CPU found in |
1920 | * the rto_mask. | 1921 | * the rto_mask. |
1921 | * | 1922 | * |
1922 | * If rto_next_cpu() is called while rto_cpu is a valid cpu, it | 1923 | * If rto_next_cpu() is called while rto_cpu is a valid CPU, it |
1923 | * will return the next CPU found in the rto_mask. | 1924 | * will return the next CPU found in the rto_mask. |
1924 | * | 1925 | * |
1925 | * If there are no more CPUs left in the rto_mask, then a check is made | 1926 | * If there are no more CPUs left in the rto_mask, then a check is made |
@@ -1980,7 +1981,7 @@ static void tell_cpu_to_push(struct rq *rq) | |||
1980 | raw_spin_lock(&rq->rd->rto_lock); | 1981 | raw_spin_lock(&rq->rd->rto_lock); |
1981 | 1982 | ||
1982 | /* | 1983 | /* |
1983 | * The rto_cpu is updated under the lock, if it has a valid cpu | 1984 | * The rto_cpu is updated under the lock, if it has a valid CPU |
1984 | * then the IPI is still running and will continue due to the | 1985 | * then the IPI is still running and will continue due to the |
1985 | * update to loop_next, and nothing needs to be done here. | 1986 | * update to loop_next, and nothing needs to be done here. |
1986 | * Otherwise it is finishing up and an IPI needs to be sent. | 1987 | * Otherwise it is finishing up and an IPI needs to be sent. |
@@ -2105,7 +2106,7 @@ static void pull_rt_task(struct rq *this_rq) | |||
2105 | 2106 | ||
2106 | /* | 2107 | /* |
2107 | * There's a chance that p is higher in priority | 2108 | * There's a chance that p is higher in priority |
2108 | * than what's currently running on its cpu. | 2109 | * than what's currently running on its CPU. |
2109 | * This is just that p is waking up and hasn't | 2110 | * This is just that p is waking up and hasn't |
2110 | * had a chance to schedule. We only pull | 2111 | * had a chance to schedule. We only pull |
2111 | * p if it is lower in priority than the | 2112 | * p if it is lower in priority than the |
@@ -2187,7 +2188,7 @@ static void switched_from_rt(struct rq *rq, struct task_struct *p) | |||
2187 | if (!task_on_rq_queued(p) || rq->rt.rt_nr_running) | 2188 | if (!task_on_rq_queued(p) || rq->rt.rt_nr_running) |
2188 | return; | 2189 | return; |
2189 | 2190 | ||
2190 | queue_pull_task(rq); | 2191 | rt_queue_pull_task(rq); |
2191 | } | 2192 | } |
2192 | 2193 | ||
2193 | void __init init_sched_rt_class(void) | 2194 | void __init init_sched_rt_class(void) |
@@ -2218,7 +2219,7 @@ static void switched_to_rt(struct rq *rq, struct task_struct *p) | |||
2218 | if (task_on_rq_queued(p) && rq->curr != p) { | 2219 | if (task_on_rq_queued(p) && rq->curr != p) { |
2219 | #ifdef CONFIG_SMP | 2220 | #ifdef CONFIG_SMP |
2220 | if (p->nr_cpus_allowed > 1 && rq->rt.overloaded) | 2221 | if (p->nr_cpus_allowed > 1 && rq->rt.overloaded) |
2221 | queue_push_tasks(rq); | 2222 | rt_queue_push_tasks(rq); |
2222 | #endif /* CONFIG_SMP */ | 2223 | #endif /* CONFIG_SMP */ |
2223 | if (p->prio < rq->curr->prio && cpu_online(cpu_of(rq))) | 2224 | if (p->prio < rq->curr->prio && cpu_online(cpu_of(rq))) |
2224 | resched_curr(rq); | 2225 | resched_curr(rq); |
@@ -2242,7 +2243,7 @@ prio_changed_rt(struct rq *rq, struct task_struct *p, int oldprio) | |||
2242 | * may need to pull tasks to this runqueue. | 2243 | * may need to pull tasks to this runqueue. |
2243 | */ | 2244 | */ |
2244 | if (oldprio < p->prio) | 2245 | if (oldprio < p->prio) |
2245 | queue_pull_task(rq); | 2246 | rt_queue_pull_task(rq); |
2246 | 2247 | ||
2247 | /* | 2248 | /* |
2248 | * If there's a higher priority task waiting to run | 2249 | * If there's a higher priority task waiting to run |
@@ -2292,6 +2293,14 @@ static void watchdog(struct rq *rq, struct task_struct *p) | |||
2292 | static inline void watchdog(struct rq *rq, struct task_struct *p) { } | 2293 | static inline void watchdog(struct rq *rq, struct task_struct *p) { } |
2293 | #endif | 2294 | #endif |
2294 | 2295 | ||
2296 | /* | ||
2297 | * scheduler tick hitting a task of our scheduling class. | ||
2298 | * | ||
2299 | * NOTE: This function can be called remotely by the tick offload that | ||
2300 | * goes along full dynticks. Therefore no local assumption can be made | ||
2301 | * and everything must be accessed through the @rq and @curr passed in | ||
2302 | * parameters. | ||
2303 | */ | ||
2295 | static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued) | 2304 | static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued) |
2296 | { | 2305 | { |
2297 | struct sched_rt_entity *rt_se = &p->rt; | 2306 | struct sched_rt_entity *rt_se = &p->rt; |
@@ -2685,6 +2694,7 @@ int sched_rr_handler(struct ctl_table *table, int write, | |||
2685 | msecs_to_jiffies(sysctl_sched_rr_timeslice); | 2694 | msecs_to_jiffies(sysctl_sched_rr_timeslice); |
2686 | } | 2695 | } |
2687 | mutex_unlock(&mutex); | 2696 | mutex_unlock(&mutex); |
2697 | |||
2688 | return ret; | 2698 | return ret; |
2689 | } | 2699 | } |
2690 | 2700 | ||
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index fb5fc458547f..c3deaee7a7a2 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h | |||
@@ -1,39 +1,73 @@ | |||
1 | /* SPDX-License-Identifier: GPL-2.0 */ | 1 | /* SPDX-License-Identifier: GPL-2.0 */ |
2 | 2 | /* | |
3 | * Scheduler internal types and methods: | ||
4 | */ | ||
3 | #include <linux/sched.h> | 5 | #include <linux/sched.h> |
6 | |||
4 | #include <linux/sched/autogroup.h> | 7 | #include <linux/sched/autogroup.h> |
5 | #include <linux/sched/sysctl.h> | ||
6 | #include <linux/sched/topology.h> | ||
7 | #include <linux/sched/rt.h> | ||
8 | #include <linux/sched/deadline.h> | ||
9 | #include <linux/sched/clock.h> | 8 | #include <linux/sched/clock.h> |
10 | #include <linux/sched/wake_q.h> | 9 | #include <linux/sched/coredump.h> |
11 | #include <linux/sched/signal.h> | ||
12 | #include <linux/sched/numa_balancing.h> | ||
13 | #include <linux/sched/mm.h> | ||
14 | #include <linux/sched/cpufreq.h> | 10 | #include <linux/sched/cpufreq.h> |
15 | #include <linux/sched/stat.h> | 11 | #include <linux/sched/cputime.h> |
16 | #include <linux/sched/nohz.h> | 12 | #include <linux/sched/deadline.h> |
17 | #include <linux/sched/debug.h> | 13 | #include <linux/sched/debug.h> |
18 | #include <linux/sched/hotplug.h> | 14 | #include <linux/sched/hotplug.h> |
15 | #include <linux/sched/idle.h> | ||
16 | #include <linux/sched/init.h> | ||
17 | #include <linux/sched/isolation.h> | ||
18 | #include <linux/sched/jobctl.h> | ||
19 | #include <linux/sched/loadavg.h> | ||
20 | #include <linux/sched/mm.h> | ||
21 | #include <linux/sched/nohz.h> | ||
22 | #include <linux/sched/numa_balancing.h> | ||
23 | #include <linux/sched/prio.h> | ||
24 | #include <linux/sched/rt.h> | ||
25 | #include <linux/sched/signal.h> | ||
26 | #include <linux/sched/stat.h> | ||
27 | #include <linux/sched/sysctl.h> | ||
19 | #include <linux/sched/task.h> | 28 | #include <linux/sched/task.h> |
20 | #include <linux/sched/task_stack.h> | 29 | #include <linux/sched/task_stack.h> |
21 | #include <linux/sched/cputime.h> | 30 | #include <linux/sched/topology.h> |
22 | #include <linux/sched/init.h> | 31 | #include <linux/sched/user.h> |
32 | #include <linux/sched/wake_q.h> | ||
33 | #include <linux/sched/xacct.h> | ||
34 | |||
35 | #include <uapi/linux/sched/types.h> | ||
23 | 36 | ||
24 | #include <linux/u64_stats_sync.h> | ||
25 | #include <linux/kernel_stat.h> | ||
26 | #include <linux/binfmts.h> | 37 | #include <linux/binfmts.h> |
27 | #include <linux/mutex.h> | 38 | #include <linux/blkdev.h> |
28 | #include <linux/spinlock.h> | 39 | #include <linux/compat.h> |
40 | #include <linux/context_tracking.h> | ||
41 | #include <linux/cpufreq.h> | ||
42 | #include <linux/cpuidle.h> | ||
43 | #include <linux/cpuset.h> | ||
44 | #include <linux/ctype.h> | ||
45 | #include <linux/debugfs.h> | ||
46 | #include <linux/delayacct.h> | ||
47 | #include <linux/init_task.h> | ||
48 | #include <linux/kprobes.h> | ||
49 | #include <linux/kthread.h> | ||
50 | #include <linux/membarrier.h> | ||
51 | #include <linux/migrate.h> | ||
52 | #include <linux/mmu_context.h> | ||
53 | #include <linux/nmi.h> | ||
54 | #include <linux/proc_fs.h> | ||
55 | #include <linux/prefetch.h> | ||
56 | #include <linux/profile.h> | ||
57 | #include <linux/rcupdate_wait.h> | ||
58 | #include <linux/security.h> | ||
59 | #include <linux/stackprotector.h> | ||
29 | #include <linux/stop_machine.h> | 60 | #include <linux/stop_machine.h> |
30 | #include <linux/irq_work.h> | 61 | #include <linux/suspend.h> |
31 | #include <linux/tick.h> | 62 | #include <linux/swait.h> |
32 | #include <linux/slab.h> | 63 | #include <linux/syscalls.h> |
33 | #include <linux/cgroup.h> | 64 | #include <linux/task_work.h> |
65 | #include <linux/tsacct_kern.h> | ||
66 | |||
67 | #include <asm/tlb.h> | ||
34 | 68 | ||
35 | #ifdef CONFIG_PARAVIRT | 69 | #ifdef CONFIG_PARAVIRT |
36 | #include <asm/paravirt.h> | 70 | # include <asm/paravirt.h> |
37 | #endif | 71 | #endif |
38 | 72 | ||
39 | #include "cpupri.h" | 73 | #include "cpupri.h" |
@@ -79,11 +113,11 @@ static inline void cpu_load_update_active(struct rq *this_rq) { } | |||
79 | * and does not change the user-interface for setting shares/weights. | 113 | * and does not change the user-interface for setting shares/weights. |
80 | * | 114 | * |
81 | * We increase resolution only if we have enough bits to allow this increased | 115 | * We increase resolution only if we have enough bits to allow this increased |
82 | * resolution (i.e. 64bit). The costs for increasing resolution when 32bit are | 116 | * resolution (i.e. 64-bit). The costs for increasing resolution when 32-bit |
83 | * pretty high and the returns do not justify the increased costs. | 117 | * are pretty high and the returns do not justify the increased costs. |
84 | * | 118 | * |
85 | * Really only required when CONFIG_FAIR_GROUP_SCHED is also set, but to | 119 | * Really only required when CONFIG_FAIR_GROUP_SCHED=y is also set, but to |
86 | * increase coverage and consistency always enable it on 64bit platforms. | 120 | * increase coverage and consistency always enable it on 64-bit platforms. |
87 | */ | 121 | */ |
88 | #ifdef CONFIG_64BIT | 122 | #ifdef CONFIG_64BIT |
89 | # define NICE_0_LOAD_SHIFT (SCHED_FIXEDPOINT_SHIFT + SCHED_FIXEDPOINT_SHIFT) | 123 | # define NICE_0_LOAD_SHIFT (SCHED_FIXEDPOINT_SHIFT + SCHED_FIXEDPOINT_SHIFT) |
@@ -111,16 +145,12 @@ static inline void cpu_load_update_active(struct rq *this_rq) { } | |||
111 | * 10 -> just above 1us | 145 | * 10 -> just above 1us |
112 | * 9 -> just above 0.5us | 146 | * 9 -> just above 0.5us |
113 | */ | 147 | */ |
114 | #define DL_SCALE (10) | 148 | #define DL_SCALE 10 |
115 | 149 | ||
116 | /* | 150 | /* |
117 | * These are the 'tuning knobs' of the scheduler: | 151 | * Single value that denotes runtime == period, ie unlimited time. |
118 | */ | 152 | */ |
119 | 153 | #define RUNTIME_INF ((u64)~0ULL) | |
120 | /* | ||
121 | * single value that denotes runtime == period, ie unlimited time. | ||
122 | */ | ||
123 | #define RUNTIME_INF ((u64)~0ULL) | ||
124 | 154 | ||
125 | static inline int idle_policy(int policy) | 155 | static inline int idle_policy(int policy) |
126 | { | 156 | { |
@@ -235,9 +265,9 @@ void __dl_clear_params(struct task_struct *p); | |||
235 | * control. | 265 | * control. |
236 | */ | 266 | */ |
237 | struct dl_bandwidth { | 267 | struct dl_bandwidth { |
238 | raw_spinlock_t dl_runtime_lock; | 268 | raw_spinlock_t dl_runtime_lock; |
239 | u64 dl_runtime; | 269 | u64 dl_runtime; |
240 | u64 dl_period; | 270 | u64 dl_period; |
241 | }; | 271 | }; |
242 | 272 | ||
243 | static inline int dl_bandwidth_enabled(void) | 273 | static inline int dl_bandwidth_enabled(void) |
@@ -246,8 +276,9 @@ static inline int dl_bandwidth_enabled(void) | |||
246 | } | 276 | } |
247 | 277 | ||
248 | struct dl_bw { | 278 | struct dl_bw { |
249 | raw_spinlock_t lock; | 279 | raw_spinlock_t lock; |
250 | u64 bw, total_bw; | 280 | u64 bw; |
281 | u64 total_bw; | ||
251 | }; | 282 | }; |
252 | 283 | ||
253 | static inline void __dl_update(struct dl_bw *dl_b, s64 bw); | 284 | static inline void __dl_update(struct dl_bw *dl_b, s64 bw); |
@@ -273,20 +304,17 @@ bool __dl_overflow(struct dl_bw *dl_b, int cpus, u64 old_bw, u64 new_bw) | |||
273 | dl_b->bw * cpus < dl_b->total_bw - old_bw + new_bw; | 304 | dl_b->bw * cpus < dl_b->total_bw - old_bw + new_bw; |
274 | } | 305 | } |
275 | 306 | ||
276 | void dl_change_utilization(struct task_struct *p, u64 new_bw); | 307 | extern void dl_change_utilization(struct task_struct *p, u64 new_bw); |
277 | extern void init_dl_bw(struct dl_bw *dl_b); | 308 | extern void init_dl_bw(struct dl_bw *dl_b); |
278 | extern int sched_dl_global_validate(void); | 309 | extern int sched_dl_global_validate(void); |
279 | extern void sched_dl_do_global(void); | 310 | extern void sched_dl_do_global(void); |
280 | extern int sched_dl_overflow(struct task_struct *p, int policy, | 311 | extern int sched_dl_overflow(struct task_struct *p, int policy, const struct sched_attr *attr); |
281 | const struct sched_attr *attr); | ||
282 | extern void __setparam_dl(struct task_struct *p, const struct sched_attr *attr); | 312 | extern void __setparam_dl(struct task_struct *p, const struct sched_attr *attr); |
283 | extern void __getparam_dl(struct task_struct *p, struct sched_attr *attr); | 313 | extern void __getparam_dl(struct task_struct *p, struct sched_attr *attr); |
284 | extern bool __checkparam_dl(const struct sched_attr *attr); | 314 | extern bool __checkparam_dl(const struct sched_attr *attr); |
285 | extern bool dl_param_changed(struct task_struct *p, const struct sched_attr *attr); | 315 | extern bool dl_param_changed(struct task_struct *p, const struct sched_attr *attr); |
286 | extern int dl_task_can_attach(struct task_struct *p, | 316 | extern int dl_task_can_attach(struct task_struct *p, const struct cpumask *cs_cpus_allowed); |
287 | const struct cpumask *cs_cpus_allowed); | 317 | extern int dl_cpuset_cpumask_can_shrink(const struct cpumask *cur, const struct cpumask *trial); |
288 | extern int dl_cpuset_cpumask_can_shrink(const struct cpumask *cur, | ||
289 | const struct cpumask *trial); | ||
290 | extern bool dl_cpu_busy(unsigned int cpu); | 318 | extern bool dl_cpu_busy(unsigned int cpu); |
291 | 319 | ||
292 | #ifdef CONFIG_CGROUP_SCHED | 320 | #ifdef CONFIG_CGROUP_SCHED |
@@ -300,32 +328,36 @@ extern struct list_head task_groups; | |||
300 | 328 | ||
301 | struct cfs_bandwidth { | 329 | struct cfs_bandwidth { |
302 | #ifdef CONFIG_CFS_BANDWIDTH | 330 | #ifdef CONFIG_CFS_BANDWIDTH |
303 | raw_spinlock_t lock; | 331 | raw_spinlock_t lock; |
304 | ktime_t period; | 332 | ktime_t period; |
305 | u64 quota, runtime; | 333 | u64 quota; |
306 | s64 hierarchical_quota; | 334 | u64 runtime; |
307 | u64 runtime_expires; | 335 | s64 hierarchical_quota; |
308 | 336 | u64 runtime_expires; | |
309 | int idle, period_active; | 337 | |
310 | struct hrtimer period_timer, slack_timer; | 338 | int idle; |
311 | struct list_head throttled_cfs_rq; | 339 | int period_active; |
312 | 340 | struct hrtimer period_timer; | |
313 | /* statistics */ | 341 | struct hrtimer slack_timer; |
314 | int nr_periods, nr_throttled; | 342 | struct list_head throttled_cfs_rq; |
315 | u64 throttled_time; | 343 | |
344 | /* Statistics: */ | ||
345 | int nr_periods; | ||
346 | int nr_throttled; | ||
347 | u64 throttled_time; | ||
316 | #endif | 348 | #endif |
317 | }; | 349 | }; |
318 | 350 | ||
319 | /* task group related information */ | 351 | /* Task group related information */ |
320 | struct task_group { | 352 | struct task_group { |
321 | struct cgroup_subsys_state css; | 353 | struct cgroup_subsys_state css; |
322 | 354 | ||
323 | #ifdef CONFIG_FAIR_GROUP_SCHED | 355 | #ifdef CONFIG_FAIR_GROUP_SCHED |
324 | /* schedulable entities of this group on each cpu */ | 356 | /* schedulable entities of this group on each CPU */ |
325 | struct sched_entity **se; | 357 | struct sched_entity **se; |
326 | /* runqueue "owned" by this group on each cpu */ | 358 | /* runqueue "owned" by this group on each CPU */ |
327 | struct cfs_rq **cfs_rq; | 359 | struct cfs_rq **cfs_rq; |
328 | unsigned long shares; | 360 | unsigned long shares; |
329 | 361 | ||
330 | #ifdef CONFIG_SMP | 362 | #ifdef CONFIG_SMP |
331 | /* | 363 | /* |
@@ -333,29 +365,29 @@ struct task_group { | |||
333 | * it in its own cacheline separated from the fields above which | 365 | * it in its own cacheline separated from the fields above which |
334 | * will also be accessed at each tick. | 366 | * will also be accessed at each tick. |
335 | */ | 367 | */ |
336 | atomic_long_t load_avg ____cacheline_aligned; | 368 | atomic_long_t load_avg ____cacheline_aligned; |
337 | #endif | 369 | #endif |
338 | #endif | 370 | #endif |
339 | 371 | ||
340 | #ifdef CONFIG_RT_GROUP_SCHED | 372 | #ifdef CONFIG_RT_GROUP_SCHED |
341 | struct sched_rt_entity **rt_se; | 373 | struct sched_rt_entity **rt_se; |
342 | struct rt_rq **rt_rq; | 374 | struct rt_rq **rt_rq; |
343 | 375 | ||
344 | struct rt_bandwidth rt_bandwidth; | 376 | struct rt_bandwidth rt_bandwidth; |
345 | #endif | 377 | #endif |
346 | 378 | ||
347 | struct rcu_head rcu; | 379 | struct rcu_head rcu; |
348 | struct list_head list; | 380 | struct list_head list; |
349 | 381 | ||
350 | struct task_group *parent; | 382 | struct task_group *parent; |
351 | struct list_head siblings; | 383 | struct list_head siblings; |
352 | struct list_head children; | 384 | struct list_head children; |
353 | 385 | ||
354 | #ifdef CONFIG_SCHED_AUTOGROUP | 386 | #ifdef CONFIG_SCHED_AUTOGROUP |
355 | struct autogroup *autogroup; | 387 | struct autogroup *autogroup; |
356 | #endif | 388 | #endif |
357 | 389 | ||
358 | struct cfs_bandwidth cfs_bandwidth; | 390 | struct cfs_bandwidth cfs_bandwidth; |
359 | }; | 391 | }; |
360 | 392 | ||
361 | #ifdef CONFIG_FAIR_GROUP_SCHED | 393 | #ifdef CONFIG_FAIR_GROUP_SCHED |
@@ -369,8 +401,8 @@ struct task_group { | |||
369 | * (The default weight is 1024 - so there's no practical | 401 | * (The default weight is 1024 - so there's no practical |
370 | * limitation from this.) | 402 | * limitation from this.) |
371 | */ | 403 | */ |
372 | #define MIN_SHARES (1UL << 1) | 404 | #define MIN_SHARES (1UL << 1) |
373 | #define MAX_SHARES (1UL << 18) | 405 | #define MAX_SHARES (1UL << 18) |
374 | #endif | 406 | #endif |
375 | 407 | ||
376 | typedef int (*tg_visitor)(struct task_group *, void *); | 408 | typedef int (*tg_visitor)(struct task_group *, void *); |
@@ -443,35 +475,39 @@ struct cfs_bandwidth { }; | |||
443 | 475 | ||
444 | /* CFS-related fields in a runqueue */ | 476 | /* CFS-related fields in a runqueue */ |
445 | struct cfs_rq { | 477 | struct cfs_rq { |
446 | struct load_weight load; | 478 | struct load_weight load; |
447 | unsigned long runnable_weight; | 479 | unsigned long runnable_weight; |
448 | unsigned int nr_running, h_nr_running; | 480 | unsigned int nr_running; |
481 | unsigned int h_nr_running; | ||
449 | 482 | ||
450 | u64 exec_clock; | 483 | u64 exec_clock; |
451 | u64 min_vruntime; | 484 | u64 min_vruntime; |
452 | #ifndef CONFIG_64BIT | 485 | #ifndef CONFIG_64BIT |
453 | u64 min_vruntime_copy; | 486 | u64 min_vruntime_copy; |
454 | #endif | 487 | #endif |
455 | 488 | ||
456 | struct rb_root_cached tasks_timeline; | 489 | struct rb_root_cached tasks_timeline; |
457 | 490 | ||
458 | /* | 491 | /* |
459 | * 'curr' points to currently running entity on this cfs_rq. | 492 | * 'curr' points to currently running entity on this cfs_rq. |
460 | * It is set to NULL otherwise (i.e when none are currently running). | 493 | * It is set to NULL otherwise (i.e when none are currently running). |
461 | */ | 494 | */ |
462 | struct sched_entity *curr, *next, *last, *skip; | 495 | struct sched_entity *curr; |
496 | struct sched_entity *next; | ||
497 | struct sched_entity *last; | ||
498 | struct sched_entity *skip; | ||
463 | 499 | ||
464 | #ifdef CONFIG_SCHED_DEBUG | 500 | #ifdef CONFIG_SCHED_DEBUG |
465 | unsigned int nr_spread_over; | 501 | unsigned int nr_spread_over; |
466 | #endif | 502 | #endif |
467 | 503 | ||
468 | #ifdef CONFIG_SMP | 504 | #ifdef CONFIG_SMP |
469 | /* | 505 | /* |
470 | * CFS load tracking | 506 | * CFS load tracking |
471 | */ | 507 | */ |
472 | struct sched_avg avg; | 508 | struct sched_avg avg; |
473 | #ifndef CONFIG_64BIT | 509 | #ifndef CONFIG_64BIT |
474 | u64 load_last_update_time_copy; | 510 | u64 load_last_update_time_copy; |
475 | #endif | 511 | #endif |
476 | struct { | 512 | struct { |
477 | raw_spinlock_t lock ____cacheline_aligned; | 513 | raw_spinlock_t lock ____cacheline_aligned; |
@@ -482,9 +518,9 @@ struct cfs_rq { | |||
482 | } removed; | 518 | } removed; |
483 | 519 | ||
484 | #ifdef CONFIG_FAIR_GROUP_SCHED | 520 | #ifdef CONFIG_FAIR_GROUP_SCHED |
485 | unsigned long tg_load_avg_contrib; | 521 | unsigned long tg_load_avg_contrib; |
486 | long propagate; | 522 | long propagate; |
487 | long prop_runnable_sum; | 523 | long prop_runnable_sum; |
488 | 524 | ||
489 | /* | 525 | /* |
490 | * h_load = weight * f(tg) | 526 | * h_load = weight * f(tg) |
@@ -492,36 +528,38 @@ struct cfs_rq { | |||
492 | * Where f(tg) is the recursive weight fraction assigned to | 528 | * Where f(tg) is the recursive weight fraction assigned to |
493 | * this group. | 529 | * this group. |
494 | */ | 530 | */ |
495 | unsigned long h_load; | 531 | unsigned long h_load; |
496 | u64 last_h_load_update; | 532 | u64 last_h_load_update; |
497 | struct sched_entity *h_load_next; | 533 | struct sched_entity *h_load_next; |
498 | #endif /* CONFIG_FAIR_GROUP_SCHED */ | 534 | #endif /* CONFIG_FAIR_GROUP_SCHED */ |
499 | #endif /* CONFIG_SMP */ | 535 | #endif /* CONFIG_SMP */ |
500 | 536 | ||
501 | #ifdef CONFIG_FAIR_GROUP_SCHED | 537 | #ifdef CONFIG_FAIR_GROUP_SCHED |
502 | struct rq *rq; /* cpu runqueue to which this cfs_rq is attached */ | 538 | struct rq *rq; /* CPU runqueue to which this cfs_rq is attached */ |
503 | 539 | ||
504 | /* | 540 | /* |
505 | * leaf cfs_rqs are those that hold tasks (lowest schedulable entity in | 541 | * leaf cfs_rqs are those that hold tasks (lowest schedulable entity in |
506 | * a hierarchy). Non-leaf lrqs hold other higher schedulable entities | 542 | * a hierarchy). Non-leaf lrqs hold other higher schedulable entities |
507 | * (like users, containers etc.) | 543 | * (like users, containers etc.) |
508 | * | 544 | * |
509 | * leaf_cfs_rq_list ties together list of leaf cfs_rq's in a cpu. This | 545 | * leaf_cfs_rq_list ties together list of leaf cfs_rq's in a CPU. |
510 | * list is used during load balance. | 546 | * This list is used during load balance. |
511 | */ | 547 | */ |
512 | int on_list; | 548 | int on_list; |
513 | struct list_head leaf_cfs_rq_list; | 549 | struct list_head leaf_cfs_rq_list; |
514 | struct task_group *tg; /* group that "owns" this runqueue */ | 550 | struct task_group *tg; /* group that "owns" this runqueue */ |
515 | 551 | ||
516 | #ifdef CONFIG_CFS_BANDWIDTH | 552 | #ifdef CONFIG_CFS_BANDWIDTH |
517 | int runtime_enabled; | 553 | int runtime_enabled; |
518 | u64 runtime_expires; | 554 | u64 runtime_expires; |
519 | s64 runtime_remaining; | 555 | s64 runtime_remaining; |
520 | 556 | ||
521 | u64 throttled_clock, throttled_clock_task; | 557 | u64 throttled_clock; |
522 | u64 throttled_clock_task_time; | 558 | u64 throttled_clock_task; |
523 | int throttled, throttle_count; | 559 | u64 throttled_clock_task_time; |
524 | struct list_head throttled_list; | 560 | int throttled; |
561 | int throttle_count; | ||
562 | struct list_head throttled_list; | ||
525 | #endif /* CONFIG_CFS_BANDWIDTH */ | 563 | #endif /* CONFIG_CFS_BANDWIDTH */ |
526 | #endif /* CONFIG_FAIR_GROUP_SCHED */ | 564 | #endif /* CONFIG_FAIR_GROUP_SCHED */ |
527 | }; | 565 | }; |
@@ -538,45 +576,45 @@ static inline int rt_bandwidth_enabled(void) | |||
538 | 576 | ||
539 | /* Real-Time classes' related field in a runqueue: */ | 577 | /* Real-Time classes' related field in a runqueue: */ |
540 | struct rt_rq { | 578 | struct rt_rq { |
541 | struct rt_prio_array active; | 579 | struct rt_prio_array active; |
542 | unsigned int rt_nr_running; | 580 | unsigned int rt_nr_running; |
543 | unsigned int rr_nr_running; | 581 | unsigned int rr_nr_running; |
544 | #if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED | 582 | #if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED |
545 | struct { | 583 | struct { |
546 | int curr; /* highest queued rt task prio */ | 584 | int curr; /* highest queued rt task prio */ |
547 | #ifdef CONFIG_SMP | 585 | #ifdef CONFIG_SMP |
548 | int next; /* next highest */ | 586 | int next; /* next highest */ |
549 | #endif | 587 | #endif |
550 | } highest_prio; | 588 | } highest_prio; |
551 | #endif | 589 | #endif |
552 | #ifdef CONFIG_SMP | 590 | #ifdef CONFIG_SMP |
553 | unsigned long rt_nr_migratory; | 591 | unsigned long rt_nr_migratory; |
554 | unsigned long rt_nr_total; | 592 | unsigned long rt_nr_total; |
555 | int overloaded; | 593 | int overloaded; |
556 | struct plist_head pushable_tasks; | 594 | struct plist_head pushable_tasks; |
557 | #endif /* CONFIG_SMP */ | 595 | #endif /* CONFIG_SMP */ |
558 | int rt_queued; | 596 | int rt_queued; |
559 | 597 | ||
560 | int rt_throttled; | 598 | int rt_throttled; |
561 | u64 rt_time; | 599 | u64 rt_time; |
562 | u64 rt_runtime; | 600 | u64 rt_runtime; |
563 | /* Nests inside the rq lock: */ | 601 | /* Nests inside the rq lock: */ |
564 | raw_spinlock_t rt_runtime_lock; | 602 | raw_spinlock_t rt_runtime_lock; |
565 | 603 | ||
566 | #ifdef CONFIG_RT_GROUP_SCHED | 604 | #ifdef CONFIG_RT_GROUP_SCHED |
567 | unsigned long rt_nr_boosted; | 605 | unsigned long rt_nr_boosted; |
568 | 606 | ||
569 | struct rq *rq; | 607 | struct rq *rq; |
570 | struct task_group *tg; | 608 | struct task_group *tg; |
571 | #endif | 609 | #endif |
572 | }; | 610 | }; |
573 | 611 | ||
574 | /* Deadline class' related fields in a runqueue */ | 612 | /* Deadline class' related fields in a runqueue */ |
575 | struct dl_rq { | 613 | struct dl_rq { |
576 | /* runqueue is an rbtree, ordered by deadline */ | 614 | /* runqueue is an rbtree, ordered by deadline */ |
577 | struct rb_root_cached root; | 615 | struct rb_root_cached root; |
578 | 616 | ||
579 | unsigned long dl_nr_running; | 617 | unsigned long dl_nr_running; |
580 | 618 | ||
581 | #ifdef CONFIG_SMP | 619 | #ifdef CONFIG_SMP |
582 | /* | 620 | /* |
@@ -586,28 +624,28 @@ struct dl_rq { | |||
586 | * should migrate somewhere else. | 624 | * should migrate somewhere else. |
587 | */ | 625 | */ |
588 | struct { | 626 | struct { |
589 | u64 curr; | 627 | u64 curr; |
590 | u64 next; | 628 | u64 next; |
591 | } earliest_dl; | 629 | } earliest_dl; |
592 | 630 | ||
593 | unsigned long dl_nr_migratory; | 631 | unsigned long dl_nr_migratory; |
594 | int overloaded; | 632 | int overloaded; |
595 | 633 | ||
596 | /* | 634 | /* |
597 | * Tasks on this rq that can be pushed away. They are kept in | 635 | * Tasks on this rq that can be pushed away. They are kept in |
598 | * an rb-tree, ordered by tasks' deadlines, with caching | 636 | * an rb-tree, ordered by tasks' deadlines, with caching |
599 | * of the leftmost (earliest deadline) element. | 637 | * of the leftmost (earliest deadline) element. |
600 | */ | 638 | */ |
601 | struct rb_root_cached pushable_dl_tasks_root; | 639 | struct rb_root_cached pushable_dl_tasks_root; |
602 | #else | 640 | #else |
603 | struct dl_bw dl_bw; | 641 | struct dl_bw dl_bw; |
604 | #endif | 642 | #endif |
605 | /* | 643 | /* |
606 | * "Active utilization" for this runqueue: increased when a | 644 | * "Active utilization" for this runqueue: increased when a |
607 | * task wakes up (becomes TASK_RUNNING) and decreased when a | 645 | * task wakes up (becomes TASK_RUNNING) and decreased when a |
608 | * task blocks | 646 | * task blocks |
609 | */ | 647 | */ |
610 | u64 running_bw; | 648 | u64 running_bw; |
611 | 649 | ||
612 | /* | 650 | /* |
613 | * Utilization of the tasks "assigned" to this runqueue (including | 651 | * Utilization of the tasks "assigned" to this runqueue (including |
@@ -618,14 +656,14 @@ struct dl_rq { | |||
618 | * This is needed to compute the "inactive utilization" for the | 656 | * This is needed to compute the "inactive utilization" for the |
619 | * runqueue (inactive utilization = this_bw - running_bw). | 657 | * runqueue (inactive utilization = this_bw - running_bw). |
620 | */ | 658 | */ |
621 | u64 this_bw; | 659 | u64 this_bw; |
622 | u64 extra_bw; | 660 | u64 extra_bw; |
623 | 661 | ||
624 | /* | 662 | /* |
625 | * Inverse of the fraction of CPU utilization that can be reclaimed | 663 | * Inverse of the fraction of CPU utilization that can be reclaimed |
626 | * by the GRUB algorithm. | 664 | * by the GRUB algorithm. |
627 | */ | 665 | */ |
628 | u64 bw_ratio; | 666 | u64 bw_ratio; |
629 | }; | 667 | }; |
630 | 668 | ||
631 | #ifdef CONFIG_SMP | 669 | #ifdef CONFIG_SMP |
@@ -638,51 +676,51 @@ static inline bool sched_asym_prefer(int a, int b) | |||
638 | /* | 676 | /* |
639 | * We add the notion of a root-domain which will be used to define per-domain | 677 | * We add the notion of a root-domain which will be used to define per-domain |
640 | * variables. Each exclusive cpuset essentially defines an island domain by | 678 | * variables. Each exclusive cpuset essentially defines an island domain by |
641 | * fully partitioning the member cpus from any other cpuset. Whenever a new | 679 | * fully partitioning the member CPUs from any other cpuset. Whenever a new |
642 | * exclusive cpuset is created, we also create and attach a new root-domain | 680 | * exclusive cpuset is created, we also create and attach a new root-domain |
643 | * object. | 681 | * object. |
644 | * | 682 | * |
645 | */ | 683 | */ |
646 | struct root_domain { | 684 | struct root_domain { |
647 | atomic_t refcount; | 685 | atomic_t refcount; |
648 | atomic_t rto_count; | 686 | atomic_t rto_count; |
649 | struct rcu_head rcu; | 687 | struct rcu_head rcu; |
650 | cpumask_var_t span; | 688 | cpumask_var_t span; |
651 | cpumask_var_t online; | 689 | cpumask_var_t online; |
652 | 690 | ||
653 | /* Indicate more than one runnable task for any CPU */ | 691 | /* Indicate more than one runnable task for any CPU */ |
654 | bool overload; | 692 | bool overload; |
655 | 693 | ||
656 | /* | 694 | /* |
657 | * The bit corresponding to a CPU gets set here if such CPU has more | 695 | * The bit corresponding to a CPU gets set here if such CPU has more |
658 | * than one runnable -deadline task (as it is below for RT tasks). | 696 | * than one runnable -deadline task (as it is below for RT tasks). |
659 | */ | 697 | */ |
660 | cpumask_var_t dlo_mask; | 698 | cpumask_var_t dlo_mask; |
661 | atomic_t dlo_count; | 699 | atomic_t dlo_count; |
662 | struct dl_bw dl_bw; | 700 | struct dl_bw dl_bw; |
663 | struct cpudl cpudl; | 701 | struct cpudl cpudl; |
664 | 702 | ||
665 | #ifdef HAVE_RT_PUSH_IPI | 703 | #ifdef HAVE_RT_PUSH_IPI |
666 | /* | 704 | /* |
667 | * For IPI pull requests, loop across the rto_mask. | 705 | * For IPI pull requests, loop across the rto_mask. |
668 | */ | 706 | */ |
669 | struct irq_work rto_push_work; | 707 | struct irq_work rto_push_work; |
670 | raw_spinlock_t rto_lock; | 708 | raw_spinlock_t rto_lock; |
671 | /* These are only updated and read within rto_lock */ | 709 | /* These are only updated and read within rto_lock */ |
672 | int rto_loop; | 710 | int rto_loop; |
673 | int rto_cpu; | 711 | int rto_cpu; |
674 | /* These atomics are updated outside of a lock */ | 712 | /* These atomics are updated outside of a lock */ |
675 | atomic_t rto_loop_next; | 713 | atomic_t rto_loop_next; |
676 | atomic_t rto_loop_start; | 714 | atomic_t rto_loop_start; |
677 | #endif | 715 | #endif |
678 | /* | 716 | /* |
679 | * The "RT overload" flag: it gets set if a CPU has more than | 717 | * The "RT overload" flag: it gets set if a CPU has more than |
680 | * one runnable RT task. | 718 | * one runnable RT task. |
681 | */ | 719 | */ |
682 | cpumask_var_t rto_mask; | 720 | cpumask_var_t rto_mask; |
683 | struct cpupri cpupri; | 721 | struct cpupri cpupri; |
684 | 722 | ||
685 | unsigned long max_cpu_capacity; | 723 | unsigned long max_cpu_capacity; |
686 | }; | 724 | }; |
687 | 725 | ||
688 | extern struct root_domain def_root_domain; | 726 | extern struct root_domain def_root_domain; |
@@ -708,41 +746,42 @@ extern void rto_push_irq_work_func(struct irq_work *work); | |||
708 | */ | 746 | */ |
709 | struct rq { | 747 | struct rq { |
710 | /* runqueue lock: */ | 748 | /* runqueue lock: */ |
711 | raw_spinlock_t lock; | 749 | raw_spinlock_t lock; |
712 | 750 | ||
713 | /* | 751 | /* |
714 | * nr_running and cpu_load should be in the same cacheline because | 752 | * nr_running and cpu_load should be in the same cacheline because |
715 | * remote CPUs use both these fields when doing load calculation. | 753 | * remote CPUs use both these fields when doing load calculation. |
716 | */ | 754 | */ |
717 | unsigned int nr_running; | 755 | unsigned int nr_running; |
718 | #ifdef CONFIG_NUMA_BALANCING | 756 | #ifdef CONFIG_NUMA_BALANCING |
719 | unsigned int nr_numa_running; | 757 | unsigned int nr_numa_running; |
720 | unsigned int nr_preferred_running; | 758 | unsigned int nr_preferred_running; |
721 | #endif | 759 | #endif |
722 | #define CPU_LOAD_IDX_MAX 5 | 760 | #define CPU_LOAD_IDX_MAX 5 |
723 | unsigned long cpu_load[CPU_LOAD_IDX_MAX]; | 761 | unsigned long cpu_load[CPU_LOAD_IDX_MAX]; |
724 | #ifdef CONFIG_NO_HZ_COMMON | 762 | #ifdef CONFIG_NO_HZ_COMMON |
725 | #ifdef CONFIG_SMP | 763 | #ifdef CONFIG_SMP |
726 | unsigned long last_load_update_tick; | 764 | unsigned long last_load_update_tick; |
765 | unsigned long last_blocked_load_update_tick; | ||
766 | unsigned int has_blocked_load; | ||
727 | #endif /* CONFIG_SMP */ | 767 | #endif /* CONFIG_SMP */ |
728 | unsigned long nohz_flags; | 768 | unsigned int nohz_tick_stopped; |
769 | atomic_t nohz_flags; | ||
729 | #endif /* CONFIG_NO_HZ_COMMON */ | 770 | #endif /* CONFIG_NO_HZ_COMMON */ |
730 | #ifdef CONFIG_NO_HZ_FULL | ||
731 | unsigned long last_sched_tick; | ||
732 | #endif | ||
733 | /* capture load from *all* tasks on this cpu: */ | ||
734 | struct load_weight load; | ||
735 | unsigned long nr_load_updates; | ||
736 | u64 nr_switches; | ||
737 | 771 | ||
738 | struct cfs_rq cfs; | 772 | /* capture load from *all* tasks on this CPU: */ |
739 | struct rt_rq rt; | 773 | struct load_weight load; |
740 | struct dl_rq dl; | 774 | unsigned long nr_load_updates; |
775 | u64 nr_switches; | ||
776 | |||
777 | struct cfs_rq cfs; | ||
778 | struct rt_rq rt; | ||
779 | struct dl_rq dl; | ||
741 | 780 | ||
742 | #ifdef CONFIG_FAIR_GROUP_SCHED | 781 | #ifdef CONFIG_FAIR_GROUP_SCHED |
743 | /* list of leaf cfs_rq on this cpu: */ | 782 | /* list of leaf cfs_rq on this CPU: */ |
744 | struct list_head leaf_cfs_rq_list; | 783 | struct list_head leaf_cfs_rq_list; |
745 | struct list_head *tmp_alone_branch; | 784 | struct list_head *tmp_alone_branch; |
746 | #endif /* CONFIG_FAIR_GROUP_SCHED */ | 785 | #endif /* CONFIG_FAIR_GROUP_SCHED */ |
747 | 786 | ||
748 | /* | 787 | /* |
@@ -751,94 +790,98 @@ struct rq { | |||
751 | * one CPU and if it got migrated afterwards it may decrease | 790 | * one CPU and if it got migrated afterwards it may decrease |
752 | * it on another CPU. Always updated under the runqueue lock: | 791 | * it on another CPU. Always updated under the runqueue lock: |
753 | */ | 792 | */ |
754 | unsigned long nr_uninterruptible; | 793 | unsigned long nr_uninterruptible; |
755 | 794 | ||
756 | struct task_struct *curr, *idle, *stop; | 795 | struct task_struct *curr; |
757 | unsigned long next_balance; | 796 | struct task_struct *idle; |
758 | struct mm_struct *prev_mm; | 797 | struct task_struct *stop; |
798 | unsigned long next_balance; | ||
799 | struct mm_struct *prev_mm; | ||
759 | 800 | ||
760 | unsigned int clock_update_flags; | 801 | unsigned int clock_update_flags; |
761 | u64 clock; | 802 | u64 clock; |
762 | u64 clock_task; | 803 | u64 clock_task; |
763 | 804 | ||
764 | atomic_t nr_iowait; | 805 | atomic_t nr_iowait; |
765 | 806 | ||
766 | #ifdef CONFIG_SMP | 807 | #ifdef CONFIG_SMP |
767 | struct root_domain *rd; | 808 | struct root_domain *rd; |
768 | struct sched_domain *sd; | 809 | struct sched_domain *sd; |
769 | 810 | ||
770 | unsigned long cpu_capacity; | 811 | unsigned long cpu_capacity; |
771 | unsigned long cpu_capacity_orig; | 812 | unsigned long cpu_capacity_orig; |
772 | 813 | ||
773 | struct callback_head *balance_callback; | 814 | struct callback_head *balance_callback; |
815 | |||
816 | unsigned char idle_balance; | ||
774 | 817 | ||
775 | unsigned char idle_balance; | ||
776 | /* For active balancing */ | 818 | /* For active balancing */ |
777 | int active_balance; | 819 | int active_balance; |
778 | int push_cpu; | 820 | int push_cpu; |
779 | struct cpu_stop_work active_balance_work; | 821 | struct cpu_stop_work active_balance_work; |
780 | /* cpu of this runqueue: */ | 822 | |
781 | int cpu; | 823 | /* CPU of this runqueue: */ |
782 | int online; | 824 | int cpu; |
825 | int online; | ||
783 | 826 | ||
784 | struct list_head cfs_tasks; | 827 | struct list_head cfs_tasks; |
785 | 828 | ||
786 | u64 rt_avg; | 829 | u64 rt_avg; |
787 | u64 age_stamp; | 830 | u64 age_stamp; |
788 | u64 idle_stamp; | 831 | u64 idle_stamp; |
789 | u64 avg_idle; | 832 | u64 avg_idle; |
790 | 833 | ||
791 | /* This is used to determine avg_idle's max value */ | 834 | /* This is used to determine avg_idle's max value */ |
792 | u64 max_idle_balance_cost; | 835 | u64 max_idle_balance_cost; |
793 | #endif | 836 | #endif |
794 | 837 | ||
795 | #ifdef CONFIG_IRQ_TIME_ACCOUNTING | 838 | #ifdef CONFIG_IRQ_TIME_ACCOUNTING |
796 | u64 prev_irq_time; | 839 | u64 prev_irq_time; |
797 | #endif | 840 | #endif |
798 | #ifdef CONFIG_PARAVIRT | 841 | #ifdef CONFIG_PARAVIRT |
799 | u64 prev_steal_time; | 842 | u64 prev_steal_time; |
800 | #endif | 843 | #endif |
801 | #ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING | 844 | #ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING |
802 | u64 prev_steal_time_rq; | 845 | u64 prev_steal_time_rq; |
803 | #endif | 846 | #endif |
804 | 847 | ||
805 | /* calc_load related fields */ | 848 | /* calc_load related fields */ |
806 | unsigned long calc_load_update; | 849 | unsigned long calc_load_update; |
807 | long calc_load_active; | 850 | long calc_load_active; |
808 | 851 | ||
809 | #ifdef CONFIG_SCHED_HRTICK | 852 | #ifdef CONFIG_SCHED_HRTICK |
810 | #ifdef CONFIG_SMP | 853 | #ifdef CONFIG_SMP |
811 | int hrtick_csd_pending; | 854 | int hrtick_csd_pending; |
812 | call_single_data_t hrtick_csd; | 855 | call_single_data_t hrtick_csd; |
813 | #endif | 856 | #endif |
814 | struct hrtimer hrtick_timer; | 857 | struct hrtimer hrtick_timer; |
815 | #endif | 858 | #endif |
816 | 859 | ||
817 | #ifdef CONFIG_SCHEDSTATS | 860 | #ifdef CONFIG_SCHEDSTATS |
818 | /* latency stats */ | 861 | /* latency stats */ |
819 | struct sched_info rq_sched_info; | 862 | struct sched_info rq_sched_info; |
820 | unsigned long long rq_cpu_time; | 863 | unsigned long long rq_cpu_time; |
821 | /* could above be rq->cfs_rq.exec_clock + rq->rt_rq.rt_runtime ? */ | 864 | /* could above be rq->cfs_rq.exec_clock + rq->rt_rq.rt_runtime ? */ |
822 | 865 | ||
823 | /* sys_sched_yield() stats */ | 866 | /* sys_sched_yield() stats */ |
824 | unsigned int yld_count; | 867 | unsigned int yld_count; |
825 | 868 | ||
826 | /* schedule() stats */ | 869 | /* schedule() stats */ |
827 | unsigned int sched_count; | 870 | unsigned int sched_count; |
828 | unsigned int sched_goidle; | 871 | unsigned int sched_goidle; |
829 | 872 | ||
830 | /* try_to_wake_up() stats */ | 873 | /* try_to_wake_up() stats */ |
831 | unsigned int ttwu_count; | 874 | unsigned int ttwu_count; |
832 | unsigned int ttwu_local; | 875 | unsigned int ttwu_local; |
833 | #endif | 876 | #endif |
834 | 877 | ||
835 | #ifdef CONFIG_SMP | 878 | #ifdef CONFIG_SMP |
836 | struct llist_head wake_list; | 879 | struct llist_head wake_list; |
837 | #endif | 880 | #endif |
838 | 881 | ||
839 | #ifdef CONFIG_CPU_IDLE | 882 | #ifdef CONFIG_CPU_IDLE |
840 | /* Must be inspected within a rcu lock section */ | 883 | /* Must be inspected within a rcu lock section */ |
841 | struct cpuidle_state *idle_state; | 884 | struct cpuidle_state *idle_state; |
842 | #endif | 885 | #endif |
843 | }; | 886 | }; |
844 | 887 | ||
@@ -904,9 +947,9 @@ static inline u64 __rq_clock_broken(struct rq *rq) | |||
904 | * one position though, because the next rq_unpin_lock() will shift it | 947 | * one position though, because the next rq_unpin_lock() will shift it |
905 | * back. | 948 | * back. |
906 | */ | 949 | */ |
907 | #define RQCF_REQ_SKIP 0x01 | 950 | #define RQCF_REQ_SKIP 0x01 |
908 | #define RQCF_ACT_SKIP 0x02 | 951 | #define RQCF_ACT_SKIP 0x02 |
909 | #define RQCF_UPDATED 0x04 | 952 | #define RQCF_UPDATED 0x04 |
910 | 953 | ||
911 | static inline void assert_clock_updated(struct rq *rq) | 954 | static inline void assert_clock_updated(struct rq *rq) |
912 | { | 955 | { |
@@ -1059,12 +1102,12 @@ extern void sched_ttwu_pending(void); | |||
1059 | 1102 | ||
1060 | /** | 1103 | /** |
1061 | * highest_flag_domain - Return highest sched_domain containing flag. | 1104 | * highest_flag_domain - Return highest sched_domain containing flag. |
1062 | * @cpu: The cpu whose highest level of sched domain is to | 1105 | * @cpu: The CPU whose highest level of sched domain is to |
1063 | * be returned. | 1106 | * be returned. |
1064 | * @flag: The flag to check for the highest sched_domain | 1107 | * @flag: The flag to check for the highest sched_domain |
1065 | * for the given cpu. | 1108 | * for the given CPU. |
1066 | * | 1109 | * |
1067 | * Returns the highest sched_domain of a cpu which contains the given flag. | 1110 | * Returns the highest sched_domain of a CPU which contains the given flag. |
1068 | */ | 1111 | */ |
1069 | static inline struct sched_domain *highest_flag_domain(int cpu, int flag) | 1112 | static inline struct sched_domain *highest_flag_domain(int cpu, int flag) |
1070 | { | 1113 | { |
@@ -1099,30 +1142,30 @@ DECLARE_PER_CPU(struct sched_domain *, sd_numa); | |||
1099 | DECLARE_PER_CPU(struct sched_domain *, sd_asym); | 1142 | DECLARE_PER_CPU(struct sched_domain *, sd_asym); |
1100 | 1143 | ||
1101 | struct sched_group_capacity { | 1144 | struct sched_group_capacity { |
1102 | atomic_t ref; | 1145 | atomic_t ref; |
1103 | /* | 1146 | /* |
1104 | * CPU capacity of this group, SCHED_CAPACITY_SCALE being max capacity | 1147 | * CPU capacity of this group, SCHED_CAPACITY_SCALE being max capacity |
1105 | * for a single CPU. | 1148 | * for a single CPU. |
1106 | */ | 1149 | */ |
1107 | unsigned long capacity; | 1150 | unsigned long capacity; |
1108 | unsigned long min_capacity; /* Min per-CPU capacity in group */ | 1151 | unsigned long min_capacity; /* Min per-CPU capacity in group */ |
1109 | unsigned long next_update; | 1152 | unsigned long next_update; |
1110 | int imbalance; /* XXX unrelated to capacity but shared group state */ | 1153 | int imbalance; /* XXX unrelated to capacity but shared group state */ |
1111 | 1154 | ||
1112 | #ifdef CONFIG_SCHED_DEBUG | 1155 | #ifdef CONFIG_SCHED_DEBUG |
1113 | int id; | 1156 | int id; |
1114 | #endif | 1157 | #endif |
1115 | 1158 | ||
1116 | unsigned long cpumask[0]; /* balance mask */ | 1159 | unsigned long cpumask[0]; /* Balance mask */ |
1117 | }; | 1160 | }; |
1118 | 1161 | ||
1119 | struct sched_group { | 1162 | struct sched_group { |
1120 | struct sched_group *next; /* Must be a circular list */ | 1163 | struct sched_group *next; /* Must be a circular list */ |
1121 | atomic_t ref; | 1164 | atomic_t ref; |
1122 | 1165 | ||
1123 | unsigned int group_weight; | 1166 | unsigned int group_weight; |
1124 | struct sched_group_capacity *sgc; | 1167 | struct sched_group_capacity *sgc; |
1125 | int asym_prefer_cpu; /* cpu of highest priority in group */ | 1168 | int asym_prefer_cpu; /* CPU of highest priority in group */ |
1126 | 1169 | ||
1127 | /* | 1170 | /* |
1128 | * The CPUs this group covers. | 1171 | * The CPUs this group covers. |
@@ -1131,7 +1174,7 @@ struct sched_group { | |||
1131 | * by attaching extra space to the end of the structure, | 1174 | * by attaching extra space to the end of the structure, |
1132 | * depending on how many CPUs the kernel has booted up with) | 1175 | * depending on how many CPUs the kernel has booted up with) |
1133 | */ | 1176 | */ |
1134 | unsigned long cpumask[0]; | 1177 | unsigned long cpumask[0]; |
1135 | }; | 1178 | }; |
1136 | 1179 | ||
1137 | static inline struct cpumask *sched_group_span(struct sched_group *sg) | 1180 | static inline struct cpumask *sched_group_span(struct sched_group *sg) |
@@ -1148,8 +1191,8 @@ static inline struct cpumask *group_balance_mask(struct sched_group *sg) | |||
1148 | } | 1191 | } |
1149 | 1192 | ||
1150 | /** | 1193 | /** |
1151 | * group_first_cpu - Returns the first cpu in the cpumask of a sched_group. | 1194 | * group_first_cpu - Returns the first CPU in the cpumask of a sched_group. |
1152 | * @group: The group whose first cpu is to be returned. | 1195 | * @group: The group whose first CPU is to be returned. |
1153 | */ | 1196 | */ |
1154 | static inline unsigned int group_first_cpu(struct sched_group *group) | 1197 | static inline unsigned int group_first_cpu(struct sched_group *group) |
1155 | { | 1198 | { |
@@ -1349,19 +1392,12 @@ static inline int task_on_rq_migrating(struct task_struct *p) | |||
1349 | return p->on_rq == TASK_ON_RQ_MIGRATING; | 1392 | return p->on_rq == TASK_ON_RQ_MIGRATING; |
1350 | } | 1393 | } |
1351 | 1394 | ||
1352 | #ifndef prepare_arch_switch | ||
1353 | # define prepare_arch_switch(next) do { } while (0) | ||
1354 | #endif | ||
1355 | #ifndef finish_arch_post_lock_switch | ||
1356 | # define finish_arch_post_lock_switch() do { } while (0) | ||
1357 | #endif | ||
1358 | |||
1359 | /* | 1395 | /* |
1360 | * wake flags | 1396 | * wake flags |
1361 | */ | 1397 | */ |
1362 | #define WF_SYNC 0x01 /* waker goes to sleep after wakeup */ | 1398 | #define WF_SYNC 0x01 /* Waker goes to sleep after wakeup */ |
1363 | #define WF_FORK 0x02 /* child wakeup after fork */ | 1399 | #define WF_FORK 0x02 /* Child wakeup after fork */ |
1364 | #define WF_MIGRATED 0x4 /* internal use, task got migrated */ | 1400 | #define WF_MIGRATED 0x4 /* Internal use, task got migrated */ |
1365 | 1401 | ||
1366 | /* | 1402 | /* |
1367 | * To aid in avoiding the subversion of "niceness" due to uneven distribution | 1403 | * To aid in avoiding the subversion of "niceness" due to uneven distribution |
@@ -1372,11 +1408,11 @@ static inline int task_on_rq_migrating(struct task_struct *p) | |||
1372 | * slice expiry etc. | 1408 | * slice expiry etc. |
1373 | */ | 1409 | */ |
1374 | 1410 | ||
1375 | #define WEIGHT_IDLEPRIO 3 | 1411 | #define WEIGHT_IDLEPRIO 3 |
1376 | #define WMULT_IDLEPRIO 1431655765 | 1412 | #define WMULT_IDLEPRIO 1431655765 |
1377 | 1413 | ||
1378 | extern const int sched_prio_to_weight[40]; | 1414 | extern const int sched_prio_to_weight[40]; |
1379 | extern const u32 sched_prio_to_wmult[40]; | 1415 | extern const u32 sched_prio_to_wmult[40]; |
1380 | 1416 | ||
1381 | /* | 1417 | /* |
1382 | * {de,en}queue flags: | 1418 | * {de,en}queue flags: |
@@ -1398,9 +1434,9 @@ extern const u32 sched_prio_to_wmult[40]; | |||
1398 | */ | 1434 | */ |
1399 | 1435 | ||
1400 | #define DEQUEUE_SLEEP 0x01 | 1436 | #define DEQUEUE_SLEEP 0x01 |
1401 | #define DEQUEUE_SAVE 0x02 /* matches ENQUEUE_RESTORE */ | 1437 | #define DEQUEUE_SAVE 0x02 /* Matches ENQUEUE_RESTORE */ |
1402 | #define DEQUEUE_MOVE 0x04 /* matches ENQUEUE_MOVE */ | 1438 | #define DEQUEUE_MOVE 0x04 /* Matches ENQUEUE_MOVE */ |
1403 | #define DEQUEUE_NOCLOCK 0x08 /* matches ENQUEUE_NOCLOCK */ | 1439 | #define DEQUEUE_NOCLOCK 0x08 /* Matches ENQUEUE_NOCLOCK */ |
1404 | 1440 | ||
1405 | #define ENQUEUE_WAKEUP 0x01 | 1441 | #define ENQUEUE_WAKEUP 0x01 |
1406 | #define ENQUEUE_RESTORE 0x02 | 1442 | #define ENQUEUE_RESTORE 0x02 |
@@ -1422,10 +1458,10 @@ struct sched_class { | |||
1422 | 1458 | ||
1423 | void (*enqueue_task) (struct rq *rq, struct task_struct *p, int flags); | 1459 | void (*enqueue_task) (struct rq *rq, struct task_struct *p, int flags); |
1424 | void (*dequeue_task) (struct rq *rq, struct task_struct *p, int flags); | 1460 | void (*dequeue_task) (struct rq *rq, struct task_struct *p, int flags); |
1425 | void (*yield_task) (struct rq *rq); | 1461 | void (*yield_task) (struct rq *rq); |
1426 | bool (*yield_to_task) (struct rq *rq, struct task_struct *p, bool preempt); | 1462 | bool (*yield_to_task)(struct rq *rq, struct task_struct *p, bool preempt); |
1427 | 1463 | ||
1428 | void (*check_preempt_curr) (struct rq *rq, struct task_struct *p, int flags); | 1464 | void (*check_preempt_curr)(struct rq *rq, struct task_struct *p, int flags); |
1429 | 1465 | ||
1430 | /* | 1466 | /* |
1431 | * It is the responsibility of the pick_next_task() method that will | 1467 | * It is the responsibility of the pick_next_task() method that will |
@@ -1435,16 +1471,16 @@ struct sched_class { | |||
1435 | * May return RETRY_TASK when it finds a higher prio class has runnable | 1471 | * May return RETRY_TASK when it finds a higher prio class has runnable |
1436 | * tasks. | 1472 | * tasks. |
1437 | */ | 1473 | */ |
1438 | struct task_struct * (*pick_next_task) (struct rq *rq, | 1474 | struct task_struct * (*pick_next_task)(struct rq *rq, |
1439 | struct task_struct *prev, | 1475 | struct task_struct *prev, |
1440 | struct rq_flags *rf); | 1476 | struct rq_flags *rf); |
1441 | void (*put_prev_task) (struct rq *rq, struct task_struct *p); | 1477 | void (*put_prev_task)(struct rq *rq, struct task_struct *p); |
1442 | 1478 | ||
1443 | #ifdef CONFIG_SMP | 1479 | #ifdef CONFIG_SMP |
1444 | int (*select_task_rq)(struct task_struct *p, int task_cpu, int sd_flag, int flags); | 1480 | int (*select_task_rq)(struct task_struct *p, int task_cpu, int sd_flag, int flags); |
1445 | void (*migrate_task_rq)(struct task_struct *p); | 1481 | void (*migrate_task_rq)(struct task_struct *p); |
1446 | 1482 | ||
1447 | void (*task_woken) (struct rq *this_rq, struct task_struct *task); | 1483 | void (*task_woken)(struct rq *this_rq, struct task_struct *task); |
1448 | 1484 | ||
1449 | void (*set_cpus_allowed)(struct task_struct *p, | 1485 | void (*set_cpus_allowed)(struct task_struct *p, |
1450 | const struct cpumask *newmask); | 1486 | const struct cpumask *newmask); |
@@ -1453,31 +1489,31 @@ struct sched_class { | |||
1453 | void (*rq_offline)(struct rq *rq); | 1489 | void (*rq_offline)(struct rq *rq); |
1454 | #endif | 1490 | #endif |
1455 | 1491 | ||
1456 | void (*set_curr_task) (struct rq *rq); | 1492 | void (*set_curr_task)(struct rq *rq); |
1457 | void (*task_tick) (struct rq *rq, struct task_struct *p, int queued); | 1493 | void (*task_tick)(struct rq *rq, struct task_struct *p, int queued); |
1458 | void (*task_fork) (struct task_struct *p); | 1494 | void (*task_fork)(struct task_struct *p); |
1459 | void (*task_dead) (struct task_struct *p); | 1495 | void (*task_dead)(struct task_struct *p); |
1460 | 1496 | ||
1461 | /* | 1497 | /* |
1462 | * The switched_from() call is allowed to drop rq->lock, therefore we | 1498 | * The switched_from() call is allowed to drop rq->lock, therefore we |
1463 | * cannot assume the switched_from/switched_to pair is serliazed by | 1499 | * cannot assume the switched_from/switched_to pair is serliazed by |
1464 | * rq->lock. They are however serialized by p->pi_lock. | 1500 | * rq->lock. They are however serialized by p->pi_lock. |
1465 | */ | 1501 | */ |
1466 | void (*switched_from) (struct rq *this_rq, struct task_struct *task); | 1502 | void (*switched_from)(struct rq *this_rq, struct task_struct *task); |
1467 | void (*switched_to) (struct rq *this_rq, struct task_struct *task); | 1503 | void (*switched_to) (struct rq *this_rq, struct task_struct *task); |
1468 | void (*prio_changed) (struct rq *this_rq, struct task_struct *task, | 1504 | void (*prio_changed) (struct rq *this_rq, struct task_struct *task, |
1469 | int oldprio); | 1505 | int oldprio); |
1470 | 1506 | ||
1471 | unsigned int (*get_rr_interval) (struct rq *rq, | 1507 | unsigned int (*get_rr_interval)(struct rq *rq, |
1472 | struct task_struct *task); | 1508 | struct task_struct *task); |
1473 | 1509 | ||
1474 | void (*update_curr) (struct rq *rq); | 1510 | void (*update_curr)(struct rq *rq); |
1475 | 1511 | ||
1476 | #define TASK_SET_GROUP 0 | 1512 | #define TASK_SET_GROUP 0 |
1477 | #define TASK_MOVE_GROUP 1 | 1513 | #define TASK_MOVE_GROUP 1 |
1478 | 1514 | ||
1479 | #ifdef CONFIG_FAIR_GROUP_SCHED | 1515 | #ifdef CONFIG_FAIR_GROUP_SCHED |
1480 | void (*task_change_group) (struct task_struct *p, int type); | 1516 | void (*task_change_group)(struct task_struct *p, int type); |
1481 | #endif | 1517 | #endif |
1482 | }; | 1518 | }; |
1483 | 1519 | ||
@@ -1526,6 +1562,7 @@ static inline void idle_set_state(struct rq *rq, | |||
1526 | static inline struct cpuidle_state *idle_get_state(struct rq *rq) | 1562 | static inline struct cpuidle_state *idle_get_state(struct rq *rq) |
1527 | { | 1563 | { |
1528 | SCHED_WARN_ON(!rcu_read_lock_held()); | 1564 | SCHED_WARN_ON(!rcu_read_lock_held()); |
1565 | |||
1529 | return rq->idle_state; | 1566 | return rq->idle_state; |
1530 | } | 1567 | } |
1531 | #else | 1568 | #else |
@@ -1564,9 +1601,9 @@ extern void init_dl_task_timer(struct sched_dl_entity *dl_se); | |||
1564 | extern void init_dl_inactive_task_timer(struct sched_dl_entity *dl_se); | 1601 | extern void init_dl_inactive_task_timer(struct sched_dl_entity *dl_se); |
1565 | extern void init_dl_rq_bw_ratio(struct dl_rq *dl_rq); | 1602 | extern void init_dl_rq_bw_ratio(struct dl_rq *dl_rq); |
1566 | 1603 | ||
1567 | #define BW_SHIFT 20 | 1604 | #define BW_SHIFT 20 |
1568 | #define BW_UNIT (1 << BW_SHIFT) | 1605 | #define BW_UNIT (1 << BW_SHIFT) |
1569 | #define RATIO_SHIFT 8 | 1606 | #define RATIO_SHIFT 8 |
1570 | unsigned long to_ratio(u64 period, u64 runtime); | 1607 | unsigned long to_ratio(u64 period, u64 runtime); |
1571 | 1608 | ||
1572 | extern void init_entity_runnable_average(struct sched_entity *se); | 1609 | extern void init_entity_runnable_average(struct sched_entity *se); |
@@ -1574,6 +1611,7 @@ extern void post_init_entity_util_avg(struct sched_entity *se); | |||
1574 | 1611 | ||
1575 | #ifdef CONFIG_NO_HZ_FULL | 1612 | #ifdef CONFIG_NO_HZ_FULL |
1576 | extern bool sched_can_stop_tick(struct rq *rq); | 1613 | extern bool sched_can_stop_tick(struct rq *rq); |
1614 | extern int __init sched_tick_offload_init(void); | ||
1577 | 1615 | ||
1578 | /* | 1616 | /* |
1579 | * Tick may be needed by tasks in the runqueue depending on their policy and | 1617 | * Tick may be needed by tasks in the runqueue depending on their policy and |
@@ -1598,6 +1636,7 @@ static inline void sched_update_tick_dependency(struct rq *rq) | |||
1598 | tick_nohz_dep_set_cpu(cpu, TICK_DEP_BIT_SCHED); | 1636 | tick_nohz_dep_set_cpu(cpu, TICK_DEP_BIT_SCHED); |
1599 | } | 1637 | } |
1600 | #else | 1638 | #else |
1639 | static inline int sched_tick_offload_init(void) { return 0; } | ||
1601 | static inline void sched_update_tick_dependency(struct rq *rq) { } | 1640 | static inline void sched_update_tick_dependency(struct rq *rq) { } |
1602 | #endif | 1641 | #endif |
1603 | 1642 | ||
@@ -1624,13 +1663,6 @@ static inline void sub_nr_running(struct rq *rq, unsigned count) | |||
1624 | sched_update_tick_dependency(rq); | 1663 | sched_update_tick_dependency(rq); |
1625 | } | 1664 | } |
1626 | 1665 | ||
1627 | static inline void rq_last_tick_reset(struct rq *rq) | ||
1628 | { | ||
1629 | #ifdef CONFIG_NO_HZ_FULL | ||
1630 | rq->last_sched_tick = jiffies; | ||
1631 | #endif | ||
1632 | } | ||
1633 | |||
1634 | extern void update_rq_clock(struct rq *rq); | 1666 | extern void update_rq_clock(struct rq *rq); |
1635 | 1667 | ||
1636 | extern void activate_task(struct rq *rq, struct task_struct *p, int flags); | 1668 | extern void activate_task(struct rq *rq, struct task_struct *p, int flags); |
@@ -1821,8 +1853,8 @@ static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest) | |||
1821 | /* | 1853 | /* |
1822 | * Unfair double_lock_balance: Optimizes throughput at the expense of | 1854 | * Unfair double_lock_balance: Optimizes throughput at the expense of |
1823 | * latency by eliminating extra atomic operations when the locks are | 1855 | * latency by eliminating extra atomic operations when the locks are |
1824 | * already in proper order on entry. This favors lower cpu-ids and will | 1856 | * already in proper order on entry. This favors lower CPU-ids and will |
1825 | * grant the double lock to lower cpus over higher ids under contention, | 1857 | * grant the double lock to lower CPUs over higher ids under contention, |
1826 | * regardless of entry order into the function. | 1858 | * regardless of entry order into the function. |
1827 | */ | 1859 | */ |
1828 | static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest) | 1860 | static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest) |
@@ -1854,7 +1886,7 @@ static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest) | |||
1854 | static inline int double_lock_balance(struct rq *this_rq, struct rq *busiest) | 1886 | static inline int double_lock_balance(struct rq *this_rq, struct rq *busiest) |
1855 | { | 1887 | { |
1856 | if (unlikely(!irqs_disabled())) { | 1888 | if (unlikely(!irqs_disabled())) { |
1857 | /* printk() doesn't work good under rq->lock */ | 1889 | /* printk() doesn't work well under rq->lock */ |
1858 | raw_spin_unlock(&this_rq->lock); | 1890 | raw_spin_unlock(&this_rq->lock); |
1859 | BUG_ON(1); | 1891 | BUG_ON(1); |
1860 | } | 1892 | } |
@@ -2005,16 +2037,19 @@ extern void cfs_bandwidth_usage_inc(void); | |||
2005 | extern void cfs_bandwidth_usage_dec(void); | 2037 | extern void cfs_bandwidth_usage_dec(void); |
2006 | 2038 | ||
2007 | #ifdef CONFIG_NO_HZ_COMMON | 2039 | #ifdef CONFIG_NO_HZ_COMMON |
2008 | enum rq_nohz_flag_bits { | 2040 | #define NOHZ_BALANCE_KICK_BIT 0 |
2009 | NOHZ_TICK_STOPPED, | 2041 | #define NOHZ_STATS_KICK_BIT 1 |
2010 | NOHZ_BALANCE_KICK, | 2042 | |
2011 | }; | 2043 | #define NOHZ_BALANCE_KICK BIT(NOHZ_BALANCE_KICK_BIT) |
2044 | #define NOHZ_STATS_KICK BIT(NOHZ_STATS_KICK_BIT) | ||
2045 | |||
2046 | #define NOHZ_KICK_MASK (NOHZ_BALANCE_KICK | NOHZ_STATS_KICK) | ||
2012 | 2047 | ||
2013 | #define nohz_flags(cpu) (&cpu_rq(cpu)->nohz_flags) | 2048 | #define nohz_flags(cpu) (&cpu_rq(cpu)->nohz_flags) |
2014 | 2049 | ||
2015 | extern void nohz_balance_exit_idle(unsigned int cpu); | 2050 | extern void nohz_balance_exit_idle(struct rq *rq); |
2016 | #else | 2051 | #else |
2017 | static inline void nohz_balance_exit_idle(unsigned int cpu) { } | 2052 | static inline void nohz_balance_exit_idle(struct rq *rq) { } |
2018 | #endif | 2053 | #endif |
2019 | 2054 | ||
2020 | 2055 | ||
@@ -2113,15 +2148,14 @@ static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) {} | |||
2113 | #endif /* CONFIG_CPU_FREQ */ | 2148 | #endif /* CONFIG_CPU_FREQ */ |
2114 | 2149 | ||
2115 | #ifdef arch_scale_freq_capacity | 2150 | #ifdef arch_scale_freq_capacity |
2116 | #ifndef arch_scale_freq_invariant | 2151 | # ifndef arch_scale_freq_invariant |
2117 | #define arch_scale_freq_invariant() (true) | 2152 | # define arch_scale_freq_invariant() true |
2118 | #endif | 2153 | # endif |
2119 | #else /* arch_scale_freq_capacity */ | 2154 | #else |
2120 | #define arch_scale_freq_invariant() (false) | 2155 | # define arch_scale_freq_invariant() false |
2121 | #endif | 2156 | #endif |
2122 | 2157 | ||
2123 | #ifdef CONFIG_CPU_FREQ_GOV_SCHEDUTIL | 2158 | #ifdef CONFIG_CPU_FREQ_GOV_SCHEDUTIL |
2124 | |||
2125 | static inline unsigned long cpu_util_dl(struct rq *rq) | 2159 | static inline unsigned long cpu_util_dl(struct rq *rq) |
2126 | { | 2160 | { |
2127 | return (rq->dl.running_bw * SCHED_CAPACITY_SCALE) >> BW_SHIFT; | 2161 | return (rq->dl.running_bw * SCHED_CAPACITY_SCALE) >> BW_SHIFT; |
@@ -2129,7 +2163,13 @@ static inline unsigned long cpu_util_dl(struct rq *rq) | |||
2129 | 2163 | ||
2130 | static inline unsigned long cpu_util_cfs(struct rq *rq) | 2164 | static inline unsigned long cpu_util_cfs(struct rq *rq) |
2131 | { | 2165 | { |
2132 | return rq->cfs.avg.util_avg; | 2166 | unsigned long util = READ_ONCE(rq->cfs.avg.util_avg); |
2133 | } | 2167 | |
2168 | if (sched_feat(UTIL_EST)) { | ||
2169 | util = max_t(unsigned long, util, | ||
2170 | READ_ONCE(rq->cfs.avg.util_est.enqueued)); | ||
2171 | } | ||
2134 | 2172 | ||
2173 | return util; | ||
2174 | } | ||
2135 | #endif | 2175 | #endif |
diff --git a/kernel/sched/stats.c b/kernel/sched/stats.c index 940b1fa1d2ce..ab112cbfd7c8 100644 --- a/kernel/sched/stats.c +++ b/kernel/sched/stats.c | |||
@@ -1,14 +1,13 @@ | |||
1 | // SPDX-License-Identifier: GPL-2.0 | 1 | // SPDX-License-Identifier: GPL-2.0 |
2 | 2 | /* | |
3 | #include <linux/slab.h> | 3 | * /proc/schedstat implementation |
4 | #include <linux/fs.h> | 4 | */ |
5 | #include <linux/seq_file.h> | ||
6 | #include <linux/proc_fs.h> | ||
7 | |||
8 | #include "sched.h" | 5 | #include "sched.h" |
9 | 6 | ||
10 | /* | 7 | /* |
11 | * bump this up when changing the output format or the meaning of an existing | 8 | * Current schedstat API version. |
9 | * | ||
10 | * Bump this up when changing the output format or the meaning of an existing | ||
12 | * format, so that tools can adapt (or abort) | 11 | * format, so that tools can adapt (or abort) |
13 | */ | 12 | */ |
14 | #define SCHEDSTAT_VERSION 15 | 13 | #define SCHEDSTAT_VERSION 15 |
@@ -78,8 +77,8 @@ static int show_schedstat(struct seq_file *seq, void *v) | |||
78 | * This itererator needs some explanation. | 77 | * This itererator needs some explanation. |
79 | * It returns 1 for the header position. | 78 | * It returns 1 for the header position. |
80 | * This means 2 is cpu 0. | 79 | * This means 2 is cpu 0. |
81 | * In a hotplugged system some cpus, including cpu 0, may be missing so we have | 80 | * In a hotplugged system some CPUs, including cpu 0, may be missing so we have |
82 | * to use cpumask_* to iterate over the cpus. | 81 | * to use cpumask_* to iterate over the CPUs. |
83 | */ | 82 | */ |
84 | static void *schedstat_start(struct seq_file *file, loff_t *offset) | 83 | static void *schedstat_start(struct seq_file *file, loff_t *offset) |
85 | { | 84 | { |
@@ -99,12 +98,14 @@ static void *schedstat_start(struct seq_file *file, loff_t *offset) | |||
99 | 98 | ||
100 | if (n < nr_cpu_ids) | 99 | if (n < nr_cpu_ids) |
101 | return (void *)(unsigned long)(n + 2); | 100 | return (void *)(unsigned long)(n + 2); |
101 | |||
102 | return NULL; | 102 | return NULL; |
103 | } | 103 | } |
104 | 104 | ||
105 | static void *schedstat_next(struct seq_file *file, void *data, loff_t *offset) | 105 | static void *schedstat_next(struct seq_file *file, void *data, loff_t *offset) |
106 | { | 106 | { |
107 | (*offset)++; | 107 | (*offset)++; |
108 | |||
108 | return schedstat_start(file, offset); | 109 | return schedstat_start(file, offset); |
109 | } | 110 | } |
110 | 111 | ||
@@ -134,6 +135,7 @@ static const struct file_operations proc_schedstat_operations = { | |||
134 | static int __init proc_schedstat_init(void) | 135 | static int __init proc_schedstat_init(void) |
135 | { | 136 | { |
136 | proc_create("schedstat", 0, NULL, &proc_schedstat_operations); | 137 | proc_create("schedstat", 0, NULL, &proc_schedstat_operations); |
138 | |||
137 | return 0; | 139 | return 0; |
138 | } | 140 | } |
139 | subsys_initcall(proc_schedstat_init); | 141 | subsys_initcall(proc_schedstat_init); |
diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h index 8e7b58de61e7..8aea199a39b4 100644 --- a/kernel/sched/stats.h +++ b/kernel/sched/stats.h | |||
@@ -30,35 +30,29 @@ rq_sched_info_dequeued(struct rq *rq, unsigned long long delta) | |||
30 | if (rq) | 30 | if (rq) |
31 | rq->rq_sched_info.run_delay += delta; | 31 | rq->rq_sched_info.run_delay += delta; |
32 | } | 32 | } |
33 | #define schedstat_enabled() static_branch_unlikely(&sched_schedstats) | 33 | #define schedstat_enabled() static_branch_unlikely(&sched_schedstats) |
34 | #define __schedstat_inc(var) do { var++; } while (0) | 34 | #define __schedstat_inc(var) do { var++; } while (0) |
35 | #define schedstat_inc(var) do { if (schedstat_enabled()) { var++; } } while (0) | 35 | #define schedstat_inc(var) do { if (schedstat_enabled()) { var++; } } while (0) |
36 | #define __schedstat_add(var, amt) do { var += (amt); } while (0) | 36 | #define __schedstat_add(var, amt) do { var += (amt); } while (0) |
37 | #define schedstat_add(var, amt) do { if (schedstat_enabled()) { var += (amt); } } while (0) | 37 | #define schedstat_add(var, amt) do { if (schedstat_enabled()) { var += (amt); } } while (0) |
38 | #define __schedstat_set(var, val) do { var = (val); } while (0) | 38 | #define __schedstat_set(var, val) do { var = (val); } while (0) |
39 | #define schedstat_set(var, val) do { if (schedstat_enabled()) { var = (val); } } while (0) | 39 | #define schedstat_set(var, val) do { if (schedstat_enabled()) { var = (val); } } while (0) |
40 | #define schedstat_val(var) (var) | 40 | #define schedstat_val(var) (var) |
41 | #define schedstat_val_or_zero(var) ((schedstat_enabled()) ? (var) : 0) | 41 | #define schedstat_val_or_zero(var) ((schedstat_enabled()) ? (var) : 0) |
42 | 42 | ||
43 | #else /* !CONFIG_SCHEDSTATS */ | 43 | #else /* !CONFIG_SCHEDSTATS: */ |
44 | static inline void | 44 | static inline void rq_sched_info_arrive (struct rq *rq, unsigned long long delta) { } |
45 | rq_sched_info_arrive(struct rq *rq, unsigned long long delta) | 45 | static inline void rq_sched_info_dequeued(struct rq *rq, unsigned long long delta) { } |
46 | {} | 46 | static inline void rq_sched_info_depart (struct rq *rq, unsigned long long delta) { } |
47 | static inline void | 47 | # define schedstat_enabled() 0 |
48 | rq_sched_info_dequeued(struct rq *rq, unsigned long long delta) | 48 | # define __schedstat_inc(var) do { } while (0) |
49 | {} | 49 | # define schedstat_inc(var) do { } while (0) |
50 | static inline void | 50 | # define __schedstat_add(var, amt) do { } while (0) |
51 | rq_sched_info_depart(struct rq *rq, unsigned long long delta) | 51 | # define schedstat_add(var, amt) do { } while (0) |
52 | {} | 52 | # define __schedstat_set(var, val) do { } while (0) |
53 | #define schedstat_enabled() 0 | 53 | # define schedstat_set(var, val) do { } while (0) |
54 | #define __schedstat_inc(var) do { } while (0) | 54 | # define schedstat_val(var) 0 |
55 | #define schedstat_inc(var) do { } while (0) | 55 | # define schedstat_val_or_zero(var) 0 |
56 | #define __schedstat_add(var, amt) do { } while (0) | ||
57 | #define schedstat_add(var, amt) do { } while (0) | ||
58 | #define __schedstat_set(var, val) do { } while (0) | ||
59 | #define schedstat_set(var, val) do { } while (0) | ||
60 | #define schedstat_val(var) 0 | ||
61 | #define schedstat_val_or_zero(var) 0 | ||
62 | #endif /* CONFIG_SCHEDSTATS */ | 56 | #endif /* CONFIG_SCHEDSTATS */ |
63 | 57 | ||
64 | #ifdef CONFIG_SCHED_INFO | 58 | #ifdef CONFIG_SCHED_INFO |
@@ -69,9 +63,9 @@ static inline void sched_info_reset_dequeued(struct task_struct *t) | |||
69 | 63 | ||
70 | /* | 64 | /* |
71 | * We are interested in knowing how long it was from the *first* time a | 65 | * We are interested in knowing how long it was from the *first* time a |
72 | * task was queued to the time that it finally hit a cpu, we call this routine | 66 | * task was queued to the time that it finally hit a CPU, we call this routine |
73 | * from dequeue_task() to account for possible rq->clock skew across cpus. The | 67 | * from dequeue_task() to account for possible rq->clock skew across CPUs. The |
74 | * delta taken on each cpu would annul the skew. | 68 | * delta taken on each CPU would annul the skew. |
75 | */ | 69 | */ |
76 | static inline void sched_info_dequeued(struct rq *rq, struct task_struct *t) | 70 | static inline void sched_info_dequeued(struct rq *rq, struct task_struct *t) |
77 | { | 71 | { |
@@ -87,7 +81,7 @@ static inline void sched_info_dequeued(struct rq *rq, struct task_struct *t) | |||
87 | } | 81 | } |
88 | 82 | ||
89 | /* | 83 | /* |
90 | * Called when a task finally hits the cpu. We can now calculate how | 84 | * Called when a task finally hits the CPU. We can now calculate how |
91 | * long it was waiting to run. We also note when it began so that we | 85 | * long it was waiting to run. We also note when it began so that we |
92 | * can keep stats on how long its timeslice is. | 86 | * can keep stats on how long its timeslice is. |
93 | */ | 87 | */ |
@@ -112,9 +106,10 @@ static void sched_info_arrive(struct rq *rq, struct task_struct *t) | |||
112 | */ | 106 | */ |
113 | static inline void sched_info_queued(struct rq *rq, struct task_struct *t) | 107 | static inline void sched_info_queued(struct rq *rq, struct task_struct *t) |
114 | { | 108 | { |
115 | if (unlikely(sched_info_on())) | 109 | if (unlikely(sched_info_on())) { |
116 | if (!t->sched_info.last_queued) | 110 | if (!t->sched_info.last_queued) |
117 | t->sched_info.last_queued = rq_clock(rq); | 111 | t->sched_info.last_queued = rq_clock(rq); |
112 | } | ||
118 | } | 113 | } |
119 | 114 | ||
120 | /* | 115 | /* |
@@ -127,8 +122,7 @@ static inline void sched_info_queued(struct rq *rq, struct task_struct *t) | |||
127 | */ | 122 | */ |
128 | static inline void sched_info_depart(struct rq *rq, struct task_struct *t) | 123 | static inline void sched_info_depart(struct rq *rq, struct task_struct *t) |
129 | { | 124 | { |
130 | unsigned long long delta = rq_clock(rq) - | 125 | unsigned long long delta = rq_clock(rq) - t->sched_info.last_arrival; |
131 | t->sched_info.last_arrival; | ||
132 | 126 | ||
133 | rq_sched_info_depart(rq, delta); | 127 | rq_sched_info_depart(rq, delta); |
134 | 128 | ||
@@ -142,11 +136,10 @@ static inline void sched_info_depart(struct rq *rq, struct task_struct *t) | |||
142 | * the idle task.) We are only called when prev != next. | 136 | * the idle task.) We are only called when prev != next. |
143 | */ | 137 | */ |
144 | static inline void | 138 | static inline void |
145 | __sched_info_switch(struct rq *rq, | 139 | __sched_info_switch(struct rq *rq, struct task_struct *prev, struct task_struct *next) |
146 | struct task_struct *prev, struct task_struct *next) | ||
147 | { | 140 | { |
148 | /* | 141 | /* |
149 | * prev now departs the cpu. It's not interesting to record | 142 | * prev now departs the CPU. It's not interesting to record |
150 | * stats about how efficient we were at scheduling the idle | 143 | * stats about how efficient we were at scheduling the idle |
151 | * process, however. | 144 | * process, however. |
152 | */ | 145 | */ |
@@ -156,18 +149,19 @@ __sched_info_switch(struct rq *rq, | |||
156 | if (next != rq->idle) | 149 | if (next != rq->idle) |
157 | sched_info_arrive(rq, next); | 150 | sched_info_arrive(rq, next); |
158 | } | 151 | } |
152 | |||
159 | static inline void | 153 | static inline void |
160 | sched_info_switch(struct rq *rq, | 154 | sched_info_switch(struct rq *rq, struct task_struct *prev, struct task_struct *next) |
161 | struct task_struct *prev, struct task_struct *next) | ||
162 | { | 155 | { |
163 | if (unlikely(sched_info_on())) | 156 | if (unlikely(sched_info_on())) |
164 | __sched_info_switch(rq, prev, next); | 157 | __sched_info_switch(rq, prev, next); |
165 | } | 158 | } |
166 | #else | 159 | |
167 | #define sched_info_queued(rq, t) do { } while (0) | 160 | #else /* !CONFIG_SCHED_INFO: */ |
168 | #define sched_info_reset_dequeued(t) do { } while (0) | 161 | # define sched_info_queued(rq, t) do { } while (0) |
169 | #define sched_info_dequeued(rq, t) do { } while (0) | 162 | # define sched_info_reset_dequeued(t) do { } while (0) |
170 | #define sched_info_depart(rq, t) do { } while (0) | 163 | # define sched_info_dequeued(rq, t) do { } while (0) |
171 | #define sched_info_arrive(rq, next) do { } while (0) | 164 | # define sched_info_depart(rq, t) do { } while (0) |
172 | #define sched_info_switch(rq, t, next) do { } while (0) | 165 | # define sched_info_arrive(rq, next) do { } while (0) |
166 | # define sched_info_switch(rq, t, next) do { } while (0) | ||
173 | #endif /* CONFIG_SCHED_INFO */ | 167 | #endif /* CONFIG_SCHED_INFO */ |
diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c index 210b1f2146ff..c183b790ca54 100644 --- a/kernel/sched/stop_task.c +++ b/kernel/sched/stop_task.c | |||
@@ -1,6 +1,4 @@ | |||
1 | // SPDX-License-Identifier: GPL-2.0 | 1 | // SPDX-License-Identifier: GPL-2.0 |
2 | #include "sched.h" | ||
3 | |||
4 | /* | 2 | /* |
5 | * stop-task scheduling class. | 3 | * stop-task scheduling class. |
6 | * | 4 | * |
@@ -9,6 +7,7 @@ | |||
9 | * | 7 | * |
10 | * See kernel/stop_machine.c | 8 | * See kernel/stop_machine.c |
11 | */ | 9 | */ |
10 | #include "sched.h" | ||
12 | 11 | ||
13 | #ifdef CONFIG_SMP | 12 | #ifdef CONFIG_SMP |
14 | static int | 13 | static int |
@@ -75,6 +74,14 @@ static void put_prev_task_stop(struct rq *rq, struct task_struct *prev) | |||
75 | cgroup_account_cputime(curr, delta_exec); | 74 | cgroup_account_cputime(curr, delta_exec); |
76 | } | 75 | } |
77 | 76 | ||
77 | /* | ||
78 | * scheduler tick hitting a task of our scheduling class. | ||
79 | * | ||
80 | * NOTE: This function can be called remotely by the tick offload that | ||
81 | * goes along full dynticks. Therefore no local assumption can be made | ||
82 | * and everything must be accessed through the @rq and @curr passed in | ||
83 | * parameters. | ||
84 | */ | ||
78 | static void task_tick_stop(struct rq *rq, struct task_struct *curr, int queued) | 85 | static void task_tick_stop(struct rq *rq, struct task_struct *curr, int queued) |
79 | { | 86 | { |
80 | } | 87 | } |
diff --git a/kernel/sched/swait.c b/kernel/sched/swait.c index 9ff1555341ed..b6fb2c3b3ff7 100644 --- a/kernel/sched/swait.c +++ b/kernel/sched/swait.c | |||
@@ -1,6 +1,8 @@ | |||
1 | // SPDX-License-Identifier: GPL-2.0 | 1 | // SPDX-License-Identifier: GPL-2.0 |
2 | #include <linux/sched/signal.h> | 2 | /* |
3 | #include <linux/swait.h> | 3 | * <linux/swait.h> (simple wait queues ) implementation: |
4 | */ | ||
5 | #include "sched.h" | ||
4 | 6 | ||
5 | void __init_swait_queue_head(struct swait_queue_head *q, const char *name, | 7 | void __init_swait_queue_head(struct swait_queue_head *q, const char *name, |
6 | struct lock_class_key *key) | 8 | struct lock_class_key *key) |
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index 519b024f4e94..64cc564f5255 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c | |||
@@ -2,10 +2,6 @@ | |||
2 | /* | 2 | /* |
3 | * Scheduler topology setup/handling methods | 3 | * Scheduler topology setup/handling methods |
4 | */ | 4 | */ |
5 | #include <linux/sched.h> | ||
6 | #include <linux/mutex.h> | ||
7 | #include <linux/sched/isolation.h> | ||
8 | |||
9 | #include "sched.h" | 5 | #include "sched.h" |
10 | 6 | ||
11 | DEFINE_MUTEX(sched_domains_mutex); | 7 | DEFINE_MUTEX(sched_domains_mutex); |
@@ -41,8 +37,7 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, | |||
41 | if (!(sd->flags & SD_LOAD_BALANCE)) { | 37 | if (!(sd->flags & SD_LOAD_BALANCE)) { |
42 | printk("does not load-balance\n"); | 38 | printk("does not load-balance\n"); |
43 | if (sd->parent) | 39 | if (sd->parent) |
44 | printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain" | 40 | printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain has parent"); |
45 | " has parent"); | ||
46 | return -1; | 41 | return -1; |
47 | } | 42 | } |
48 | 43 | ||
@@ -50,12 +45,10 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, | |||
50 | cpumask_pr_args(sched_domain_span(sd)), sd->name); | 45 | cpumask_pr_args(sched_domain_span(sd)), sd->name); |
51 | 46 | ||
52 | if (!cpumask_test_cpu(cpu, sched_domain_span(sd))) { | 47 | if (!cpumask_test_cpu(cpu, sched_domain_span(sd))) { |
53 | printk(KERN_ERR "ERROR: domain->span does not contain " | 48 | printk(KERN_ERR "ERROR: domain->span does not contain CPU%d\n", cpu); |
54 | "CPU%d\n", cpu); | ||
55 | } | 49 | } |
56 | if (!cpumask_test_cpu(cpu, sched_group_span(group))) { | 50 | if (!cpumask_test_cpu(cpu, sched_group_span(group))) { |
57 | printk(KERN_ERR "ERROR: domain->groups does not contain" | 51 | printk(KERN_ERR "ERROR: domain->groups does not contain CPU%d\n", cpu); |
58 | " CPU%d\n", cpu); | ||
59 | } | 52 | } |
60 | 53 | ||
61 | printk(KERN_DEBUG "%*s groups:", level + 1, ""); | 54 | printk(KERN_DEBUG "%*s groups:", level + 1, ""); |
@@ -115,8 +108,7 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, | |||
115 | 108 | ||
116 | if (sd->parent && | 109 | if (sd->parent && |
117 | !cpumask_subset(groupmask, sched_domain_span(sd->parent))) | 110 | !cpumask_subset(groupmask, sched_domain_span(sd->parent))) |
118 | printk(KERN_ERR "ERROR: parent span is not a superset " | 111 | printk(KERN_ERR "ERROR: parent span is not a superset of domain->span\n"); |
119 | "of domain->span\n"); | ||
120 | return 0; | 112 | return 0; |
121 | } | 113 | } |
122 | 114 | ||
@@ -595,7 +587,7 @@ int group_balance_cpu(struct sched_group *sg) | |||
595 | * are not. | 587 | * are not. |
596 | * | 588 | * |
597 | * This leads to a few particularly weird cases where the sched_domain's are | 589 | * This leads to a few particularly weird cases where the sched_domain's are |
598 | * not of the same number for each cpu. Consider: | 590 | * not of the same number for each CPU. Consider: |
599 | * | 591 | * |
600 | * NUMA-2 0-3 0-3 | 592 | * NUMA-2 0-3 0-3 |
601 | * groups: {0-2},{1-3} {1-3},{0-2} | 593 | * groups: {0-2},{1-3} {1-3},{0-2} |
@@ -780,7 +772,7 @@ fail: | |||
780 | * ^ ^ ^ ^ | 772 | * ^ ^ ^ ^ |
781 | * `-' `-' | 773 | * `-' `-' |
782 | * | 774 | * |
783 | * The sched_domains are per-cpu and have a two way link (parent & child) and | 775 | * The sched_domains are per-CPU and have a two way link (parent & child) and |
784 | * denote the ever growing mask of CPUs belonging to that level of topology. | 776 | * denote the ever growing mask of CPUs belonging to that level of topology. |
785 | * | 777 | * |
786 | * Each sched_domain has a circular (double) linked list of sched_group's, each | 778 | * Each sched_domain has a circular (double) linked list of sched_group's, each |
@@ -1021,6 +1013,7 @@ __visit_domain_allocation_hell(struct s_data *d, const struct cpumask *cpu_map) | |||
1021 | d->rd = alloc_rootdomain(); | 1013 | d->rd = alloc_rootdomain(); |
1022 | if (!d->rd) | 1014 | if (!d->rd) |
1023 | return sa_sd; | 1015 | return sa_sd; |
1016 | |||
1024 | return sa_rootdomain; | 1017 | return sa_rootdomain; |
1025 | } | 1018 | } |
1026 | 1019 | ||
@@ -1047,12 +1040,14 @@ static void claim_allocations(int cpu, struct sched_domain *sd) | |||
1047 | } | 1040 | } |
1048 | 1041 | ||
1049 | #ifdef CONFIG_NUMA | 1042 | #ifdef CONFIG_NUMA |
1050 | static int sched_domains_numa_levels; | ||
1051 | enum numa_topology_type sched_numa_topology_type; | 1043 | enum numa_topology_type sched_numa_topology_type; |
1052 | static int *sched_domains_numa_distance; | 1044 | |
1053 | int sched_max_numa_distance; | 1045 | static int sched_domains_numa_levels; |
1054 | static struct cpumask ***sched_domains_numa_masks; | 1046 | static int sched_domains_curr_level; |
1055 | static int sched_domains_curr_level; | 1047 | |
1048 | int sched_max_numa_distance; | ||
1049 | static int *sched_domains_numa_distance; | ||
1050 | static struct cpumask ***sched_domains_numa_masks; | ||
1056 | #endif | 1051 | #endif |
1057 | 1052 | ||
1058 | /* | 1053 | /* |
@@ -1074,11 +1069,11 @@ static int sched_domains_curr_level; | |||
1074 | * SD_ASYM_PACKING - describes SMT quirks | 1069 | * SD_ASYM_PACKING - describes SMT quirks |
1075 | */ | 1070 | */ |
1076 | #define TOPOLOGY_SD_FLAGS \ | 1071 | #define TOPOLOGY_SD_FLAGS \ |
1077 | (SD_SHARE_CPUCAPACITY | \ | 1072 | (SD_SHARE_CPUCAPACITY | \ |
1078 | SD_SHARE_PKG_RESOURCES | \ | 1073 | SD_SHARE_PKG_RESOURCES | \ |
1079 | SD_NUMA | \ | 1074 | SD_NUMA | \ |
1080 | SD_ASYM_PACKING | \ | 1075 | SD_ASYM_PACKING | \ |
1081 | SD_ASYM_CPUCAPACITY | \ | 1076 | SD_ASYM_CPUCAPACITY | \ |
1082 | SD_SHARE_POWERDOMAIN) | 1077 | SD_SHARE_POWERDOMAIN) |
1083 | 1078 | ||
1084 | static struct sched_domain * | 1079 | static struct sched_domain * |
@@ -1628,7 +1623,7 @@ static struct sched_domain *build_sched_domain(struct sched_domain_topology_leve | |||
1628 | pr_err(" the %s domain not a subset of the %s domain\n", | 1623 | pr_err(" the %s domain not a subset of the %s domain\n", |
1629 | child->name, sd->name); | 1624 | child->name, sd->name); |
1630 | #endif | 1625 | #endif |
1631 | /* Fixup, ensure @sd has at least @child cpus. */ | 1626 | /* Fixup, ensure @sd has at least @child CPUs. */ |
1632 | cpumask_or(sched_domain_span(sd), | 1627 | cpumask_or(sched_domain_span(sd), |
1633 | sched_domain_span(sd), | 1628 | sched_domain_span(sd), |
1634 | sched_domain_span(child)); | 1629 | sched_domain_span(child)); |
@@ -1720,6 +1715,7 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att | |||
1720 | ret = 0; | 1715 | ret = 0; |
1721 | error: | 1716 | error: |
1722 | __free_domain_allocs(&d, alloc_state, cpu_map); | 1717 | __free_domain_allocs(&d, alloc_state, cpu_map); |
1718 | |||
1723 | return ret; | 1719 | return ret; |
1724 | } | 1720 | } |
1725 | 1721 | ||
@@ -1824,6 +1820,7 @@ static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur, | |||
1824 | return 1; | 1820 | return 1; |
1825 | 1821 | ||
1826 | tmp = SD_ATTR_INIT; | 1822 | tmp = SD_ATTR_INIT; |
1823 | |||
1827 | return !memcmp(cur ? (cur + idx_cur) : &tmp, | 1824 | return !memcmp(cur ? (cur + idx_cur) : &tmp, |
1828 | new ? (new + idx_new) : &tmp, | 1825 | new ? (new + idx_new) : &tmp, |
1829 | sizeof(struct sched_domain_attr)); | 1826 | sizeof(struct sched_domain_attr)); |
@@ -1929,4 +1926,3 @@ match2: | |||
1929 | 1926 | ||
1930 | mutex_unlock(&sched_domains_mutex); | 1927 | mutex_unlock(&sched_domains_mutex); |
1931 | } | 1928 | } |
1932 | |||
diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c index 929ecb7d6b78..928be527477e 100644 --- a/kernel/sched/wait.c +++ b/kernel/sched/wait.c | |||
@@ -3,14 +3,7 @@ | |||
3 | * | 3 | * |
4 | * (C) 2004 Nadia Yvette Chambers, Oracle | 4 | * (C) 2004 Nadia Yvette Chambers, Oracle |
5 | */ | 5 | */ |
6 | #include <linux/init.h> | 6 | #include "sched.h" |
7 | #include <linux/export.h> | ||
8 | #include <linux/sched/signal.h> | ||
9 | #include <linux/sched/debug.h> | ||
10 | #include <linux/mm.h> | ||
11 | #include <linux/wait.h> | ||
12 | #include <linux/hash.h> | ||
13 | #include <linux/kthread.h> | ||
14 | 7 | ||
15 | void __init_waitqueue_head(struct wait_queue_head *wq_head, const char *name, struct lock_class_key *key) | 8 | void __init_waitqueue_head(struct wait_queue_head *wq_head, const char *name, struct lock_class_key *key) |
16 | { | 9 | { |
@@ -107,6 +100,7 @@ static int __wake_up_common(struct wait_queue_head *wq_head, unsigned int mode, | |||
107 | break; | 100 | break; |
108 | } | 101 | } |
109 | } | 102 | } |
103 | |||
110 | return nr_exclusive; | 104 | return nr_exclusive; |
111 | } | 105 | } |
112 | 106 | ||
@@ -317,6 +311,7 @@ int do_wait_intr(wait_queue_head_t *wq, wait_queue_entry_t *wait) | |||
317 | spin_unlock(&wq->lock); | 311 | spin_unlock(&wq->lock); |
318 | schedule(); | 312 | schedule(); |
319 | spin_lock(&wq->lock); | 313 | spin_lock(&wq->lock); |
314 | |||
320 | return 0; | 315 | return 0; |
321 | } | 316 | } |
322 | EXPORT_SYMBOL(do_wait_intr); | 317 | EXPORT_SYMBOL(do_wait_intr); |
@@ -333,6 +328,7 @@ int do_wait_intr_irq(wait_queue_head_t *wq, wait_queue_entry_t *wait) | |||
333 | spin_unlock_irq(&wq->lock); | 328 | spin_unlock_irq(&wq->lock); |
334 | schedule(); | 329 | schedule(); |
335 | spin_lock_irq(&wq->lock); | 330 | spin_lock_irq(&wq->lock); |
331 | |||
336 | return 0; | 332 | return 0; |
337 | } | 333 | } |
338 | EXPORT_SYMBOL(do_wait_intr_irq); | 334 | EXPORT_SYMBOL(do_wait_intr_irq); |
@@ -378,6 +374,7 @@ int autoremove_wake_function(struct wait_queue_entry *wq_entry, unsigned mode, i | |||
378 | 374 | ||
379 | if (ret) | 375 | if (ret) |
380 | list_del_init(&wq_entry->entry); | 376 | list_del_init(&wq_entry->entry); |
377 | |||
381 | return ret; | 378 | return ret; |
382 | } | 379 | } |
383 | EXPORT_SYMBOL(autoremove_wake_function); | 380 | EXPORT_SYMBOL(autoremove_wake_function); |
diff --git a/kernel/sched/wait_bit.c b/kernel/sched/wait_bit.c index 84cb3acd9260..c67c6d24adc2 100644 --- a/kernel/sched/wait_bit.c +++ b/kernel/sched/wait_bit.c | |||
@@ -1,10 +1,7 @@ | |||
1 | /* | 1 | /* |
2 | * The implementation of the wait_bit*() and related waiting APIs: | 2 | * The implementation of the wait_bit*() and related waiting APIs: |
3 | */ | 3 | */ |
4 | #include <linux/wait_bit.h> | 4 | #include "sched.h" |
5 | #include <linux/sched/signal.h> | ||
6 | #include <linux/sched/debug.h> | ||
7 | #include <linux/hash.h> | ||
8 | 5 | ||
9 | #define WAIT_TABLE_BITS 8 | 6 | #define WAIT_TABLE_BITS 8 |
10 | #define WAIT_TABLE_SIZE (1 << WAIT_TABLE_BITS) | 7 | #define WAIT_TABLE_SIZE (1 << WAIT_TABLE_BITS) |
@@ -29,8 +26,8 @@ int wake_bit_function(struct wait_queue_entry *wq_entry, unsigned mode, int sync | |||
29 | wait_bit->key.bit_nr != key->bit_nr || | 26 | wait_bit->key.bit_nr != key->bit_nr || |
30 | test_bit(key->bit_nr, key->flags)) | 27 | test_bit(key->bit_nr, key->flags)) |
31 | return 0; | 28 | return 0; |
32 | else | 29 | |
33 | return autoremove_wake_function(wq_entry, mode, sync, key); | 30 | return autoremove_wake_function(wq_entry, mode, sync, key); |
34 | } | 31 | } |
35 | EXPORT_SYMBOL(wake_bit_function); | 32 | EXPORT_SYMBOL(wake_bit_function); |
36 | 33 | ||
@@ -50,7 +47,9 @@ __wait_on_bit(struct wait_queue_head *wq_head, struct wait_bit_queue_entry *wbq_ | |||
50 | if (test_bit(wbq_entry->key.bit_nr, wbq_entry->key.flags)) | 47 | if (test_bit(wbq_entry->key.bit_nr, wbq_entry->key.flags)) |
51 | ret = (*action)(&wbq_entry->key, mode); | 48 | ret = (*action)(&wbq_entry->key, mode); |
52 | } while (test_bit(wbq_entry->key.bit_nr, wbq_entry->key.flags) && !ret); | 49 | } while (test_bit(wbq_entry->key.bit_nr, wbq_entry->key.flags) && !ret); |
50 | |||
53 | finish_wait(wq_head, &wbq_entry->wq_entry); | 51 | finish_wait(wq_head, &wbq_entry->wq_entry); |
52 | |||
54 | return ret; | 53 | return ret; |
55 | } | 54 | } |
56 | EXPORT_SYMBOL(__wait_on_bit); | 55 | EXPORT_SYMBOL(__wait_on_bit); |
@@ -73,6 +72,7 @@ int __sched out_of_line_wait_on_bit_timeout( | |||
73 | DEFINE_WAIT_BIT(wq_entry, word, bit); | 72 | DEFINE_WAIT_BIT(wq_entry, word, bit); |
74 | 73 | ||
75 | wq_entry.key.timeout = jiffies + timeout; | 74 | wq_entry.key.timeout = jiffies + timeout; |
75 | |||
76 | return __wait_on_bit(wq_head, &wq_entry, action, mode); | 76 | return __wait_on_bit(wq_head, &wq_entry, action, mode); |
77 | } | 77 | } |
78 | EXPORT_SYMBOL_GPL(out_of_line_wait_on_bit_timeout); | 78 | EXPORT_SYMBOL_GPL(out_of_line_wait_on_bit_timeout); |
@@ -120,6 +120,7 @@ EXPORT_SYMBOL(out_of_line_wait_on_bit_lock); | |||
120 | void __wake_up_bit(struct wait_queue_head *wq_head, void *word, int bit) | 120 | void __wake_up_bit(struct wait_queue_head *wq_head, void *word, int bit) |
121 | { | 121 | { |
122 | struct wait_bit_key key = __WAIT_BIT_KEY_INITIALIZER(word, bit); | 122 | struct wait_bit_key key = __WAIT_BIT_KEY_INITIALIZER(word, bit); |
123 | |||
123 | if (waitqueue_active(wq_head)) | 124 | if (waitqueue_active(wq_head)) |
124 | __wake_up(wq_head, TASK_NORMAL, 1, &key); | 125 | __wake_up(wq_head, TASK_NORMAL, 1, &key); |
125 | } | 126 | } |
@@ -148,108 +149,55 @@ void wake_up_bit(void *word, int bit) | |||
148 | } | 149 | } |
149 | EXPORT_SYMBOL(wake_up_bit); | 150 | EXPORT_SYMBOL(wake_up_bit); |
150 | 151 | ||
151 | /* | 152 | wait_queue_head_t *__var_waitqueue(void *p) |
152 | * Manipulate the atomic_t address to produce a better bit waitqueue table hash | ||
153 | * index (we're keying off bit -1, but that would produce a horrible hash | ||
154 | * value). | ||
155 | */ | ||
156 | static inline wait_queue_head_t *atomic_t_waitqueue(atomic_t *p) | ||
157 | { | 153 | { |
158 | if (BITS_PER_LONG == 64) { | 154 | return bit_wait_table + hash_ptr(p, WAIT_TABLE_BITS); |
159 | unsigned long q = (unsigned long)p; | ||
160 | return bit_waitqueue((void *)(q & ~1), q & 1); | ||
161 | } | ||
162 | return bit_waitqueue(p, 0); | ||
163 | } | 155 | } |
156 | EXPORT_SYMBOL(__var_waitqueue); | ||
164 | 157 | ||
165 | static int wake_atomic_t_function(struct wait_queue_entry *wq_entry, unsigned mode, int sync, | 158 | static int |
166 | void *arg) | 159 | var_wake_function(struct wait_queue_entry *wq_entry, unsigned int mode, |
160 | int sync, void *arg) | ||
167 | { | 161 | { |
168 | struct wait_bit_key *key = arg; | 162 | struct wait_bit_key *key = arg; |
169 | struct wait_bit_queue_entry *wait_bit = container_of(wq_entry, struct wait_bit_queue_entry, wq_entry); | 163 | struct wait_bit_queue_entry *wbq_entry = |
170 | atomic_t *val = key->flags; | 164 | container_of(wq_entry, struct wait_bit_queue_entry, wq_entry); |
171 | 165 | ||
172 | if (wait_bit->key.flags != key->flags || | 166 | if (wbq_entry->key.flags != key->flags || |
173 | wait_bit->key.bit_nr != key->bit_nr || | 167 | wbq_entry->key.bit_nr != key->bit_nr) |
174 | atomic_read(val) != 0) | ||
175 | return 0; | 168 | return 0; |
176 | return autoremove_wake_function(wq_entry, mode, sync, key); | ||
177 | } | ||
178 | 169 | ||
179 | /* | 170 | return autoremove_wake_function(wq_entry, mode, sync, key); |
180 | * To allow interruptible waiting and asynchronous (i.e. nonblocking) waiting, | ||
181 | * the actions of __wait_on_atomic_t() are permitted return codes. Nonzero | ||
182 | * return codes halt waiting and return. | ||
183 | */ | ||
184 | static __sched | ||
185 | int __wait_on_atomic_t(struct wait_queue_head *wq_head, struct wait_bit_queue_entry *wbq_entry, | ||
186 | wait_atomic_t_action_f action, unsigned int mode) | ||
187 | { | ||
188 | atomic_t *val; | ||
189 | int ret = 0; | ||
190 | |||
191 | do { | ||
192 | prepare_to_wait(wq_head, &wbq_entry->wq_entry, mode); | ||
193 | val = wbq_entry->key.flags; | ||
194 | if (atomic_read(val) == 0) | ||
195 | break; | ||
196 | ret = (*action)(val, mode); | ||
197 | } while (!ret && atomic_read(val) != 0); | ||
198 | finish_wait(wq_head, &wbq_entry->wq_entry); | ||
199 | return ret; | ||
200 | } | 171 | } |
201 | 172 | ||
202 | #define DEFINE_WAIT_ATOMIC_T(name, p) \ | 173 | void init_wait_var_entry(struct wait_bit_queue_entry *wbq_entry, void *var, int flags) |
203 | struct wait_bit_queue_entry name = { \ | ||
204 | .key = __WAIT_ATOMIC_T_KEY_INITIALIZER(p), \ | ||
205 | .wq_entry = { \ | ||
206 | .private = current, \ | ||
207 | .func = wake_atomic_t_function, \ | ||
208 | .entry = \ | ||
209 | LIST_HEAD_INIT((name).wq_entry.entry), \ | ||
210 | }, \ | ||
211 | } | ||
212 | |||
213 | __sched int out_of_line_wait_on_atomic_t(atomic_t *p, | ||
214 | wait_atomic_t_action_f action, | ||
215 | unsigned int mode) | ||
216 | { | 174 | { |
217 | struct wait_queue_head *wq_head = atomic_t_waitqueue(p); | 175 | *wbq_entry = (struct wait_bit_queue_entry){ |
218 | DEFINE_WAIT_ATOMIC_T(wq_entry, p); | 176 | .key = { |
219 | 177 | .flags = (var), | |
220 | return __wait_on_atomic_t(wq_head, &wq_entry, action, mode); | 178 | .bit_nr = -1, |
179 | }, | ||
180 | .wq_entry = { | ||
181 | .private = current, | ||
182 | .func = var_wake_function, | ||
183 | .entry = LIST_HEAD_INIT(wbq_entry->wq_entry.entry), | ||
184 | }, | ||
185 | }; | ||
221 | } | 186 | } |
222 | EXPORT_SYMBOL(out_of_line_wait_on_atomic_t); | 187 | EXPORT_SYMBOL(init_wait_var_entry); |
223 | 188 | ||
224 | __sched int atomic_t_wait(atomic_t *counter, unsigned int mode) | 189 | void wake_up_var(void *var) |
225 | { | 190 | { |
226 | schedule(); | 191 | __wake_up_bit(__var_waitqueue(var), var, -1); |
227 | if (signal_pending_state(mode, current)) | ||
228 | return -EINTR; | ||
229 | return 0; | ||
230 | } | 192 | } |
231 | EXPORT_SYMBOL(atomic_t_wait); | 193 | EXPORT_SYMBOL(wake_up_var); |
232 | |||
233 | /** | ||
234 | * wake_up_atomic_t - Wake up a waiter on a atomic_t | ||
235 | * @p: The atomic_t being waited on, a kernel virtual address | ||
236 | * | ||
237 | * Wake up anyone waiting for the atomic_t to go to zero. | ||
238 | * | ||
239 | * Abuse the bit-waker function and its waitqueue hash table set (the atomic_t | ||
240 | * check is done by the waiter's wake function, not the by the waker itself). | ||
241 | */ | ||
242 | void wake_up_atomic_t(atomic_t *p) | ||
243 | { | ||
244 | __wake_up_bit(atomic_t_waitqueue(p), p, WAIT_ATOMIC_T_BIT_NR); | ||
245 | } | ||
246 | EXPORT_SYMBOL(wake_up_atomic_t); | ||
247 | 194 | ||
248 | __sched int bit_wait(struct wait_bit_key *word, int mode) | 195 | __sched int bit_wait(struct wait_bit_key *word, int mode) |
249 | { | 196 | { |
250 | schedule(); | 197 | schedule(); |
251 | if (signal_pending_state(mode, current)) | 198 | if (signal_pending_state(mode, current)) |
252 | return -EINTR; | 199 | return -EINTR; |
200 | |||
253 | return 0; | 201 | return 0; |
254 | } | 202 | } |
255 | EXPORT_SYMBOL(bit_wait); | 203 | EXPORT_SYMBOL(bit_wait); |
@@ -259,6 +207,7 @@ __sched int bit_wait_io(struct wait_bit_key *word, int mode) | |||
259 | io_schedule(); | 207 | io_schedule(); |
260 | if (signal_pending_state(mode, current)) | 208 | if (signal_pending_state(mode, current)) |
261 | return -EINTR; | 209 | return -EINTR; |
210 | |||
262 | return 0; | 211 | return 0; |
263 | } | 212 | } |
264 | EXPORT_SYMBOL(bit_wait_io); | 213 | EXPORT_SYMBOL(bit_wait_io); |
@@ -266,11 +215,13 @@ EXPORT_SYMBOL(bit_wait_io); | |||
266 | __sched int bit_wait_timeout(struct wait_bit_key *word, int mode) | 215 | __sched int bit_wait_timeout(struct wait_bit_key *word, int mode) |
267 | { | 216 | { |
268 | unsigned long now = READ_ONCE(jiffies); | 217 | unsigned long now = READ_ONCE(jiffies); |
218 | |||
269 | if (time_after_eq(now, word->timeout)) | 219 | if (time_after_eq(now, word->timeout)) |
270 | return -EAGAIN; | 220 | return -EAGAIN; |
271 | schedule_timeout(word->timeout - now); | 221 | schedule_timeout(word->timeout - now); |
272 | if (signal_pending_state(mode, current)) | 222 | if (signal_pending_state(mode, current)) |
273 | return -EINTR; | 223 | return -EINTR; |
224 | |||
274 | return 0; | 225 | return 0; |
275 | } | 226 | } |
276 | EXPORT_SYMBOL_GPL(bit_wait_timeout); | 227 | EXPORT_SYMBOL_GPL(bit_wait_timeout); |
@@ -278,11 +229,13 @@ EXPORT_SYMBOL_GPL(bit_wait_timeout); | |||
278 | __sched int bit_wait_io_timeout(struct wait_bit_key *word, int mode) | 229 | __sched int bit_wait_io_timeout(struct wait_bit_key *word, int mode) |
279 | { | 230 | { |
280 | unsigned long now = READ_ONCE(jiffies); | 231 | unsigned long now = READ_ONCE(jiffies); |
232 | |||
281 | if (time_after_eq(now, word->timeout)) | 233 | if (time_after_eq(now, word->timeout)) |
282 | return -EAGAIN; | 234 | return -EAGAIN; |
283 | io_schedule_timeout(word->timeout - now); | 235 | io_schedule_timeout(word->timeout - now); |
284 | if (signal_pending_state(mode, current)) | 236 | if (signal_pending_state(mode, current)) |
285 | return -EINTR; | 237 | return -EINTR; |
238 | |||
286 | return 0; | 239 | return 0; |
287 | } | 240 | } |
288 | EXPORT_SYMBOL_GPL(bit_wait_io_timeout); | 241 | EXPORT_SYMBOL_GPL(bit_wait_io_timeout); |
diff --git a/kernel/signal.c b/kernel/signal.c index c6e4c83dc090..f04466655238 100644 --- a/kernel/signal.c +++ b/kernel/signal.c | |||
@@ -3573,9 +3573,8 @@ int __save_altstack(stack_t __user *uss, unsigned long sp) | |||
3573 | } | 3573 | } |
3574 | 3574 | ||
3575 | #ifdef CONFIG_COMPAT | 3575 | #ifdef CONFIG_COMPAT |
3576 | COMPAT_SYSCALL_DEFINE2(sigaltstack, | 3576 | static int do_compat_sigaltstack(const compat_stack_t __user *uss_ptr, |
3577 | const compat_stack_t __user *, uss_ptr, | 3577 | compat_stack_t __user *uoss_ptr) |
3578 | compat_stack_t __user *, uoss_ptr) | ||
3579 | { | 3578 | { |
3580 | stack_t uss, uoss; | 3579 | stack_t uss, uoss; |
3581 | int ret; | 3580 | int ret; |
@@ -3602,9 +3601,16 @@ COMPAT_SYSCALL_DEFINE2(sigaltstack, | |||
3602 | return ret; | 3601 | return ret; |
3603 | } | 3602 | } |
3604 | 3603 | ||
3604 | COMPAT_SYSCALL_DEFINE2(sigaltstack, | ||
3605 | const compat_stack_t __user *, uss_ptr, | ||
3606 | compat_stack_t __user *, uoss_ptr) | ||
3607 | { | ||
3608 | return do_compat_sigaltstack(uss_ptr, uoss_ptr); | ||
3609 | } | ||
3610 | |||
3605 | int compat_restore_altstack(const compat_stack_t __user *uss) | 3611 | int compat_restore_altstack(const compat_stack_t __user *uss) |
3606 | { | 3612 | { |
3607 | int err = compat_sys_sigaltstack(uss, NULL); | 3613 | int err = do_compat_sigaltstack(uss, NULL); |
3608 | /* squash all but -EFAULT for now */ | 3614 | /* squash all but -EFAULT for now */ |
3609 | return err == -EFAULT ? err : 0; | 3615 | return err == -EFAULT ? err : 0; |
3610 | } | 3616 | } |
@@ -3629,11 +3635,20 @@ int __compat_save_altstack(compat_stack_t __user *uss, unsigned long sp) | |||
3629 | 3635 | ||
3630 | /** | 3636 | /** |
3631 | * sys_sigpending - examine pending signals | 3637 | * sys_sigpending - examine pending signals |
3632 | * @set: where mask of pending signal is returned | 3638 | * @uset: where mask of pending signal is returned |
3633 | */ | 3639 | */ |
3634 | SYSCALL_DEFINE1(sigpending, old_sigset_t __user *, set) | 3640 | SYSCALL_DEFINE1(sigpending, old_sigset_t __user *, uset) |
3635 | { | 3641 | { |
3636 | return sys_rt_sigpending((sigset_t __user *)set, sizeof(old_sigset_t)); | 3642 | sigset_t set; |
3643 | int err; | ||
3644 | |||
3645 | if (sizeof(old_sigset_t) > sizeof(*uset)) | ||
3646 | return -EINVAL; | ||
3647 | |||
3648 | err = do_sigpending(&set); | ||
3649 | if (!err && copy_to_user(uset, &set, sizeof(old_sigset_t))) | ||
3650 | err = -EFAULT; | ||
3651 | return err; | ||
3637 | } | 3652 | } |
3638 | 3653 | ||
3639 | #ifdef CONFIG_COMPAT | 3654 | #ifdef CONFIG_COMPAT |
diff --git a/kernel/sys.c b/kernel/sys.c index f2289de20e19..ad692183dfe9 100644 --- a/kernel/sys.c +++ b/kernel/sys.c | |||
@@ -69,6 +69,8 @@ | |||
69 | #include <asm/io.h> | 69 | #include <asm/io.h> |
70 | #include <asm/unistd.h> | 70 | #include <asm/unistd.h> |
71 | 71 | ||
72 | #include "uid16.h" | ||
73 | |||
72 | #ifndef SET_UNALIGN_CTL | 74 | #ifndef SET_UNALIGN_CTL |
73 | # define SET_UNALIGN_CTL(a, b) (-EINVAL) | 75 | # define SET_UNALIGN_CTL(a, b) (-EINVAL) |
74 | #endif | 76 | #endif |
@@ -340,7 +342,7 @@ out_unlock: | |||
340 | * operations (as far as semantic preservation is concerned). | 342 | * operations (as far as semantic preservation is concerned). |
341 | */ | 343 | */ |
342 | #ifdef CONFIG_MULTIUSER | 344 | #ifdef CONFIG_MULTIUSER |
343 | SYSCALL_DEFINE2(setregid, gid_t, rgid, gid_t, egid) | 345 | long __sys_setregid(gid_t rgid, gid_t egid) |
344 | { | 346 | { |
345 | struct user_namespace *ns = current_user_ns(); | 347 | struct user_namespace *ns = current_user_ns(); |
346 | const struct cred *old; | 348 | const struct cred *old; |
@@ -392,12 +394,17 @@ error: | |||
392 | return retval; | 394 | return retval; |
393 | } | 395 | } |
394 | 396 | ||
397 | SYSCALL_DEFINE2(setregid, gid_t, rgid, gid_t, egid) | ||
398 | { | ||
399 | return __sys_setregid(rgid, egid); | ||
400 | } | ||
401 | |||
395 | /* | 402 | /* |
396 | * setgid() is implemented like SysV w/ SAVED_IDS | 403 | * setgid() is implemented like SysV w/ SAVED_IDS |
397 | * | 404 | * |
398 | * SMP: Same implicit races as above. | 405 | * SMP: Same implicit races as above. |
399 | */ | 406 | */ |
400 | SYSCALL_DEFINE1(setgid, gid_t, gid) | 407 | long __sys_setgid(gid_t gid) |
401 | { | 408 | { |
402 | struct user_namespace *ns = current_user_ns(); | 409 | struct user_namespace *ns = current_user_ns(); |
403 | const struct cred *old; | 410 | const struct cred *old; |
@@ -429,6 +436,11 @@ error: | |||
429 | return retval; | 436 | return retval; |
430 | } | 437 | } |
431 | 438 | ||
439 | SYSCALL_DEFINE1(setgid, gid_t, gid) | ||
440 | { | ||
441 | return __sys_setgid(gid); | ||
442 | } | ||
443 | |||
432 | /* | 444 | /* |
433 | * change the user struct in a credentials set to match the new UID | 445 | * change the user struct in a credentials set to match the new UID |
434 | */ | 446 | */ |
@@ -473,7 +485,7 @@ static int set_user(struct cred *new) | |||
473 | * 100% compatible with BSD. A program which uses just setuid() will be | 485 | * 100% compatible with BSD. A program which uses just setuid() will be |
474 | * 100% compatible with POSIX with saved IDs. | 486 | * 100% compatible with POSIX with saved IDs. |
475 | */ | 487 | */ |
476 | SYSCALL_DEFINE2(setreuid, uid_t, ruid, uid_t, euid) | 488 | long __sys_setreuid(uid_t ruid, uid_t euid) |
477 | { | 489 | { |
478 | struct user_namespace *ns = current_user_ns(); | 490 | struct user_namespace *ns = current_user_ns(); |
479 | const struct cred *old; | 491 | const struct cred *old; |
@@ -533,6 +545,11 @@ error: | |||
533 | return retval; | 545 | return retval; |
534 | } | 546 | } |
535 | 547 | ||
548 | SYSCALL_DEFINE2(setreuid, uid_t, ruid, uid_t, euid) | ||
549 | { | ||
550 | return __sys_setreuid(ruid, euid); | ||
551 | } | ||
552 | |||
536 | /* | 553 | /* |
537 | * setuid() is implemented like SysV with SAVED_IDS | 554 | * setuid() is implemented like SysV with SAVED_IDS |
538 | * | 555 | * |
@@ -544,7 +561,7 @@ error: | |||
544 | * will allow a root program to temporarily drop privileges and be able to | 561 | * will allow a root program to temporarily drop privileges and be able to |
545 | * regain them by swapping the real and effective uid. | 562 | * regain them by swapping the real and effective uid. |
546 | */ | 563 | */ |
547 | SYSCALL_DEFINE1(setuid, uid_t, uid) | 564 | long __sys_setuid(uid_t uid) |
548 | { | 565 | { |
549 | struct user_namespace *ns = current_user_ns(); | 566 | struct user_namespace *ns = current_user_ns(); |
550 | const struct cred *old; | 567 | const struct cred *old; |
@@ -586,12 +603,17 @@ error: | |||
586 | return retval; | 603 | return retval; |
587 | } | 604 | } |
588 | 605 | ||
606 | SYSCALL_DEFINE1(setuid, uid_t, uid) | ||
607 | { | ||
608 | return __sys_setuid(uid); | ||
609 | } | ||
610 | |||
589 | 611 | ||
590 | /* | 612 | /* |
591 | * This function implements a generic ability to update ruid, euid, | 613 | * This function implements a generic ability to update ruid, euid, |
592 | * and suid. This allows you to implement the 4.4 compatible seteuid(). | 614 | * and suid. This allows you to implement the 4.4 compatible seteuid(). |
593 | */ | 615 | */ |
594 | SYSCALL_DEFINE3(setresuid, uid_t, ruid, uid_t, euid, uid_t, suid) | 616 | long __sys_setresuid(uid_t ruid, uid_t euid, uid_t suid) |
595 | { | 617 | { |
596 | struct user_namespace *ns = current_user_ns(); | 618 | struct user_namespace *ns = current_user_ns(); |
597 | const struct cred *old; | 619 | const struct cred *old; |
@@ -656,6 +678,11 @@ error: | |||
656 | return retval; | 678 | return retval; |
657 | } | 679 | } |
658 | 680 | ||
681 | SYSCALL_DEFINE3(setresuid, uid_t, ruid, uid_t, euid, uid_t, suid) | ||
682 | { | ||
683 | return __sys_setresuid(ruid, euid, suid); | ||
684 | } | ||
685 | |||
659 | SYSCALL_DEFINE3(getresuid, uid_t __user *, ruidp, uid_t __user *, euidp, uid_t __user *, suidp) | 686 | SYSCALL_DEFINE3(getresuid, uid_t __user *, ruidp, uid_t __user *, euidp, uid_t __user *, suidp) |
660 | { | 687 | { |
661 | const struct cred *cred = current_cred(); | 688 | const struct cred *cred = current_cred(); |
@@ -678,7 +705,7 @@ SYSCALL_DEFINE3(getresuid, uid_t __user *, ruidp, uid_t __user *, euidp, uid_t _ | |||
678 | /* | 705 | /* |
679 | * Same as above, but for rgid, egid, sgid. | 706 | * Same as above, but for rgid, egid, sgid. |
680 | */ | 707 | */ |
681 | SYSCALL_DEFINE3(setresgid, gid_t, rgid, gid_t, egid, gid_t, sgid) | 708 | long __sys_setresgid(gid_t rgid, gid_t egid, gid_t sgid) |
682 | { | 709 | { |
683 | struct user_namespace *ns = current_user_ns(); | 710 | struct user_namespace *ns = current_user_ns(); |
684 | const struct cred *old; | 711 | const struct cred *old; |
@@ -730,6 +757,11 @@ error: | |||
730 | return retval; | 757 | return retval; |
731 | } | 758 | } |
732 | 759 | ||
760 | SYSCALL_DEFINE3(setresgid, gid_t, rgid, gid_t, egid, gid_t, sgid) | ||
761 | { | ||
762 | return __sys_setresgid(rgid, egid, sgid); | ||
763 | } | ||
764 | |||
733 | SYSCALL_DEFINE3(getresgid, gid_t __user *, rgidp, gid_t __user *, egidp, gid_t __user *, sgidp) | 765 | SYSCALL_DEFINE3(getresgid, gid_t __user *, rgidp, gid_t __user *, egidp, gid_t __user *, sgidp) |
734 | { | 766 | { |
735 | const struct cred *cred = current_cred(); | 767 | const struct cred *cred = current_cred(); |
@@ -757,7 +789,7 @@ SYSCALL_DEFINE3(getresgid, gid_t __user *, rgidp, gid_t __user *, egidp, gid_t _ | |||
757 | * whatever uid it wants to). It normally shadows "euid", except when | 789 | * whatever uid it wants to). It normally shadows "euid", except when |
758 | * explicitly set by setfsuid() or for access.. | 790 | * explicitly set by setfsuid() or for access.. |
759 | */ | 791 | */ |
760 | SYSCALL_DEFINE1(setfsuid, uid_t, uid) | 792 | long __sys_setfsuid(uid_t uid) |
761 | { | 793 | { |
762 | const struct cred *old; | 794 | const struct cred *old; |
763 | struct cred *new; | 795 | struct cred *new; |
@@ -793,10 +825,15 @@ change_okay: | |||
793 | return old_fsuid; | 825 | return old_fsuid; |
794 | } | 826 | } |
795 | 827 | ||
828 | SYSCALL_DEFINE1(setfsuid, uid_t, uid) | ||
829 | { | ||
830 | return __sys_setfsuid(uid); | ||
831 | } | ||
832 | |||
796 | /* | 833 | /* |
797 | * Samma på svenska.. | 834 | * Samma på svenska.. |
798 | */ | 835 | */ |
799 | SYSCALL_DEFINE1(setfsgid, gid_t, gid) | 836 | long __sys_setfsgid(gid_t gid) |
800 | { | 837 | { |
801 | const struct cred *old; | 838 | const struct cred *old; |
802 | struct cred *new; | 839 | struct cred *new; |
@@ -830,6 +867,11 @@ change_okay: | |||
830 | commit_creds(new); | 867 | commit_creds(new); |
831 | return old_fsgid; | 868 | return old_fsgid; |
832 | } | 869 | } |
870 | |||
871 | SYSCALL_DEFINE1(setfsgid, gid_t, gid) | ||
872 | { | ||
873 | return __sys_setfsgid(gid); | ||
874 | } | ||
833 | #endif /* CONFIG_MULTIUSER */ | 875 | #endif /* CONFIG_MULTIUSER */ |
834 | 876 | ||
835 | /** | 877 | /** |
@@ -1027,7 +1069,7 @@ out: | |||
1027 | return err; | 1069 | return err; |
1028 | } | 1070 | } |
1029 | 1071 | ||
1030 | SYSCALL_DEFINE1(getpgid, pid_t, pid) | 1072 | static int do_getpgid(pid_t pid) |
1031 | { | 1073 | { |
1032 | struct task_struct *p; | 1074 | struct task_struct *p; |
1033 | struct pid *grp; | 1075 | struct pid *grp; |
@@ -1055,11 +1097,16 @@ out: | |||
1055 | return retval; | 1097 | return retval; |
1056 | } | 1098 | } |
1057 | 1099 | ||
1100 | SYSCALL_DEFINE1(getpgid, pid_t, pid) | ||
1101 | { | ||
1102 | return do_getpgid(pid); | ||
1103 | } | ||
1104 | |||
1058 | #ifdef __ARCH_WANT_SYS_GETPGRP | 1105 | #ifdef __ARCH_WANT_SYS_GETPGRP |
1059 | 1106 | ||
1060 | SYSCALL_DEFINE0(getpgrp) | 1107 | SYSCALL_DEFINE0(getpgrp) |
1061 | { | 1108 | { |
1062 | return sys_getpgid(0); | 1109 | return do_getpgid(0); |
1063 | } | 1110 | } |
1064 | 1111 | ||
1065 | #endif | 1112 | #endif |
@@ -1103,7 +1150,7 @@ static void set_special_pids(struct pid *pid) | |||
1103 | change_pid(curr, PIDTYPE_PGID, pid); | 1150 | change_pid(curr, PIDTYPE_PGID, pid); |
1104 | } | 1151 | } |
1105 | 1152 | ||
1106 | SYSCALL_DEFINE0(setsid) | 1153 | int ksys_setsid(void) |
1107 | { | 1154 | { |
1108 | struct task_struct *group_leader = current->group_leader; | 1155 | struct task_struct *group_leader = current->group_leader; |
1109 | struct pid *sid = task_pid(group_leader); | 1156 | struct pid *sid = task_pid(group_leader); |
@@ -1136,6 +1183,11 @@ out: | |||
1136 | return err; | 1183 | return err; |
1137 | } | 1184 | } |
1138 | 1185 | ||
1186 | SYSCALL_DEFINE0(setsid) | ||
1187 | { | ||
1188 | return ksys_setsid(); | ||
1189 | } | ||
1190 | |||
1139 | DECLARE_RWSEM(uts_sem); | 1191 | DECLARE_RWSEM(uts_sem); |
1140 | 1192 | ||
1141 | #ifdef COMPAT_UTS_MACHINE | 1193 | #ifdef COMPAT_UTS_MACHINE |
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index b5189762d275..6cafc008f6db 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c | |||
@@ -17,245 +17,406 @@ asmlinkage long sys_ni_syscall(void) | |||
17 | return -ENOSYS; | 17 | return -ENOSYS; |
18 | } | 18 | } |
19 | 19 | ||
20 | cond_syscall(sys_quotactl); | 20 | #define COND_SYSCALL(name) cond_syscall(sys_##name) |
21 | cond_syscall(sys32_quotactl); | 21 | #define COND_SYSCALL_COMPAT(name) cond_syscall(compat_sys_##name) |
22 | cond_syscall(sys_acct); | 22 | |
23 | cond_syscall(sys_lookup_dcookie); | 23 | /* |
24 | cond_syscall(compat_sys_lookup_dcookie); | 24 | * This list is kept in the same order as include/uapi/asm-generic/unistd.h. |
25 | cond_syscall(sys_swapon); | 25 | * Architecture specific entries go below, followed by deprecated or obsolete |
26 | cond_syscall(sys_swapoff); | 26 | * system calls. |
27 | cond_syscall(sys_kexec_load); | 27 | */ |
28 | cond_syscall(compat_sys_kexec_load); | 28 | |
29 | cond_syscall(sys_kexec_file_load); | 29 | COND_SYSCALL(io_setup); |
30 | cond_syscall(sys_init_module); | 30 | COND_SYSCALL_COMPAT(io_setup); |
31 | cond_syscall(sys_finit_module); | 31 | COND_SYSCALL(io_destroy); |
32 | cond_syscall(sys_delete_module); | 32 | COND_SYSCALL(io_submit); |
33 | cond_syscall(sys_socketpair); | 33 | COND_SYSCALL_COMPAT(io_submit); |
34 | cond_syscall(sys_bind); | 34 | COND_SYSCALL(io_cancel); |
35 | cond_syscall(sys_listen); | 35 | COND_SYSCALL(io_getevents); |
36 | cond_syscall(sys_accept); | 36 | COND_SYSCALL_COMPAT(io_getevents); |
37 | cond_syscall(sys_accept4); | 37 | |
38 | cond_syscall(sys_connect); | 38 | /* fs/xattr.c */ |
39 | cond_syscall(sys_getsockname); | 39 | |
40 | cond_syscall(sys_getpeername); | 40 | /* fs/dcache.c */ |
41 | cond_syscall(sys_sendto); | 41 | |
42 | cond_syscall(sys_send); | 42 | /* fs/dcookies.c */ |
43 | cond_syscall(sys_recvfrom); | 43 | COND_SYSCALL(lookup_dcookie); |
44 | cond_syscall(sys_recv); | 44 | COND_SYSCALL_COMPAT(lookup_dcookie); |
45 | cond_syscall(sys_socket); | 45 | |
46 | cond_syscall(sys_setsockopt); | 46 | /* fs/eventfd.c */ |
47 | cond_syscall(compat_sys_setsockopt); | 47 | COND_SYSCALL(eventfd2); |
48 | cond_syscall(sys_getsockopt); | 48 | |
49 | cond_syscall(compat_sys_getsockopt); | 49 | /* fs/eventpoll.c */ |
50 | cond_syscall(sys_shutdown); | 50 | COND_SYSCALL(epoll_create1); |
51 | cond_syscall(sys_sendmsg); | 51 | COND_SYSCALL(epoll_ctl); |
52 | cond_syscall(sys_sendmmsg); | 52 | COND_SYSCALL(epoll_pwait); |
53 | cond_syscall(compat_sys_sendmsg); | 53 | COND_SYSCALL_COMPAT(epoll_pwait); |
54 | cond_syscall(compat_sys_sendmmsg); | 54 | |
55 | cond_syscall(sys_recvmsg); | 55 | /* fs/fcntl.c */ |
56 | cond_syscall(sys_recvmmsg); | 56 | |
57 | cond_syscall(compat_sys_recvmsg); | 57 | /* fs/inotify_user.c */ |
58 | cond_syscall(compat_sys_recv); | 58 | COND_SYSCALL(inotify_init1); |
59 | cond_syscall(compat_sys_recvfrom); | 59 | COND_SYSCALL(inotify_add_watch); |
60 | cond_syscall(compat_sys_recvmmsg); | 60 | COND_SYSCALL(inotify_rm_watch); |
61 | cond_syscall(sys_socketcall); | 61 | |
62 | cond_syscall(sys_futex); | 62 | /* fs/ioctl.c */ |
63 | cond_syscall(compat_sys_futex); | 63 | |
64 | cond_syscall(sys_set_robust_list); | 64 | /* fs/ioprio.c */ |
65 | cond_syscall(compat_sys_set_robust_list); | 65 | COND_SYSCALL(ioprio_set); |
66 | cond_syscall(sys_get_robust_list); | 66 | COND_SYSCALL(ioprio_get); |
67 | cond_syscall(compat_sys_get_robust_list); | 67 | |
68 | cond_syscall(sys_epoll_create); | 68 | /* fs/locks.c */ |
69 | cond_syscall(sys_epoll_create1); | 69 | COND_SYSCALL(flock); |
70 | cond_syscall(sys_epoll_ctl); | 70 | |
71 | cond_syscall(sys_epoll_wait); | 71 | /* fs/namei.c */ |
72 | cond_syscall(sys_epoll_pwait); | 72 | |
73 | cond_syscall(compat_sys_epoll_pwait); | 73 | /* fs/namespace.c */ |
74 | cond_syscall(sys_semget); | 74 | |
75 | cond_syscall(sys_semop); | 75 | /* fs/nfsctl.c */ |
76 | cond_syscall(sys_semtimedop); | 76 | |
77 | cond_syscall(compat_sys_semtimedop); | 77 | /* fs/open.c */ |
78 | cond_syscall(sys_semctl); | 78 | |
79 | cond_syscall(compat_sys_semctl); | 79 | /* fs/pipe.c */ |
80 | cond_syscall(sys_msgget); | 80 | |
81 | cond_syscall(sys_msgsnd); | 81 | /* fs/quota.c */ |
82 | cond_syscall(compat_sys_msgsnd); | 82 | COND_SYSCALL(quotactl); |
83 | cond_syscall(sys_msgrcv); | 83 | |
84 | cond_syscall(compat_sys_msgrcv); | 84 | /* fs/readdir.c */ |
85 | cond_syscall(sys_msgctl); | 85 | |
86 | cond_syscall(compat_sys_msgctl); | 86 | /* fs/read_write.c */ |
87 | cond_syscall(sys_shmget); | 87 | |
88 | cond_syscall(sys_shmat); | 88 | /* fs/sendfile.c */ |
89 | cond_syscall(compat_sys_shmat); | 89 | |
90 | cond_syscall(sys_shmdt); | 90 | /* fs/select.c */ |
91 | cond_syscall(sys_shmctl); | 91 | |
92 | cond_syscall(compat_sys_shmctl); | 92 | /* fs/signalfd.c */ |
93 | cond_syscall(sys_mq_open); | 93 | COND_SYSCALL(signalfd4); |
94 | cond_syscall(sys_mq_unlink); | 94 | COND_SYSCALL_COMPAT(signalfd4); |
95 | cond_syscall(sys_mq_timedsend); | 95 | |
96 | cond_syscall(sys_mq_timedreceive); | 96 | /* fs/splice.c */ |
97 | cond_syscall(sys_mq_notify); | 97 | |
98 | cond_syscall(sys_mq_getsetattr); | 98 | /* fs/stat.c */ |
99 | cond_syscall(compat_sys_mq_open); | 99 | |
100 | cond_syscall(compat_sys_mq_timedsend); | 100 | /* fs/sync.c */ |
101 | cond_syscall(compat_sys_mq_timedreceive); | 101 | |
102 | cond_syscall(compat_sys_mq_notify); | 102 | /* fs/timerfd.c */ |
103 | cond_syscall(compat_sys_mq_getsetattr); | 103 | COND_SYSCALL(timerfd_create); |
104 | cond_syscall(sys_mbind); | 104 | COND_SYSCALL(timerfd_settime); |
105 | cond_syscall(sys_get_mempolicy); | 105 | COND_SYSCALL_COMPAT(timerfd_settime); |
106 | cond_syscall(sys_set_mempolicy); | 106 | COND_SYSCALL(timerfd_gettime); |
107 | cond_syscall(compat_sys_mbind); | 107 | COND_SYSCALL_COMPAT(timerfd_gettime); |
108 | cond_syscall(compat_sys_get_mempolicy); | 108 | |
109 | cond_syscall(compat_sys_set_mempolicy); | 109 | /* fs/utimes.c */ |
110 | cond_syscall(sys_add_key); | 110 | |
111 | cond_syscall(sys_request_key); | 111 | /* kernel/acct.c */ |
112 | cond_syscall(sys_keyctl); | 112 | COND_SYSCALL(acct); |
113 | cond_syscall(compat_sys_keyctl); | 113 | |
114 | cond_syscall(compat_sys_socketcall); | 114 | /* kernel/capability.c */ |
115 | cond_syscall(sys_inotify_init); | 115 | COND_SYSCALL(capget); |
116 | cond_syscall(sys_inotify_init1); | 116 | COND_SYSCALL(capset); |
117 | cond_syscall(sys_inotify_add_watch); | 117 | |
118 | cond_syscall(sys_inotify_rm_watch); | 118 | /* kernel/exec_domain.c */ |
119 | cond_syscall(sys_migrate_pages); | 119 | |
120 | cond_syscall(sys_move_pages); | 120 | /* kernel/exit.c */ |
121 | cond_syscall(sys_chown16); | 121 | |
122 | cond_syscall(sys_fchown16); | 122 | /* kernel/fork.c */ |
123 | cond_syscall(sys_getegid16); | 123 | |
124 | cond_syscall(sys_geteuid16); | 124 | /* kernel/futex.c */ |
125 | cond_syscall(sys_getgid16); | 125 | COND_SYSCALL(futex); |
126 | cond_syscall(sys_getgroups16); | 126 | COND_SYSCALL_COMPAT(futex); |
127 | cond_syscall(sys_getresgid16); | 127 | COND_SYSCALL(set_robust_list); |
128 | cond_syscall(sys_getresuid16); | 128 | COND_SYSCALL_COMPAT(set_robust_list); |
129 | cond_syscall(sys_getuid16); | 129 | COND_SYSCALL(get_robust_list); |
130 | cond_syscall(sys_lchown16); | 130 | COND_SYSCALL_COMPAT(get_robust_list); |
131 | cond_syscall(sys_setfsgid16); | 131 | |
132 | cond_syscall(sys_setfsuid16); | 132 | /* kernel/hrtimer.c */ |
133 | cond_syscall(sys_setgid16); | 133 | |
134 | cond_syscall(sys_setgroups16); | 134 | /* kernel/itimer.c */ |
135 | cond_syscall(sys_setregid16); | 135 | |
136 | cond_syscall(sys_setresgid16); | 136 | /* kernel/kexec.c */ |
137 | cond_syscall(sys_setresuid16); | 137 | COND_SYSCALL(kexec_load); |
138 | cond_syscall(sys_setreuid16); | 138 | COND_SYSCALL_COMPAT(kexec_load); |
139 | cond_syscall(sys_setuid16); | 139 | |
140 | cond_syscall(sys_sgetmask); | 140 | /* kernel/module.c */ |
141 | cond_syscall(sys_ssetmask); | 141 | COND_SYSCALL(init_module); |
142 | cond_syscall(sys_vm86old); | 142 | COND_SYSCALL(delete_module); |
143 | cond_syscall(sys_vm86); | 143 | |
144 | cond_syscall(sys_modify_ldt); | 144 | /* kernel/posix-timers.c */ |
145 | cond_syscall(sys_ipc); | 145 | |
146 | cond_syscall(compat_sys_ipc); | 146 | /* kernel/printk.c */ |
147 | cond_syscall(compat_sys_sysctl); | 147 | COND_SYSCALL(syslog); |
148 | cond_syscall(sys_flock); | 148 | |
149 | cond_syscall(sys_io_setup); | 149 | /* kernel/ptrace.c */ |
150 | cond_syscall(sys_io_destroy); | 150 | |
151 | cond_syscall(sys_io_submit); | 151 | /* kernel/sched/core.c */ |
152 | cond_syscall(sys_io_cancel); | 152 | |
153 | cond_syscall(sys_io_getevents); | 153 | /* kernel/signal.c */ |
154 | cond_syscall(compat_sys_io_setup); | 154 | |
155 | cond_syscall(compat_sys_io_submit); | 155 | /* kernel/sys.c */ |
156 | cond_syscall(compat_sys_io_getevents); | 156 | COND_SYSCALL(setregid); |
157 | cond_syscall(sys_sysfs); | 157 | COND_SYSCALL(setgid); |
158 | cond_syscall(sys_syslog); | 158 | COND_SYSCALL(setreuid); |
159 | cond_syscall(sys_process_vm_readv); | 159 | COND_SYSCALL(setuid); |
160 | cond_syscall(sys_process_vm_writev); | 160 | COND_SYSCALL(setresuid); |
161 | cond_syscall(compat_sys_process_vm_readv); | 161 | COND_SYSCALL(getresuid); |
162 | cond_syscall(compat_sys_process_vm_writev); | 162 | COND_SYSCALL(setresgid); |
163 | cond_syscall(sys_uselib); | 163 | COND_SYSCALL(getresgid); |
164 | cond_syscall(sys_fadvise64); | 164 | COND_SYSCALL(setfsuid); |
165 | cond_syscall(sys_fadvise64_64); | 165 | COND_SYSCALL(setfsgid); |
166 | cond_syscall(sys_madvise); | 166 | COND_SYSCALL(setgroups); |
167 | cond_syscall(sys_setuid); | 167 | COND_SYSCALL(getgroups); |
168 | cond_syscall(sys_setregid); | 168 | |
169 | cond_syscall(sys_setgid); | 169 | /* kernel/time.c */ |
170 | cond_syscall(sys_setreuid); | 170 | |
171 | cond_syscall(sys_setresuid); | 171 | /* kernel/timer.c */ |
172 | cond_syscall(sys_getresuid); | 172 | |
173 | cond_syscall(sys_setresgid); | 173 | /* ipc/mqueue.c */ |
174 | cond_syscall(sys_getresgid); | 174 | COND_SYSCALL(mq_open); |
175 | cond_syscall(sys_setgroups); | 175 | COND_SYSCALL_COMPAT(mq_open); |
176 | cond_syscall(sys_getgroups); | 176 | COND_SYSCALL(mq_unlink); |
177 | cond_syscall(sys_setfsuid); | 177 | COND_SYSCALL(mq_timedsend); |
178 | cond_syscall(sys_setfsgid); | 178 | COND_SYSCALL_COMPAT(mq_timedsend); |
179 | cond_syscall(sys_capget); | 179 | COND_SYSCALL(mq_timedreceive); |
180 | cond_syscall(sys_capset); | 180 | COND_SYSCALL_COMPAT(mq_timedreceive); |
181 | cond_syscall(sys_copy_file_range); | 181 | COND_SYSCALL(mq_notify); |
182 | 182 | COND_SYSCALL_COMPAT(mq_notify); | |
183 | /* arch-specific weak syscall entries */ | 183 | COND_SYSCALL(mq_getsetattr); |
184 | cond_syscall(sys_pciconfig_read); | 184 | COND_SYSCALL_COMPAT(mq_getsetattr); |
185 | cond_syscall(sys_pciconfig_write); | 185 | |
186 | cond_syscall(sys_pciconfig_iobase); | 186 | /* ipc/msg.c */ |
187 | cond_syscall(compat_sys_s390_ipc); | 187 | COND_SYSCALL(msgget); |
188 | cond_syscall(ppc_rtas); | 188 | COND_SYSCALL(msgctl); |
189 | cond_syscall(sys_spu_run); | 189 | COND_SYSCALL_COMPAT(msgctl); |
190 | cond_syscall(sys_spu_create); | 190 | COND_SYSCALL(msgrcv); |
191 | cond_syscall(sys_subpage_prot); | 191 | COND_SYSCALL_COMPAT(msgrcv); |
192 | cond_syscall(sys_s390_pci_mmio_read); | 192 | COND_SYSCALL(msgsnd); |
193 | cond_syscall(sys_s390_pci_mmio_write); | 193 | COND_SYSCALL_COMPAT(msgsnd); |
194 | 194 | ||
195 | /* mmu depending weak syscall entries */ | 195 | /* ipc/sem.c */ |
196 | cond_syscall(sys_mprotect); | 196 | COND_SYSCALL(semget); |
197 | cond_syscall(sys_msync); | 197 | COND_SYSCALL(semctl); |
198 | cond_syscall(sys_mlock); | 198 | COND_SYSCALL_COMPAT(semctl); |
199 | cond_syscall(sys_munlock); | 199 | COND_SYSCALL(semtimedop); |
200 | cond_syscall(sys_mlockall); | 200 | COND_SYSCALL_COMPAT(semtimedop); |
201 | cond_syscall(sys_munlockall); | 201 | COND_SYSCALL(semop); |
202 | cond_syscall(sys_mlock2); | 202 | |
203 | cond_syscall(sys_mincore); | 203 | /* ipc/shm.c */ |
204 | cond_syscall(sys_madvise); | 204 | COND_SYSCALL(shmget); |
205 | cond_syscall(sys_mremap); | 205 | COND_SYSCALL(shmctl); |
206 | cond_syscall(sys_remap_file_pages); | 206 | COND_SYSCALL_COMPAT(shmctl); |
207 | cond_syscall(compat_sys_move_pages); | 207 | COND_SYSCALL(shmat); |
208 | cond_syscall(compat_sys_migrate_pages); | 208 | COND_SYSCALL_COMPAT(shmat); |
209 | 209 | COND_SYSCALL(shmdt); | |
210 | /* block-layer dependent */ | 210 | |
211 | cond_syscall(sys_bdflush); | 211 | /* net/socket.c */ |
212 | cond_syscall(sys_ioprio_set); | 212 | COND_SYSCALL(socket); |
213 | cond_syscall(sys_ioprio_get); | 213 | COND_SYSCALL(socketpair); |
214 | 214 | COND_SYSCALL(bind); | |
215 | /* New file descriptors */ | 215 | COND_SYSCALL(listen); |
216 | cond_syscall(sys_signalfd); | 216 | COND_SYSCALL(accept); |
217 | cond_syscall(sys_signalfd4); | 217 | COND_SYSCALL(connect); |
218 | cond_syscall(compat_sys_signalfd); | 218 | COND_SYSCALL(getsockname); |
219 | cond_syscall(compat_sys_signalfd4); | 219 | COND_SYSCALL(getpeername); |
220 | cond_syscall(sys_timerfd_create); | 220 | COND_SYSCALL(setsockopt); |
221 | cond_syscall(sys_timerfd_settime); | 221 | COND_SYSCALL_COMPAT(setsockopt); |
222 | cond_syscall(sys_timerfd_gettime); | 222 | COND_SYSCALL(getsockopt); |
223 | cond_syscall(compat_sys_timerfd_settime); | 223 | COND_SYSCALL_COMPAT(getsockopt); |
224 | cond_syscall(compat_sys_timerfd_gettime); | 224 | COND_SYSCALL(sendto); |
225 | cond_syscall(sys_eventfd); | 225 | COND_SYSCALL(shutdown); |
226 | cond_syscall(sys_eventfd2); | 226 | COND_SYSCALL(recvfrom); |
227 | cond_syscall(sys_memfd_create); | 227 | COND_SYSCALL_COMPAT(recvfrom); |
228 | cond_syscall(sys_userfaultfd); | 228 | COND_SYSCALL(sendmsg); |
229 | 229 | COND_SYSCALL_COMPAT(sendmsg); | |
230 | /* performance counters: */ | 230 | COND_SYSCALL(recvmsg); |
231 | cond_syscall(sys_perf_event_open); | 231 | COND_SYSCALL_COMPAT(recvmsg); |
232 | 232 | ||
233 | /* fanotify! */ | 233 | /* mm/filemap.c */ |
234 | cond_syscall(sys_fanotify_init); | 234 | |
235 | cond_syscall(sys_fanotify_mark); | 235 | /* mm/nommu.c, also with MMU */ |
236 | cond_syscall(compat_sys_fanotify_mark); | 236 | COND_SYSCALL(mremap); |
237 | |||
238 | /* security/keys/keyctl.c */ | ||
239 | COND_SYSCALL(add_key); | ||
240 | COND_SYSCALL(request_key); | ||
241 | COND_SYSCALL(keyctl); | ||
242 | COND_SYSCALL_COMPAT(keyctl); | ||
243 | |||
244 | /* arch/example/kernel/sys_example.c */ | ||
245 | |||
246 | /* mm/fadvise.c */ | ||
247 | COND_SYSCALL(fadvise64_64); | ||
248 | |||
249 | /* mm/, CONFIG_MMU only */ | ||
250 | COND_SYSCALL(swapon); | ||
251 | COND_SYSCALL(swapoff); | ||
252 | COND_SYSCALL(mprotect); | ||
253 | COND_SYSCALL(msync); | ||
254 | COND_SYSCALL(mlock); | ||
255 | COND_SYSCALL(munlock); | ||
256 | COND_SYSCALL(mlockall); | ||
257 | COND_SYSCALL(munlockall); | ||
258 | COND_SYSCALL(mincore); | ||
259 | COND_SYSCALL(madvise); | ||
260 | COND_SYSCALL(remap_file_pages); | ||
261 | COND_SYSCALL(mbind); | ||
262 | COND_SYSCALL_COMPAT(mbind); | ||
263 | COND_SYSCALL(get_mempolicy); | ||
264 | COND_SYSCALL_COMPAT(get_mempolicy); | ||
265 | COND_SYSCALL(set_mempolicy); | ||
266 | COND_SYSCALL_COMPAT(set_mempolicy); | ||
267 | COND_SYSCALL(migrate_pages); | ||
268 | COND_SYSCALL_COMPAT(migrate_pages); | ||
269 | COND_SYSCALL(move_pages); | ||
270 | COND_SYSCALL_COMPAT(move_pages); | ||
271 | |||
272 | COND_SYSCALL(perf_event_open); | ||
273 | COND_SYSCALL(accept4); | ||
274 | COND_SYSCALL(recvmmsg); | ||
275 | COND_SYSCALL_COMPAT(recvmmsg); | ||
276 | |||
277 | /* | ||
278 | * Architecture specific syscalls: see further below | ||
279 | */ | ||
280 | |||
281 | /* fanotify */ | ||
282 | COND_SYSCALL(fanotify_init); | ||
283 | COND_SYSCALL(fanotify_mark); | ||
237 | 284 | ||
238 | /* open by handle */ | 285 | /* open by handle */ |
239 | cond_syscall(sys_name_to_handle_at); | 286 | COND_SYSCALL(name_to_handle_at); |
240 | cond_syscall(sys_open_by_handle_at); | 287 | COND_SYSCALL(open_by_handle_at); |
241 | cond_syscall(compat_sys_open_by_handle_at); | 288 | COND_SYSCALL_COMPAT(open_by_handle_at); |
289 | |||
290 | COND_SYSCALL(sendmmsg); | ||
291 | COND_SYSCALL_COMPAT(sendmmsg); | ||
292 | COND_SYSCALL(process_vm_readv); | ||
293 | COND_SYSCALL_COMPAT(process_vm_readv); | ||
294 | COND_SYSCALL(process_vm_writev); | ||
295 | COND_SYSCALL_COMPAT(process_vm_writev); | ||
242 | 296 | ||
243 | /* compare kernel pointers */ | 297 | /* compare kernel pointers */ |
244 | cond_syscall(sys_kcmp); | 298 | COND_SYSCALL(kcmp); |
299 | |||
300 | COND_SYSCALL(finit_module); | ||
245 | 301 | ||
246 | /* operate on Secure Computing state */ | 302 | /* operate on Secure Computing state */ |
247 | cond_syscall(sys_seccomp); | 303 | COND_SYSCALL(seccomp); |
304 | |||
305 | COND_SYSCALL(memfd_create); | ||
248 | 306 | ||
249 | /* access BPF programs and maps */ | 307 | /* access BPF programs and maps */ |
250 | cond_syscall(sys_bpf); | 308 | COND_SYSCALL(bpf); |
251 | 309 | ||
252 | /* execveat */ | 310 | /* execveat */ |
253 | cond_syscall(sys_execveat); | 311 | COND_SYSCALL(execveat); |
312 | |||
313 | COND_SYSCALL(userfaultfd); | ||
254 | 314 | ||
255 | /* membarrier */ | 315 | /* membarrier */ |
256 | cond_syscall(sys_membarrier); | 316 | COND_SYSCALL(membarrier); |
317 | |||
318 | COND_SYSCALL(mlock2); | ||
319 | |||
320 | COND_SYSCALL(copy_file_range); | ||
257 | 321 | ||
258 | /* memory protection keys */ | 322 | /* memory protection keys */ |
259 | cond_syscall(sys_pkey_mprotect); | 323 | COND_SYSCALL(pkey_mprotect); |
260 | cond_syscall(sys_pkey_alloc); | 324 | COND_SYSCALL(pkey_alloc); |
261 | cond_syscall(sys_pkey_free); | 325 | COND_SYSCALL(pkey_free); |
326 | |||
327 | |||
328 | /* | ||
329 | * Architecture specific weak syscall entries. | ||
330 | */ | ||
331 | |||
332 | /* pciconfig: alpha, arm, arm64, ia64, sparc */ | ||
333 | COND_SYSCALL(pciconfig_read); | ||
334 | COND_SYSCALL(pciconfig_write); | ||
335 | COND_SYSCALL(pciconfig_iobase); | ||
336 | |||
337 | /* sys_socketcall: arm, mips, x86, ... */ | ||
338 | COND_SYSCALL(socketcall); | ||
339 | COND_SYSCALL_COMPAT(socketcall); | ||
340 | |||
341 | /* compat syscalls for arm64, x86, ... */ | ||
342 | COND_SYSCALL_COMPAT(sysctl); | ||
343 | COND_SYSCALL_COMPAT(fanotify_mark); | ||
344 | |||
345 | /* x86 */ | ||
346 | COND_SYSCALL(vm86old); | ||
347 | COND_SYSCALL(modify_ldt); | ||
348 | COND_SYSCALL_COMPAT(quotactl32); | ||
349 | COND_SYSCALL(vm86); | ||
350 | COND_SYSCALL(kexec_file_load); | ||
351 | |||
352 | /* s390 */ | ||
353 | COND_SYSCALL(s390_pci_mmio_read); | ||
354 | COND_SYSCALL(s390_pci_mmio_write); | ||
355 | COND_SYSCALL_COMPAT(s390_ipc); | ||
356 | |||
357 | /* powerpc */ | ||
358 | cond_syscall(ppc_rtas); | ||
359 | COND_SYSCALL(spu_run); | ||
360 | COND_SYSCALL(spu_create); | ||
361 | COND_SYSCALL(subpage_prot); | ||
362 | |||
363 | |||
364 | /* | ||
365 | * Deprecated system calls which are still defined in | ||
366 | * include/uapi/asm-generic/unistd.h and wanted by >= 1 arch | ||
367 | */ | ||
368 | |||
369 | /* __ARCH_WANT_SYSCALL_NO_FLAGS */ | ||
370 | COND_SYSCALL(epoll_create); | ||
371 | COND_SYSCALL(inotify_init); | ||
372 | COND_SYSCALL(eventfd); | ||
373 | COND_SYSCALL(signalfd); | ||
374 | COND_SYSCALL_COMPAT(signalfd); | ||
375 | |||
376 | /* __ARCH_WANT_SYSCALL_OFF_T */ | ||
377 | COND_SYSCALL(fadvise64); | ||
378 | |||
379 | /* __ARCH_WANT_SYSCALL_DEPRECATED */ | ||
380 | COND_SYSCALL(epoll_wait); | ||
381 | COND_SYSCALL(recv); | ||
382 | COND_SYSCALL_COMPAT(recv); | ||
383 | COND_SYSCALL(send); | ||
384 | COND_SYSCALL(bdflush); | ||
385 | COND_SYSCALL(uselib); | ||
386 | |||
387 | |||
388 | /* | ||
389 | * The syscalls below are not found in include/uapi/asm-generic/unistd.h | ||
390 | */ | ||
391 | |||
392 | /* obsolete: SGETMASK_SYSCALL */ | ||
393 | COND_SYSCALL(sgetmask); | ||
394 | COND_SYSCALL(ssetmask); | ||
395 | |||
396 | /* obsolete: SYSFS_SYSCALL */ | ||
397 | COND_SYSCALL(sysfs); | ||
398 | |||
399 | /* obsolete: __ARCH_WANT_SYS_IPC */ | ||
400 | COND_SYSCALL(ipc); | ||
401 | COND_SYSCALL_COMPAT(ipc); | ||
402 | |||
403 | /* obsolete: UID16 */ | ||
404 | COND_SYSCALL(chown16); | ||
405 | COND_SYSCALL(fchown16); | ||
406 | COND_SYSCALL(getegid16); | ||
407 | COND_SYSCALL(geteuid16); | ||
408 | COND_SYSCALL(getgid16); | ||
409 | COND_SYSCALL(getgroups16); | ||
410 | COND_SYSCALL(getresgid16); | ||
411 | COND_SYSCALL(getresuid16); | ||
412 | COND_SYSCALL(getuid16); | ||
413 | COND_SYSCALL(lchown16); | ||
414 | COND_SYSCALL(setfsgid16); | ||
415 | COND_SYSCALL(setfsuid16); | ||
416 | COND_SYSCALL(setgid16); | ||
417 | COND_SYSCALL(setgroups16); | ||
418 | COND_SYSCALL(setregid16); | ||
419 | COND_SYSCALL(setresgid16); | ||
420 | COND_SYSCALL(setresuid16); | ||
421 | COND_SYSCALL(setreuid16); | ||
422 | COND_SYSCALL(setuid16); | ||
diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig index f6b5f19223d6..78eabc41eaa6 100644 --- a/kernel/time/Kconfig +++ b/kernel/time/Kconfig | |||
@@ -113,16 +113,6 @@ config NO_HZ_FULL | |||
113 | 113 | ||
114 | endchoice | 114 | endchoice |
115 | 115 | ||
116 | config NO_HZ_FULL_ALL | ||
117 | bool "Full dynticks system on all CPUs by default (except CPU 0)" | ||
118 | depends on NO_HZ_FULL | ||
119 | help | ||
120 | If the user doesn't pass the nohz_full boot option to | ||
121 | define the range of full dynticks CPUs, consider that all | ||
122 | CPUs in the system are full dynticks by default. | ||
123 | Note the boot CPU will still be kept outside the range to | ||
124 | handle the timekeeping duty. | ||
125 | |||
126 | config NO_HZ | 116 | config NO_HZ |
127 | bool "Old Idle dynticks config" | 117 | bool "Old Idle dynticks config" |
128 | depends on !ARCH_USES_GETTIMEOFFSET && GENERIC_CLOCKEVENTS | 118 | depends on !ARCH_USES_GETTIMEOFFSET && GENERIC_CLOCKEVENTS |
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 29a5733eff83..5d4a0342f934 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c | |||
@@ -405,30 +405,12 @@ static int tick_nohz_cpu_down(unsigned int cpu) | |||
405 | return 0; | 405 | return 0; |
406 | } | 406 | } |
407 | 407 | ||
408 | static int tick_nohz_init_all(void) | ||
409 | { | ||
410 | int err = -1; | ||
411 | |||
412 | #ifdef CONFIG_NO_HZ_FULL_ALL | ||
413 | if (!alloc_cpumask_var(&tick_nohz_full_mask, GFP_KERNEL)) { | ||
414 | WARN(1, "NO_HZ: Can't allocate full dynticks cpumask\n"); | ||
415 | return err; | ||
416 | } | ||
417 | err = 0; | ||
418 | cpumask_setall(tick_nohz_full_mask); | ||
419 | tick_nohz_full_running = true; | ||
420 | #endif | ||
421 | return err; | ||
422 | } | ||
423 | |||
424 | void __init tick_nohz_init(void) | 408 | void __init tick_nohz_init(void) |
425 | { | 409 | { |
426 | int cpu, ret; | 410 | int cpu, ret; |
427 | 411 | ||
428 | if (!tick_nohz_full_running) { | 412 | if (!tick_nohz_full_running) |
429 | if (tick_nohz_init_all() < 0) | 413 | return; |
430 | return; | ||
431 | } | ||
432 | 414 | ||
433 | /* | 415 | /* |
434 | * Full dynticks uses irq work to drive the tick rescheduling on safe | 416 | * Full dynticks uses irq work to drive the tick rescheduling on safe |
@@ -481,11 +463,18 @@ static int __init setup_tick_nohz(char *str) | |||
481 | 463 | ||
482 | __setup("nohz=", setup_tick_nohz); | 464 | __setup("nohz=", setup_tick_nohz); |
483 | 465 | ||
484 | int tick_nohz_tick_stopped(void) | 466 | bool tick_nohz_tick_stopped(void) |
485 | { | 467 | { |
486 | return __this_cpu_read(tick_cpu_sched.tick_stopped); | 468 | return __this_cpu_read(tick_cpu_sched.tick_stopped); |
487 | } | 469 | } |
488 | 470 | ||
471 | bool tick_nohz_tick_stopped_cpu(int cpu) | ||
472 | { | ||
473 | struct tick_sched *ts = per_cpu_ptr(&tick_cpu_sched, cpu); | ||
474 | |||
475 | return ts->tick_stopped; | ||
476 | } | ||
477 | |||
489 | /** | 478 | /** |
490 | * tick_nohz_update_jiffies - update jiffies when idle was interrupted | 479 | * tick_nohz_update_jiffies - update jiffies when idle was interrupted |
491 | * | 480 | * |
@@ -741,12 +730,6 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts, | |||
741 | delta = KTIME_MAX; | 730 | delta = KTIME_MAX; |
742 | } | 731 | } |
743 | 732 | ||
744 | #ifdef CONFIG_NO_HZ_FULL | ||
745 | /* Limit the tick delta to the maximum scheduler deferment */ | ||
746 | if (!ts->inidle) | ||
747 | delta = min(delta, scheduler_tick_max_deferment()); | ||
748 | #endif | ||
749 | |||
750 | /* Calculate the next expiry time */ | 733 | /* Calculate the next expiry time */ |
751 | if (delta < (KTIME_MAX - basemono)) | 734 | if (delta < (KTIME_MAX - basemono)) |
752 | expires = basemono + delta; | 735 | expires = basemono + delta; |
@@ -953,13 +936,6 @@ void tick_nohz_idle_enter(void) | |||
953 | struct tick_sched *ts; | 936 | struct tick_sched *ts; |
954 | 937 | ||
955 | lockdep_assert_irqs_enabled(); | 938 | lockdep_assert_irqs_enabled(); |
956 | /* | ||
957 | * Update the idle state in the scheduler domain hierarchy | ||
958 | * when tick_nohz_stop_sched_tick() is called from the idle loop. | ||
959 | * State will be updated to busy during the first busy tick after | ||
960 | * exiting idle. | ||
961 | */ | ||
962 | set_cpu_sd_state_idle(); | ||
963 | 939 | ||
964 | local_irq_disable(); | 940 | local_irq_disable(); |
965 | 941 | ||
diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c index 55d6dff37daf..2c416509b834 100644 --- a/kernel/trace/trace_event_perf.c +++ b/kernel/trace/trace_event_perf.c | |||
@@ -8,6 +8,7 @@ | |||
8 | #include <linux/module.h> | 8 | #include <linux/module.h> |
9 | #include <linux/kprobes.h> | 9 | #include <linux/kprobes.h> |
10 | #include "trace.h" | 10 | #include "trace.h" |
11 | #include "trace_probe.h" | ||
11 | 12 | ||
12 | static char __percpu *perf_trace_buf[PERF_NR_CONTEXTS]; | 13 | static char __percpu *perf_trace_buf[PERF_NR_CONTEXTS]; |
13 | 14 | ||
@@ -237,6 +238,107 @@ void perf_trace_destroy(struct perf_event *p_event) | |||
237 | mutex_unlock(&event_mutex); | 238 | mutex_unlock(&event_mutex); |
238 | } | 239 | } |
239 | 240 | ||
241 | #ifdef CONFIG_KPROBE_EVENTS | ||
242 | int perf_kprobe_init(struct perf_event *p_event, bool is_retprobe) | ||
243 | { | ||
244 | int ret; | ||
245 | char *func = NULL; | ||
246 | struct trace_event_call *tp_event; | ||
247 | |||
248 | if (p_event->attr.kprobe_func) { | ||
249 | func = kzalloc(KSYM_NAME_LEN, GFP_KERNEL); | ||
250 | if (!func) | ||
251 | return -ENOMEM; | ||
252 | ret = strncpy_from_user( | ||
253 | func, u64_to_user_ptr(p_event->attr.kprobe_func), | ||
254 | KSYM_NAME_LEN); | ||
255 | if (ret < 0) | ||
256 | goto out; | ||
257 | |||
258 | if (func[0] == '\0') { | ||
259 | kfree(func); | ||
260 | func = NULL; | ||
261 | } | ||
262 | } | ||
263 | |||
264 | tp_event = create_local_trace_kprobe( | ||
265 | func, (void *)(unsigned long)(p_event->attr.kprobe_addr), | ||
266 | p_event->attr.probe_offset, is_retprobe); | ||
267 | if (IS_ERR(tp_event)) { | ||
268 | ret = PTR_ERR(tp_event); | ||
269 | goto out; | ||
270 | } | ||
271 | |||
272 | ret = perf_trace_event_init(tp_event, p_event); | ||
273 | if (ret) | ||
274 | destroy_local_trace_kprobe(tp_event); | ||
275 | out: | ||
276 | kfree(func); | ||
277 | return ret; | ||
278 | } | ||
279 | |||
280 | void perf_kprobe_destroy(struct perf_event *p_event) | ||
281 | { | ||
282 | perf_trace_event_close(p_event); | ||
283 | perf_trace_event_unreg(p_event); | ||
284 | |||
285 | destroy_local_trace_kprobe(p_event->tp_event); | ||
286 | } | ||
287 | #endif /* CONFIG_KPROBE_EVENTS */ | ||
288 | |||
289 | #ifdef CONFIG_UPROBE_EVENTS | ||
290 | int perf_uprobe_init(struct perf_event *p_event, bool is_retprobe) | ||
291 | { | ||
292 | int ret; | ||
293 | char *path = NULL; | ||
294 | struct trace_event_call *tp_event; | ||
295 | |||
296 | if (!p_event->attr.uprobe_path) | ||
297 | return -EINVAL; | ||
298 | path = kzalloc(PATH_MAX, GFP_KERNEL); | ||
299 | if (!path) | ||
300 | return -ENOMEM; | ||
301 | ret = strncpy_from_user( | ||
302 | path, u64_to_user_ptr(p_event->attr.uprobe_path), PATH_MAX); | ||
303 | if (ret < 0) | ||
304 | goto out; | ||
305 | if (path[0] == '\0') { | ||
306 | ret = -EINVAL; | ||
307 | goto out; | ||
308 | } | ||
309 | |||
310 | tp_event = create_local_trace_uprobe( | ||
311 | path, p_event->attr.probe_offset, is_retprobe); | ||
312 | if (IS_ERR(tp_event)) { | ||
313 | ret = PTR_ERR(tp_event); | ||
314 | goto out; | ||
315 | } | ||
316 | |||
317 | /* | ||
318 | * local trace_uprobe need to hold event_mutex to call | ||
319 | * uprobe_buffer_enable() and uprobe_buffer_disable(). | ||
320 | * event_mutex is not required for local trace_kprobes. | ||
321 | */ | ||
322 | mutex_lock(&event_mutex); | ||
323 | ret = perf_trace_event_init(tp_event, p_event); | ||
324 | if (ret) | ||
325 | destroy_local_trace_uprobe(tp_event); | ||
326 | mutex_unlock(&event_mutex); | ||
327 | out: | ||
328 | kfree(path); | ||
329 | return ret; | ||
330 | } | ||
331 | |||
332 | void perf_uprobe_destroy(struct perf_event *p_event) | ||
333 | { | ||
334 | mutex_lock(&event_mutex); | ||
335 | perf_trace_event_close(p_event); | ||
336 | perf_trace_event_unreg(p_event); | ||
337 | mutex_unlock(&event_mutex); | ||
338 | destroy_local_trace_uprobe(p_event->tp_event); | ||
339 | } | ||
340 | #endif /* CONFIG_UPROBE_EVENTS */ | ||
341 | |||
240 | int perf_trace_add(struct perf_event *p_event, int flags) | 342 | int perf_trace_add(struct perf_event *p_event, int flags) |
241 | { | 343 | { |
242 | struct trace_event_call *tp_event = p_event->tp_event; | 344 | struct trace_event_call *tp_event = p_event->tp_event; |
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index ae4147eaebd4..1cd3fb4d70f8 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c | |||
@@ -462,6 +462,14 @@ disable_trace_kprobe(struct trace_kprobe *tk, struct trace_event_file *file) | |||
462 | disable_kprobe(&tk->rp.kp); | 462 | disable_kprobe(&tk->rp.kp); |
463 | wait = 1; | 463 | wait = 1; |
464 | } | 464 | } |
465 | |||
466 | /* | ||
467 | * if tk is not added to any list, it must be a local trace_kprobe | ||
468 | * created with perf_event_open. We don't need to wait for these | ||
469 | * trace_kprobes | ||
470 | */ | ||
471 | if (list_empty(&tk->list)) | ||
472 | wait = 0; | ||
465 | out: | 473 | out: |
466 | if (wait) { | 474 | if (wait) { |
467 | /* | 475 | /* |
@@ -1358,12 +1366,9 @@ static struct trace_event_functions kprobe_funcs = { | |||
1358 | .trace = print_kprobe_event | 1366 | .trace = print_kprobe_event |
1359 | }; | 1367 | }; |
1360 | 1368 | ||
1361 | static int register_kprobe_event(struct trace_kprobe *tk) | 1369 | static inline void init_trace_event_call(struct trace_kprobe *tk, |
1370 | struct trace_event_call *call) | ||
1362 | { | 1371 | { |
1363 | struct trace_event_call *call = &tk->tp.call; | ||
1364 | int ret; | ||
1365 | |||
1366 | /* Initialize trace_event_call */ | ||
1367 | INIT_LIST_HEAD(&call->class->fields); | 1372 | INIT_LIST_HEAD(&call->class->fields); |
1368 | if (trace_kprobe_is_return(tk)) { | 1373 | if (trace_kprobe_is_return(tk)) { |
1369 | call->event.funcs = &kretprobe_funcs; | 1374 | call->event.funcs = &kretprobe_funcs; |
@@ -1372,6 +1377,19 @@ static int register_kprobe_event(struct trace_kprobe *tk) | |||
1372 | call->event.funcs = &kprobe_funcs; | 1377 | call->event.funcs = &kprobe_funcs; |
1373 | call->class->define_fields = kprobe_event_define_fields; | 1378 | call->class->define_fields = kprobe_event_define_fields; |
1374 | } | 1379 | } |
1380 | |||
1381 | call->flags = TRACE_EVENT_FL_KPROBE; | ||
1382 | call->class->reg = kprobe_register; | ||
1383 | call->data = tk; | ||
1384 | } | ||
1385 | |||
1386 | static int register_kprobe_event(struct trace_kprobe *tk) | ||
1387 | { | ||
1388 | struct trace_event_call *call = &tk->tp.call; | ||
1389 | int ret = 0; | ||
1390 | |||
1391 | init_trace_event_call(tk, call); | ||
1392 | |||
1375 | if (set_print_fmt(&tk->tp, trace_kprobe_is_return(tk)) < 0) | 1393 | if (set_print_fmt(&tk->tp, trace_kprobe_is_return(tk)) < 0) |
1376 | return -ENOMEM; | 1394 | return -ENOMEM; |
1377 | ret = register_trace_event(&call->event); | 1395 | ret = register_trace_event(&call->event); |
@@ -1379,9 +1397,6 @@ static int register_kprobe_event(struct trace_kprobe *tk) | |||
1379 | kfree(call->print_fmt); | 1397 | kfree(call->print_fmt); |
1380 | return -ENODEV; | 1398 | return -ENODEV; |
1381 | } | 1399 | } |
1382 | call->flags = TRACE_EVENT_FL_KPROBE; | ||
1383 | call->class->reg = kprobe_register; | ||
1384 | call->data = tk; | ||
1385 | ret = trace_add_event_call(call); | 1400 | ret = trace_add_event_call(call); |
1386 | if (ret) { | 1401 | if (ret) { |
1387 | pr_info("Failed to register kprobe event: %s\n", | 1402 | pr_info("Failed to register kprobe event: %s\n", |
@@ -1403,6 +1418,66 @@ static int unregister_kprobe_event(struct trace_kprobe *tk) | |||
1403 | return ret; | 1418 | return ret; |
1404 | } | 1419 | } |
1405 | 1420 | ||
1421 | #ifdef CONFIG_PERF_EVENTS | ||
1422 | /* create a trace_kprobe, but don't add it to global lists */ | ||
1423 | struct trace_event_call * | ||
1424 | create_local_trace_kprobe(char *func, void *addr, unsigned long offs, | ||
1425 | bool is_return) | ||
1426 | { | ||
1427 | struct trace_kprobe *tk; | ||
1428 | int ret; | ||
1429 | char *event; | ||
1430 | |||
1431 | /* | ||
1432 | * local trace_kprobes are not added to probe_list, so they are never | ||
1433 | * searched in find_trace_kprobe(). Therefore, there is no concern of | ||
1434 | * duplicated name here. | ||
1435 | */ | ||
1436 | event = func ? func : "DUMMY_EVENT"; | ||
1437 | |||
1438 | tk = alloc_trace_kprobe(KPROBE_EVENT_SYSTEM, event, (void *)addr, func, | ||
1439 | offs, 0 /* maxactive */, 0 /* nargs */, | ||
1440 | is_return); | ||
1441 | |||
1442 | if (IS_ERR(tk)) { | ||
1443 | pr_info("Failed to allocate trace_probe.(%d)\n", | ||
1444 | (int)PTR_ERR(tk)); | ||
1445 | return ERR_CAST(tk); | ||
1446 | } | ||
1447 | |||
1448 | init_trace_event_call(tk, &tk->tp.call); | ||
1449 | |||
1450 | if (set_print_fmt(&tk->tp, trace_kprobe_is_return(tk)) < 0) { | ||
1451 | ret = -ENOMEM; | ||
1452 | goto error; | ||
1453 | } | ||
1454 | |||
1455 | ret = __register_trace_kprobe(tk); | ||
1456 | if (ret < 0) | ||
1457 | goto error; | ||
1458 | |||
1459 | return &tk->tp.call; | ||
1460 | error: | ||
1461 | free_trace_kprobe(tk); | ||
1462 | return ERR_PTR(ret); | ||
1463 | } | ||
1464 | |||
1465 | void destroy_local_trace_kprobe(struct trace_event_call *event_call) | ||
1466 | { | ||
1467 | struct trace_kprobe *tk; | ||
1468 | |||
1469 | tk = container_of(event_call, struct trace_kprobe, tp.call); | ||
1470 | |||
1471 | if (trace_probe_is_enabled(&tk->tp)) { | ||
1472 | WARN_ON(1); | ||
1473 | return; | ||
1474 | } | ||
1475 | |||
1476 | __unregister_trace_kprobe(tk); | ||
1477 | free_trace_kprobe(tk); | ||
1478 | } | ||
1479 | #endif /* CONFIG_PERF_EVENTS */ | ||
1480 | |||
1406 | /* Make a tracefs interface for controlling probe points */ | 1481 | /* Make a tracefs interface for controlling probe points */ |
1407 | static __init int init_kprobe_trace(void) | 1482 | static __init int init_kprobe_trace(void) |
1408 | { | 1483 | { |
diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h index 6a4d3fa94042..75daff22ccea 100644 --- a/kernel/trace/trace_probe.h +++ b/kernel/trace/trace_probe.h | |||
@@ -416,3 +416,14 @@ store_trace_args(int ent_size, struct trace_probe *tp, struct pt_regs *regs, | |||
416 | } | 416 | } |
417 | 417 | ||
418 | extern int set_print_fmt(struct trace_probe *tp, bool is_return); | 418 | extern int set_print_fmt(struct trace_probe *tp, bool is_return); |
419 | |||
420 | #ifdef CONFIG_PERF_EVENTS | ||
421 | extern struct trace_event_call * | ||
422 | create_local_trace_kprobe(char *func, void *addr, unsigned long offs, | ||
423 | bool is_return); | ||
424 | extern void destroy_local_trace_kprobe(struct trace_event_call *event_call); | ||
425 | |||
426 | extern struct trace_event_call * | ||
427 | create_local_trace_uprobe(char *name, unsigned long offs, bool is_return); | ||
428 | extern void destroy_local_trace_uprobe(struct trace_event_call *event_call); | ||
429 | #endif | ||
diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index 268029ae1be6..2014f4351ae0 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c | |||
@@ -1292,16 +1292,25 @@ static struct trace_event_functions uprobe_funcs = { | |||
1292 | .trace = print_uprobe_event | 1292 | .trace = print_uprobe_event |
1293 | }; | 1293 | }; |
1294 | 1294 | ||
1295 | static int register_uprobe_event(struct trace_uprobe *tu) | 1295 | static inline void init_trace_event_call(struct trace_uprobe *tu, |
1296 | struct trace_event_call *call) | ||
1296 | { | 1297 | { |
1297 | struct trace_event_call *call = &tu->tp.call; | ||
1298 | int ret; | ||
1299 | |||
1300 | /* Initialize trace_event_call */ | ||
1301 | INIT_LIST_HEAD(&call->class->fields); | 1298 | INIT_LIST_HEAD(&call->class->fields); |
1302 | call->event.funcs = &uprobe_funcs; | 1299 | call->event.funcs = &uprobe_funcs; |
1303 | call->class->define_fields = uprobe_event_define_fields; | 1300 | call->class->define_fields = uprobe_event_define_fields; |
1304 | 1301 | ||
1302 | call->flags = TRACE_EVENT_FL_UPROBE; | ||
1303 | call->class->reg = trace_uprobe_register; | ||
1304 | call->data = tu; | ||
1305 | } | ||
1306 | |||
1307 | static int register_uprobe_event(struct trace_uprobe *tu) | ||
1308 | { | ||
1309 | struct trace_event_call *call = &tu->tp.call; | ||
1310 | int ret = 0; | ||
1311 | |||
1312 | init_trace_event_call(tu, call); | ||
1313 | |||
1305 | if (set_print_fmt(&tu->tp, is_ret_probe(tu)) < 0) | 1314 | if (set_print_fmt(&tu->tp, is_ret_probe(tu)) < 0) |
1306 | return -ENOMEM; | 1315 | return -ENOMEM; |
1307 | 1316 | ||
@@ -1311,9 +1320,6 @@ static int register_uprobe_event(struct trace_uprobe *tu) | |||
1311 | return -ENODEV; | 1320 | return -ENODEV; |
1312 | } | 1321 | } |
1313 | 1322 | ||
1314 | call->flags = TRACE_EVENT_FL_UPROBE; | ||
1315 | call->class->reg = trace_uprobe_register; | ||
1316 | call->data = tu; | ||
1317 | ret = trace_add_event_call(call); | 1323 | ret = trace_add_event_call(call); |
1318 | 1324 | ||
1319 | if (ret) { | 1325 | if (ret) { |
@@ -1339,6 +1345,70 @@ static int unregister_uprobe_event(struct trace_uprobe *tu) | |||
1339 | return 0; | 1345 | return 0; |
1340 | } | 1346 | } |
1341 | 1347 | ||
1348 | #ifdef CONFIG_PERF_EVENTS | ||
1349 | struct trace_event_call * | ||
1350 | create_local_trace_uprobe(char *name, unsigned long offs, bool is_return) | ||
1351 | { | ||
1352 | struct trace_uprobe *tu; | ||
1353 | struct inode *inode; | ||
1354 | struct path path; | ||
1355 | int ret; | ||
1356 | |||
1357 | ret = kern_path(name, LOOKUP_FOLLOW, &path); | ||
1358 | if (ret) | ||
1359 | return ERR_PTR(ret); | ||
1360 | |||
1361 | inode = igrab(d_inode(path.dentry)); | ||
1362 | path_put(&path); | ||
1363 | |||
1364 | if (!inode || !S_ISREG(inode->i_mode)) { | ||
1365 | iput(inode); | ||
1366 | return ERR_PTR(-EINVAL); | ||
1367 | } | ||
1368 | |||
1369 | /* | ||
1370 | * local trace_uprobes are not added to the global uprobe list, so they | ||
1371 | * are never found by an event-name lookup. Therefore, there is no concern | ||
1372 | * of duplicated name "DUMMY_EVENT" here. | ||
1373 | */ | ||
1374 | tu = alloc_trace_uprobe(UPROBE_EVENT_SYSTEM, "DUMMY_EVENT", 0, | ||
1375 | is_return); | ||
1376 | |||
1377 | if (IS_ERR(tu)) { | ||
1378 | pr_info("Failed to allocate trace_uprobe.(%d)\n", | ||
1379 | (int)PTR_ERR(tu)); | ||
1380 | return ERR_CAST(tu); | ||
1381 | } | ||
1382 | |||
1383 | tu->offset = offs; | ||
1384 | tu->inode = inode; | ||
1385 | tu->filename = kstrdup(name, GFP_KERNEL); | ||
1386 | init_trace_event_call(tu, &tu->tp.call); | ||
1387 | |||
1388 | if (set_print_fmt(&tu->tp, is_ret_probe(tu)) < 0) { | ||
1389 | ret = -ENOMEM; | ||
1390 | goto error; | ||
1391 | } | ||
1392 | |||
1393 | return &tu->tp.call; | ||
1394 | error: | ||
1395 | free_trace_uprobe(tu); | ||
1396 | return ERR_PTR(ret); | ||
1397 | } | ||
1398 | |||
1399 | void destroy_local_trace_uprobe(struct trace_event_call *event_call) | ||
1400 | { | ||
1401 | struct trace_uprobe *tu; | ||
1402 | |||
1403 | tu = container_of(event_call, struct trace_uprobe, tp.call); | ||
1404 | |||
1405 | kfree(tu->tp.call.print_fmt); | ||
1406 | tu->tp.call.print_fmt = NULL; | ||
1407 | |||
1408 | free_trace_uprobe(tu); | ||
1409 | } | ||
1410 | #endif /* CONFIG_PERF_EVENTS */ | ||
1411 | |||
1342 | /* Make a trace interface for controlling probe points */ | 1412 | /* Make a trace interface for controlling probe points */ |
1343 | static __init int init_uprobe_trace(void) | 1413 | static __init int init_uprobe_trace(void) |
1344 | { | 1414 | { |
diff --git a/kernel/uid16.c b/kernel/uid16.c index ef1da2a5f9bd..af6925d8599b 100644 --- a/kernel/uid16.c +++ b/kernel/uid16.c | |||
@@ -18,44 +18,46 @@ | |||
18 | 18 | ||
19 | #include <linux/uaccess.h> | 19 | #include <linux/uaccess.h> |
20 | 20 | ||
21 | #include "uid16.h" | ||
22 | |||
21 | SYSCALL_DEFINE3(chown16, const char __user *, filename, old_uid_t, user, old_gid_t, group) | 23 | SYSCALL_DEFINE3(chown16, const char __user *, filename, old_uid_t, user, old_gid_t, group) |
22 | { | 24 | { |
23 | return sys_chown(filename, low2highuid(user), low2highgid(group)); | 25 | return ksys_chown(filename, low2highuid(user), low2highgid(group)); |
24 | } | 26 | } |
25 | 27 | ||
26 | SYSCALL_DEFINE3(lchown16, const char __user *, filename, old_uid_t, user, old_gid_t, group) | 28 | SYSCALL_DEFINE3(lchown16, const char __user *, filename, old_uid_t, user, old_gid_t, group) |
27 | { | 29 | { |
28 | return sys_lchown(filename, low2highuid(user), low2highgid(group)); | 30 | return ksys_lchown(filename, low2highuid(user), low2highgid(group)); |
29 | } | 31 | } |
30 | 32 | ||
31 | SYSCALL_DEFINE3(fchown16, unsigned int, fd, old_uid_t, user, old_gid_t, group) | 33 | SYSCALL_DEFINE3(fchown16, unsigned int, fd, old_uid_t, user, old_gid_t, group) |
32 | { | 34 | { |
33 | return sys_fchown(fd, low2highuid(user), low2highgid(group)); | 35 | return ksys_fchown(fd, low2highuid(user), low2highgid(group)); |
34 | } | 36 | } |
35 | 37 | ||
36 | SYSCALL_DEFINE2(setregid16, old_gid_t, rgid, old_gid_t, egid) | 38 | SYSCALL_DEFINE2(setregid16, old_gid_t, rgid, old_gid_t, egid) |
37 | { | 39 | { |
38 | return sys_setregid(low2highgid(rgid), low2highgid(egid)); | 40 | return __sys_setregid(low2highgid(rgid), low2highgid(egid)); |
39 | } | 41 | } |
40 | 42 | ||
41 | SYSCALL_DEFINE1(setgid16, old_gid_t, gid) | 43 | SYSCALL_DEFINE1(setgid16, old_gid_t, gid) |
42 | { | 44 | { |
43 | return sys_setgid(low2highgid(gid)); | 45 | return __sys_setgid(low2highgid(gid)); |
44 | } | 46 | } |
45 | 47 | ||
46 | SYSCALL_DEFINE2(setreuid16, old_uid_t, ruid, old_uid_t, euid) | 48 | SYSCALL_DEFINE2(setreuid16, old_uid_t, ruid, old_uid_t, euid) |
47 | { | 49 | { |
48 | return sys_setreuid(low2highuid(ruid), low2highuid(euid)); | 50 | return __sys_setreuid(low2highuid(ruid), low2highuid(euid)); |
49 | } | 51 | } |
50 | 52 | ||
51 | SYSCALL_DEFINE1(setuid16, old_uid_t, uid) | 53 | SYSCALL_DEFINE1(setuid16, old_uid_t, uid) |
52 | { | 54 | { |
53 | return sys_setuid(low2highuid(uid)); | 55 | return __sys_setuid(low2highuid(uid)); |
54 | } | 56 | } |
55 | 57 | ||
56 | SYSCALL_DEFINE3(setresuid16, old_uid_t, ruid, old_uid_t, euid, old_uid_t, suid) | 58 | SYSCALL_DEFINE3(setresuid16, old_uid_t, ruid, old_uid_t, euid, old_uid_t, suid) |
57 | { | 59 | { |
58 | return sys_setresuid(low2highuid(ruid), low2highuid(euid), | 60 | return __sys_setresuid(low2highuid(ruid), low2highuid(euid), |
59 | low2highuid(suid)); | 61 | low2highuid(suid)); |
60 | } | 62 | } |
61 | 63 | ||
@@ -78,11 +80,10 @@ SYSCALL_DEFINE3(getresuid16, old_uid_t __user *, ruidp, old_uid_t __user *, euid | |||
78 | 80 | ||
79 | SYSCALL_DEFINE3(setresgid16, old_gid_t, rgid, old_gid_t, egid, old_gid_t, sgid) | 81 | SYSCALL_DEFINE3(setresgid16, old_gid_t, rgid, old_gid_t, egid, old_gid_t, sgid) |
80 | { | 82 | { |
81 | return sys_setresgid(low2highgid(rgid), low2highgid(egid), | 83 | return __sys_setresgid(low2highgid(rgid), low2highgid(egid), |
82 | low2highgid(sgid)); | 84 | low2highgid(sgid)); |
83 | } | 85 | } |
84 | 86 | ||
85 | |||
86 | SYSCALL_DEFINE3(getresgid16, old_gid_t __user *, rgidp, old_gid_t __user *, egidp, old_gid_t __user *, sgidp) | 87 | SYSCALL_DEFINE3(getresgid16, old_gid_t __user *, rgidp, old_gid_t __user *, egidp, old_gid_t __user *, sgidp) |
87 | { | 88 | { |
88 | const struct cred *cred = current_cred(); | 89 | const struct cred *cred = current_cred(); |
@@ -102,12 +103,12 @@ SYSCALL_DEFINE3(getresgid16, old_gid_t __user *, rgidp, old_gid_t __user *, egid | |||
102 | 103 | ||
103 | SYSCALL_DEFINE1(setfsuid16, old_uid_t, uid) | 104 | SYSCALL_DEFINE1(setfsuid16, old_uid_t, uid) |
104 | { | 105 | { |
105 | return sys_setfsuid(low2highuid(uid)); | 106 | return __sys_setfsuid(low2highuid(uid)); |
106 | } | 107 | } |
107 | 108 | ||
108 | SYSCALL_DEFINE1(setfsgid16, old_gid_t, gid) | 109 | SYSCALL_DEFINE1(setfsgid16, old_gid_t, gid) |
109 | { | 110 | { |
110 | return sys_setfsgid(low2highgid(gid)); | 111 | return __sys_setfsgid(low2highgid(gid)); |
111 | } | 112 | } |
112 | 113 | ||
113 | static int groups16_to_user(old_gid_t __user *grouplist, | 114 | static int groups16_to_user(old_gid_t __user *grouplist, |
diff --git a/kernel/uid16.h b/kernel/uid16.h new file mode 100644 index 000000000000..cdca040f7602 --- /dev/null +++ b/kernel/uid16.h | |||
@@ -0,0 +1,14 @@ | |||
1 | /* SPDX-License-Identifier: GPL-2.0 */ | ||
2 | #ifndef LINUX_UID16_H | ||
3 | #define LINUX_UID16_H | ||
4 | |||
5 | long __sys_setuid(uid_t uid); | ||
6 | long __sys_setgid(gid_t gid); | ||
7 | long __sys_setreuid(uid_t ruid, uid_t euid); | ||
8 | long __sys_setregid(gid_t rgid, gid_t egid); | ||
9 | long __sys_setresuid(uid_t ruid, uid_t euid, uid_t suid); | ||
10 | long __sys_setresgid(gid_t rgid, gid_t egid, gid_t sgid); | ||
11 | long __sys_setfsuid(uid_t uid); | ||
12 | long __sys_setfsgid(gid_t gid); | ||
13 | |||
14 | #endif /* LINUX_UID16_H */ | ||
diff --git a/kernel/umh.c b/kernel/umh.c index 18e5fa4b0e71..f76b3ff876cf 100644 --- a/kernel/umh.c +++ b/kernel/umh.c | |||
@@ -118,7 +118,7 @@ static void call_usermodehelper_exec_sync(struct subprocess_info *sub_info) | |||
118 | { | 118 | { |
119 | pid_t pid; | 119 | pid_t pid; |
120 | 120 | ||
121 | /* If SIGCLD is ignored sys_wait4 won't populate the status. */ | 121 | /* If SIGCLD is ignored kernel_wait4 won't populate the status. */ |
122 | kernel_sigaction(SIGCHLD, SIG_DFL); | 122 | kernel_sigaction(SIGCHLD, SIG_DFL); |
123 | pid = kernel_thread(call_usermodehelper_exec_async, sub_info, SIGCHLD); | 123 | pid = kernel_thread(call_usermodehelper_exec_async, sub_info, SIGCHLD); |
124 | if (pid < 0) { | 124 | if (pid < 0) { |
@@ -135,7 +135,7 @@ static void call_usermodehelper_exec_sync(struct subprocess_info *sub_info) | |||
135 | * | 135 | * |
136 | * Thus the __user pointer cast is valid here. | 136 | * Thus the __user pointer cast is valid here. |
137 | */ | 137 | */ |
138 | sys_wait4(pid, (int __user *)&ret, 0, NULL); | 138 | kernel_wait4(pid, (int __user *)&ret, 0, NULL); |
139 | 139 | ||
140 | /* | 140 | /* |
141 | * If ret is 0, either call_usermodehelper_exec_async failed and | 141 | * If ret is 0, either call_usermodehelper_exec_async failed and |
diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 6ec6ba65127b..254e636a3d6b 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c | |||
@@ -5573,12 +5573,13 @@ static void __init wq_numa_init(void) | |||
5573 | int __init workqueue_init_early(void) | 5573 | int __init workqueue_init_early(void) |
5574 | { | 5574 | { |
5575 | int std_nice[NR_STD_WORKER_POOLS] = { 0, HIGHPRI_NICE_LEVEL }; | 5575 | int std_nice[NR_STD_WORKER_POOLS] = { 0, HIGHPRI_NICE_LEVEL }; |
5576 | int hk_flags = HK_FLAG_DOMAIN | HK_FLAG_WQ; | ||
5576 | int i, cpu; | 5577 | int i, cpu; |
5577 | 5578 | ||
5578 | WARN_ON(__alignof__(struct pool_workqueue) < __alignof__(long long)); | 5579 | WARN_ON(__alignof__(struct pool_workqueue) < __alignof__(long long)); |
5579 | 5580 | ||
5580 | BUG_ON(!alloc_cpumask_var(&wq_unbound_cpumask, GFP_KERNEL)); | 5581 | BUG_ON(!alloc_cpumask_var(&wq_unbound_cpumask, GFP_KERNEL)); |
5581 | cpumask_copy(wq_unbound_cpumask, housekeeping_cpumask(HK_FLAG_DOMAIN)); | 5582 | cpumask_copy(wq_unbound_cpumask, housekeeping_cpumask(hk_flags)); |
5582 | 5583 | ||
5583 | pwq_cache = KMEM_CACHE(pool_workqueue, SLAB_PANIC); | 5584 | pwq_cache = KMEM_CACHE(pool_workqueue, SLAB_PANIC); |
5584 | 5585 | ||