diff options
| author | Ingo Molnar <mingo@kernel.org> | 2018-04-05 03:20:34 -0400 |
|---|---|---|
| committer | Ingo Molnar <mingo@kernel.org> | 2018-04-05 03:20:34 -0400 |
| commit | ea2a6af517714c52a1209795a03e863e96b460bb (patch) | |
| tree | 3bd443bc9b23ceeaf3743eaf2d6d35ec63c620c9 /kernel | |
| parent | 1b5d43cfb69759d8ef8d30469cea31d0c037aed5 (diff) | |
| parent | 642e7fd23353e22290e3d51719fcb658dc252342 (diff) | |
Merge branch 'linus' into sched/urgent, to pick up fixes and updates
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Diffstat (limited to 'kernel')
71 files changed, 3840 insertions, 2396 deletions
diff --git a/kernel/compat.c b/kernel/compat.c index 3f5fa8902e7d..6d21894806b4 100644 --- a/kernel/compat.c +++ b/kernel/compat.c | |||
| @@ -488,61 +488,6 @@ get_compat_sigset(sigset_t *set, const compat_sigset_t __user *compat) | |||
| 488 | } | 488 | } |
| 489 | EXPORT_SYMBOL_GPL(get_compat_sigset); | 489 | EXPORT_SYMBOL_GPL(get_compat_sigset); |
| 490 | 490 | ||
| 491 | #ifdef CONFIG_NUMA | ||
| 492 | COMPAT_SYSCALL_DEFINE6(move_pages, pid_t, pid, compat_ulong_t, nr_pages, | ||
| 493 | compat_uptr_t __user *, pages32, | ||
| 494 | const int __user *, nodes, | ||
| 495 | int __user *, status, | ||
| 496 | int, flags) | ||
| 497 | { | ||
| 498 | const void __user * __user *pages; | ||
| 499 | int i; | ||
| 500 | |||
| 501 | pages = compat_alloc_user_space(nr_pages * sizeof(void *)); | ||
| 502 | for (i = 0; i < nr_pages; i++) { | ||
| 503 | compat_uptr_t p; | ||
| 504 | |||
| 505 | if (get_user(p, pages32 + i) || | ||
| 506 | put_user(compat_ptr(p), pages + i)) | ||
| 507 | return -EFAULT; | ||
| 508 | } | ||
| 509 | return sys_move_pages(pid, nr_pages, pages, nodes, status, flags); | ||
| 510 | } | ||
| 511 | |||
| 512 | COMPAT_SYSCALL_DEFINE4(migrate_pages, compat_pid_t, pid, | ||
| 513 | compat_ulong_t, maxnode, | ||
| 514 | const compat_ulong_t __user *, old_nodes, | ||
| 515 | const compat_ulong_t __user *, new_nodes) | ||
| 516 | { | ||
| 517 | unsigned long __user *old = NULL; | ||
| 518 | unsigned long __user *new = NULL; | ||
| 519 | nodemask_t tmp_mask; | ||
| 520 | unsigned long nr_bits; | ||
| 521 | unsigned long size; | ||
| 522 | |||
| 523 | nr_bits = min_t(unsigned long, maxnode - 1, MAX_NUMNODES); | ||
| 524 | size = ALIGN(nr_bits, BITS_PER_LONG) / 8; | ||
| 525 | if (old_nodes) { | ||
| 526 | if (compat_get_bitmap(nodes_addr(tmp_mask), old_nodes, nr_bits)) | ||
| 527 | return -EFAULT; | ||
| 528 | old = compat_alloc_user_space(new_nodes ? size * 2 : size); | ||
| 529 | if (new_nodes) | ||
| 530 | new = old + size / sizeof(unsigned long); | ||
| 531 | if (copy_to_user(old, nodes_addr(tmp_mask), size)) | ||
| 532 | return -EFAULT; | ||
| 533 | } | ||
| 534 | if (new_nodes) { | ||
| 535 | if (compat_get_bitmap(nodes_addr(tmp_mask), new_nodes, nr_bits)) | ||
| 536 | return -EFAULT; | ||
| 537 | if (new == NULL) | ||
| 538 | new = compat_alloc_user_space(size); | ||
| 539 | if (copy_to_user(new, nodes_addr(tmp_mask), size)) | ||
| 540 | return -EFAULT; | ||
| 541 | } | ||
| 542 | return sys_migrate_pages(pid, nr_bits + 1, old, new); | ||
| 543 | } | ||
| 544 | #endif | ||
| 545 | |||
| 546 | /* | 491 | /* |
| 547 | * Allocate user-space memory for the duration of a single system call, | 492 | * Allocate user-space memory for the duration of a single system call, |
| 548 | * in order to marshall parameters inside a compat thunk. | 493 | * in order to marshall parameters inside a compat thunk. |
diff --git a/kernel/cpu.c b/kernel/cpu.c index 53f7dc65f9a3..0db8938fbb23 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c | |||
| @@ -124,24 +124,11 @@ struct cpuhp_step { | |||
| 124 | }; | 124 | }; |
| 125 | 125 | ||
| 126 | static DEFINE_MUTEX(cpuhp_state_mutex); | 126 | static DEFINE_MUTEX(cpuhp_state_mutex); |
| 127 | static struct cpuhp_step cpuhp_bp_states[]; | 127 | static struct cpuhp_step cpuhp_hp_states[]; |
| 128 | static struct cpuhp_step cpuhp_ap_states[]; | ||
| 129 | |||
| 130 | static bool cpuhp_is_ap_state(enum cpuhp_state state) | ||
| 131 | { | ||
| 132 | /* | ||
| 133 | * The extra check for CPUHP_TEARDOWN_CPU is only for documentation | ||
| 134 | * purposes as that state is handled explicitly in cpu_down. | ||
| 135 | */ | ||
| 136 | return state > CPUHP_BRINGUP_CPU && state != CPUHP_TEARDOWN_CPU; | ||
| 137 | } | ||
| 138 | 128 | ||
| 139 | static struct cpuhp_step *cpuhp_get_step(enum cpuhp_state state) | 129 | static struct cpuhp_step *cpuhp_get_step(enum cpuhp_state state) |
| 140 | { | 130 | { |
| 141 | struct cpuhp_step *sp; | 131 | return cpuhp_hp_states + state; |
| 142 | |||
| 143 | sp = cpuhp_is_ap_state(state) ? cpuhp_ap_states : cpuhp_bp_states; | ||
| 144 | return sp + state; | ||
| 145 | } | 132 | } |
| 146 | 133 | ||
| 147 | /** | 134 | /** |
| @@ -239,6 +226,15 @@ err: | |||
| 239 | } | 226 | } |
| 240 | 227 | ||
| 241 | #ifdef CONFIG_SMP | 228 | #ifdef CONFIG_SMP |
| 229 | static bool cpuhp_is_ap_state(enum cpuhp_state state) | ||
| 230 | { | ||
| 231 | /* | ||
| 232 | * The extra check for CPUHP_TEARDOWN_CPU is only for documentation | ||
| 233 | * purposes as that state is handled explicitly in cpu_down. | ||
| 234 | */ | ||
| 235 | return state > CPUHP_BRINGUP_CPU && state != CPUHP_TEARDOWN_CPU; | ||
| 236 | } | ||
| 237 | |||
| 242 | static inline void wait_for_ap_thread(struct cpuhp_cpu_state *st, bool bringup) | 238 | static inline void wait_for_ap_thread(struct cpuhp_cpu_state *st, bool bringup) |
| 243 | { | 239 | { |
| 244 | struct completion *done = bringup ? &st->done_up : &st->done_down; | 240 | struct completion *done = bringup ? &st->done_up : &st->done_down; |
| @@ -1224,7 +1220,7 @@ int __boot_cpu_id; | |||
| 1224 | #endif /* CONFIG_SMP */ | 1220 | #endif /* CONFIG_SMP */ |
| 1225 | 1221 | ||
| 1226 | /* Boot processor state steps */ | 1222 | /* Boot processor state steps */ |
| 1227 | static struct cpuhp_step cpuhp_bp_states[] = { | 1223 | static struct cpuhp_step cpuhp_hp_states[] = { |
| 1228 | [CPUHP_OFFLINE] = { | 1224 | [CPUHP_OFFLINE] = { |
| 1229 | .name = "offline", | 1225 | .name = "offline", |
| 1230 | .startup.single = NULL, | 1226 | .startup.single = NULL, |
| @@ -1289,24 +1285,6 @@ static struct cpuhp_step cpuhp_bp_states[] = { | |||
| 1289 | .teardown.single = NULL, | 1285 | .teardown.single = NULL, |
| 1290 | .cant_stop = true, | 1286 | .cant_stop = true, |
| 1291 | }, | 1287 | }, |
| 1292 | /* | ||
| 1293 | * Handled on control processor until the plugged processor manages | ||
| 1294 | * this itself. | ||
| 1295 | */ | ||
| 1296 | [CPUHP_TEARDOWN_CPU] = { | ||
| 1297 | .name = "cpu:teardown", | ||
| 1298 | .startup.single = NULL, | ||
| 1299 | .teardown.single = takedown_cpu, | ||
| 1300 | .cant_stop = true, | ||
| 1301 | }, | ||
| 1302 | #else | ||
| 1303 | [CPUHP_BRINGUP_CPU] = { }, | ||
| 1304 | #endif | ||
| 1305 | }; | ||
| 1306 | |||
| 1307 | /* Application processor state steps */ | ||
| 1308 | static struct cpuhp_step cpuhp_ap_states[] = { | ||
| 1309 | #ifdef CONFIG_SMP | ||
| 1310 | /* Final state before CPU kills itself */ | 1288 | /* Final state before CPU kills itself */ |
| 1311 | [CPUHP_AP_IDLE_DEAD] = { | 1289 | [CPUHP_AP_IDLE_DEAD] = { |
| 1312 | .name = "idle:dead", | 1290 | .name = "idle:dead", |
| @@ -1340,6 +1318,16 @@ static struct cpuhp_step cpuhp_ap_states[] = { | |||
| 1340 | [CPUHP_AP_ONLINE] = { | 1318 | [CPUHP_AP_ONLINE] = { |
| 1341 | .name = "ap:online", | 1319 | .name = "ap:online", |
| 1342 | }, | 1320 | }, |
| 1321 | /* | ||
| 1322 | * Handled on control processor until the plugged processor manages | ||
| 1323 | * this itself. | ||
| 1324 | */ | ||
| 1325 | [CPUHP_TEARDOWN_CPU] = { | ||
| 1326 | .name = "cpu:teardown", | ||
| 1327 | .startup.single = NULL, | ||
| 1328 | .teardown.single = takedown_cpu, | ||
| 1329 | .cant_stop = true, | ||
| 1330 | }, | ||
| 1343 | /* Handle smpboot threads park/unpark */ | 1331 | /* Handle smpboot threads park/unpark */ |
| 1344 | [CPUHP_AP_SMPBOOT_THREADS] = { | 1332 | [CPUHP_AP_SMPBOOT_THREADS] = { |
| 1345 | .name = "smpboot/threads:online", | 1333 | .name = "smpboot/threads:online", |
| @@ -1408,11 +1396,11 @@ static int cpuhp_reserve_state(enum cpuhp_state state) | |||
| 1408 | 1396 | ||
| 1409 | switch (state) { | 1397 | switch (state) { |
| 1410 | case CPUHP_AP_ONLINE_DYN: | 1398 | case CPUHP_AP_ONLINE_DYN: |
| 1411 | step = cpuhp_ap_states + CPUHP_AP_ONLINE_DYN; | 1399 | step = cpuhp_hp_states + CPUHP_AP_ONLINE_DYN; |
| 1412 | end = CPUHP_AP_ONLINE_DYN_END; | 1400 | end = CPUHP_AP_ONLINE_DYN_END; |
| 1413 | break; | 1401 | break; |
| 1414 | case CPUHP_BP_PREPARE_DYN: | 1402 | case CPUHP_BP_PREPARE_DYN: |
| 1415 | step = cpuhp_bp_states + CPUHP_BP_PREPARE_DYN; | 1403 | step = cpuhp_hp_states + CPUHP_BP_PREPARE_DYN; |
| 1416 | end = CPUHP_BP_PREPARE_DYN_END; | 1404 | end = CPUHP_BP_PREPARE_DYN_END; |
| 1417 | break; | 1405 | break; |
| 1418 | default: | 1406 | default: |
diff --git a/kernel/events/core.c b/kernel/events/core.c index 709a55b9ad97..fc1c330c6bd6 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c | |||
| @@ -430,7 +430,7 @@ static void update_perf_cpu_limits(void) | |||
| 430 | WRITE_ONCE(perf_sample_allowed_ns, tmp); | 430 | WRITE_ONCE(perf_sample_allowed_ns, tmp); |
| 431 | } | 431 | } |
| 432 | 432 | ||
| 433 | static int perf_rotate_context(struct perf_cpu_context *cpuctx); | 433 | static bool perf_rotate_context(struct perf_cpu_context *cpuctx); |
| 434 | 434 | ||
| 435 | int perf_proc_update_handler(struct ctl_table *table, int write, | 435 | int perf_proc_update_handler(struct ctl_table *table, int write, |
| 436 | void __user *buffer, size_t *lenp, | 436 | void __user *buffer, size_t *lenp, |
| @@ -643,7 +643,7 @@ static void perf_event_update_sibling_time(struct perf_event *leader) | |||
| 643 | { | 643 | { |
| 644 | struct perf_event *sibling; | 644 | struct perf_event *sibling; |
| 645 | 645 | ||
| 646 | list_for_each_entry(sibling, &leader->sibling_list, group_entry) | 646 | for_each_sibling_event(sibling, leader) |
| 647 | perf_event_update_time(sibling); | 647 | perf_event_update_time(sibling); |
| 648 | } | 648 | } |
| 649 | 649 | ||
| @@ -948,27 +948,39 @@ list_update_cgroup_event(struct perf_event *event, | |||
| 948 | if (!is_cgroup_event(event)) | 948 | if (!is_cgroup_event(event)) |
| 949 | return; | 949 | return; |
| 950 | 950 | ||
| 951 | if (add && ctx->nr_cgroups++) | ||
| 952 | return; | ||
| 953 | else if (!add && --ctx->nr_cgroups) | ||
| 954 | return; | ||
| 955 | /* | 951 | /* |
| 956 | * Because cgroup events are always per-cpu events, | 952 | * Because cgroup events are always per-cpu events, |
| 957 | * this will always be called from the right CPU. | 953 | * this will always be called from the right CPU. |
| 958 | */ | 954 | */ |
| 959 | cpuctx = __get_cpu_context(ctx); | 955 | cpuctx = __get_cpu_context(ctx); |
| 960 | cpuctx_entry = &cpuctx->cgrp_cpuctx_entry; | 956 | |
| 961 | /* cpuctx->cgrp is NULL unless a cgroup event is active in this CPU .*/ | 957 | /* |
| 962 | if (add) { | 958 | * Since setting cpuctx->cgrp is conditional on the current @cgrp |
| 959 | * matching the event's cgroup, we must do this for every new event, | ||
| 960 | * because if the first would mismatch, the second would not try again | ||
| 961 | * and we would leave cpuctx->cgrp unset. | ||
| 962 | */ | ||
| 963 | if (add && !cpuctx->cgrp) { | ||
| 963 | struct perf_cgroup *cgrp = perf_cgroup_from_task(current, ctx); | 964 | struct perf_cgroup *cgrp = perf_cgroup_from_task(current, ctx); |
| 964 | 965 | ||
| 965 | list_add(cpuctx_entry, this_cpu_ptr(&cgrp_cpuctx_list)); | ||
| 966 | if (cgroup_is_descendant(cgrp->css.cgroup, event->cgrp->css.cgroup)) | 966 | if (cgroup_is_descendant(cgrp->css.cgroup, event->cgrp->css.cgroup)) |
| 967 | cpuctx->cgrp = cgrp; | 967 | cpuctx->cgrp = cgrp; |
| 968 | } else { | ||
| 969 | list_del(cpuctx_entry); | ||
| 970 | cpuctx->cgrp = NULL; | ||
| 971 | } | 968 | } |
| 969 | |||
| 970 | if (add && ctx->nr_cgroups++) | ||
| 971 | return; | ||
| 972 | else if (!add && --ctx->nr_cgroups) | ||
| 973 | return; | ||
| 974 | |||
| 975 | /* no cgroup running */ | ||
| 976 | if (!add) | ||
| 977 | cpuctx->cgrp = NULL; | ||
| 978 | |||
| 979 | cpuctx_entry = &cpuctx->cgrp_cpuctx_entry; | ||
| 980 | if (add) | ||
| 981 | list_add(cpuctx_entry, this_cpu_ptr(&cgrp_cpuctx_list)); | ||
| 982 | else | ||
| 983 | list_del(cpuctx_entry); | ||
| 972 | } | 984 | } |
| 973 | 985 | ||
| 974 | #else /* !CONFIG_CGROUP_PERF */ | 986 | #else /* !CONFIG_CGROUP_PERF */ |
| @@ -1052,7 +1064,7 @@ list_update_cgroup_event(struct perf_event *event, | |||
| 1052 | static enum hrtimer_restart perf_mux_hrtimer_handler(struct hrtimer *hr) | 1064 | static enum hrtimer_restart perf_mux_hrtimer_handler(struct hrtimer *hr) |
| 1053 | { | 1065 | { |
| 1054 | struct perf_cpu_context *cpuctx; | 1066 | struct perf_cpu_context *cpuctx; |
| 1055 | int rotations = 0; | 1067 | bool rotations; |
| 1056 | 1068 | ||
| 1057 | lockdep_assert_irqs_disabled(); | 1069 | lockdep_assert_irqs_disabled(); |
| 1058 | 1070 | ||
| @@ -1471,8 +1483,21 @@ static enum event_type_t get_event_type(struct perf_event *event) | |||
| 1471 | return event_type; | 1483 | return event_type; |
| 1472 | } | 1484 | } |
| 1473 | 1485 | ||
| 1474 | static struct list_head * | 1486 | /* |
| 1475 | ctx_group_list(struct perf_event *event, struct perf_event_context *ctx) | 1487 | * Helper function to initialize event group nodes. |
| 1488 | */ | ||
| 1489 | static void init_event_group(struct perf_event *event) | ||
| 1490 | { | ||
| 1491 | RB_CLEAR_NODE(&event->group_node); | ||
| 1492 | event->group_index = 0; | ||
| 1493 | } | ||
| 1494 | |||
| 1495 | /* | ||
| 1496 | * Extract pinned or flexible groups from the context | ||
| 1497 | * based on event attrs bits. | ||
| 1498 | */ | ||
| 1499 | static struct perf_event_groups * | ||
| 1500 | get_event_groups(struct perf_event *event, struct perf_event_context *ctx) | ||
| 1476 | { | 1501 | { |
| 1477 | if (event->attr.pinned) | 1502 | if (event->attr.pinned) |
| 1478 | return &ctx->pinned_groups; | 1503 | return &ctx->pinned_groups; |
| @@ -1481,6 +1506,156 @@ ctx_group_list(struct perf_event *event, struct perf_event_context *ctx) | |||
| 1481 | } | 1506 | } |
| 1482 | 1507 | ||
| 1483 | /* | 1508 | /* |
| 1509 | * Helper function to initialize perf_event_group trees. | ||
| 1510 | */ | ||
| 1511 | static void perf_event_groups_init(struct perf_event_groups *groups) | ||
| 1512 | { | ||
| 1513 | groups->tree = RB_ROOT; | ||
| 1514 | groups->index = 0; | ||
| 1515 | } | ||
| 1516 | |||
| 1517 | /* | ||
| 1518 | * Compare function for event groups; | ||
| 1519 | * | ||
| 1520 | * Implements complex key that first sorts by CPU and then by virtual index | ||
| 1521 | * which provides ordering when rotating groups for the same CPU. | ||
| 1522 | */ | ||
| 1523 | static bool | ||
| 1524 | perf_event_groups_less(struct perf_event *left, struct perf_event *right) | ||
| 1525 | { | ||
| 1526 | if (left->cpu < right->cpu) | ||
| 1527 | return true; | ||
| 1528 | if (left->cpu > right->cpu) | ||
| 1529 | return false; | ||
| 1530 | |||
| 1531 | if (left->group_index < right->group_index) | ||
| 1532 | return true; | ||
| 1533 | if (left->group_index > right->group_index) | ||
| 1534 | return false; | ||
| 1535 | |||
| 1536 | return false; | ||
| 1537 | } | ||
| 1538 | |||
| 1539 | /* | ||
| 1540 | * Insert @event into @groups' tree; using {@event->cpu, ++@groups->index} for | ||
| 1541 | * key (see perf_event_groups_less). This places it last inside the CPU | ||
| 1542 | * subtree. | ||
| 1543 | */ | ||
| 1544 | static void | ||
| 1545 | perf_event_groups_insert(struct perf_event_groups *groups, | ||
| 1546 | struct perf_event *event) | ||
| 1547 | { | ||
| 1548 | struct perf_event *node_event; | ||
| 1549 | struct rb_node *parent; | ||
| 1550 | struct rb_node **node; | ||
| 1551 | |||
| 1552 | event->group_index = ++groups->index; | ||
| 1553 | |||
| 1554 | node = &groups->tree.rb_node; | ||
| 1555 | parent = *node; | ||
| 1556 | |||
| 1557 | while (*node) { | ||
| 1558 | parent = *node; | ||
| 1559 | node_event = container_of(*node, struct perf_event, group_node); | ||
| 1560 | |||
| 1561 | if (perf_event_groups_less(event, node_event)) | ||
| 1562 | node = &parent->rb_left; | ||
| 1563 | else | ||
| 1564 | node = &parent->rb_right; | ||
| 1565 | } | ||
| 1566 | |||
| 1567 | rb_link_node(&event->group_node, parent, node); | ||
| 1568 | rb_insert_color(&event->group_node, &groups->tree); | ||
| 1569 | } | ||
| 1570 | |||
| 1571 | /* | ||
| 1572 | * Helper function to insert event into the pinned or flexible groups. | ||
| 1573 | */ | ||
| 1574 | static void | ||
| 1575 | add_event_to_groups(struct perf_event *event, struct perf_event_context *ctx) | ||
| 1576 | { | ||
| 1577 | struct perf_event_groups *groups; | ||
| 1578 | |||
| 1579 | groups = get_event_groups(event, ctx); | ||
| 1580 | perf_event_groups_insert(groups, event); | ||
| 1581 | } | ||
| 1582 | |||
| 1583 | /* | ||
| 1584 | * Delete a group from a tree. | ||
| 1585 | */ | ||
| 1586 | static void | ||
| 1587 | perf_event_groups_delete(struct perf_event_groups *groups, | ||
| 1588 | struct perf_event *event) | ||
| 1589 | { | ||
| 1590 | WARN_ON_ONCE(RB_EMPTY_NODE(&event->group_node) || | ||
| 1591 | RB_EMPTY_ROOT(&groups->tree)); | ||
| 1592 | |||
| 1593 | rb_erase(&event->group_node, &groups->tree); | ||
| 1594 | init_event_group(event); | ||
| 1595 | } | ||
| 1596 | |||
| 1597 | /* | ||
| 1598 | * Helper function to delete event from its groups. | ||
| 1599 | */ | ||
| 1600 | static void | ||
| 1601 | del_event_from_groups(struct perf_event *event, struct perf_event_context *ctx) | ||
| 1602 | { | ||
| 1603 | struct perf_event_groups *groups; | ||
| 1604 | |||
| 1605 | groups = get_event_groups(event, ctx); | ||
| 1606 | perf_event_groups_delete(groups, event); | ||
| 1607 | } | ||
| 1608 | |||
| 1609 | /* | ||
| 1610 | * Get the leftmost event in the @cpu subtree. | ||
| 1611 | */ | ||
| 1612 | static struct perf_event * | ||
| 1613 | perf_event_groups_first(struct perf_event_groups *groups, int cpu) | ||
| 1614 | { | ||
| 1615 | struct perf_event *node_event = NULL, *match = NULL; | ||
| 1616 | struct rb_node *node = groups->tree.rb_node; | ||
| 1617 | |||
| 1618 | while (node) { | ||
| 1619 | node_event = container_of(node, struct perf_event, group_node); | ||
| 1620 | |||
| 1621 | if (cpu < node_event->cpu) { | ||
| 1622 | node = node->rb_left; | ||
| 1623 | } else if (cpu > node_event->cpu) { | ||
| 1624 | node = node->rb_right; | ||
| 1625 | } else { | ||
| 1626 | match = node_event; | ||
| 1627 | node = node->rb_left; | ||
| 1628 | } | ||
| 1629 | } | ||
| 1630 | |||
| 1631 | return match; | ||
| 1632 | } | ||
| 1633 | |||
| 1634 | /* | ||
| 1635 | * Like rb_entry_next_safe() for the @cpu subtree. | ||
| 1636 | */ | ||
| 1637 | static struct perf_event * | ||
| 1638 | perf_event_groups_next(struct perf_event *event) | ||
| 1639 | { | ||
| 1640 | struct perf_event *next; | ||
| 1641 | |||
| 1642 | next = rb_entry_safe(rb_next(&event->group_node), typeof(*event), group_node); | ||
| 1643 | if (next && next->cpu == event->cpu) | ||
| 1644 | return next; | ||
| 1645 | |||
| 1646 | return NULL; | ||
| 1647 | } | ||
| 1648 | |||
| 1649 | /* | ||
| 1650 | * Iterate through the whole groups tree. | ||
| 1651 | */ | ||
| 1652 | #define perf_event_groups_for_each(event, groups) \ | ||
| 1653 | for (event = rb_entry_safe(rb_first(&((groups)->tree)), \ | ||
| 1654 | typeof(*event), group_node); event; \ | ||
| 1655 | event = rb_entry_safe(rb_next(&event->group_node), \ | ||
| 1656 | typeof(*event), group_node)) | ||
| 1657 | |||
| 1658 | /* | ||
| 1484 | * Add an event to the lists for its context. | 1659 | * Add an event to the lists for its context. |
| 1485 | * Must be called with ctx->mutex and ctx->lock held. | 1660 | * Must be called with ctx->mutex and ctx->lock held. |
| 1486 | */ | 1661 | */ |
| @@ -1500,12 +1675,8 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx) | |||
| 1500 | * perf_group_detach can, at all times, locate all siblings. | 1675 | * perf_group_detach can, at all times, locate all siblings. |
| 1501 | */ | 1676 | */ |
| 1502 | if (event->group_leader == event) { | 1677 | if (event->group_leader == event) { |
| 1503 | struct list_head *list; | ||
| 1504 | |||
| 1505 | event->group_caps = event->event_caps; | 1678 | event->group_caps = event->event_caps; |
| 1506 | 1679 | add_event_to_groups(event, ctx); | |
| 1507 | list = ctx_group_list(event, ctx); | ||
| 1508 | list_add_tail(&event->group_entry, list); | ||
| 1509 | } | 1680 | } |
| 1510 | 1681 | ||
| 1511 | list_update_cgroup_event(event, ctx, true); | 1682 | list_update_cgroup_event(event, ctx, true); |
| @@ -1663,12 +1834,12 @@ static void perf_group_attach(struct perf_event *event) | |||
| 1663 | 1834 | ||
| 1664 | group_leader->group_caps &= event->event_caps; | 1835 | group_leader->group_caps &= event->event_caps; |
| 1665 | 1836 | ||
| 1666 | list_add_tail(&event->group_entry, &group_leader->sibling_list); | 1837 | list_add_tail(&event->sibling_list, &group_leader->sibling_list); |
| 1667 | group_leader->nr_siblings++; | 1838 | group_leader->nr_siblings++; |
| 1668 | 1839 | ||
| 1669 | perf_event__header_size(group_leader); | 1840 | perf_event__header_size(group_leader); |
| 1670 | 1841 | ||
| 1671 | list_for_each_entry(pos, &group_leader->sibling_list, group_entry) | 1842 | for_each_sibling_event(pos, group_leader) |
| 1672 | perf_event__header_size(pos); | 1843 | perf_event__header_size(pos); |
| 1673 | } | 1844 | } |
| 1674 | 1845 | ||
| @@ -1699,7 +1870,7 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx) | |||
| 1699 | list_del_rcu(&event->event_entry); | 1870 | list_del_rcu(&event->event_entry); |
| 1700 | 1871 | ||
| 1701 | if (event->group_leader == event) | 1872 | if (event->group_leader == event) |
| 1702 | list_del_init(&event->group_entry); | 1873 | del_event_from_groups(event, ctx); |
| 1703 | 1874 | ||
| 1704 | /* | 1875 | /* |
| 1705 | * If event was in error state, then keep it | 1876 | * If event was in error state, then keep it |
| @@ -1717,9 +1888,9 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx) | |||
| 1717 | static void perf_group_detach(struct perf_event *event) | 1888 | static void perf_group_detach(struct perf_event *event) |
| 1718 | { | 1889 | { |
| 1719 | struct perf_event *sibling, *tmp; | 1890 | struct perf_event *sibling, *tmp; |
| 1720 | struct list_head *list = NULL; | 1891 | struct perf_event_context *ctx = event->ctx; |
| 1721 | 1892 | ||
| 1722 | lockdep_assert_held(&event->ctx->lock); | 1893 | lockdep_assert_held(&ctx->lock); |
| 1723 | 1894 | ||
| 1724 | /* | 1895 | /* |
| 1725 | * We can have double detach due to exit/hot-unplug + close. | 1896 | * We can have double detach due to exit/hot-unplug + close. |
| @@ -1733,34 +1904,42 @@ static void perf_group_detach(struct perf_event *event) | |||
| 1733 | * If this is a sibling, remove it from its group. | 1904 | * If this is a sibling, remove it from its group. |
| 1734 | */ | 1905 | */ |
| 1735 | if (event->group_leader != event) { | 1906 | if (event->group_leader != event) { |
| 1736 | list_del_init(&event->group_entry); | 1907 | list_del_init(&event->sibling_list); |
| 1737 | event->group_leader->nr_siblings--; | 1908 | event->group_leader->nr_siblings--; |
| 1738 | goto out; | 1909 | goto out; |
| 1739 | } | 1910 | } |
| 1740 | 1911 | ||
| 1741 | if (!list_empty(&event->group_entry)) | ||
| 1742 | list = &event->group_entry; | ||
| 1743 | |||
| 1744 | /* | 1912 | /* |
| 1745 | * If this was a group event with sibling events then | 1913 | * If this was a group event with sibling events then |
| 1746 | * upgrade the siblings to singleton events by adding them | 1914 | * upgrade the siblings to singleton events by adding them |
| 1747 | * to whatever list we are on. | 1915 | * to whatever list we are on. |
| 1748 | */ | 1916 | */ |
| 1749 | list_for_each_entry_safe(sibling, tmp, &event->sibling_list, group_entry) { | 1917 | list_for_each_entry_safe(sibling, tmp, &event->sibling_list, sibling_list) { |
| 1750 | if (list) | 1918 | |
| 1751 | list_move_tail(&sibling->group_entry, list); | ||
| 1752 | sibling->group_leader = sibling; | 1919 | sibling->group_leader = sibling; |
| 1920 | list_del_init(&sibling->sibling_list); | ||
| 1753 | 1921 | ||
| 1754 | /* Inherit group flags from the previous leader */ | 1922 | /* Inherit group flags from the previous leader */ |
| 1755 | sibling->group_caps = event->group_caps; | 1923 | sibling->group_caps = event->group_caps; |
| 1756 | 1924 | ||
| 1925 | if (!RB_EMPTY_NODE(&event->group_node)) { | ||
| 1926 | add_event_to_groups(sibling, event->ctx); | ||
| 1927 | |||
| 1928 | if (sibling->state == PERF_EVENT_STATE_ACTIVE) { | ||
| 1929 | struct list_head *list = sibling->attr.pinned ? | ||
| 1930 | &ctx->pinned_active : &ctx->flexible_active; | ||
| 1931 | |||
| 1932 | list_add_tail(&sibling->active_list, list); | ||
| 1933 | } | ||
| 1934 | } | ||
| 1935 | |||
| 1757 | WARN_ON_ONCE(sibling->ctx != event->ctx); | 1936 | WARN_ON_ONCE(sibling->ctx != event->ctx); |
| 1758 | } | 1937 | } |
| 1759 | 1938 | ||
| 1760 | out: | 1939 | out: |
| 1761 | perf_event__header_size(event->group_leader); | 1940 | perf_event__header_size(event->group_leader); |
| 1762 | 1941 | ||
| 1763 | list_for_each_entry(tmp, &event->group_leader->sibling_list, group_entry) | 1942 | for_each_sibling_event(tmp, event->group_leader) |
| 1764 | perf_event__header_size(tmp); | 1943 | perf_event__header_size(tmp); |
| 1765 | } | 1944 | } |
| 1766 | 1945 | ||
| @@ -1783,13 +1962,13 @@ static inline int __pmu_filter_match(struct perf_event *event) | |||
| 1783 | */ | 1962 | */ |
| 1784 | static inline int pmu_filter_match(struct perf_event *event) | 1963 | static inline int pmu_filter_match(struct perf_event *event) |
| 1785 | { | 1964 | { |
| 1786 | struct perf_event *child; | 1965 | struct perf_event *sibling; |
| 1787 | 1966 | ||
| 1788 | if (!__pmu_filter_match(event)) | 1967 | if (!__pmu_filter_match(event)) |
| 1789 | return 0; | 1968 | return 0; |
| 1790 | 1969 | ||
| 1791 | list_for_each_entry(child, &event->sibling_list, group_entry) { | 1970 | for_each_sibling_event(sibling, event) { |
| 1792 | if (!__pmu_filter_match(child)) | 1971 | if (!__pmu_filter_match(sibling)) |
| 1793 | return 0; | 1972 | return 0; |
| 1794 | } | 1973 | } |
| 1795 | 1974 | ||
| @@ -1816,6 +1995,13 @@ event_sched_out(struct perf_event *event, | |||
| 1816 | if (event->state != PERF_EVENT_STATE_ACTIVE) | 1995 | if (event->state != PERF_EVENT_STATE_ACTIVE) |
| 1817 | return; | 1996 | return; |
| 1818 | 1997 | ||
| 1998 | /* | ||
| 1999 | * Asymmetry; we only schedule events _IN_ through ctx_sched_in(), but | ||
| 2000 | * we can schedule events _OUT_ individually through things like | ||
| 2001 | * __perf_remove_from_context(). | ||
| 2002 | */ | ||
| 2003 | list_del_init(&event->active_list); | ||
| 2004 | |||
| 1819 | perf_pmu_disable(event->pmu); | 2005 | perf_pmu_disable(event->pmu); |
| 1820 | 2006 | ||
| 1821 | event->pmu->del(event, 0); | 2007 | event->pmu->del(event, 0); |
| @@ -1856,7 +2042,7 @@ group_sched_out(struct perf_event *group_event, | |||
| 1856 | /* | 2042 | /* |
| 1857 | * Schedule out siblings (if any): | 2043 | * Schedule out siblings (if any): |
| 1858 | */ | 2044 | */ |
| 1859 | list_for_each_entry(event, &group_event->sibling_list, group_entry) | 2045 | for_each_sibling_event(event, group_event) |
| 1860 | event_sched_out(event, cpuctx, ctx); | 2046 | event_sched_out(event, cpuctx, ctx); |
| 1861 | 2047 | ||
| 1862 | perf_pmu_enable(ctx->pmu); | 2048 | perf_pmu_enable(ctx->pmu); |
| @@ -2135,7 +2321,7 @@ group_sched_in(struct perf_event *group_event, | |||
| 2135 | /* | 2321 | /* |
| 2136 | * Schedule in siblings as one group (if any): | 2322 | * Schedule in siblings as one group (if any): |
| 2137 | */ | 2323 | */ |
| 2138 | list_for_each_entry(event, &group_event->sibling_list, group_entry) { | 2324 | for_each_sibling_event(event, group_event) { |
| 2139 | if (event_sched_in(event, cpuctx, ctx)) { | 2325 | if (event_sched_in(event, cpuctx, ctx)) { |
| 2140 | partial_group = event; | 2326 | partial_group = event; |
| 2141 | goto group_error; | 2327 | goto group_error; |
| @@ -2151,7 +2337,7 @@ group_error: | |||
| 2151 | * partial group before returning: | 2337 | * partial group before returning: |
| 2152 | * The events up to the failed event are scheduled out normally. | 2338 | * The events up to the failed event are scheduled out normally. |
| 2153 | */ | 2339 | */ |
| 2154 | list_for_each_entry(event, &group_event->sibling_list, group_entry) { | 2340 | for_each_sibling_event(event, group_event) { |
| 2155 | if (event == partial_group) | 2341 | if (event == partial_group) |
| 2156 | break; | 2342 | break; |
| 2157 | 2343 | ||
| @@ -2328,6 +2514,18 @@ static int __perf_install_in_context(void *info) | |||
| 2328 | raw_spin_lock(&task_ctx->lock); | 2514 | raw_spin_lock(&task_ctx->lock); |
| 2329 | } | 2515 | } |
| 2330 | 2516 | ||
| 2517 | #ifdef CONFIG_CGROUP_PERF | ||
| 2518 | if (is_cgroup_event(event)) { | ||
| 2519 | /* | ||
| 2520 | * If the current cgroup doesn't match the event's | ||
| 2521 | * cgroup, we should not try to schedule it. | ||
| 2522 | */ | ||
| 2523 | struct perf_cgroup *cgrp = perf_cgroup_from_task(current, ctx); | ||
| 2524 | reprogram = cgroup_is_descendant(cgrp->css.cgroup, | ||
| 2525 | event->cgrp->css.cgroup); | ||
| 2526 | } | ||
| 2527 | #endif | ||
| 2528 | |||
| 2331 | if (reprogram) { | 2529 | if (reprogram) { |
| 2332 | ctx_sched_out(ctx, cpuctx, EVENT_TIME); | 2530 | ctx_sched_out(ctx, cpuctx, EVENT_TIME); |
| 2333 | add_event_to_ctx(event, ctx); | 2531 | add_event_to_ctx(event, ctx); |
| @@ -2661,12 +2859,47 @@ int perf_event_refresh(struct perf_event *event, int refresh) | |||
| 2661 | } | 2859 | } |
| 2662 | EXPORT_SYMBOL_GPL(perf_event_refresh); | 2860 | EXPORT_SYMBOL_GPL(perf_event_refresh); |
| 2663 | 2861 | ||
| 2862 | static int perf_event_modify_breakpoint(struct perf_event *bp, | ||
| 2863 | struct perf_event_attr *attr) | ||
| 2864 | { | ||
| 2865 | int err; | ||
| 2866 | |||
| 2867 | _perf_event_disable(bp); | ||
| 2868 | |||
| 2869 | err = modify_user_hw_breakpoint_check(bp, attr, true); | ||
| 2870 | if (err) { | ||
| 2871 | if (!bp->attr.disabled) | ||
| 2872 | _perf_event_enable(bp); | ||
| 2873 | |||
| 2874 | return err; | ||
| 2875 | } | ||
| 2876 | |||
| 2877 | if (!attr->disabled) | ||
| 2878 | _perf_event_enable(bp); | ||
| 2879 | return 0; | ||
| 2880 | } | ||
| 2881 | |||
| 2882 | static int perf_event_modify_attr(struct perf_event *event, | ||
| 2883 | struct perf_event_attr *attr) | ||
| 2884 | { | ||
| 2885 | if (event->attr.type != attr->type) | ||
| 2886 | return -EINVAL; | ||
| 2887 | |||
| 2888 | switch (event->attr.type) { | ||
| 2889 | case PERF_TYPE_BREAKPOINT: | ||
| 2890 | return perf_event_modify_breakpoint(event, attr); | ||
| 2891 | default: | ||
| 2892 | /* Place holder for future additions. */ | ||
| 2893 | return -EOPNOTSUPP; | ||
| 2894 | } | ||
| 2895 | } | ||
| 2896 | |||
| 2664 | static void ctx_sched_out(struct perf_event_context *ctx, | 2897 | static void ctx_sched_out(struct perf_event_context *ctx, |
| 2665 | struct perf_cpu_context *cpuctx, | 2898 | struct perf_cpu_context *cpuctx, |
| 2666 | enum event_type_t event_type) | 2899 | enum event_type_t event_type) |
| 2667 | { | 2900 | { |
| 2901 | struct perf_event *event, *tmp; | ||
| 2668 | int is_active = ctx->is_active; | 2902 | int is_active = ctx->is_active; |
| 2669 | struct perf_event *event; | ||
| 2670 | 2903 | ||
| 2671 | lockdep_assert_held(&ctx->lock); | 2904 | lockdep_assert_held(&ctx->lock); |
| 2672 | 2905 | ||
| @@ -2713,12 +2946,12 @@ static void ctx_sched_out(struct perf_event_context *ctx, | |||
| 2713 | 2946 | ||
| 2714 | perf_pmu_disable(ctx->pmu); | 2947 | perf_pmu_disable(ctx->pmu); |
| 2715 | if (is_active & EVENT_PINNED) { | 2948 | if (is_active & EVENT_PINNED) { |
| 2716 | list_for_each_entry(event, &ctx->pinned_groups, group_entry) | 2949 | list_for_each_entry_safe(event, tmp, &ctx->pinned_active, active_list) |
| 2717 | group_sched_out(event, cpuctx, ctx); | 2950 | group_sched_out(event, cpuctx, ctx); |
| 2718 | } | 2951 | } |
| 2719 | 2952 | ||
| 2720 | if (is_active & EVENT_FLEXIBLE) { | 2953 | if (is_active & EVENT_FLEXIBLE) { |
| 2721 | list_for_each_entry(event, &ctx->flexible_groups, group_entry) | 2954 | list_for_each_entry_safe(event, tmp, &ctx->flexible_active, active_list) |
| 2722 | group_sched_out(event, cpuctx, ctx); | 2955 | group_sched_out(event, cpuctx, ctx); |
| 2723 | } | 2956 | } |
| 2724 | perf_pmu_enable(ctx->pmu); | 2957 | perf_pmu_enable(ctx->pmu); |
| @@ -3005,53 +3238,116 @@ static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx, | |||
| 3005 | ctx_sched_out(&cpuctx->ctx, cpuctx, event_type); | 3238 | ctx_sched_out(&cpuctx->ctx, cpuctx, event_type); |
| 3006 | } | 3239 | } |
| 3007 | 3240 | ||
| 3008 | static void | 3241 | static int visit_groups_merge(struct perf_event_groups *groups, int cpu, |
| 3009 | ctx_pinned_sched_in(struct perf_event_context *ctx, | 3242 | int (*func)(struct perf_event *, void *), void *data) |
| 3010 | struct perf_cpu_context *cpuctx) | ||
| 3011 | { | 3243 | { |
| 3012 | struct perf_event *event; | 3244 | struct perf_event **evt, *evt1, *evt2; |
| 3245 | int ret; | ||
| 3013 | 3246 | ||
| 3014 | list_for_each_entry(event, &ctx->pinned_groups, group_entry) { | 3247 | evt1 = perf_event_groups_first(groups, -1); |
| 3015 | if (event->state <= PERF_EVENT_STATE_OFF) | 3248 | evt2 = perf_event_groups_first(groups, cpu); |
| 3016 | continue; | 3249 | |
| 3017 | if (!event_filter_match(event)) | 3250 | while (evt1 || evt2) { |
| 3018 | continue; | 3251 | if (evt1 && evt2) { |
| 3252 | if (evt1->group_index < evt2->group_index) | ||
| 3253 | evt = &evt1; | ||
| 3254 | else | ||
| 3255 | evt = &evt2; | ||
| 3256 | } else if (evt1) { | ||
| 3257 | evt = &evt1; | ||
| 3258 | } else { | ||
| 3259 | evt = &evt2; | ||
| 3260 | } | ||
| 3019 | 3261 | ||
| 3020 | if (group_can_go_on(event, cpuctx, 1)) | 3262 | ret = func(*evt, data); |
| 3021 | group_sched_in(event, cpuctx, ctx); | 3263 | if (ret) |
| 3264 | return ret; | ||
| 3022 | 3265 | ||
| 3023 | /* | 3266 | *evt = perf_event_groups_next(*evt); |
| 3024 | * If this pinned group hasn't been scheduled, | ||
| 3025 | * put it in error state. | ||
| 3026 | */ | ||
| 3027 | if (event->state == PERF_EVENT_STATE_INACTIVE) | ||
| 3028 | perf_event_set_state(event, PERF_EVENT_STATE_ERROR); | ||
| 3029 | } | 3267 | } |
| 3268 | |||
| 3269 | return 0; | ||
| 3270 | } | ||
| 3271 | |||
| 3272 | struct sched_in_data { | ||
| 3273 | struct perf_event_context *ctx; | ||
| 3274 | struct perf_cpu_context *cpuctx; | ||
| 3275 | int can_add_hw; | ||
| 3276 | }; | ||
| 3277 | |||
| 3278 | static int pinned_sched_in(struct perf_event *event, void *data) | ||
| 3279 | { | ||
| 3280 | struct sched_in_data *sid = data; | ||
| 3281 | |||
| 3282 | if (event->state <= PERF_EVENT_STATE_OFF) | ||
| 3283 | return 0; | ||
| 3284 | |||
| 3285 | if (!event_filter_match(event)) | ||
| 3286 | return 0; | ||
| 3287 | |||
| 3288 | if (group_can_go_on(event, sid->cpuctx, sid->can_add_hw)) { | ||
| 3289 | if (!group_sched_in(event, sid->cpuctx, sid->ctx)) | ||
| 3290 | list_add_tail(&event->active_list, &sid->ctx->pinned_active); | ||
| 3291 | } | ||
| 3292 | |||
| 3293 | /* | ||
| 3294 | * If this pinned group hasn't been scheduled, | ||
| 3295 | * put it in error state. | ||
| 3296 | */ | ||
| 3297 | if (event->state == PERF_EVENT_STATE_INACTIVE) | ||
| 3298 | perf_event_set_state(event, PERF_EVENT_STATE_ERROR); | ||
| 3299 | |||
| 3300 | return 0; | ||
| 3301 | } | ||
| 3302 | |||
| 3303 | static int flexible_sched_in(struct perf_event *event, void *data) | ||
| 3304 | { | ||
| 3305 | struct sched_in_data *sid = data; | ||
| 3306 | |||
| 3307 | if (event->state <= PERF_EVENT_STATE_OFF) | ||
| 3308 | return 0; | ||
| 3309 | |||
| 3310 | if (!event_filter_match(event)) | ||
| 3311 | return 0; | ||
| 3312 | |||
| 3313 | if (group_can_go_on(event, sid->cpuctx, sid->can_add_hw)) { | ||
| 3314 | if (!group_sched_in(event, sid->cpuctx, sid->ctx)) | ||
| 3315 | list_add_tail(&event->active_list, &sid->ctx->flexible_active); | ||
| 3316 | else | ||
| 3317 | sid->can_add_hw = 0; | ||
| 3318 | } | ||
| 3319 | |||
| 3320 | return 0; | ||
| 3321 | } | ||
| 3322 | |||
| 3323 | static void | ||
| 3324 | ctx_pinned_sched_in(struct perf_event_context *ctx, | ||
| 3325 | struct perf_cpu_context *cpuctx) | ||
| 3326 | { | ||
| 3327 | struct sched_in_data sid = { | ||
| 3328 | .ctx = ctx, | ||
| 3329 | .cpuctx = cpuctx, | ||
| 3330 | .can_add_hw = 1, | ||
| 3331 | }; | ||
| 3332 | |||
| 3333 | visit_groups_merge(&ctx->pinned_groups, | ||
| 3334 | smp_processor_id(), | ||
| 3335 | pinned_sched_in, &sid); | ||
| 3030 | } | 3336 | } |
| 3031 | 3337 | ||
| 3032 | static void | 3338 | static void |
| 3033 | ctx_flexible_sched_in(struct perf_event_context *ctx, | 3339 | ctx_flexible_sched_in(struct perf_event_context *ctx, |
| 3034 | struct perf_cpu_context *cpuctx) | 3340 | struct perf_cpu_context *cpuctx) |
| 3035 | { | 3341 | { |
| 3036 | struct perf_event *event; | 3342 | struct sched_in_data sid = { |
| 3037 | int can_add_hw = 1; | 3343 | .ctx = ctx, |
| 3038 | 3344 | .cpuctx = cpuctx, | |
| 3039 | list_for_each_entry(event, &ctx->flexible_groups, group_entry) { | 3345 | .can_add_hw = 1, |
| 3040 | /* Ignore events in OFF or ERROR state */ | 3346 | }; |
| 3041 | if (event->state <= PERF_EVENT_STATE_OFF) | ||
| 3042 | continue; | ||
| 3043 | /* | ||
| 3044 | * Listen to the 'cpu' scheduling filter constraint | ||
| 3045 | * of events: | ||
| 3046 | */ | ||
| 3047 | if (!event_filter_match(event)) | ||
| 3048 | continue; | ||
| 3049 | 3347 | ||
| 3050 | if (group_can_go_on(event, cpuctx, can_add_hw)) { | 3348 | visit_groups_merge(&ctx->flexible_groups, |
| 3051 | if (group_sched_in(event, cpuctx, ctx)) | 3349 | smp_processor_id(), |
| 3052 | can_add_hw = 0; | 3350 | flexible_sched_in, &sid); |
| 3053 | } | ||
| 3054 | } | ||
| 3055 | } | 3351 | } |
| 3056 | 3352 | ||
| 3057 | static void | 3353 | static void |
| @@ -3132,7 +3428,7 @@ static void perf_event_context_sched_in(struct perf_event_context *ctx, | |||
| 3132 | * However, if task's ctx is not carrying any pinned | 3428 | * However, if task's ctx is not carrying any pinned |
| 3133 | * events, no need to flip the cpuctx's events around. | 3429 | * events, no need to flip the cpuctx's events around. |
| 3134 | */ | 3430 | */ |
| 3135 | if (!list_empty(&ctx->pinned_groups)) | 3431 | if (!RB_EMPTY_ROOT(&ctx->pinned_groups.tree)) |
| 3136 | cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE); | 3432 | cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE); |
| 3137 | perf_event_sched_in(cpuctx, ctx, task); | 3433 | perf_event_sched_in(cpuctx, ctx, task); |
| 3138 | perf_pmu_enable(ctx->pmu); | 3434 | perf_pmu_enable(ctx->pmu); |
| @@ -3361,55 +3657,81 @@ static void perf_adjust_freq_unthr_context(struct perf_event_context *ctx, | |||
| 3361 | } | 3657 | } |
| 3362 | 3658 | ||
| 3363 | /* | 3659 | /* |
| 3364 | * Round-robin a context's events: | 3660 | * Move @event to the tail of the @ctx's eligible events. |
| 3365 | */ | 3661 | */ |
| 3366 | static void rotate_ctx(struct perf_event_context *ctx) | 3662 | static void rotate_ctx(struct perf_event_context *ctx, struct perf_event *event) |
| 3367 | { | 3663 | { |
| 3368 | /* | 3664 | /* |
| 3369 | * Rotate the first entry last of non-pinned groups. Rotation might be | 3665 | * Rotate the first entry last of non-pinned groups. Rotation might be |
| 3370 | * disabled by the inheritance code. | 3666 | * disabled by the inheritance code. |
| 3371 | */ | 3667 | */ |
| 3372 | if (!ctx->rotate_disable) | 3668 | if (ctx->rotate_disable) |
| 3373 | list_rotate_left(&ctx->flexible_groups); | 3669 | return; |
| 3670 | |||
| 3671 | perf_event_groups_delete(&ctx->flexible_groups, event); | ||
| 3672 | perf_event_groups_insert(&ctx->flexible_groups, event); | ||
| 3374 | } | 3673 | } |
| 3375 | 3674 | ||
| 3376 | static int perf_rotate_context(struct perf_cpu_context *cpuctx) | 3675 | static inline struct perf_event * |
| 3676 | ctx_first_active(struct perf_event_context *ctx) | ||
| 3377 | { | 3677 | { |
| 3678 | return list_first_entry_or_null(&ctx->flexible_active, | ||
| 3679 | struct perf_event, active_list); | ||
| 3680 | } | ||
| 3681 | |||
| 3682 | static bool perf_rotate_context(struct perf_cpu_context *cpuctx) | ||
| 3683 | { | ||
| 3684 | struct perf_event *cpu_event = NULL, *task_event = NULL; | ||
| 3685 | bool cpu_rotate = false, task_rotate = false; | ||
| 3378 | struct perf_event_context *ctx = NULL; | 3686 | struct perf_event_context *ctx = NULL; |
| 3379 | int rotate = 0; | 3687 | |
| 3688 | /* | ||
| 3689 | * Since we run this from IRQ context, nobody can install new | ||
| 3690 | * events, thus the event count values are stable. | ||
| 3691 | */ | ||
| 3380 | 3692 | ||
| 3381 | if (cpuctx->ctx.nr_events) { | 3693 | if (cpuctx->ctx.nr_events) { |
| 3382 | if (cpuctx->ctx.nr_events != cpuctx->ctx.nr_active) | 3694 | if (cpuctx->ctx.nr_events != cpuctx->ctx.nr_active) |
| 3383 | rotate = 1; | 3695 | cpu_rotate = true; |
| 3384 | } | 3696 | } |
| 3385 | 3697 | ||
| 3386 | ctx = cpuctx->task_ctx; | 3698 | ctx = cpuctx->task_ctx; |
| 3387 | if (ctx && ctx->nr_events) { | 3699 | if (ctx && ctx->nr_events) { |
| 3388 | if (ctx->nr_events != ctx->nr_active) | 3700 | if (ctx->nr_events != ctx->nr_active) |
| 3389 | rotate = 1; | 3701 | task_rotate = true; |
| 3390 | } | 3702 | } |
| 3391 | 3703 | ||
| 3392 | if (!rotate) | 3704 | if (!(cpu_rotate || task_rotate)) |
| 3393 | goto done; | 3705 | return false; |
| 3394 | 3706 | ||
| 3395 | perf_ctx_lock(cpuctx, cpuctx->task_ctx); | 3707 | perf_ctx_lock(cpuctx, cpuctx->task_ctx); |
| 3396 | perf_pmu_disable(cpuctx->ctx.pmu); | 3708 | perf_pmu_disable(cpuctx->ctx.pmu); |
| 3397 | 3709 | ||
| 3398 | cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE); | 3710 | if (task_rotate) |
| 3399 | if (ctx) | 3711 | task_event = ctx_first_active(ctx); |
| 3712 | if (cpu_rotate) | ||
| 3713 | cpu_event = ctx_first_active(&cpuctx->ctx); | ||
| 3714 | |||
| 3715 | /* | ||
| 3716 | * As per the order given at ctx_resched() first 'pop' task flexible | ||
| 3717 | * and then, if needed CPU flexible. | ||
| 3718 | */ | ||
| 3719 | if (task_event || (ctx && cpu_event)) | ||
| 3400 | ctx_sched_out(ctx, cpuctx, EVENT_FLEXIBLE); | 3720 | ctx_sched_out(ctx, cpuctx, EVENT_FLEXIBLE); |
| 3721 | if (cpu_event) | ||
| 3722 | cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE); | ||
| 3401 | 3723 | ||
| 3402 | rotate_ctx(&cpuctx->ctx); | 3724 | if (task_event) |
| 3403 | if (ctx) | 3725 | rotate_ctx(ctx, task_event); |
| 3404 | rotate_ctx(ctx); | 3726 | if (cpu_event) |
| 3727 | rotate_ctx(&cpuctx->ctx, cpu_event); | ||
| 3405 | 3728 | ||
| 3406 | perf_event_sched_in(cpuctx, ctx, current); | 3729 | perf_event_sched_in(cpuctx, ctx, current); |
| 3407 | 3730 | ||
| 3408 | perf_pmu_enable(cpuctx->ctx.pmu); | 3731 | perf_pmu_enable(cpuctx->ctx.pmu); |
| 3409 | perf_ctx_unlock(cpuctx, cpuctx->task_ctx); | 3732 | perf_ctx_unlock(cpuctx, cpuctx->task_ctx); |
| 3410 | done: | ||
| 3411 | 3733 | ||
| 3412 | return rotate; | 3734 | return true; |
| 3413 | } | 3735 | } |
| 3414 | 3736 | ||
| 3415 | void perf_event_task_tick(void) | 3737 | void perf_event_task_tick(void) |
| @@ -3554,7 +3876,7 @@ static void __perf_event_read(void *info) | |||
| 3554 | 3876 | ||
| 3555 | pmu->read(event); | 3877 | pmu->read(event); |
| 3556 | 3878 | ||
| 3557 | list_for_each_entry(sub, &event->sibling_list, group_entry) { | 3879 | for_each_sibling_event(sub, event) { |
| 3558 | if (sub->state == PERF_EVENT_STATE_ACTIVE) { | 3880 | if (sub->state == PERF_EVENT_STATE_ACTIVE) { |
| 3559 | /* | 3881 | /* |
| 3560 | * Use sibling's PMU rather than @event's since | 3882 | * Use sibling's PMU rather than @event's since |
| @@ -3728,9 +4050,11 @@ static void __perf_event_init_context(struct perf_event_context *ctx) | |||
| 3728 | raw_spin_lock_init(&ctx->lock); | 4050 | raw_spin_lock_init(&ctx->lock); |
| 3729 | mutex_init(&ctx->mutex); | 4051 | mutex_init(&ctx->mutex); |
| 3730 | INIT_LIST_HEAD(&ctx->active_ctx_list); | 4052 | INIT_LIST_HEAD(&ctx->active_ctx_list); |
| 3731 | INIT_LIST_HEAD(&ctx->pinned_groups); | 4053 | perf_event_groups_init(&ctx->pinned_groups); |
| 3732 | INIT_LIST_HEAD(&ctx->flexible_groups); | 4054 | perf_event_groups_init(&ctx->flexible_groups); |
| 3733 | INIT_LIST_HEAD(&ctx->event_list); | 4055 | INIT_LIST_HEAD(&ctx->event_list); |
| 4056 | INIT_LIST_HEAD(&ctx->pinned_active); | ||
| 4057 | INIT_LIST_HEAD(&ctx->flexible_active); | ||
| 3734 | atomic_set(&ctx->refcount, 1); | 4058 | atomic_set(&ctx->refcount, 1); |
| 3735 | } | 4059 | } |
| 3736 | 4060 | ||
| @@ -4400,7 +4724,7 @@ static int __perf_read_group_add(struct perf_event *leader, | |||
| 4400 | if (read_format & PERF_FORMAT_ID) | 4724 | if (read_format & PERF_FORMAT_ID) |
| 4401 | values[n++] = primary_event_id(leader); | 4725 | values[n++] = primary_event_id(leader); |
| 4402 | 4726 | ||
| 4403 | list_for_each_entry(sub, &leader->sibling_list, group_entry) { | 4727 | for_each_sibling_event(sub, leader) { |
| 4404 | values[n++] += perf_event_count(sub); | 4728 | values[n++] += perf_event_count(sub); |
| 4405 | if (read_format & PERF_FORMAT_ID) | 4729 | if (read_format & PERF_FORMAT_ID) |
| 4406 | values[n++] = primary_event_id(sub); | 4730 | values[n++] = primary_event_id(sub); |
| @@ -4594,7 +4918,7 @@ static void perf_event_for_each(struct perf_event *event, | |||
| 4594 | event = event->group_leader; | 4918 | event = event->group_leader; |
| 4595 | 4919 | ||
| 4596 | perf_event_for_each_child(event, func); | 4920 | perf_event_for_each_child(event, func); |
| 4597 | list_for_each_entry(sibling, &event->sibling_list, group_entry) | 4921 | for_each_sibling_event(sibling, event) |
| 4598 | perf_event_for_each_child(sibling, func); | 4922 | perf_event_for_each_child(sibling, func); |
| 4599 | } | 4923 | } |
| 4600 | 4924 | ||
| @@ -4676,6 +5000,8 @@ static int perf_event_set_output(struct perf_event *event, | |||
| 4676 | struct perf_event *output_event); | 5000 | struct perf_event *output_event); |
| 4677 | static int perf_event_set_filter(struct perf_event *event, void __user *arg); | 5001 | static int perf_event_set_filter(struct perf_event *event, void __user *arg); |
| 4678 | static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd); | 5002 | static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd); |
| 5003 | static int perf_copy_attr(struct perf_event_attr __user *uattr, | ||
| 5004 | struct perf_event_attr *attr); | ||
| 4679 | 5005 | ||
| 4680 | static long _perf_ioctl(struct perf_event *event, unsigned int cmd, unsigned long arg) | 5006 | static long _perf_ioctl(struct perf_event *event, unsigned int cmd, unsigned long arg) |
| 4681 | { | 5007 | { |
| @@ -4748,6 +5074,17 @@ static long _perf_ioctl(struct perf_event *event, unsigned int cmd, unsigned lon | |||
| 4748 | 5074 | ||
| 4749 | case PERF_EVENT_IOC_QUERY_BPF: | 5075 | case PERF_EVENT_IOC_QUERY_BPF: |
| 4750 | return perf_event_query_prog_array(event, (void __user *)arg); | 5076 | return perf_event_query_prog_array(event, (void __user *)arg); |
| 5077 | |||
| 5078 | case PERF_EVENT_IOC_MODIFY_ATTRIBUTES: { | ||
| 5079 | struct perf_event_attr new_attr; | ||
| 5080 | int err = perf_copy_attr((struct perf_event_attr __user *)arg, | ||
| 5081 | &new_attr); | ||
| 5082 | |||
| 5083 | if (err) | ||
| 5084 | return err; | ||
| 5085 | |||
| 5086 | return perf_event_modify_attr(event, &new_attr); | ||
| 5087 | } | ||
| 4751 | default: | 5088 | default: |
| 4752 | return -ENOTTY; | 5089 | return -ENOTTY; |
| 4753 | } | 5090 | } |
| @@ -5743,7 +6080,8 @@ static void perf_output_read_group(struct perf_output_handle *handle, | |||
| 5743 | if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) | 6080 | if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) |
| 5744 | values[n++] = running; | 6081 | values[n++] = running; |
| 5745 | 6082 | ||
| 5746 | if (leader != event) | 6083 | if ((leader != event) && |
| 6084 | (leader->state == PERF_EVENT_STATE_ACTIVE)) | ||
| 5747 | leader->pmu->read(leader); | 6085 | leader->pmu->read(leader); |
| 5748 | 6086 | ||
| 5749 | values[n++] = perf_event_count(leader); | 6087 | values[n++] = perf_event_count(leader); |
| @@ -5752,7 +6090,7 @@ static void perf_output_read_group(struct perf_output_handle *handle, | |||
| 5752 | 6090 | ||
| 5753 | __output_copy(handle, values, n * sizeof(u64)); | 6091 | __output_copy(handle, values, n * sizeof(u64)); |
| 5754 | 6092 | ||
| 5755 | list_for_each_entry(sub, &leader->sibling_list, group_entry) { | 6093 | for_each_sibling_event(sub, leader) { |
| 5756 | n = 0; | 6094 | n = 0; |
| 5757 | 6095 | ||
| 5758 | if ((sub != event) && | 6096 | if ((sub != event) && |
| @@ -8009,9 +8347,119 @@ static struct pmu perf_tracepoint = { | |||
| 8009 | .read = perf_swevent_read, | 8347 | .read = perf_swevent_read, |
| 8010 | }; | 8348 | }; |
| 8011 | 8349 | ||
| 8350 | #if defined(CONFIG_KPROBE_EVENTS) || defined(CONFIG_UPROBE_EVENTS) | ||
| 8351 | /* | ||
| 8352 | * Flags in config, used by dynamic PMU kprobe and uprobe | ||
| 8353 | * The flags should match the following PMU_FORMAT_ATTR(). | ||
| 8354 | * | ||
| 8355 | * PERF_PROBE_CONFIG_IS_RETPROBE if set, create kretprobe/uretprobe | ||
| 8356 | * if not set, create kprobe/uprobe | ||
| 8357 | */ | ||
| 8358 | enum perf_probe_config { | ||
| 8359 | PERF_PROBE_CONFIG_IS_RETPROBE = 1U << 0, /* [k,u]retprobe */ | ||
| 8360 | }; | ||
| 8361 | |||
| 8362 | PMU_FORMAT_ATTR(retprobe, "config:0"); | ||
| 8363 | |||
| 8364 | static struct attribute *probe_attrs[] = { | ||
| 8365 | &format_attr_retprobe.attr, | ||
| 8366 | NULL, | ||
| 8367 | }; | ||
| 8368 | |||
| 8369 | static struct attribute_group probe_format_group = { | ||
| 8370 | .name = "format", | ||
| 8371 | .attrs = probe_attrs, | ||
| 8372 | }; | ||
| 8373 | |||
| 8374 | static const struct attribute_group *probe_attr_groups[] = { | ||
| 8375 | &probe_format_group, | ||
| 8376 | NULL, | ||
| 8377 | }; | ||
| 8378 | #endif | ||
| 8379 | |||
| 8380 | #ifdef CONFIG_KPROBE_EVENTS | ||
| 8381 | static int perf_kprobe_event_init(struct perf_event *event); | ||
| 8382 | static struct pmu perf_kprobe = { | ||
| 8383 | .task_ctx_nr = perf_sw_context, | ||
| 8384 | .event_init = perf_kprobe_event_init, | ||
| 8385 | .add = perf_trace_add, | ||
| 8386 | .del = perf_trace_del, | ||
| 8387 | .start = perf_swevent_start, | ||
| 8388 | .stop = perf_swevent_stop, | ||
| 8389 | .read = perf_swevent_read, | ||
| 8390 | .attr_groups = probe_attr_groups, | ||
| 8391 | }; | ||
| 8392 | |||
| 8393 | static int perf_kprobe_event_init(struct perf_event *event) | ||
| 8394 | { | ||
| 8395 | int err; | ||
| 8396 | bool is_retprobe; | ||
| 8397 | |||
| 8398 | if (event->attr.type != perf_kprobe.type) | ||
| 8399 | return -ENOENT; | ||
| 8400 | /* | ||
| 8401 | * no branch sampling for probe events | ||
| 8402 | */ | ||
| 8403 | if (has_branch_stack(event)) | ||
| 8404 | return -EOPNOTSUPP; | ||
| 8405 | |||
| 8406 | is_retprobe = event->attr.config & PERF_PROBE_CONFIG_IS_RETPROBE; | ||
| 8407 | err = perf_kprobe_init(event, is_retprobe); | ||
| 8408 | if (err) | ||
| 8409 | return err; | ||
| 8410 | |||
| 8411 | event->destroy = perf_kprobe_destroy; | ||
| 8412 | |||
| 8413 | return 0; | ||
| 8414 | } | ||
| 8415 | #endif /* CONFIG_KPROBE_EVENTS */ | ||
| 8416 | |||
| 8417 | #ifdef CONFIG_UPROBE_EVENTS | ||
| 8418 | static int perf_uprobe_event_init(struct perf_event *event); | ||
| 8419 | static struct pmu perf_uprobe = { | ||
| 8420 | .task_ctx_nr = perf_sw_context, | ||
| 8421 | .event_init = perf_uprobe_event_init, | ||
| 8422 | .add = perf_trace_add, | ||
| 8423 | .del = perf_trace_del, | ||
| 8424 | .start = perf_swevent_start, | ||
| 8425 | .stop = perf_swevent_stop, | ||
| 8426 | .read = perf_swevent_read, | ||
| 8427 | .attr_groups = probe_attr_groups, | ||
| 8428 | }; | ||
| 8429 | |||
| 8430 | static int perf_uprobe_event_init(struct perf_event *event) | ||
| 8431 | { | ||
| 8432 | int err; | ||
| 8433 | bool is_retprobe; | ||
| 8434 | |||
| 8435 | if (event->attr.type != perf_uprobe.type) | ||
| 8436 | return -ENOENT; | ||
| 8437 | /* | ||
| 8438 | * no branch sampling for probe events | ||
| 8439 | */ | ||
| 8440 | if (has_branch_stack(event)) | ||
| 8441 | return -EOPNOTSUPP; | ||
| 8442 | |||
| 8443 | is_retprobe = event->attr.config & PERF_PROBE_CONFIG_IS_RETPROBE; | ||
| 8444 | err = perf_uprobe_init(event, is_retprobe); | ||
| 8445 | if (err) | ||
| 8446 | return err; | ||
| 8447 | |||
| 8448 | event->destroy = perf_uprobe_destroy; | ||
| 8449 | |||
| 8450 | return 0; | ||
| 8451 | } | ||
| 8452 | #endif /* CONFIG_UPROBE_EVENTS */ | ||
| 8453 | |||
| 8012 | static inline void perf_tp_register(void) | 8454 | static inline void perf_tp_register(void) |
| 8013 | { | 8455 | { |
| 8014 | perf_pmu_register(&perf_tracepoint, "tracepoint", PERF_TYPE_TRACEPOINT); | 8456 | perf_pmu_register(&perf_tracepoint, "tracepoint", PERF_TYPE_TRACEPOINT); |
| 8457 | #ifdef CONFIG_KPROBE_EVENTS | ||
| 8458 | perf_pmu_register(&perf_kprobe, "kprobe", -1); | ||
| 8459 | #endif | ||
| 8460 | #ifdef CONFIG_UPROBE_EVENTS | ||
| 8461 | perf_pmu_register(&perf_uprobe, "uprobe", -1); | ||
| 8462 | #endif | ||
| 8015 | } | 8463 | } |
| 8016 | 8464 | ||
| 8017 | static void perf_event_free_filter(struct perf_event *event) | 8465 | static void perf_event_free_filter(struct perf_event *event) |
| @@ -8088,13 +8536,32 @@ static void perf_event_free_bpf_handler(struct perf_event *event) | |||
| 8088 | } | 8536 | } |
| 8089 | #endif | 8537 | #endif |
| 8090 | 8538 | ||
| 8539 | /* | ||
| 8540 | * returns true if the event is a tracepoint, or a kprobe/uprobe created | ||
| 8541 | * with perf_event_open() | ||
| 8542 | */ | ||
| 8543 | static inline bool perf_event_is_tracing(struct perf_event *event) | ||
| 8544 | { | ||
| 8545 | if (event->pmu == &perf_tracepoint) | ||
| 8546 | return true; | ||
| 8547 | #ifdef CONFIG_KPROBE_EVENTS | ||
| 8548 | if (event->pmu == &perf_kprobe) | ||
| 8549 | return true; | ||
| 8550 | #endif | ||
| 8551 | #ifdef CONFIG_UPROBE_EVENTS | ||
| 8552 | if (event->pmu == &perf_uprobe) | ||
| 8553 | return true; | ||
| 8554 | #endif | ||
| 8555 | return false; | ||
| 8556 | } | ||
| 8557 | |||
| 8091 | static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd) | 8558 | static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd) |
| 8092 | { | 8559 | { |
| 8093 | bool is_kprobe, is_tracepoint, is_syscall_tp; | 8560 | bool is_kprobe, is_tracepoint, is_syscall_tp; |
| 8094 | struct bpf_prog *prog; | 8561 | struct bpf_prog *prog; |
| 8095 | int ret; | 8562 | int ret; |
| 8096 | 8563 | ||
| 8097 | if (event->attr.type != PERF_TYPE_TRACEPOINT) | 8564 | if (!perf_event_is_tracing(event)) |
| 8098 | return perf_event_set_bpf_handler(event, prog_fd); | 8565 | return perf_event_set_bpf_handler(event, prog_fd); |
| 8099 | 8566 | ||
| 8100 | is_kprobe = event->tp_event->flags & TRACE_EVENT_FL_UKPROBE; | 8567 | is_kprobe = event->tp_event->flags & TRACE_EVENT_FL_UKPROBE; |
| @@ -8140,7 +8607,7 @@ static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd) | |||
| 8140 | 8607 | ||
| 8141 | static void perf_event_free_bpf_prog(struct perf_event *event) | 8608 | static void perf_event_free_bpf_prog(struct perf_event *event) |
| 8142 | { | 8609 | { |
| 8143 | if (event->attr.type != PERF_TYPE_TRACEPOINT) { | 8610 | if (!perf_event_is_tracing(event)) { |
| 8144 | perf_event_free_bpf_handler(event); | 8611 | perf_event_free_bpf_handler(event); |
| 8145 | return; | 8612 | return; |
| 8146 | } | 8613 | } |
| @@ -8336,7 +8803,8 @@ restart: | |||
| 8336 | * * for kernel addresses: <start address>[/<size>] | 8803 | * * for kernel addresses: <start address>[/<size>] |
| 8337 | * * for object files: <start address>[/<size>]@</path/to/object/file> | 8804 | * * for object files: <start address>[/<size>]@</path/to/object/file> |
| 8338 | * | 8805 | * |
| 8339 | * if <size> is not specified, the range is treated as a single address. | 8806 | * if <size> is not specified or is zero, the range is treated as a single |
| 8807 | * address; not valid for ACTION=="filter". | ||
| 8340 | */ | 8808 | */ |
| 8341 | enum { | 8809 | enum { |
| 8342 | IF_ACT_NONE = -1, | 8810 | IF_ACT_NONE = -1, |
| @@ -8386,6 +8854,11 @@ perf_event_parse_addr_filter(struct perf_event *event, char *fstr, | |||
| 8386 | return -ENOMEM; | 8854 | return -ENOMEM; |
| 8387 | 8855 | ||
| 8388 | while ((start = strsep(&fstr, " ,\n")) != NULL) { | 8856 | while ((start = strsep(&fstr, " ,\n")) != NULL) { |
| 8857 | static const enum perf_addr_filter_action_t actions[] = { | ||
| 8858 | [IF_ACT_FILTER] = PERF_ADDR_FILTER_ACTION_FILTER, | ||
| 8859 | [IF_ACT_START] = PERF_ADDR_FILTER_ACTION_START, | ||
| 8860 | [IF_ACT_STOP] = PERF_ADDR_FILTER_ACTION_STOP, | ||
| 8861 | }; | ||
| 8389 | ret = -EINVAL; | 8862 | ret = -EINVAL; |
| 8390 | 8863 | ||
| 8391 | if (!*start) | 8864 | if (!*start) |
| @@ -8402,12 +8875,11 @@ perf_event_parse_addr_filter(struct perf_event *event, char *fstr, | |||
| 8402 | switch (token) { | 8875 | switch (token) { |
| 8403 | case IF_ACT_FILTER: | 8876 | case IF_ACT_FILTER: |
| 8404 | case IF_ACT_START: | 8877 | case IF_ACT_START: |
| 8405 | filter->filter = 1; | ||
| 8406 | |||
| 8407 | case IF_ACT_STOP: | 8878 | case IF_ACT_STOP: |
| 8408 | if (state != IF_STATE_ACTION) | 8879 | if (state != IF_STATE_ACTION) |
| 8409 | goto fail; | 8880 | goto fail; |
| 8410 | 8881 | ||
| 8882 | filter->action = actions[token]; | ||
| 8411 | state = IF_STATE_SOURCE; | 8883 | state = IF_STATE_SOURCE; |
| 8412 | break; | 8884 | break; |
| 8413 | 8885 | ||
| @@ -8420,15 +8892,12 @@ perf_event_parse_addr_filter(struct perf_event *event, char *fstr, | |||
| 8420 | if (state != IF_STATE_SOURCE) | 8892 | if (state != IF_STATE_SOURCE) |
| 8421 | goto fail; | 8893 | goto fail; |
| 8422 | 8894 | ||
| 8423 | if (token == IF_SRC_FILE || token == IF_SRC_KERNEL) | ||
| 8424 | filter->range = 1; | ||
| 8425 | |||
| 8426 | *args[0].to = 0; | 8895 | *args[0].to = 0; |
| 8427 | ret = kstrtoul(args[0].from, 0, &filter->offset); | 8896 | ret = kstrtoul(args[0].from, 0, &filter->offset); |
| 8428 | if (ret) | 8897 | if (ret) |
| 8429 | goto fail; | 8898 | goto fail; |
| 8430 | 8899 | ||
| 8431 | if (filter->range) { | 8900 | if (token == IF_SRC_KERNEL || token == IF_SRC_FILE) { |
| 8432 | *args[1].to = 0; | 8901 | *args[1].to = 0; |
| 8433 | ret = kstrtoul(args[1].from, 0, &filter->size); | 8902 | ret = kstrtoul(args[1].from, 0, &filter->size); |
| 8434 | if (ret) | 8903 | if (ret) |
| @@ -8436,7 +8905,7 @@ perf_event_parse_addr_filter(struct perf_event *event, char *fstr, | |||
| 8436 | } | 8905 | } |
| 8437 | 8906 | ||
| 8438 | if (token == IF_SRC_FILE || token == IF_SRC_FILEADDR) { | 8907 | if (token == IF_SRC_FILE || token == IF_SRC_FILEADDR) { |
| 8439 | int fpos = filter->range ? 2 : 1; | 8908 | int fpos = token == IF_SRC_FILE ? 2 : 1; |
| 8440 | 8909 | ||
| 8441 | filename = match_strdup(&args[fpos]); | 8910 | filename = match_strdup(&args[fpos]); |
| 8442 | if (!filename) { | 8911 | if (!filename) { |
| @@ -8462,6 +8931,14 @@ perf_event_parse_addr_filter(struct perf_event *event, char *fstr, | |||
| 8462 | if (kernel && event->attr.exclude_kernel) | 8931 | if (kernel && event->attr.exclude_kernel) |
| 8463 | goto fail; | 8932 | goto fail; |
| 8464 | 8933 | ||
| 8934 | /* | ||
| 8935 | * ACTION "filter" must have a non-zero length region | ||
| 8936 | * specified. | ||
| 8937 | */ | ||
| 8938 | if (filter->action == PERF_ADDR_FILTER_ACTION_FILTER && | ||
| 8939 | !filter->size) | ||
| 8940 | goto fail; | ||
| 8941 | |||
| 8465 | if (!kernel) { | 8942 | if (!kernel) { |
| 8466 | if (!filename) | 8943 | if (!filename) |
| 8467 | goto fail; | 8944 | goto fail; |
| @@ -8559,47 +9036,36 @@ fail_clear_files: | |||
| 8559 | return ret; | 9036 | return ret; |
| 8560 | } | 9037 | } |
| 8561 | 9038 | ||
| 8562 | static int | ||
| 8563 | perf_tracepoint_set_filter(struct perf_event *event, char *filter_str) | ||
| 8564 | { | ||
| 8565 | struct perf_event_context *ctx = event->ctx; | ||
| 8566 | int ret; | ||
| 8567 | |||
| 8568 | /* | ||
| 8569 | * Beware, here be dragons!! | ||
| 8570 | * | ||
| 8571 | * the tracepoint muck will deadlock against ctx->mutex, but the tracepoint | ||
| 8572 | * stuff does not actually need it. So temporarily drop ctx->mutex. As per | ||
| 8573 | * perf_event_ctx_lock() we already have a reference on ctx. | ||
| 8574 | * | ||
| 8575 | * This can result in event getting moved to a different ctx, but that | ||
| 8576 | * does not affect the tracepoint state. | ||
| 8577 | */ | ||
| 8578 | mutex_unlock(&ctx->mutex); | ||
| 8579 | ret = ftrace_profile_set_filter(event, event->attr.config, filter_str); | ||
| 8580 | mutex_lock(&ctx->mutex); | ||
| 8581 | |||
| 8582 | return ret; | ||
| 8583 | } | ||
| 8584 | |||
| 8585 | static int perf_event_set_filter(struct perf_event *event, void __user *arg) | 9039 | static int perf_event_set_filter(struct perf_event *event, void __user *arg) |
| 8586 | { | 9040 | { |
| 8587 | char *filter_str; | ||
| 8588 | int ret = -EINVAL; | 9041 | int ret = -EINVAL; |
| 8589 | 9042 | char *filter_str; | |
| 8590 | if ((event->attr.type != PERF_TYPE_TRACEPOINT || | ||
| 8591 | !IS_ENABLED(CONFIG_EVENT_TRACING)) && | ||
| 8592 | !has_addr_filter(event)) | ||
| 8593 | return -EINVAL; | ||
| 8594 | 9043 | ||
| 8595 | filter_str = strndup_user(arg, PAGE_SIZE); | 9044 | filter_str = strndup_user(arg, PAGE_SIZE); |
| 8596 | if (IS_ERR(filter_str)) | 9045 | if (IS_ERR(filter_str)) |
| 8597 | return PTR_ERR(filter_str); | 9046 | return PTR_ERR(filter_str); |
| 8598 | 9047 | ||
| 8599 | if (IS_ENABLED(CONFIG_EVENT_TRACING) && | 9048 | #ifdef CONFIG_EVENT_TRACING |
| 8600 | event->attr.type == PERF_TYPE_TRACEPOINT) | 9049 | if (perf_event_is_tracing(event)) { |
| 8601 | ret = perf_tracepoint_set_filter(event, filter_str); | 9050 | struct perf_event_context *ctx = event->ctx; |
| 8602 | else if (has_addr_filter(event)) | 9051 | |
| 9052 | /* | ||
| 9053 | * Beware, here be dragons!! | ||
| 9054 | * | ||
| 9055 | * the tracepoint muck will deadlock against ctx->mutex, but | ||
| 9056 | * the tracepoint stuff does not actually need it. So | ||
| 9057 | * temporarily drop ctx->mutex. As per perf_event_ctx_lock() we | ||
| 9058 | * already have a reference on ctx. | ||
| 9059 | * | ||
| 9060 | * This can result in event getting moved to a different ctx, | ||
| 9061 | * but that does not affect the tracepoint state. | ||
| 9062 | */ | ||
| 9063 | mutex_unlock(&ctx->mutex); | ||
| 9064 | ret = ftrace_profile_set_filter(event, event->attr.config, filter_str); | ||
| 9065 | mutex_lock(&ctx->mutex); | ||
| 9066 | } else | ||
| 9067 | #endif | ||
| 9068 | if (has_addr_filter(event)) | ||
| 8603 | ret = perf_event_set_addr_filter(event, filter_str); | 9069 | ret = perf_event_set_addr_filter(event, filter_str); |
| 8604 | 9070 | ||
| 8605 | kfree(filter_str); | 9071 | kfree(filter_str); |
| @@ -9452,9 +9918,10 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, | |||
| 9452 | mutex_init(&event->child_mutex); | 9918 | mutex_init(&event->child_mutex); |
| 9453 | INIT_LIST_HEAD(&event->child_list); | 9919 | INIT_LIST_HEAD(&event->child_list); |
| 9454 | 9920 | ||
| 9455 | INIT_LIST_HEAD(&event->group_entry); | ||
| 9456 | INIT_LIST_HEAD(&event->event_entry); | 9921 | INIT_LIST_HEAD(&event->event_entry); |
| 9457 | INIT_LIST_HEAD(&event->sibling_list); | 9922 | INIT_LIST_HEAD(&event->sibling_list); |
| 9923 | INIT_LIST_HEAD(&event->active_list); | ||
| 9924 | init_event_group(event); | ||
| 9458 | INIT_LIST_HEAD(&event->rb_entry); | 9925 | INIT_LIST_HEAD(&event->rb_entry); |
| 9459 | INIT_LIST_HEAD(&event->active_entry); | 9926 | INIT_LIST_HEAD(&event->active_entry); |
| 9460 | INIT_LIST_HEAD(&event->addr_filters.list); | 9927 | INIT_LIST_HEAD(&event->addr_filters.list); |
| @@ -9729,6 +10196,9 @@ static int perf_copy_attr(struct perf_event_attr __user *uattr, | |||
| 9729 | ret = -EINVAL; | 10196 | ret = -EINVAL; |
| 9730 | } | 10197 | } |
| 9731 | 10198 | ||
| 10199 | if (!attr->sample_max_stack) | ||
| 10200 | attr->sample_max_stack = sysctl_perf_event_max_stack; | ||
| 10201 | |||
| 9732 | if (attr->sample_type & PERF_SAMPLE_REGS_INTR) | 10202 | if (attr->sample_type & PERF_SAMPLE_REGS_INTR) |
| 9733 | ret = perf_reg_validate(attr->sample_regs_intr); | 10203 | ret = perf_reg_validate(attr->sample_regs_intr); |
| 9734 | out: | 10204 | out: |
| @@ -9942,9 +10412,6 @@ SYSCALL_DEFINE5(perf_event_open, | |||
| 9942 | perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN)) | 10412 | perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN)) |
| 9943 | return -EACCES; | 10413 | return -EACCES; |
| 9944 | 10414 | ||
| 9945 | if (!attr.sample_max_stack) | ||
| 9946 | attr.sample_max_stack = sysctl_perf_event_max_stack; | ||
| 9947 | |||
| 9948 | /* | 10415 | /* |
| 9949 | * In cgroup mode, the pid argument is used to pass the fd | 10416 | * In cgroup mode, the pid argument is used to pass the fd |
| 9950 | * opened to the cgroup directory in cgroupfs. The cpu argument | 10417 | * opened to the cgroup directory in cgroupfs. The cpu argument |
| @@ -10218,8 +10685,7 @@ SYSCALL_DEFINE5(perf_event_open, | |||
| 10218 | perf_remove_from_context(group_leader, 0); | 10685 | perf_remove_from_context(group_leader, 0); |
| 10219 | put_ctx(gctx); | 10686 | put_ctx(gctx); |
| 10220 | 10687 | ||
| 10221 | list_for_each_entry(sibling, &group_leader->sibling_list, | 10688 | for_each_sibling_event(sibling, group_leader) { |
| 10222 | group_entry) { | ||
| 10223 | perf_remove_from_context(sibling, 0); | 10689 | perf_remove_from_context(sibling, 0); |
| 10224 | put_ctx(gctx); | 10690 | put_ctx(gctx); |
| 10225 | } | 10691 | } |
| @@ -10240,8 +10706,7 @@ SYSCALL_DEFINE5(perf_event_open, | |||
| 10240 | * By installing siblings first we NO-OP because they're not | 10706 | * By installing siblings first we NO-OP because they're not |
| 10241 | * reachable through the group lists. | 10707 | * reachable through the group lists. |
| 10242 | */ | 10708 | */ |
| 10243 | list_for_each_entry(sibling, &group_leader->sibling_list, | 10709 | for_each_sibling_event(sibling, group_leader) { |
| 10244 | group_entry) { | ||
| 10245 | perf_event__state_init(sibling); | 10710 | perf_event__state_init(sibling); |
| 10246 | perf_install_in_context(ctx, sibling, sibling->cpu); | 10711 | perf_install_in_context(ctx, sibling, sibling->cpu); |
| 10247 | get_ctx(ctx); | 10712 | get_ctx(ctx); |
| @@ -10880,7 +11345,7 @@ static int inherit_group(struct perf_event *parent_event, | |||
| 10880 | * case inherit_event() will create individual events, similar to what | 11345 | * case inherit_event() will create individual events, similar to what |
| 10881 | * perf_group_detach() would do anyway. | 11346 | * perf_group_detach() would do anyway. |
| 10882 | */ | 11347 | */ |
| 10883 | list_for_each_entry(sub, &parent_event->sibling_list, group_entry) { | 11348 | for_each_sibling_event(sub, parent_event) { |
| 10884 | child_ctr = inherit_event(sub, parent, parent_ctx, | 11349 | child_ctr = inherit_event(sub, parent, parent_ctx, |
| 10885 | child, leader, child_ctx); | 11350 | child, leader, child_ctx); |
| 10886 | if (IS_ERR(child_ctr)) | 11351 | if (IS_ERR(child_ctr)) |
| @@ -10979,7 +11444,7 @@ static int perf_event_init_context(struct task_struct *child, int ctxn) | |||
| 10979 | * We dont have to disable NMIs - we are only looking at | 11444 | * We dont have to disable NMIs - we are only looking at |
| 10980 | * the list, not manipulating it: | 11445 | * the list, not manipulating it: |
| 10981 | */ | 11446 | */ |
| 10982 | list_for_each_entry(event, &parent_ctx->pinned_groups, group_entry) { | 11447 | perf_event_groups_for_each(event, &parent_ctx->pinned_groups) { |
| 10983 | ret = inherit_task_group(event, parent, parent_ctx, | 11448 | ret = inherit_task_group(event, parent, parent_ctx, |
| 10984 | child, ctxn, &inherited_all); | 11449 | child, ctxn, &inherited_all); |
| 10985 | if (ret) | 11450 | if (ret) |
| @@ -10995,7 +11460,7 @@ static int perf_event_init_context(struct task_struct *child, int ctxn) | |||
| 10995 | parent_ctx->rotate_disable = 1; | 11460 | parent_ctx->rotate_disable = 1; |
| 10996 | raw_spin_unlock_irqrestore(&parent_ctx->lock, flags); | 11461 | raw_spin_unlock_irqrestore(&parent_ctx->lock, flags); |
| 10997 | 11462 | ||
| 10998 | list_for_each_entry(event, &parent_ctx->flexible_groups, group_entry) { | 11463 | perf_event_groups_for_each(event, &parent_ctx->flexible_groups) { |
| 10999 | ret = inherit_task_group(event, parent, parent_ctx, | 11464 | ret = inherit_task_group(event, parent, parent_ctx, |
| 11000 | child, ctxn, &inherited_all); | 11465 | child, ctxn, &inherited_all); |
| 11001 | if (ret) | 11466 | if (ret) |
diff --git a/kernel/events/hw_breakpoint.c b/kernel/events/hw_breakpoint.c index 3f8cb1e14588..6e28d2866be5 100644 --- a/kernel/events/hw_breakpoint.c +++ b/kernel/events/hw_breakpoint.c | |||
| @@ -44,6 +44,7 @@ | |||
| 44 | #include <linux/list.h> | 44 | #include <linux/list.h> |
| 45 | #include <linux/cpu.h> | 45 | #include <linux/cpu.h> |
| 46 | #include <linux/smp.h> | 46 | #include <linux/smp.h> |
| 47 | #include <linux/bug.h> | ||
| 47 | 48 | ||
| 48 | #include <linux/hw_breakpoint.h> | 49 | #include <linux/hw_breakpoint.h> |
| 49 | /* | 50 | /* |
| @@ -85,9 +86,9 @@ __weak int hw_breakpoint_weight(struct perf_event *bp) | |||
| 85 | return 1; | 86 | return 1; |
| 86 | } | 87 | } |
| 87 | 88 | ||
| 88 | static inline enum bp_type_idx find_slot_idx(struct perf_event *bp) | 89 | static inline enum bp_type_idx find_slot_idx(u64 bp_type) |
| 89 | { | 90 | { |
| 90 | if (bp->attr.bp_type & HW_BREAKPOINT_RW) | 91 | if (bp_type & HW_BREAKPOINT_RW) |
| 91 | return TYPE_DATA; | 92 | return TYPE_DATA; |
| 92 | 93 | ||
| 93 | return TYPE_INST; | 94 | return TYPE_INST; |
| @@ -122,7 +123,7 @@ static int task_bp_pinned(int cpu, struct perf_event *bp, enum bp_type_idx type) | |||
| 122 | 123 | ||
| 123 | list_for_each_entry(iter, &bp_task_head, hw.bp_list) { | 124 | list_for_each_entry(iter, &bp_task_head, hw.bp_list) { |
| 124 | if (iter->hw.target == tsk && | 125 | if (iter->hw.target == tsk && |
| 125 | find_slot_idx(iter) == type && | 126 | find_slot_idx(iter->attr.bp_type) == type && |
| 126 | (iter->cpu < 0 || cpu == iter->cpu)) | 127 | (iter->cpu < 0 || cpu == iter->cpu)) |
| 127 | count += hw_breakpoint_weight(iter); | 128 | count += hw_breakpoint_weight(iter); |
| 128 | } | 129 | } |
| @@ -277,7 +278,7 @@ __weak void arch_unregister_hw_breakpoint(struct perf_event *bp) | |||
| 277 | * ((per_cpu(info->flexible, *) > 1) + max(per_cpu(info->cpu_pinned, *)) | 278 | * ((per_cpu(info->flexible, *) > 1) + max(per_cpu(info->cpu_pinned, *)) |
| 278 | * + max(per_cpu(info->tsk_pinned, *))) < HBP_NUM | 279 | * + max(per_cpu(info->tsk_pinned, *))) < HBP_NUM |
| 279 | */ | 280 | */ |
| 280 | static int __reserve_bp_slot(struct perf_event *bp) | 281 | static int __reserve_bp_slot(struct perf_event *bp, u64 bp_type) |
| 281 | { | 282 | { |
| 282 | struct bp_busy_slots slots = {0}; | 283 | struct bp_busy_slots slots = {0}; |
| 283 | enum bp_type_idx type; | 284 | enum bp_type_idx type; |
| @@ -288,11 +289,11 @@ static int __reserve_bp_slot(struct perf_event *bp) | |||
| 288 | return -ENOMEM; | 289 | return -ENOMEM; |
| 289 | 290 | ||
| 290 | /* Basic checks */ | 291 | /* Basic checks */ |
| 291 | if (bp->attr.bp_type == HW_BREAKPOINT_EMPTY || | 292 | if (bp_type == HW_BREAKPOINT_EMPTY || |
| 292 | bp->attr.bp_type == HW_BREAKPOINT_INVALID) | 293 | bp_type == HW_BREAKPOINT_INVALID) |
| 293 | return -EINVAL; | 294 | return -EINVAL; |
| 294 | 295 | ||
| 295 | type = find_slot_idx(bp); | 296 | type = find_slot_idx(bp_type); |
| 296 | weight = hw_breakpoint_weight(bp); | 297 | weight = hw_breakpoint_weight(bp); |
| 297 | 298 | ||
| 298 | fetch_bp_busy_slots(&slots, bp, type); | 299 | fetch_bp_busy_slots(&slots, bp, type); |
| @@ -317,19 +318,19 @@ int reserve_bp_slot(struct perf_event *bp) | |||
| 317 | 318 | ||
| 318 | mutex_lock(&nr_bp_mutex); | 319 | mutex_lock(&nr_bp_mutex); |
| 319 | 320 | ||
| 320 | ret = __reserve_bp_slot(bp); | 321 | ret = __reserve_bp_slot(bp, bp->attr.bp_type); |
| 321 | 322 | ||
| 322 | mutex_unlock(&nr_bp_mutex); | 323 | mutex_unlock(&nr_bp_mutex); |
| 323 | 324 | ||
| 324 | return ret; | 325 | return ret; |
| 325 | } | 326 | } |
| 326 | 327 | ||
| 327 | static void __release_bp_slot(struct perf_event *bp) | 328 | static void __release_bp_slot(struct perf_event *bp, u64 bp_type) |
| 328 | { | 329 | { |
| 329 | enum bp_type_idx type; | 330 | enum bp_type_idx type; |
| 330 | int weight; | 331 | int weight; |
| 331 | 332 | ||
| 332 | type = find_slot_idx(bp); | 333 | type = find_slot_idx(bp_type); |
| 333 | weight = hw_breakpoint_weight(bp); | 334 | weight = hw_breakpoint_weight(bp); |
| 334 | toggle_bp_slot(bp, false, type, weight); | 335 | toggle_bp_slot(bp, false, type, weight); |
| 335 | } | 336 | } |
| @@ -339,11 +340,43 @@ void release_bp_slot(struct perf_event *bp) | |||
| 339 | mutex_lock(&nr_bp_mutex); | 340 | mutex_lock(&nr_bp_mutex); |
| 340 | 341 | ||
| 341 | arch_unregister_hw_breakpoint(bp); | 342 | arch_unregister_hw_breakpoint(bp); |
| 342 | __release_bp_slot(bp); | 343 | __release_bp_slot(bp, bp->attr.bp_type); |
| 343 | 344 | ||
| 344 | mutex_unlock(&nr_bp_mutex); | 345 | mutex_unlock(&nr_bp_mutex); |
| 345 | } | 346 | } |
| 346 | 347 | ||
| 348 | static int __modify_bp_slot(struct perf_event *bp, u64 old_type) | ||
| 349 | { | ||
| 350 | int err; | ||
| 351 | |||
| 352 | __release_bp_slot(bp, old_type); | ||
| 353 | |||
| 354 | err = __reserve_bp_slot(bp, bp->attr.bp_type); | ||
| 355 | if (err) { | ||
| 356 | /* | ||
| 357 | * Reserve the old_type slot back in case | ||
| 358 | * there's no space for the new type. | ||
| 359 | * | ||
| 360 | * This must succeed, because we just released | ||
| 361 | * the old_type slot in the __release_bp_slot | ||
| 362 | * call above. If not, something is broken. | ||
| 363 | */ | ||
| 364 | WARN_ON(__reserve_bp_slot(bp, old_type)); | ||
| 365 | } | ||
| 366 | |||
| 367 | return err; | ||
| 368 | } | ||
| 369 | |||
| 370 | static int modify_bp_slot(struct perf_event *bp, u64 old_type) | ||
| 371 | { | ||
| 372 | int ret; | ||
| 373 | |||
| 374 | mutex_lock(&nr_bp_mutex); | ||
| 375 | ret = __modify_bp_slot(bp, old_type); | ||
| 376 | mutex_unlock(&nr_bp_mutex); | ||
| 377 | return ret; | ||
| 378 | } | ||
| 379 | |||
| 347 | /* | 380 | /* |
| 348 | * Allow the kernel debugger to reserve breakpoint slots without | 381 | * Allow the kernel debugger to reserve breakpoint slots without |
| 349 | * taking a lock using the dbg_* variant of for the reserve and | 382 | * taking a lock using the dbg_* variant of for the reserve and |
| @@ -354,7 +387,7 @@ int dbg_reserve_bp_slot(struct perf_event *bp) | |||
| 354 | if (mutex_is_locked(&nr_bp_mutex)) | 387 | if (mutex_is_locked(&nr_bp_mutex)) |
| 355 | return -1; | 388 | return -1; |
| 356 | 389 | ||
| 357 | return __reserve_bp_slot(bp); | 390 | return __reserve_bp_slot(bp, bp->attr.bp_type); |
| 358 | } | 391 | } |
| 359 | 392 | ||
| 360 | int dbg_release_bp_slot(struct perf_event *bp) | 393 | int dbg_release_bp_slot(struct perf_event *bp) |
| @@ -362,7 +395,7 @@ int dbg_release_bp_slot(struct perf_event *bp) | |||
| 362 | if (mutex_is_locked(&nr_bp_mutex)) | 395 | if (mutex_is_locked(&nr_bp_mutex)) |
| 363 | return -1; | 396 | return -1; |
| 364 | 397 | ||
| 365 | __release_bp_slot(bp); | 398 | __release_bp_slot(bp, bp->attr.bp_type); |
| 366 | 399 | ||
| 367 | return 0; | 400 | return 0; |
| 368 | } | 401 | } |
| @@ -423,20 +456,45 @@ register_user_hw_breakpoint(struct perf_event_attr *attr, | |||
| 423 | } | 456 | } |
| 424 | EXPORT_SYMBOL_GPL(register_user_hw_breakpoint); | 457 | EXPORT_SYMBOL_GPL(register_user_hw_breakpoint); |
| 425 | 458 | ||
| 459 | int | ||
| 460 | modify_user_hw_breakpoint_check(struct perf_event *bp, struct perf_event_attr *attr, | ||
| 461 | bool check) | ||
| 462 | { | ||
| 463 | u64 old_addr = bp->attr.bp_addr; | ||
| 464 | u64 old_len = bp->attr.bp_len; | ||
| 465 | int old_type = bp->attr.bp_type; | ||
| 466 | bool modify = attr->bp_type != old_type; | ||
| 467 | int err = 0; | ||
| 468 | |||
| 469 | bp->attr.bp_addr = attr->bp_addr; | ||
| 470 | bp->attr.bp_type = attr->bp_type; | ||
| 471 | bp->attr.bp_len = attr->bp_len; | ||
| 472 | |||
| 473 | if (check && memcmp(&bp->attr, attr, sizeof(*attr))) | ||
| 474 | return -EINVAL; | ||
| 475 | |||
| 476 | err = validate_hw_breakpoint(bp); | ||
| 477 | if (!err && modify) | ||
| 478 | err = modify_bp_slot(bp, old_type); | ||
| 479 | |||
| 480 | if (err) { | ||
| 481 | bp->attr.bp_addr = old_addr; | ||
| 482 | bp->attr.bp_type = old_type; | ||
| 483 | bp->attr.bp_len = old_len; | ||
| 484 | return err; | ||
| 485 | } | ||
| 486 | |||
| 487 | bp->attr.disabled = attr->disabled; | ||
| 488 | return 0; | ||
| 489 | } | ||
| 490 | |||
| 426 | /** | 491 | /** |
| 427 | * modify_user_hw_breakpoint - modify a user-space hardware breakpoint | 492 | * modify_user_hw_breakpoint - modify a user-space hardware breakpoint |
| 428 | * @bp: the breakpoint structure to modify | 493 | * @bp: the breakpoint structure to modify |
| 429 | * @attr: new breakpoint attributes | 494 | * @attr: new breakpoint attributes |
| 430 | * @triggered: callback to trigger when we hit the breakpoint | ||
| 431 | * @tsk: pointer to 'task_struct' of the process to which the address belongs | ||
| 432 | */ | 495 | */ |
| 433 | int modify_user_hw_breakpoint(struct perf_event *bp, struct perf_event_attr *attr) | 496 | int modify_user_hw_breakpoint(struct perf_event *bp, struct perf_event_attr *attr) |
| 434 | { | 497 | { |
| 435 | u64 old_addr = bp->attr.bp_addr; | ||
| 436 | u64 old_len = bp->attr.bp_len; | ||
| 437 | int old_type = bp->attr.bp_type; | ||
| 438 | int err = 0; | ||
| 439 | |||
| 440 | /* | 498 | /* |
| 441 | * modify_user_hw_breakpoint can be invoked with IRQs disabled and hence it | 499 | * modify_user_hw_breakpoint can be invoked with IRQs disabled and hence it |
| 442 | * will not be possible to raise IPIs that invoke __perf_event_disable. | 500 | * will not be possible to raise IPIs that invoke __perf_event_disable. |
| @@ -448,30 +506,14 @@ int modify_user_hw_breakpoint(struct perf_event *bp, struct perf_event_attr *att | |||
| 448 | else | 506 | else |
| 449 | perf_event_disable(bp); | 507 | perf_event_disable(bp); |
| 450 | 508 | ||
| 451 | bp->attr.bp_addr = attr->bp_addr; | 509 | if (!attr->disabled) { |
| 452 | bp->attr.bp_type = attr->bp_type; | 510 | int err = modify_user_hw_breakpoint_check(bp, attr, false); |
| 453 | bp->attr.bp_len = attr->bp_len; | ||
| 454 | |||
| 455 | if (attr->disabled) | ||
| 456 | goto end; | ||
| 457 | 511 | ||
| 458 | err = validate_hw_breakpoint(bp); | 512 | if (err) |
| 459 | if (!err) | 513 | return err; |
| 460 | perf_event_enable(bp); | 514 | perf_event_enable(bp); |
| 461 | 515 | bp->attr.disabled = 0; | |
| 462 | if (err) { | ||
| 463 | bp->attr.bp_addr = old_addr; | ||
| 464 | bp->attr.bp_type = old_type; | ||
| 465 | bp->attr.bp_len = old_len; | ||
| 466 | if (!bp->attr.disabled) | ||
| 467 | perf_event_enable(bp); | ||
| 468 | |||
| 469 | return err; | ||
| 470 | } | 516 | } |
| 471 | |||
| 472 | end: | ||
| 473 | bp->attr.disabled = attr->disabled; | ||
| 474 | |||
| 475 | return 0; | 517 | return 0; |
| 476 | } | 518 | } |
| 477 | EXPORT_SYMBOL_GPL(modify_user_hw_breakpoint); | 519 | EXPORT_SYMBOL_GPL(modify_user_hw_breakpoint); |
diff --git a/kernel/exit.c b/kernel/exit.c index 995453d9fb55..c3c7ac560114 100644 --- a/kernel/exit.c +++ b/kernel/exit.c | |||
| @@ -1691,7 +1691,7 @@ SYSCALL_DEFINE4(wait4, pid_t, upid, int __user *, stat_addr, | |||
| 1691 | */ | 1691 | */ |
| 1692 | SYSCALL_DEFINE3(waitpid, pid_t, pid, int __user *, stat_addr, int, options) | 1692 | SYSCALL_DEFINE3(waitpid, pid_t, pid, int __user *, stat_addr, int, options) |
| 1693 | { | 1693 | { |
| 1694 | return sys_wait4(pid, stat_addr, options, NULL); | 1694 | return kernel_wait4(pid, stat_addr, options, NULL); |
| 1695 | } | 1695 | } |
| 1696 | 1696 | ||
| 1697 | #endif | 1697 | #endif |
diff --git a/kernel/fork.c b/kernel/fork.c index e5d9d405ae4e..f71b67dc156d 100644 --- a/kernel/fork.c +++ b/kernel/fork.c | |||
| @@ -1198,8 +1198,8 @@ void mm_release(struct task_struct *tsk, struct mm_struct *mm) | |||
| 1198 | * not set up a proper pointer then tough luck. | 1198 | * not set up a proper pointer then tough luck. |
| 1199 | */ | 1199 | */ |
| 1200 | put_user(0, tsk->clear_child_tid); | 1200 | put_user(0, tsk->clear_child_tid); |
| 1201 | sys_futex(tsk->clear_child_tid, FUTEX_WAKE, | 1201 | do_futex(tsk->clear_child_tid, FUTEX_WAKE, |
| 1202 | 1, NULL, NULL, 0); | 1202 | 1, NULL, NULL, 0, 0); |
| 1203 | } | 1203 | } |
| 1204 | tsk->clear_child_tid = NULL; | 1204 | tsk->clear_child_tid = NULL; |
| 1205 | } | 1205 | } |
| @@ -2354,7 +2354,7 @@ static int unshare_fd(unsigned long unshare_flags, struct files_struct **new_fdp | |||
| 2354 | * constructed. Here we are modifying the current, active, | 2354 | * constructed. Here we are modifying the current, active, |
| 2355 | * task_struct. | 2355 | * task_struct. |
| 2356 | */ | 2356 | */ |
| 2357 | SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags) | 2357 | int ksys_unshare(unsigned long unshare_flags) |
| 2358 | { | 2358 | { |
| 2359 | struct fs_struct *fs, *new_fs = NULL; | 2359 | struct fs_struct *fs, *new_fs = NULL; |
| 2360 | struct files_struct *fd, *new_fd = NULL; | 2360 | struct files_struct *fd, *new_fd = NULL; |
| @@ -2470,6 +2470,11 @@ bad_unshare_out: | |||
| 2470 | return err; | 2470 | return err; |
| 2471 | } | 2471 | } |
| 2472 | 2472 | ||
| 2473 | SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags) | ||
| 2474 | { | ||
| 2475 | return ksys_unshare(unshare_flags); | ||
| 2476 | } | ||
| 2477 | |||
| 2473 | /* | 2478 | /* |
| 2474 | * Helper to unshare the files of the current task. | 2479 | * Helper to unshare the files of the current task. |
| 2475 | * We don't want to expose copy_files internals to | 2480 | * We don't want to expose copy_files internals to |
diff --git a/kernel/kexec.c b/kernel/kexec.c index e62ec4dc6620..aed8fb2564b3 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c | |||
| @@ -192,11 +192,9 @@ out: | |||
| 192 | * that to happen you need to do that yourself. | 192 | * that to happen you need to do that yourself. |
| 193 | */ | 193 | */ |
| 194 | 194 | ||
| 195 | SYSCALL_DEFINE4(kexec_load, unsigned long, entry, unsigned long, nr_segments, | 195 | static inline int kexec_load_check(unsigned long nr_segments, |
| 196 | struct kexec_segment __user *, segments, unsigned long, flags) | 196 | unsigned long flags) |
| 197 | { | 197 | { |
| 198 | int result; | ||
| 199 | |||
| 200 | /* We only trust the superuser with rebooting the system. */ | 198 | /* We only trust the superuser with rebooting the system. */ |
| 201 | if (!capable(CAP_SYS_BOOT) || kexec_load_disabled) | 199 | if (!capable(CAP_SYS_BOOT) || kexec_load_disabled) |
| 202 | return -EPERM; | 200 | return -EPERM; |
| @@ -208,17 +206,29 @@ SYSCALL_DEFINE4(kexec_load, unsigned long, entry, unsigned long, nr_segments, | |||
| 208 | if ((flags & KEXEC_FLAGS) != (flags & ~KEXEC_ARCH_MASK)) | 206 | if ((flags & KEXEC_FLAGS) != (flags & ~KEXEC_ARCH_MASK)) |
| 209 | return -EINVAL; | 207 | return -EINVAL; |
| 210 | 208 | ||
| 211 | /* Verify we are on the appropriate architecture */ | ||
| 212 | if (((flags & KEXEC_ARCH_MASK) != KEXEC_ARCH) && | ||
| 213 | ((flags & KEXEC_ARCH_MASK) != KEXEC_ARCH_DEFAULT)) | ||
| 214 | return -EINVAL; | ||
| 215 | |||
| 216 | /* Put an artificial cap on the number | 209 | /* Put an artificial cap on the number |
| 217 | * of segments passed to kexec_load. | 210 | * of segments passed to kexec_load. |
| 218 | */ | 211 | */ |
| 219 | if (nr_segments > KEXEC_SEGMENT_MAX) | 212 | if (nr_segments > KEXEC_SEGMENT_MAX) |
| 220 | return -EINVAL; | 213 | return -EINVAL; |
| 221 | 214 | ||
| 215 | return 0; | ||
| 216 | } | ||
| 217 | |||
| 218 | SYSCALL_DEFINE4(kexec_load, unsigned long, entry, unsigned long, nr_segments, | ||
| 219 | struct kexec_segment __user *, segments, unsigned long, flags) | ||
| 220 | { | ||
| 221 | int result; | ||
| 222 | |||
| 223 | result = kexec_load_check(nr_segments, flags); | ||
| 224 | if (result) | ||
| 225 | return result; | ||
| 226 | |||
| 227 | /* Verify we are on the appropriate architecture */ | ||
| 228 | if (((flags & KEXEC_ARCH_MASK) != KEXEC_ARCH) && | ||
| 229 | ((flags & KEXEC_ARCH_MASK) != KEXEC_ARCH_DEFAULT)) | ||
| 230 | return -EINVAL; | ||
| 231 | |||
| 222 | /* Because we write directly to the reserved memory | 232 | /* Because we write directly to the reserved memory |
| 223 | * region when loading crash kernels we need a mutex here to | 233 | * region when loading crash kernels we need a mutex here to |
| 224 | * prevent multiple crash kernels from attempting to load | 234 | * prevent multiple crash kernels from attempting to load |
| @@ -247,15 +257,16 @@ COMPAT_SYSCALL_DEFINE4(kexec_load, compat_ulong_t, entry, | |||
| 247 | struct kexec_segment out, __user *ksegments; | 257 | struct kexec_segment out, __user *ksegments; |
| 248 | unsigned long i, result; | 258 | unsigned long i, result; |
| 249 | 259 | ||
| 260 | result = kexec_load_check(nr_segments, flags); | ||
| 261 | if (result) | ||
| 262 | return result; | ||
| 263 | |||
| 250 | /* Don't allow clients that don't understand the native | 264 | /* Don't allow clients that don't understand the native |
| 251 | * architecture to do anything. | 265 | * architecture to do anything. |
| 252 | */ | 266 | */ |
| 253 | if ((flags & KEXEC_ARCH_MASK) == KEXEC_ARCH_DEFAULT) | 267 | if ((flags & KEXEC_ARCH_MASK) == KEXEC_ARCH_DEFAULT) |
| 254 | return -EINVAL; | 268 | return -EINVAL; |
| 255 | 269 | ||
| 256 | if (nr_segments > KEXEC_SEGMENT_MAX) | ||
| 257 | return -EINVAL; | ||
| 258 | |||
| 259 | ksegments = compat_alloc_user_space(nr_segments * sizeof(out)); | 270 | ksegments = compat_alloc_user_space(nr_segments * sizeof(out)); |
| 260 | for (i = 0; i < nr_segments; i++) { | 271 | for (i = 0; i < nr_segments; i++) { |
| 261 | result = copy_from_user(&in, &segments[i], sizeof(in)); | 272 | result = copy_from_user(&in, &segments[i], sizeof(in)); |
| @@ -272,6 +283,21 @@ COMPAT_SYSCALL_DEFINE4(kexec_load, compat_ulong_t, entry, | |||
| 272 | return -EFAULT; | 283 | return -EFAULT; |
| 273 | } | 284 | } |
| 274 | 285 | ||
| 275 | return sys_kexec_load(entry, nr_segments, ksegments, flags); | 286 | /* Because we write directly to the reserved memory |
| 287 | * region when loading crash kernels we need a mutex here to | ||
| 288 | * prevent multiple crash kernels from attempting to load | ||
| 289 | * simultaneously, and to prevent a crash kernel from loading | ||
| 290 | * over the top of a in use crash kernel. | ||
| 291 | * | ||
| 292 | * KISS: always take the mutex. | ||
| 293 | */ | ||
| 294 | if (!mutex_trylock(&kexec_mutex)) | ||
| 295 | return -EBUSY; | ||
| 296 | |||
| 297 | result = do_kexec_load(entry, nr_segments, ksegments, flags); | ||
| 298 | |||
| 299 | mutex_unlock(&kexec_mutex); | ||
| 300 | |||
| 301 | return result; | ||
| 276 | } | 302 | } |
| 277 | #endif | 303 | #endif |
diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 89b5f83f1969..023386338269 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c | |||
| @@ -556,9 +556,9 @@ static void print_lock(struct held_lock *hlock) | |||
| 556 | return; | 556 | return; |
| 557 | } | 557 | } |
| 558 | 558 | ||
| 559 | printk(KERN_CONT "%p", hlock->instance); | ||
| 559 | print_lock_name(lock_classes + class_idx - 1); | 560 | print_lock_name(lock_classes + class_idx - 1); |
| 560 | printk(KERN_CONT ", at: [<%p>] %pS\n", | 561 | printk(KERN_CONT ", at: %pS\n", (void *)hlock->acquire_ip); |
| 561 | (void *)hlock->acquire_ip, (void *)hlock->acquire_ip); | ||
| 562 | } | 562 | } |
| 563 | 563 | ||
| 564 | static void lockdep_print_held_locks(struct task_struct *curr) | 564 | static void lockdep_print_held_locks(struct task_struct *curr) |
| @@ -808,7 +808,7 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force) | |||
| 808 | if (verbose(class)) { | 808 | if (verbose(class)) { |
| 809 | graph_unlock(); | 809 | graph_unlock(); |
| 810 | 810 | ||
| 811 | printk("\nnew class %p: %s", class->key, class->name); | 811 | printk("\nnew class %px: %s", class->key, class->name); |
| 812 | if (class->name_version > 1) | 812 | if (class->name_version > 1) |
| 813 | printk(KERN_CONT "#%d", class->name_version); | 813 | printk(KERN_CONT "#%d", class->name_version); |
| 814 | printk(KERN_CONT "\n"); | 814 | printk(KERN_CONT "\n"); |
| @@ -1407,7 +1407,7 @@ static void print_lock_class_header(struct lock_class *class, int depth) | |||
| 1407 | } | 1407 | } |
| 1408 | printk("%*s }\n", depth, ""); | 1408 | printk("%*s }\n", depth, ""); |
| 1409 | 1409 | ||
| 1410 | printk("%*s ... key at: [<%p>] %pS\n", | 1410 | printk("%*s ... key at: [<%px>] %pS\n", |
| 1411 | depth, "", class->key, class->key); | 1411 | depth, "", class->key, class->key); |
| 1412 | } | 1412 | } |
| 1413 | 1413 | ||
| @@ -2340,7 +2340,7 @@ cache_hit: | |||
| 2340 | 2340 | ||
| 2341 | if (very_verbose(class)) { | 2341 | if (very_verbose(class)) { |
| 2342 | printk("\nhash chain already cached, key: " | 2342 | printk("\nhash chain already cached, key: " |
| 2343 | "%016Lx tail class: [%p] %s\n", | 2343 | "%016Lx tail class: [%px] %s\n", |
| 2344 | (unsigned long long)chain_key, | 2344 | (unsigned long long)chain_key, |
| 2345 | class->key, class->name); | 2345 | class->key, class->name); |
| 2346 | } | 2346 | } |
| @@ -2349,7 +2349,7 @@ cache_hit: | |||
| 2349 | } | 2349 | } |
| 2350 | 2350 | ||
| 2351 | if (very_verbose(class)) { | 2351 | if (very_verbose(class)) { |
| 2352 | printk("\nnew hash chain, key: %016Lx tail class: [%p] %s\n", | 2352 | printk("\nnew hash chain, key: %016Lx tail class: [%px] %s\n", |
| 2353 | (unsigned long long)chain_key, class->key, class->name); | 2353 | (unsigned long long)chain_key, class->key, class->name); |
| 2354 | } | 2354 | } |
| 2355 | 2355 | ||
| @@ -2676,16 +2676,16 @@ check_usage_backwards(struct task_struct *curr, struct held_lock *this, | |||
| 2676 | void print_irqtrace_events(struct task_struct *curr) | 2676 | void print_irqtrace_events(struct task_struct *curr) |
| 2677 | { | 2677 | { |
| 2678 | printk("irq event stamp: %u\n", curr->irq_events); | 2678 | printk("irq event stamp: %u\n", curr->irq_events); |
| 2679 | printk("hardirqs last enabled at (%u): [<%p>] %pS\n", | 2679 | printk("hardirqs last enabled at (%u): [<%px>] %pS\n", |
| 2680 | curr->hardirq_enable_event, (void *)curr->hardirq_enable_ip, | 2680 | curr->hardirq_enable_event, (void *)curr->hardirq_enable_ip, |
| 2681 | (void *)curr->hardirq_enable_ip); | 2681 | (void *)curr->hardirq_enable_ip); |
| 2682 | printk("hardirqs last disabled at (%u): [<%p>] %pS\n", | 2682 | printk("hardirqs last disabled at (%u): [<%px>] %pS\n", |
| 2683 | curr->hardirq_disable_event, (void *)curr->hardirq_disable_ip, | 2683 | curr->hardirq_disable_event, (void *)curr->hardirq_disable_ip, |
| 2684 | (void *)curr->hardirq_disable_ip); | 2684 | (void *)curr->hardirq_disable_ip); |
| 2685 | printk("softirqs last enabled at (%u): [<%p>] %pS\n", | 2685 | printk("softirqs last enabled at (%u): [<%px>] %pS\n", |
| 2686 | curr->softirq_enable_event, (void *)curr->softirq_enable_ip, | 2686 | curr->softirq_enable_event, (void *)curr->softirq_enable_ip, |
| 2687 | (void *)curr->softirq_enable_ip); | 2687 | (void *)curr->softirq_enable_ip); |
| 2688 | printk("softirqs last disabled at (%u): [<%p>] %pS\n", | 2688 | printk("softirqs last disabled at (%u): [<%px>] %pS\n", |
| 2689 | curr->softirq_disable_event, (void *)curr->softirq_disable_ip, | 2689 | curr->softirq_disable_event, (void *)curr->softirq_disable_ip, |
| 2690 | (void *)curr->softirq_disable_ip); | 2690 | (void *)curr->softirq_disable_ip); |
| 2691 | } | 2691 | } |
| @@ -3207,7 +3207,7 @@ static void __lockdep_init_map(struct lockdep_map *lock, const char *name, | |||
| 3207 | * Sanity check, the lock-class key must be persistent: | 3207 | * Sanity check, the lock-class key must be persistent: |
| 3208 | */ | 3208 | */ |
| 3209 | if (!static_obj(key)) { | 3209 | if (!static_obj(key)) { |
| 3210 | printk("BUG: key %p not in .data!\n", key); | 3210 | printk("BUG: key %px not in .data!\n", key); |
| 3211 | /* | 3211 | /* |
| 3212 | * What it says above ^^^^^, I suggest you read it. | 3212 | * What it says above ^^^^^, I suggest you read it. |
| 3213 | */ | 3213 | */ |
| @@ -3322,7 +3322,7 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, | |||
| 3322 | } | 3322 | } |
| 3323 | atomic_inc((atomic_t *)&class->ops); | 3323 | atomic_inc((atomic_t *)&class->ops); |
| 3324 | if (very_verbose(class)) { | 3324 | if (very_verbose(class)) { |
| 3325 | printk("\nacquire class [%p] %s", class->key, class->name); | 3325 | printk("\nacquire class [%px] %s", class->key, class->name); |
| 3326 | if (class->name_version > 1) | 3326 | if (class->name_version > 1) |
| 3327 | printk(KERN_CONT "#%d", class->name_version); | 3327 | printk(KERN_CONT "#%d", class->name_version); |
| 3328 | printk(KERN_CONT "\n"); | 3328 | printk(KERN_CONT "\n"); |
| @@ -4376,7 +4376,7 @@ print_freed_lock_bug(struct task_struct *curr, const void *mem_from, | |||
| 4376 | pr_warn("WARNING: held lock freed!\n"); | 4376 | pr_warn("WARNING: held lock freed!\n"); |
| 4377 | print_kernel_ident(); | 4377 | print_kernel_ident(); |
| 4378 | pr_warn("-------------------------\n"); | 4378 | pr_warn("-------------------------\n"); |
| 4379 | pr_warn("%s/%d is freeing memory %p-%p, with a lock still held there!\n", | 4379 | pr_warn("%s/%d is freeing memory %px-%px, with a lock still held there!\n", |
| 4380 | curr->comm, task_pid_nr(curr), mem_from, mem_to-1); | 4380 | curr->comm, task_pid_nr(curr), mem_from, mem_to-1); |
| 4381 | print_lock(hlock); | 4381 | print_lock(hlock); |
| 4382 | lockdep_print_held_locks(curr); | 4382 | lockdep_print_held_locks(curr); |
diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c index 940633c63254..4f014be7a4b8 100644 --- a/kernel/locking/rtmutex.c +++ b/kernel/locking/rtmutex.c | |||
| @@ -1268,8 +1268,7 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state, | |||
| 1268 | 1268 | ||
| 1269 | if (unlikely(ret)) { | 1269 | if (unlikely(ret)) { |
| 1270 | __set_current_state(TASK_RUNNING); | 1270 | __set_current_state(TASK_RUNNING); |
| 1271 | if (rt_mutex_has_waiters(lock)) | 1271 | remove_waiter(lock, &waiter); |
| 1272 | remove_waiter(lock, &waiter); | ||
| 1273 | rt_mutex_handle_deadlock(ret, chwalk, &waiter); | 1272 | rt_mutex_handle_deadlock(ret, chwalk, &waiter); |
| 1274 | } | 1273 | } |
| 1275 | 1274 | ||
diff --git a/kernel/locking/rtmutex_common.h b/kernel/locking/rtmutex_common.h index 68686b3ec3c1..d1d62f942be2 100644 --- a/kernel/locking/rtmutex_common.h +++ b/kernel/locking/rtmutex_common.h | |||
| @@ -52,12 +52,13 @@ static inline int rt_mutex_has_waiters(struct rt_mutex *lock) | |||
| 52 | static inline struct rt_mutex_waiter * | 52 | static inline struct rt_mutex_waiter * |
| 53 | rt_mutex_top_waiter(struct rt_mutex *lock) | 53 | rt_mutex_top_waiter(struct rt_mutex *lock) |
| 54 | { | 54 | { |
| 55 | struct rt_mutex_waiter *w; | 55 | struct rb_node *leftmost = rb_first_cached(&lock->waiters); |
| 56 | 56 | struct rt_mutex_waiter *w = NULL; | |
| 57 | w = rb_entry(lock->waiters.rb_leftmost, | ||
| 58 | struct rt_mutex_waiter, tree_entry); | ||
| 59 | BUG_ON(w->lock != lock); | ||
| 60 | 57 | ||
| 58 | if (leftmost) { | ||
| 59 | w = rb_entry(leftmost, struct rt_mutex_waiter, tree_entry); | ||
| 60 | BUG_ON(w->lock != lock); | ||
| 61 | } | ||
| 61 | return w; | 62 | return w; |
| 62 | } | 63 | } |
| 63 | 64 | ||
diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c index f549c552dbf1..30465a2f2b6c 100644 --- a/kernel/locking/rwsem.c +++ b/kernel/locking/rwsem.c | |||
| @@ -117,6 +117,7 @@ EXPORT_SYMBOL(down_write_trylock); | |||
| 117 | void up_read(struct rw_semaphore *sem) | 117 | void up_read(struct rw_semaphore *sem) |
| 118 | { | 118 | { |
| 119 | rwsem_release(&sem->dep_map, 1, _RET_IP_); | 119 | rwsem_release(&sem->dep_map, 1, _RET_IP_); |
| 120 | DEBUG_RWSEMS_WARN_ON(sem->owner != RWSEM_READER_OWNED); | ||
| 120 | 121 | ||
| 121 | __up_read(sem); | 122 | __up_read(sem); |
| 122 | } | 123 | } |
| @@ -129,6 +130,7 @@ EXPORT_SYMBOL(up_read); | |||
| 129 | void up_write(struct rw_semaphore *sem) | 130 | void up_write(struct rw_semaphore *sem) |
| 130 | { | 131 | { |
| 131 | rwsem_release(&sem->dep_map, 1, _RET_IP_); | 132 | rwsem_release(&sem->dep_map, 1, _RET_IP_); |
| 133 | DEBUG_RWSEMS_WARN_ON(sem->owner != current); | ||
| 132 | 134 | ||
| 133 | rwsem_clear_owner(sem); | 135 | rwsem_clear_owner(sem); |
| 134 | __up_write(sem); | 136 | __up_write(sem); |
| @@ -142,6 +144,7 @@ EXPORT_SYMBOL(up_write); | |||
| 142 | void downgrade_write(struct rw_semaphore *sem) | 144 | void downgrade_write(struct rw_semaphore *sem) |
| 143 | { | 145 | { |
| 144 | lock_downgrade(&sem->dep_map, _RET_IP_); | 146 | lock_downgrade(&sem->dep_map, _RET_IP_); |
| 147 | DEBUG_RWSEMS_WARN_ON(sem->owner != current); | ||
| 145 | 148 | ||
| 146 | rwsem_set_reader_owned(sem); | 149 | rwsem_set_reader_owned(sem); |
| 147 | __downgrade_write(sem); | 150 | __downgrade_write(sem); |
| @@ -211,6 +214,7 @@ EXPORT_SYMBOL(down_write_killable_nested); | |||
| 211 | 214 | ||
| 212 | void up_read_non_owner(struct rw_semaphore *sem) | 215 | void up_read_non_owner(struct rw_semaphore *sem) |
| 213 | { | 216 | { |
| 217 | DEBUG_RWSEMS_WARN_ON(sem->owner != RWSEM_READER_OWNED); | ||
| 214 | __up_read(sem); | 218 | __up_read(sem); |
| 215 | } | 219 | } |
| 216 | 220 | ||
diff --git a/kernel/locking/rwsem.h b/kernel/locking/rwsem.h index a883b8f1fdc6..a17cba8d94bb 100644 --- a/kernel/locking/rwsem.h +++ b/kernel/locking/rwsem.h | |||
| @@ -16,6 +16,12 @@ | |||
| 16 | */ | 16 | */ |
| 17 | #define RWSEM_READER_OWNED ((struct task_struct *)1UL) | 17 | #define RWSEM_READER_OWNED ((struct task_struct *)1UL) |
| 18 | 18 | ||
| 19 | #ifdef CONFIG_DEBUG_RWSEMS | ||
| 20 | # define DEBUG_RWSEMS_WARN_ON(c) DEBUG_LOCKS_WARN_ON(c) | ||
| 21 | #else | ||
| 22 | # define DEBUG_RWSEMS_WARN_ON(c) | ||
| 23 | #endif | ||
| 24 | |||
| 19 | #ifdef CONFIG_RWSEM_SPIN_ON_OWNER | 25 | #ifdef CONFIG_RWSEM_SPIN_ON_OWNER |
| 20 | /* | 26 | /* |
| 21 | * All writes to owner are protected by WRITE_ONCE() to make sure that | 27 | * All writes to owner are protected by WRITE_ONCE() to make sure that |
| @@ -41,7 +47,7 @@ static inline void rwsem_set_reader_owned(struct rw_semaphore *sem) | |||
| 41 | * do a write to the rwsem cacheline when it is really necessary | 47 | * do a write to the rwsem cacheline when it is really necessary |
| 42 | * to minimize cacheline contention. | 48 | * to minimize cacheline contention. |
| 43 | */ | 49 | */ |
| 44 | if (sem->owner != RWSEM_READER_OWNED) | 50 | if (READ_ONCE(sem->owner) != RWSEM_READER_OWNED) |
| 45 | WRITE_ONCE(sem->owner, RWSEM_READER_OWNED); | 51 | WRITE_ONCE(sem->owner, RWSEM_READER_OWNED); |
| 46 | } | 52 | } |
| 47 | 53 | ||
diff --git a/kernel/module.c b/kernel/module.c index e42764acedb4..a6e43a5806a1 100644 --- a/kernel/module.c +++ b/kernel/module.c | |||
| @@ -2181,10 +2181,6 @@ static void free_module(struct module *mod) | |||
| 2181 | /* Finally, free the core (containing the module structure) */ | 2181 | /* Finally, free the core (containing the module structure) */ |
| 2182 | disable_ro_nx(&mod->core_layout); | 2182 | disable_ro_nx(&mod->core_layout); |
| 2183 | module_memfree(mod->core_layout.base); | 2183 | module_memfree(mod->core_layout.base); |
| 2184 | |||
| 2185 | #ifdef CONFIG_MPU | ||
| 2186 | update_protections(current->mm); | ||
| 2187 | #endif | ||
| 2188 | } | 2184 | } |
| 2189 | 2185 | ||
| 2190 | void *__symbol_get(const char *symbol) | 2186 | void *__symbol_get(const char *symbol) |
diff --git a/kernel/panic.c b/kernel/panic.c index 4b794f1d8561..9d833d913c84 100644 --- a/kernel/panic.c +++ b/kernel/panic.c | |||
| @@ -289,7 +289,7 @@ void panic(const char *fmt, ...) | |||
| 289 | disabled_wait(caller); | 289 | disabled_wait(caller); |
| 290 | } | 290 | } |
| 291 | #endif | 291 | #endif |
| 292 | pr_emerg("---[ end Kernel panic - not syncing: %s\n", buf); | 292 | pr_emerg("---[ end Kernel panic - not syncing: %s ]---\n", buf); |
| 293 | local_irq_enable(); | 293 | local_irq_enable(); |
| 294 | for (i = 0; ; i += PANIC_TIMER_STEP) { | 294 | for (i = 0; ; i += PANIC_TIMER_STEP) { |
| 295 | touch_softlockup_watchdog(); | 295 | touch_softlockup_watchdog(); |
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index 0b53eef7d34b..93b57f026688 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c | |||
| @@ -242,16 +242,16 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns) | |||
| 242 | 242 | ||
| 243 | /* | 243 | /* |
| 244 | * Reap the EXIT_ZOMBIE children we had before we ignored SIGCHLD. | 244 | * Reap the EXIT_ZOMBIE children we had before we ignored SIGCHLD. |
| 245 | * sys_wait4() will also block until our children traced from the | 245 | * kernel_wait4() will also block until our children traced from the |
| 246 | * parent namespace are detached and become EXIT_DEAD. | 246 | * parent namespace are detached and become EXIT_DEAD. |
| 247 | */ | 247 | */ |
| 248 | do { | 248 | do { |
| 249 | clear_thread_flag(TIF_SIGPENDING); | 249 | clear_thread_flag(TIF_SIGPENDING); |
| 250 | rc = sys_wait4(-1, NULL, __WALL, NULL); | 250 | rc = kernel_wait4(-1, NULL, __WALL, NULL); |
| 251 | } while (rc != -ECHILD); | 251 | } while (rc != -ECHILD); |
| 252 | 252 | ||
| 253 | /* | 253 | /* |
| 254 | * sys_wait4() above can't reap the EXIT_DEAD children but we do not | 254 | * kernel_wait4() above can't reap the EXIT_DEAD children but we do not |
| 255 | * really care, we could reparent them to the global init. We could | 255 | * really care, we could reparent them to the global init. We could |
| 256 | * exit and reap ->child_reaper even if it is not the last thread in | 256 | * exit and reap ->child_reaper even if it is not the last thread in |
| 257 | * this pid_ns, free_pid(pid_allocated == 0) calls proc_cleanup_work(), | 257 | * this pid_ns, free_pid(pid_allocated == 0) calls proc_cleanup_work(), |
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index a5c36e9c56a6..4710f1b142fc 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c | |||
| @@ -701,7 +701,7 @@ int hibernate(void) | |||
| 701 | } | 701 | } |
| 702 | 702 | ||
| 703 | pr_info("Syncing filesystems ... \n"); | 703 | pr_info("Syncing filesystems ... \n"); |
| 704 | sys_sync(); | 704 | ksys_sync(); |
| 705 | pr_info("done.\n"); | 705 | pr_info("done.\n"); |
| 706 | 706 | ||
| 707 | error = freeze_processes(); | 707 | error = freeze_processes(); |
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index 0685c4499431..4c10be0f4843 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c | |||
| @@ -560,7 +560,7 @@ static int enter_state(suspend_state_t state) | |||
| 560 | #ifndef CONFIG_SUSPEND_SKIP_SYNC | 560 | #ifndef CONFIG_SUSPEND_SKIP_SYNC |
| 561 | trace_suspend_resume(TPS("sync_filesystems"), 0, true); | 561 | trace_suspend_resume(TPS("sync_filesystems"), 0, true); |
| 562 | pr_info("Syncing filesystems ... "); | 562 | pr_info("Syncing filesystems ... "); |
| 563 | sys_sync(); | 563 | ksys_sync(); |
| 564 | pr_cont("done.\n"); | 564 | pr_cont("done.\n"); |
| 565 | trace_suspend_resume(TPS("sync_filesystems"), 0, false); | 565 | trace_suspend_resume(TPS("sync_filesystems"), 0, false); |
| 566 | #endif | 566 | #endif |
diff --git a/kernel/power/user.c b/kernel/power/user.c index 22df9f7ff672..75c959de4b29 100644 --- a/kernel/power/user.c +++ b/kernel/power/user.c | |||
| @@ -224,7 +224,7 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd, | |||
| 224 | break; | 224 | break; |
| 225 | 225 | ||
| 226 | printk("Syncing filesystems ... "); | 226 | printk("Syncing filesystems ... "); |
| 227 | sys_sync(); | 227 | ksys_sync(); |
| 228 | printk("done.\n"); | 228 | printk("done.\n"); |
| 229 | 229 | ||
| 230 | error = freeze_processes(); | 230 | error = freeze_processes(); |
diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h index 6334f2c1abd0..7a693e31184a 100644 --- a/kernel/rcu/rcu.h +++ b/kernel/rcu/rcu.h | |||
| @@ -77,12 +77,18 @@ static inline void rcu_seq_start(unsigned long *sp) | |||
| 77 | WARN_ON_ONCE(rcu_seq_state(*sp) != 1); | 77 | WARN_ON_ONCE(rcu_seq_state(*sp) != 1); |
| 78 | } | 78 | } |
| 79 | 79 | ||
| 80 | /* Compute the end-of-grace-period value for the specified sequence number. */ | ||
| 81 | static inline unsigned long rcu_seq_endval(unsigned long *sp) | ||
| 82 | { | ||
| 83 | return (*sp | RCU_SEQ_STATE_MASK) + 1; | ||
| 84 | } | ||
| 85 | |||
| 80 | /* Adjust sequence number for end of update-side operation. */ | 86 | /* Adjust sequence number for end of update-side operation. */ |
| 81 | static inline void rcu_seq_end(unsigned long *sp) | 87 | static inline void rcu_seq_end(unsigned long *sp) |
| 82 | { | 88 | { |
| 83 | smp_mb(); /* Ensure update-side operation before counter increment. */ | 89 | smp_mb(); /* Ensure update-side operation before counter increment. */ |
| 84 | WARN_ON_ONCE(!rcu_seq_state(*sp)); | 90 | WARN_ON_ONCE(!rcu_seq_state(*sp)); |
| 85 | WRITE_ONCE(*sp, (*sp | RCU_SEQ_STATE_MASK) + 1); | 91 | WRITE_ONCE(*sp, rcu_seq_endval(sp)); |
| 86 | } | 92 | } |
| 87 | 93 | ||
| 88 | /* Take a snapshot of the update side's sequence number. */ | 94 | /* Take a snapshot of the update side's sequence number. */ |
| @@ -295,9 +301,19 @@ static inline void rcu_init_levelspread(int *levelspread, const int *levelcnt) | |||
| 295 | * Iterate over all possible CPUs in a leaf RCU node. | 301 | * Iterate over all possible CPUs in a leaf RCU node. |
| 296 | */ | 302 | */ |
| 297 | #define for_each_leaf_node_possible_cpu(rnp, cpu) \ | 303 | #define for_each_leaf_node_possible_cpu(rnp, cpu) \ |
| 298 | for ((cpu) = cpumask_next(rnp->grplo - 1, cpu_possible_mask); \ | 304 | for ((cpu) = cpumask_next((rnp)->grplo - 1, cpu_possible_mask); \ |
| 299 | cpu <= rnp->grphi; \ | 305 | (cpu) <= rnp->grphi; \ |
| 300 | cpu = cpumask_next((cpu), cpu_possible_mask)) | 306 | (cpu) = cpumask_next((cpu), cpu_possible_mask)) |
| 307 | |||
| 308 | /* | ||
| 309 | * Iterate over all CPUs in a leaf RCU node's specified mask. | ||
| 310 | */ | ||
| 311 | #define rcu_find_next_bit(rnp, cpu, mask) \ | ||
| 312 | ((rnp)->grplo + find_next_bit(&(mask), BITS_PER_LONG, (cpu))) | ||
| 313 | #define for_each_leaf_node_cpu_mask(rnp, cpu, mask) \ | ||
| 314 | for ((cpu) = rcu_find_next_bit((rnp), 0, (mask)); \ | ||
| 315 | (cpu) <= rnp->grphi; \ | ||
| 316 | (cpu) = rcu_find_next_bit((rnp), (cpu) + 1 - (rnp->grplo), (mask))) | ||
| 301 | 317 | ||
| 302 | /* | 318 | /* |
| 303 | * Wrappers for the rcu_node::lock acquire and release. | 319 | * Wrappers for the rcu_node::lock acquire and release. |
| @@ -337,7 +353,7 @@ do { \ | |||
| 337 | } while (0) | 353 | } while (0) |
| 338 | 354 | ||
| 339 | #define raw_spin_unlock_irqrestore_rcu_node(p, flags) \ | 355 | #define raw_spin_unlock_irqrestore_rcu_node(p, flags) \ |
| 340 | raw_spin_unlock_irqrestore(&ACCESS_PRIVATE(p, lock), flags) \ | 356 | raw_spin_unlock_irqrestore(&ACCESS_PRIVATE(p, lock), flags) |
| 341 | 357 | ||
| 342 | #define raw_spin_trylock_rcu_node(p) \ | 358 | #define raw_spin_trylock_rcu_node(p) \ |
| 343 | ({ \ | 359 | ({ \ |
| @@ -348,6 +364,9 @@ do { \ | |||
| 348 | ___locked; \ | 364 | ___locked; \ |
| 349 | }) | 365 | }) |
| 350 | 366 | ||
| 367 | #define raw_lockdep_assert_held_rcu_node(p) \ | ||
| 368 | lockdep_assert_held(&ACCESS_PRIVATE(p, lock)) | ||
| 369 | |||
| 351 | #endif /* #if defined(SRCU) || !defined(TINY_RCU) */ | 370 | #endif /* #if defined(SRCU) || !defined(TINY_RCU) */ |
| 352 | 371 | ||
| 353 | #ifdef CONFIG_TINY_RCU | 372 | #ifdef CONFIG_TINY_RCU |
| @@ -356,24 +375,20 @@ static inline bool rcu_gp_is_normal(void) { return true; } | |||
| 356 | static inline bool rcu_gp_is_expedited(void) { return false; } | 375 | static inline bool rcu_gp_is_expedited(void) { return false; } |
| 357 | static inline void rcu_expedite_gp(void) { } | 376 | static inline void rcu_expedite_gp(void) { } |
| 358 | static inline void rcu_unexpedite_gp(void) { } | 377 | static inline void rcu_unexpedite_gp(void) { } |
| 378 | static inline void rcu_request_urgent_qs_task(struct task_struct *t) { } | ||
| 359 | #else /* #ifdef CONFIG_TINY_RCU */ | 379 | #else /* #ifdef CONFIG_TINY_RCU */ |
| 360 | bool rcu_gp_is_normal(void); /* Internal RCU use. */ | 380 | bool rcu_gp_is_normal(void); /* Internal RCU use. */ |
| 361 | bool rcu_gp_is_expedited(void); /* Internal RCU use. */ | 381 | bool rcu_gp_is_expedited(void); /* Internal RCU use. */ |
| 362 | void rcu_expedite_gp(void); | 382 | void rcu_expedite_gp(void); |
| 363 | void rcu_unexpedite_gp(void); | 383 | void rcu_unexpedite_gp(void); |
| 364 | void rcupdate_announce_bootup_oddness(void); | 384 | void rcupdate_announce_bootup_oddness(void); |
| 385 | void rcu_request_urgent_qs_task(struct task_struct *t); | ||
| 365 | #endif /* #else #ifdef CONFIG_TINY_RCU */ | 386 | #endif /* #else #ifdef CONFIG_TINY_RCU */ |
| 366 | 387 | ||
| 367 | #define RCU_SCHEDULER_INACTIVE 0 | 388 | #define RCU_SCHEDULER_INACTIVE 0 |
| 368 | #define RCU_SCHEDULER_INIT 1 | 389 | #define RCU_SCHEDULER_INIT 1 |
| 369 | #define RCU_SCHEDULER_RUNNING 2 | 390 | #define RCU_SCHEDULER_RUNNING 2 |
| 370 | 391 | ||
| 371 | #ifdef CONFIG_TINY_RCU | ||
| 372 | static inline void rcu_request_urgent_qs_task(struct task_struct *t) { } | ||
| 373 | #else /* #ifdef CONFIG_TINY_RCU */ | ||
| 374 | void rcu_request_urgent_qs_task(struct task_struct *t); | ||
| 375 | #endif /* #else #ifdef CONFIG_TINY_RCU */ | ||
| 376 | |||
| 377 | enum rcutorture_type { | 392 | enum rcutorture_type { |
| 378 | RCU_FLAVOR, | 393 | RCU_FLAVOR, |
| 379 | RCU_BH_FLAVOR, | 394 | RCU_BH_FLAVOR, |
| @@ -470,6 +485,7 @@ void show_rcu_gp_kthreads(void); | |||
| 470 | void rcu_force_quiescent_state(void); | 485 | void rcu_force_quiescent_state(void); |
| 471 | void rcu_bh_force_quiescent_state(void); | 486 | void rcu_bh_force_quiescent_state(void); |
| 472 | void rcu_sched_force_quiescent_state(void); | 487 | void rcu_sched_force_quiescent_state(void); |
| 488 | extern struct workqueue_struct *rcu_gp_wq; | ||
| 473 | #endif /* #else #ifdef CONFIG_TINY_RCU */ | 489 | #endif /* #else #ifdef CONFIG_TINY_RCU */ |
| 474 | 490 | ||
| 475 | #ifdef CONFIG_RCU_NOCB_CPU | 491 | #ifdef CONFIG_RCU_NOCB_CPU |
diff --git a/kernel/rcu/rcuperf.c b/kernel/rcu/rcuperf.c index d1ebdf9868bb..777e7a6a0292 100644 --- a/kernel/rcu/rcuperf.c +++ b/kernel/rcu/rcuperf.c | |||
| @@ -61,11 +61,30 @@ MODULE_AUTHOR("Paul E. McKenney <paulmck@linux.vnet.ibm.com>"); | |||
| 61 | #define VERBOSE_PERFOUT_ERRSTRING(s) \ | 61 | #define VERBOSE_PERFOUT_ERRSTRING(s) \ |
| 62 | do { if (verbose) pr_alert("%s" PERF_FLAG "!!! %s\n", perf_type, s); } while (0) | 62 | do { if (verbose) pr_alert("%s" PERF_FLAG "!!! %s\n", perf_type, s); } while (0) |
| 63 | 63 | ||
| 64 | /* | ||
| 65 | * The intended use cases for the nreaders and nwriters module parameters | ||
| 66 | * are as follows: | ||
| 67 | * | ||
| 68 | * 1. Specify only the nr_cpus kernel boot parameter. This will | ||
| 69 | * set both nreaders and nwriters to the value specified by | ||
| 70 | * nr_cpus for a mixed reader/writer test. | ||
| 71 | * | ||
| 72 | * 2. Specify the nr_cpus kernel boot parameter, but set | ||
| 73 | * rcuperf.nreaders to zero. This will set nwriters to the | ||
| 74 | * value specified by nr_cpus for an update-only test. | ||
| 75 | * | ||
| 76 | * 3. Specify the nr_cpus kernel boot parameter, but set | ||
| 77 | * rcuperf.nwriters to zero. This will set nreaders to the | ||
| 78 | * value specified by nr_cpus for a read-only test. | ||
| 79 | * | ||
| 80 | * Various other use cases may of course be specified. | ||
| 81 | */ | ||
| 82 | |||
| 64 | torture_param(bool, gp_async, false, "Use asynchronous GP wait primitives"); | 83 | torture_param(bool, gp_async, false, "Use asynchronous GP wait primitives"); |
| 65 | torture_param(int, gp_async_max, 1000, "Max # outstanding waits per reader"); | 84 | torture_param(int, gp_async_max, 1000, "Max # outstanding waits per reader"); |
| 66 | torture_param(bool, gp_exp, false, "Use expedited GP wait primitives"); | 85 | torture_param(bool, gp_exp, false, "Use expedited GP wait primitives"); |
| 67 | torture_param(int, holdoff, 10, "Holdoff time before test start (s)"); | 86 | torture_param(int, holdoff, 10, "Holdoff time before test start (s)"); |
| 68 | torture_param(int, nreaders, 0, "Number of RCU reader threads"); | 87 | torture_param(int, nreaders, -1, "Number of RCU reader threads"); |
| 69 | torture_param(int, nwriters, -1, "Number of RCU updater threads"); | 88 | torture_param(int, nwriters, -1, "Number of RCU updater threads"); |
| 70 | torture_param(bool, shutdown, !IS_ENABLED(MODULE), | 89 | torture_param(bool, shutdown, !IS_ENABLED(MODULE), |
| 71 | "Shutdown at end of performance tests."); | 90 | "Shutdown at end of performance tests."); |
diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 308e6fdbced8..680c96d8c00f 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c | |||
| @@ -909,34 +909,38 @@ rcu_torture_writer(void *arg) | |||
| 909 | int nsynctypes = 0; | 909 | int nsynctypes = 0; |
| 910 | 910 | ||
| 911 | VERBOSE_TOROUT_STRING("rcu_torture_writer task started"); | 911 | VERBOSE_TOROUT_STRING("rcu_torture_writer task started"); |
| 912 | if (!can_expedite) { | 912 | if (!can_expedite) |
| 913 | pr_alert("%s" TORTURE_FLAG | 913 | pr_alert("%s" TORTURE_FLAG |
| 914 | " GP expediting controlled from boot/sysfs for %s,\n", | 914 | " GP expediting controlled from boot/sysfs for %s.\n", |
| 915 | torture_type, cur_ops->name); | 915 | torture_type, cur_ops->name); |
| 916 | pr_alert("%s" TORTURE_FLAG | ||
| 917 | " Disabled dynamic grace-period expediting.\n", | ||
| 918 | torture_type); | ||
| 919 | } | ||
| 920 | 916 | ||
| 921 | /* Initialize synctype[] array. If none set, take default. */ | 917 | /* Initialize synctype[] array. If none set, take default. */ |
| 922 | if (!gp_cond1 && !gp_exp1 && !gp_normal1 && !gp_sync1) | 918 | if (!gp_cond1 && !gp_exp1 && !gp_normal1 && !gp_sync1) |
| 923 | gp_cond1 = gp_exp1 = gp_normal1 = gp_sync1 = true; | 919 | gp_cond1 = gp_exp1 = gp_normal1 = gp_sync1 = true; |
| 924 | if (gp_cond1 && cur_ops->get_state && cur_ops->cond_sync) | 920 | if (gp_cond1 && cur_ops->get_state && cur_ops->cond_sync) { |
| 925 | synctype[nsynctypes++] = RTWS_COND_GET; | 921 | synctype[nsynctypes++] = RTWS_COND_GET; |
| 926 | else if (gp_cond && (!cur_ops->get_state || !cur_ops->cond_sync)) | 922 | pr_info("%s: Testing conditional GPs.\n", __func__); |
| 927 | pr_alert("rcu_torture_writer: gp_cond without primitives.\n"); | 923 | } else if (gp_cond && (!cur_ops->get_state || !cur_ops->cond_sync)) { |
| 928 | if (gp_exp1 && cur_ops->exp_sync) | 924 | pr_alert("%s: gp_cond without primitives.\n", __func__); |
| 925 | } | ||
| 926 | if (gp_exp1 && cur_ops->exp_sync) { | ||
| 929 | synctype[nsynctypes++] = RTWS_EXP_SYNC; | 927 | synctype[nsynctypes++] = RTWS_EXP_SYNC; |
| 930 | else if (gp_exp && !cur_ops->exp_sync) | 928 | pr_info("%s: Testing expedited GPs.\n", __func__); |
| 931 | pr_alert("rcu_torture_writer: gp_exp without primitives.\n"); | 929 | } else if (gp_exp && !cur_ops->exp_sync) { |
| 932 | if (gp_normal1 && cur_ops->deferred_free) | 930 | pr_alert("%s: gp_exp without primitives.\n", __func__); |
| 931 | } | ||
| 932 | if (gp_normal1 && cur_ops->deferred_free) { | ||
| 933 | synctype[nsynctypes++] = RTWS_DEF_FREE; | 933 | synctype[nsynctypes++] = RTWS_DEF_FREE; |
| 934 | else if (gp_normal && !cur_ops->deferred_free) | 934 | pr_info("%s: Testing asynchronous GPs.\n", __func__); |
| 935 | pr_alert("rcu_torture_writer: gp_normal without primitives.\n"); | 935 | } else if (gp_normal && !cur_ops->deferred_free) { |
| 936 | if (gp_sync1 && cur_ops->sync) | 936 | pr_alert("%s: gp_normal without primitives.\n", __func__); |
| 937 | } | ||
| 938 | if (gp_sync1 && cur_ops->sync) { | ||
| 937 | synctype[nsynctypes++] = RTWS_SYNC; | 939 | synctype[nsynctypes++] = RTWS_SYNC; |
| 938 | else if (gp_sync && !cur_ops->sync) | 940 | pr_info("%s: Testing normal GPs.\n", __func__); |
| 939 | pr_alert("rcu_torture_writer: gp_sync without primitives.\n"); | 941 | } else if (gp_sync && !cur_ops->sync) { |
| 942 | pr_alert("%s: gp_sync without primitives.\n", __func__); | ||
| 943 | } | ||
| 940 | if (WARN_ONCE(nsynctypes == 0, | 944 | if (WARN_ONCE(nsynctypes == 0, |
| 941 | "rcu_torture_writer: No update-side primitives.\n")) { | 945 | "rcu_torture_writer: No update-side primitives.\n")) { |
| 942 | /* | 946 | /* |
| @@ -1011,6 +1015,9 @@ rcu_torture_writer(void *arg) | |||
| 1011 | rcu_unexpedite_gp(); | 1015 | rcu_unexpedite_gp(); |
| 1012 | if (++expediting > 3) | 1016 | if (++expediting > 3) |
| 1013 | expediting = -expediting; | 1017 | expediting = -expediting; |
| 1018 | } else if (!can_expedite) { /* Disabled during boot, recheck. */ | ||
| 1019 | can_expedite = !rcu_gp_is_expedited() && | ||
| 1020 | !rcu_gp_is_normal(); | ||
| 1014 | } | 1021 | } |
| 1015 | rcu_torture_writer_state = RTWS_STUTTER; | 1022 | rcu_torture_writer_state = RTWS_STUTTER; |
| 1016 | stutter_wait("rcu_torture_writer"); | 1023 | stutter_wait("rcu_torture_writer"); |
| @@ -1021,6 +1028,10 @@ rcu_torture_writer(void *arg) | |||
| 1021 | while (can_expedite && expediting++ < 0) | 1028 | while (can_expedite && expediting++ < 0) |
| 1022 | rcu_unexpedite_gp(); | 1029 | rcu_unexpedite_gp(); |
| 1023 | WARN_ON_ONCE(can_expedite && rcu_gp_is_expedited()); | 1030 | WARN_ON_ONCE(can_expedite && rcu_gp_is_expedited()); |
| 1031 | if (!can_expedite) | ||
| 1032 | pr_alert("%s" TORTURE_FLAG | ||
| 1033 | " Dynamic grace-period expediting was disabled.\n", | ||
| 1034 | torture_type); | ||
| 1024 | rcu_torture_writer_state = RTWS_STOPPING; | 1035 | rcu_torture_writer_state = RTWS_STOPPING; |
| 1025 | torture_kthread_stopping("rcu_torture_writer"); | 1036 | torture_kthread_stopping("rcu_torture_writer"); |
| 1026 | return 0; | 1037 | return 0; |
| @@ -1045,13 +1056,13 @@ rcu_torture_fakewriter(void *arg) | |||
| 1045 | torture_random(&rand) % (nfakewriters * 8) == 0) { | 1056 | torture_random(&rand) % (nfakewriters * 8) == 0) { |
| 1046 | cur_ops->cb_barrier(); | 1057 | cur_ops->cb_barrier(); |
| 1047 | } else if (gp_normal == gp_exp) { | 1058 | } else if (gp_normal == gp_exp) { |
| 1048 | if (torture_random(&rand) & 0x80) | 1059 | if (cur_ops->sync && torture_random(&rand) & 0x80) |
| 1049 | cur_ops->sync(); | 1060 | cur_ops->sync(); |
| 1050 | else | 1061 | else if (cur_ops->exp_sync) |
| 1051 | cur_ops->exp_sync(); | 1062 | cur_ops->exp_sync(); |
| 1052 | } else if (gp_normal) { | 1063 | } else if (gp_normal && cur_ops->sync) { |
| 1053 | cur_ops->sync(); | 1064 | cur_ops->sync(); |
| 1054 | } else { | 1065 | } else if (cur_ops->exp_sync) { |
| 1055 | cur_ops->exp_sync(); | 1066 | cur_ops->exp_sync(); |
| 1056 | } | 1067 | } |
| 1057 | stutter_wait("rcu_torture_fakewriter"); | 1068 | stutter_wait("rcu_torture_fakewriter"); |
| @@ -1557,11 +1568,10 @@ static int rcu_torture_barrier_init(void) | |||
| 1557 | atomic_set(&barrier_cbs_count, 0); | 1568 | atomic_set(&barrier_cbs_count, 0); |
| 1558 | atomic_set(&barrier_cbs_invoked, 0); | 1569 | atomic_set(&barrier_cbs_invoked, 0); |
| 1559 | barrier_cbs_tasks = | 1570 | barrier_cbs_tasks = |
| 1560 | kzalloc(n_barrier_cbs * sizeof(barrier_cbs_tasks[0]), | 1571 | kcalloc(n_barrier_cbs, sizeof(barrier_cbs_tasks[0]), |
| 1561 | GFP_KERNEL); | 1572 | GFP_KERNEL); |
| 1562 | barrier_cbs_wq = | 1573 | barrier_cbs_wq = |
| 1563 | kzalloc(n_barrier_cbs * sizeof(barrier_cbs_wq[0]), | 1574 | kcalloc(n_barrier_cbs, sizeof(barrier_cbs_wq[0]), GFP_KERNEL); |
| 1564 | GFP_KERNEL); | ||
| 1565 | if (barrier_cbs_tasks == NULL || !barrier_cbs_wq) | 1575 | if (barrier_cbs_tasks == NULL || !barrier_cbs_wq) |
| 1566 | return -ENOMEM; | 1576 | return -ENOMEM; |
| 1567 | for (i = 0; i < n_barrier_cbs; i++) { | 1577 | for (i = 0; i < n_barrier_cbs; i++) { |
| @@ -1674,7 +1684,7 @@ static void rcu_torture_err_cb(struct rcu_head *rhp) | |||
| 1674 | * next grace period. Unlikely, but can happen. If it | 1684 | * next grace period. Unlikely, but can happen. If it |
| 1675 | * does happen, the debug-objects subsystem won't have splatted. | 1685 | * does happen, the debug-objects subsystem won't have splatted. |
| 1676 | */ | 1686 | */ |
| 1677 | pr_alert("rcutorture: duplicated callback was invoked.\n"); | 1687 | pr_alert("%s: duplicated callback was invoked.\n", KBUILD_MODNAME); |
| 1678 | } | 1688 | } |
| 1679 | #endif /* #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */ | 1689 | #endif /* #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */ |
| 1680 | 1690 | ||
| @@ -1691,7 +1701,7 @@ static void rcu_test_debug_objects(void) | |||
| 1691 | 1701 | ||
| 1692 | init_rcu_head_on_stack(&rh1); | 1702 | init_rcu_head_on_stack(&rh1); |
| 1693 | init_rcu_head_on_stack(&rh2); | 1703 | init_rcu_head_on_stack(&rh2); |
| 1694 | pr_alert("rcutorture: WARN: Duplicate call_rcu() test starting.\n"); | 1704 | pr_alert("%s: WARN: Duplicate call_rcu() test starting.\n", KBUILD_MODNAME); |
| 1695 | 1705 | ||
| 1696 | /* Try to queue the rh2 pair of callbacks for the same grace period. */ | 1706 | /* Try to queue the rh2 pair of callbacks for the same grace period. */ |
| 1697 | preempt_disable(); /* Prevent preemption from interrupting test. */ | 1707 | preempt_disable(); /* Prevent preemption from interrupting test. */ |
| @@ -1706,11 +1716,11 @@ static void rcu_test_debug_objects(void) | |||
| 1706 | 1716 | ||
| 1707 | /* Wait for them all to get done so we can safely return. */ | 1717 | /* Wait for them all to get done so we can safely return. */ |
| 1708 | rcu_barrier(); | 1718 | rcu_barrier(); |
| 1709 | pr_alert("rcutorture: WARN: Duplicate call_rcu() test complete.\n"); | 1719 | pr_alert("%s: WARN: Duplicate call_rcu() test complete.\n", KBUILD_MODNAME); |
| 1710 | destroy_rcu_head_on_stack(&rh1); | 1720 | destroy_rcu_head_on_stack(&rh1); |
| 1711 | destroy_rcu_head_on_stack(&rh2); | 1721 | destroy_rcu_head_on_stack(&rh2); |
| 1712 | #else /* #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */ | 1722 | #else /* #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */ |
| 1713 | pr_alert("rcutorture: !CONFIG_DEBUG_OBJECTS_RCU_HEAD, not testing duplicate call_rcu()\n"); | 1723 | pr_alert("%s: !CONFIG_DEBUG_OBJECTS_RCU_HEAD, not testing duplicate call_rcu()\n", KBUILD_MODNAME); |
| 1714 | #endif /* #else #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */ | 1724 | #endif /* #else #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */ |
| 1715 | } | 1725 | } |
| 1716 | 1726 | ||
| @@ -1799,7 +1809,7 @@ rcu_torture_init(void) | |||
| 1799 | if (firsterr) | 1809 | if (firsterr) |
| 1800 | goto unwind; | 1810 | goto unwind; |
| 1801 | if (nfakewriters > 0) { | 1811 | if (nfakewriters > 0) { |
| 1802 | fakewriter_tasks = kzalloc(nfakewriters * | 1812 | fakewriter_tasks = kcalloc(nfakewriters, |
| 1803 | sizeof(fakewriter_tasks[0]), | 1813 | sizeof(fakewriter_tasks[0]), |
| 1804 | GFP_KERNEL); | 1814 | GFP_KERNEL); |
| 1805 | if (fakewriter_tasks == NULL) { | 1815 | if (fakewriter_tasks == NULL) { |
| @@ -1814,7 +1824,7 @@ rcu_torture_init(void) | |||
| 1814 | if (firsterr) | 1824 | if (firsterr) |
| 1815 | goto unwind; | 1825 | goto unwind; |
| 1816 | } | 1826 | } |
| 1817 | reader_tasks = kzalloc(nrealreaders * sizeof(reader_tasks[0]), | 1827 | reader_tasks = kcalloc(nrealreaders, sizeof(reader_tasks[0]), |
| 1818 | GFP_KERNEL); | 1828 | GFP_KERNEL); |
| 1819 | if (reader_tasks == NULL) { | 1829 | if (reader_tasks == NULL) { |
| 1820 | VERBOSE_TOROUT_ERRSTRING("out of memory"); | 1830 | VERBOSE_TOROUT_ERRSTRING("out of memory"); |
diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index d5cea81378cc..fb560fca9ef4 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c | |||
| @@ -386,7 +386,7 @@ void cleanup_srcu_struct(struct srcu_struct *sp) | |||
| 386 | flush_delayed_work(&per_cpu_ptr(sp->sda, cpu)->work); | 386 | flush_delayed_work(&per_cpu_ptr(sp->sda, cpu)->work); |
| 387 | if (WARN_ON(rcu_seq_state(READ_ONCE(sp->srcu_gp_seq)) != SRCU_STATE_IDLE) || | 387 | if (WARN_ON(rcu_seq_state(READ_ONCE(sp->srcu_gp_seq)) != SRCU_STATE_IDLE) || |
| 388 | WARN_ON(srcu_readers_active(sp))) { | 388 | WARN_ON(srcu_readers_active(sp))) { |
| 389 | pr_info("cleanup_srcu_struct: Active srcu_struct %p state: %d\n", sp, rcu_seq_state(READ_ONCE(sp->srcu_gp_seq))); | 389 | pr_info("%s: Active srcu_struct %p state: %d\n", __func__, sp, rcu_seq_state(READ_ONCE(sp->srcu_gp_seq))); |
| 390 | return; /* Caller forgot to stop doing call_srcu()? */ | 390 | return; /* Caller forgot to stop doing call_srcu()? */ |
| 391 | } | 391 | } |
| 392 | free_percpu(sp->sda); | 392 | free_percpu(sp->sda); |
| @@ -439,7 +439,7 @@ static void srcu_gp_start(struct srcu_struct *sp) | |||
| 439 | struct srcu_data *sdp = this_cpu_ptr(sp->sda); | 439 | struct srcu_data *sdp = this_cpu_ptr(sp->sda); |
| 440 | int state; | 440 | int state; |
| 441 | 441 | ||
| 442 | lockdep_assert_held(&sp->lock); | 442 | lockdep_assert_held(&ACCESS_PRIVATE(sp, lock)); |
| 443 | WARN_ON_ONCE(ULONG_CMP_GE(sp->srcu_gp_seq, sp->srcu_gp_seq_needed)); | 443 | WARN_ON_ONCE(ULONG_CMP_GE(sp->srcu_gp_seq, sp->srcu_gp_seq_needed)); |
| 444 | rcu_segcblist_advance(&sdp->srcu_cblist, | 444 | rcu_segcblist_advance(&sdp->srcu_cblist, |
| 445 | rcu_seq_current(&sp->srcu_gp_seq)); | 445 | rcu_seq_current(&sp->srcu_gp_seq)); |
| @@ -492,8 +492,7 @@ static bool srcu_queue_delayed_work_on(int cpu, struct workqueue_struct *wq, | |||
| 492 | */ | 492 | */ |
| 493 | static void srcu_schedule_cbs_sdp(struct srcu_data *sdp, unsigned long delay) | 493 | static void srcu_schedule_cbs_sdp(struct srcu_data *sdp, unsigned long delay) |
| 494 | { | 494 | { |
| 495 | srcu_queue_delayed_work_on(sdp->cpu, system_power_efficient_wq, | 495 | srcu_queue_delayed_work_on(sdp->cpu, rcu_gp_wq, &sdp->work, delay); |
| 496 | &sdp->work, delay); | ||
| 497 | } | 496 | } |
| 498 | 497 | ||
| 499 | /* | 498 | /* |
| @@ -527,11 +526,11 @@ static void srcu_gp_end(struct srcu_struct *sp) | |||
| 527 | { | 526 | { |
| 528 | unsigned long cbdelay; | 527 | unsigned long cbdelay; |
| 529 | bool cbs; | 528 | bool cbs; |
| 529 | bool last_lvl; | ||
| 530 | int cpu; | 530 | int cpu; |
| 531 | unsigned long flags; | 531 | unsigned long flags; |
| 532 | unsigned long gpseq; | 532 | unsigned long gpseq; |
| 533 | int idx; | 533 | int idx; |
| 534 | int idxnext; | ||
| 535 | unsigned long mask; | 534 | unsigned long mask; |
| 536 | struct srcu_data *sdp; | 535 | struct srcu_data *sdp; |
| 537 | struct srcu_node *snp; | 536 | struct srcu_node *snp; |
| @@ -555,11 +554,11 @@ static void srcu_gp_end(struct srcu_struct *sp) | |||
| 555 | 554 | ||
| 556 | /* Initiate callback invocation as needed. */ | 555 | /* Initiate callback invocation as needed. */ |
| 557 | idx = rcu_seq_ctr(gpseq) % ARRAY_SIZE(snp->srcu_have_cbs); | 556 | idx = rcu_seq_ctr(gpseq) % ARRAY_SIZE(snp->srcu_have_cbs); |
| 558 | idxnext = (idx + 1) % ARRAY_SIZE(snp->srcu_have_cbs); | ||
| 559 | rcu_for_each_node_breadth_first(sp, snp) { | 557 | rcu_for_each_node_breadth_first(sp, snp) { |
| 560 | spin_lock_irq_rcu_node(snp); | 558 | spin_lock_irq_rcu_node(snp); |
| 561 | cbs = false; | 559 | cbs = false; |
| 562 | if (snp >= sp->level[rcu_num_lvls - 1]) | 560 | last_lvl = snp >= sp->level[rcu_num_lvls - 1]; |
| 561 | if (last_lvl) | ||
| 563 | cbs = snp->srcu_have_cbs[idx] == gpseq; | 562 | cbs = snp->srcu_have_cbs[idx] == gpseq; |
| 564 | snp->srcu_have_cbs[idx] = gpseq; | 563 | snp->srcu_have_cbs[idx] = gpseq; |
| 565 | rcu_seq_set_state(&snp->srcu_have_cbs[idx], 1); | 564 | rcu_seq_set_state(&snp->srcu_have_cbs[idx], 1); |
| @@ -572,13 +571,16 @@ static void srcu_gp_end(struct srcu_struct *sp) | |||
| 572 | srcu_schedule_cbs_snp(sp, snp, mask, cbdelay); | 571 | srcu_schedule_cbs_snp(sp, snp, mask, cbdelay); |
| 573 | 572 | ||
| 574 | /* Occasionally prevent srcu_data counter wrap. */ | 573 | /* Occasionally prevent srcu_data counter wrap. */ |
| 575 | if (!(gpseq & counter_wrap_check)) | 574 | if (!(gpseq & counter_wrap_check) && last_lvl) |
| 576 | for (cpu = snp->grplo; cpu <= snp->grphi; cpu++) { | 575 | for (cpu = snp->grplo; cpu <= snp->grphi; cpu++) { |
| 577 | sdp = per_cpu_ptr(sp->sda, cpu); | 576 | sdp = per_cpu_ptr(sp->sda, cpu); |
| 578 | spin_lock_irqsave_rcu_node(sdp, flags); | 577 | spin_lock_irqsave_rcu_node(sdp, flags); |
| 579 | if (ULONG_CMP_GE(gpseq, | 578 | if (ULONG_CMP_GE(gpseq, |
| 580 | sdp->srcu_gp_seq_needed + 100)) | 579 | sdp->srcu_gp_seq_needed + 100)) |
| 581 | sdp->srcu_gp_seq_needed = gpseq; | 580 | sdp->srcu_gp_seq_needed = gpseq; |
| 581 | if (ULONG_CMP_GE(gpseq, | ||
| 582 | sdp->srcu_gp_seq_needed_exp + 100)) | ||
| 583 | sdp->srcu_gp_seq_needed_exp = gpseq; | ||
| 582 | spin_unlock_irqrestore_rcu_node(sdp, flags); | 584 | spin_unlock_irqrestore_rcu_node(sdp, flags); |
| 583 | } | 585 | } |
| 584 | } | 586 | } |
| @@ -593,9 +595,7 @@ static void srcu_gp_end(struct srcu_struct *sp) | |||
| 593 | ULONG_CMP_LT(gpseq, sp->srcu_gp_seq_needed)) { | 595 | ULONG_CMP_LT(gpseq, sp->srcu_gp_seq_needed)) { |
| 594 | srcu_gp_start(sp); | 596 | srcu_gp_start(sp); |
| 595 | spin_unlock_irq_rcu_node(sp); | 597 | spin_unlock_irq_rcu_node(sp); |
| 596 | /* Throttle expedited grace periods: Should be rare! */ | 598 | srcu_reschedule(sp, 0); |
| 597 | srcu_reschedule(sp, rcu_seq_ctr(gpseq) & 0x3ff | ||
| 598 | ? 0 : SRCU_INTERVAL); | ||
| 599 | } else { | 599 | } else { |
| 600 | spin_unlock_irq_rcu_node(sp); | 600 | spin_unlock_irq_rcu_node(sp); |
| 601 | } | 601 | } |
| @@ -626,7 +626,7 @@ static void srcu_funnel_exp_start(struct srcu_struct *sp, struct srcu_node *snp, | |||
| 626 | spin_unlock_irqrestore_rcu_node(snp, flags); | 626 | spin_unlock_irqrestore_rcu_node(snp, flags); |
| 627 | } | 627 | } |
| 628 | spin_lock_irqsave_rcu_node(sp, flags); | 628 | spin_lock_irqsave_rcu_node(sp, flags); |
| 629 | if (!ULONG_CMP_LT(sp->srcu_gp_seq_needed_exp, s)) | 629 | if (ULONG_CMP_LT(sp->srcu_gp_seq_needed_exp, s)) |
| 630 | sp->srcu_gp_seq_needed_exp = s; | 630 | sp->srcu_gp_seq_needed_exp = s; |
| 631 | spin_unlock_irqrestore_rcu_node(sp, flags); | 631 | spin_unlock_irqrestore_rcu_node(sp, flags); |
| 632 | } | 632 | } |
| @@ -691,8 +691,7 @@ static void srcu_funnel_gp_start(struct srcu_struct *sp, struct srcu_data *sdp, | |||
| 691 | rcu_seq_state(sp->srcu_gp_seq) == SRCU_STATE_IDLE) { | 691 | rcu_seq_state(sp->srcu_gp_seq) == SRCU_STATE_IDLE) { |
| 692 | WARN_ON_ONCE(ULONG_CMP_GE(sp->srcu_gp_seq, sp->srcu_gp_seq_needed)); | 692 | WARN_ON_ONCE(ULONG_CMP_GE(sp->srcu_gp_seq, sp->srcu_gp_seq_needed)); |
| 693 | srcu_gp_start(sp); | 693 | srcu_gp_start(sp); |
| 694 | queue_delayed_work(system_power_efficient_wq, &sp->work, | 694 | queue_delayed_work(rcu_gp_wq, &sp->work, srcu_get_delay(sp)); |
| 695 | srcu_get_delay(sp)); | ||
| 696 | } | 695 | } |
| 697 | spin_unlock_irqrestore_rcu_node(sp, flags); | 696 | spin_unlock_irqrestore_rcu_node(sp, flags); |
| 698 | } | 697 | } |
| @@ -1225,7 +1224,7 @@ static void srcu_reschedule(struct srcu_struct *sp, unsigned long delay) | |||
| 1225 | spin_unlock_irq_rcu_node(sp); | 1224 | spin_unlock_irq_rcu_node(sp); |
| 1226 | 1225 | ||
| 1227 | if (pushgp) | 1226 | if (pushgp) |
| 1228 | queue_delayed_work(system_power_efficient_wq, &sp->work, delay); | 1227 | queue_delayed_work(rcu_gp_wq, &sp->work, delay); |
| 1229 | } | 1228 | } |
| 1230 | 1229 | ||
| 1231 | /* | 1230 | /* |
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 491bdf39f276..2a734692a581 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c | |||
| @@ -1161,7 +1161,7 @@ static int rcu_is_cpu_rrupt_from_idle(void) | |||
| 1161 | */ | 1161 | */ |
| 1162 | static void rcu_gpnum_ovf(struct rcu_node *rnp, struct rcu_data *rdp) | 1162 | static void rcu_gpnum_ovf(struct rcu_node *rnp, struct rcu_data *rdp) |
| 1163 | { | 1163 | { |
| 1164 | lockdep_assert_held(&rnp->lock); | 1164 | raw_lockdep_assert_held_rcu_node(rnp); |
| 1165 | if (ULONG_CMP_LT(READ_ONCE(rdp->gpnum) + ULONG_MAX / 4, rnp->gpnum)) | 1165 | if (ULONG_CMP_LT(READ_ONCE(rdp->gpnum) + ULONG_MAX / 4, rnp->gpnum)) |
| 1166 | WRITE_ONCE(rdp->gpwrap, true); | 1166 | WRITE_ONCE(rdp->gpwrap, true); |
| 1167 | if (ULONG_CMP_LT(rdp->rcu_iw_gpnum + ULONG_MAX / 4, rnp->gpnum)) | 1167 | if (ULONG_CMP_LT(rdp->rcu_iw_gpnum + ULONG_MAX / 4, rnp->gpnum)) |
| @@ -1350,6 +1350,7 @@ static void rcu_check_gp_kthread_starvation(struct rcu_state *rsp) | |||
| 1350 | rsp->gp_kthread ? rsp->gp_kthread->state : ~0, | 1350 | rsp->gp_kthread ? rsp->gp_kthread->state : ~0, |
| 1351 | rsp->gp_kthread ? task_cpu(rsp->gp_kthread) : -1); | 1351 | rsp->gp_kthread ? task_cpu(rsp->gp_kthread) : -1); |
| 1352 | if (rsp->gp_kthread) { | 1352 | if (rsp->gp_kthread) { |
| 1353 | pr_err("RCU grace-period kthread stack dump:\n"); | ||
| 1353 | sched_show_task(rsp->gp_kthread); | 1354 | sched_show_task(rsp->gp_kthread); |
| 1354 | wake_up_process(rsp->gp_kthread); | 1355 | wake_up_process(rsp->gp_kthread); |
| 1355 | } | 1356 | } |
| @@ -1628,7 +1629,7 @@ void rcu_cpu_stall_reset(void) | |||
| 1628 | static unsigned long rcu_cbs_completed(struct rcu_state *rsp, | 1629 | static unsigned long rcu_cbs_completed(struct rcu_state *rsp, |
| 1629 | struct rcu_node *rnp) | 1630 | struct rcu_node *rnp) |
| 1630 | { | 1631 | { |
| 1631 | lockdep_assert_held(&rnp->lock); | 1632 | raw_lockdep_assert_held_rcu_node(rnp); |
| 1632 | 1633 | ||
| 1633 | /* | 1634 | /* |
| 1634 | * If RCU is idle, we just wait for the next grace period. | 1635 | * If RCU is idle, we just wait for the next grace period. |
| @@ -1675,7 +1676,7 @@ rcu_start_future_gp(struct rcu_node *rnp, struct rcu_data *rdp, | |||
| 1675 | bool ret = false; | 1676 | bool ret = false; |
| 1676 | struct rcu_node *rnp_root = rcu_get_root(rdp->rsp); | 1677 | struct rcu_node *rnp_root = rcu_get_root(rdp->rsp); |
| 1677 | 1678 | ||
| 1678 | lockdep_assert_held(&rnp->lock); | 1679 | raw_lockdep_assert_held_rcu_node(rnp); |
| 1679 | 1680 | ||
| 1680 | /* | 1681 | /* |
| 1681 | * Pick up grace-period number for new callbacks. If this | 1682 | * Pick up grace-period number for new callbacks. If this |
| @@ -1803,7 +1804,7 @@ static bool rcu_accelerate_cbs(struct rcu_state *rsp, struct rcu_node *rnp, | |||
| 1803 | { | 1804 | { |
| 1804 | bool ret = false; | 1805 | bool ret = false; |
| 1805 | 1806 | ||
| 1806 | lockdep_assert_held(&rnp->lock); | 1807 | raw_lockdep_assert_held_rcu_node(rnp); |
| 1807 | 1808 | ||
| 1808 | /* If no pending (not yet ready to invoke) callbacks, nothing to do. */ | 1809 | /* If no pending (not yet ready to invoke) callbacks, nothing to do. */ |
| 1809 | if (!rcu_segcblist_pend_cbs(&rdp->cblist)) | 1810 | if (!rcu_segcblist_pend_cbs(&rdp->cblist)) |
| @@ -1843,7 +1844,7 @@ static bool rcu_accelerate_cbs(struct rcu_state *rsp, struct rcu_node *rnp, | |||
| 1843 | static bool rcu_advance_cbs(struct rcu_state *rsp, struct rcu_node *rnp, | 1844 | static bool rcu_advance_cbs(struct rcu_state *rsp, struct rcu_node *rnp, |
| 1844 | struct rcu_data *rdp) | 1845 | struct rcu_data *rdp) |
| 1845 | { | 1846 | { |
| 1846 | lockdep_assert_held(&rnp->lock); | 1847 | raw_lockdep_assert_held_rcu_node(rnp); |
| 1847 | 1848 | ||
| 1848 | /* If no pending (not yet ready to invoke) callbacks, nothing to do. */ | 1849 | /* If no pending (not yet ready to invoke) callbacks, nothing to do. */ |
| 1849 | if (!rcu_segcblist_pend_cbs(&rdp->cblist)) | 1850 | if (!rcu_segcblist_pend_cbs(&rdp->cblist)) |
| @@ -1871,7 +1872,7 @@ static bool __note_gp_changes(struct rcu_state *rsp, struct rcu_node *rnp, | |||
| 1871 | bool ret; | 1872 | bool ret; |
| 1872 | bool need_gp; | 1873 | bool need_gp; |
| 1873 | 1874 | ||
| 1874 | lockdep_assert_held(&rnp->lock); | 1875 | raw_lockdep_assert_held_rcu_node(rnp); |
| 1875 | 1876 | ||
| 1876 | /* Handle the ends of any preceding grace periods first. */ | 1877 | /* Handle the ends of any preceding grace periods first. */ |
| 1877 | if (rdp->completed == rnp->completed && | 1878 | if (rdp->completed == rnp->completed && |
| @@ -2296,7 +2297,7 @@ static bool | |||
| 2296 | rcu_start_gp_advanced(struct rcu_state *rsp, struct rcu_node *rnp, | 2297 | rcu_start_gp_advanced(struct rcu_state *rsp, struct rcu_node *rnp, |
| 2297 | struct rcu_data *rdp) | 2298 | struct rcu_data *rdp) |
| 2298 | { | 2299 | { |
| 2299 | lockdep_assert_held(&rnp->lock); | 2300 | raw_lockdep_assert_held_rcu_node(rnp); |
| 2300 | if (!rsp->gp_kthread || !cpu_needs_another_gp(rsp, rdp)) { | 2301 | if (!rsp->gp_kthread || !cpu_needs_another_gp(rsp, rdp)) { |
| 2301 | /* | 2302 | /* |
| 2302 | * Either we have not yet spawned the grace-period | 2303 | * Either we have not yet spawned the grace-period |
| @@ -2358,7 +2359,7 @@ static bool rcu_start_gp(struct rcu_state *rsp) | |||
| 2358 | static void rcu_report_qs_rsp(struct rcu_state *rsp, unsigned long flags) | 2359 | static void rcu_report_qs_rsp(struct rcu_state *rsp, unsigned long flags) |
| 2359 | __releases(rcu_get_root(rsp)->lock) | 2360 | __releases(rcu_get_root(rsp)->lock) |
| 2360 | { | 2361 | { |
| 2361 | lockdep_assert_held(&rcu_get_root(rsp)->lock); | 2362 | raw_lockdep_assert_held_rcu_node(rcu_get_root(rsp)); |
| 2362 | WARN_ON_ONCE(!rcu_gp_in_progress(rsp)); | 2363 | WARN_ON_ONCE(!rcu_gp_in_progress(rsp)); |
| 2363 | WRITE_ONCE(rsp->gp_flags, READ_ONCE(rsp->gp_flags) | RCU_GP_FLAG_FQS); | 2364 | WRITE_ONCE(rsp->gp_flags, READ_ONCE(rsp->gp_flags) | RCU_GP_FLAG_FQS); |
| 2364 | raw_spin_unlock_irqrestore_rcu_node(rcu_get_root(rsp), flags); | 2365 | raw_spin_unlock_irqrestore_rcu_node(rcu_get_root(rsp), flags); |
| @@ -2383,7 +2384,7 @@ rcu_report_qs_rnp(unsigned long mask, struct rcu_state *rsp, | |||
| 2383 | unsigned long oldmask = 0; | 2384 | unsigned long oldmask = 0; |
| 2384 | struct rcu_node *rnp_c; | 2385 | struct rcu_node *rnp_c; |
| 2385 | 2386 | ||
| 2386 | lockdep_assert_held(&rnp->lock); | 2387 | raw_lockdep_assert_held_rcu_node(rnp); |
| 2387 | 2388 | ||
| 2388 | /* Walk up the rcu_node hierarchy. */ | 2389 | /* Walk up the rcu_node hierarchy. */ |
| 2389 | for (;;) { | 2390 | for (;;) { |
| @@ -2447,7 +2448,7 @@ static void rcu_report_unblock_qs_rnp(struct rcu_state *rsp, | |||
| 2447 | unsigned long mask; | 2448 | unsigned long mask; |
| 2448 | struct rcu_node *rnp_p; | 2449 | struct rcu_node *rnp_p; |
| 2449 | 2450 | ||
| 2450 | lockdep_assert_held(&rnp->lock); | 2451 | raw_lockdep_assert_held_rcu_node(rnp); |
| 2451 | if (rcu_state_p == &rcu_sched_state || rsp != rcu_state_p || | 2452 | if (rcu_state_p == &rcu_sched_state || rsp != rcu_state_p || |
| 2452 | rnp->qsmask != 0 || rcu_preempt_blocked_readers_cgp(rnp)) { | 2453 | rnp->qsmask != 0 || rcu_preempt_blocked_readers_cgp(rnp)) { |
| 2453 | raw_spin_unlock_irqrestore_rcu_node(rnp, flags); | 2454 | raw_spin_unlock_irqrestore_rcu_node(rnp, flags); |
| @@ -2592,7 +2593,7 @@ static void rcu_cleanup_dead_rnp(struct rcu_node *rnp_leaf) | |||
| 2592 | long mask; | 2593 | long mask; |
| 2593 | struct rcu_node *rnp = rnp_leaf; | 2594 | struct rcu_node *rnp = rnp_leaf; |
| 2594 | 2595 | ||
| 2595 | lockdep_assert_held(&rnp->lock); | 2596 | raw_lockdep_assert_held_rcu_node(rnp); |
| 2596 | if (!IS_ENABLED(CONFIG_HOTPLUG_CPU) || | 2597 | if (!IS_ENABLED(CONFIG_HOTPLUG_CPU) || |
| 2597 | rnp->qsmaskinit || rcu_preempt_has_tasks(rnp)) | 2598 | rnp->qsmaskinit || rcu_preempt_has_tasks(rnp)) |
| 2598 | return; | 2599 | return; |
| @@ -2691,7 +2692,6 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp) | |||
| 2691 | /* Update counts and requeue any remaining callbacks. */ | 2692 | /* Update counts and requeue any remaining callbacks. */ |
| 2692 | rcu_segcblist_insert_done_cbs(&rdp->cblist, &rcl); | 2693 | rcu_segcblist_insert_done_cbs(&rdp->cblist, &rcl); |
| 2693 | smp_mb(); /* List handling before counting for rcu_barrier(). */ | 2694 | smp_mb(); /* List handling before counting for rcu_barrier(). */ |
| 2694 | rdp->n_cbs_invoked += count; | ||
| 2695 | rcu_segcblist_insert_count(&rdp->cblist, &rcl); | 2695 | rcu_segcblist_insert_count(&rdp->cblist, &rcl); |
| 2696 | 2696 | ||
| 2697 | /* Reinstate batch limit if we have worked down the excess. */ | 2697 | /* Reinstate batch limit if we have worked down the excess. */ |
| @@ -2845,10 +2845,8 @@ static void force_quiescent_state(struct rcu_state *rsp) | |||
| 2845 | !raw_spin_trylock(&rnp->fqslock); | 2845 | !raw_spin_trylock(&rnp->fqslock); |
| 2846 | if (rnp_old != NULL) | 2846 | if (rnp_old != NULL) |
| 2847 | raw_spin_unlock(&rnp_old->fqslock); | 2847 | raw_spin_unlock(&rnp_old->fqslock); |
| 2848 | if (ret) { | 2848 | if (ret) |
| 2849 | rsp->n_force_qs_lh++; | ||
| 2850 | return; | 2849 | return; |
| 2851 | } | ||
| 2852 | rnp_old = rnp; | 2850 | rnp_old = rnp; |
| 2853 | } | 2851 | } |
| 2854 | /* rnp_old == rcu_get_root(rsp), rnp == NULL. */ | 2852 | /* rnp_old == rcu_get_root(rsp), rnp == NULL. */ |
| @@ -2857,7 +2855,6 @@ static void force_quiescent_state(struct rcu_state *rsp) | |||
| 2857 | raw_spin_lock_irqsave_rcu_node(rnp_old, flags); | 2855 | raw_spin_lock_irqsave_rcu_node(rnp_old, flags); |
| 2858 | raw_spin_unlock(&rnp_old->fqslock); | 2856 | raw_spin_unlock(&rnp_old->fqslock); |
| 2859 | if (READ_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) { | 2857 | if (READ_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) { |
| 2860 | rsp->n_force_qs_lh++; | ||
| 2861 | raw_spin_unlock_irqrestore_rcu_node(rnp_old, flags); | 2858 | raw_spin_unlock_irqrestore_rcu_node(rnp_old, flags); |
| 2862 | return; /* Someone beat us to it. */ | 2859 | return; /* Someone beat us to it. */ |
| 2863 | } | 2860 | } |
| @@ -3355,8 +3352,6 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp) | |||
| 3355 | { | 3352 | { |
| 3356 | struct rcu_node *rnp = rdp->mynode; | 3353 | struct rcu_node *rnp = rdp->mynode; |
| 3357 | 3354 | ||
| 3358 | rdp->n_rcu_pending++; | ||
| 3359 | |||
| 3360 | /* Check for CPU stalls, if enabled. */ | 3355 | /* Check for CPU stalls, if enabled. */ |
| 3361 | check_cpu_stall(rsp, rdp); | 3356 | check_cpu_stall(rsp, rdp); |
| 3362 | 3357 | ||
| @@ -3365,48 +3360,31 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp) | |||
| 3365 | return 0; | 3360 | return 0; |
| 3366 | 3361 | ||
| 3367 | /* Is the RCU core waiting for a quiescent state from this CPU? */ | 3362 | /* Is the RCU core waiting for a quiescent state from this CPU? */ |
| 3368 | if (rcu_scheduler_fully_active && | 3363 | if (rdp->core_needs_qs && !rdp->cpu_no_qs.b.norm) |
| 3369 | rdp->core_needs_qs && rdp->cpu_no_qs.b.norm && | ||
| 3370 | rdp->rcu_qs_ctr_snap == __this_cpu_read(rcu_dynticks.rcu_qs_ctr)) { | ||
| 3371 | rdp->n_rp_core_needs_qs++; | ||
| 3372 | } else if (rdp->core_needs_qs && !rdp->cpu_no_qs.b.norm) { | ||
| 3373 | rdp->n_rp_report_qs++; | ||
| 3374 | return 1; | 3364 | return 1; |
| 3375 | } | ||
| 3376 | 3365 | ||
| 3377 | /* Does this CPU have callbacks ready to invoke? */ | 3366 | /* Does this CPU have callbacks ready to invoke? */ |
| 3378 | if (rcu_segcblist_ready_cbs(&rdp->cblist)) { | 3367 | if (rcu_segcblist_ready_cbs(&rdp->cblist)) |
| 3379 | rdp->n_rp_cb_ready++; | ||
| 3380 | return 1; | 3368 | return 1; |
| 3381 | } | ||
| 3382 | 3369 | ||
| 3383 | /* Has RCU gone idle with this CPU needing another grace period? */ | 3370 | /* Has RCU gone idle with this CPU needing another grace period? */ |
| 3384 | if (cpu_needs_another_gp(rsp, rdp)) { | 3371 | if (cpu_needs_another_gp(rsp, rdp)) |
| 3385 | rdp->n_rp_cpu_needs_gp++; | ||
| 3386 | return 1; | 3372 | return 1; |
| 3387 | } | ||
| 3388 | 3373 | ||
| 3389 | /* Has another RCU grace period completed? */ | 3374 | /* Has another RCU grace period completed? */ |
| 3390 | if (READ_ONCE(rnp->completed) != rdp->completed) { /* outside lock */ | 3375 | if (READ_ONCE(rnp->completed) != rdp->completed) /* outside lock */ |
| 3391 | rdp->n_rp_gp_completed++; | ||
| 3392 | return 1; | 3376 | return 1; |
| 3393 | } | ||
| 3394 | 3377 | ||
| 3395 | /* Has a new RCU grace period started? */ | 3378 | /* Has a new RCU grace period started? */ |
| 3396 | if (READ_ONCE(rnp->gpnum) != rdp->gpnum || | 3379 | if (READ_ONCE(rnp->gpnum) != rdp->gpnum || |
| 3397 | unlikely(READ_ONCE(rdp->gpwrap))) { /* outside lock */ | 3380 | unlikely(READ_ONCE(rdp->gpwrap))) /* outside lock */ |
| 3398 | rdp->n_rp_gp_started++; | ||
| 3399 | return 1; | 3381 | return 1; |
| 3400 | } | ||
| 3401 | 3382 | ||
| 3402 | /* Does this CPU need a deferred NOCB wakeup? */ | 3383 | /* Does this CPU need a deferred NOCB wakeup? */ |
| 3403 | if (rcu_nocb_need_deferred_wakeup(rdp)) { | 3384 | if (rcu_nocb_need_deferred_wakeup(rdp)) |
| 3404 | rdp->n_rp_nocb_defer_wakeup++; | ||
| 3405 | return 1; | 3385 | return 1; |
| 3406 | } | ||
| 3407 | 3386 | ||
| 3408 | /* nothing to do */ | 3387 | /* nothing to do */ |
| 3409 | rdp->n_rp_need_nothing++; | ||
| 3410 | return 0; | 3388 | return 0; |
| 3411 | } | 3389 | } |
| 3412 | 3390 | ||
| @@ -3618,7 +3596,7 @@ static void rcu_init_new_rnp(struct rcu_node *rnp_leaf) | |||
| 3618 | long mask; | 3596 | long mask; |
| 3619 | struct rcu_node *rnp = rnp_leaf; | 3597 | struct rcu_node *rnp = rnp_leaf; |
| 3620 | 3598 | ||
| 3621 | lockdep_assert_held(&rnp->lock); | 3599 | raw_lockdep_assert_held_rcu_node(rnp); |
| 3622 | for (;;) { | 3600 | for (;;) { |
| 3623 | mask = rnp->grpmask; | 3601 | mask = rnp->grpmask; |
| 3624 | rnp = rnp->parent; | 3602 | rnp = rnp->parent; |
| @@ -3636,12 +3614,9 @@ static void rcu_init_new_rnp(struct rcu_node *rnp_leaf) | |||
| 3636 | static void __init | 3614 | static void __init |
| 3637 | rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp) | 3615 | rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp) |
| 3638 | { | 3616 | { |
| 3639 | unsigned long flags; | ||
| 3640 | struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); | 3617 | struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); |
| 3641 | struct rcu_node *rnp = rcu_get_root(rsp); | ||
| 3642 | 3618 | ||
| 3643 | /* Set up local state, ensuring consistent view of global state. */ | 3619 | /* Set up local state, ensuring consistent view of global state. */ |
| 3644 | raw_spin_lock_irqsave_rcu_node(rnp, flags); | ||
| 3645 | rdp->grpmask = leaf_node_cpu_bit(rdp->mynode, cpu); | 3620 | rdp->grpmask = leaf_node_cpu_bit(rdp->mynode, cpu); |
| 3646 | rdp->dynticks = &per_cpu(rcu_dynticks, cpu); | 3621 | rdp->dynticks = &per_cpu(rcu_dynticks, cpu); |
| 3647 | WARN_ON_ONCE(rdp->dynticks->dynticks_nesting != 1); | 3622 | WARN_ON_ONCE(rdp->dynticks->dynticks_nesting != 1); |
| @@ -3649,7 +3624,6 @@ rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp) | |||
| 3649 | rdp->cpu = cpu; | 3624 | rdp->cpu = cpu; |
| 3650 | rdp->rsp = rsp; | 3625 | rdp->rsp = rsp; |
| 3651 | rcu_boot_init_nocb_percpu_data(rdp); | 3626 | rcu_boot_init_nocb_percpu_data(rdp); |
| 3652 | raw_spin_unlock_irqrestore_rcu_node(rnp, flags); | ||
| 3653 | } | 3627 | } |
| 3654 | 3628 | ||
| 3655 | /* | 3629 | /* |
| @@ -4193,6 +4167,8 @@ static void __init rcu_dump_rcu_node_tree(struct rcu_state *rsp) | |||
| 4193 | pr_cont("\n"); | 4167 | pr_cont("\n"); |
| 4194 | } | 4168 | } |
| 4195 | 4169 | ||
| 4170 | struct workqueue_struct *rcu_gp_wq; | ||
| 4171 | |||
| 4196 | void __init rcu_init(void) | 4172 | void __init rcu_init(void) |
| 4197 | { | 4173 | { |
| 4198 | int cpu; | 4174 | int cpu; |
| @@ -4219,6 +4195,10 @@ void __init rcu_init(void) | |||
| 4219 | rcu_cpu_starting(cpu); | 4195 | rcu_cpu_starting(cpu); |
| 4220 | rcutree_online_cpu(cpu); | 4196 | rcutree_online_cpu(cpu); |
| 4221 | } | 4197 | } |
| 4198 | |||
| 4199 | /* Create workqueue for expedited GPs and for Tree SRCU. */ | ||
| 4200 | rcu_gp_wq = alloc_workqueue("rcu_gp", WQ_MEM_RECLAIM, 0); | ||
| 4201 | WARN_ON(!rcu_gp_wq); | ||
| 4222 | } | 4202 | } |
| 4223 | 4203 | ||
| 4224 | #include "tree_exp.h" | 4204 | #include "tree_exp.h" |
diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 6488a3b0e729..f491ab4f2e8e 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h | |||
| @@ -146,12 +146,6 @@ struct rcu_node { | |||
| 146 | /* boosting for this rcu_node structure. */ | 146 | /* boosting for this rcu_node structure. */ |
| 147 | unsigned int boost_kthread_status; | 147 | unsigned int boost_kthread_status; |
| 148 | /* State of boost_kthread_task for tracing. */ | 148 | /* State of boost_kthread_task for tracing. */ |
| 149 | unsigned long n_tasks_boosted; | ||
| 150 | /* Total number of tasks boosted. */ | ||
| 151 | unsigned long n_exp_boosts; | ||
| 152 | /* Number of tasks boosted for expedited GP. */ | ||
| 153 | unsigned long n_normal_boosts; | ||
| 154 | /* Number of tasks boosted for normal GP. */ | ||
| 155 | #ifdef CONFIG_RCU_NOCB_CPU | 149 | #ifdef CONFIG_RCU_NOCB_CPU |
| 156 | struct swait_queue_head nocb_gp_wq[2]; | 150 | struct swait_queue_head nocb_gp_wq[2]; |
| 157 | /* Place for rcu_nocb_kthread() to wait GP. */ | 151 | /* Place for rcu_nocb_kthread() to wait GP. */ |
| @@ -184,13 +178,6 @@ union rcu_noqs { | |||
| 184 | u16 s; /* Set of bits, aggregate OR here. */ | 178 | u16 s; /* Set of bits, aggregate OR here. */ |
| 185 | }; | 179 | }; |
| 186 | 180 | ||
| 187 | /* Index values for nxttail array in struct rcu_data. */ | ||
| 188 | #define RCU_DONE_TAIL 0 /* Also RCU_WAIT head. */ | ||
| 189 | #define RCU_WAIT_TAIL 1 /* Also RCU_NEXT_READY head. */ | ||
| 190 | #define RCU_NEXT_READY_TAIL 2 /* Also RCU_NEXT head. */ | ||
| 191 | #define RCU_NEXT_TAIL 3 | ||
| 192 | #define RCU_NEXT_SIZE 4 | ||
| 193 | |||
| 194 | /* Per-CPU data for read-copy update. */ | 181 | /* Per-CPU data for read-copy update. */ |
| 195 | struct rcu_data { | 182 | struct rcu_data { |
| 196 | /* 1) quiescent-state and grace-period handling : */ | 183 | /* 1) quiescent-state and grace-period handling : */ |
| @@ -217,8 +204,6 @@ struct rcu_data { | |||
| 217 | /* different grace periods. */ | 204 | /* different grace periods. */ |
| 218 | long qlen_last_fqs_check; | 205 | long qlen_last_fqs_check; |
| 219 | /* qlen at last check for QS forcing */ | 206 | /* qlen at last check for QS forcing */ |
| 220 | unsigned long n_cbs_invoked; /* count of RCU cbs invoked. */ | ||
| 221 | unsigned long n_nocbs_invoked; /* count of no-CBs RCU cbs invoked. */ | ||
| 222 | unsigned long n_force_qs_snap; | 207 | unsigned long n_force_qs_snap; |
| 223 | /* did other CPU force QS recently? */ | 208 | /* did other CPU force QS recently? */ |
| 224 | long blimit; /* Upper limit on a processed batch */ | 209 | long blimit; /* Upper limit on a processed batch */ |
| @@ -234,18 +219,7 @@ struct rcu_data { | |||
| 234 | /* Grace period that needs help */ | 219 | /* Grace period that needs help */ |
| 235 | /* from cond_resched(). */ | 220 | /* from cond_resched(). */ |
| 236 | 221 | ||
| 237 | /* 5) __rcu_pending() statistics. */ | 222 | /* 5) _rcu_barrier(), OOM callbacks, and expediting. */ |
| 238 | unsigned long n_rcu_pending; /* rcu_pending() calls since boot. */ | ||
| 239 | unsigned long n_rp_core_needs_qs; | ||
| 240 | unsigned long n_rp_report_qs; | ||
| 241 | unsigned long n_rp_cb_ready; | ||
| 242 | unsigned long n_rp_cpu_needs_gp; | ||
| 243 | unsigned long n_rp_gp_completed; | ||
| 244 | unsigned long n_rp_gp_started; | ||
| 245 | unsigned long n_rp_nocb_defer_wakeup; | ||
| 246 | unsigned long n_rp_need_nothing; | ||
| 247 | |||
| 248 | /* 6) _rcu_barrier(), OOM callbacks, and expediting. */ | ||
| 249 | struct rcu_head barrier_head; | 223 | struct rcu_head barrier_head; |
| 250 | #ifdef CONFIG_RCU_FAST_NO_HZ | 224 | #ifdef CONFIG_RCU_FAST_NO_HZ |
| 251 | struct rcu_head oom_head; | 225 | struct rcu_head oom_head; |
| @@ -256,7 +230,7 @@ struct rcu_data { | |||
| 256 | atomic_long_t exp_workdone3; /* # done by others #3. */ | 230 | atomic_long_t exp_workdone3; /* # done by others #3. */ |
| 257 | int exp_dynticks_snap; /* Double-check need for IPI. */ | 231 | int exp_dynticks_snap; /* Double-check need for IPI. */ |
| 258 | 232 | ||
| 259 | /* 7) Callback offloading. */ | 233 | /* 6) Callback offloading. */ |
| 260 | #ifdef CONFIG_RCU_NOCB_CPU | 234 | #ifdef CONFIG_RCU_NOCB_CPU |
| 261 | struct rcu_head *nocb_head; /* CBs waiting for kthread. */ | 235 | struct rcu_head *nocb_head; /* CBs waiting for kthread. */ |
| 262 | struct rcu_head **nocb_tail; | 236 | struct rcu_head **nocb_tail; |
| @@ -283,7 +257,7 @@ struct rcu_data { | |||
| 283 | /* Leader CPU takes GP-end wakeups. */ | 257 | /* Leader CPU takes GP-end wakeups. */ |
| 284 | #endif /* #ifdef CONFIG_RCU_NOCB_CPU */ | 258 | #endif /* #ifdef CONFIG_RCU_NOCB_CPU */ |
| 285 | 259 | ||
| 286 | /* 8) RCU CPU stall data. */ | 260 | /* 7) RCU CPU stall data. */ |
| 287 | unsigned int softirq_snap; /* Snapshot of softirq activity. */ | 261 | unsigned int softirq_snap; /* Snapshot of softirq activity. */ |
| 288 | /* ->rcu_iw* fields protected by leaf rcu_node ->lock. */ | 262 | /* ->rcu_iw* fields protected by leaf rcu_node ->lock. */ |
| 289 | struct irq_work rcu_iw; /* Check for non-irq activity. */ | 263 | struct irq_work rcu_iw; /* Check for non-irq activity. */ |
| @@ -374,10 +348,6 @@ struct rcu_state { | |||
| 374 | /* kthreads, if configured. */ | 348 | /* kthreads, if configured. */ |
| 375 | unsigned long n_force_qs; /* Number of calls to */ | 349 | unsigned long n_force_qs; /* Number of calls to */ |
| 376 | /* force_quiescent_state(). */ | 350 | /* force_quiescent_state(). */ |
| 377 | unsigned long n_force_qs_lh; /* ~Number of calls leaving */ | ||
| 378 | /* due to lock unavailable. */ | ||
| 379 | unsigned long n_force_qs_ngp; /* Number of calls leaving */ | ||
| 380 | /* due to no GP active. */ | ||
| 381 | unsigned long gp_start; /* Time at which GP started, */ | 351 | unsigned long gp_start; /* Time at which GP started, */ |
| 382 | /* but in jiffies. */ | 352 | /* but in jiffies. */ |
| 383 | unsigned long gp_activity; /* Time of last GP kthread */ | 353 | unsigned long gp_activity; /* Time of last GP kthread */ |
diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h index 46d61b597731..f72eefab8543 100644 --- a/kernel/rcu/tree_exp.h +++ b/kernel/rcu/tree_exp.h | |||
| @@ -29,6 +29,15 @@ static void rcu_exp_gp_seq_start(struct rcu_state *rsp) | |||
| 29 | } | 29 | } |
| 30 | 30 | ||
| 31 | /* | 31 | /* |
| 32 | * Return the value that the expedited-grace-period counter will have | ||
| 33 | * at the end of the current grace period. | ||
| 34 | */ | ||
| 35 | static __maybe_unused unsigned long rcu_exp_gp_seq_endval(struct rcu_state *rsp) | ||
| 36 | { | ||
| 37 | return rcu_seq_endval(&rsp->expedited_sequence); | ||
| 38 | } | ||
| 39 | |||
| 40 | /* | ||
| 32 | * Record the end of an expedited grace period. | 41 | * Record the end of an expedited grace period. |
| 33 | */ | 42 | */ |
| 34 | static void rcu_exp_gp_seq_end(struct rcu_state *rsp) | 43 | static void rcu_exp_gp_seq_end(struct rcu_state *rsp) |
| @@ -366,21 +375,30 @@ static void sync_rcu_exp_select_cpus(struct rcu_state *rsp, | |||
| 366 | int ret; | 375 | int ret; |
| 367 | struct rcu_node *rnp; | 376 | struct rcu_node *rnp; |
| 368 | 377 | ||
| 378 | trace_rcu_exp_grace_period(rsp->name, rcu_exp_gp_seq_endval(rsp), TPS("reset")); | ||
| 369 | sync_exp_reset_tree(rsp); | 379 | sync_exp_reset_tree(rsp); |
| 380 | trace_rcu_exp_grace_period(rsp->name, rcu_exp_gp_seq_endval(rsp), TPS("select")); | ||
| 370 | rcu_for_each_leaf_node(rsp, rnp) { | 381 | rcu_for_each_leaf_node(rsp, rnp) { |
| 371 | raw_spin_lock_irqsave_rcu_node(rnp, flags); | 382 | raw_spin_lock_irqsave_rcu_node(rnp, flags); |
| 372 | 383 | ||
| 373 | /* Each pass checks a CPU for identity, offline, and idle. */ | 384 | /* Each pass checks a CPU for identity, offline, and idle. */ |
| 374 | mask_ofl_test = 0; | 385 | mask_ofl_test = 0; |
| 375 | for_each_leaf_node_possible_cpu(rnp, cpu) { | 386 | for_each_leaf_node_cpu_mask(rnp, cpu, rnp->expmask) { |
| 387 | unsigned long mask = leaf_node_cpu_bit(rnp, cpu); | ||
| 376 | struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); | 388 | struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); |
| 389 | struct rcu_dynticks *rdtp = per_cpu_ptr(&rcu_dynticks, cpu); | ||
| 390 | int snap; | ||
| 377 | 391 | ||
| 378 | rdp->exp_dynticks_snap = | ||
| 379 | rcu_dynticks_snap(rdp->dynticks); | ||
| 380 | if (raw_smp_processor_id() == cpu || | 392 | if (raw_smp_processor_id() == cpu || |
| 381 | rcu_dynticks_in_eqs(rdp->exp_dynticks_snap) || | 393 | !(rnp->qsmaskinitnext & mask)) { |
| 382 | !(rnp->qsmaskinitnext & rdp->grpmask)) | 394 | mask_ofl_test |= mask; |
| 383 | mask_ofl_test |= rdp->grpmask; | 395 | } else { |
| 396 | snap = rcu_dynticks_snap(rdtp); | ||
| 397 | if (rcu_dynticks_in_eqs(snap)) | ||
| 398 | mask_ofl_test |= mask; | ||
| 399 | else | ||
| 400 | rdp->exp_dynticks_snap = snap; | ||
| 401 | } | ||
| 384 | } | 402 | } |
| 385 | mask_ofl_ipi = rnp->expmask & ~mask_ofl_test; | 403 | mask_ofl_ipi = rnp->expmask & ~mask_ofl_test; |
| 386 | 404 | ||
| @@ -394,7 +412,7 @@ static void sync_rcu_exp_select_cpus(struct rcu_state *rsp, | |||
| 394 | raw_spin_unlock_irqrestore_rcu_node(rnp, flags); | 412 | raw_spin_unlock_irqrestore_rcu_node(rnp, flags); |
| 395 | 413 | ||
| 396 | /* IPI the remaining CPUs for expedited quiescent state. */ | 414 | /* IPI the remaining CPUs for expedited quiescent state. */ |
| 397 | for_each_leaf_node_possible_cpu(rnp, cpu) { | 415 | for_each_leaf_node_cpu_mask(rnp, cpu, rnp->expmask) { |
| 398 | unsigned long mask = leaf_node_cpu_bit(rnp, cpu); | 416 | unsigned long mask = leaf_node_cpu_bit(rnp, cpu); |
| 399 | struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); | 417 | struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); |
| 400 | 418 | ||
| @@ -417,6 +435,7 @@ retry_ipi: | |||
| 417 | (rnp->expmask & mask)) { | 435 | (rnp->expmask & mask)) { |
| 418 | /* Online, so delay for a bit and try again. */ | 436 | /* Online, so delay for a bit and try again. */ |
| 419 | raw_spin_unlock_irqrestore_rcu_node(rnp, flags); | 437 | raw_spin_unlock_irqrestore_rcu_node(rnp, flags); |
| 438 | trace_rcu_exp_grace_period(rsp->name, rcu_exp_gp_seq_endval(rsp), TPS("selectofl")); | ||
| 420 | schedule_timeout_uninterruptible(1); | 439 | schedule_timeout_uninterruptible(1); |
| 421 | goto retry_ipi; | 440 | goto retry_ipi; |
| 422 | } | 441 | } |
| @@ -443,6 +462,7 @@ static void synchronize_sched_expedited_wait(struct rcu_state *rsp) | |||
| 443 | struct rcu_node *rnp_root = rcu_get_root(rsp); | 462 | struct rcu_node *rnp_root = rcu_get_root(rsp); |
| 444 | int ret; | 463 | int ret; |
| 445 | 464 | ||
| 465 | trace_rcu_exp_grace_period(rsp->name, rcu_exp_gp_seq_endval(rsp), TPS("startwait")); | ||
| 446 | jiffies_stall = rcu_jiffies_till_stall_check(); | 466 | jiffies_stall = rcu_jiffies_till_stall_check(); |
| 447 | jiffies_start = jiffies; | 467 | jiffies_start = jiffies; |
| 448 | 468 | ||
| @@ -606,7 +626,7 @@ static void _synchronize_rcu_expedited(struct rcu_state *rsp, | |||
| 606 | rew.rew_rsp = rsp; | 626 | rew.rew_rsp = rsp; |
| 607 | rew.rew_s = s; | 627 | rew.rew_s = s; |
| 608 | INIT_WORK_ONSTACK(&rew.rew_work, wait_rcu_exp_gp); | 628 | INIT_WORK_ONSTACK(&rew.rew_work, wait_rcu_exp_gp); |
| 609 | schedule_work(&rew.rew_work); | 629 | queue_work(rcu_gp_wq, &rew.rew_work); |
| 610 | } | 630 | } |
| 611 | 631 | ||
| 612 | /* Wait for expedited grace period to complete. */ | 632 | /* Wait for expedited grace period to complete. */ |
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index fb88a028deec..84fbee4686d3 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h | |||
| @@ -180,7 +180,7 @@ static void rcu_preempt_ctxt_queue(struct rcu_node *rnp, struct rcu_data *rdp) | |||
| 180 | (rnp->expmask & rdp->grpmask ? RCU_EXP_BLKD : 0); | 180 | (rnp->expmask & rdp->grpmask ? RCU_EXP_BLKD : 0); |
| 181 | struct task_struct *t = current; | 181 | struct task_struct *t = current; |
| 182 | 182 | ||
| 183 | lockdep_assert_held(&rnp->lock); | 183 | raw_lockdep_assert_held_rcu_node(rnp); |
| 184 | WARN_ON_ONCE(rdp->mynode != rnp); | 184 | WARN_ON_ONCE(rdp->mynode != rnp); |
| 185 | WARN_ON_ONCE(rnp->level != rcu_num_lvls - 1); | 185 | WARN_ON_ONCE(rnp->level != rcu_num_lvls - 1); |
| 186 | 186 | ||
| @@ -560,8 +560,14 @@ static void rcu_print_detail_task_stall_rnp(struct rcu_node *rnp) | |||
| 560 | } | 560 | } |
| 561 | t = list_entry(rnp->gp_tasks->prev, | 561 | t = list_entry(rnp->gp_tasks->prev, |
| 562 | struct task_struct, rcu_node_entry); | 562 | struct task_struct, rcu_node_entry); |
| 563 | list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry) | 563 | list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry) { |
| 564 | /* | ||
| 565 | * We could be printing a lot while holding a spinlock. | ||
| 566 | * Avoid triggering hard lockup. | ||
| 567 | */ | ||
| 568 | touch_nmi_watchdog(); | ||
| 564 | sched_show_task(t); | 569 | sched_show_task(t); |
| 570 | } | ||
| 565 | raw_spin_unlock_irqrestore_rcu_node(rnp, flags); | 571 | raw_spin_unlock_irqrestore_rcu_node(rnp, flags); |
| 566 | } | 572 | } |
| 567 | 573 | ||
| @@ -957,14 +963,10 @@ static int rcu_boost(struct rcu_node *rnp) | |||
| 957 | * expedited grace period must boost all blocked tasks, including | 963 | * expedited grace period must boost all blocked tasks, including |
| 958 | * those blocking the pre-existing normal grace period. | 964 | * those blocking the pre-existing normal grace period. |
| 959 | */ | 965 | */ |
| 960 | if (rnp->exp_tasks != NULL) { | 966 | if (rnp->exp_tasks != NULL) |
| 961 | tb = rnp->exp_tasks; | 967 | tb = rnp->exp_tasks; |
| 962 | rnp->n_exp_boosts++; | 968 | else |
| 963 | } else { | ||
| 964 | tb = rnp->boost_tasks; | 969 | tb = rnp->boost_tasks; |
| 965 | rnp->n_normal_boosts++; | ||
| 966 | } | ||
| 967 | rnp->n_tasks_boosted++; | ||
| 968 | 970 | ||
| 969 | /* | 971 | /* |
| 970 | * We boost task t by manufacturing an rt_mutex that appears to | 972 | * We boost task t by manufacturing an rt_mutex that appears to |
| @@ -1042,7 +1044,7 @@ static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags) | |||
| 1042 | { | 1044 | { |
| 1043 | struct task_struct *t; | 1045 | struct task_struct *t; |
| 1044 | 1046 | ||
| 1045 | lockdep_assert_held(&rnp->lock); | 1047 | raw_lockdep_assert_held_rcu_node(rnp); |
| 1046 | if (!rcu_preempt_blocked_readers_cgp(rnp) && rnp->exp_tasks == NULL) { | 1048 | if (!rcu_preempt_blocked_readers_cgp(rnp) && rnp->exp_tasks == NULL) { |
| 1047 | raw_spin_unlock_irqrestore_rcu_node(rnp, flags); | 1049 | raw_spin_unlock_irqrestore_rcu_node(rnp, flags); |
| 1048 | return; | 1050 | return; |
| @@ -1677,6 +1679,12 @@ static void print_cpu_stall_info(struct rcu_state *rsp, int cpu) | |||
| 1677 | char *ticks_title; | 1679 | char *ticks_title; |
| 1678 | unsigned long ticks_value; | 1680 | unsigned long ticks_value; |
| 1679 | 1681 | ||
| 1682 | /* | ||
| 1683 | * We could be printing a lot while holding a spinlock. Avoid | ||
| 1684 | * triggering hard lockup. | ||
| 1685 | */ | ||
| 1686 | touch_nmi_watchdog(); | ||
| 1687 | |||
| 1680 | if (rsp->gpnum == rdp->gpnum) { | 1688 | if (rsp->gpnum == rdp->gpnum) { |
| 1681 | ticks_title = "ticks this GP"; | 1689 | ticks_title = "ticks this GP"; |
| 1682 | ticks_value = rdp->ticks_this_gp; | 1690 | ticks_value = rdp->ticks_this_gp; |
| @@ -2235,7 +2243,6 @@ static int rcu_nocb_kthread(void *arg) | |||
| 2235 | smp_mb__before_atomic(); /* _add after CB invocation. */ | 2243 | smp_mb__before_atomic(); /* _add after CB invocation. */ |
| 2236 | atomic_long_add(-c, &rdp->nocb_q_count); | 2244 | atomic_long_add(-c, &rdp->nocb_q_count); |
| 2237 | atomic_long_add(-cl, &rdp->nocb_q_count_lazy); | 2245 | atomic_long_add(-cl, &rdp->nocb_q_count_lazy); |
| 2238 | rdp->n_nocbs_invoked += c; | ||
| 2239 | } | 2246 | } |
| 2240 | return 0; | 2247 | return 0; |
| 2241 | } | 2248 | } |
| @@ -2312,8 +2319,11 @@ void __init rcu_init_nohz(void) | |||
| 2312 | cpumask_and(rcu_nocb_mask, cpu_possible_mask, | 2319 | cpumask_and(rcu_nocb_mask, cpu_possible_mask, |
| 2313 | rcu_nocb_mask); | 2320 | rcu_nocb_mask); |
| 2314 | } | 2321 | } |
| 2315 | pr_info("\tOffload RCU callbacks from CPUs: %*pbl.\n", | 2322 | if (cpumask_empty(rcu_nocb_mask)) |
| 2316 | cpumask_pr_args(rcu_nocb_mask)); | 2323 | pr_info("\tOffload RCU callbacks from CPUs: (none).\n"); |
| 2324 | else | ||
| 2325 | pr_info("\tOffload RCU callbacks from CPUs: %*pbl.\n", | ||
| 2326 | cpumask_pr_args(rcu_nocb_mask)); | ||
| 2317 | if (rcu_nocb_poll) | 2327 | if (rcu_nocb_poll) |
| 2318 | pr_info("\tPoll for callbacks from no-CBs CPUs.\n"); | 2328 | pr_info("\tPoll for callbacks from no-CBs CPUs.\n"); |
| 2319 | 2329 | ||
diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile index e2f9d4feff40..d9a02b318108 100644 --- a/kernel/sched/Makefile +++ b/kernel/sched/Makefile | |||
| @@ -17,8 +17,9 @@ CFLAGS_core.o := $(PROFILING) -fno-omit-frame-pointer | |||
| 17 | endif | 17 | endif |
| 18 | 18 | ||
| 19 | obj-y += core.o loadavg.o clock.o cputime.o | 19 | obj-y += core.o loadavg.o clock.o cputime.o |
| 20 | obj-y += idle_task.o fair.o rt.o deadline.o | 20 | obj-y += idle.o fair.o rt.o deadline.o |
| 21 | obj-y += wait.o wait_bit.o swait.o completion.o idle.o | 21 | obj-y += wait.o wait_bit.o swait.o completion.o |
| 22 | |||
| 22 | obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o topology.o stop_task.o | 23 | obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o topology.o stop_task.o |
| 23 | obj-$(CONFIG_SCHED_AUTOGROUP) += autogroup.o | 24 | obj-$(CONFIG_SCHED_AUTOGROUP) += autogroup.o |
| 24 | obj-$(CONFIG_SCHEDSTATS) += stats.o | 25 | obj-$(CONFIG_SCHEDSTATS) += stats.o |
diff --git a/kernel/sched/autogroup.c b/kernel/sched/autogroup.c index bb4b9fe026a1..6be6c575b6cd 100644 --- a/kernel/sched/autogroup.c +++ b/kernel/sched/autogroup.c | |||
| @@ -1,10 +1,7 @@ | |||
| 1 | // SPDX-License-Identifier: GPL-2.0 | 1 | // SPDX-License-Identifier: GPL-2.0 |
| 2 | #include <linux/proc_fs.h> | 2 | /* |
| 3 | #include <linux/seq_file.h> | 3 | * Auto-group scheduling implementation: |
| 4 | #include <linux/utsname.h> | 4 | */ |
| 5 | #include <linux/security.h> | ||
| 6 | #include <linux/export.h> | ||
| 7 | |||
| 8 | #include "sched.h" | 5 | #include "sched.h" |
| 9 | 6 | ||
| 10 | unsigned int __read_mostly sysctl_sched_autogroup_enabled = 1; | 7 | unsigned int __read_mostly sysctl_sched_autogroup_enabled = 1; |
| @@ -168,18 +165,19 @@ autogroup_move_group(struct task_struct *p, struct autogroup *ag) | |||
| 168 | autogroup_kref_put(prev); | 165 | autogroup_kref_put(prev); |
| 169 | } | 166 | } |
| 170 | 167 | ||
| 171 | /* Allocates GFP_KERNEL, cannot be called under any spinlock */ | 168 | /* Allocates GFP_KERNEL, cannot be called under any spinlock: */ |
| 172 | void sched_autogroup_create_attach(struct task_struct *p) | 169 | void sched_autogroup_create_attach(struct task_struct *p) |
| 173 | { | 170 | { |
| 174 | struct autogroup *ag = autogroup_create(); | 171 | struct autogroup *ag = autogroup_create(); |
| 175 | 172 | ||
| 176 | autogroup_move_group(p, ag); | 173 | autogroup_move_group(p, ag); |
| 177 | /* drop extra reference added by autogroup_create() */ | 174 | |
| 175 | /* Drop extra reference added by autogroup_create(): */ | ||
| 178 | autogroup_kref_put(ag); | 176 | autogroup_kref_put(ag); |
| 179 | } | 177 | } |
| 180 | EXPORT_SYMBOL(sched_autogroup_create_attach); | 178 | EXPORT_SYMBOL(sched_autogroup_create_attach); |
| 181 | 179 | ||
| 182 | /* Cannot be called under siglock. Currently has no users */ | 180 | /* Cannot be called under siglock. Currently has no users: */ |
| 183 | void sched_autogroup_detach(struct task_struct *p) | 181 | void sched_autogroup_detach(struct task_struct *p) |
| 184 | { | 182 | { |
| 185 | autogroup_move_group(p, &autogroup_default); | 183 | autogroup_move_group(p, &autogroup_default); |
| @@ -202,7 +200,6 @@ static int __init setup_autogroup(char *str) | |||
| 202 | 200 | ||
| 203 | return 1; | 201 | return 1; |
| 204 | } | 202 | } |
| 205 | |||
| 206 | __setup("noautogroup", setup_autogroup); | 203 | __setup("noautogroup", setup_autogroup); |
| 207 | 204 | ||
| 208 | #ifdef CONFIG_PROC_FS | 205 | #ifdef CONFIG_PROC_FS |
| @@ -224,7 +221,7 @@ int proc_sched_autogroup_set_nice(struct task_struct *p, int nice) | |||
| 224 | if (nice < 0 && !can_nice(current, nice)) | 221 | if (nice < 0 && !can_nice(current, nice)) |
| 225 | return -EPERM; | 222 | return -EPERM; |
| 226 | 223 | ||
| 227 | /* this is a heavy operation taking global locks.. */ | 224 | /* This is a heavy operation, taking global locks.. */ |
| 228 | if (!capable(CAP_SYS_ADMIN) && time_before(jiffies, next)) | 225 | if (!capable(CAP_SYS_ADMIN) && time_before(jiffies, next)) |
| 229 | return -EAGAIN; | 226 | return -EAGAIN; |
| 230 | 227 | ||
| @@ -267,4 +264,4 @@ int autogroup_path(struct task_group *tg, char *buf, int buflen) | |||
| 267 | 264 | ||
| 268 | return snprintf(buf, buflen, "%s-%ld", "/autogroup", tg->autogroup->id); | 265 | return snprintf(buf, buflen, "%s-%ld", "/autogroup", tg->autogroup->id); |
| 269 | } | 266 | } |
| 270 | #endif /* CONFIG_SCHED_DEBUG */ | 267 | #endif |
diff --git a/kernel/sched/autogroup.h b/kernel/sched/autogroup.h index 27cd22b89824..b96419974a1f 100644 --- a/kernel/sched/autogroup.h +++ b/kernel/sched/autogroup.h | |||
| @@ -1,15 +1,11 @@ | |||
| 1 | /* SPDX-License-Identifier: GPL-2.0 */ | 1 | /* SPDX-License-Identifier: GPL-2.0 */ |
| 2 | #ifdef CONFIG_SCHED_AUTOGROUP | 2 | #ifdef CONFIG_SCHED_AUTOGROUP |
| 3 | 3 | ||
| 4 | #include <linux/kref.h> | ||
| 5 | #include <linux/rwsem.h> | ||
| 6 | #include <linux/sched/autogroup.h> | ||
| 7 | |||
| 8 | struct autogroup { | 4 | struct autogroup { |
| 9 | /* | 5 | /* |
| 10 | * reference doesn't mean how many thread attach to this | 6 | * Reference doesn't mean how many threads attach to this |
| 11 | * autogroup now. It just stands for the number of task | 7 | * autogroup now. It just stands for the number of tasks |
| 12 | * could use this autogroup. | 8 | * which could use this autogroup. |
| 13 | */ | 9 | */ |
| 14 | struct kref kref; | 10 | struct kref kref; |
| 15 | struct task_group *tg; | 11 | struct task_group *tg; |
| @@ -56,11 +52,9 @@ autogroup_task_group(struct task_struct *p, struct task_group *tg) | |||
| 56 | return tg; | 52 | return tg; |
| 57 | } | 53 | } |
| 58 | 54 | ||
| 59 | #ifdef CONFIG_SCHED_DEBUG | ||
| 60 | static inline int autogroup_path(struct task_group *tg, char *buf, int buflen) | 55 | static inline int autogroup_path(struct task_group *tg, char *buf, int buflen) |
| 61 | { | 56 | { |
| 62 | return 0; | 57 | return 0; |
| 63 | } | 58 | } |
| 64 | #endif | ||
| 65 | 59 | ||
| 66 | #endif /* CONFIG_SCHED_AUTOGROUP */ | 60 | #endif /* CONFIG_SCHED_AUTOGROUP */ |
diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c index e086babe6c61..10c83e73837a 100644 --- a/kernel/sched/clock.c +++ b/kernel/sched/clock.c | |||
| @@ -1,5 +1,5 @@ | |||
| 1 | /* | 1 | /* |
| 2 | * sched_clock for unstable cpu clocks | 2 | * sched_clock() for unstable CPU clocks |
| 3 | * | 3 | * |
| 4 | * Copyright (C) 2008 Red Hat, Inc., Peter Zijlstra | 4 | * Copyright (C) 2008 Red Hat, Inc., Peter Zijlstra |
| 5 | * | 5 | * |
| @@ -11,7 +11,7 @@ | |||
| 11 | * Guillaume Chazarain <guichaz@gmail.com> | 11 | * Guillaume Chazarain <guichaz@gmail.com> |
| 12 | * | 12 | * |
| 13 | * | 13 | * |
| 14 | * What: | 14 | * What this file implements: |
| 15 | * | 15 | * |
| 16 | * cpu_clock(i) provides a fast (execution time) high resolution | 16 | * cpu_clock(i) provides a fast (execution time) high resolution |
| 17 | * clock with bounded drift between CPUs. The value of cpu_clock(i) | 17 | * clock with bounded drift between CPUs. The value of cpu_clock(i) |
| @@ -26,11 +26,11 @@ | |||
| 26 | * at 0 on boot (but people really shouldn't rely on that). | 26 | * at 0 on boot (but people really shouldn't rely on that). |
| 27 | * | 27 | * |
| 28 | * cpu_clock(i) -- can be used from any context, including NMI. | 28 | * cpu_clock(i) -- can be used from any context, including NMI. |
| 29 | * local_clock() -- is cpu_clock() on the current cpu. | 29 | * local_clock() -- is cpu_clock() on the current CPU. |
| 30 | * | 30 | * |
| 31 | * sched_clock_cpu(i) | 31 | * sched_clock_cpu(i) |
| 32 | * | 32 | * |
| 33 | * How: | 33 | * How it is implemented: |
| 34 | * | 34 | * |
| 35 | * The implementation either uses sched_clock() when | 35 | * The implementation either uses sched_clock() when |
| 36 | * !CONFIG_HAVE_UNSTABLE_SCHED_CLOCK, which means in that case the | 36 | * !CONFIG_HAVE_UNSTABLE_SCHED_CLOCK, which means in that case the |
| @@ -52,19 +52,7 @@ | |||
| 52 | * that is otherwise invisible (TSC gets stopped). | 52 | * that is otherwise invisible (TSC gets stopped). |
| 53 | * | 53 | * |
| 54 | */ | 54 | */ |
| 55 | #include <linux/spinlock.h> | 55 | #include "sched.h" |
| 56 | #include <linux/hardirq.h> | ||
| 57 | #include <linux/export.h> | ||
| 58 | #include <linux/percpu.h> | ||
| 59 | #include <linux/ktime.h> | ||
| 60 | #include <linux/sched.h> | ||
| 61 | #include <linux/nmi.h> | ||
| 62 | #include <linux/sched/clock.h> | ||
| 63 | #include <linux/static_key.h> | ||
| 64 | #include <linux/workqueue.h> | ||
| 65 | #include <linux/compiler.h> | ||
| 66 | #include <linux/tick.h> | ||
| 67 | #include <linux/init.h> | ||
| 68 | 56 | ||
| 69 | /* | 57 | /* |
| 70 | * Scheduler clock - returns current time in nanosec units. | 58 | * Scheduler clock - returns current time in nanosec units. |
| @@ -302,21 +290,21 @@ again: | |||
| 302 | * cmpxchg64 below only protects one readout. | 290 | * cmpxchg64 below only protects one readout. |
| 303 | * | 291 | * |
| 304 | * We must reread via sched_clock_local() in the retry case on | 292 | * We must reread via sched_clock_local() in the retry case on |
| 305 | * 32bit as an NMI could use sched_clock_local() via the | 293 | * 32-bit kernels as an NMI could use sched_clock_local() via the |
| 306 | * tracer and hit between the readout of | 294 | * tracer and hit between the readout of |
| 307 | * the low32bit and the high 32bit portion. | 295 | * the low 32-bit and the high 32-bit portion. |
| 308 | */ | 296 | */ |
| 309 | this_clock = sched_clock_local(my_scd); | 297 | this_clock = sched_clock_local(my_scd); |
| 310 | /* | 298 | /* |
| 311 | * We must enforce atomic readout on 32bit, otherwise the | 299 | * We must enforce atomic readout on 32-bit, otherwise the |
| 312 | * update on the remote cpu can hit inbetween the readout of | 300 | * update on the remote CPU can hit inbetween the readout of |
| 313 | * the low32bit and the high 32bit portion. | 301 | * the low 32-bit and the high 32-bit portion. |
| 314 | */ | 302 | */ |
| 315 | remote_clock = cmpxchg64(&scd->clock, 0, 0); | 303 | remote_clock = cmpxchg64(&scd->clock, 0, 0); |
| 316 | #else | 304 | #else |
| 317 | /* | 305 | /* |
| 318 | * On 64bit the read of [my]scd->clock is atomic versus the | 306 | * On 64-bit kernels the read of [my]scd->clock is atomic versus the |
| 319 | * update, so we can avoid the above 32bit dance. | 307 | * update, so we can avoid the above 32-bit dance. |
| 320 | */ | 308 | */ |
| 321 | sched_clock_local(my_scd); | 309 | sched_clock_local(my_scd); |
| 322 | again: | 310 | again: |
diff --git a/kernel/sched/completion.c b/kernel/sched/completion.c index 0926aef10dad..e426b0cb9ac6 100644 --- a/kernel/sched/completion.c +++ b/kernel/sched/completion.c | |||
| @@ -11,10 +11,7 @@ | |||
| 11 | * typically be used for exclusion which gives rise to priority inversion. | 11 | * typically be used for exclusion which gives rise to priority inversion. |
| 12 | * Waiting for completion is a typically sync point, but not an exclusion point. | 12 | * Waiting for completion is a typically sync point, but not an exclusion point. |
| 13 | */ | 13 | */ |
| 14 | 14 | #include "sched.h" | |
| 15 | #include <linux/sched/signal.h> | ||
| 16 | #include <linux/sched/debug.h> | ||
| 17 | #include <linux/completion.h> | ||
| 18 | 15 | ||
| 19 | /** | 16 | /** |
| 20 | * complete: - signals a single thread waiting on this completion | 17 | * complete: - signals a single thread waiting on this completion |
| @@ -283,7 +280,7 @@ EXPORT_SYMBOL(wait_for_completion_killable_timeout); | |||
| 283 | bool try_wait_for_completion(struct completion *x) | 280 | bool try_wait_for_completion(struct completion *x) |
| 284 | { | 281 | { |
| 285 | unsigned long flags; | 282 | unsigned long flags; |
| 286 | int ret = 1; | 283 | bool ret = true; |
| 287 | 284 | ||
| 288 | /* | 285 | /* |
| 289 | * Since x->done will need to be locked only | 286 | * Since x->done will need to be locked only |
| @@ -292,11 +289,11 @@ bool try_wait_for_completion(struct completion *x) | |||
| 292 | * return early in the blocking case. | 289 | * return early in the blocking case. |
| 293 | */ | 290 | */ |
| 294 | if (!READ_ONCE(x->done)) | 291 | if (!READ_ONCE(x->done)) |
| 295 | return 0; | 292 | return false; |
| 296 | 293 | ||
| 297 | spin_lock_irqsave(&x->wait.lock, flags); | 294 | spin_lock_irqsave(&x->wait.lock, flags); |
| 298 | if (!x->done) | 295 | if (!x->done) |
| 299 | ret = 0; | 296 | ret = false; |
| 300 | else if (x->done != UINT_MAX) | 297 | else if (x->done != UINT_MAX) |
| 301 | x->done--; | 298 | x->done--; |
| 302 | spin_unlock_irqrestore(&x->wait.lock, flags); | 299 | spin_unlock_irqrestore(&x->wait.lock, flags); |
diff --git a/kernel/sched/core.c b/kernel/sched/core.c index c94895bc5a2c..28b68995a417 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c | |||
| @@ -5,37 +5,11 @@ | |||
| 5 | * | 5 | * |
| 6 | * Copyright (C) 1991-2002 Linus Torvalds | 6 | * Copyright (C) 1991-2002 Linus Torvalds |
| 7 | */ | 7 | */ |
| 8 | #include <linux/sched.h> | 8 | #include "sched.h" |
| 9 | #include <linux/sched/clock.h> | ||
| 10 | #include <uapi/linux/sched/types.h> | ||
| 11 | #include <linux/sched/loadavg.h> | ||
| 12 | #include <linux/sched/hotplug.h> | ||
| 13 | #include <linux/wait_bit.h> | ||
| 14 | #include <linux/cpuset.h> | ||
| 15 | #include <linux/delayacct.h> | ||
| 16 | #include <linux/init_task.h> | ||
| 17 | #include <linux/context_tracking.h> | ||
| 18 | #include <linux/rcupdate_wait.h> | ||
| 19 | #include <linux/compat.h> | ||
| 20 | |||
| 21 | #include <linux/blkdev.h> | ||
| 22 | #include <linux/kprobes.h> | ||
| 23 | #include <linux/mmu_context.h> | ||
| 24 | #include <linux/module.h> | ||
| 25 | #include <linux/nmi.h> | ||
| 26 | #include <linux/prefetch.h> | ||
| 27 | #include <linux/profile.h> | ||
| 28 | #include <linux/security.h> | ||
| 29 | #include <linux/syscalls.h> | ||
| 30 | #include <linux/sched/isolation.h> | ||
| 31 | 9 | ||
| 32 | #include <asm/switch_to.h> | 10 | #include <asm/switch_to.h> |
| 33 | #include <asm/tlb.h> | 11 | #include <asm/tlb.h> |
| 34 | #ifdef CONFIG_PARAVIRT | ||
| 35 | #include <asm/paravirt.h> | ||
| 36 | #endif | ||
| 37 | 12 | ||
| 38 | #include "sched.h" | ||
| 39 | #include "../workqueue_internal.h" | 13 | #include "../workqueue_internal.h" |
| 40 | #include "../smpboot.h" | 14 | #include "../smpboot.h" |
| 41 | 15 | ||
| @@ -135,7 +109,7 @@ struct rq *task_rq_lock(struct task_struct *p, struct rq_flags *rf) | |||
| 135 | * [L] ->on_rq | 109 | * [L] ->on_rq |
| 136 | * RELEASE (rq->lock) | 110 | * RELEASE (rq->lock) |
| 137 | * | 111 | * |
| 138 | * If we observe the old cpu in task_rq_lock, the acquire of | 112 | * If we observe the old CPU in task_rq_lock, the acquire of |
| 139 | * the old rq->lock will fully serialize against the stores. | 113 | * the old rq->lock will fully serialize against the stores. |
| 140 | * | 114 | * |
| 141 | * If we observe the new CPU in task_rq_lock, the acquire will | 115 | * If we observe the new CPU in task_rq_lock, the acquire will |
| @@ -333,7 +307,7 @@ void hrtick_start(struct rq *rq, u64 delay) | |||
| 333 | } | 307 | } |
| 334 | #endif /* CONFIG_SMP */ | 308 | #endif /* CONFIG_SMP */ |
| 335 | 309 | ||
| 336 | static void init_rq_hrtick(struct rq *rq) | 310 | static void hrtick_rq_init(struct rq *rq) |
| 337 | { | 311 | { |
| 338 | #ifdef CONFIG_SMP | 312 | #ifdef CONFIG_SMP |
| 339 | rq->hrtick_csd_pending = 0; | 313 | rq->hrtick_csd_pending = 0; |
| @@ -351,7 +325,7 @@ static inline void hrtick_clear(struct rq *rq) | |||
| 351 | { | 325 | { |
| 352 | } | 326 | } |
| 353 | 327 | ||
| 354 | static inline void init_rq_hrtick(struct rq *rq) | 328 | static inline void hrtick_rq_init(struct rq *rq) |
| 355 | { | 329 | { |
| 356 | } | 330 | } |
| 357 | #endif /* CONFIG_SCHED_HRTICK */ | 331 | #endif /* CONFIG_SCHED_HRTICK */ |
| @@ -609,7 +583,7 @@ static inline bool got_nohz_idle_kick(void) | |||
| 609 | { | 583 | { |
| 610 | int cpu = smp_processor_id(); | 584 | int cpu = smp_processor_id(); |
| 611 | 585 | ||
| 612 | if (!test_bit(NOHZ_BALANCE_KICK, nohz_flags(cpu))) | 586 | if (!(atomic_read(nohz_flags(cpu)) & NOHZ_KICK_MASK)) |
| 613 | return false; | 587 | return false; |
| 614 | 588 | ||
| 615 | if (idle_cpu(cpu) && !need_resched()) | 589 | if (idle_cpu(cpu) && !need_resched()) |
| @@ -619,7 +593,7 @@ static inline bool got_nohz_idle_kick(void) | |||
| 619 | * We can't run Idle Load Balance on this CPU for this time so we | 593 | * We can't run Idle Load Balance on this CPU for this time so we |
| 620 | * cancel it and clear NOHZ_BALANCE_KICK | 594 | * cancel it and clear NOHZ_BALANCE_KICK |
| 621 | */ | 595 | */ |
| 622 | clear_bit(NOHZ_BALANCE_KICK, nohz_flags(cpu)); | 596 | atomic_andnot(NOHZ_KICK_MASK, nohz_flags(cpu)); |
| 623 | return false; | 597 | return false; |
| 624 | } | 598 | } |
| 625 | 599 | ||
| @@ -1457,7 +1431,7 @@ EXPORT_SYMBOL_GPL(kick_process); | |||
| 1457 | * | 1431 | * |
| 1458 | * - cpu_active must be a subset of cpu_online | 1432 | * - cpu_active must be a subset of cpu_online |
| 1459 | * | 1433 | * |
| 1460 | * - on cpu-up we allow per-cpu kthreads on the online && !active cpu, | 1434 | * - on CPU-up we allow per-CPU kthreads on the online && !active CPU, |
| 1461 | * see __set_cpus_allowed_ptr(). At this point the newly online | 1435 | * see __set_cpus_allowed_ptr(). At this point the newly online |
| 1462 | * CPU isn't yet part of the sched domains, and balancing will not | 1436 | * CPU isn't yet part of the sched domains, and balancing will not |
| 1463 | * see it. | 1437 | * see it. |
| @@ -2488,17 +2462,17 @@ void wake_up_new_task(struct task_struct *p) | |||
| 2488 | 2462 | ||
| 2489 | #ifdef CONFIG_PREEMPT_NOTIFIERS | 2463 | #ifdef CONFIG_PREEMPT_NOTIFIERS |
| 2490 | 2464 | ||
| 2491 | static struct static_key preempt_notifier_key = STATIC_KEY_INIT_FALSE; | 2465 | static DEFINE_STATIC_KEY_FALSE(preempt_notifier_key); |
| 2492 | 2466 | ||
| 2493 | void preempt_notifier_inc(void) | 2467 | void preempt_notifier_inc(void) |
| 2494 | { | 2468 | { |
| 2495 | static_key_slow_inc(&preempt_notifier_key); | 2469 | static_branch_inc(&preempt_notifier_key); |
| 2496 | } | 2470 | } |
| 2497 | EXPORT_SYMBOL_GPL(preempt_notifier_inc); | 2471 | EXPORT_SYMBOL_GPL(preempt_notifier_inc); |
| 2498 | 2472 | ||
| 2499 | void preempt_notifier_dec(void) | 2473 | void preempt_notifier_dec(void) |
| 2500 | { | 2474 | { |
| 2501 | static_key_slow_dec(&preempt_notifier_key); | 2475 | static_branch_dec(&preempt_notifier_key); |
| 2502 | } | 2476 | } |
| 2503 | EXPORT_SYMBOL_GPL(preempt_notifier_dec); | 2477 | EXPORT_SYMBOL_GPL(preempt_notifier_dec); |
| 2504 | 2478 | ||
| @@ -2508,7 +2482,7 @@ EXPORT_SYMBOL_GPL(preempt_notifier_dec); | |||
| 2508 | */ | 2482 | */ |
| 2509 | void preempt_notifier_register(struct preempt_notifier *notifier) | 2483 | void preempt_notifier_register(struct preempt_notifier *notifier) |
| 2510 | { | 2484 | { |
| 2511 | if (!static_key_false(&preempt_notifier_key)) | 2485 | if (!static_branch_unlikely(&preempt_notifier_key)) |
| 2512 | WARN(1, "registering preempt_notifier while notifiers disabled\n"); | 2486 | WARN(1, "registering preempt_notifier while notifiers disabled\n"); |
| 2513 | 2487 | ||
| 2514 | hlist_add_head(¬ifier->link, ¤t->preempt_notifiers); | 2488 | hlist_add_head(¬ifier->link, ¤t->preempt_notifiers); |
| @@ -2537,7 +2511,7 @@ static void __fire_sched_in_preempt_notifiers(struct task_struct *curr) | |||
| 2537 | 2511 | ||
| 2538 | static __always_inline void fire_sched_in_preempt_notifiers(struct task_struct *curr) | 2512 | static __always_inline void fire_sched_in_preempt_notifiers(struct task_struct *curr) |
| 2539 | { | 2513 | { |
| 2540 | if (static_key_false(&preempt_notifier_key)) | 2514 | if (static_branch_unlikely(&preempt_notifier_key)) |
| 2541 | __fire_sched_in_preempt_notifiers(curr); | 2515 | __fire_sched_in_preempt_notifiers(curr); |
| 2542 | } | 2516 | } |
| 2543 | 2517 | ||
| @@ -2555,7 +2529,7 @@ static __always_inline void | |||
| 2555 | fire_sched_out_preempt_notifiers(struct task_struct *curr, | 2529 | fire_sched_out_preempt_notifiers(struct task_struct *curr, |
| 2556 | struct task_struct *next) | 2530 | struct task_struct *next) |
| 2557 | { | 2531 | { |
| 2558 | if (static_key_false(&preempt_notifier_key)) | 2532 | if (static_branch_unlikely(&preempt_notifier_key)) |
| 2559 | __fire_sched_out_preempt_notifiers(curr, next); | 2533 | __fire_sched_out_preempt_notifiers(curr, next); |
| 2560 | } | 2534 | } |
| 2561 | 2535 | ||
| @@ -2629,6 +2603,18 @@ static inline void finish_lock_switch(struct rq *rq) | |||
| 2629 | raw_spin_unlock_irq(&rq->lock); | 2603 | raw_spin_unlock_irq(&rq->lock); |
| 2630 | } | 2604 | } |
| 2631 | 2605 | ||
| 2606 | /* | ||
| 2607 | * NOP if the arch has not defined these: | ||
| 2608 | */ | ||
| 2609 | |||
| 2610 | #ifndef prepare_arch_switch | ||
| 2611 | # define prepare_arch_switch(next) do { } while (0) | ||
| 2612 | #endif | ||
| 2613 | |||
| 2614 | #ifndef finish_arch_post_lock_switch | ||
| 2615 | # define finish_arch_post_lock_switch() do { } while (0) | ||
| 2616 | #endif | ||
| 2617 | |||
| 2632 | /** | 2618 | /** |
| 2633 | * prepare_task_switch - prepare to switch tasks | 2619 | * prepare_task_switch - prepare to switch tasks |
| 2634 | * @rq: the runqueue preparing to switch | 2620 | * @rq: the runqueue preparing to switch |
| @@ -3037,7 +3023,7 @@ unsigned long long task_sched_runtime(struct task_struct *p) | |||
| 3037 | 3023 | ||
| 3038 | #if defined(CONFIG_64BIT) && defined(CONFIG_SMP) | 3024 | #if defined(CONFIG_64BIT) && defined(CONFIG_SMP) |
| 3039 | /* | 3025 | /* |
| 3040 | * 64-bit doesn't need locks to atomically read a 64bit value. | 3026 | * 64-bit doesn't need locks to atomically read a 64-bit value. |
| 3041 | * So we have a optimization chance when the task's delta_exec is 0. | 3027 | * So we have a optimization chance when the task's delta_exec is 0. |
| 3042 | * Reading ->on_cpu is racy, but this is ok. | 3028 | * Reading ->on_cpu is racy, but this is ok. |
| 3043 | * | 3029 | * |
| @@ -3096,35 +3082,99 @@ void scheduler_tick(void) | |||
| 3096 | rq->idle_balance = idle_cpu(cpu); | 3082 | rq->idle_balance = idle_cpu(cpu); |
| 3097 | trigger_load_balance(rq); | 3083 | trigger_load_balance(rq); |
| 3098 | #endif | 3084 | #endif |
| 3099 | rq_last_tick_reset(rq); | ||
| 3100 | } | 3085 | } |
| 3101 | 3086 | ||
| 3102 | #ifdef CONFIG_NO_HZ_FULL | 3087 | #ifdef CONFIG_NO_HZ_FULL |
| 3103 | /** | 3088 | |
| 3104 | * scheduler_tick_max_deferment | 3089 | struct tick_work { |
| 3105 | * | 3090 | int cpu; |
| 3106 | * Keep at least one tick per second when a single | 3091 | struct delayed_work work; |
| 3107 | * active task is running because the scheduler doesn't | 3092 | }; |
| 3108 | * yet completely support full dynticks environment. | 3093 | |
| 3109 | * | 3094 | static struct tick_work __percpu *tick_work_cpu; |
| 3110 | * This makes sure that uptime, CFS vruntime, load | 3095 | |
| 3111 | * balancing, etc... continue to move forward, even | 3096 | static void sched_tick_remote(struct work_struct *work) |
| 3112 | * with a very low granularity. | ||
| 3113 | * | ||
| 3114 | * Return: Maximum deferment in nanoseconds. | ||
| 3115 | */ | ||
| 3116 | u64 scheduler_tick_max_deferment(void) | ||
| 3117 | { | 3097 | { |
| 3118 | struct rq *rq = this_rq(); | 3098 | struct delayed_work *dwork = to_delayed_work(work); |
| 3119 | unsigned long next, now = READ_ONCE(jiffies); | 3099 | struct tick_work *twork = container_of(dwork, struct tick_work, work); |
| 3100 | int cpu = twork->cpu; | ||
| 3101 | struct rq *rq = cpu_rq(cpu); | ||
| 3102 | struct rq_flags rf; | ||
| 3103 | |||
| 3104 | /* | ||
| 3105 | * Handle the tick only if it appears the remote CPU is running in full | ||
| 3106 | * dynticks mode. The check is racy by nature, but missing a tick or | ||
| 3107 | * having one too much is no big deal because the scheduler tick updates | ||
| 3108 | * statistics and checks timeslices in a time-independent way, regardless | ||
| 3109 | * of when exactly it is running. | ||
| 3110 | */ | ||
| 3111 | if (!idle_cpu(cpu) && tick_nohz_tick_stopped_cpu(cpu)) { | ||
| 3112 | struct task_struct *curr; | ||
| 3113 | u64 delta; | ||
| 3120 | 3114 | ||
| 3121 | next = rq->last_sched_tick + HZ; | 3115 | rq_lock_irq(rq, &rf); |
| 3116 | update_rq_clock(rq); | ||
| 3117 | curr = rq->curr; | ||
| 3118 | delta = rq_clock_task(rq) - curr->se.exec_start; | ||
| 3122 | 3119 | ||
| 3123 | if (time_before_eq(next, now)) | 3120 | /* |
| 3124 | return 0; | 3121 | * Make sure the next tick runs within a reasonable |
| 3122 | * amount of time. | ||
| 3123 | */ | ||
| 3124 | WARN_ON_ONCE(delta > (u64)NSEC_PER_SEC * 3); | ||
| 3125 | curr->sched_class->task_tick(rq, curr, 0); | ||
| 3126 | rq_unlock_irq(rq, &rf); | ||
| 3127 | } | ||
| 3125 | 3128 | ||
| 3126 | return jiffies_to_nsecs(next - now); | 3129 | /* |
| 3130 | * Run the remote tick once per second (1Hz). This arbitrary | ||
| 3131 | * frequency is large enough to avoid overload but short enough | ||
| 3132 | * to keep scheduler internal stats reasonably up to date. | ||
| 3133 | */ | ||
| 3134 | queue_delayed_work(system_unbound_wq, dwork, HZ); | ||
| 3135 | } | ||
| 3136 | |||
| 3137 | static void sched_tick_start(int cpu) | ||
| 3138 | { | ||
| 3139 | struct tick_work *twork; | ||
| 3140 | |||
| 3141 | if (housekeeping_cpu(cpu, HK_FLAG_TICK)) | ||
| 3142 | return; | ||
| 3143 | |||
| 3144 | WARN_ON_ONCE(!tick_work_cpu); | ||
| 3145 | |||
| 3146 | twork = per_cpu_ptr(tick_work_cpu, cpu); | ||
| 3147 | twork->cpu = cpu; | ||
| 3148 | INIT_DELAYED_WORK(&twork->work, sched_tick_remote); | ||
| 3149 | queue_delayed_work(system_unbound_wq, &twork->work, HZ); | ||
| 3127 | } | 3150 | } |
| 3151 | |||
| 3152 | #ifdef CONFIG_HOTPLUG_CPU | ||
| 3153 | static void sched_tick_stop(int cpu) | ||
| 3154 | { | ||
| 3155 | struct tick_work *twork; | ||
| 3156 | |||
| 3157 | if (housekeeping_cpu(cpu, HK_FLAG_TICK)) | ||
| 3158 | return; | ||
| 3159 | |||
| 3160 | WARN_ON_ONCE(!tick_work_cpu); | ||
| 3161 | |||
| 3162 | twork = per_cpu_ptr(tick_work_cpu, cpu); | ||
| 3163 | cancel_delayed_work_sync(&twork->work); | ||
| 3164 | } | ||
| 3165 | #endif /* CONFIG_HOTPLUG_CPU */ | ||
| 3166 | |||
| 3167 | int __init sched_tick_offload_init(void) | ||
| 3168 | { | ||
| 3169 | tick_work_cpu = alloc_percpu(struct tick_work); | ||
| 3170 | BUG_ON(!tick_work_cpu); | ||
| 3171 | |||
| 3172 | return 0; | ||
| 3173 | } | ||
| 3174 | |||
| 3175 | #else /* !CONFIG_NO_HZ_FULL */ | ||
| 3176 | static inline void sched_tick_start(int cpu) { } | ||
| 3177 | static inline void sched_tick_stop(int cpu) { } | ||
| 3128 | #endif | 3178 | #endif |
| 3129 | 3179 | ||
| 3130 | #if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \ | 3180 | #if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \ |
| @@ -4892,7 +4942,7 @@ SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len, | |||
| 4892 | * | 4942 | * |
| 4893 | * Return: 0. | 4943 | * Return: 0. |
| 4894 | */ | 4944 | */ |
| 4895 | SYSCALL_DEFINE0(sched_yield) | 4945 | static void do_sched_yield(void) |
| 4896 | { | 4946 | { |
| 4897 | struct rq_flags rf; | 4947 | struct rq_flags rf; |
| 4898 | struct rq *rq; | 4948 | struct rq *rq; |
| @@ -4913,7 +4963,11 @@ SYSCALL_DEFINE0(sched_yield) | |||
| 4913 | sched_preempt_enable_no_resched(); | 4963 | sched_preempt_enable_no_resched(); |
| 4914 | 4964 | ||
| 4915 | schedule(); | 4965 | schedule(); |
| 4966 | } | ||
| 4916 | 4967 | ||
| 4968 | SYSCALL_DEFINE0(sched_yield) | ||
| 4969 | { | ||
| 4970 | do_sched_yield(); | ||
| 4917 | return 0; | 4971 | return 0; |
| 4918 | } | 4972 | } |
| 4919 | 4973 | ||
| @@ -4997,7 +5051,7 @@ EXPORT_SYMBOL(__cond_resched_softirq); | |||
| 4997 | void __sched yield(void) | 5051 | void __sched yield(void) |
| 4998 | { | 5052 | { |
| 4999 | set_current_state(TASK_RUNNING); | 5053 | set_current_state(TASK_RUNNING); |
| 5000 | sys_sched_yield(); | 5054 | do_sched_yield(); |
| 5001 | } | 5055 | } |
| 5002 | EXPORT_SYMBOL(yield); | 5056 | EXPORT_SYMBOL(yield); |
| 5003 | 5057 | ||
| @@ -5786,6 +5840,7 @@ int sched_cpu_starting(unsigned int cpu) | |||
| 5786 | { | 5840 | { |
| 5787 | set_cpu_rq_start_time(cpu); | 5841 | set_cpu_rq_start_time(cpu); |
| 5788 | sched_rq_cpu_starting(cpu); | 5842 | sched_rq_cpu_starting(cpu); |
| 5843 | sched_tick_start(cpu); | ||
| 5789 | return 0; | 5844 | return 0; |
| 5790 | } | 5845 | } |
| 5791 | 5846 | ||
| @@ -5797,6 +5852,7 @@ int sched_cpu_dying(unsigned int cpu) | |||
| 5797 | 5852 | ||
| 5798 | /* Handle pending wakeups and then migrate everything off */ | 5853 | /* Handle pending wakeups and then migrate everything off */ |
| 5799 | sched_ttwu_pending(); | 5854 | sched_ttwu_pending(); |
| 5855 | sched_tick_stop(cpu); | ||
| 5800 | 5856 | ||
| 5801 | rq_lock_irqsave(rq, &rf); | 5857 | rq_lock_irqsave(rq, &rf); |
| 5802 | if (rq->rd) { | 5858 | if (rq->rd) { |
| @@ -5809,7 +5865,7 @@ int sched_cpu_dying(unsigned int cpu) | |||
| 5809 | 5865 | ||
| 5810 | calc_load_migrate(rq); | 5866 | calc_load_migrate(rq); |
| 5811 | update_max_interval(); | 5867 | update_max_interval(); |
| 5812 | nohz_balance_exit_idle(cpu); | 5868 | nohz_balance_exit_idle(rq); |
| 5813 | hrtick_clear(rq); | 5869 | hrtick_clear(rq); |
| 5814 | return 0; | 5870 | return 0; |
| 5815 | } | 5871 | } |
| @@ -6022,13 +6078,11 @@ void __init sched_init(void) | |||
| 6022 | rq_attach_root(rq, &def_root_domain); | 6078 | rq_attach_root(rq, &def_root_domain); |
| 6023 | #ifdef CONFIG_NO_HZ_COMMON | 6079 | #ifdef CONFIG_NO_HZ_COMMON |
| 6024 | rq->last_load_update_tick = jiffies; | 6080 | rq->last_load_update_tick = jiffies; |
| 6025 | rq->nohz_flags = 0; | 6081 | rq->last_blocked_load_update_tick = jiffies; |
| 6026 | #endif | 6082 | atomic_set(&rq->nohz_flags, 0); |
| 6027 | #ifdef CONFIG_NO_HZ_FULL | ||
| 6028 | rq->last_sched_tick = 0; | ||
| 6029 | #endif | 6083 | #endif |
| 6030 | #endif /* CONFIG_SMP */ | 6084 | #endif /* CONFIG_SMP */ |
| 6031 | init_rq_hrtick(rq); | 6085 | hrtick_rq_init(rq); |
| 6032 | atomic_set(&rq->nr_iowait, 0); | 6086 | atomic_set(&rq->nr_iowait, 0); |
| 6033 | } | 6087 | } |
| 6034 | 6088 | ||
| @@ -7027,3 +7081,5 @@ const u32 sched_prio_to_wmult[40] = { | |||
| 7027 | /* 10 */ 39045157, 49367440, 61356676, 76695844, 95443717, | 7081 | /* 10 */ 39045157, 49367440, 61356676, 76695844, 95443717, |
| 7028 | /* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153, | 7082 | /* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153, |
| 7029 | }; | 7083 | }; |
| 7084 | |||
| 7085 | #undef CREATE_TRACE_POINTS | ||
diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c index 44ab32a4fab6..9fbb10383434 100644 --- a/kernel/sched/cpuacct.c +++ b/kernel/sched/cpuacct.c | |||
| @@ -1,24 +1,13 @@ | |||
| 1 | // SPDX-License-Identifier: GPL-2.0 | 1 | // SPDX-License-Identifier: GPL-2.0 |
| 2 | #include <linux/cgroup.h> | ||
| 3 | #include <linux/slab.h> | ||
| 4 | #include <linux/percpu.h> | ||
| 5 | #include <linux/spinlock.h> | ||
| 6 | #include <linux/cpumask.h> | ||
| 7 | #include <linux/seq_file.h> | ||
| 8 | #include <linux/rcupdate.h> | ||
| 9 | #include <linux/kernel_stat.h> | ||
| 10 | #include <linux/err.h> | ||
| 11 | |||
| 12 | #include "sched.h" | ||
| 13 | |||
| 14 | /* | 2 | /* |
| 15 | * CPU accounting code for task groups. | 3 | * CPU accounting code for task groups. |
| 16 | * | 4 | * |
| 17 | * Based on the work by Paul Menage (menage@google.com) and Balbir Singh | 5 | * Based on the work by Paul Menage (menage@google.com) and Balbir Singh |
| 18 | * (balbir@in.ibm.com). | 6 | * (balbir@in.ibm.com). |
| 19 | */ | 7 | */ |
| 8 | #include "sched.h" | ||
| 20 | 9 | ||
| 21 | /* Time spent by the tasks of the cpu accounting group executing in ... */ | 10 | /* Time spent by the tasks of the CPU accounting group executing in ... */ |
| 22 | enum cpuacct_stat_index { | 11 | enum cpuacct_stat_index { |
| 23 | CPUACCT_STAT_USER, /* ... user mode */ | 12 | CPUACCT_STAT_USER, /* ... user mode */ |
| 24 | CPUACCT_STAT_SYSTEM, /* ... kernel mode */ | 13 | CPUACCT_STAT_SYSTEM, /* ... kernel mode */ |
| @@ -35,12 +24,12 @@ struct cpuacct_usage { | |||
| 35 | u64 usages[CPUACCT_STAT_NSTATS]; | 24 | u64 usages[CPUACCT_STAT_NSTATS]; |
| 36 | }; | 25 | }; |
| 37 | 26 | ||
| 38 | /* track cpu usage of a group of tasks and its child groups */ | 27 | /* track CPU usage of a group of tasks and its child groups */ |
| 39 | struct cpuacct { | 28 | struct cpuacct { |
| 40 | struct cgroup_subsys_state css; | 29 | struct cgroup_subsys_state css; |
| 41 | /* cpuusage holds pointer to a u64-type object on every cpu */ | 30 | /* cpuusage holds pointer to a u64-type object on every CPU */ |
| 42 | struct cpuacct_usage __percpu *cpuusage; | 31 | struct cpuacct_usage __percpu *cpuusage; |
| 43 | struct kernel_cpustat __percpu *cpustat; | 32 | struct kernel_cpustat __percpu *cpustat; |
| 44 | }; | 33 | }; |
| 45 | 34 | ||
| 46 | static inline struct cpuacct *css_ca(struct cgroup_subsys_state *css) | 35 | static inline struct cpuacct *css_ca(struct cgroup_subsys_state *css) |
| @@ -48,7 +37,7 @@ static inline struct cpuacct *css_ca(struct cgroup_subsys_state *css) | |||
| 48 | return css ? container_of(css, struct cpuacct, css) : NULL; | 37 | return css ? container_of(css, struct cpuacct, css) : NULL; |
| 49 | } | 38 | } |
| 50 | 39 | ||
| 51 | /* return cpu accounting group to which this task belongs */ | 40 | /* Return CPU accounting group to which this task belongs */ |
| 52 | static inline struct cpuacct *task_ca(struct task_struct *tsk) | 41 | static inline struct cpuacct *task_ca(struct task_struct *tsk) |
| 53 | { | 42 | { |
| 54 | return css_ca(task_css(tsk, cpuacct_cgrp_id)); | 43 | return css_ca(task_css(tsk, cpuacct_cgrp_id)); |
| @@ -65,7 +54,7 @@ static struct cpuacct root_cpuacct = { | |||
| 65 | .cpuusage = &root_cpuacct_cpuusage, | 54 | .cpuusage = &root_cpuacct_cpuusage, |
| 66 | }; | 55 | }; |
| 67 | 56 | ||
| 68 | /* create a new cpu accounting group */ | 57 | /* Create a new CPU accounting group */ |
| 69 | static struct cgroup_subsys_state * | 58 | static struct cgroup_subsys_state * |
| 70 | cpuacct_css_alloc(struct cgroup_subsys_state *parent_css) | 59 | cpuacct_css_alloc(struct cgroup_subsys_state *parent_css) |
| 71 | { | 60 | { |
| @@ -96,7 +85,7 @@ out: | |||
| 96 | return ERR_PTR(-ENOMEM); | 85 | return ERR_PTR(-ENOMEM); |
| 97 | } | 86 | } |
| 98 | 87 | ||
| 99 | /* destroy an existing cpu accounting group */ | 88 | /* Destroy an existing CPU accounting group */ |
| 100 | static void cpuacct_css_free(struct cgroup_subsys_state *css) | 89 | static void cpuacct_css_free(struct cgroup_subsys_state *css) |
| 101 | { | 90 | { |
| 102 | struct cpuacct *ca = css_ca(css); | 91 | struct cpuacct *ca = css_ca(css); |
| @@ -162,7 +151,7 @@ static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val) | |||
| 162 | #endif | 151 | #endif |
| 163 | } | 152 | } |
| 164 | 153 | ||
| 165 | /* return total cpu usage (in nanoseconds) of a group */ | 154 | /* Return total CPU usage (in nanoseconds) of a group */ |
| 166 | static u64 __cpuusage_read(struct cgroup_subsys_state *css, | 155 | static u64 __cpuusage_read(struct cgroup_subsys_state *css, |
| 167 | enum cpuacct_stat_index index) | 156 | enum cpuacct_stat_index index) |
| 168 | { | 157 | { |
diff --git a/kernel/sched/cpudeadline.c b/kernel/sched/cpudeadline.c index 8d9562d890d3..50316455ea66 100644 --- a/kernel/sched/cpudeadline.c +++ b/kernel/sched/cpudeadline.c | |||
| @@ -10,11 +10,7 @@ | |||
| 10 | * as published by the Free Software Foundation; version 2 | 10 | * as published by the Free Software Foundation; version 2 |
| 11 | * of the License. | 11 | * of the License. |
| 12 | */ | 12 | */ |
| 13 | 13 | #include "sched.h" | |
| 14 | #include <linux/gfp.h> | ||
| 15 | #include <linux/kernel.h> | ||
| 16 | #include <linux/slab.h> | ||
| 17 | #include "cpudeadline.h" | ||
| 18 | 14 | ||
| 19 | static inline int parent(int i) | 15 | static inline int parent(int i) |
| 20 | { | 16 | { |
| @@ -42,8 +38,9 @@ static void cpudl_heapify_down(struct cpudl *cp, int idx) | |||
| 42 | return; | 38 | return; |
| 43 | 39 | ||
| 44 | /* adapted from lib/prio_heap.c */ | 40 | /* adapted from lib/prio_heap.c */ |
| 45 | while(1) { | 41 | while (1) { |
| 46 | u64 largest_dl; | 42 | u64 largest_dl; |
| 43 | |||
| 47 | l = left_child(idx); | 44 | l = left_child(idx); |
| 48 | r = right_child(idx); | 45 | r = right_child(idx); |
| 49 | largest = idx; | 46 | largest = idx; |
| @@ -131,6 +128,7 @@ int cpudl_find(struct cpudl *cp, struct task_struct *p, | |||
| 131 | return 1; | 128 | return 1; |
| 132 | } else { | 129 | } else { |
| 133 | int best_cpu = cpudl_maximum(cp); | 130 | int best_cpu = cpudl_maximum(cp); |
| 131 | |||
| 134 | WARN_ON(best_cpu != -1 && !cpu_present(best_cpu)); | 132 | WARN_ON(best_cpu != -1 && !cpu_present(best_cpu)); |
| 135 | 133 | ||
| 136 | if (cpumask_test_cpu(best_cpu, &p->cpus_allowed) && | 134 | if (cpumask_test_cpu(best_cpu, &p->cpus_allowed) && |
| @@ -145,9 +143,9 @@ int cpudl_find(struct cpudl *cp, struct task_struct *p, | |||
| 145 | } | 143 | } |
| 146 | 144 | ||
| 147 | /* | 145 | /* |
| 148 | * cpudl_clear - remove a cpu from the cpudl max-heap | 146 | * cpudl_clear - remove a CPU from the cpudl max-heap |
| 149 | * @cp: the cpudl max-heap context | 147 | * @cp: the cpudl max-heap context |
| 150 | * @cpu: the target cpu | 148 | * @cpu: the target CPU |
| 151 | * | 149 | * |
| 152 | * Notes: assumes cpu_rq(cpu)->lock is locked | 150 | * Notes: assumes cpu_rq(cpu)->lock is locked |
| 153 | * | 151 | * |
| @@ -186,8 +184,8 @@ void cpudl_clear(struct cpudl *cp, int cpu) | |||
| 186 | /* | 184 | /* |
| 187 | * cpudl_set - update the cpudl max-heap | 185 | * cpudl_set - update the cpudl max-heap |
| 188 | * @cp: the cpudl max-heap context | 186 | * @cp: the cpudl max-heap context |
| 189 | * @cpu: the target cpu | 187 | * @cpu: the target CPU |
| 190 | * @dl: the new earliest deadline for this cpu | 188 | * @dl: the new earliest deadline for this CPU |
| 191 | * | 189 | * |
| 192 | * Notes: assumes cpu_rq(cpu)->lock is locked | 190 | * Notes: assumes cpu_rq(cpu)->lock is locked |
| 193 | * | 191 | * |
| @@ -205,6 +203,7 @@ void cpudl_set(struct cpudl *cp, int cpu, u64 dl) | |||
| 205 | old_idx = cp->elements[cpu].idx; | 203 | old_idx = cp->elements[cpu].idx; |
| 206 | if (old_idx == IDX_INVALID) { | 204 | if (old_idx == IDX_INVALID) { |
| 207 | int new_idx = cp->size++; | 205 | int new_idx = cp->size++; |
| 206 | |||
| 208 | cp->elements[new_idx].dl = dl; | 207 | cp->elements[new_idx].dl = dl; |
| 209 | cp->elements[new_idx].cpu = cpu; | 208 | cp->elements[new_idx].cpu = cpu; |
| 210 | cp->elements[cpu].idx = new_idx; | 209 | cp->elements[cpu].idx = new_idx; |
| @@ -221,7 +220,7 @@ void cpudl_set(struct cpudl *cp, int cpu, u64 dl) | |||
| 221 | /* | 220 | /* |
| 222 | * cpudl_set_freecpu - Set the cpudl.free_cpus | 221 | * cpudl_set_freecpu - Set the cpudl.free_cpus |
| 223 | * @cp: the cpudl max-heap context | 222 | * @cp: the cpudl max-heap context |
| 224 | * @cpu: rd attached cpu | 223 | * @cpu: rd attached CPU |
| 225 | */ | 224 | */ |
| 226 | void cpudl_set_freecpu(struct cpudl *cp, int cpu) | 225 | void cpudl_set_freecpu(struct cpudl *cp, int cpu) |
| 227 | { | 226 | { |
| @@ -231,7 +230,7 @@ void cpudl_set_freecpu(struct cpudl *cp, int cpu) | |||
| 231 | /* | 230 | /* |
| 232 | * cpudl_clear_freecpu - Clear the cpudl.free_cpus | 231 | * cpudl_clear_freecpu - Clear the cpudl.free_cpus |
| 233 | * @cp: the cpudl max-heap context | 232 | * @cp: the cpudl max-heap context |
| 234 | * @cpu: rd attached cpu | 233 | * @cpu: rd attached CPU |
| 235 | */ | 234 | */ |
| 236 | void cpudl_clear_freecpu(struct cpudl *cp, int cpu) | 235 | void cpudl_clear_freecpu(struct cpudl *cp, int cpu) |
| 237 | { | 236 | { |
diff --git a/kernel/sched/cpudeadline.h b/kernel/sched/cpudeadline.h index b010d26e108e..0adeda93b5fb 100644 --- a/kernel/sched/cpudeadline.h +++ b/kernel/sched/cpudeadline.h | |||
| @@ -1,35 +1,26 @@ | |||
| 1 | /* SPDX-License-Identifier: GPL-2.0 */ | 1 | /* SPDX-License-Identifier: GPL-2.0 */ |
| 2 | #ifndef _LINUX_CPUDL_H | ||
| 3 | #define _LINUX_CPUDL_H | ||
| 4 | 2 | ||
| 5 | #include <linux/sched.h> | 3 | #define IDX_INVALID -1 |
| 6 | #include <linux/sched/deadline.h> | ||
| 7 | |||
| 8 | #define IDX_INVALID -1 | ||
| 9 | 4 | ||
| 10 | struct cpudl_item { | 5 | struct cpudl_item { |
| 11 | u64 dl; | 6 | u64 dl; |
| 12 | int cpu; | 7 | int cpu; |
| 13 | int idx; | 8 | int idx; |
| 14 | }; | 9 | }; |
| 15 | 10 | ||
| 16 | struct cpudl { | 11 | struct cpudl { |
| 17 | raw_spinlock_t lock; | 12 | raw_spinlock_t lock; |
| 18 | int size; | 13 | int size; |
| 19 | cpumask_var_t free_cpus; | 14 | cpumask_var_t free_cpus; |
| 20 | struct cpudl_item *elements; | 15 | struct cpudl_item *elements; |
| 21 | }; | 16 | }; |
| 22 | 17 | ||
| 23 | |||
| 24 | #ifdef CONFIG_SMP | 18 | #ifdef CONFIG_SMP |
| 25 | int cpudl_find(struct cpudl *cp, struct task_struct *p, | 19 | int cpudl_find(struct cpudl *cp, struct task_struct *p, struct cpumask *later_mask); |
| 26 | struct cpumask *later_mask); | ||
| 27 | void cpudl_set(struct cpudl *cp, int cpu, u64 dl); | 20 | void cpudl_set(struct cpudl *cp, int cpu, u64 dl); |
| 28 | void cpudl_clear(struct cpudl *cp, int cpu); | 21 | void cpudl_clear(struct cpudl *cp, int cpu); |
| 29 | int cpudl_init(struct cpudl *cp); | 22 | int cpudl_init(struct cpudl *cp); |
| 30 | void cpudl_set_freecpu(struct cpudl *cp, int cpu); | 23 | void cpudl_set_freecpu(struct cpudl *cp, int cpu); |
| 31 | void cpudl_clear_freecpu(struct cpudl *cp, int cpu); | 24 | void cpudl_clear_freecpu(struct cpudl *cp, int cpu); |
| 32 | void cpudl_cleanup(struct cpudl *cp); | 25 | void cpudl_cleanup(struct cpudl *cp); |
| 33 | #endif /* CONFIG_SMP */ | 26 | #endif /* CONFIG_SMP */ |
| 34 | |||
| 35 | #endif /* _LINUX_CPUDL_H */ | ||
diff --git a/kernel/sched/cpufreq.c b/kernel/sched/cpufreq.c index dbc51442ecbc..5e54cbcae673 100644 --- a/kernel/sched/cpufreq.c +++ b/kernel/sched/cpufreq.c | |||
| @@ -8,7 +8,6 @@ | |||
| 8 | * it under the terms of the GNU General Public License version 2 as | 8 | * it under the terms of the GNU General Public License version 2 as |
| 9 | * published by the Free Software Foundation. | 9 | * published by the Free Software Foundation. |
| 10 | */ | 10 | */ |
| 11 | |||
| 12 | #include "sched.h" | 11 | #include "sched.h" |
| 13 | 12 | ||
| 14 | DEFINE_PER_CPU(struct update_util_data *, cpufreq_update_util_data); | 13 | DEFINE_PER_CPU(struct update_util_data *, cpufreq_update_util_data); |
diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index 617c6741c525..d2c6083304b4 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c | |||
| @@ -11,61 +11,56 @@ | |||
| 11 | 11 | ||
| 12 | #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt | 12 | #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt |
| 13 | 13 | ||
| 14 | #include <linux/cpufreq.h> | ||
| 15 | #include <linux/kthread.h> | ||
| 16 | #include <uapi/linux/sched/types.h> | ||
| 17 | #include <linux/slab.h> | ||
| 18 | #include <trace/events/power.h> | ||
| 19 | |||
| 20 | #include "sched.h" | 14 | #include "sched.h" |
| 21 | 15 | ||
| 16 | #include <trace/events/power.h> | ||
| 17 | |||
| 22 | struct sugov_tunables { | 18 | struct sugov_tunables { |
| 23 | struct gov_attr_set attr_set; | 19 | struct gov_attr_set attr_set; |
| 24 | unsigned int rate_limit_us; | 20 | unsigned int rate_limit_us; |
| 25 | }; | 21 | }; |
| 26 | 22 | ||
| 27 | struct sugov_policy { | 23 | struct sugov_policy { |
| 28 | struct cpufreq_policy *policy; | 24 | struct cpufreq_policy *policy; |
| 29 | 25 | ||
| 30 | struct sugov_tunables *tunables; | 26 | struct sugov_tunables *tunables; |
| 31 | struct list_head tunables_hook; | 27 | struct list_head tunables_hook; |
| 32 | 28 | ||
| 33 | raw_spinlock_t update_lock; /* For shared policies */ | 29 | raw_spinlock_t update_lock; /* For shared policies */ |
| 34 | u64 last_freq_update_time; | 30 | u64 last_freq_update_time; |
| 35 | s64 freq_update_delay_ns; | 31 | s64 freq_update_delay_ns; |
| 36 | unsigned int next_freq; | 32 | unsigned int next_freq; |
| 37 | unsigned int cached_raw_freq; | 33 | unsigned int cached_raw_freq; |
| 38 | 34 | ||
| 39 | /* The next fields are only needed if fast switch cannot be used. */ | 35 | /* The next fields are only needed if fast switch cannot be used: */ |
| 40 | struct irq_work irq_work; | 36 | struct irq_work irq_work; |
| 41 | struct kthread_work work; | 37 | struct kthread_work work; |
| 42 | struct mutex work_lock; | 38 | struct mutex work_lock; |
| 43 | struct kthread_worker worker; | 39 | struct kthread_worker worker; |
| 44 | struct task_struct *thread; | 40 | struct task_struct *thread; |
| 45 | bool work_in_progress; | 41 | bool work_in_progress; |
| 46 | 42 | ||
| 47 | bool need_freq_update; | 43 | bool need_freq_update; |
| 48 | }; | 44 | }; |
| 49 | 45 | ||
| 50 | struct sugov_cpu { | 46 | struct sugov_cpu { |
| 51 | struct update_util_data update_util; | 47 | struct update_util_data update_util; |
| 52 | struct sugov_policy *sg_policy; | 48 | struct sugov_policy *sg_policy; |
| 53 | unsigned int cpu; | 49 | unsigned int cpu; |
| 54 | 50 | ||
| 55 | bool iowait_boost_pending; | 51 | bool iowait_boost_pending; |
| 56 | unsigned int iowait_boost; | 52 | unsigned int iowait_boost; |
| 57 | unsigned int iowait_boost_max; | 53 | unsigned int iowait_boost_max; |
| 58 | u64 last_update; | 54 | u64 last_update; |
| 59 | 55 | ||
| 60 | /* The fields below are only needed when sharing a policy. */ | 56 | /* The fields below are only needed when sharing a policy: */ |
| 61 | unsigned long util_cfs; | 57 | unsigned long util_cfs; |
| 62 | unsigned long util_dl; | 58 | unsigned long util_dl; |
| 63 | unsigned long max; | 59 | unsigned long max; |
| 64 | unsigned int flags; | ||
| 65 | 60 | ||
| 66 | /* The field below is for single-CPU policies only. */ | 61 | /* The field below is for single-CPU policies only: */ |
| 67 | #ifdef CONFIG_NO_HZ_COMMON | 62 | #ifdef CONFIG_NO_HZ_COMMON |
| 68 | unsigned long saved_idle_calls; | 63 | unsigned long saved_idle_calls; |
| 69 | #endif | 64 | #endif |
| 70 | }; | 65 | }; |
| 71 | 66 | ||
| @@ -79,9 +74,9 @@ static bool sugov_should_update_freq(struct sugov_policy *sg_policy, u64 time) | |||
| 79 | 74 | ||
| 80 | /* | 75 | /* |
| 81 | * Since cpufreq_update_util() is called with rq->lock held for | 76 | * Since cpufreq_update_util() is called with rq->lock held for |
| 82 | * the @target_cpu, our per-cpu data is fully serialized. | 77 | * the @target_cpu, our per-CPU data is fully serialized. |
| 83 | * | 78 | * |
| 84 | * However, drivers cannot in general deal with cross-cpu | 79 | * However, drivers cannot in general deal with cross-CPU |
| 85 | * requests, so while get_next_freq() will work, our | 80 | * requests, so while get_next_freq() will work, our |
| 86 | * sugov_update_commit() call may not for the fast switching platforms. | 81 | * sugov_update_commit() call may not for the fast switching platforms. |
| 87 | * | 82 | * |
| @@ -111,6 +106,7 @@ static bool sugov_should_update_freq(struct sugov_policy *sg_policy, u64 time) | |||
| 111 | } | 106 | } |
| 112 | 107 | ||
| 113 | delta_ns = time - sg_policy->last_freq_update_time; | 108 | delta_ns = time - sg_policy->last_freq_update_time; |
| 109 | |||
| 114 | return delta_ns >= sg_policy->freq_update_delay_ns; | 110 | return delta_ns >= sg_policy->freq_update_delay_ns; |
| 115 | } | 111 | } |
| 116 | 112 | ||
| @@ -186,17 +182,28 @@ static void sugov_get_util(struct sugov_cpu *sg_cpu) | |||
| 186 | 182 | ||
| 187 | static unsigned long sugov_aggregate_util(struct sugov_cpu *sg_cpu) | 183 | static unsigned long sugov_aggregate_util(struct sugov_cpu *sg_cpu) |
| 188 | { | 184 | { |
| 185 | struct rq *rq = cpu_rq(sg_cpu->cpu); | ||
| 186 | unsigned long util; | ||
| 187 | |||
| 188 | if (rq->rt.rt_nr_running) { | ||
| 189 | util = sg_cpu->max; | ||
| 190 | } else { | ||
| 191 | util = sg_cpu->util_dl; | ||
| 192 | if (rq->cfs.h_nr_running) | ||
| 193 | util += sg_cpu->util_cfs; | ||
| 194 | } | ||
| 195 | |||
| 189 | /* | 196 | /* |
| 190 | * Ideally we would like to set util_dl as min/guaranteed freq and | 197 | * Ideally we would like to set util_dl as min/guaranteed freq and |
| 191 | * util_cfs + util_dl as requested freq. However, cpufreq is not yet | 198 | * util_cfs + util_dl as requested freq. However, cpufreq is not yet |
| 192 | * ready for such an interface. So, we only do the latter for now. | 199 | * ready for such an interface. So, we only do the latter for now. |
| 193 | */ | 200 | */ |
| 194 | return min(sg_cpu->util_cfs + sg_cpu->util_dl, sg_cpu->max); | 201 | return min(util, sg_cpu->max); |
| 195 | } | 202 | } |
| 196 | 203 | ||
| 197 | static void sugov_set_iowait_boost(struct sugov_cpu *sg_cpu, u64 time) | 204 | static void sugov_set_iowait_boost(struct sugov_cpu *sg_cpu, u64 time, unsigned int flags) |
| 198 | { | 205 | { |
| 199 | if (sg_cpu->flags & SCHED_CPUFREQ_IOWAIT) { | 206 | if (flags & SCHED_CPUFREQ_IOWAIT) { |
| 200 | if (sg_cpu->iowait_boost_pending) | 207 | if (sg_cpu->iowait_boost_pending) |
| 201 | return; | 208 | return; |
| 202 | 209 | ||
| @@ -260,43 +267,51 @@ static bool sugov_cpu_is_busy(struct sugov_cpu *sg_cpu) | |||
| 260 | static inline bool sugov_cpu_is_busy(struct sugov_cpu *sg_cpu) { return false; } | 267 | static inline bool sugov_cpu_is_busy(struct sugov_cpu *sg_cpu) { return false; } |
| 261 | #endif /* CONFIG_NO_HZ_COMMON */ | 268 | #endif /* CONFIG_NO_HZ_COMMON */ |
| 262 | 269 | ||
| 270 | /* | ||
| 271 | * Make sugov_should_update_freq() ignore the rate limit when DL | ||
| 272 | * has increased the utilization. | ||
| 273 | */ | ||
| 274 | static inline void ignore_dl_rate_limit(struct sugov_cpu *sg_cpu, struct sugov_policy *sg_policy) | ||
| 275 | { | ||
| 276 | if (cpu_util_dl(cpu_rq(sg_cpu->cpu)) > sg_cpu->util_dl) | ||
| 277 | sg_policy->need_freq_update = true; | ||
| 278 | } | ||
| 279 | |||
| 263 | static void sugov_update_single(struct update_util_data *hook, u64 time, | 280 | static void sugov_update_single(struct update_util_data *hook, u64 time, |
| 264 | unsigned int flags) | 281 | unsigned int flags) |
| 265 | { | 282 | { |
| 266 | struct sugov_cpu *sg_cpu = container_of(hook, struct sugov_cpu, update_util); | 283 | struct sugov_cpu *sg_cpu = container_of(hook, struct sugov_cpu, update_util); |
| 267 | struct sugov_policy *sg_policy = sg_cpu->sg_policy; | 284 | struct sugov_policy *sg_policy = sg_cpu->sg_policy; |
| 268 | struct cpufreq_policy *policy = sg_policy->policy; | ||
| 269 | unsigned long util, max; | 285 | unsigned long util, max; |
| 270 | unsigned int next_f; | 286 | unsigned int next_f; |
| 271 | bool busy; | 287 | bool busy; |
| 272 | 288 | ||
| 273 | sugov_set_iowait_boost(sg_cpu, time); | 289 | sugov_set_iowait_boost(sg_cpu, time, flags); |
| 274 | sg_cpu->last_update = time; | 290 | sg_cpu->last_update = time; |
| 275 | 291 | ||
| 292 | ignore_dl_rate_limit(sg_cpu, sg_policy); | ||
| 293 | |||
| 276 | if (!sugov_should_update_freq(sg_policy, time)) | 294 | if (!sugov_should_update_freq(sg_policy, time)) |
| 277 | return; | 295 | return; |
| 278 | 296 | ||
| 279 | busy = sugov_cpu_is_busy(sg_cpu); | 297 | busy = sugov_cpu_is_busy(sg_cpu); |
| 280 | 298 | ||
| 281 | if (flags & SCHED_CPUFREQ_RT) { | 299 | sugov_get_util(sg_cpu); |
| 282 | next_f = policy->cpuinfo.max_freq; | 300 | max = sg_cpu->max; |
| 283 | } else { | 301 | util = sugov_aggregate_util(sg_cpu); |
| 284 | sugov_get_util(sg_cpu); | 302 | sugov_iowait_boost(sg_cpu, &util, &max); |
| 285 | max = sg_cpu->max; | 303 | next_f = get_next_freq(sg_policy, util, max); |
| 286 | util = sugov_aggregate_util(sg_cpu); | 304 | /* |
| 287 | sugov_iowait_boost(sg_cpu, &util, &max); | 305 | * Do not reduce the frequency if the CPU has not been idle |
| 288 | next_f = get_next_freq(sg_policy, util, max); | 306 | * recently, as the reduction is likely to be premature then. |
| 289 | /* | 307 | */ |
| 290 | * Do not reduce the frequency if the CPU has not been idle | 308 | if (busy && next_f < sg_policy->next_freq) { |
| 291 | * recently, as the reduction is likely to be premature then. | 309 | next_f = sg_policy->next_freq; |
| 292 | */ | ||
| 293 | if (busy && next_f < sg_policy->next_freq) { | ||
| 294 | next_f = sg_policy->next_freq; | ||
| 295 | 310 | ||
| 296 | /* Reset cached freq as next_freq has changed */ | 311 | /* Reset cached freq as next_freq has changed */ |
| 297 | sg_policy->cached_raw_freq = 0; | 312 | sg_policy->cached_raw_freq = 0; |
| 298 | } | ||
| 299 | } | 313 | } |
| 314 | |||
| 300 | sugov_update_commit(sg_policy, time, next_f); | 315 | sugov_update_commit(sg_policy, time, next_f); |
| 301 | } | 316 | } |
| 302 | 317 | ||
| @@ -312,6 +327,8 @@ static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu, u64 time) | |||
| 312 | unsigned long j_util, j_max; | 327 | unsigned long j_util, j_max; |
| 313 | s64 delta_ns; | 328 | s64 delta_ns; |
| 314 | 329 | ||
| 330 | sugov_get_util(j_sg_cpu); | ||
| 331 | |||
| 315 | /* | 332 | /* |
| 316 | * If the CFS CPU utilization was last updated before the | 333 | * If the CFS CPU utilization was last updated before the |
| 317 | * previous frequency update and the time elapsed between the | 334 | * previous frequency update and the time elapsed between the |
| @@ -325,28 +342,22 @@ static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu, u64 time) | |||
| 325 | if (delta_ns > TICK_NSEC) { | 342 | if (delta_ns > TICK_NSEC) { |
| 326 | j_sg_cpu->iowait_boost = 0; | 343 | j_sg_cpu->iowait_boost = 0; |
| 327 | j_sg_cpu->iowait_boost_pending = false; | 344 | j_sg_cpu->iowait_boost_pending = false; |
| 328 | j_sg_cpu->util_cfs = 0; | ||
| 329 | if (j_sg_cpu->util_dl == 0) | ||
| 330 | continue; | ||
| 331 | } | 345 | } |
| 332 | if (j_sg_cpu->flags & SCHED_CPUFREQ_RT) | ||
| 333 | return policy->cpuinfo.max_freq; | ||
| 334 | 346 | ||
| 335 | j_max = j_sg_cpu->max; | 347 | j_max = j_sg_cpu->max; |
| 336 | j_util = sugov_aggregate_util(j_sg_cpu); | 348 | j_util = sugov_aggregate_util(j_sg_cpu); |
| 349 | sugov_iowait_boost(j_sg_cpu, &j_util, &j_max); | ||
| 337 | if (j_util * max > j_max * util) { | 350 | if (j_util * max > j_max * util) { |
| 338 | util = j_util; | 351 | util = j_util; |
| 339 | max = j_max; | 352 | max = j_max; |
| 340 | } | 353 | } |
| 341 | |||
| 342 | sugov_iowait_boost(j_sg_cpu, &util, &max); | ||
| 343 | } | 354 | } |
| 344 | 355 | ||
| 345 | return get_next_freq(sg_policy, util, max); | 356 | return get_next_freq(sg_policy, util, max); |
| 346 | } | 357 | } |
| 347 | 358 | ||
| 348 | static void sugov_update_shared(struct update_util_data *hook, u64 time, | 359 | static void |
| 349 | unsigned int flags) | 360 | sugov_update_shared(struct update_util_data *hook, u64 time, unsigned int flags) |
| 350 | { | 361 | { |
| 351 | struct sugov_cpu *sg_cpu = container_of(hook, struct sugov_cpu, update_util); | 362 | struct sugov_cpu *sg_cpu = container_of(hook, struct sugov_cpu, update_util); |
| 352 | struct sugov_policy *sg_policy = sg_cpu->sg_policy; | 363 | struct sugov_policy *sg_policy = sg_cpu->sg_policy; |
| @@ -354,18 +365,13 @@ static void sugov_update_shared(struct update_util_data *hook, u64 time, | |||
| 354 | 365 | ||
| 355 | raw_spin_lock(&sg_policy->update_lock); | 366 | raw_spin_lock(&sg_policy->update_lock); |
| 356 | 367 | ||
| 357 | sugov_get_util(sg_cpu); | 368 | sugov_set_iowait_boost(sg_cpu, time, flags); |
| 358 | sg_cpu->flags = flags; | ||
| 359 | |||
| 360 | sugov_set_iowait_boost(sg_cpu, time); | ||
| 361 | sg_cpu->last_update = time; | 369 | sg_cpu->last_update = time; |
| 362 | 370 | ||
| 363 | if (sugov_should_update_freq(sg_policy, time)) { | 371 | ignore_dl_rate_limit(sg_cpu, sg_policy); |
| 364 | if (flags & SCHED_CPUFREQ_RT) | ||
| 365 | next_f = sg_policy->policy->cpuinfo.max_freq; | ||
| 366 | else | ||
| 367 | next_f = sugov_next_freq_shared(sg_cpu, time); | ||
| 368 | 372 | ||
| 373 | if (sugov_should_update_freq(sg_policy, time)) { | ||
| 374 | next_f = sugov_next_freq_shared(sg_cpu, time); | ||
| 369 | sugov_update_commit(sg_policy, time, next_f); | 375 | sugov_update_commit(sg_policy, time, next_f); |
| 370 | } | 376 | } |
| 371 | 377 | ||
| @@ -423,8 +429,8 @@ static ssize_t rate_limit_us_show(struct gov_attr_set *attr_set, char *buf) | |||
| 423 | return sprintf(buf, "%u\n", tunables->rate_limit_us); | 429 | return sprintf(buf, "%u\n", tunables->rate_limit_us); |
| 424 | } | 430 | } |
| 425 | 431 | ||
| 426 | static ssize_t rate_limit_us_store(struct gov_attr_set *attr_set, const char *buf, | 432 | static ssize_t |
| 427 | size_t count) | 433 | rate_limit_us_store(struct gov_attr_set *attr_set, const char *buf, size_t count) |
| 428 | { | 434 | { |
| 429 | struct sugov_tunables *tunables = to_sugov_tunables(attr_set); | 435 | struct sugov_tunables *tunables = to_sugov_tunables(attr_set); |
| 430 | struct sugov_policy *sg_policy; | 436 | struct sugov_policy *sg_policy; |
| @@ -479,11 +485,11 @@ static int sugov_kthread_create(struct sugov_policy *sg_policy) | |||
| 479 | { | 485 | { |
| 480 | struct task_struct *thread; | 486 | struct task_struct *thread; |
| 481 | struct sched_attr attr = { | 487 | struct sched_attr attr = { |
| 482 | .size = sizeof(struct sched_attr), | 488 | .size = sizeof(struct sched_attr), |
| 483 | .sched_policy = SCHED_DEADLINE, | 489 | .sched_policy = SCHED_DEADLINE, |
| 484 | .sched_flags = SCHED_FLAG_SUGOV, | 490 | .sched_flags = SCHED_FLAG_SUGOV, |
| 485 | .sched_nice = 0, | 491 | .sched_nice = 0, |
| 486 | .sched_priority = 0, | 492 | .sched_priority = 0, |
| 487 | /* | 493 | /* |
| 488 | * Fake (unused) bandwidth; workaround to "fix" | 494 | * Fake (unused) bandwidth; workaround to "fix" |
| 489 | * priority inheritance. | 495 | * priority inheritance. |
| @@ -662,21 +668,20 @@ static int sugov_start(struct cpufreq_policy *policy) | |||
| 662 | struct sugov_policy *sg_policy = policy->governor_data; | 668 | struct sugov_policy *sg_policy = policy->governor_data; |
| 663 | unsigned int cpu; | 669 | unsigned int cpu; |
| 664 | 670 | ||
| 665 | sg_policy->freq_update_delay_ns = sg_policy->tunables->rate_limit_us * NSEC_PER_USEC; | 671 | sg_policy->freq_update_delay_ns = sg_policy->tunables->rate_limit_us * NSEC_PER_USEC; |
| 666 | sg_policy->last_freq_update_time = 0; | 672 | sg_policy->last_freq_update_time = 0; |
| 667 | sg_policy->next_freq = UINT_MAX; | 673 | sg_policy->next_freq = UINT_MAX; |
| 668 | sg_policy->work_in_progress = false; | 674 | sg_policy->work_in_progress = false; |
| 669 | sg_policy->need_freq_update = false; | 675 | sg_policy->need_freq_update = false; |
| 670 | sg_policy->cached_raw_freq = 0; | 676 | sg_policy->cached_raw_freq = 0; |
| 671 | 677 | ||
| 672 | for_each_cpu(cpu, policy->cpus) { | 678 | for_each_cpu(cpu, policy->cpus) { |
| 673 | struct sugov_cpu *sg_cpu = &per_cpu(sugov_cpu, cpu); | 679 | struct sugov_cpu *sg_cpu = &per_cpu(sugov_cpu, cpu); |
| 674 | 680 | ||
| 675 | memset(sg_cpu, 0, sizeof(*sg_cpu)); | 681 | memset(sg_cpu, 0, sizeof(*sg_cpu)); |
| 676 | sg_cpu->cpu = cpu; | 682 | sg_cpu->cpu = cpu; |
| 677 | sg_cpu->sg_policy = sg_policy; | 683 | sg_cpu->sg_policy = sg_policy; |
| 678 | sg_cpu->flags = 0; | 684 | sg_cpu->iowait_boost_max = policy->cpuinfo.max_freq; |
| 679 | sg_cpu->iowait_boost_max = policy->cpuinfo.max_freq; | ||
| 680 | } | 685 | } |
| 681 | 686 | ||
| 682 | for_each_cpu(cpu, policy->cpus) { | 687 | for_each_cpu(cpu, policy->cpus) { |
| @@ -720,14 +725,14 @@ static void sugov_limits(struct cpufreq_policy *policy) | |||
| 720 | } | 725 | } |
| 721 | 726 | ||
| 722 | static struct cpufreq_governor schedutil_gov = { | 727 | static struct cpufreq_governor schedutil_gov = { |
| 723 | .name = "schedutil", | 728 | .name = "schedutil", |
| 724 | .owner = THIS_MODULE, | 729 | .owner = THIS_MODULE, |
| 725 | .dynamic_switching = true, | 730 | .dynamic_switching = true, |
| 726 | .init = sugov_init, | 731 | .init = sugov_init, |
| 727 | .exit = sugov_exit, | 732 | .exit = sugov_exit, |
| 728 | .start = sugov_start, | 733 | .start = sugov_start, |
| 729 | .stop = sugov_stop, | 734 | .stop = sugov_stop, |
| 730 | .limits = sugov_limits, | 735 | .limits = sugov_limits, |
| 731 | }; | 736 | }; |
| 732 | 737 | ||
| 733 | #ifdef CONFIG_CPU_FREQ_DEFAULT_GOV_SCHEDUTIL | 738 | #ifdef CONFIG_CPU_FREQ_DEFAULT_GOV_SCHEDUTIL |
diff --git a/kernel/sched/cpupri.c b/kernel/sched/cpupri.c index 2511aba36b89..daaadf939ccb 100644 --- a/kernel/sched/cpupri.c +++ b/kernel/sched/cpupri.c | |||
| @@ -14,7 +14,7 @@ | |||
| 14 | * | 14 | * |
| 15 | * going from the lowest priority to the highest. CPUs in the INVALID state | 15 | * going from the lowest priority to the highest. CPUs in the INVALID state |
| 16 | * are not eligible for routing. The system maintains this state with | 16 | * are not eligible for routing. The system maintains this state with |
| 17 | * a 2 dimensional bitmap (the first for priority class, the second for cpus | 17 | * a 2 dimensional bitmap (the first for priority class, the second for CPUs |
| 18 | * in that class). Therefore a typical application without affinity | 18 | * in that class). Therefore a typical application without affinity |
| 19 | * restrictions can find a suitable CPU with O(1) complexity (e.g. two bit | 19 | * restrictions can find a suitable CPU with O(1) complexity (e.g. two bit |
| 20 | * searches). For tasks with affinity restrictions, the algorithm has a | 20 | * searches). For tasks with affinity restrictions, the algorithm has a |
| @@ -26,12 +26,7 @@ | |||
| 26 | * as published by the Free Software Foundation; version 2 | 26 | * as published by the Free Software Foundation; version 2 |
| 27 | * of the License. | 27 | * of the License. |
| 28 | */ | 28 | */ |
| 29 | 29 | #include "sched.h" | |
| 30 | #include <linux/gfp.h> | ||
| 31 | #include <linux/sched.h> | ||
| 32 | #include <linux/sched/rt.h> | ||
| 33 | #include <linux/slab.h> | ||
| 34 | #include "cpupri.h" | ||
| 35 | 30 | ||
| 36 | /* Convert between a 140 based task->prio, and our 102 based cpupri */ | 31 | /* Convert between a 140 based task->prio, and our 102 based cpupri */ |
| 37 | static int convert_prio(int prio) | 32 | static int convert_prio(int prio) |
| @@ -128,9 +123,9 @@ int cpupri_find(struct cpupri *cp, struct task_struct *p, | |||
| 128 | } | 123 | } |
| 129 | 124 | ||
| 130 | /** | 125 | /** |
| 131 | * cpupri_set - update the cpu priority setting | 126 | * cpupri_set - update the CPU priority setting |
| 132 | * @cp: The cpupri context | 127 | * @cp: The cpupri context |
| 133 | * @cpu: The target cpu | 128 | * @cpu: The target CPU |
| 134 | * @newpri: The priority (INVALID-RT99) to assign to this CPU | 129 | * @newpri: The priority (INVALID-RT99) to assign to this CPU |
| 135 | * | 130 | * |
| 136 | * Note: Assumes cpu_rq(cpu)->lock is locked | 131 | * Note: Assumes cpu_rq(cpu)->lock is locked |
| @@ -151,7 +146,7 @@ void cpupri_set(struct cpupri *cp, int cpu, int newpri) | |||
| 151 | return; | 146 | return; |
| 152 | 147 | ||
| 153 | /* | 148 | /* |
| 154 | * If the cpu was currently mapped to a different value, we | 149 | * If the CPU was currently mapped to a different value, we |
| 155 | * need to map it to the new value then remove the old value. | 150 | * need to map it to the new value then remove the old value. |
| 156 | * Note, we must add the new value first, otherwise we risk the | 151 | * Note, we must add the new value first, otherwise we risk the |
| 157 | * cpu being missed by the priority loop in cpupri_find. | 152 | * cpu being missed by the priority loop in cpupri_find. |
diff --git a/kernel/sched/cpupri.h b/kernel/sched/cpupri.h index bab050019071..7dc20a3232e7 100644 --- a/kernel/sched/cpupri.h +++ b/kernel/sched/cpupri.h | |||
| @@ -1,32 +1,25 @@ | |||
| 1 | /* SPDX-License-Identifier: GPL-2.0 */ | 1 | /* SPDX-License-Identifier: GPL-2.0 */ |
| 2 | #ifndef _LINUX_CPUPRI_H | ||
| 3 | #define _LINUX_CPUPRI_H | ||
| 4 | |||
| 5 | #include <linux/sched.h> | ||
| 6 | 2 | ||
| 7 | #define CPUPRI_NR_PRIORITIES (MAX_RT_PRIO + 2) | 3 | #define CPUPRI_NR_PRIORITIES (MAX_RT_PRIO + 2) |
| 8 | 4 | ||
| 9 | #define CPUPRI_INVALID -1 | 5 | #define CPUPRI_INVALID -1 |
| 10 | #define CPUPRI_IDLE 0 | 6 | #define CPUPRI_IDLE 0 |
| 11 | #define CPUPRI_NORMAL 1 | 7 | #define CPUPRI_NORMAL 1 |
| 12 | /* values 2-101 are RT priorities 0-99 */ | 8 | /* values 2-101 are RT priorities 0-99 */ |
| 13 | 9 | ||
| 14 | struct cpupri_vec { | 10 | struct cpupri_vec { |
| 15 | atomic_t count; | 11 | atomic_t count; |
| 16 | cpumask_var_t mask; | 12 | cpumask_var_t mask; |
| 17 | }; | 13 | }; |
| 18 | 14 | ||
| 19 | struct cpupri { | 15 | struct cpupri { |
| 20 | struct cpupri_vec pri_to_cpu[CPUPRI_NR_PRIORITIES]; | 16 | struct cpupri_vec pri_to_cpu[CPUPRI_NR_PRIORITIES]; |
| 21 | int *cpu_to_pri; | 17 | int *cpu_to_pri; |
| 22 | }; | 18 | }; |
| 23 | 19 | ||
| 24 | #ifdef CONFIG_SMP | 20 | #ifdef CONFIG_SMP |
| 25 | int cpupri_find(struct cpupri *cp, | 21 | int cpupri_find(struct cpupri *cp, struct task_struct *p, struct cpumask *lowest_mask); |
| 26 | struct task_struct *p, struct cpumask *lowest_mask); | ||
| 27 | void cpupri_set(struct cpupri *cp, int cpu, int pri); | 22 | void cpupri_set(struct cpupri *cp, int cpu, int pri); |
| 28 | int cpupri_init(struct cpupri *cp); | 23 | int cpupri_init(struct cpupri *cp); |
| 29 | void cpupri_cleanup(struct cpupri *cp); | 24 | void cpupri_cleanup(struct cpupri *cp); |
| 30 | #endif | 25 | #endif |
| 31 | |||
| 32 | #endif /* _LINUX_CPUPRI_H */ | ||
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index bac6ac9a4ec7..0796f938c4f0 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c | |||
| @@ -1,10 +1,6 @@ | |||
| 1 | #include <linux/export.h> | 1 | /* |
| 2 | #include <linux/sched.h> | 2 | * Simple CPU accounting cgroup controller |
| 3 | #include <linux/tsacct_kern.h> | 3 | */ |
| 4 | #include <linux/kernel_stat.h> | ||
| 5 | #include <linux/static_key.h> | ||
| 6 | #include <linux/context_tracking.h> | ||
| 7 | #include <linux/sched/cputime.h> | ||
| 8 | #include "sched.h" | 4 | #include "sched.h" |
| 9 | 5 | ||
| 10 | #ifdef CONFIG_IRQ_TIME_ACCOUNTING | 6 | #ifdef CONFIG_IRQ_TIME_ACCOUNTING |
| @@ -113,9 +109,9 @@ static inline void task_group_account_field(struct task_struct *p, int index, | |||
| 113 | } | 109 | } |
| 114 | 110 | ||
| 115 | /* | 111 | /* |
| 116 | * Account user cpu time to a process. | 112 | * Account user CPU time to a process. |
| 117 | * @p: the process that the cpu time gets accounted to | 113 | * @p: the process that the CPU time gets accounted to |
| 118 | * @cputime: the cpu time spent in user space since the last update | 114 | * @cputime: the CPU time spent in user space since the last update |
| 119 | */ | 115 | */ |
| 120 | void account_user_time(struct task_struct *p, u64 cputime) | 116 | void account_user_time(struct task_struct *p, u64 cputime) |
| 121 | { | 117 | { |
| @@ -135,9 +131,9 @@ void account_user_time(struct task_struct *p, u64 cputime) | |||
| 135 | } | 131 | } |
| 136 | 132 | ||
| 137 | /* | 133 | /* |
| 138 | * Account guest cpu time to a process. | 134 | * Account guest CPU time to a process. |
| 139 | * @p: the process that the cpu time gets accounted to | 135 | * @p: the process that the CPU time gets accounted to |
| 140 | * @cputime: the cpu time spent in virtual machine since the last update | 136 | * @cputime: the CPU time spent in virtual machine since the last update |
| 141 | */ | 137 | */ |
| 142 | void account_guest_time(struct task_struct *p, u64 cputime) | 138 | void account_guest_time(struct task_struct *p, u64 cputime) |
| 143 | { | 139 | { |
| @@ -159,9 +155,9 @@ void account_guest_time(struct task_struct *p, u64 cputime) | |||
| 159 | } | 155 | } |
| 160 | 156 | ||
| 161 | /* | 157 | /* |
| 162 | * Account system cpu time to a process and desired cpustat field | 158 | * Account system CPU time to a process and desired cpustat field |
| 163 | * @p: the process that the cpu time gets accounted to | 159 | * @p: the process that the CPU time gets accounted to |
| 164 | * @cputime: the cpu time spent in kernel space since the last update | 160 | * @cputime: the CPU time spent in kernel space since the last update |
| 165 | * @index: pointer to cpustat field that has to be updated | 161 | * @index: pointer to cpustat field that has to be updated |
| 166 | */ | 162 | */ |
| 167 | void account_system_index_time(struct task_struct *p, | 163 | void account_system_index_time(struct task_struct *p, |
| @@ -179,10 +175,10 @@ void account_system_index_time(struct task_struct *p, | |||
| 179 | } | 175 | } |
| 180 | 176 | ||
| 181 | /* | 177 | /* |
| 182 | * Account system cpu time to a process. | 178 | * Account system CPU time to a process. |
| 183 | * @p: the process that the cpu time gets accounted to | 179 | * @p: the process that the CPU time gets accounted to |
| 184 | * @hardirq_offset: the offset to subtract from hardirq_count() | 180 | * @hardirq_offset: the offset to subtract from hardirq_count() |
| 185 | * @cputime: the cpu time spent in kernel space since the last update | 181 | * @cputime: the CPU time spent in kernel space since the last update |
| 186 | */ | 182 | */ |
| 187 | void account_system_time(struct task_struct *p, int hardirq_offset, u64 cputime) | 183 | void account_system_time(struct task_struct *p, int hardirq_offset, u64 cputime) |
| 188 | { | 184 | { |
| @@ -205,7 +201,7 @@ void account_system_time(struct task_struct *p, int hardirq_offset, u64 cputime) | |||
| 205 | 201 | ||
| 206 | /* | 202 | /* |
| 207 | * Account for involuntary wait time. | 203 | * Account for involuntary wait time. |
| 208 | * @cputime: the cpu time spent in involuntary wait | 204 | * @cputime: the CPU time spent in involuntary wait |
| 209 | */ | 205 | */ |
| 210 | void account_steal_time(u64 cputime) | 206 | void account_steal_time(u64 cputime) |
| 211 | { | 207 | { |
| @@ -216,7 +212,7 @@ void account_steal_time(u64 cputime) | |||
| 216 | 212 | ||
| 217 | /* | 213 | /* |
| 218 | * Account for idle time. | 214 | * Account for idle time. |
| 219 | * @cputime: the cpu time spent in idle wait | 215 | * @cputime: the CPU time spent in idle wait |
| 220 | */ | 216 | */ |
| 221 | void account_idle_time(u64 cputime) | 217 | void account_idle_time(u64 cputime) |
| 222 | { | 218 | { |
| @@ -338,7 +334,7 @@ void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times) | |||
| 338 | #ifdef CONFIG_IRQ_TIME_ACCOUNTING | 334 | #ifdef CONFIG_IRQ_TIME_ACCOUNTING |
| 339 | /* | 335 | /* |
| 340 | * Account a tick to a process and cpustat | 336 | * Account a tick to a process and cpustat |
| 341 | * @p: the process that the cpu time gets accounted to | 337 | * @p: the process that the CPU time gets accounted to |
| 342 | * @user_tick: is the tick from userspace | 338 | * @user_tick: is the tick from userspace |
| 343 | * @rq: the pointer to rq | 339 | * @rq: the pointer to rq |
| 344 | * | 340 | * |
| @@ -400,17 +396,16 @@ static void irqtime_account_idle_ticks(int ticks) | |||
| 400 | irqtime_account_process_tick(current, 0, rq, ticks); | 396 | irqtime_account_process_tick(current, 0, rq, ticks); |
| 401 | } | 397 | } |
| 402 | #else /* CONFIG_IRQ_TIME_ACCOUNTING */ | 398 | #else /* CONFIG_IRQ_TIME_ACCOUNTING */ |
| 403 | static inline void irqtime_account_idle_ticks(int ticks) {} | 399 | static inline void irqtime_account_idle_ticks(int ticks) { } |
| 404 | static inline void irqtime_account_process_tick(struct task_struct *p, int user_tick, | 400 | static inline void irqtime_account_process_tick(struct task_struct *p, int user_tick, |
| 405 | struct rq *rq, int nr_ticks) {} | 401 | struct rq *rq, int nr_ticks) { } |
| 406 | #endif /* CONFIG_IRQ_TIME_ACCOUNTING */ | 402 | #endif /* CONFIG_IRQ_TIME_ACCOUNTING */ |
| 407 | 403 | ||
| 408 | /* | 404 | /* |
| 409 | * Use precise platform statistics if available: | 405 | * Use precise platform statistics if available: |
| 410 | */ | 406 | */ |
| 411 | #ifdef CONFIG_VIRT_CPU_ACCOUNTING | 407 | #ifdef CONFIG_VIRT_CPU_ACCOUNTING |
| 412 | 408 | # ifndef __ARCH_HAS_VTIME_TASK_SWITCH | |
| 413 | #ifndef __ARCH_HAS_VTIME_TASK_SWITCH | ||
| 414 | void vtime_common_task_switch(struct task_struct *prev) | 409 | void vtime_common_task_switch(struct task_struct *prev) |
| 415 | { | 410 | { |
| 416 | if (is_idle_task(prev)) | 411 | if (is_idle_task(prev)) |
| @@ -421,8 +416,7 @@ void vtime_common_task_switch(struct task_struct *prev) | |||
| 421 | vtime_flush(prev); | 416 | vtime_flush(prev); |
| 422 | arch_vtime_task_switch(prev); | 417 | arch_vtime_task_switch(prev); |
| 423 | } | 418 | } |
| 424 | #endif | 419 | # endif |
| 425 | |||
| 426 | #endif /* CONFIG_VIRT_CPU_ACCOUNTING */ | 420 | #endif /* CONFIG_VIRT_CPU_ACCOUNTING */ |
| 427 | 421 | ||
| 428 | 422 | ||
| @@ -469,10 +463,12 @@ void thread_group_cputime_adjusted(struct task_struct *p, u64 *ut, u64 *st) | |||
| 469 | *ut = cputime.utime; | 463 | *ut = cputime.utime; |
| 470 | *st = cputime.stime; | 464 | *st = cputime.stime; |
| 471 | } | 465 | } |
| 472 | #else /* !CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */ | 466 | |
| 467 | #else /* !CONFIG_VIRT_CPU_ACCOUNTING_NATIVE: */ | ||
| 468 | |||
| 473 | /* | 469 | /* |
| 474 | * Account a single tick of cpu time. | 470 | * Account a single tick of CPU time. |
| 475 | * @p: the process that the cpu time gets accounted to | 471 | * @p: the process that the CPU time gets accounted to |
| 476 | * @user_tick: indicates if the tick is a user or a system tick | 472 | * @user_tick: indicates if the tick is a user or a system tick |
| 477 | */ | 473 | */ |
| 478 | void account_process_tick(struct task_struct *p, int user_tick) | 474 | void account_process_tick(struct task_struct *p, int user_tick) |
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 9df09782025c..d1c7bf7c7e5b 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c | |||
| @@ -17,9 +17,6 @@ | |||
| 17 | */ | 17 | */ |
| 18 | #include "sched.h" | 18 | #include "sched.h" |
| 19 | 19 | ||
| 20 | #include <linux/slab.h> | ||
| 21 | #include <uapi/linux/sched/types.h> | ||
| 22 | |||
| 23 | struct dl_bandwidth def_dl_bandwidth; | 20 | struct dl_bandwidth def_dl_bandwidth; |
| 24 | 21 | ||
| 25 | static inline struct task_struct *dl_task_of(struct sched_dl_entity *dl_se) | 22 | static inline struct task_struct *dl_task_of(struct sched_dl_entity *dl_se) |
| @@ -87,7 +84,7 @@ void __add_running_bw(u64 dl_bw, struct dl_rq *dl_rq) | |||
| 87 | SCHED_WARN_ON(dl_rq->running_bw < old); /* overflow */ | 84 | SCHED_WARN_ON(dl_rq->running_bw < old); /* overflow */ |
| 88 | SCHED_WARN_ON(dl_rq->running_bw > dl_rq->this_bw); | 85 | SCHED_WARN_ON(dl_rq->running_bw > dl_rq->this_bw); |
| 89 | /* kick cpufreq (see the comment in kernel/sched/sched.h). */ | 86 | /* kick cpufreq (see the comment in kernel/sched/sched.h). */ |
| 90 | cpufreq_update_util(rq_of_dl_rq(dl_rq), SCHED_CPUFREQ_DL); | 87 | cpufreq_update_util(rq_of_dl_rq(dl_rq), 0); |
| 91 | } | 88 | } |
| 92 | 89 | ||
| 93 | static inline | 90 | static inline |
| @@ -101,7 +98,7 @@ void __sub_running_bw(u64 dl_bw, struct dl_rq *dl_rq) | |||
| 101 | if (dl_rq->running_bw > old) | 98 | if (dl_rq->running_bw > old) |
| 102 | dl_rq->running_bw = 0; | 99 | dl_rq->running_bw = 0; |
| 103 | /* kick cpufreq (see the comment in kernel/sched/sched.h). */ | 100 | /* kick cpufreq (see the comment in kernel/sched/sched.h). */ |
| 104 | cpufreq_update_util(rq_of_dl_rq(dl_rq), SCHED_CPUFREQ_DL); | 101 | cpufreq_update_util(rq_of_dl_rq(dl_rq), 0); |
| 105 | } | 102 | } |
| 106 | 103 | ||
| 107 | static inline | 104 | static inline |
| @@ -514,7 +511,7 @@ static DEFINE_PER_CPU(struct callback_head, dl_pull_head); | |||
| 514 | static void push_dl_tasks(struct rq *); | 511 | static void push_dl_tasks(struct rq *); |
| 515 | static void pull_dl_task(struct rq *); | 512 | static void pull_dl_task(struct rq *); |
| 516 | 513 | ||
| 517 | static inline void queue_push_tasks(struct rq *rq) | 514 | static inline void deadline_queue_push_tasks(struct rq *rq) |
| 518 | { | 515 | { |
| 519 | if (!has_pushable_dl_tasks(rq)) | 516 | if (!has_pushable_dl_tasks(rq)) |
| 520 | return; | 517 | return; |
| @@ -522,7 +519,7 @@ static inline void queue_push_tasks(struct rq *rq) | |||
| 522 | queue_balance_callback(rq, &per_cpu(dl_push_head, rq->cpu), push_dl_tasks); | 519 | queue_balance_callback(rq, &per_cpu(dl_push_head, rq->cpu), push_dl_tasks); |
| 523 | } | 520 | } |
| 524 | 521 | ||
| 525 | static inline void queue_pull_task(struct rq *rq) | 522 | static inline void deadline_queue_pull_task(struct rq *rq) |
| 526 | { | 523 | { |
| 527 | queue_balance_callback(rq, &per_cpu(dl_pull_head, rq->cpu), pull_dl_task); | 524 | queue_balance_callback(rq, &per_cpu(dl_pull_head, rq->cpu), pull_dl_task); |
| 528 | } | 525 | } |
| @@ -539,12 +536,12 @@ static struct rq *dl_task_offline_migration(struct rq *rq, struct task_struct *p | |||
| 539 | 536 | ||
| 540 | /* | 537 | /* |
| 541 | * If we cannot preempt any rq, fall back to pick any | 538 | * If we cannot preempt any rq, fall back to pick any |
| 542 | * online cpu. | 539 | * online CPU: |
| 543 | */ | 540 | */ |
| 544 | cpu = cpumask_any_and(cpu_active_mask, &p->cpus_allowed); | 541 | cpu = cpumask_any_and(cpu_active_mask, &p->cpus_allowed); |
| 545 | if (cpu >= nr_cpu_ids) { | 542 | if (cpu >= nr_cpu_ids) { |
| 546 | /* | 543 | /* |
| 547 | * Fail to find any suitable cpu. | 544 | * Failed to find any suitable CPU. |
| 548 | * The task will never come back! | 545 | * The task will never come back! |
| 549 | */ | 546 | */ |
| 550 | BUG_ON(dl_bandwidth_enabled()); | 547 | BUG_ON(dl_bandwidth_enabled()); |
| @@ -597,19 +594,18 @@ static inline void pull_dl_task(struct rq *rq) | |||
| 597 | { | 594 | { |
| 598 | } | 595 | } |
| 599 | 596 | ||
| 600 | static inline void queue_push_tasks(struct rq *rq) | 597 | static inline void deadline_queue_push_tasks(struct rq *rq) |
| 601 | { | 598 | { |
| 602 | } | 599 | } |
| 603 | 600 | ||
| 604 | static inline void queue_pull_task(struct rq *rq) | 601 | static inline void deadline_queue_pull_task(struct rq *rq) |
| 605 | { | 602 | { |
| 606 | } | 603 | } |
| 607 | #endif /* CONFIG_SMP */ | 604 | #endif /* CONFIG_SMP */ |
| 608 | 605 | ||
| 609 | static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags); | 606 | static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags); |
| 610 | static void __dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags); | 607 | static void __dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags); |
| 611 | static void check_preempt_curr_dl(struct rq *rq, struct task_struct *p, | 608 | static void check_preempt_curr_dl(struct rq *rq, struct task_struct *p, int flags); |
| 612 | int flags); | ||
| 613 | 609 | ||
| 614 | /* | 610 | /* |
| 615 | * We are being explicitly informed that a new instance is starting, | 611 | * We are being explicitly informed that a new instance is starting, |
| @@ -1763,7 +1759,7 @@ pick_next_task_dl(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) | |||
| 1763 | if (hrtick_enabled(rq)) | 1759 | if (hrtick_enabled(rq)) |
| 1764 | start_hrtick_dl(rq, p); | 1760 | start_hrtick_dl(rq, p); |
| 1765 | 1761 | ||
| 1766 | queue_push_tasks(rq); | 1762 | deadline_queue_push_tasks(rq); |
| 1767 | 1763 | ||
| 1768 | return p; | 1764 | return p; |
| 1769 | } | 1765 | } |
| @@ -1776,6 +1772,14 @@ static void put_prev_task_dl(struct rq *rq, struct task_struct *p) | |||
| 1776 | enqueue_pushable_dl_task(rq, p); | 1772 | enqueue_pushable_dl_task(rq, p); |
| 1777 | } | 1773 | } |
| 1778 | 1774 | ||
| 1775 | /* | ||
| 1776 | * scheduler tick hitting a task of our scheduling class. | ||
| 1777 | * | ||
| 1778 | * NOTE: This function can be called remotely by the tick offload that | ||
| 1779 | * goes along full dynticks. Therefore no local assumption can be made | ||
| 1780 | * and everything must be accessed through the @rq and @curr passed in | ||
| 1781 | * parameters. | ||
| 1782 | */ | ||
| 1779 | static void task_tick_dl(struct rq *rq, struct task_struct *p, int queued) | 1783 | static void task_tick_dl(struct rq *rq, struct task_struct *p, int queued) |
| 1780 | { | 1784 | { |
| 1781 | update_curr_dl(rq); | 1785 | update_curr_dl(rq); |
| @@ -1865,7 +1869,7 @@ static int find_later_rq(struct task_struct *task) | |||
| 1865 | 1869 | ||
| 1866 | /* | 1870 | /* |
| 1867 | * We have to consider system topology and task affinity | 1871 | * We have to consider system topology and task affinity |
| 1868 | * first, then we can look for a suitable cpu. | 1872 | * first, then we can look for a suitable CPU. |
| 1869 | */ | 1873 | */ |
| 1870 | if (!cpudl_find(&task_rq(task)->rd->cpudl, task, later_mask)) | 1874 | if (!cpudl_find(&task_rq(task)->rd->cpudl, task, later_mask)) |
| 1871 | return -1; | 1875 | return -1; |
| @@ -1879,7 +1883,7 @@ static int find_later_rq(struct task_struct *task) | |||
| 1879 | * Now we check how well this matches with task's | 1883 | * Now we check how well this matches with task's |
| 1880 | * affinity and system topology. | 1884 | * affinity and system topology. |
| 1881 | * | 1885 | * |
| 1882 | * The last cpu where the task run is our first | 1886 | * The last CPU where the task run is our first |
| 1883 | * guess, since it is most likely cache-hot there. | 1887 | * guess, since it is most likely cache-hot there. |
| 1884 | */ | 1888 | */ |
| 1885 | if (cpumask_test_cpu(cpu, later_mask)) | 1889 | if (cpumask_test_cpu(cpu, later_mask)) |
| @@ -1909,9 +1913,9 @@ static int find_later_rq(struct task_struct *task) | |||
| 1909 | best_cpu = cpumask_first_and(later_mask, | 1913 | best_cpu = cpumask_first_and(later_mask, |
| 1910 | sched_domain_span(sd)); | 1914 | sched_domain_span(sd)); |
| 1911 | /* | 1915 | /* |
| 1912 | * Last chance: if a cpu being in both later_mask | 1916 | * Last chance: if a CPU being in both later_mask |
| 1913 | * and current sd span is valid, that becomes our | 1917 | * and current sd span is valid, that becomes our |
| 1914 | * choice. Of course, the latest possible cpu is | 1918 | * choice. Of course, the latest possible CPU is |
| 1915 | * already under consideration through later_mask. | 1919 | * already under consideration through later_mask. |
| 1916 | */ | 1920 | */ |
| 1917 | if (best_cpu < nr_cpu_ids) { | 1921 | if (best_cpu < nr_cpu_ids) { |
| @@ -2067,7 +2071,7 @@ retry: | |||
| 2067 | if (task == next_task) { | 2071 | if (task == next_task) { |
| 2068 | /* | 2072 | /* |
| 2069 | * The task is still there. We don't try | 2073 | * The task is still there. We don't try |
| 2070 | * again, some other cpu will pull it when ready. | 2074 | * again, some other CPU will pull it when ready. |
| 2071 | */ | 2075 | */ |
| 2072 | goto out; | 2076 | goto out; |
| 2073 | } | 2077 | } |
| @@ -2300,12 +2304,12 @@ static void switched_from_dl(struct rq *rq, struct task_struct *p) | |||
| 2300 | /* | 2304 | /* |
| 2301 | * Since this might be the only -deadline task on the rq, | 2305 | * Since this might be the only -deadline task on the rq, |
| 2302 | * this is the right place to try to pull some other one | 2306 | * this is the right place to try to pull some other one |
| 2303 | * from an overloaded cpu, if any. | 2307 | * from an overloaded CPU, if any. |
| 2304 | */ | 2308 | */ |
| 2305 | if (!task_on_rq_queued(p) || rq->dl.dl_nr_running) | 2309 | if (!task_on_rq_queued(p) || rq->dl.dl_nr_running) |
| 2306 | return; | 2310 | return; |
| 2307 | 2311 | ||
| 2308 | queue_pull_task(rq); | 2312 | deadline_queue_pull_task(rq); |
| 2309 | } | 2313 | } |
| 2310 | 2314 | ||
| 2311 | /* | 2315 | /* |
| @@ -2327,7 +2331,7 @@ static void switched_to_dl(struct rq *rq, struct task_struct *p) | |||
| 2327 | if (rq->curr != p) { | 2331 | if (rq->curr != p) { |
| 2328 | #ifdef CONFIG_SMP | 2332 | #ifdef CONFIG_SMP |
| 2329 | if (p->nr_cpus_allowed > 1 && rq->dl.overloaded) | 2333 | if (p->nr_cpus_allowed > 1 && rq->dl.overloaded) |
| 2330 | queue_push_tasks(rq); | 2334 | deadline_queue_push_tasks(rq); |
| 2331 | #endif | 2335 | #endif |
| 2332 | if (dl_task(rq->curr)) | 2336 | if (dl_task(rq->curr)) |
| 2333 | check_preempt_curr_dl(rq, p, 0); | 2337 | check_preempt_curr_dl(rq, p, 0); |
| @@ -2352,7 +2356,7 @@ static void prio_changed_dl(struct rq *rq, struct task_struct *p, | |||
| 2352 | * or lowering its prio, so... | 2356 | * or lowering its prio, so... |
| 2353 | */ | 2357 | */ |
| 2354 | if (!rq->dl.overloaded) | 2358 | if (!rq->dl.overloaded) |
| 2355 | queue_pull_task(rq); | 2359 | deadline_queue_pull_task(rq); |
| 2356 | 2360 | ||
| 2357 | /* | 2361 | /* |
| 2358 | * If we now have a earlier deadline task than p, | 2362 | * If we now have a earlier deadline task than p, |
| @@ -2626,17 +2630,17 @@ void __dl_clear_params(struct task_struct *p) | |||
| 2626 | { | 2630 | { |
| 2627 | struct sched_dl_entity *dl_se = &p->dl; | 2631 | struct sched_dl_entity *dl_se = &p->dl; |
| 2628 | 2632 | ||
| 2629 | dl_se->dl_runtime = 0; | 2633 | dl_se->dl_runtime = 0; |
| 2630 | dl_se->dl_deadline = 0; | 2634 | dl_se->dl_deadline = 0; |
| 2631 | dl_se->dl_period = 0; | 2635 | dl_se->dl_period = 0; |
| 2632 | dl_se->flags = 0; | 2636 | dl_se->flags = 0; |
| 2633 | dl_se->dl_bw = 0; | 2637 | dl_se->dl_bw = 0; |
| 2634 | dl_se->dl_density = 0; | 2638 | dl_se->dl_density = 0; |
| 2635 | 2639 | ||
| 2636 | dl_se->dl_throttled = 0; | 2640 | dl_se->dl_throttled = 0; |
| 2637 | dl_se->dl_yielded = 0; | 2641 | dl_se->dl_yielded = 0; |
| 2638 | dl_se->dl_non_contending = 0; | 2642 | dl_se->dl_non_contending = 0; |
| 2639 | dl_se->dl_overrun = 0; | 2643 | dl_se->dl_overrun = 0; |
| 2640 | } | 2644 | } |
| 2641 | 2645 | ||
| 2642 | bool dl_param_changed(struct task_struct *p, const struct sched_attr *attr) | 2646 | bool dl_param_changed(struct task_struct *p, const struct sched_attr *attr) |
| @@ -2655,21 +2659,22 @@ bool dl_param_changed(struct task_struct *p, const struct sched_attr *attr) | |||
| 2655 | #ifdef CONFIG_SMP | 2659 | #ifdef CONFIG_SMP |
| 2656 | int dl_task_can_attach(struct task_struct *p, const struct cpumask *cs_cpus_allowed) | 2660 | int dl_task_can_attach(struct task_struct *p, const struct cpumask *cs_cpus_allowed) |
| 2657 | { | 2661 | { |
| 2658 | unsigned int dest_cpu = cpumask_any_and(cpu_active_mask, | 2662 | unsigned int dest_cpu; |
| 2659 | cs_cpus_allowed); | ||
| 2660 | struct dl_bw *dl_b; | 2663 | struct dl_bw *dl_b; |
| 2661 | bool overflow; | 2664 | bool overflow; |
| 2662 | int cpus, ret; | 2665 | int cpus, ret; |
| 2663 | unsigned long flags; | 2666 | unsigned long flags; |
| 2664 | 2667 | ||
| 2668 | dest_cpu = cpumask_any_and(cpu_active_mask, cs_cpus_allowed); | ||
| 2669 | |||
| 2665 | rcu_read_lock_sched(); | 2670 | rcu_read_lock_sched(); |
| 2666 | dl_b = dl_bw_of(dest_cpu); | 2671 | dl_b = dl_bw_of(dest_cpu); |
| 2667 | raw_spin_lock_irqsave(&dl_b->lock, flags); | 2672 | raw_spin_lock_irqsave(&dl_b->lock, flags); |
| 2668 | cpus = dl_bw_cpus(dest_cpu); | 2673 | cpus = dl_bw_cpus(dest_cpu); |
| 2669 | overflow = __dl_overflow(dl_b, cpus, 0, p->dl.dl_bw); | 2674 | overflow = __dl_overflow(dl_b, cpus, 0, p->dl.dl_bw); |
| 2670 | if (overflow) | 2675 | if (overflow) { |
| 2671 | ret = -EBUSY; | 2676 | ret = -EBUSY; |
| 2672 | else { | 2677 | } else { |
| 2673 | /* | 2678 | /* |
| 2674 | * We reserve space for this task in the destination | 2679 | * We reserve space for this task in the destination |
| 2675 | * root_domain, as we can't fail after this point. | 2680 | * root_domain, as we can't fail after this point. |
| @@ -2681,6 +2686,7 @@ int dl_task_can_attach(struct task_struct *p, const struct cpumask *cs_cpus_allo | |||
| 2681 | } | 2686 | } |
| 2682 | raw_spin_unlock_irqrestore(&dl_b->lock, flags); | 2687 | raw_spin_unlock_irqrestore(&dl_b->lock, flags); |
| 2683 | rcu_read_unlock_sched(); | 2688 | rcu_read_unlock_sched(); |
| 2689 | |||
| 2684 | return ret; | 2690 | return ret; |
| 2685 | } | 2691 | } |
| 2686 | 2692 | ||
| @@ -2701,6 +2707,7 @@ int dl_cpuset_cpumask_can_shrink(const struct cpumask *cur, | |||
| 2701 | ret = 0; | 2707 | ret = 0; |
| 2702 | raw_spin_unlock_irqrestore(&cur_dl_b->lock, flags); | 2708 | raw_spin_unlock_irqrestore(&cur_dl_b->lock, flags); |
| 2703 | rcu_read_unlock_sched(); | 2709 | rcu_read_unlock_sched(); |
| 2710 | |||
| 2704 | return ret; | 2711 | return ret; |
| 2705 | } | 2712 | } |
| 2706 | 2713 | ||
| @@ -2718,6 +2725,7 @@ bool dl_cpu_busy(unsigned int cpu) | |||
| 2718 | overflow = __dl_overflow(dl_b, cpus, 0, 0); | 2725 | overflow = __dl_overflow(dl_b, cpus, 0, 0); |
| 2719 | raw_spin_unlock_irqrestore(&dl_b->lock, flags); | 2726 | raw_spin_unlock_irqrestore(&dl_b->lock, flags); |
| 2720 | rcu_read_unlock_sched(); | 2727 | rcu_read_unlock_sched(); |
| 2728 | |||
| 2721 | return overflow; | 2729 | return overflow; |
| 2722 | } | 2730 | } |
| 2723 | #endif | 2731 | #endif |
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 72c401b3b15c..15b10e210a6b 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c | |||
| @@ -1,7 +1,7 @@ | |||
| 1 | /* | 1 | /* |
| 2 | * kernel/sched/debug.c | 2 | * kernel/sched/debug.c |
| 3 | * | 3 | * |
| 4 | * Print the CFS rbtree | 4 | * Print the CFS rbtree and other debugging details |
| 5 | * | 5 | * |
| 6 | * Copyright(C) 2007, Red Hat, Inc., Ingo Molnar | 6 | * Copyright(C) 2007, Red Hat, Inc., Ingo Molnar |
| 7 | * | 7 | * |
| @@ -9,16 +9,6 @@ | |||
| 9 | * it under the terms of the GNU General Public License version 2 as | 9 | * it under the terms of the GNU General Public License version 2 as |
| 10 | * published by the Free Software Foundation. | 10 | * published by the Free Software Foundation. |
| 11 | */ | 11 | */ |
| 12 | |||
| 13 | #include <linux/proc_fs.h> | ||
| 14 | #include <linux/sched/mm.h> | ||
| 15 | #include <linux/sched/task.h> | ||
| 16 | #include <linux/seq_file.h> | ||
| 17 | #include <linux/kallsyms.h> | ||
| 18 | #include <linux/utsname.h> | ||
| 19 | #include <linux/mempolicy.h> | ||
| 20 | #include <linux/debugfs.h> | ||
| 21 | |||
| 22 | #include "sched.h" | 12 | #include "sched.h" |
| 23 | 13 | ||
| 24 | static DEFINE_SPINLOCK(sched_debug_lock); | 14 | static DEFINE_SPINLOCK(sched_debug_lock); |
| @@ -274,34 +264,19 @@ sd_alloc_ctl_domain_table(struct sched_domain *sd) | |||
| 274 | if (table == NULL) | 264 | if (table == NULL) |
| 275 | return NULL; | 265 | return NULL; |
| 276 | 266 | ||
| 277 | set_table_entry(&table[0], "min_interval", &sd->min_interval, | 267 | set_table_entry(&table[0] , "min_interval", &sd->min_interval, sizeof(long), 0644, proc_doulongvec_minmax, false); |
| 278 | sizeof(long), 0644, proc_doulongvec_minmax, false); | 268 | set_table_entry(&table[1] , "max_interval", &sd->max_interval, sizeof(long), 0644, proc_doulongvec_minmax, false); |
| 279 | set_table_entry(&table[1], "max_interval", &sd->max_interval, | 269 | set_table_entry(&table[2] , "busy_idx", &sd->busy_idx, sizeof(int) , 0644, proc_dointvec_minmax, true ); |
| 280 | sizeof(long), 0644, proc_doulongvec_minmax, false); | 270 | set_table_entry(&table[3] , "idle_idx", &sd->idle_idx, sizeof(int) , 0644, proc_dointvec_minmax, true ); |
| 281 | set_table_entry(&table[2], "busy_idx", &sd->busy_idx, | 271 | set_table_entry(&table[4] , "newidle_idx", &sd->newidle_idx, sizeof(int) , 0644, proc_dointvec_minmax, true ); |
| 282 | sizeof(int), 0644, proc_dointvec_minmax, true); | 272 | set_table_entry(&table[5] , "wake_idx", &sd->wake_idx, sizeof(int) , 0644, proc_dointvec_minmax, true ); |
| 283 | set_table_entry(&table[3], "idle_idx", &sd->idle_idx, | 273 | set_table_entry(&table[6] , "forkexec_idx", &sd->forkexec_idx, sizeof(int) , 0644, proc_dointvec_minmax, true ); |
| 284 | sizeof(int), 0644, proc_dointvec_minmax, true); | 274 | set_table_entry(&table[7] , "busy_factor", &sd->busy_factor, sizeof(int) , 0644, proc_dointvec_minmax, false); |
| 285 | set_table_entry(&table[4], "newidle_idx", &sd->newidle_idx, | 275 | set_table_entry(&table[8] , "imbalance_pct", &sd->imbalance_pct, sizeof(int) , 0644, proc_dointvec_minmax, false); |
| 286 | sizeof(int), 0644, proc_dointvec_minmax, true); | 276 | set_table_entry(&table[9] , "cache_nice_tries", &sd->cache_nice_tries, sizeof(int) , 0644, proc_dointvec_minmax, false); |
| 287 | set_table_entry(&table[5], "wake_idx", &sd->wake_idx, | 277 | set_table_entry(&table[10], "flags", &sd->flags, sizeof(int) , 0644, proc_dointvec_minmax, false); |
| 288 | sizeof(int), 0644, proc_dointvec_minmax, true); | 278 | set_table_entry(&table[11], "max_newidle_lb_cost", &sd->max_newidle_lb_cost, sizeof(long), 0644, proc_doulongvec_minmax, false); |
| 289 | set_table_entry(&table[6], "forkexec_idx", &sd->forkexec_idx, | 279 | set_table_entry(&table[12], "name", sd->name, CORENAME_MAX_SIZE, 0444, proc_dostring, false); |
| 290 | sizeof(int), 0644, proc_dointvec_minmax, true); | ||
| 291 | set_table_entry(&table[7], "busy_factor", &sd->busy_factor, | ||
| 292 | sizeof(int), 0644, proc_dointvec_minmax, false); | ||
| 293 | set_table_entry(&table[8], "imbalance_pct", &sd->imbalance_pct, | ||
| 294 | sizeof(int), 0644, proc_dointvec_minmax, false); | ||
| 295 | set_table_entry(&table[9], "cache_nice_tries", | ||
| 296 | &sd->cache_nice_tries, | ||
| 297 | sizeof(int), 0644, proc_dointvec_minmax, false); | ||
| 298 | set_table_entry(&table[10], "flags", &sd->flags, | ||
| 299 | sizeof(int), 0644, proc_dointvec_minmax, false); | ||
| 300 | set_table_entry(&table[11], "max_newidle_lb_cost", | ||
| 301 | &sd->max_newidle_lb_cost, | ||
| 302 | sizeof(long), 0644, proc_doulongvec_minmax, false); | ||
| 303 | set_table_entry(&table[12], "name", sd->name, | ||
| 304 | CORENAME_MAX_SIZE, 0444, proc_dostring, false); | ||
| 305 | /* &table[13] is terminator */ | 280 | /* &table[13] is terminator */ |
| 306 | 281 | ||
| 307 | return table; | 282 | return table; |
| @@ -332,8 +307,8 @@ static struct ctl_table *sd_alloc_ctl_cpu_table(int cpu) | |||
| 332 | return table; | 307 | return table; |
| 333 | } | 308 | } |
| 334 | 309 | ||
| 335 | static cpumask_var_t sd_sysctl_cpus; | 310 | static cpumask_var_t sd_sysctl_cpus; |
| 336 | static struct ctl_table_header *sd_sysctl_header; | 311 | static struct ctl_table_header *sd_sysctl_header; |
| 337 | 312 | ||
| 338 | void register_sched_domain_sysctl(void) | 313 | void register_sched_domain_sysctl(void) |
| 339 | { | 314 | { |
| @@ -413,14 +388,10 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group | |||
| 413 | { | 388 | { |
| 414 | struct sched_entity *se = tg->se[cpu]; | 389 | struct sched_entity *se = tg->se[cpu]; |
| 415 | 390 | ||
| 416 | #define P(F) \ | 391 | #define P(F) SEQ_printf(m, " .%-30s: %lld\n", #F, (long long)F) |
| 417 | SEQ_printf(m, " .%-30s: %lld\n", #F, (long long)F) | 392 | #define P_SCHEDSTAT(F) SEQ_printf(m, " .%-30s: %lld\n", #F, (long long)schedstat_val(F)) |
| 418 | #define P_SCHEDSTAT(F) \ | 393 | #define PN(F) SEQ_printf(m, " .%-30s: %lld.%06ld\n", #F, SPLIT_NS((long long)F)) |
| 419 | SEQ_printf(m, " .%-30s: %lld\n", #F, (long long)schedstat_val(F)) | 394 | #define PN_SCHEDSTAT(F) SEQ_printf(m, " .%-30s: %lld.%06ld\n", #F, SPLIT_NS((long long)schedstat_val(F))) |
| 420 | #define PN(F) \ | ||
| 421 | SEQ_printf(m, " .%-30s: %lld.%06ld\n", #F, SPLIT_NS((long long)F)) | ||
| 422 | #define PN_SCHEDSTAT(F) \ | ||
| 423 | SEQ_printf(m, " .%-30s: %lld.%06ld\n", #F, SPLIT_NS((long long)schedstat_val(F))) | ||
| 424 | 395 | ||
| 425 | if (!se) | 396 | if (!se) |
| 426 | return; | 397 | return; |
| @@ -428,6 +399,7 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group | |||
| 428 | PN(se->exec_start); | 399 | PN(se->exec_start); |
| 429 | PN(se->vruntime); | 400 | PN(se->vruntime); |
| 430 | PN(se->sum_exec_runtime); | 401 | PN(se->sum_exec_runtime); |
| 402 | |||
| 431 | if (schedstat_enabled()) { | 403 | if (schedstat_enabled()) { |
| 432 | PN_SCHEDSTAT(se->statistics.wait_start); | 404 | PN_SCHEDSTAT(se->statistics.wait_start); |
| 433 | PN_SCHEDSTAT(se->statistics.sleep_start); | 405 | PN_SCHEDSTAT(se->statistics.sleep_start); |
| @@ -440,6 +412,7 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group | |||
| 440 | PN_SCHEDSTAT(se->statistics.wait_sum); | 412 | PN_SCHEDSTAT(se->statistics.wait_sum); |
| 441 | P_SCHEDSTAT(se->statistics.wait_count); | 413 | P_SCHEDSTAT(se->statistics.wait_count); |
| 442 | } | 414 | } |
| 415 | |||
| 443 | P(se->load.weight); | 416 | P(se->load.weight); |
| 444 | P(se->runnable_weight); | 417 | P(se->runnable_weight); |
| 445 | #ifdef CONFIG_SMP | 418 | #ifdef CONFIG_SMP |
| @@ -464,6 +437,7 @@ static char *task_group_path(struct task_group *tg) | |||
| 464 | return group_path; | 437 | return group_path; |
| 465 | 438 | ||
| 466 | cgroup_path(tg->css.cgroup, group_path, PATH_MAX); | 439 | cgroup_path(tg->css.cgroup, group_path, PATH_MAX); |
| 440 | |||
| 467 | return group_path; | 441 | return group_path; |
| 468 | } | 442 | } |
| 469 | #endif | 443 | #endif |
| @@ -569,6 +543,8 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) | |||
| 569 | cfs_rq->avg.runnable_load_avg); | 543 | cfs_rq->avg.runnable_load_avg); |
| 570 | SEQ_printf(m, " .%-30s: %lu\n", "util_avg", | 544 | SEQ_printf(m, " .%-30s: %lu\n", "util_avg", |
| 571 | cfs_rq->avg.util_avg); | 545 | cfs_rq->avg.util_avg); |
| 546 | SEQ_printf(m, " .%-30s: %u\n", "util_est_enqueued", | ||
| 547 | cfs_rq->avg.util_est.enqueued); | ||
| 572 | SEQ_printf(m, " .%-30s: %ld\n", "removed.load_avg", | 548 | SEQ_printf(m, " .%-30s: %ld\n", "removed.load_avg", |
| 573 | cfs_rq->removed.load_avg); | 549 | cfs_rq->removed.load_avg); |
| 574 | SEQ_printf(m, " .%-30s: %ld\n", "removed.util_avg", | 550 | SEQ_printf(m, " .%-30s: %ld\n", "removed.util_avg", |
| @@ -804,9 +780,9 @@ void sysrq_sched_debug_show(void) | |||
| 804 | /* | 780 | /* |
| 805 | * This iterator needs some explanation. | 781 | * This iterator needs some explanation. |
| 806 | * It returns 1 for the header position. | 782 | * It returns 1 for the header position. |
| 807 | * This means 2 is cpu 0. | 783 | * This means 2 is CPU 0. |
| 808 | * In a hotplugged system some cpus, including cpu 0, may be missing so we have | 784 | * In a hotplugged system some CPUs, including CPU 0, may be missing so we have |
| 809 | * to use cpumask_* to iterate over the cpus. | 785 | * to use cpumask_* to iterate over the CPUs. |
| 810 | */ | 786 | */ |
| 811 | static void *sched_debug_start(struct seq_file *file, loff_t *offset) | 787 | static void *sched_debug_start(struct seq_file *file, loff_t *offset) |
| 812 | { | 788 | { |
| @@ -826,6 +802,7 @@ static void *sched_debug_start(struct seq_file *file, loff_t *offset) | |||
| 826 | 802 | ||
| 827 | if (n < nr_cpu_ids) | 803 | if (n < nr_cpu_ids) |
| 828 | return (void *)(unsigned long)(n + 2); | 804 | return (void *)(unsigned long)(n + 2); |
| 805 | |||
| 829 | return NULL; | 806 | return NULL; |
| 830 | } | 807 | } |
| 831 | 808 | ||
| @@ -840,10 +817,10 @@ static void sched_debug_stop(struct seq_file *file, void *data) | |||
| 840 | } | 817 | } |
| 841 | 818 | ||
| 842 | static const struct seq_operations sched_debug_sops = { | 819 | static const struct seq_operations sched_debug_sops = { |
| 843 | .start = sched_debug_start, | 820 | .start = sched_debug_start, |
| 844 | .next = sched_debug_next, | 821 | .next = sched_debug_next, |
| 845 | .stop = sched_debug_stop, | 822 | .stop = sched_debug_stop, |
| 846 | .show = sched_debug_show, | 823 | .show = sched_debug_show, |
| 847 | }; | 824 | }; |
| 848 | 825 | ||
| 849 | static int sched_debug_release(struct inode *inode, struct file *file) | 826 | static int sched_debug_release(struct inode *inode, struct file *file) |
| @@ -881,14 +858,10 @@ static int __init init_sched_debug_procfs(void) | |||
| 881 | 858 | ||
| 882 | __initcall(init_sched_debug_procfs); | 859 | __initcall(init_sched_debug_procfs); |
| 883 | 860 | ||
| 884 | #define __P(F) \ | 861 | #define __P(F) SEQ_printf(m, "%-45s:%21Ld\n", #F, (long long)F) |
| 885 | SEQ_printf(m, "%-45s:%21Ld\n", #F, (long long)F) | 862 | #define P(F) SEQ_printf(m, "%-45s:%21Ld\n", #F, (long long)p->F) |
| 886 | #define P(F) \ | 863 | #define __PN(F) SEQ_printf(m, "%-45s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)F)) |
| 887 | SEQ_printf(m, "%-45s:%21Ld\n", #F, (long long)p->F) | 864 | #define PN(F) SEQ_printf(m, "%-45s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)p->F)) |
| 888 | #define __PN(F) \ | ||
| 889 | SEQ_printf(m, "%-45s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)F)) | ||
| 890 | #define PN(F) \ | ||
| 891 | SEQ_printf(m, "%-45s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)p->F)) | ||
| 892 | 865 | ||
| 893 | 866 | ||
| 894 | #ifdef CONFIG_NUMA_BALANCING | 867 | #ifdef CONFIG_NUMA_BALANCING |
| @@ -1023,6 +996,8 @@ void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns, | |||
| 1023 | P(se.avg.runnable_load_avg); | 996 | P(se.avg.runnable_load_avg); |
| 1024 | P(se.avg.util_avg); | 997 | P(se.avg.util_avg); |
| 1025 | P(se.avg.last_update_time); | 998 | P(se.avg.last_update_time); |
| 999 | P(se.avg.util_est.ewma); | ||
| 1000 | P(se.avg.util_est.enqueued); | ||
| 1026 | #endif | 1001 | #endif |
| 1027 | P(policy); | 1002 | P(policy); |
| 1028 | P(prio); | 1003 | P(prio); |
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 5eb3ffc9be84..0951d1c58d2f 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c | |||
| @@ -20,25 +20,10 @@ | |||
| 20 | * Adaptive scheduling granularity, math enhancements by Peter Zijlstra | 20 | * Adaptive scheduling granularity, math enhancements by Peter Zijlstra |
| 21 | * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra | 21 | * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra |
| 22 | */ | 22 | */ |
| 23 | 23 | #include "sched.h" | |
| 24 | #include <linux/sched/mm.h> | ||
| 25 | #include <linux/sched/topology.h> | ||
| 26 | |||
| 27 | #include <linux/latencytop.h> | ||
| 28 | #include <linux/cpumask.h> | ||
| 29 | #include <linux/cpuidle.h> | ||
| 30 | #include <linux/slab.h> | ||
| 31 | #include <linux/profile.h> | ||
| 32 | #include <linux/interrupt.h> | ||
| 33 | #include <linux/mempolicy.h> | ||
| 34 | #include <linux/migrate.h> | ||
| 35 | #include <linux/task_work.h> | ||
| 36 | #include <linux/sched/isolation.h> | ||
| 37 | 24 | ||
| 38 | #include <trace/events/sched.h> | 25 | #include <trace/events/sched.h> |
| 39 | 26 | ||
| 40 | #include "sched.h" | ||
| 41 | |||
| 42 | /* | 27 | /* |
| 43 | * Targeted preemption latency for CPU-bound tasks: | 28 | * Targeted preemption latency for CPU-bound tasks: |
| 44 | * | 29 | * |
| @@ -103,7 +88,7 @@ const_debug unsigned int sysctl_sched_migration_cost = 500000UL; | |||
| 103 | 88 | ||
| 104 | #ifdef CONFIG_SMP | 89 | #ifdef CONFIG_SMP |
| 105 | /* | 90 | /* |
| 106 | * For asym packing, by default the lower numbered cpu has higher priority. | 91 | * For asym packing, by default the lower numbered CPU has higher priority. |
| 107 | */ | 92 | */ |
| 108 | int __weak arch_asym_cpu_priority(int cpu) | 93 | int __weak arch_asym_cpu_priority(int cpu) |
| 109 | { | 94 | { |
| @@ -787,7 +772,7 @@ void post_init_entity_util_avg(struct sched_entity *se) | |||
| 787 | * For !fair tasks do: | 772 | * For !fair tasks do: |
| 788 | * | 773 | * |
| 789 | update_cfs_rq_load_avg(now, cfs_rq); | 774 | update_cfs_rq_load_avg(now, cfs_rq); |
| 790 | attach_entity_load_avg(cfs_rq, se); | 775 | attach_entity_load_avg(cfs_rq, se, 0); |
| 791 | switched_from_fair(rq, p); | 776 | switched_from_fair(rq, p); |
| 792 | * | 777 | * |
| 793 | * such that the next switched_to_fair() has the | 778 | * such that the next switched_to_fair() has the |
| @@ -1181,7 +1166,7 @@ pid_t task_numa_group_id(struct task_struct *p) | |||
| 1181 | } | 1166 | } |
| 1182 | 1167 | ||
| 1183 | /* | 1168 | /* |
| 1184 | * The averaged statistics, shared & private, memory & cpu, | 1169 | * The averaged statistics, shared & private, memory & CPU, |
| 1185 | * occupy the first half of the array. The second half of the | 1170 | * occupy the first half of the array. The second half of the |
| 1186 | * array is for current counters, which are averaged into the | 1171 | * array is for current counters, which are averaged into the |
| 1187 | * first set by task_numa_placement. | 1172 | * first set by task_numa_placement. |
| @@ -1587,7 +1572,7 @@ static void task_numa_compare(struct task_numa_env *env, | |||
| 1587 | * be incurred if the tasks were swapped. | 1572 | * be incurred if the tasks were swapped. |
| 1588 | */ | 1573 | */ |
| 1589 | if (cur) { | 1574 | if (cur) { |
| 1590 | /* Skip this swap candidate if cannot move to the source cpu */ | 1575 | /* Skip this swap candidate if cannot move to the source CPU: */ |
| 1591 | if (!cpumask_test_cpu(env->src_cpu, &cur->cpus_allowed)) | 1576 | if (!cpumask_test_cpu(env->src_cpu, &cur->cpus_allowed)) |
| 1592 | goto unlock; | 1577 | goto unlock; |
| 1593 | 1578 | ||
| @@ -1631,7 +1616,7 @@ static void task_numa_compare(struct task_numa_env *env, | |||
| 1631 | goto balance; | 1616 | goto balance; |
| 1632 | } | 1617 | } |
| 1633 | 1618 | ||
| 1634 | /* Balance doesn't matter much if we're running a task per cpu */ | 1619 | /* Balance doesn't matter much if we're running a task per CPU: */ |
| 1635 | if (imp > env->best_imp && src_rq->nr_running == 1 && | 1620 | if (imp > env->best_imp && src_rq->nr_running == 1 && |
| 1636 | dst_rq->nr_running == 1) | 1621 | dst_rq->nr_running == 1) |
| 1637 | goto assign; | 1622 | goto assign; |
| @@ -1676,7 +1661,7 @@ balance: | |||
| 1676 | */ | 1661 | */ |
| 1677 | if (!cur) { | 1662 | if (!cur) { |
| 1678 | /* | 1663 | /* |
| 1679 | * select_idle_siblings() uses a per-cpu cpumask that | 1664 | * select_idle_siblings() uses a per-CPU cpumask that |
| 1680 | * can be used from IRQ context. | 1665 | * can be used from IRQ context. |
| 1681 | */ | 1666 | */ |
| 1682 | local_irq_disable(); | 1667 | local_irq_disable(); |
| @@ -1869,6 +1854,7 @@ static int task_numa_migrate(struct task_struct *p) | |||
| 1869 | static void numa_migrate_preferred(struct task_struct *p) | 1854 | static void numa_migrate_preferred(struct task_struct *p) |
| 1870 | { | 1855 | { |
| 1871 | unsigned long interval = HZ; | 1856 | unsigned long interval = HZ; |
| 1857 | unsigned long numa_migrate_retry; | ||
| 1872 | 1858 | ||
| 1873 | /* This task has no NUMA fault statistics yet */ | 1859 | /* This task has no NUMA fault statistics yet */ |
| 1874 | if (unlikely(p->numa_preferred_nid == -1 || !p->numa_faults)) | 1860 | if (unlikely(p->numa_preferred_nid == -1 || !p->numa_faults)) |
| @@ -1876,7 +1862,18 @@ static void numa_migrate_preferred(struct task_struct *p) | |||
| 1876 | 1862 | ||
| 1877 | /* Periodically retry migrating the task to the preferred node */ | 1863 | /* Periodically retry migrating the task to the preferred node */ |
| 1878 | interval = min(interval, msecs_to_jiffies(p->numa_scan_period) / 16); | 1864 | interval = min(interval, msecs_to_jiffies(p->numa_scan_period) / 16); |
| 1879 | p->numa_migrate_retry = jiffies + interval; | 1865 | numa_migrate_retry = jiffies + interval; |
| 1866 | |||
| 1867 | /* | ||
| 1868 | * Check that the new retry threshold is after the current one. If | ||
| 1869 | * the retry is in the future, it implies that wake_affine has | ||
| 1870 | * temporarily asked NUMA balancing to backoff from placement. | ||
| 1871 | */ | ||
| 1872 | if (numa_migrate_retry > p->numa_migrate_retry) | ||
| 1873 | return; | ||
| 1874 | |||
| 1875 | /* Safe to try placing the task on the preferred node */ | ||
| 1876 | p->numa_migrate_retry = numa_migrate_retry; | ||
| 1880 | 1877 | ||
| 1881 | /* Success if task is already running on preferred CPU */ | 1878 | /* Success if task is already running on preferred CPU */ |
| 1882 | if (task_node(p) == p->numa_preferred_nid) | 1879 | if (task_node(p) == p->numa_preferred_nid) |
| @@ -2823,7 +2820,7 @@ void reweight_task(struct task_struct *p, int prio) | |||
| 2823 | } | 2820 | } |
| 2824 | 2821 | ||
| 2825 | #ifdef CONFIG_FAIR_GROUP_SCHED | 2822 | #ifdef CONFIG_FAIR_GROUP_SCHED |
| 2826 | # ifdef CONFIG_SMP | 2823 | #ifdef CONFIG_SMP |
| 2827 | /* | 2824 | /* |
| 2828 | * All this does is approximate the hierarchical proportion which includes that | 2825 | * All this does is approximate the hierarchical proportion which includes that |
| 2829 | * global sum we all love to hate. | 2826 | * global sum we all love to hate. |
| @@ -2974,7 +2971,7 @@ static long calc_group_runnable(struct cfs_rq *cfs_rq, long shares) | |||
| 2974 | 2971 | ||
| 2975 | return clamp_t(long, runnable, MIN_SHARES, shares); | 2972 | return clamp_t(long, runnable, MIN_SHARES, shares); |
| 2976 | } | 2973 | } |
| 2977 | # endif /* CONFIG_SMP */ | 2974 | #endif /* CONFIG_SMP */ |
| 2978 | 2975 | ||
| 2979 | static inline int throttled_hierarchy(struct cfs_rq *cfs_rq); | 2976 | static inline int throttled_hierarchy(struct cfs_rq *cfs_rq); |
| 2980 | 2977 | ||
| @@ -3012,11 +3009,11 @@ static inline void update_cfs_group(struct sched_entity *se) | |||
| 3012 | } | 3009 | } |
| 3013 | #endif /* CONFIG_FAIR_GROUP_SCHED */ | 3010 | #endif /* CONFIG_FAIR_GROUP_SCHED */ |
| 3014 | 3011 | ||
| 3015 | static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq) | 3012 | static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq, int flags) |
| 3016 | { | 3013 | { |
| 3017 | struct rq *rq = rq_of(cfs_rq); | 3014 | struct rq *rq = rq_of(cfs_rq); |
| 3018 | 3015 | ||
| 3019 | if (&rq->cfs == cfs_rq) { | 3016 | if (&rq->cfs == cfs_rq || (flags & SCHED_CPUFREQ_MIGRATION)) { |
| 3020 | /* | 3017 | /* |
| 3021 | * There are a few boundary cases this might miss but it should | 3018 | * There are a few boundary cases this might miss but it should |
| 3022 | * get called often enough that that should (hopefully) not be | 3019 | * get called often enough that that should (hopefully) not be |
| @@ -3031,7 +3028,7 @@ static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq) | |||
| 3031 | * | 3028 | * |
| 3032 | * See cpu_util(). | 3029 | * See cpu_util(). |
| 3033 | */ | 3030 | */ |
| 3034 | cpufreq_update_util(rq, 0); | 3031 | cpufreq_update_util(rq, flags); |
| 3035 | } | 3032 | } |
| 3036 | } | 3033 | } |
| 3037 | 3034 | ||
| @@ -3246,6 +3243,32 @@ ___update_load_avg(struct sched_avg *sa, unsigned long load, unsigned long runna | |||
| 3246 | } | 3243 | } |
| 3247 | 3244 | ||
| 3248 | /* | 3245 | /* |
| 3246 | * When a task is dequeued, its estimated utilization should not be update if | ||
| 3247 | * its util_avg has not been updated at least once. | ||
| 3248 | * This flag is used to synchronize util_avg updates with util_est updates. | ||
| 3249 | * We map this information into the LSB bit of the utilization saved at | ||
| 3250 | * dequeue time (i.e. util_est.dequeued). | ||
| 3251 | */ | ||
| 3252 | #define UTIL_AVG_UNCHANGED 0x1 | ||
| 3253 | |||
| 3254 | static inline void cfs_se_util_change(struct sched_avg *avg) | ||
| 3255 | { | ||
| 3256 | unsigned int enqueued; | ||
| 3257 | |||
| 3258 | if (!sched_feat(UTIL_EST)) | ||
| 3259 | return; | ||
| 3260 | |||
| 3261 | /* Avoid store if the flag has been already set */ | ||
| 3262 | enqueued = avg->util_est.enqueued; | ||
| 3263 | if (!(enqueued & UTIL_AVG_UNCHANGED)) | ||
| 3264 | return; | ||
| 3265 | |||
| 3266 | /* Reset flag to report util_avg has been updated */ | ||
| 3267 | enqueued &= ~UTIL_AVG_UNCHANGED; | ||
| 3268 | WRITE_ONCE(avg->util_est.enqueued, enqueued); | ||
| 3269 | } | ||
| 3270 | |||
| 3271 | /* | ||
| 3249 | * sched_entity: | 3272 | * sched_entity: |
| 3250 | * | 3273 | * |
| 3251 | * task: | 3274 | * task: |
| @@ -3296,6 +3319,7 @@ __update_load_avg_se(u64 now, int cpu, struct cfs_rq *cfs_rq, struct sched_entit | |||
| 3296 | cfs_rq->curr == se)) { | 3319 | cfs_rq->curr == se)) { |
| 3297 | 3320 | ||
| 3298 | ___update_load_avg(&se->avg, se_weight(se), se_runnable(se)); | 3321 | ___update_load_avg(&se->avg, se_weight(se), se_runnable(se)); |
| 3322 | cfs_se_util_change(&se->avg); | ||
| 3299 | return 1; | 3323 | return 1; |
| 3300 | } | 3324 | } |
| 3301 | 3325 | ||
| @@ -3350,7 +3374,7 @@ static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) | |||
| 3350 | } | 3374 | } |
| 3351 | 3375 | ||
| 3352 | /* | 3376 | /* |
| 3353 | * Called within set_task_rq() right before setting a task's cpu. The | 3377 | * Called within set_task_rq() right before setting a task's CPU. The |
| 3354 | * caller only guarantees p->pi_lock is held; no other assumptions, | 3378 | * caller only guarantees p->pi_lock is held; no other assumptions, |
| 3355 | * including the state of rq->lock, should be made. | 3379 | * including the state of rq->lock, should be made. |
| 3356 | */ | 3380 | */ |
| @@ -3529,7 +3553,7 @@ update_tg_cfs_runnable(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cf | |||
| 3529 | 3553 | ||
| 3530 | /* | 3554 | /* |
| 3531 | * runnable_sum can't be lower than running_sum | 3555 | * runnable_sum can't be lower than running_sum |
| 3532 | * As running sum is scaled with cpu capacity whereas the runnable sum | 3556 | * As running sum is scaled with CPU capacity whereas the runnable sum |
| 3533 | * is not we rescale running_sum 1st | 3557 | * is not we rescale running_sum 1st |
| 3534 | */ | 3558 | */ |
| 3535 | running_sum = se->avg.util_sum / | 3559 | running_sum = se->avg.util_sum / |
| @@ -3689,7 +3713,7 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq) | |||
| 3689 | #endif | 3713 | #endif |
| 3690 | 3714 | ||
| 3691 | if (decayed) | 3715 | if (decayed) |
| 3692 | cfs_rq_util_change(cfs_rq); | 3716 | cfs_rq_util_change(cfs_rq, 0); |
| 3693 | 3717 | ||
| 3694 | return decayed; | 3718 | return decayed; |
| 3695 | } | 3719 | } |
| @@ -3702,7 +3726,7 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq) | |||
| 3702 | * Must call update_cfs_rq_load_avg() before this, since we rely on | 3726 | * Must call update_cfs_rq_load_avg() before this, since we rely on |
| 3703 | * cfs_rq->avg.last_update_time being current. | 3727 | * cfs_rq->avg.last_update_time being current. |
| 3704 | */ | 3728 | */ |
| 3705 | static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) | 3729 | static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) |
| 3706 | { | 3730 | { |
| 3707 | u32 divider = LOAD_AVG_MAX - 1024 + cfs_rq->avg.period_contrib; | 3731 | u32 divider = LOAD_AVG_MAX - 1024 + cfs_rq->avg.period_contrib; |
| 3708 | 3732 | ||
| @@ -3738,7 +3762,7 @@ static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s | |||
| 3738 | 3762 | ||
| 3739 | add_tg_cfs_propagate(cfs_rq, se->avg.load_sum); | 3763 | add_tg_cfs_propagate(cfs_rq, se->avg.load_sum); |
| 3740 | 3764 | ||
| 3741 | cfs_rq_util_change(cfs_rq); | 3765 | cfs_rq_util_change(cfs_rq, flags); |
| 3742 | } | 3766 | } |
| 3743 | 3767 | ||
| 3744 | /** | 3768 | /** |
| @@ -3757,7 +3781,7 @@ static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s | |||
| 3757 | 3781 | ||
| 3758 | add_tg_cfs_propagate(cfs_rq, -se->avg.load_sum); | 3782 | add_tg_cfs_propagate(cfs_rq, -se->avg.load_sum); |
| 3759 | 3783 | ||
| 3760 | cfs_rq_util_change(cfs_rq); | 3784 | cfs_rq_util_change(cfs_rq, 0); |
| 3761 | } | 3785 | } |
| 3762 | 3786 | ||
| 3763 | /* | 3787 | /* |
| @@ -3787,7 +3811,14 @@ static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s | |||
| 3787 | 3811 | ||
| 3788 | if (!se->avg.last_update_time && (flags & DO_ATTACH)) { | 3812 | if (!se->avg.last_update_time && (flags & DO_ATTACH)) { |
| 3789 | 3813 | ||
| 3790 | attach_entity_load_avg(cfs_rq, se); | 3814 | /* |
| 3815 | * DO_ATTACH means we're here from enqueue_entity(). | ||
| 3816 | * !last_update_time means we've passed through | ||
| 3817 | * migrate_task_rq_fair() indicating we migrated. | ||
| 3818 | * | ||
| 3819 | * IOW we're enqueueing a task on a new CPU. | ||
| 3820 | */ | ||
| 3821 | attach_entity_load_avg(cfs_rq, se, SCHED_CPUFREQ_MIGRATION); | ||
| 3791 | update_tg_load_avg(cfs_rq, 0); | 3822 | update_tg_load_avg(cfs_rq, 0); |
| 3792 | 3823 | ||
| 3793 | } else if (decayed && (flags & UPDATE_TG)) | 3824 | } else if (decayed && (flags & UPDATE_TG)) |
| @@ -3869,6 +3900,120 @@ static inline unsigned long cfs_rq_load_avg(struct cfs_rq *cfs_rq) | |||
| 3869 | 3900 | ||
| 3870 | static int idle_balance(struct rq *this_rq, struct rq_flags *rf); | 3901 | static int idle_balance(struct rq *this_rq, struct rq_flags *rf); |
| 3871 | 3902 | ||
| 3903 | static inline unsigned long task_util(struct task_struct *p) | ||
| 3904 | { | ||
| 3905 | return READ_ONCE(p->se.avg.util_avg); | ||
| 3906 | } | ||
| 3907 | |||
| 3908 | static inline unsigned long _task_util_est(struct task_struct *p) | ||
| 3909 | { | ||
| 3910 | struct util_est ue = READ_ONCE(p->se.avg.util_est); | ||
| 3911 | |||
| 3912 | return max(ue.ewma, ue.enqueued); | ||
| 3913 | } | ||
| 3914 | |||
| 3915 | static inline unsigned long task_util_est(struct task_struct *p) | ||
| 3916 | { | ||
| 3917 | return max(task_util(p), _task_util_est(p)); | ||
| 3918 | } | ||
| 3919 | |||
| 3920 | static inline void util_est_enqueue(struct cfs_rq *cfs_rq, | ||
| 3921 | struct task_struct *p) | ||
| 3922 | { | ||
| 3923 | unsigned int enqueued; | ||
| 3924 | |||
| 3925 | if (!sched_feat(UTIL_EST)) | ||
| 3926 | return; | ||
| 3927 | |||
| 3928 | /* Update root cfs_rq's estimated utilization */ | ||
| 3929 | enqueued = cfs_rq->avg.util_est.enqueued; | ||
| 3930 | enqueued += (_task_util_est(p) | UTIL_AVG_UNCHANGED); | ||
| 3931 | WRITE_ONCE(cfs_rq->avg.util_est.enqueued, enqueued); | ||
| 3932 | } | ||
| 3933 | |||
| 3934 | /* | ||
| 3935 | * Check if a (signed) value is within a specified (unsigned) margin, | ||
| 3936 | * based on the observation that: | ||
| 3937 | * | ||
| 3938 | * abs(x) < y := (unsigned)(x + y - 1) < (2 * y - 1) | ||
| 3939 | * | ||
| 3940 | * NOTE: this only works when value + maring < INT_MAX. | ||
| 3941 | */ | ||
| 3942 | static inline bool within_margin(int value, int margin) | ||
| 3943 | { | ||
| 3944 | return ((unsigned int)(value + margin - 1) < (2 * margin - 1)); | ||
| 3945 | } | ||
| 3946 | |||
| 3947 | static void | ||
| 3948 | util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p, bool task_sleep) | ||
| 3949 | { | ||
| 3950 | long last_ewma_diff; | ||
| 3951 | struct util_est ue; | ||
| 3952 | |||
| 3953 | if (!sched_feat(UTIL_EST)) | ||
| 3954 | return; | ||
| 3955 | |||
| 3956 | /* | ||
| 3957 | * Update root cfs_rq's estimated utilization | ||
| 3958 | * | ||
| 3959 | * If *p is the last task then the root cfs_rq's estimated utilization | ||
| 3960 | * of a CPU is 0 by definition. | ||
| 3961 | */ | ||
| 3962 | ue.enqueued = 0; | ||
| 3963 | if (cfs_rq->nr_running) { | ||
| 3964 | ue.enqueued = cfs_rq->avg.util_est.enqueued; | ||
| 3965 | ue.enqueued -= min_t(unsigned int, ue.enqueued, | ||
| 3966 | (_task_util_est(p) | UTIL_AVG_UNCHANGED)); | ||
| 3967 | } | ||
| 3968 | WRITE_ONCE(cfs_rq->avg.util_est.enqueued, ue.enqueued); | ||
| 3969 | |||
| 3970 | /* | ||
| 3971 | * Skip update of task's estimated utilization when the task has not | ||
| 3972 | * yet completed an activation, e.g. being migrated. | ||
| 3973 | */ | ||
| 3974 | if (!task_sleep) | ||
| 3975 | return; | ||
| 3976 | |||
| 3977 | /* | ||
| 3978 | * If the PELT values haven't changed since enqueue time, | ||
| 3979 | * skip the util_est update. | ||
| 3980 | */ | ||
| 3981 | ue = p->se.avg.util_est; | ||
| 3982 | if (ue.enqueued & UTIL_AVG_UNCHANGED) | ||
| 3983 | return; | ||
| 3984 | |||
| 3985 | /* | ||
| 3986 | * Skip update of task's estimated utilization when its EWMA is | ||
| 3987 | * already ~1% close to its last activation value. | ||
| 3988 | */ | ||
| 3989 | ue.enqueued = (task_util(p) | UTIL_AVG_UNCHANGED); | ||
| 3990 | last_ewma_diff = ue.enqueued - ue.ewma; | ||
| 3991 | if (within_margin(last_ewma_diff, (SCHED_CAPACITY_SCALE / 100))) | ||
| 3992 | return; | ||
| 3993 | |||
| 3994 | /* | ||
| 3995 | * Update Task's estimated utilization | ||
| 3996 | * | ||
| 3997 | * When *p completes an activation we can consolidate another sample | ||
| 3998 | * of the task size. This is done by storing the current PELT value | ||
| 3999 | * as ue.enqueued and by using this value to update the Exponential | ||
| 4000 | * Weighted Moving Average (EWMA): | ||
| 4001 | * | ||
| 4002 | * ewma(t) = w * task_util(p) + (1-w) * ewma(t-1) | ||
| 4003 | * = w * task_util(p) + ewma(t-1) - w * ewma(t-1) | ||
| 4004 | * = w * (task_util(p) - ewma(t-1)) + ewma(t-1) | ||
| 4005 | * = w * ( last_ewma_diff ) + ewma(t-1) | ||
| 4006 | * = w * (last_ewma_diff + ewma(t-1) / w) | ||
| 4007 | * | ||
| 4008 | * Where 'w' is the weight of new samples, which is configured to be | ||
| 4009 | * 0.25, thus making w=1/4 ( >>= UTIL_EST_WEIGHT_SHIFT) | ||
| 4010 | */ | ||
| 4011 | ue.ewma <<= UTIL_EST_WEIGHT_SHIFT; | ||
| 4012 | ue.ewma += last_ewma_diff; | ||
| 4013 | ue.ewma >>= UTIL_EST_WEIGHT_SHIFT; | ||
| 4014 | WRITE_ONCE(p->se.avg.util_est, ue); | ||
| 4015 | } | ||
| 4016 | |||
| 3872 | #else /* CONFIG_SMP */ | 4017 | #else /* CONFIG_SMP */ |
| 3873 | 4018 | ||
| 3874 | static inline int | 4019 | static inline int |
| @@ -3883,13 +4028,13 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq) | |||
| 3883 | 4028 | ||
| 3884 | static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se, int not_used1) | 4029 | static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se, int not_used1) |
| 3885 | { | 4030 | { |
| 3886 | cfs_rq_util_change(cfs_rq); | 4031 | cfs_rq_util_change(cfs_rq, 0); |
| 3887 | } | 4032 | } |
| 3888 | 4033 | ||
| 3889 | static inline void remove_entity_load_avg(struct sched_entity *se) {} | 4034 | static inline void remove_entity_load_avg(struct sched_entity *se) {} |
| 3890 | 4035 | ||
| 3891 | static inline void | 4036 | static inline void |
| 3892 | attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {} | 4037 | attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) {} |
| 3893 | static inline void | 4038 | static inline void |
| 3894 | detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {} | 4039 | detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {} |
| 3895 | 4040 | ||
| @@ -3898,6 +4043,13 @@ static inline int idle_balance(struct rq *rq, struct rq_flags *rf) | |||
| 3898 | return 0; | 4043 | return 0; |
| 3899 | } | 4044 | } |
| 3900 | 4045 | ||
| 4046 | static inline void | ||
| 4047 | util_est_enqueue(struct cfs_rq *cfs_rq, struct task_struct *p) {} | ||
| 4048 | |||
| 4049 | static inline void | ||
| 4050 | util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p, | ||
| 4051 | bool task_sleep) {} | ||
| 4052 | |||
| 3901 | #endif /* CONFIG_SMP */ | 4053 | #endif /* CONFIG_SMP */ |
| 3902 | 4054 | ||
| 3903 | static void check_spread(struct cfs_rq *cfs_rq, struct sched_entity *se) | 4055 | static void check_spread(struct cfs_rq *cfs_rq, struct sched_entity *se) |
| @@ -4676,7 +4828,7 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) | |||
| 4676 | if (!se) | 4828 | if (!se) |
| 4677 | add_nr_running(rq, task_delta); | 4829 | add_nr_running(rq, task_delta); |
| 4678 | 4830 | ||
| 4679 | /* determine whether we need to wake up potentially idle cpu */ | 4831 | /* Determine whether we need to wake up potentially idle CPU: */ |
| 4680 | if (rq->curr == rq->idle && rq->cfs.nr_running) | 4832 | if (rq->curr == rq->idle && rq->cfs.nr_running) |
| 4681 | resched_curr(rq); | 4833 | resched_curr(rq); |
| 4682 | } | 4834 | } |
| @@ -5041,7 +5193,7 @@ static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) | |||
| 5041 | } | 5193 | } |
| 5042 | 5194 | ||
| 5043 | /* | 5195 | /* |
| 5044 | * Both these cpu hotplug callbacks race against unregister_fair_sched_group() | 5196 | * Both these CPU hotplug callbacks race against unregister_fair_sched_group() |
| 5045 | * | 5197 | * |
| 5046 | * The race is harmless, since modifying bandwidth settings of unhooked group | 5198 | * The race is harmless, since modifying bandwidth settings of unhooked group |
| 5047 | * bits doesn't do much. | 5199 | * bits doesn't do much. |
| @@ -5086,7 +5238,7 @@ static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq) | |||
| 5086 | */ | 5238 | */ |
| 5087 | cfs_rq->runtime_remaining = 1; | 5239 | cfs_rq->runtime_remaining = 1; |
| 5088 | /* | 5240 | /* |
| 5089 | * Offline rq is schedulable till cpu is completely disabled | 5241 | * Offline rq is schedulable till CPU is completely disabled |
| 5090 | * in take_cpu_down(), so we prevent new cfs throttling here. | 5242 | * in take_cpu_down(), so we prevent new cfs throttling here. |
| 5091 | */ | 5243 | */ |
| 5092 | cfs_rq->runtime_enabled = 0; | 5244 | cfs_rq->runtime_enabled = 0; |
| @@ -5245,6 +5397,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) | |||
| 5245 | if (!se) | 5397 | if (!se) |
| 5246 | add_nr_running(rq, 1); | 5398 | add_nr_running(rq, 1); |
| 5247 | 5399 | ||
| 5400 | util_est_enqueue(&rq->cfs, p); | ||
| 5248 | hrtick_update(rq); | 5401 | hrtick_update(rq); |
| 5249 | } | 5402 | } |
| 5250 | 5403 | ||
| @@ -5304,6 +5457,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) | |||
| 5304 | if (!se) | 5457 | if (!se) |
| 5305 | sub_nr_running(rq, 1); | 5458 | sub_nr_running(rq, 1); |
| 5306 | 5459 | ||
| 5460 | util_est_dequeue(&rq->cfs, p, task_sleep); | ||
| 5307 | hrtick_update(rq); | 5461 | hrtick_update(rq); |
| 5308 | } | 5462 | } |
| 5309 | 5463 | ||
| @@ -5323,8 +5477,8 @@ DEFINE_PER_CPU(cpumask_var_t, select_idle_mask); | |||
| 5323 | * | 5477 | * |
| 5324 | * load' = (1 - 1/2^i) * load + (1/2^i) * cur_load | 5478 | * load' = (1 - 1/2^i) * load + (1/2^i) * cur_load |
| 5325 | * | 5479 | * |
| 5326 | * If a cpu misses updates for n ticks (as it was idle) and update gets | 5480 | * If a CPU misses updates for n ticks (as it was idle) and update gets |
| 5327 | * called on the n+1-th tick when cpu may be busy, then we have: | 5481 | * called on the n+1-th tick when CPU may be busy, then we have: |
| 5328 | * | 5482 | * |
| 5329 | * load_n = (1 - 1/2^i)^n * load_0 | 5483 | * load_n = (1 - 1/2^i)^n * load_0 |
| 5330 | * load_n+1 = (1 - 1/2^i) * load_n + (1/2^i) * cur_load | 5484 | * load_n+1 = (1 - 1/2^i) * load_n + (1/2^i) * cur_load |
| @@ -5379,6 +5533,15 @@ decay_load_missed(unsigned long load, unsigned long missed_updates, int idx) | |||
| 5379 | } | 5533 | } |
| 5380 | return load; | 5534 | return load; |
| 5381 | } | 5535 | } |
| 5536 | |||
| 5537 | static struct { | ||
| 5538 | cpumask_var_t idle_cpus_mask; | ||
| 5539 | atomic_t nr_cpus; | ||
| 5540 | int has_blocked; /* Idle CPUs have blocked load */ | ||
| 5541 | unsigned long next_balance; /* in jiffy units */ | ||
| 5542 | unsigned long next_blocked; /* Next update of blocked load in jiffies */ | ||
| 5543 | } nohz ____cacheline_aligned; | ||
| 5544 | |||
| 5382 | #endif /* CONFIG_NO_HZ_COMMON */ | 5545 | #endif /* CONFIG_NO_HZ_COMMON */ |
| 5383 | 5546 | ||
| 5384 | /** | 5547 | /** |
| @@ -5468,7 +5631,7 @@ static unsigned long weighted_cpuload(struct rq *rq) | |||
| 5468 | #ifdef CONFIG_NO_HZ_COMMON | 5631 | #ifdef CONFIG_NO_HZ_COMMON |
| 5469 | /* | 5632 | /* |
| 5470 | * There is no sane way to deal with nohz on smp when using jiffies because the | 5633 | * There is no sane way to deal with nohz on smp when using jiffies because the |
| 5471 | * cpu doing the jiffies update might drift wrt the cpu doing the jiffy reading | 5634 | * CPU doing the jiffies update might drift wrt the CPU doing the jiffy reading |
| 5472 | * causing off-by-one errors in observed deltas; {0,2} instead of {1,1}. | 5635 | * causing off-by-one errors in observed deltas; {0,2} instead of {1,1}. |
| 5473 | * | 5636 | * |
| 5474 | * Therefore we need to avoid the delta approach from the regular tick when | 5637 | * Therefore we need to avoid the delta approach from the regular tick when |
| @@ -5579,7 +5742,7 @@ void cpu_load_update_active(struct rq *this_rq) | |||
| 5579 | } | 5742 | } |
| 5580 | 5743 | ||
| 5581 | /* | 5744 | /* |
| 5582 | * Return a low guess at the load of a migration-source cpu weighted | 5745 | * Return a low guess at the load of a migration-source CPU weighted |
| 5583 | * according to the scheduling class and "nice" value. | 5746 | * according to the scheduling class and "nice" value. |
| 5584 | * | 5747 | * |
| 5585 | * We want to under-estimate the load of migration sources, to | 5748 | * We want to under-estimate the load of migration sources, to |
| @@ -5597,7 +5760,7 @@ static unsigned long source_load(int cpu, int type) | |||
| 5597 | } | 5760 | } |
| 5598 | 5761 | ||
| 5599 | /* | 5762 | /* |
| 5600 | * Return a high guess at the load of a migration-target cpu weighted | 5763 | * Return a high guess at the load of a migration-target CPU weighted |
| 5601 | * according to the scheduling class and "nice" value. | 5764 | * according to the scheduling class and "nice" value. |
| 5602 | */ | 5765 | */ |
| 5603 | static unsigned long target_load(int cpu, int type) | 5766 | static unsigned long target_load(int cpu, int type) |
| @@ -5724,7 +5887,6 @@ wake_affine_weight(struct sched_domain *sd, struct task_struct *p, | |||
| 5724 | unsigned long task_load; | 5887 | unsigned long task_load; |
| 5725 | 5888 | ||
| 5726 | this_eff_load = target_load(this_cpu, sd->wake_idx); | 5889 | this_eff_load = target_load(this_cpu, sd->wake_idx); |
| 5727 | prev_eff_load = source_load(prev_cpu, sd->wake_idx); | ||
| 5728 | 5890 | ||
| 5729 | if (sync) { | 5891 | if (sync) { |
| 5730 | unsigned long current_load = task_h_load(current); | 5892 | unsigned long current_load = task_h_load(current); |
| @@ -5742,18 +5904,69 @@ wake_affine_weight(struct sched_domain *sd, struct task_struct *p, | |||
| 5742 | this_eff_load *= 100; | 5904 | this_eff_load *= 100; |
| 5743 | this_eff_load *= capacity_of(prev_cpu); | 5905 | this_eff_load *= capacity_of(prev_cpu); |
| 5744 | 5906 | ||
| 5907 | prev_eff_load = source_load(prev_cpu, sd->wake_idx); | ||
| 5745 | prev_eff_load -= task_load; | 5908 | prev_eff_load -= task_load; |
| 5746 | if (sched_feat(WA_BIAS)) | 5909 | if (sched_feat(WA_BIAS)) |
| 5747 | prev_eff_load *= 100 + (sd->imbalance_pct - 100) / 2; | 5910 | prev_eff_load *= 100 + (sd->imbalance_pct - 100) / 2; |
| 5748 | prev_eff_load *= capacity_of(this_cpu); | 5911 | prev_eff_load *= capacity_of(this_cpu); |
| 5749 | 5912 | ||
| 5750 | return this_eff_load <= prev_eff_load ? this_cpu : nr_cpumask_bits; | 5913 | /* |
| 5914 | * If sync, adjust the weight of prev_eff_load such that if | ||
| 5915 | * prev_eff == this_eff, select_idle_sibling() will consider | ||
| 5916 | * stacking the wakee on top of the waker if no other CPU is | ||
| 5917 | * idle. | ||
| 5918 | */ | ||
| 5919 | if (sync) | ||
| 5920 | prev_eff_load += 1; | ||
| 5921 | |||
| 5922 | return this_eff_load < prev_eff_load ? this_cpu : nr_cpumask_bits; | ||
| 5923 | } | ||
| 5924 | |||
| 5925 | #ifdef CONFIG_NUMA_BALANCING | ||
| 5926 | static void | ||
| 5927 | update_wa_numa_placement(struct task_struct *p, int prev_cpu, int target) | ||
| 5928 | { | ||
| 5929 | unsigned long interval; | ||
| 5930 | |||
| 5931 | if (!static_branch_likely(&sched_numa_balancing)) | ||
| 5932 | return; | ||
| 5933 | |||
| 5934 | /* If balancing has no preference then continue gathering data */ | ||
| 5935 | if (p->numa_preferred_nid == -1) | ||
| 5936 | return; | ||
| 5937 | |||
| 5938 | /* | ||
| 5939 | * If the wakeup is not affecting locality then it is neutral from | ||
| 5940 | * the perspective of NUMA balancing so continue gathering data. | ||
| 5941 | */ | ||
| 5942 | if (cpu_to_node(prev_cpu) == cpu_to_node(target)) | ||
| 5943 | return; | ||
| 5944 | |||
| 5945 | /* | ||
| 5946 | * Temporarily prevent NUMA balancing trying to place waker/wakee after | ||
| 5947 | * wakee has been moved by wake_affine. This will potentially allow | ||
| 5948 | * related tasks to converge and update their data placement. The | ||
| 5949 | * 4 * numa_scan_period is to allow the two-pass filter to migrate | ||
| 5950 | * hot data to the waker's node. | ||
| 5951 | */ | ||
| 5952 | interval = max(sysctl_numa_balancing_scan_delay, | ||
| 5953 | p->numa_scan_period << 2); | ||
| 5954 | p->numa_migrate_retry = jiffies + msecs_to_jiffies(interval); | ||
| 5955 | |||
| 5956 | interval = max(sysctl_numa_balancing_scan_delay, | ||
| 5957 | current->numa_scan_period << 2); | ||
| 5958 | current->numa_migrate_retry = jiffies + msecs_to_jiffies(interval); | ||
| 5751 | } | 5959 | } |
| 5960 | #else | ||
| 5961 | static void | ||
| 5962 | update_wa_numa_placement(struct task_struct *p, int prev_cpu, int target) | ||
| 5963 | { | ||
| 5964 | } | ||
| 5965 | #endif | ||
| 5752 | 5966 | ||
| 5753 | static int wake_affine(struct sched_domain *sd, struct task_struct *p, | 5967 | static int wake_affine(struct sched_domain *sd, struct task_struct *p, |
| 5754 | int prev_cpu, int sync) | 5968 | int this_cpu, int prev_cpu, int sync) |
| 5755 | { | 5969 | { |
| 5756 | int this_cpu = smp_processor_id(); | ||
| 5757 | int target = nr_cpumask_bits; | 5970 | int target = nr_cpumask_bits; |
| 5758 | 5971 | ||
| 5759 | if (sched_feat(WA_IDLE)) | 5972 | if (sched_feat(WA_IDLE)) |
| @@ -5766,12 +5979,12 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, | |||
| 5766 | if (target == nr_cpumask_bits) | 5979 | if (target == nr_cpumask_bits) |
| 5767 | return prev_cpu; | 5980 | return prev_cpu; |
| 5768 | 5981 | ||
| 5982 | update_wa_numa_placement(p, prev_cpu, target); | ||
| 5769 | schedstat_inc(sd->ttwu_move_affine); | 5983 | schedstat_inc(sd->ttwu_move_affine); |
| 5770 | schedstat_inc(p->se.statistics.nr_wakeups_affine); | 5984 | schedstat_inc(p->se.statistics.nr_wakeups_affine); |
| 5771 | return target; | 5985 | return target; |
| 5772 | } | 5986 | } |
| 5773 | 5987 | ||
| 5774 | static inline unsigned long task_util(struct task_struct *p); | ||
| 5775 | static unsigned long cpu_util_wake(int cpu, struct task_struct *p); | 5988 | static unsigned long cpu_util_wake(int cpu, struct task_struct *p); |
| 5776 | 5989 | ||
| 5777 | static unsigned long capacity_spare_wake(int cpu, struct task_struct *p) | 5990 | static unsigned long capacity_spare_wake(int cpu, struct task_struct *p) |
| @@ -5826,7 +6039,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, | |||
| 5826 | max_spare_cap = 0; | 6039 | max_spare_cap = 0; |
| 5827 | 6040 | ||
| 5828 | for_each_cpu(i, sched_group_span(group)) { | 6041 | for_each_cpu(i, sched_group_span(group)) { |
| 5829 | /* Bias balancing toward cpus of our domain */ | 6042 | /* Bias balancing toward CPUs of our domain */ |
| 5830 | if (local_group) | 6043 | if (local_group) |
| 5831 | load = source_load(i, load_idx); | 6044 | load = source_load(i, load_idx); |
| 5832 | else | 6045 | else |
| @@ -5856,7 +6069,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, | |||
| 5856 | if (min_runnable_load > (runnable_load + imbalance)) { | 6069 | if (min_runnable_load > (runnable_load + imbalance)) { |
| 5857 | /* | 6070 | /* |
| 5858 | * The runnable load is significantly smaller | 6071 | * The runnable load is significantly smaller |
| 5859 | * so we can pick this new cpu | 6072 | * so we can pick this new CPU: |
| 5860 | */ | 6073 | */ |
| 5861 | min_runnable_load = runnable_load; | 6074 | min_runnable_load = runnable_load; |
| 5862 | min_avg_load = avg_load; | 6075 | min_avg_load = avg_load; |
| @@ -5865,7 +6078,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, | |||
| 5865 | (100*min_avg_load > imbalance_scale*avg_load)) { | 6078 | (100*min_avg_load > imbalance_scale*avg_load)) { |
| 5866 | /* | 6079 | /* |
| 5867 | * The runnable loads are close so take the | 6080 | * The runnable loads are close so take the |
| 5868 | * blocked load into account through avg_load. | 6081 | * blocked load into account through avg_load: |
| 5869 | */ | 6082 | */ |
| 5870 | min_avg_load = avg_load; | 6083 | min_avg_load = avg_load; |
| 5871 | idlest = group; | 6084 | idlest = group; |
| @@ -5903,6 +6116,18 @@ skip_spare: | |||
| 5903 | if (!idlest) | 6116 | if (!idlest) |
| 5904 | return NULL; | 6117 | return NULL; |
| 5905 | 6118 | ||
| 6119 | /* | ||
| 6120 | * When comparing groups across NUMA domains, it's possible for the | ||
| 6121 | * local domain to be very lightly loaded relative to the remote | ||
| 6122 | * domains but "imbalance" skews the comparison making remote CPUs | ||
| 6123 | * look much more favourable. When considering cross-domain, add | ||
| 6124 | * imbalance to the runnable load on the remote node and consider | ||
| 6125 | * staying local. | ||
| 6126 | */ | ||
| 6127 | if ((sd->flags & SD_NUMA) && | ||
| 6128 | min_runnable_load + imbalance >= this_runnable_load) | ||
| 6129 | return NULL; | ||
| 6130 | |||
| 5906 | if (min_runnable_load > (this_runnable_load + imbalance)) | 6131 | if (min_runnable_load > (this_runnable_load + imbalance)) |
| 5907 | return NULL; | 6132 | return NULL; |
| 5908 | 6133 | ||
| @@ -5914,7 +6139,7 @@ skip_spare: | |||
| 5914 | } | 6139 | } |
| 5915 | 6140 | ||
| 5916 | /* | 6141 | /* |
| 5917 | * find_idlest_group_cpu - find the idlest cpu among the cpus in group. | 6142 | * find_idlest_group_cpu - find the idlest CPU among the CPUs in the group. |
| 5918 | */ | 6143 | */ |
| 5919 | static int | 6144 | static int |
| 5920 | find_idlest_group_cpu(struct sched_group *group, struct task_struct *p, int this_cpu) | 6145 | find_idlest_group_cpu(struct sched_group *group, struct task_struct *p, int this_cpu) |
| @@ -5992,12 +6217,12 @@ static inline int find_idlest_cpu(struct sched_domain *sd, struct task_struct *p | |||
| 5992 | 6217 | ||
| 5993 | new_cpu = find_idlest_group_cpu(group, p, cpu); | 6218 | new_cpu = find_idlest_group_cpu(group, p, cpu); |
| 5994 | if (new_cpu == cpu) { | 6219 | if (new_cpu == cpu) { |
| 5995 | /* Now try balancing at a lower domain level of cpu */ | 6220 | /* Now try balancing at a lower domain level of 'cpu': */ |
| 5996 | sd = sd->child; | 6221 | sd = sd->child; |
| 5997 | continue; | 6222 | continue; |
| 5998 | } | 6223 | } |
| 5999 | 6224 | ||
| 6000 | /* Now try balancing at a lower domain level of new_cpu */ | 6225 | /* Now try balancing at a lower domain level of 'new_cpu': */ |
| 6001 | cpu = new_cpu; | 6226 | cpu = new_cpu; |
| 6002 | weight = sd->span_weight; | 6227 | weight = sd->span_weight; |
| 6003 | sd = NULL; | 6228 | sd = NULL; |
| @@ -6007,7 +6232,6 @@ static inline int find_idlest_cpu(struct sched_domain *sd, struct task_struct *p | |||
| 6007 | if (tmp->flags & sd_flag) | 6232 | if (tmp->flags & sd_flag) |
| 6008 | sd = tmp; | 6233 | sd = tmp; |
| 6009 | } | 6234 | } |
| 6010 | /* while loop will break here if sd == NULL */ | ||
| 6011 | } | 6235 | } |
| 6012 | 6236 | ||
| 6013 | return new_cpu; | 6237 | return new_cpu; |
| @@ -6203,12 +6427,12 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) | |||
| 6203 | return target; | 6427 | return target; |
| 6204 | 6428 | ||
| 6205 | /* | 6429 | /* |
| 6206 | * If the previous cpu is cache affine and idle, don't be stupid. | 6430 | * If the previous CPU is cache affine and idle, don't be stupid: |
| 6207 | */ | 6431 | */ |
| 6208 | if (prev != target && cpus_share_cache(prev, target) && idle_cpu(prev)) | 6432 | if (prev != target && cpus_share_cache(prev, target) && idle_cpu(prev)) |
| 6209 | return prev; | 6433 | return prev; |
| 6210 | 6434 | ||
| 6211 | /* Check a recently used CPU as a potential idle candidate */ | 6435 | /* Check a recently used CPU as a potential idle candidate: */ |
| 6212 | recent_used_cpu = p->recent_used_cpu; | 6436 | recent_used_cpu = p->recent_used_cpu; |
| 6213 | if (recent_used_cpu != prev && | 6437 | if (recent_used_cpu != prev && |
| 6214 | recent_used_cpu != target && | 6438 | recent_used_cpu != target && |
| @@ -6217,7 +6441,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) | |||
| 6217 | cpumask_test_cpu(p->recent_used_cpu, &p->cpus_allowed)) { | 6441 | cpumask_test_cpu(p->recent_used_cpu, &p->cpus_allowed)) { |
| 6218 | /* | 6442 | /* |
| 6219 | * Replace recent_used_cpu with prev as it is a potential | 6443 | * Replace recent_used_cpu with prev as it is a potential |
| 6220 | * candidate for the next wake. | 6444 | * candidate for the next wake: |
| 6221 | */ | 6445 | */ |
| 6222 | p->recent_used_cpu = prev; | 6446 | p->recent_used_cpu = prev; |
| 6223 | return recent_used_cpu; | 6447 | return recent_used_cpu; |
| @@ -6242,11 +6466,13 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) | |||
| 6242 | return target; | 6466 | return target; |
| 6243 | } | 6467 | } |
| 6244 | 6468 | ||
| 6245 | /* | 6469 | /** |
| 6246 | * cpu_util returns the amount of capacity of a CPU that is used by CFS | 6470 | * Amount of capacity of a CPU that is (estimated to be) used by CFS tasks |
| 6247 | * tasks. The unit of the return value must be the one of capacity so we can | 6471 | * @cpu: the CPU to get the utilization of |
| 6248 | * compare the utilization with the capacity of the CPU that is available for | 6472 | * |
| 6249 | * CFS task (ie cpu_capacity). | 6473 | * The unit of the return value must be the one of capacity so we can compare |
| 6474 | * the utilization with the capacity of the CPU that is available for CFS task | ||
| 6475 | * (ie cpu_capacity). | ||
| 6250 | * | 6476 | * |
| 6251 | * cfs_rq.avg.util_avg is the sum of running time of runnable tasks plus the | 6477 | * cfs_rq.avg.util_avg is the sum of running time of runnable tasks plus the |
| 6252 | * recent utilization of currently non-runnable tasks on a CPU. It represents | 6478 | * recent utilization of currently non-runnable tasks on a CPU. It represents |
| @@ -6257,6 +6483,14 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) | |||
| 6257 | * current capacity (capacity_curr <= capacity_orig) of the CPU because it is | 6483 | * current capacity (capacity_curr <= capacity_orig) of the CPU because it is |
| 6258 | * the running time on this CPU scaled by capacity_curr. | 6484 | * the running time on this CPU scaled by capacity_curr. |
| 6259 | * | 6485 | * |
| 6486 | * The estimated utilization of a CPU is defined to be the maximum between its | ||
| 6487 | * cfs_rq.avg.util_avg and the sum of the estimated utilization of the tasks | ||
| 6488 | * currently RUNNABLE on that CPU. | ||
| 6489 | * This allows a proper representation of the expected utilization of a CPU which | ||
| 6490 | * has just got a big task running after a long sleep period. At the same time | ||
| 6491 | * however it preserves the benefits of the "blocked utilization" in | ||
| 6492 | * describing the potential for other tasks waking up on the same CPU. | ||
| 6493 | * | ||
| 6260 | * Nevertheless, cfs_rq.avg.util_avg can be higher than capacity_curr or even | 6494 | * Nevertheless, cfs_rq.avg.util_avg can be higher than capacity_curr or even |
| 6261 | * higher than capacity_orig because of unfortunate rounding in | 6495 | * higher than capacity_orig because of unfortunate rounding in |
| 6262 | * cfs.avg.util_avg or just after migrating tasks and new task wakeups until | 6496 | * cfs.avg.util_avg or just after migrating tasks and new task wakeups until |
| @@ -6267,36 +6501,77 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) | |||
| 6267 | * available capacity. We allow utilization to overshoot capacity_curr (but not | 6501 | * available capacity. We allow utilization to overshoot capacity_curr (but not |
| 6268 | * capacity_orig) as it useful for predicting the capacity required after task | 6502 | * capacity_orig) as it useful for predicting the capacity required after task |
| 6269 | * migrations (scheduler-driven DVFS). | 6503 | * migrations (scheduler-driven DVFS). |
| 6504 | * | ||
| 6505 | * Return: the (estimated) utilization for the specified CPU | ||
| 6270 | */ | 6506 | */ |
| 6271 | static unsigned long cpu_util(int cpu) | 6507 | static inline unsigned long cpu_util(int cpu) |
| 6272 | { | 6508 | { |
| 6273 | unsigned long util = cpu_rq(cpu)->cfs.avg.util_avg; | 6509 | struct cfs_rq *cfs_rq; |
| 6274 | unsigned long capacity = capacity_orig_of(cpu); | 6510 | unsigned int util; |
| 6275 | 6511 | ||
| 6276 | return (util >= capacity) ? capacity : util; | 6512 | cfs_rq = &cpu_rq(cpu)->cfs; |
| 6277 | } | 6513 | util = READ_ONCE(cfs_rq->avg.util_avg); |
| 6278 | 6514 | ||
| 6279 | static inline unsigned long task_util(struct task_struct *p) | 6515 | if (sched_feat(UTIL_EST)) |
| 6280 | { | 6516 | util = max(util, READ_ONCE(cfs_rq->avg.util_est.enqueued)); |
| 6281 | return p->se.avg.util_avg; | 6517 | |
| 6518 | return min_t(unsigned long, util, capacity_orig_of(cpu)); | ||
| 6282 | } | 6519 | } |
| 6283 | 6520 | ||
| 6284 | /* | 6521 | /* |
| 6285 | * cpu_util_wake: Compute cpu utilization with any contributions from | 6522 | * cpu_util_wake: Compute CPU utilization with any contributions from |
| 6286 | * the waking task p removed. | 6523 | * the waking task p removed. |
| 6287 | */ | 6524 | */ |
| 6288 | static unsigned long cpu_util_wake(int cpu, struct task_struct *p) | 6525 | static unsigned long cpu_util_wake(int cpu, struct task_struct *p) |
| 6289 | { | 6526 | { |
| 6290 | unsigned long util, capacity; | 6527 | struct cfs_rq *cfs_rq; |
| 6528 | unsigned int util; | ||
| 6291 | 6529 | ||
| 6292 | /* Task has no contribution or is new */ | 6530 | /* Task has no contribution or is new */ |
| 6293 | if (cpu != task_cpu(p) || !p->se.avg.last_update_time) | 6531 | if (cpu != task_cpu(p) || !READ_ONCE(p->se.avg.last_update_time)) |
| 6294 | return cpu_util(cpu); | 6532 | return cpu_util(cpu); |
| 6295 | 6533 | ||
| 6296 | capacity = capacity_orig_of(cpu); | 6534 | cfs_rq = &cpu_rq(cpu)->cfs; |
| 6297 | util = max_t(long, cpu_rq(cpu)->cfs.avg.util_avg - task_util(p), 0); | 6535 | util = READ_ONCE(cfs_rq->avg.util_avg); |
| 6298 | 6536 | ||
| 6299 | return (util >= capacity) ? capacity : util; | 6537 | /* Discount task's blocked util from CPU's util */ |
| 6538 | util -= min_t(unsigned int, util, task_util(p)); | ||
| 6539 | |||
| 6540 | /* | ||
| 6541 | * Covered cases: | ||
| 6542 | * | ||
| 6543 | * a) if *p is the only task sleeping on this CPU, then: | ||
| 6544 | * cpu_util (== task_util) > util_est (== 0) | ||
| 6545 | * and thus we return: | ||
| 6546 | * cpu_util_wake = (cpu_util - task_util) = 0 | ||
| 6547 | * | ||
| 6548 | * b) if other tasks are SLEEPING on this CPU, which is now exiting | ||
| 6549 | * IDLE, then: | ||
| 6550 | * cpu_util >= task_util | ||
| 6551 | * cpu_util > util_est (== 0) | ||
| 6552 | * and thus we discount *p's blocked utilization to return: | ||
| 6553 | * cpu_util_wake = (cpu_util - task_util) >= 0 | ||
| 6554 | * | ||
| 6555 | * c) if other tasks are RUNNABLE on that CPU and | ||
| 6556 | * util_est > cpu_util | ||
| 6557 | * then we use util_est since it returns a more restrictive | ||
| 6558 | * estimation of the spare capacity on that CPU, by just | ||
| 6559 | * considering the expected utilization of tasks already | ||
| 6560 | * runnable on that CPU. | ||
| 6561 | * | ||
| 6562 | * Cases a) and b) are covered by the above code, while case c) is | ||
| 6563 | * covered by the following code when estimated utilization is | ||
| 6564 | * enabled. | ||
| 6565 | */ | ||
| 6566 | if (sched_feat(UTIL_EST)) | ||
| 6567 | util = max(util, READ_ONCE(cfs_rq->avg.util_est.enqueued)); | ||
| 6568 | |||
| 6569 | /* | ||
| 6570 | * Utilization (estimated) can exceed the CPU capacity, thus let's | ||
| 6571 | * clamp to the maximum CPU capacity to ensure consistency with | ||
| 6572 | * the cpu_util call. | ||
| 6573 | */ | ||
| 6574 | return min_t(unsigned long, util, capacity_orig_of(cpu)); | ||
| 6300 | } | 6575 | } |
| 6301 | 6576 | ||
| 6302 | /* | 6577 | /* |
| @@ -6328,10 +6603,10 @@ static int wake_cap(struct task_struct *p, int cpu, int prev_cpu) | |||
| 6328 | * that have the 'sd_flag' flag set. In practice, this is SD_BALANCE_WAKE, | 6603 | * that have the 'sd_flag' flag set. In practice, this is SD_BALANCE_WAKE, |
| 6329 | * SD_BALANCE_FORK, or SD_BALANCE_EXEC. | 6604 | * SD_BALANCE_FORK, or SD_BALANCE_EXEC. |
| 6330 | * | 6605 | * |
| 6331 | * Balances load by selecting the idlest cpu in the idlest group, or under | 6606 | * Balances load by selecting the idlest CPU in the idlest group, or under |
| 6332 | * certain conditions an idle sibling cpu if the domain has SD_WAKE_AFFINE set. | 6607 | * certain conditions an idle sibling CPU if the domain has SD_WAKE_AFFINE set. |
| 6333 | * | 6608 | * |
| 6334 | * Returns the target cpu number. | 6609 | * Returns the target CPU number. |
| 6335 | * | 6610 | * |
| 6336 | * preempt must be disabled. | 6611 | * preempt must be disabled. |
| 6337 | */ | 6612 | */ |
| @@ -6342,7 +6617,7 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f | |||
| 6342 | int cpu = smp_processor_id(); | 6617 | int cpu = smp_processor_id(); |
| 6343 | int new_cpu = prev_cpu; | 6618 | int new_cpu = prev_cpu; |
| 6344 | int want_affine = 0; | 6619 | int want_affine = 0; |
| 6345 | int sync = wake_flags & WF_SYNC; | 6620 | int sync = (wake_flags & WF_SYNC) && !(current->flags & PF_EXITING); |
| 6346 | 6621 | ||
| 6347 | if (sd_flag & SD_BALANCE_WAKE) { | 6622 | if (sd_flag & SD_BALANCE_WAKE) { |
| 6348 | record_wakee(p); | 6623 | record_wakee(p); |
| @@ -6356,7 +6631,7 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f | |||
| 6356 | break; | 6631 | break; |
| 6357 | 6632 | ||
| 6358 | /* | 6633 | /* |
| 6359 | * If both cpu and prev_cpu are part of this domain, | 6634 | * If both 'cpu' and 'prev_cpu' are part of this domain, |
| 6360 | * cpu is a valid SD_WAKE_AFFINE target. | 6635 | * cpu is a valid SD_WAKE_AFFINE target. |
| 6361 | */ | 6636 | */ |
| 6362 | if (want_affine && (tmp->flags & SD_WAKE_AFFINE) && | 6637 | if (want_affine && (tmp->flags & SD_WAKE_AFFINE) && |
| @@ -6376,7 +6651,7 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f | |||
| 6376 | if (cpu == prev_cpu) | 6651 | if (cpu == prev_cpu) |
| 6377 | goto pick_cpu; | 6652 | goto pick_cpu; |
| 6378 | 6653 | ||
| 6379 | new_cpu = wake_affine(affine_sd, p, prev_cpu, sync); | 6654 | new_cpu = wake_affine(affine_sd, p, cpu, prev_cpu, sync); |
| 6380 | } | 6655 | } |
| 6381 | 6656 | ||
| 6382 | if (sd && !(sd_flag & SD_BALANCE_FORK)) { | 6657 | if (sd && !(sd_flag & SD_BALANCE_FORK)) { |
| @@ -6407,9 +6682,9 @@ pick_cpu: | |||
| 6407 | static void detach_entity_cfs_rq(struct sched_entity *se); | 6682 | static void detach_entity_cfs_rq(struct sched_entity *se); |
| 6408 | 6683 | ||
| 6409 | /* | 6684 | /* |
| 6410 | * Called immediately before a task is migrated to a new cpu; task_cpu(p) and | 6685 | * Called immediately before a task is migrated to a new CPU; task_cpu(p) and |
| 6411 | * cfs_rq_of(p) references at time of call are still valid and identify the | 6686 | * cfs_rq_of(p) references at time of call are still valid and identify the |
| 6412 | * previous cpu. The caller guarantees p->pi_lock or task_rq(p)->lock is held. | 6687 | * previous CPU. The caller guarantees p->pi_lock or task_rq(p)->lock is held. |
| 6413 | */ | 6688 | */ |
| 6414 | static void migrate_task_rq_fair(struct task_struct *p) | 6689 | static void migrate_task_rq_fair(struct task_struct *p) |
| 6415 | { | 6690 | { |
| @@ -6738,7 +7013,7 @@ simple: | |||
| 6738 | 7013 | ||
| 6739 | p = task_of(se); | 7014 | p = task_of(se); |
| 6740 | 7015 | ||
| 6741 | done: __maybe_unused | 7016 | done: __maybe_unused; |
| 6742 | #ifdef CONFIG_SMP | 7017 | #ifdef CONFIG_SMP |
| 6743 | /* | 7018 | /* |
| 6744 | * Move the next running task to the front of | 7019 | * Move the next running task to the front of |
| @@ -6843,17 +7118,17 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preemp | |||
| 6843 | * BASICS | 7118 | * BASICS |
| 6844 | * | 7119 | * |
| 6845 | * The purpose of load-balancing is to achieve the same basic fairness the | 7120 | * The purpose of load-balancing is to achieve the same basic fairness the |
| 6846 | * per-cpu scheduler provides, namely provide a proportional amount of compute | 7121 | * per-CPU scheduler provides, namely provide a proportional amount of compute |
| 6847 | * time to each task. This is expressed in the following equation: | 7122 | * time to each task. This is expressed in the following equation: |
| 6848 | * | 7123 | * |
| 6849 | * W_i,n/P_i == W_j,n/P_j for all i,j (1) | 7124 | * W_i,n/P_i == W_j,n/P_j for all i,j (1) |
| 6850 | * | 7125 | * |
| 6851 | * Where W_i,n is the n-th weight average for cpu i. The instantaneous weight | 7126 | * Where W_i,n is the n-th weight average for CPU i. The instantaneous weight |
| 6852 | * W_i,0 is defined as: | 7127 | * W_i,0 is defined as: |
| 6853 | * | 7128 | * |
| 6854 | * W_i,0 = \Sum_j w_i,j (2) | 7129 | * W_i,0 = \Sum_j w_i,j (2) |
| 6855 | * | 7130 | * |
| 6856 | * Where w_i,j is the weight of the j-th runnable task on cpu i. This weight | 7131 | * Where w_i,j is the weight of the j-th runnable task on CPU i. This weight |
| 6857 | * is derived from the nice value as per sched_prio_to_weight[]. | 7132 | * is derived from the nice value as per sched_prio_to_weight[]. |
| 6858 | * | 7133 | * |
| 6859 | * The weight average is an exponential decay average of the instantaneous | 7134 | * The weight average is an exponential decay average of the instantaneous |
| @@ -6861,7 +7136,7 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preemp | |||
| 6861 | * | 7136 | * |
| 6862 | * W'_i,n = (2^n - 1) / 2^n * W_i,n + 1 / 2^n * W_i,0 (3) | 7137 | * W'_i,n = (2^n - 1) / 2^n * W_i,n + 1 / 2^n * W_i,0 (3) |
| 6863 | * | 7138 | * |
| 6864 | * C_i is the compute capacity of cpu i, typically it is the | 7139 | * C_i is the compute capacity of CPU i, typically it is the |
| 6865 | * fraction of 'recent' time available for SCHED_OTHER task execution. But it | 7140 | * fraction of 'recent' time available for SCHED_OTHER task execution. But it |
| 6866 | * can also include other factors [XXX]. | 7141 | * can also include other factors [XXX]. |
| 6867 | * | 7142 | * |
| @@ -6882,11 +7157,11 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preemp | |||
| 6882 | * SCHED DOMAINS | 7157 | * SCHED DOMAINS |
| 6883 | * | 7158 | * |
| 6884 | * In order to solve the imbalance equation (4), and avoid the obvious O(n^2) | 7159 | * In order to solve the imbalance equation (4), and avoid the obvious O(n^2) |
| 6885 | * for all i,j solution, we create a tree of cpus that follows the hardware | 7160 | * for all i,j solution, we create a tree of CPUs that follows the hardware |
| 6886 | * topology where each level pairs two lower groups (or better). This results | 7161 | * topology where each level pairs two lower groups (or better). This results |
| 6887 | * in O(log n) layers. Furthermore we reduce the number of cpus going up the | 7162 | * in O(log n) layers. Furthermore we reduce the number of CPUs going up the |
| 6888 | * tree to only the first of the previous level and we decrease the frequency | 7163 | * tree to only the first of the previous level and we decrease the frequency |
| 6889 | * of load-balance at each level inv. proportional to the number of cpus in | 7164 | * of load-balance at each level inv. proportional to the number of CPUs in |
| 6890 | * the groups. | 7165 | * the groups. |
| 6891 | * | 7166 | * |
| 6892 | * This yields: | 7167 | * This yields: |
| @@ -6895,7 +7170,7 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preemp | |||
| 6895 | * \Sum { --- * --- * 2^i } = O(n) (5) | 7170 | * \Sum { --- * --- * 2^i } = O(n) (5) |
| 6896 | * i = 0 2^i 2^i | 7171 | * i = 0 2^i 2^i |
| 6897 | * `- size of each group | 7172 | * `- size of each group |
| 6898 | * | | `- number of cpus doing load-balance | 7173 | * | | `- number of CPUs doing load-balance |
| 6899 | * | `- freq | 7174 | * | `- freq |
| 6900 | * `- sum over all levels | 7175 | * `- sum over all levels |
| 6901 | * | 7176 | * |
| @@ -6903,7 +7178,7 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preemp | |||
| 6903 | * this makes (5) the runtime complexity of the balancer. | 7178 | * this makes (5) the runtime complexity of the balancer. |
| 6904 | * | 7179 | * |
| 6905 | * An important property here is that each CPU is still (indirectly) connected | 7180 | * An important property here is that each CPU is still (indirectly) connected |
| 6906 | * to every other cpu in at most O(log n) steps: | 7181 | * to every other CPU in at most O(log n) steps: |
| 6907 | * | 7182 | * |
| 6908 | * The adjacency matrix of the resulting graph is given by: | 7183 | * The adjacency matrix of the resulting graph is given by: |
| 6909 | * | 7184 | * |
| @@ -6915,7 +7190,7 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preemp | |||
| 6915 | * | 7190 | * |
| 6916 | * A^(log_2 n)_i,j != 0 for all i,j (7) | 7191 | * A^(log_2 n)_i,j != 0 for all i,j (7) |
| 6917 | * | 7192 | * |
| 6918 | * Showing there's indeed a path between every cpu in at most O(log n) steps. | 7193 | * Showing there's indeed a path between every CPU in at most O(log n) steps. |
| 6919 | * The task movement gives a factor of O(m), giving a convergence complexity | 7194 | * The task movement gives a factor of O(m), giving a convergence complexity |
| 6920 | * of: | 7195 | * of: |
| 6921 | * | 7196 | * |
| @@ -6925,7 +7200,7 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preemp | |||
| 6925 | * WORK CONSERVING | 7200 | * WORK CONSERVING |
| 6926 | * | 7201 | * |
| 6927 | * In order to avoid CPUs going idle while there's still work to do, new idle | 7202 | * In order to avoid CPUs going idle while there's still work to do, new idle |
| 6928 | * balancing is more aggressive and has the newly idle cpu iterate up the domain | 7203 | * balancing is more aggressive and has the newly idle CPU iterate up the domain |
| 6929 | * tree itself instead of relying on other CPUs to bring it work. | 7204 | * tree itself instead of relying on other CPUs to bring it work. |
| 6930 | * | 7205 | * |
| 6931 | * This adds some complexity to both (5) and (8) but it reduces the total idle | 7206 | * This adds some complexity to both (5) and (8) but it reduces the total idle |
| @@ -6946,7 +7221,7 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preemp | |||
| 6946 | * | 7221 | * |
| 6947 | * s_k,i = \Sum_j w_i,j,k and S_k = \Sum_i s_k,i (10) | 7222 | * s_k,i = \Sum_j w_i,j,k and S_k = \Sum_i s_k,i (10) |
| 6948 | * | 7223 | * |
| 6949 | * w_i,j,k is the weight of the j-th runnable task in the k-th cgroup on cpu i. | 7224 | * w_i,j,k is the weight of the j-th runnable task in the k-th cgroup on CPU i. |
| 6950 | * | 7225 | * |
| 6951 | * The big problem is S_k, its a global sum needed to compute a local (W_i) | 7226 | * The big problem is S_k, its a global sum needed to compute a local (W_i) |
| 6952 | * property. | 7227 | * property. |
| @@ -6963,6 +7238,8 @@ enum fbq_type { regular, remote, all }; | |||
| 6963 | #define LBF_NEED_BREAK 0x02 | 7238 | #define LBF_NEED_BREAK 0x02 |
| 6964 | #define LBF_DST_PINNED 0x04 | 7239 | #define LBF_DST_PINNED 0x04 |
| 6965 | #define LBF_SOME_PINNED 0x08 | 7240 | #define LBF_SOME_PINNED 0x08 |
| 7241 | #define LBF_NOHZ_STATS 0x10 | ||
| 7242 | #define LBF_NOHZ_AGAIN 0x20 | ||
| 6966 | 7243 | ||
| 6967 | struct lb_env { | 7244 | struct lb_env { |
| 6968 | struct sched_domain *sd; | 7245 | struct sched_domain *sd; |
| @@ -7110,7 +7387,7 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) | |||
| 7110 | env->flags |= LBF_SOME_PINNED; | 7387 | env->flags |= LBF_SOME_PINNED; |
| 7111 | 7388 | ||
| 7112 | /* | 7389 | /* |
| 7113 | * Remember if this task can be migrated to any other cpu in | 7390 | * Remember if this task can be migrated to any other CPU in |
| 7114 | * our sched_group. We may want to revisit it if we couldn't | 7391 | * our sched_group. We may want to revisit it if we couldn't |
| 7115 | * meet load balance goals by pulling other tasks on src_cpu. | 7392 | * meet load balance goals by pulling other tasks on src_cpu. |
| 7116 | * | 7393 | * |
| @@ -7120,7 +7397,7 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) | |||
| 7120 | if (env->idle == CPU_NEWLY_IDLE || (env->flags & LBF_DST_PINNED)) | 7397 | if (env->idle == CPU_NEWLY_IDLE || (env->flags & LBF_DST_PINNED)) |
| 7121 | return 0; | 7398 | return 0; |
| 7122 | 7399 | ||
| 7123 | /* Prevent to re-select dst_cpu via env's cpus */ | 7400 | /* Prevent to re-select dst_cpu via env's CPUs: */ |
| 7124 | for_each_cpu_and(cpu, env->dst_grpmask, env->cpus) { | 7401 | for_each_cpu_and(cpu, env->dst_grpmask, env->cpus) { |
| 7125 | if (cpumask_test_cpu(cpu, &p->cpus_allowed)) { | 7402 | if (cpumask_test_cpu(cpu, &p->cpus_allowed)) { |
| 7126 | env->flags |= LBF_DST_PINNED; | 7403 | env->flags |= LBF_DST_PINNED; |
| @@ -7347,6 +7624,17 @@ static void attach_tasks(struct lb_env *env) | |||
| 7347 | rq_unlock(env->dst_rq, &rf); | 7624 | rq_unlock(env->dst_rq, &rf); |
| 7348 | } | 7625 | } |
| 7349 | 7626 | ||
| 7627 | static inline bool cfs_rq_has_blocked(struct cfs_rq *cfs_rq) | ||
| 7628 | { | ||
| 7629 | if (cfs_rq->avg.load_avg) | ||
| 7630 | return true; | ||
| 7631 | |||
| 7632 | if (cfs_rq->avg.util_avg) | ||
| 7633 | return true; | ||
| 7634 | |||
| 7635 | return false; | ||
| 7636 | } | ||
| 7637 | |||
| 7350 | #ifdef CONFIG_FAIR_GROUP_SCHED | 7638 | #ifdef CONFIG_FAIR_GROUP_SCHED |
| 7351 | 7639 | ||
| 7352 | static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq) | 7640 | static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq) |
| @@ -7371,6 +7659,7 @@ static void update_blocked_averages(int cpu) | |||
| 7371 | struct rq *rq = cpu_rq(cpu); | 7659 | struct rq *rq = cpu_rq(cpu); |
| 7372 | struct cfs_rq *cfs_rq, *pos; | 7660 | struct cfs_rq *cfs_rq, *pos; |
| 7373 | struct rq_flags rf; | 7661 | struct rq_flags rf; |
| 7662 | bool done = true; | ||
| 7374 | 7663 | ||
| 7375 | rq_lock_irqsave(rq, &rf); | 7664 | rq_lock_irqsave(rq, &rf); |
| 7376 | update_rq_clock(rq); | 7665 | update_rq_clock(rq); |
| @@ -7400,7 +7689,17 @@ static void update_blocked_averages(int cpu) | |||
| 7400 | */ | 7689 | */ |
| 7401 | if (cfs_rq_is_decayed(cfs_rq)) | 7690 | if (cfs_rq_is_decayed(cfs_rq)) |
| 7402 | list_del_leaf_cfs_rq(cfs_rq); | 7691 | list_del_leaf_cfs_rq(cfs_rq); |
| 7692 | |||
| 7693 | /* Don't need periodic decay once load/util_avg are null */ | ||
| 7694 | if (cfs_rq_has_blocked(cfs_rq)) | ||
| 7695 | done = false; | ||
| 7403 | } | 7696 | } |
| 7697 | |||
| 7698 | #ifdef CONFIG_NO_HZ_COMMON | ||
| 7699 | rq->last_blocked_load_update_tick = jiffies; | ||
| 7700 | if (done) | ||
| 7701 | rq->has_blocked_load = 0; | ||
| 7702 | #endif | ||
| 7404 | rq_unlock_irqrestore(rq, &rf); | 7703 | rq_unlock_irqrestore(rq, &rf); |
| 7405 | } | 7704 | } |
| 7406 | 7705 | ||
| @@ -7460,6 +7759,11 @@ static inline void update_blocked_averages(int cpu) | |||
| 7460 | rq_lock_irqsave(rq, &rf); | 7759 | rq_lock_irqsave(rq, &rf); |
| 7461 | update_rq_clock(rq); | 7760 | update_rq_clock(rq); |
| 7462 | update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq); | 7761 | update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq); |
| 7762 | #ifdef CONFIG_NO_HZ_COMMON | ||
| 7763 | rq->last_blocked_load_update_tick = jiffies; | ||
| 7764 | if (!cfs_rq_has_blocked(cfs_rq)) | ||
| 7765 | rq->has_blocked_load = 0; | ||
| 7766 | #endif | ||
| 7463 | rq_unlock_irqrestore(rq, &rf); | 7767 | rq_unlock_irqrestore(rq, &rf); |
| 7464 | } | 7768 | } |
| 7465 | 7769 | ||
| @@ -7694,8 +7998,8 @@ check_cpu_capacity(struct rq *rq, struct sched_domain *sd) | |||
| 7694 | * Group imbalance indicates (and tries to solve) the problem where balancing | 7998 | * Group imbalance indicates (and tries to solve) the problem where balancing |
| 7695 | * groups is inadequate due to ->cpus_allowed constraints. | 7999 | * groups is inadequate due to ->cpus_allowed constraints. |
| 7696 | * | 8000 | * |
| 7697 | * Imagine a situation of two groups of 4 cpus each and 4 tasks each with a | 8001 | * Imagine a situation of two groups of 4 CPUs each and 4 tasks each with a |
| 7698 | * cpumask covering 1 cpu of the first group and 3 cpus of the second group. | 8002 | * cpumask covering 1 CPU of the first group and 3 CPUs of the second group. |
| 7699 | * Something like: | 8003 | * Something like: |
| 7700 | * | 8004 | * |
| 7701 | * { 0 1 2 3 } { 4 5 6 7 } | 8005 | * { 0 1 2 3 } { 4 5 6 7 } |
| @@ -7703,7 +8007,7 @@ check_cpu_capacity(struct rq *rq, struct sched_domain *sd) | |||
| 7703 | * | 8007 | * |
| 7704 | * If we were to balance group-wise we'd place two tasks in the first group and | 8008 | * If we were to balance group-wise we'd place two tasks in the first group and |
| 7705 | * two tasks in the second group. Clearly this is undesired as it will overload | 8009 | * two tasks in the second group. Clearly this is undesired as it will overload |
| 7706 | * cpu 3 and leave one of the cpus in the second group unused. | 8010 | * cpu 3 and leave one of the CPUs in the second group unused. |
| 7707 | * | 8011 | * |
| 7708 | * The current solution to this issue is detecting the skew in the first group | 8012 | * The current solution to this issue is detecting the skew in the first group |
| 7709 | * by noticing the lower domain failed to reach balance and had difficulty | 8013 | * by noticing the lower domain failed to reach balance and had difficulty |
| @@ -7794,6 +8098,28 @@ group_type group_classify(struct sched_group *group, | |||
| 7794 | return group_other; | 8098 | return group_other; |
| 7795 | } | 8099 | } |
| 7796 | 8100 | ||
| 8101 | static bool update_nohz_stats(struct rq *rq, bool force) | ||
| 8102 | { | ||
| 8103 | #ifdef CONFIG_NO_HZ_COMMON | ||
| 8104 | unsigned int cpu = rq->cpu; | ||
| 8105 | |||
| 8106 | if (!rq->has_blocked_load) | ||
| 8107 | return false; | ||
| 8108 | |||
| 8109 | if (!cpumask_test_cpu(cpu, nohz.idle_cpus_mask)) | ||
| 8110 | return false; | ||
| 8111 | |||
| 8112 | if (!force && !time_after(jiffies, rq->last_blocked_load_update_tick)) | ||
| 8113 | return true; | ||
| 8114 | |||
| 8115 | update_blocked_averages(cpu); | ||
| 8116 | |||
| 8117 | return rq->has_blocked_load; | ||
| 8118 | #else | ||
| 8119 | return false; | ||
| 8120 | #endif | ||
| 8121 | } | ||
| 8122 | |||
| 7797 | /** | 8123 | /** |
| 7798 | * update_sg_lb_stats - Update sched_group's statistics for load balancing. | 8124 | * update_sg_lb_stats - Update sched_group's statistics for load balancing. |
| 7799 | * @env: The load balancing environment. | 8125 | * @env: The load balancing environment. |
| @@ -7816,7 +8142,10 @@ static inline void update_sg_lb_stats(struct lb_env *env, | |||
| 7816 | for_each_cpu_and(i, sched_group_span(group), env->cpus) { | 8142 | for_each_cpu_and(i, sched_group_span(group), env->cpus) { |
| 7817 | struct rq *rq = cpu_rq(i); | 8143 | struct rq *rq = cpu_rq(i); |
| 7818 | 8144 | ||
| 7819 | /* Bias balancing toward cpus of our domain */ | 8145 | if ((env->flags & LBF_NOHZ_STATS) && update_nohz_stats(rq, false)) |
| 8146 | env->flags |= LBF_NOHZ_AGAIN; | ||
| 8147 | |||
| 8148 | /* Bias balancing toward CPUs of our domain: */ | ||
| 7820 | if (local_group) | 8149 | if (local_group) |
| 7821 | load = target_load(i, load_idx); | 8150 | load = target_load(i, load_idx); |
| 7822 | else | 8151 | else |
| @@ -7902,7 +8231,7 @@ asym_packing: | |||
| 7902 | if (!(env->sd->flags & SD_ASYM_PACKING)) | 8231 | if (!(env->sd->flags & SD_ASYM_PACKING)) |
| 7903 | return true; | 8232 | return true; |
| 7904 | 8233 | ||
| 7905 | /* No ASYM_PACKING if target cpu is already busy */ | 8234 | /* No ASYM_PACKING if target CPU is already busy */ |
| 7906 | if (env->idle == CPU_NOT_IDLE) | 8235 | if (env->idle == CPU_NOT_IDLE) |
| 7907 | return true; | 8236 | return true; |
| 7908 | /* | 8237 | /* |
| @@ -7915,7 +8244,7 @@ asym_packing: | |||
| 7915 | if (!sds->busiest) | 8244 | if (!sds->busiest) |
| 7916 | return true; | 8245 | return true; |
| 7917 | 8246 | ||
| 7918 | /* Prefer to move from lowest priority cpu's work */ | 8247 | /* Prefer to move from lowest priority CPU's work */ |
| 7919 | if (sched_asym_prefer(sds->busiest->asym_prefer_cpu, | 8248 | if (sched_asym_prefer(sds->busiest->asym_prefer_cpu, |
| 7920 | sg->asym_prefer_cpu)) | 8249 | sg->asym_prefer_cpu)) |
| 7921 | return true; | 8250 | return true; |
| @@ -7971,6 +8300,11 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd | |||
| 7971 | if (child && child->flags & SD_PREFER_SIBLING) | 8300 | if (child && child->flags & SD_PREFER_SIBLING) |
| 7972 | prefer_sibling = 1; | 8301 | prefer_sibling = 1; |
| 7973 | 8302 | ||
| 8303 | #ifdef CONFIG_NO_HZ_COMMON | ||
| 8304 | if (env->idle == CPU_NEWLY_IDLE && READ_ONCE(nohz.has_blocked)) | ||
| 8305 | env->flags |= LBF_NOHZ_STATS; | ||
| 8306 | #endif | ||
| 8307 | |||
| 7974 | load_idx = get_sd_load_idx(env->sd, env->idle); | 8308 | load_idx = get_sd_load_idx(env->sd, env->idle); |
| 7975 | 8309 | ||
| 7976 | do { | 8310 | do { |
| @@ -8024,6 +8358,15 @@ next_group: | |||
| 8024 | sg = sg->next; | 8358 | sg = sg->next; |
| 8025 | } while (sg != env->sd->groups); | 8359 | } while (sg != env->sd->groups); |
| 8026 | 8360 | ||
| 8361 | #ifdef CONFIG_NO_HZ_COMMON | ||
| 8362 | if ((env->flags & LBF_NOHZ_AGAIN) && | ||
| 8363 | cpumask_subset(nohz.idle_cpus_mask, sched_domain_span(env->sd))) { | ||
| 8364 | |||
| 8365 | WRITE_ONCE(nohz.next_blocked, | ||
| 8366 | jiffies + msecs_to_jiffies(LOAD_AVG_PERIOD)); | ||
| 8367 | } | ||
| 8368 | #endif | ||
| 8369 | |||
| 8027 | if (env->sd->flags & SD_NUMA) | 8370 | if (env->sd->flags & SD_NUMA) |
| 8028 | env->fbq_type = fbq_classify_group(&sds->busiest_stat); | 8371 | env->fbq_type = fbq_classify_group(&sds->busiest_stat); |
| 8029 | 8372 | ||
| @@ -8168,7 +8511,7 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s | |||
| 8168 | if (busiest->group_type == group_imbalanced) { | 8511 | if (busiest->group_type == group_imbalanced) { |
| 8169 | /* | 8512 | /* |
| 8170 | * In the group_imb case we cannot rely on group-wide averages | 8513 | * In the group_imb case we cannot rely on group-wide averages |
| 8171 | * to ensure cpu-load equilibrium, look at wider averages. XXX | 8514 | * to ensure CPU-load equilibrium, look at wider averages. XXX |
| 8172 | */ | 8515 | */ |
| 8173 | busiest->load_per_task = | 8516 | busiest->load_per_task = |
| 8174 | min(busiest->load_per_task, sds->avg_load); | 8517 | min(busiest->load_per_task, sds->avg_load); |
| @@ -8187,7 +8530,7 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s | |||
| 8187 | } | 8530 | } |
| 8188 | 8531 | ||
| 8189 | /* | 8532 | /* |
| 8190 | * If there aren't any idle cpus, avoid creating some. | 8533 | * If there aren't any idle CPUs, avoid creating some. |
| 8191 | */ | 8534 | */ |
| 8192 | if (busiest->group_type == group_overloaded && | 8535 | if (busiest->group_type == group_overloaded && |
| 8193 | local->group_type == group_overloaded) { | 8536 | local->group_type == group_overloaded) { |
| @@ -8201,9 +8544,9 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s | |||
| 8201 | } | 8544 | } |
| 8202 | 8545 | ||
| 8203 | /* | 8546 | /* |
| 8204 | * We're trying to get all the cpus to the average_load, so we don't | 8547 | * We're trying to get all the CPUs to the average_load, so we don't |
| 8205 | * want to push ourselves above the average load, nor do we wish to | 8548 | * want to push ourselves above the average load, nor do we wish to |
| 8206 | * reduce the max loaded cpu below the average load. At the same time, | 8549 | * reduce the max loaded CPU below the average load. At the same time, |
| 8207 | * we also don't want to reduce the group load below the group | 8550 | * we also don't want to reduce the group load below the group |
| 8208 | * capacity. Thus we look for the minimum possible imbalance. | 8551 | * capacity. Thus we look for the minimum possible imbalance. |
| 8209 | */ | 8552 | */ |
| @@ -8297,9 +8640,9 @@ static struct sched_group *find_busiest_group(struct lb_env *env) | |||
| 8297 | 8640 | ||
| 8298 | if (env->idle == CPU_IDLE) { | 8641 | if (env->idle == CPU_IDLE) { |
| 8299 | /* | 8642 | /* |
| 8300 | * This cpu is idle. If the busiest group is not overloaded | 8643 | * This CPU is idle. If the busiest group is not overloaded |
| 8301 | * and there is no imbalance between this and busiest group | 8644 | * and there is no imbalance between this and busiest group |
| 8302 | * wrt idle cpus, it is balanced. The imbalance becomes | 8645 | * wrt idle CPUs, it is balanced. The imbalance becomes |
| 8303 | * significant if the diff is greater than 1 otherwise we | 8646 | * significant if the diff is greater than 1 otherwise we |
| 8304 | * might end up to just move the imbalance on another group | 8647 | * might end up to just move the imbalance on another group |
| 8305 | */ | 8648 | */ |
| @@ -8327,7 +8670,7 @@ out_balanced: | |||
| 8327 | } | 8670 | } |
| 8328 | 8671 | ||
| 8329 | /* | 8672 | /* |
| 8330 | * find_busiest_queue - find the busiest runqueue among the cpus in group. | 8673 | * find_busiest_queue - find the busiest runqueue among the CPUs in the group. |
| 8331 | */ | 8674 | */ |
| 8332 | static struct rq *find_busiest_queue(struct lb_env *env, | 8675 | static struct rq *find_busiest_queue(struct lb_env *env, |
| 8333 | struct sched_group *group) | 8676 | struct sched_group *group) |
| @@ -8371,7 +8714,7 @@ static struct rq *find_busiest_queue(struct lb_env *env, | |||
| 8371 | 8714 | ||
| 8372 | /* | 8715 | /* |
| 8373 | * When comparing with imbalance, use weighted_cpuload() | 8716 | * When comparing with imbalance, use weighted_cpuload() |
| 8374 | * which is not scaled with the cpu capacity. | 8717 | * which is not scaled with the CPU capacity. |
| 8375 | */ | 8718 | */ |
| 8376 | 8719 | ||
| 8377 | if (rq->nr_running == 1 && wl > env->imbalance && | 8720 | if (rq->nr_running == 1 && wl > env->imbalance && |
| @@ -8379,9 +8722,9 @@ static struct rq *find_busiest_queue(struct lb_env *env, | |||
| 8379 | continue; | 8722 | continue; |
| 8380 | 8723 | ||
| 8381 | /* | 8724 | /* |
| 8382 | * For the load comparisons with the other cpu's, consider | 8725 | * For the load comparisons with the other CPU's, consider |
| 8383 | * the weighted_cpuload() scaled with the cpu capacity, so | 8726 | * the weighted_cpuload() scaled with the CPU capacity, so |
| 8384 | * that the load can be moved away from the cpu that is | 8727 | * that the load can be moved away from the CPU that is |
| 8385 | * potentially running at a lower capacity. | 8728 | * potentially running at a lower capacity. |
| 8386 | * | 8729 | * |
| 8387 | * Thus we're looking for max(wl_i / capacity_i), crosswise | 8730 | * Thus we're looking for max(wl_i / capacity_i), crosswise |
| @@ -8452,13 +8795,13 @@ static int should_we_balance(struct lb_env *env) | |||
| 8452 | return 0; | 8795 | return 0; |
| 8453 | 8796 | ||
| 8454 | /* | 8797 | /* |
| 8455 | * In the newly idle case, we will allow all the cpu's | 8798 | * In the newly idle case, we will allow all the CPUs |
| 8456 | * to do the newly idle load balance. | 8799 | * to do the newly idle load balance. |
| 8457 | */ | 8800 | */ |
| 8458 | if (env->idle == CPU_NEWLY_IDLE) | 8801 | if (env->idle == CPU_NEWLY_IDLE) |
| 8459 | return 1; | 8802 | return 1; |
| 8460 | 8803 | ||
| 8461 | /* Try to find first idle cpu */ | 8804 | /* Try to find first idle CPU */ |
| 8462 | for_each_cpu_and(cpu, group_balance_mask(sg), env->cpus) { | 8805 | for_each_cpu_and(cpu, group_balance_mask(sg), env->cpus) { |
| 8463 | if (!idle_cpu(cpu)) | 8806 | if (!idle_cpu(cpu)) |
| 8464 | continue; | 8807 | continue; |
| @@ -8471,7 +8814,7 @@ static int should_we_balance(struct lb_env *env) | |||
| 8471 | balance_cpu = group_balance_cpu(sg); | 8814 | balance_cpu = group_balance_cpu(sg); |
| 8472 | 8815 | ||
| 8473 | /* | 8816 | /* |
| 8474 | * First idle cpu or the first cpu(busiest) in this sched group | 8817 | * First idle CPU or the first CPU(busiest) in this sched group |
| 8475 | * is eligible for doing load balancing at this and above domains. | 8818 | * is eligible for doing load balancing at this and above domains. |
| 8476 | */ | 8819 | */ |
| 8477 | return balance_cpu == env->dst_cpu; | 8820 | return balance_cpu == env->dst_cpu; |
| @@ -8580,7 +8923,7 @@ more_balance: | |||
| 8580 | * Revisit (affine) tasks on src_cpu that couldn't be moved to | 8923 | * Revisit (affine) tasks on src_cpu that couldn't be moved to |
| 8581 | * us and move them to an alternate dst_cpu in our sched_group | 8924 | * us and move them to an alternate dst_cpu in our sched_group |
| 8582 | * where they can run. The upper limit on how many times we | 8925 | * where they can run. The upper limit on how many times we |
| 8583 | * iterate on same src_cpu is dependent on number of cpus in our | 8926 | * iterate on same src_cpu is dependent on number of CPUs in our |
| 8584 | * sched_group. | 8927 | * sched_group. |
| 8585 | * | 8928 | * |
| 8586 | * This changes load balance semantics a bit on who can move | 8929 | * This changes load balance semantics a bit on who can move |
| @@ -8597,7 +8940,7 @@ more_balance: | |||
| 8597 | */ | 8940 | */ |
| 8598 | if ((env.flags & LBF_DST_PINNED) && env.imbalance > 0) { | 8941 | if ((env.flags & LBF_DST_PINNED) && env.imbalance > 0) { |
| 8599 | 8942 | ||
| 8600 | /* Prevent to re-select dst_cpu via env's cpus */ | 8943 | /* Prevent to re-select dst_cpu via env's CPUs */ |
| 8601 | cpumask_clear_cpu(env.dst_cpu, env.cpus); | 8944 | cpumask_clear_cpu(env.dst_cpu, env.cpus); |
| 8602 | 8945 | ||
| 8603 | env.dst_rq = cpu_rq(env.new_dst_cpu); | 8946 | env.dst_rq = cpu_rq(env.new_dst_cpu); |
| @@ -8659,9 +9002,10 @@ more_balance: | |||
| 8659 | 9002 | ||
| 8660 | raw_spin_lock_irqsave(&busiest->lock, flags); | 9003 | raw_spin_lock_irqsave(&busiest->lock, flags); |
| 8661 | 9004 | ||
| 8662 | /* don't kick the active_load_balance_cpu_stop, | 9005 | /* |
| 8663 | * if the curr task on busiest cpu can't be | 9006 | * Don't kick the active_load_balance_cpu_stop, |
| 8664 | * moved to this_cpu | 9007 | * if the curr task on busiest CPU can't be |
| 9008 | * moved to this_cpu: | ||
| 8665 | */ | 9009 | */ |
| 8666 | if (!cpumask_test_cpu(this_cpu, &busiest->curr->cpus_allowed)) { | 9010 | if (!cpumask_test_cpu(this_cpu, &busiest->curr->cpus_allowed)) { |
| 8667 | raw_spin_unlock_irqrestore(&busiest->lock, | 9011 | raw_spin_unlock_irqrestore(&busiest->lock, |
| @@ -8773,121 +9117,7 @@ update_next_balance(struct sched_domain *sd, unsigned long *next_balance) | |||
| 8773 | } | 9117 | } |
| 8774 | 9118 | ||
| 8775 | /* | 9119 | /* |
| 8776 | * idle_balance is called by schedule() if this_cpu is about to become | 9120 | * active_load_balance_cpu_stop is run by the CPU stopper. It pushes |
| 8777 | * idle. Attempts to pull tasks from other CPUs. | ||
| 8778 | */ | ||
| 8779 | static int idle_balance(struct rq *this_rq, struct rq_flags *rf) | ||
| 8780 | { | ||
| 8781 | unsigned long next_balance = jiffies + HZ; | ||
| 8782 | int this_cpu = this_rq->cpu; | ||
| 8783 | struct sched_domain *sd; | ||
| 8784 | int pulled_task = 0; | ||
| 8785 | u64 curr_cost = 0; | ||
| 8786 | |||
| 8787 | /* | ||
| 8788 | * We must set idle_stamp _before_ calling idle_balance(), such that we | ||
| 8789 | * measure the duration of idle_balance() as idle time. | ||
| 8790 | */ | ||
| 8791 | this_rq->idle_stamp = rq_clock(this_rq); | ||
| 8792 | |||
| 8793 | /* | ||
| 8794 | * Do not pull tasks towards !active CPUs... | ||
| 8795 | */ | ||
| 8796 | if (!cpu_active(this_cpu)) | ||
| 8797 | return 0; | ||
| 8798 | |||
| 8799 | /* | ||
| 8800 | * This is OK, because current is on_cpu, which avoids it being picked | ||
| 8801 | * for load-balance and preemption/IRQs are still disabled avoiding | ||
| 8802 | * further scheduler activity on it and we're being very careful to | ||
| 8803 | * re-start the picking loop. | ||
| 8804 | */ | ||
| 8805 | rq_unpin_lock(this_rq, rf); | ||
| 8806 | |||
| 8807 | if (this_rq->avg_idle < sysctl_sched_migration_cost || | ||
| 8808 | !this_rq->rd->overload) { | ||
| 8809 | rcu_read_lock(); | ||
| 8810 | sd = rcu_dereference_check_sched_domain(this_rq->sd); | ||
| 8811 | if (sd) | ||
| 8812 | update_next_balance(sd, &next_balance); | ||
| 8813 | rcu_read_unlock(); | ||
| 8814 | |||
| 8815 | goto out; | ||
| 8816 | } | ||
| 8817 | |||
| 8818 | raw_spin_unlock(&this_rq->lock); | ||
| 8819 | |||
| 8820 | update_blocked_averages(this_cpu); | ||
| 8821 | rcu_read_lock(); | ||
| 8822 | for_each_domain(this_cpu, sd) { | ||
| 8823 | int continue_balancing = 1; | ||
| 8824 | u64 t0, domain_cost; | ||
| 8825 | |||
| 8826 | if (!(sd->flags & SD_LOAD_BALANCE)) | ||
| 8827 | continue; | ||
| 8828 | |||
| 8829 | if (this_rq->avg_idle < curr_cost + sd->max_newidle_lb_cost) { | ||
| 8830 | update_next_balance(sd, &next_balance); | ||
| 8831 | break; | ||
| 8832 | } | ||
| 8833 | |||
| 8834 | if (sd->flags & SD_BALANCE_NEWIDLE) { | ||
| 8835 | t0 = sched_clock_cpu(this_cpu); | ||
| 8836 | |||
| 8837 | pulled_task = load_balance(this_cpu, this_rq, | ||
| 8838 | sd, CPU_NEWLY_IDLE, | ||
| 8839 | &continue_balancing); | ||
| 8840 | |||
| 8841 | domain_cost = sched_clock_cpu(this_cpu) - t0; | ||
| 8842 | if (domain_cost > sd->max_newidle_lb_cost) | ||
| 8843 | sd->max_newidle_lb_cost = domain_cost; | ||
| 8844 | |||
| 8845 | curr_cost += domain_cost; | ||
| 8846 | } | ||
| 8847 | |||
| 8848 | update_next_balance(sd, &next_balance); | ||
| 8849 | |||
| 8850 | /* | ||
| 8851 | * Stop searching for tasks to pull if there are | ||
| 8852 | * now runnable tasks on this rq. | ||
| 8853 | */ | ||
| 8854 | if (pulled_task || this_rq->nr_running > 0) | ||
| 8855 | break; | ||
| 8856 | } | ||
| 8857 | rcu_read_unlock(); | ||
| 8858 | |||
| 8859 | raw_spin_lock(&this_rq->lock); | ||
| 8860 | |||
| 8861 | if (curr_cost > this_rq->max_idle_balance_cost) | ||
| 8862 | this_rq->max_idle_balance_cost = curr_cost; | ||
| 8863 | |||
| 8864 | /* | ||
| 8865 | * While browsing the domains, we released the rq lock, a task could | ||
| 8866 | * have been enqueued in the meantime. Since we're not going idle, | ||
| 8867 | * pretend we pulled a task. | ||
| 8868 | */ | ||
| 8869 | if (this_rq->cfs.h_nr_running && !pulled_task) | ||
| 8870 | pulled_task = 1; | ||
| 8871 | |||
| 8872 | out: | ||
| 8873 | /* Move the next balance forward */ | ||
| 8874 | if (time_after(this_rq->next_balance, next_balance)) | ||
| 8875 | this_rq->next_balance = next_balance; | ||
| 8876 | |||
| 8877 | /* Is there a task of a high priority class? */ | ||
| 8878 | if (this_rq->nr_running != this_rq->cfs.h_nr_running) | ||
| 8879 | pulled_task = -1; | ||
| 8880 | |||
| 8881 | if (pulled_task) | ||
| 8882 | this_rq->idle_stamp = 0; | ||
| 8883 | |||
| 8884 | rq_repin_lock(this_rq, rf); | ||
| 8885 | |||
| 8886 | return pulled_task; | ||
| 8887 | } | ||
| 8888 | |||
| 8889 | /* | ||
| 8890 | * active_load_balance_cpu_stop is run by cpu stopper. It pushes | ||
| 8891 | * running tasks off the busiest CPU onto idle CPUs. It requires at | 9121 | * running tasks off the busiest CPU onto idle CPUs. It requires at |
| 8892 | * least 1 task to be running on each physical CPU where possible, and | 9122 | * least 1 task to be running on each physical CPU where possible, and |
| 8893 | * avoids physical / logical imbalances. | 9123 | * avoids physical / logical imbalances. |
| @@ -8911,7 +9141,7 @@ static int active_load_balance_cpu_stop(void *data) | |||
| 8911 | if (!cpu_active(busiest_cpu) || !cpu_active(target_cpu)) | 9141 | if (!cpu_active(busiest_cpu) || !cpu_active(target_cpu)) |
| 8912 | goto out_unlock; | 9142 | goto out_unlock; |
| 8913 | 9143 | ||
| 8914 | /* make sure the requested cpu hasn't gone down in the meantime */ | 9144 | /* Make sure the requested CPU hasn't gone down in the meantime: */ |
| 8915 | if (unlikely(busiest_cpu != smp_processor_id() || | 9145 | if (unlikely(busiest_cpu != smp_processor_id() || |
| 8916 | !busiest_rq->active_balance)) | 9146 | !busiest_rq->active_balance)) |
| 8917 | goto out_unlock; | 9147 | goto out_unlock; |
| @@ -8923,7 +9153,7 @@ static int active_load_balance_cpu_stop(void *data) | |||
| 8923 | /* | 9153 | /* |
| 8924 | * This condition is "impossible", if it occurs | 9154 | * This condition is "impossible", if it occurs |
| 8925 | * we need to fix it. Originally reported by | 9155 | * we need to fix it. Originally reported by |
| 8926 | * Bjorn Helgaas on a 128-cpu setup. | 9156 | * Bjorn Helgaas on a 128-CPU setup. |
| 8927 | */ | 9157 | */ |
| 8928 | BUG_ON(busiest_rq == target_rq); | 9158 | BUG_ON(busiest_rq == target_rq); |
| 8929 | 9159 | ||
| @@ -8977,141 +9207,6 @@ out_unlock: | |||
| 8977 | return 0; | 9207 | return 0; |
| 8978 | } | 9208 | } |
| 8979 | 9209 | ||
| 8980 | static inline int on_null_domain(struct rq *rq) | ||
| 8981 | { | ||
| 8982 | return unlikely(!rcu_dereference_sched(rq->sd)); | ||
| 8983 | } | ||
| 8984 | |||
| 8985 | #ifdef CONFIG_NO_HZ_COMMON | ||
| 8986 | /* | ||
| 8987 | * idle load balancing details | ||
| 8988 | * - When one of the busy CPUs notice that there may be an idle rebalancing | ||
| 8989 | * needed, they will kick the idle load balancer, which then does idle | ||
| 8990 | * load balancing for all the idle CPUs. | ||
| 8991 | */ | ||
| 8992 | static struct { | ||
| 8993 | cpumask_var_t idle_cpus_mask; | ||
| 8994 | atomic_t nr_cpus; | ||
| 8995 | unsigned long next_balance; /* in jiffy units */ | ||
| 8996 | } nohz ____cacheline_aligned; | ||
| 8997 | |||
| 8998 | static inline int find_new_ilb(void) | ||
| 8999 | { | ||
| 9000 | int ilb = cpumask_first(nohz.idle_cpus_mask); | ||
| 9001 | |||
| 9002 | if (ilb < nr_cpu_ids && idle_cpu(ilb)) | ||
| 9003 | return ilb; | ||
| 9004 | |||
| 9005 | return nr_cpu_ids; | ||
| 9006 | } | ||
| 9007 | |||
| 9008 | /* | ||
| 9009 | * Kick a CPU to do the nohz balancing, if it is time for it. We pick the | ||
| 9010 | * nohz_load_balancer CPU (if there is one) otherwise fallback to any idle | ||
| 9011 | * CPU (if there is one). | ||
| 9012 | */ | ||
| 9013 | static void nohz_balancer_kick(void) | ||
| 9014 | { | ||
| 9015 | int ilb_cpu; | ||
| 9016 | |||
| 9017 | nohz.next_balance++; | ||
| 9018 | |||
| 9019 | ilb_cpu = find_new_ilb(); | ||
| 9020 | |||
| 9021 | if (ilb_cpu >= nr_cpu_ids) | ||
| 9022 | return; | ||
| 9023 | |||
| 9024 | if (test_and_set_bit(NOHZ_BALANCE_KICK, nohz_flags(ilb_cpu))) | ||
| 9025 | return; | ||
| 9026 | /* | ||
| 9027 | * Use smp_send_reschedule() instead of resched_cpu(). | ||
| 9028 | * This way we generate a sched IPI on the target cpu which | ||
| 9029 | * is idle. And the softirq performing nohz idle load balance | ||
| 9030 | * will be run before returning from the IPI. | ||
| 9031 | */ | ||
| 9032 | smp_send_reschedule(ilb_cpu); | ||
| 9033 | return; | ||
| 9034 | } | ||
| 9035 | |||
| 9036 | void nohz_balance_exit_idle(unsigned int cpu) | ||
| 9037 | { | ||
| 9038 | if (unlikely(test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))) { | ||
| 9039 | /* | ||
| 9040 | * Completely isolated CPUs don't ever set, so we must test. | ||
| 9041 | */ | ||
| 9042 | if (likely(cpumask_test_cpu(cpu, nohz.idle_cpus_mask))) { | ||
| 9043 | cpumask_clear_cpu(cpu, nohz.idle_cpus_mask); | ||
| 9044 | atomic_dec(&nohz.nr_cpus); | ||
| 9045 | } | ||
| 9046 | clear_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)); | ||
| 9047 | } | ||
| 9048 | } | ||
| 9049 | |||
| 9050 | static inline void set_cpu_sd_state_busy(void) | ||
| 9051 | { | ||
| 9052 | struct sched_domain *sd; | ||
| 9053 | int cpu = smp_processor_id(); | ||
| 9054 | |||
| 9055 | rcu_read_lock(); | ||
| 9056 | sd = rcu_dereference(per_cpu(sd_llc, cpu)); | ||
| 9057 | |||
| 9058 | if (!sd || !sd->nohz_idle) | ||
| 9059 | goto unlock; | ||
| 9060 | sd->nohz_idle = 0; | ||
| 9061 | |||
| 9062 | atomic_inc(&sd->shared->nr_busy_cpus); | ||
| 9063 | unlock: | ||
| 9064 | rcu_read_unlock(); | ||
| 9065 | } | ||
| 9066 | |||
| 9067 | void set_cpu_sd_state_idle(void) | ||
| 9068 | { | ||
| 9069 | struct sched_domain *sd; | ||
| 9070 | int cpu = smp_processor_id(); | ||
| 9071 | |||
| 9072 | rcu_read_lock(); | ||
| 9073 | sd = rcu_dereference(per_cpu(sd_llc, cpu)); | ||
| 9074 | |||
| 9075 | if (!sd || sd->nohz_idle) | ||
| 9076 | goto unlock; | ||
| 9077 | sd->nohz_idle = 1; | ||
| 9078 | |||
| 9079 | atomic_dec(&sd->shared->nr_busy_cpus); | ||
| 9080 | unlock: | ||
| 9081 | rcu_read_unlock(); | ||
| 9082 | } | ||
| 9083 | |||
| 9084 | /* | ||
| 9085 | * This routine will record that the cpu is going idle with tick stopped. | ||
| 9086 | * This info will be used in performing idle load balancing in the future. | ||
| 9087 | */ | ||
| 9088 | void nohz_balance_enter_idle(int cpu) | ||
| 9089 | { | ||
| 9090 | /* | ||
| 9091 | * If this cpu is going down, then nothing needs to be done. | ||
| 9092 | */ | ||
| 9093 | if (!cpu_active(cpu)) | ||
| 9094 | return; | ||
| 9095 | |||
| 9096 | /* Spare idle load balancing on CPUs that don't want to be disturbed: */ | ||
| 9097 | if (!housekeeping_cpu(cpu, HK_FLAG_SCHED)) | ||
| 9098 | return; | ||
| 9099 | |||
| 9100 | if (test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu))) | ||
| 9101 | return; | ||
| 9102 | |||
| 9103 | /* | ||
| 9104 | * If we're a completely isolated CPU, we don't play. | ||
| 9105 | */ | ||
| 9106 | if (on_null_domain(cpu_rq(cpu))) | ||
| 9107 | return; | ||
| 9108 | |||
| 9109 | cpumask_set_cpu(cpu, nohz.idle_cpus_mask); | ||
| 9110 | atomic_inc(&nohz.nr_cpus); | ||
| 9111 | set_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)); | ||
| 9112 | } | ||
| 9113 | #endif | ||
| 9114 | |||
| 9115 | static DEFINE_SPINLOCK(balancing); | 9210 | static DEFINE_SPINLOCK(balancing); |
| 9116 | 9211 | ||
| 9117 | /* | 9212 | /* |
| @@ -9141,8 +9236,6 @@ static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle) | |||
| 9141 | int need_serialize, need_decay = 0; | 9236 | int need_serialize, need_decay = 0; |
| 9142 | u64 max_cost = 0; | 9237 | u64 max_cost = 0; |
| 9143 | 9238 | ||
| 9144 | update_blocked_averages(cpu); | ||
| 9145 | |||
| 9146 | rcu_read_lock(); | 9239 | rcu_read_lock(); |
| 9147 | for_each_domain(cpu, sd) { | 9240 | for_each_domain(cpu, sd) { |
| 9148 | /* | 9241 | /* |
| @@ -9232,68 +9325,56 @@ out: | |||
| 9232 | } | 9325 | } |
| 9233 | } | 9326 | } |
| 9234 | 9327 | ||
| 9328 | static inline int on_null_domain(struct rq *rq) | ||
| 9329 | { | ||
| 9330 | return unlikely(!rcu_dereference_sched(rq->sd)); | ||
| 9331 | } | ||
| 9332 | |||
| 9235 | #ifdef CONFIG_NO_HZ_COMMON | 9333 | #ifdef CONFIG_NO_HZ_COMMON |
| 9236 | /* | 9334 | /* |
| 9237 | * In CONFIG_NO_HZ_COMMON case, the idle balance kickee will do the | 9335 | * idle load balancing details |
| 9238 | * rebalancing for all the cpus for whom scheduler ticks are stopped. | 9336 | * - When one of the busy CPUs notice that there may be an idle rebalancing |
| 9337 | * needed, they will kick the idle load balancer, which then does idle | ||
| 9338 | * load balancing for all the idle CPUs. | ||
| 9239 | */ | 9339 | */ |
| 9240 | static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) | ||
| 9241 | { | ||
| 9242 | int this_cpu = this_rq->cpu; | ||
| 9243 | struct rq *rq; | ||
| 9244 | int balance_cpu; | ||
| 9245 | /* Earliest time when we have to do rebalance again */ | ||
| 9246 | unsigned long next_balance = jiffies + 60*HZ; | ||
| 9247 | int update_next_balance = 0; | ||
| 9248 | 9340 | ||
| 9249 | if (idle != CPU_IDLE || | 9341 | static inline int find_new_ilb(void) |
| 9250 | !test_bit(NOHZ_BALANCE_KICK, nohz_flags(this_cpu))) | 9342 | { |
| 9251 | goto end; | 9343 | int ilb = cpumask_first(nohz.idle_cpus_mask); |
| 9252 | 9344 | ||
| 9253 | for_each_cpu(balance_cpu, nohz.idle_cpus_mask) { | 9345 | if (ilb < nr_cpu_ids && idle_cpu(ilb)) |
| 9254 | if (balance_cpu == this_cpu || !idle_cpu(balance_cpu)) | 9346 | return ilb; |
| 9255 | continue; | ||
| 9256 | 9347 | ||
| 9257 | /* | 9348 | return nr_cpu_ids; |
| 9258 | * If this cpu gets work to do, stop the load balancing | 9349 | } |
| 9259 | * work being done for other cpus. Next load | ||
| 9260 | * balancing owner will pick it up. | ||
| 9261 | */ | ||
| 9262 | if (need_resched()) | ||
| 9263 | break; | ||
| 9264 | 9350 | ||
| 9265 | rq = cpu_rq(balance_cpu); | 9351 | /* |
| 9352 | * Kick a CPU to do the nohz balancing, if it is time for it. We pick the | ||
| 9353 | * nohz_load_balancer CPU (if there is one) otherwise fallback to any idle | ||
| 9354 | * CPU (if there is one). | ||
| 9355 | */ | ||
| 9356 | static void kick_ilb(unsigned int flags) | ||
| 9357 | { | ||
| 9358 | int ilb_cpu; | ||
| 9266 | 9359 | ||
| 9267 | /* | 9360 | nohz.next_balance++; |
| 9268 | * If time for next balance is due, | ||
| 9269 | * do the balance. | ||
| 9270 | */ | ||
| 9271 | if (time_after_eq(jiffies, rq->next_balance)) { | ||
| 9272 | struct rq_flags rf; | ||
| 9273 | 9361 | ||
| 9274 | rq_lock_irq(rq, &rf); | 9362 | ilb_cpu = find_new_ilb(); |
| 9275 | update_rq_clock(rq); | ||
| 9276 | cpu_load_update_idle(rq); | ||
| 9277 | rq_unlock_irq(rq, &rf); | ||
| 9278 | 9363 | ||
| 9279 | rebalance_domains(rq, CPU_IDLE); | 9364 | if (ilb_cpu >= nr_cpu_ids) |
| 9280 | } | 9365 | return; |
| 9281 | 9366 | ||
| 9282 | if (time_after(next_balance, rq->next_balance)) { | 9367 | flags = atomic_fetch_or(flags, nohz_flags(ilb_cpu)); |
| 9283 | next_balance = rq->next_balance; | 9368 | if (flags & NOHZ_KICK_MASK) |
| 9284 | update_next_balance = 1; | 9369 | return; |
| 9285 | } | ||
| 9286 | } | ||
| 9287 | 9370 | ||
| 9288 | /* | 9371 | /* |
| 9289 | * next_balance will be updated only when there is a need. | 9372 | * Use smp_send_reschedule() instead of resched_cpu(). |
| 9290 | * When the CPU is attached to null domain for ex, it will not be | 9373 | * This way we generate a sched IPI on the target CPU which |
| 9291 | * updated. | 9374 | * is idle. And the softirq performing nohz idle load balance |
| 9375 | * will be run before returning from the IPI. | ||
| 9292 | */ | 9376 | */ |
| 9293 | if (likely(update_next_balance)) | 9377 | smp_send_reschedule(ilb_cpu); |
| 9294 | nohz.next_balance = next_balance; | ||
| 9295 | end: | ||
| 9296 | clear_bit(NOHZ_BALANCE_KICK, nohz_flags(this_cpu)); | ||
| 9297 | } | 9378 | } |
| 9298 | 9379 | ||
| 9299 | /* | 9380 | /* |
| @@ -9307,36 +9388,41 @@ end: | |||
| 9307 | * - For SD_ASYM_PACKING, if the lower numbered cpu's in the scheduler | 9388 | * - For SD_ASYM_PACKING, if the lower numbered cpu's in the scheduler |
| 9308 | * domain span are idle. | 9389 | * domain span are idle. |
| 9309 | */ | 9390 | */ |
| 9310 | static inline bool nohz_kick_needed(struct rq *rq) | 9391 | static void nohz_balancer_kick(struct rq *rq) |
| 9311 | { | 9392 | { |
| 9312 | unsigned long now = jiffies; | 9393 | unsigned long now = jiffies; |
| 9313 | struct sched_domain_shared *sds; | 9394 | struct sched_domain_shared *sds; |
| 9314 | struct sched_domain *sd; | 9395 | struct sched_domain *sd; |
| 9315 | int nr_busy, i, cpu = rq->cpu; | 9396 | int nr_busy, i, cpu = rq->cpu; |
| 9316 | bool kick = false; | 9397 | unsigned int flags = 0; |
| 9317 | 9398 | ||
| 9318 | if (unlikely(rq->idle_balance)) | 9399 | if (unlikely(rq->idle_balance)) |
| 9319 | return false; | 9400 | return; |
| 9320 | 9401 | ||
| 9321 | /* | 9402 | /* |
| 9322 | * We may be recently in ticked or tickless idle mode. At the first | 9403 | * We may be recently in ticked or tickless idle mode. At the first |
| 9323 | * busy tick after returning from idle, we will update the busy stats. | 9404 | * busy tick after returning from idle, we will update the busy stats. |
| 9324 | */ | 9405 | */ |
| 9325 | set_cpu_sd_state_busy(); | 9406 | nohz_balance_exit_idle(rq); |
| 9326 | nohz_balance_exit_idle(cpu); | ||
| 9327 | 9407 | ||
| 9328 | /* | 9408 | /* |
| 9329 | * None are in tickless mode and hence no need for NOHZ idle load | 9409 | * None are in tickless mode and hence no need for NOHZ idle load |
| 9330 | * balancing. | 9410 | * balancing. |
| 9331 | */ | 9411 | */ |
| 9332 | if (likely(!atomic_read(&nohz.nr_cpus))) | 9412 | if (likely(!atomic_read(&nohz.nr_cpus))) |
| 9333 | return false; | 9413 | return; |
| 9414 | |||
| 9415 | if (READ_ONCE(nohz.has_blocked) && | ||
| 9416 | time_after(now, READ_ONCE(nohz.next_blocked))) | ||
| 9417 | flags = NOHZ_STATS_KICK; | ||
| 9334 | 9418 | ||
| 9335 | if (time_before(now, nohz.next_balance)) | 9419 | if (time_before(now, nohz.next_balance)) |
| 9336 | return false; | 9420 | goto out; |
| 9337 | 9421 | ||
| 9338 | if (rq->nr_running >= 2) | 9422 | if (rq->nr_running >= 2) { |
| 9339 | return true; | 9423 | flags = NOHZ_KICK_MASK; |
| 9424 | goto out; | ||
| 9425 | } | ||
| 9340 | 9426 | ||
| 9341 | rcu_read_lock(); | 9427 | rcu_read_lock(); |
| 9342 | sds = rcu_dereference(per_cpu(sd_llc_shared, cpu)); | 9428 | sds = rcu_dereference(per_cpu(sd_llc_shared, cpu)); |
| @@ -9347,7 +9433,7 @@ static inline bool nohz_kick_needed(struct rq *rq) | |||
| 9347 | */ | 9433 | */ |
| 9348 | nr_busy = atomic_read(&sds->nr_busy_cpus); | 9434 | nr_busy = atomic_read(&sds->nr_busy_cpus); |
| 9349 | if (nr_busy > 1) { | 9435 | if (nr_busy > 1) { |
| 9350 | kick = true; | 9436 | flags = NOHZ_KICK_MASK; |
| 9351 | goto unlock; | 9437 | goto unlock; |
| 9352 | } | 9438 | } |
| 9353 | 9439 | ||
| @@ -9357,7 +9443,7 @@ static inline bool nohz_kick_needed(struct rq *rq) | |||
| 9357 | if (sd) { | 9443 | if (sd) { |
| 9358 | if ((rq->cfs.h_nr_running >= 1) && | 9444 | if ((rq->cfs.h_nr_running >= 1) && |
| 9359 | check_cpu_capacity(rq, sd)) { | 9445 | check_cpu_capacity(rq, sd)) { |
| 9360 | kick = true; | 9446 | flags = NOHZ_KICK_MASK; |
| 9361 | goto unlock; | 9447 | goto unlock; |
| 9362 | } | 9448 | } |
| 9363 | } | 9449 | } |
| @@ -9370,18 +9456,421 @@ static inline bool nohz_kick_needed(struct rq *rq) | |||
| 9370 | continue; | 9456 | continue; |
| 9371 | 9457 | ||
| 9372 | if (sched_asym_prefer(i, cpu)) { | 9458 | if (sched_asym_prefer(i, cpu)) { |
| 9373 | kick = true; | 9459 | flags = NOHZ_KICK_MASK; |
| 9374 | goto unlock; | 9460 | goto unlock; |
| 9375 | } | 9461 | } |
| 9376 | } | 9462 | } |
| 9377 | } | 9463 | } |
| 9378 | unlock: | 9464 | unlock: |
| 9379 | rcu_read_unlock(); | 9465 | rcu_read_unlock(); |
| 9380 | return kick; | 9466 | out: |
| 9467 | if (flags) | ||
| 9468 | kick_ilb(flags); | ||
| 9469 | } | ||
| 9470 | |||
| 9471 | static void set_cpu_sd_state_busy(int cpu) | ||
| 9472 | { | ||
| 9473 | struct sched_domain *sd; | ||
| 9474 | |||
| 9475 | rcu_read_lock(); | ||
| 9476 | sd = rcu_dereference(per_cpu(sd_llc, cpu)); | ||
| 9477 | |||
| 9478 | if (!sd || !sd->nohz_idle) | ||
| 9479 | goto unlock; | ||
| 9480 | sd->nohz_idle = 0; | ||
| 9481 | |||
| 9482 | atomic_inc(&sd->shared->nr_busy_cpus); | ||
| 9483 | unlock: | ||
| 9484 | rcu_read_unlock(); | ||
| 9485 | } | ||
| 9486 | |||
| 9487 | void nohz_balance_exit_idle(struct rq *rq) | ||
| 9488 | { | ||
| 9489 | SCHED_WARN_ON(rq != this_rq()); | ||
| 9490 | |||
| 9491 | if (likely(!rq->nohz_tick_stopped)) | ||
| 9492 | return; | ||
| 9493 | |||
| 9494 | rq->nohz_tick_stopped = 0; | ||
| 9495 | cpumask_clear_cpu(rq->cpu, nohz.idle_cpus_mask); | ||
| 9496 | atomic_dec(&nohz.nr_cpus); | ||
| 9497 | |||
| 9498 | set_cpu_sd_state_busy(rq->cpu); | ||
| 9499 | } | ||
| 9500 | |||
| 9501 | static void set_cpu_sd_state_idle(int cpu) | ||
| 9502 | { | ||
| 9503 | struct sched_domain *sd; | ||
| 9504 | |||
| 9505 | rcu_read_lock(); | ||
| 9506 | sd = rcu_dereference(per_cpu(sd_llc, cpu)); | ||
| 9507 | |||
| 9508 | if (!sd || sd->nohz_idle) | ||
| 9509 | goto unlock; | ||
| 9510 | sd->nohz_idle = 1; | ||
| 9511 | |||
| 9512 | atomic_dec(&sd->shared->nr_busy_cpus); | ||
| 9513 | unlock: | ||
| 9514 | rcu_read_unlock(); | ||
| 9515 | } | ||
| 9516 | |||
| 9517 | /* | ||
| 9518 | * This routine will record that the CPU is going idle with tick stopped. | ||
| 9519 | * This info will be used in performing idle load balancing in the future. | ||
| 9520 | */ | ||
| 9521 | void nohz_balance_enter_idle(int cpu) | ||
| 9522 | { | ||
| 9523 | struct rq *rq = cpu_rq(cpu); | ||
| 9524 | |||
| 9525 | SCHED_WARN_ON(cpu != smp_processor_id()); | ||
| 9526 | |||
| 9527 | /* If this CPU is going down, then nothing needs to be done: */ | ||
| 9528 | if (!cpu_active(cpu)) | ||
| 9529 | return; | ||
| 9530 | |||
| 9531 | /* Spare idle load balancing on CPUs that don't want to be disturbed: */ | ||
| 9532 | if (!housekeeping_cpu(cpu, HK_FLAG_SCHED)) | ||
| 9533 | return; | ||
| 9534 | |||
| 9535 | /* | ||
| 9536 | * Can be set safely without rq->lock held | ||
| 9537 | * If a clear happens, it will have evaluated last additions because | ||
| 9538 | * rq->lock is held during the check and the clear | ||
| 9539 | */ | ||
| 9540 | rq->has_blocked_load = 1; | ||
| 9541 | |||
| 9542 | /* | ||
| 9543 | * The tick is still stopped but load could have been added in the | ||
| 9544 | * meantime. We set the nohz.has_blocked flag to trig a check of the | ||
| 9545 | * *_avg. The CPU is already part of nohz.idle_cpus_mask so the clear | ||
| 9546 | * of nohz.has_blocked can only happen after checking the new load | ||
| 9547 | */ | ||
| 9548 | if (rq->nohz_tick_stopped) | ||
| 9549 | goto out; | ||
| 9550 | |||
| 9551 | /* If we're a completely isolated CPU, we don't play: */ | ||
| 9552 | if (on_null_domain(rq)) | ||
| 9553 | return; | ||
| 9554 | |||
| 9555 | rq->nohz_tick_stopped = 1; | ||
| 9556 | |||
| 9557 | cpumask_set_cpu(cpu, nohz.idle_cpus_mask); | ||
| 9558 | atomic_inc(&nohz.nr_cpus); | ||
| 9559 | |||
| 9560 | /* | ||
| 9561 | * Ensures that if nohz_idle_balance() fails to observe our | ||
| 9562 | * @idle_cpus_mask store, it must observe the @has_blocked | ||
| 9563 | * store. | ||
| 9564 | */ | ||
| 9565 | smp_mb__after_atomic(); | ||
| 9566 | |||
| 9567 | set_cpu_sd_state_idle(cpu); | ||
| 9568 | |||
| 9569 | out: | ||
| 9570 | /* | ||
| 9571 | * Each time a cpu enters idle, we assume that it has blocked load and | ||
| 9572 | * enable the periodic update of the load of idle cpus | ||
| 9573 | */ | ||
| 9574 | WRITE_ONCE(nohz.has_blocked, 1); | ||
| 9575 | } | ||
| 9576 | |||
| 9577 | /* | ||
| 9578 | * Internal function that runs load balance for all idle cpus. The load balance | ||
| 9579 | * can be a simple update of blocked load or a complete load balance with | ||
| 9580 | * task movement depending on flags. | ||
| 9581 | * The function returns false if the loop has stopped before running | ||
| 9582 | * through all idle CPUs. | ||
| 9583 | */ | ||
| 9584 | static bool _nohz_idle_balance(struct rq *this_rq, unsigned int flags, | ||
| 9585 | enum cpu_idle_type idle) | ||
| 9586 | { | ||
| 9587 | /* Earliest time when we have to do rebalance again */ | ||
| 9588 | unsigned long now = jiffies; | ||
| 9589 | unsigned long next_balance = now + 60*HZ; | ||
| 9590 | bool has_blocked_load = false; | ||
| 9591 | int update_next_balance = 0; | ||
| 9592 | int this_cpu = this_rq->cpu; | ||
| 9593 | int balance_cpu; | ||
| 9594 | int ret = false; | ||
| 9595 | struct rq *rq; | ||
| 9596 | |||
| 9597 | SCHED_WARN_ON((flags & NOHZ_KICK_MASK) == NOHZ_BALANCE_KICK); | ||
| 9598 | |||
| 9599 | /* | ||
| 9600 | * We assume there will be no idle load after this update and clear | ||
| 9601 | * the has_blocked flag. If a cpu enters idle in the mean time, it will | ||
| 9602 | * set the has_blocked flag and trigger another update of idle load. | ||
| 9603 | * Because a cpu that becomes idle, is added to idle_cpus_mask before | ||
| 9604 | * setting the flag, we are sure to not clear the state and not | ||
| 9605 | * check the load of an idle cpu. | ||
| 9606 | */ | ||
| 9607 | WRITE_ONCE(nohz.has_blocked, 0); | ||
| 9608 | |||
| 9609 | /* | ||
| 9610 | * Ensures that if we miss the CPU, we must see the has_blocked | ||
| 9611 | * store from nohz_balance_enter_idle(). | ||
| 9612 | */ | ||
| 9613 | smp_mb(); | ||
| 9614 | |||
| 9615 | for_each_cpu(balance_cpu, nohz.idle_cpus_mask) { | ||
| 9616 | if (balance_cpu == this_cpu || !idle_cpu(balance_cpu)) | ||
| 9617 | continue; | ||
| 9618 | |||
| 9619 | /* | ||
| 9620 | * If this CPU gets work to do, stop the load balancing | ||
| 9621 | * work being done for other CPUs. Next load | ||
| 9622 | * balancing owner will pick it up. | ||
| 9623 | */ | ||
| 9624 | if (need_resched()) { | ||
| 9625 | has_blocked_load = true; | ||
| 9626 | goto abort; | ||
| 9627 | } | ||
| 9628 | |||
| 9629 | rq = cpu_rq(balance_cpu); | ||
| 9630 | |||
| 9631 | has_blocked_load |= update_nohz_stats(rq, true); | ||
| 9632 | |||
| 9633 | /* | ||
| 9634 | * If time for next balance is due, | ||
| 9635 | * do the balance. | ||
| 9636 | */ | ||
| 9637 | if (time_after_eq(jiffies, rq->next_balance)) { | ||
| 9638 | struct rq_flags rf; | ||
| 9639 | |||
| 9640 | rq_lock_irqsave(rq, &rf); | ||
| 9641 | update_rq_clock(rq); | ||
| 9642 | cpu_load_update_idle(rq); | ||
| 9643 | rq_unlock_irqrestore(rq, &rf); | ||
| 9644 | |||
| 9645 | if (flags & NOHZ_BALANCE_KICK) | ||
| 9646 | rebalance_domains(rq, CPU_IDLE); | ||
| 9647 | } | ||
| 9648 | |||
| 9649 | if (time_after(next_balance, rq->next_balance)) { | ||
| 9650 | next_balance = rq->next_balance; | ||
| 9651 | update_next_balance = 1; | ||
| 9652 | } | ||
| 9653 | } | ||
| 9654 | |||
| 9655 | /* Newly idle CPU doesn't need an update */ | ||
| 9656 | if (idle != CPU_NEWLY_IDLE) { | ||
| 9657 | update_blocked_averages(this_cpu); | ||
| 9658 | has_blocked_load |= this_rq->has_blocked_load; | ||
| 9659 | } | ||
| 9660 | |||
| 9661 | if (flags & NOHZ_BALANCE_KICK) | ||
| 9662 | rebalance_domains(this_rq, CPU_IDLE); | ||
| 9663 | |||
| 9664 | WRITE_ONCE(nohz.next_blocked, | ||
| 9665 | now + msecs_to_jiffies(LOAD_AVG_PERIOD)); | ||
| 9666 | |||
| 9667 | /* The full idle balance loop has been done */ | ||
| 9668 | ret = true; | ||
| 9669 | |||
| 9670 | abort: | ||
| 9671 | /* There is still blocked load, enable periodic update */ | ||
| 9672 | if (has_blocked_load) | ||
| 9673 | WRITE_ONCE(nohz.has_blocked, 1); | ||
| 9674 | |||
| 9675 | /* | ||
| 9676 | * next_balance will be updated only when there is a need. | ||
| 9677 | * When the CPU is attached to null domain for ex, it will not be | ||
| 9678 | * updated. | ||
| 9679 | */ | ||
| 9680 | if (likely(update_next_balance)) | ||
| 9681 | nohz.next_balance = next_balance; | ||
| 9682 | |||
| 9683 | return ret; | ||
| 9684 | } | ||
| 9685 | |||
| 9686 | /* | ||
| 9687 | * In CONFIG_NO_HZ_COMMON case, the idle balance kickee will do the | ||
| 9688 | * rebalancing for all the cpus for whom scheduler ticks are stopped. | ||
| 9689 | */ | ||
| 9690 | static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) | ||
| 9691 | { | ||
| 9692 | int this_cpu = this_rq->cpu; | ||
| 9693 | unsigned int flags; | ||
| 9694 | |||
| 9695 | if (!(atomic_read(nohz_flags(this_cpu)) & NOHZ_KICK_MASK)) | ||
| 9696 | return false; | ||
| 9697 | |||
| 9698 | if (idle != CPU_IDLE) { | ||
| 9699 | atomic_andnot(NOHZ_KICK_MASK, nohz_flags(this_cpu)); | ||
| 9700 | return false; | ||
| 9701 | } | ||
| 9702 | |||
| 9703 | /* | ||
| 9704 | * barrier, pairs with nohz_balance_enter_idle(), ensures ... | ||
| 9705 | */ | ||
| 9706 | flags = atomic_fetch_andnot(NOHZ_KICK_MASK, nohz_flags(this_cpu)); | ||
| 9707 | if (!(flags & NOHZ_KICK_MASK)) | ||
| 9708 | return false; | ||
| 9709 | |||
| 9710 | _nohz_idle_balance(this_rq, flags, idle); | ||
| 9711 | |||
| 9712 | return true; | ||
| 9713 | } | ||
| 9714 | |||
| 9715 | static void nohz_newidle_balance(struct rq *this_rq) | ||
| 9716 | { | ||
| 9717 | int this_cpu = this_rq->cpu; | ||
| 9718 | |||
| 9719 | /* | ||
| 9720 | * This CPU doesn't want to be disturbed by scheduler | ||
| 9721 | * housekeeping | ||
| 9722 | */ | ||
| 9723 | if (!housekeeping_cpu(this_cpu, HK_FLAG_SCHED)) | ||
| 9724 | return; | ||
| 9725 | |||
| 9726 | /* Will wake up very soon. No time for doing anything else*/ | ||
| 9727 | if (this_rq->avg_idle < sysctl_sched_migration_cost) | ||
| 9728 | return; | ||
| 9729 | |||
| 9730 | /* Don't need to update blocked load of idle CPUs*/ | ||
| 9731 | if (!READ_ONCE(nohz.has_blocked) || | ||
| 9732 | time_before(jiffies, READ_ONCE(nohz.next_blocked))) | ||
| 9733 | return; | ||
| 9734 | |||
| 9735 | raw_spin_unlock(&this_rq->lock); | ||
| 9736 | /* | ||
| 9737 | * This CPU is going to be idle and blocked load of idle CPUs | ||
| 9738 | * need to be updated. Run the ilb locally as it is a good | ||
| 9739 | * candidate for ilb instead of waking up another idle CPU. | ||
| 9740 | * Kick a normal ilb if we failed to do the update. | ||
| 9741 | */ | ||
| 9742 | if (!_nohz_idle_balance(this_rq, NOHZ_STATS_KICK, CPU_NEWLY_IDLE)) | ||
| 9743 | kick_ilb(NOHZ_STATS_KICK); | ||
| 9744 | raw_spin_lock(&this_rq->lock); | ||
| 9745 | } | ||
| 9746 | |||
| 9747 | #else /* !CONFIG_NO_HZ_COMMON */ | ||
| 9748 | static inline void nohz_balancer_kick(struct rq *rq) { } | ||
| 9749 | |||
| 9750 | static inline bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) | ||
| 9751 | { | ||
| 9752 | return false; | ||
| 9753 | } | ||
| 9754 | |||
| 9755 | static inline void nohz_newidle_balance(struct rq *this_rq) { } | ||
| 9756 | #endif /* CONFIG_NO_HZ_COMMON */ | ||
| 9757 | |||
| 9758 | /* | ||
| 9759 | * idle_balance is called by schedule() if this_cpu is about to become | ||
| 9760 | * idle. Attempts to pull tasks from other CPUs. | ||
| 9761 | */ | ||
| 9762 | static int idle_balance(struct rq *this_rq, struct rq_flags *rf) | ||
| 9763 | { | ||
| 9764 | unsigned long next_balance = jiffies + HZ; | ||
| 9765 | int this_cpu = this_rq->cpu; | ||
| 9766 | struct sched_domain *sd; | ||
| 9767 | int pulled_task = 0; | ||
| 9768 | u64 curr_cost = 0; | ||
| 9769 | |||
| 9770 | /* | ||
| 9771 | * We must set idle_stamp _before_ calling idle_balance(), such that we | ||
| 9772 | * measure the duration of idle_balance() as idle time. | ||
| 9773 | */ | ||
| 9774 | this_rq->idle_stamp = rq_clock(this_rq); | ||
| 9775 | |||
| 9776 | /* | ||
| 9777 | * Do not pull tasks towards !active CPUs... | ||
| 9778 | */ | ||
| 9779 | if (!cpu_active(this_cpu)) | ||
| 9780 | return 0; | ||
| 9781 | |||
| 9782 | /* | ||
| 9783 | * This is OK, because current is on_cpu, which avoids it being picked | ||
| 9784 | * for load-balance and preemption/IRQs are still disabled avoiding | ||
| 9785 | * further scheduler activity on it and we're being very careful to | ||
| 9786 | * re-start the picking loop. | ||
| 9787 | */ | ||
| 9788 | rq_unpin_lock(this_rq, rf); | ||
| 9789 | |||
| 9790 | if (this_rq->avg_idle < sysctl_sched_migration_cost || | ||
| 9791 | !this_rq->rd->overload) { | ||
| 9792 | |||
| 9793 | rcu_read_lock(); | ||
| 9794 | sd = rcu_dereference_check_sched_domain(this_rq->sd); | ||
| 9795 | if (sd) | ||
| 9796 | update_next_balance(sd, &next_balance); | ||
| 9797 | rcu_read_unlock(); | ||
| 9798 | |||
| 9799 | nohz_newidle_balance(this_rq); | ||
| 9800 | |||
| 9801 | goto out; | ||
| 9802 | } | ||
| 9803 | |||
| 9804 | raw_spin_unlock(&this_rq->lock); | ||
| 9805 | |||
| 9806 | update_blocked_averages(this_cpu); | ||
| 9807 | rcu_read_lock(); | ||
| 9808 | for_each_domain(this_cpu, sd) { | ||
| 9809 | int continue_balancing = 1; | ||
| 9810 | u64 t0, domain_cost; | ||
| 9811 | |||
| 9812 | if (!(sd->flags & SD_LOAD_BALANCE)) | ||
| 9813 | continue; | ||
| 9814 | |||
| 9815 | if (this_rq->avg_idle < curr_cost + sd->max_newidle_lb_cost) { | ||
| 9816 | update_next_balance(sd, &next_balance); | ||
| 9817 | break; | ||
| 9818 | } | ||
| 9819 | |||
| 9820 | if (sd->flags & SD_BALANCE_NEWIDLE) { | ||
| 9821 | t0 = sched_clock_cpu(this_cpu); | ||
| 9822 | |||
| 9823 | pulled_task = load_balance(this_cpu, this_rq, | ||
| 9824 | sd, CPU_NEWLY_IDLE, | ||
| 9825 | &continue_balancing); | ||
| 9826 | |||
| 9827 | domain_cost = sched_clock_cpu(this_cpu) - t0; | ||
| 9828 | if (domain_cost > sd->max_newidle_lb_cost) | ||
| 9829 | sd->max_newidle_lb_cost = domain_cost; | ||
| 9830 | |||
| 9831 | curr_cost += domain_cost; | ||
| 9832 | } | ||
| 9833 | |||
| 9834 | update_next_balance(sd, &next_balance); | ||
| 9835 | |||
| 9836 | /* | ||
| 9837 | * Stop searching for tasks to pull if there are | ||
| 9838 | * now runnable tasks on this rq. | ||
| 9839 | */ | ||
| 9840 | if (pulled_task || this_rq->nr_running > 0) | ||
| 9841 | break; | ||
| 9842 | } | ||
| 9843 | rcu_read_unlock(); | ||
| 9844 | |||
| 9845 | raw_spin_lock(&this_rq->lock); | ||
| 9846 | |||
| 9847 | if (curr_cost > this_rq->max_idle_balance_cost) | ||
| 9848 | this_rq->max_idle_balance_cost = curr_cost; | ||
| 9849 | |||
| 9850 | /* | ||
| 9851 | * While browsing the domains, we released the rq lock, a task could | ||
| 9852 | * have been enqueued in the meantime. Since we're not going idle, | ||
| 9853 | * pretend we pulled a task. | ||
| 9854 | */ | ||
| 9855 | if (this_rq->cfs.h_nr_running && !pulled_task) | ||
| 9856 | pulled_task = 1; | ||
| 9857 | |||
| 9858 | out: | ||
| 9859 | /* Move the next balance forward */ | ||
| 9860 | if (time_after(this_rq->next_balance, next_balance)) | ||
| 9861 | this_rq->next_balance = next_balance; | ||
| 9862 | |||
| 9863 | /* Is there a task of a high priority class? */ | ||
| 9864 | if (this_rq->nr_running != this_rq->cfs.h_nr_running) | ||
| 9865 | pulled_task = -1; | ||
| 9866 | |||
| 9867 | if (pulled_task) | ||
| 9868 | this_rq->idle_stamp = 0; | ||
| 9869 | |||
| 9870 | rq_repin_lock(this_rq, rf); | ||
| 9871 | |||
| 9872 | return pulled_task; | ||
| 9381 | } | 9873 | } |
| 9382 | #else | ||
| 9383 | static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) { } | ||
| 9384 | #endif | ||
| 9385 | 9874 | ||
| 9386 | /* | 9875 | /* |
| 9387 | * run_rebalance_domains is triggered when needed from the scheduler tick. | 9876 | * run_rebalance_domains is triggered when needed from the scheduler tick. |
| @@ -9394,14 +9883,18 @@ static __latent_entropy void run_rebalance_domains(struct softirq_action *h) | |||
| 9394 | CPU_IDLE : CPU_NOT_IDLE; | 9883 | CPU_IDLE : CPU_NOT_IDLE; |
| 9395 | 9884 | ||
| 9396 | /* | 9885 | /* |
| 9397 | * If this cpu has a pending nohz_balance_kick, then do the | 9886 | * If this CPU has a pending nohz_balance_kick, then do the |
| 9398 | * balancing on behalf of the other idle cpus whose ticks are | 9887 | * balancing on behalf of the other idle CPUs whose ticks are |
| 9399 | * stopped. Do nohz_idle_balance *before* rebalance_domains to | 9888 | * stopped. Do nohz_idle_balance *before* rebalance_domains to |
| 9400 | * give the idle cpus a chance to load balance. Else we may | 9889 | * give the idle CPUs a chance to load balance. Else we may |
| 9401 | * load balance only within the local sched_domain hierarchy | 9890 | * load balance only within the local sched_domain hierarchy |
| 9402 | * and abort nohz_idle_balance altogether if we pull some load. | 9891 | * and abort nohz_idle_balance altogether if we pull some load. |
| 9403 | */ | 9892 | */ |
| 9404 | nohz_idle_balance(this_rq, idle); | 9893 | if (nohz_idle_balance(this_rq, idle)) |
| 9894 | return; | ||
| 9895 | |||
| 9896 | /* normal load balance */ | ||
| 9897 | update_blocked_averages(this_rq->cpu); | ||
| 9405 | rebalance_domains(this_rq, idle); | 9898 | rebalance_domains(this_rq, idle); |
| 9406 | } | 9899 | } |
| 9407 | 9900 | ||
| @@ -9416,10 +9909,8 @@ void trigger_load_balance(struct rq *rq) | |||
| 9416 | 9909 | ||
| 9417 | if (time_after_eq(jiffies, rq->next_balance)) | 9910 | if (time_after_eq(jiffies, rq->next_balance)) |
| 9418 | raise_softirq(SCHED_SOFTIRQ); | 9911 | raise_softirq(SCHED_SOFTIRQ); |
| 9419 | #ifdef CONFIG_NO_HZ_COMMON | 9912 | |
| 9420 | if (nohz_kick_needed(rq)) | 9913 | nohz_balancer_kick(rq); |
| 9421 | nohz_balancer_kick(); | ||
| 9422 | #endif | ||
| 9423 | } | 9914 | } |
| 9424 | 9915 | ||
| 9425 | static void rq_online_fair(struct rq *rq) | 9916 | static void rq_online_fair(struct rq *rq) |
| @@ -9440,7 +9931,12 @@ static void rq_offline_fair(struct rq *rq) | |||
| 9440 | #endif /* CONFIG_SMP */ | 9931 | #endif /* CONFIG_SMP */ |
| 9441 | 9932 | ||
| 9442 | /* | 9933 | /* |
| 9443 | * scheduler tick hitting a task of our scheduling class: | 9934 | * scheduler tick hitting a task of our scheduling class. |
| 9935 | * | ||
| 9936 | * NOTE: This function can be called remotely by the tick offload that | ||
| 9937 | * goes along full dynticks. Therefore no local assumption can be made | ||
| 9938 | * and everything must be accessed through the @rq and @curr passed in | ||
| 9939 | * parameters. | ||
| 9444 | */ | 9940 | */ |
| 9445 | static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued) | 9941 | static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued) |
| 9446 | { | 9942 | { |
| @@ -9591,7 +10087,7 @@ static void attach_entity_cfs_rq(struct sched_entity *se) | |||
| 9591 | 10087 | ||
| 9592 | /* Synchronize entity with its cfs_rq */ | 10088 | /* Synchronize entity with its cfs_rq */ |
| 9593 | update_load_avg(cfs_rq, se, sched_feat(ATTACH_AGE_LOAD) ? 0 : SKIP_AGE_LOAD); | 10089 | update_load_avg(cfs_rq, se, sched_feat(ATTACH_AGE_LOAD) ? 0 : SKIP_AGE_LOAD); |
| 9594 | attach_entity_load_avg(cfs_rq, se); | 10090 | attach_entity_load_avg(cfs_rq, se, 0); |
| 9595 | update_tg_load_avg(cfs_rq, false); | 10091 | update_tg_load_avg(cfs_rq, false); |
| 9596 | propagate_entity_cfs_rq(se); | 10092 | propagate_entity_cfs_rq(se); |
| 9597 | } | 10093 | } |
| @@ -9993,6 +10489,7 @@ __init void init_sched_fair_class(void) | |||
| 9993 | 10489 | ||
| 9994 | #ifdef CONFIG_NO_HZ_COMMON | 10490 | #ifdef CONFIG_NO_HZ_COMMON |
| 9995 | nohz.next_balance = jiffies; | 10491 | nohz.next_balance = jiffies; |
| 10492 | nohz.next_blocked = jiffies; | ||
| 9996 | zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT); | 10493 | zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT); |
| 9997 | #endif | 10494 | #endif |
| 9998 | #endif /* SMP */ | 10495 | #endif /* SMP */ |
diff --git a/kernel/sched/features.h b/kernel/sched/features.h index 9552fd5854bf..85ae8488039c 100644 --- a/kernel/sched/features.h +++ b/kernel/sched/features.h | |||
| @@ -85,3 +85,8 @@ SCHED_FEAT(ATTACH_AGE_LOAD, true) | |||
| 85 | SCHED_FEAT(WA_IDLE, true) | 85 | SCHED_FEAT(WA_IDLE, true) |
| 86 | SCHED_FEAT(WA_WEIGHT, true) | 86 | SCHED_FEAT(WA_WEIGHT, true) |
| 87 | SCHED_FEAT(WA_BIAS, true) | 87 | SCHED_FEAT(WA_BIAS, true) |
| 88 | |||
| 89 | /* | ||
| 90 | * UtilEstimation. Use estimated CPU utilization. | ||
| 91 | */ | ||
| 92 | SCHED_FEAT(UTIL_EST, true) | ||
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index 7dae9eb8c042..2975f195e1c4 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c | |||
| @@ -1,23 +1,14 @@ | |||
| 1 | /* | 1 | /* |
| 2 | * Generic entry point for the idle threads | 2 | * Generic entry points for the idle threads and |
| 3 | * implementation of the idle task scheduling class. | ||
| 4 | * | ||
| 5 | * (NOTE: these are not related to SCHED_IDLE batch scheduled | ||
| 6 | * tasks which are handled in sched/fair.c ) | ||
| 3 | */ | 7 | */ |
| 4 | #include <linux/sched.h> | 8 | #include "sched.h" |
| 5 | #include <linux/sched/idle.h> | ||
| 6 | #include <linux/cpu.h> | ||
| 7 | #include <linux/cpuidle.h> | ||
| 8 | #include <linux/cpuhotplug.h> | ||
| 9 | #include <linux/tick.h> | ||
| 10 | #include <linux/mm.h> | ||
| 11 | #include <linux/stackprotector.h> | ||
| 12 | #include <linux/suspend.h> | ||
| 13 | #include <linux/livepatch.h> | ||
| 14 | |||
| 15 | #include <asm/tlb.h> | ||
| 16 | 9 | ||
| 17 | #include <trace/events/power.h> | 10 | #include <trace/events/power.h> |
| 18 | 11 | ||
| 19 | #include "sched.h" | ||
| 20 | |||
| 21 | /* Linker adds these: start and end of __cpuidle functions */ | 12 | /* Linker adds these: start and end of __cpuidle functions */ |
| 22 | extern char __cpuidle_text_start[], __cpuidle_text_end[]; | 13 | extern char __cpuidle_text_start[], __cpuidle_text_end[]; |
| 23 | 14 | ||
| @@ -46,6 +37,7 @@ void cpu_idle_poll_ctrl(bool enable) | |||
| 46 | static int __init cpu_idle_poll_setup(char *__unused) | 37 | static int __init cpu_idle_poll_setup(char *__unused) |
| 47 | { | 38 | { |
| 48 | cpu_idle_force_poll = 1; | 39 | cpu_idle_force_poll = 1; |
| 40 | |||
| 49 | return 1; | 41 | return 1; |
| 50 | } | 42 | } |
| 51 | __setup("nohlt", cpu_idle_poll_setup); | 43 | __setup("nohlt", cpu_idle_poll_setup); |
| @@ -53,6 +45,7 @@ __setup("nohlt", cpu_idle_poll_setup); | |||
| 53 | static int __init cpu_idle_nopoll_setup(char *__unused) | 45 | static int __init cpu_idle_nopoll_setup(char *__unused) |
| 54 | { | 46 | { |
| 55 | cpu_idle_force_poll = 0; | 47 | cpu_idle_force_poll = 0; |
| 48 | |||
| 56 | return 1; | 49 | return 1; |
| 57 | } | 50 | } |
| 58 | __setup("hlt", cpu_idle_nopoll_setup); | 51 | __setup("hlt", cpu_idle_nopoll_setup); |
| @@ -64,12 +57,14 @@ static noinline int __cpuidle cpu_idle_poll(void) | |||
| 64 | trace_cpu_idle_rcuidle(0, smp_processor_id()); | 57 | trace_cpu_idle_rcuidle(0, smp_processor_id()); |
| 65 | local_irq_enable(); | 58 | local_irq_enable(); |
| 66 | stop_critical_timings(); | 59 | stop_critical_timings(); |
| 60 | |||
| 67 | while (!tif_need_resched() && | 61 | while (!tif_need_resched() && |
| 68 | (cpu_idle_force_poll || tick_check_broadcast_expired())) | 62 | (cpu_idle_force_poll || tick_check_broadcast_expired())) |
| 69 | cpu_relax(); | 63 | cpu_relax(); |
| 70 | start_critical_timings(); | 64 | start_critical_timings(); |
| 71 | trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, smp_processor_id()); | 65 | trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, smp_processor_id()); |
| 72 | rcu_idle_exit(); | 66 | rcu_idle_exit(); |
| 67 | |||
| 73 | return 1; | 68 | return 1; |
| 74 | } | 69 | } |
| 75 | 70 | ||
| @@ -332,8 +327,8 @@ void cpu_startup_entry(enum cpuhp_state state) | |||
| 332 | { | 327 | { |
| 333 | /* | 328 | /* |
| 334 | * This #ifdef needs to die, but it's too late in the cycle to | 329 | * This #ifdef needs to die, but it's too late in the cycle to |
| 335 | * make this generic (arm and sh have never invoked the canary | 330 | * make this generic (ARM and SH have never invoked the canary |
| 336 | * init for the non boot cpus!). Will be fixed in 3.11 | 331 | * init for the non boot CPUs!). Will be fixed in 3.11 |
| 337 | */ | 332 | */ |
| 338 | #ifdef CONFIG_X86 | 333 | #ifdef CONFIG_X86 |
| 339 | /* | 334 | /* |
| @@ -350,3 +345,116 @@ void cpu_startup_entry(enum cpuhp_state state) | |||
| 350 | while (1) | 345 | while (1) |
| 351 | do_idle(); | 346 | do_idle(); |
| 352 | } | 347 | } |
| 348 | |||
| 349 | /* | ||
| 350 | * idle-task scheduling class. | ||
| 351 | */ | ||
| 352 | |||
| 353 | #ifdef CONFIG_SMP | ||
| 354 | static int | ||
| 355 | select_task_rq_idle(struct task_struct *p, int cpu, int sd_flag, int flags) | ||
| 356 | { | ||
| 357 | return task_cpu(p); /* IDLE tasks as never migrated */ | ||
| 358 | } | ||
| 359 | #endif | ||
| 360 | |||
| 361 | /* | ||
| 362 | * Idle tasks are unconditionally rescheduled: | ||
| 363 | */ | ||
| 364 | static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int flags) | ||
| 365 | { | ||
| 366 | resched_curr(rq); | ||
| 367 | } | ||
| 368 | |||
| 369 | static struct task_struct * | ||
| 370 | pick_next_task_idle(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) | ||
| 371 | { | ||
| 372 | put_prev_task(rq, prev); | ||
| 373 | update_idle_core(rq); | ||
| 374 | schedstat_inc(rq->sched_goidle); | ||
| 375 | |||
| 376 | return rq->idle; | ||
| 377 | } | ||
| 378 | |||
| 379 | /* | ||
| 380 | * It is not legal to sleep in the idle task - print a warning | ||
| 381 | * message if some code attempts to do it: | ||
| 382 | */ | ||
| 383 | static void | ||
| 384 | dequeue_task_idle(struct rq *rq, struct task_struct *p, int flags) | ||
| 385 | { | ||
| 386 | raw_spin_unlock_irq(&rq->lock); | ||
| 387 | printk(KERN_ERR "bad: scheduling from the idle thread!\n"); | ||
| 388 | dump_stack(); | ||
| 389 | raw_spin_lock_irq(&rq->lock); | ||
| 390 | } | ||
| 391 | |||
| 392 | static void put_prev_task_idle(struct rq *rq, struct task_struct *prev) | ||
| 393 | { | ||
| 394 | } | ||
| 395 | |||
| 396 | /* | ||
| 397 | * scheduler tick hitting a task of our scheduling class. | ||
| 398 | * | ||
| 399 | * NOTE: This function can be called remotely by the tick offload that | ||
| 400 | * goes along full dynticks. Therefore no local assumption can be made | ||
| 401 | * and everything must be accessed through the @rq and @curr passed in | ||
| 402 | * parameters. | ||
| 403 | */ | ||
| 404 | static void task_tick_idle(struct rq *rq, struct task_struct *curr, int queued) | ||
| 405 | { | ||
| 406 | } | ||
| 407 | |||
| 408 | static void set_curr_task_idle(struct rq *rq) | ||
| 409 | { | ||
| 410 | } | ||
| 411 | |||
| 412 | static void switched_to_idle(struct rq *rq, struct task_struct *p) | ||
| 413 | { | ||
| 414 | BUG(); | ||
| 415 | } | ||
| 416 | |||
| 417 | static void | ||
| 418 | prio_changed_idle(struct rq *rq, struct task_struct *p, int oldprio) | ||
| 419 | { | ||
| 420 | BUG(); | ||
| 421 | } | ||
| 422 | |||
| 423 | static unsigned int get_rr_interval_idle(struct rq *rq, struct task_struct *task) | ||
| 424 | { | ||
| 425 | return 0; | ||
| 426 | } | ||
| 427 | |||
| 428 | static void update_curr_idle(struct rq *rq) | ||
| 429 | { | ||
| 430 | } | ||
| 431 | |||
| 432 | /* | ||
| 433 | * Simple, special scheduling class for the per-CPU idle tasks: | ||
| 434 | */ | ||
| 435 | const struct sched_class idle_sched_class = { | ||
| 436 | /* .next is NULL */ | ||
| 437 | /* no enqueue/yield_task for idle tasks */ | ||
| 438 | |||
| 439 | /* dequeue is not valid, we print a debug message there: */ | ||
| 440 | .dequeue_task = dequeue_task_idle, | ||
| 441 | |||
| 442 | .check_preempt_curr = check_preempt_curr_idle, | ||
| 443 | |||
| 444 | .pick_next_task = pick_next_task_idle, | ||
| 445 | .put_prev_task = put_prev_task_idle, | ||
| 446 | |||
| 447 | #ifdef CONFIG_SMP | ||
| 448 | .select_task_rq = select_task_rq_idle, | ||
| 449 | .set_cpus_allowed = set_cpus_allowed_common, | ||
| 450 | #endif | ||
| 451 | |||
| 452 | .set_curr_task = set_curr_task_idle, | ||
| 453 | .task_tick = task_tick_idle, | ||
| 454 | |||
| 455 | .get_rr_interval = get_rr_interval_idle, | ||
| 456 | |||
| 457 | .prio_changed = prio_changed_idle, | ||
| 458 | .switched_to = switched_to_idle, | ||
| 459 | .update_curr = update_curr_idle, | ||
| 460 | }; | ||
diff --git a/kernel/sched/idle_task.c b/kernel/sched/idle_task.c deleted file mode 100644 index d518664cce4f..000000000000 --- a/kernel/sched/idle_task.c +++ /dev/null | |||
| @@ -1,110 +0,0 @@ | |||
| 1 | // SPDX-License-Identifier: GPL-2.0 | ||
| 2 | #include "sched.h" | ||
| 3 | |||
| 4 | /* | ||
| 5 | * idle-task scheduling class. | ||
| 6 | * | ||
| 7 | * (NOTE: these are not related to SCHED_IDLE tasks which are | ||
| 8 | * handled in sched/fair.c) | ||
| 9 | */ | ||
| 10 | |||
| 11 | #ifdef CONFIG_SMP | ||
| 12 | static int | ||
| 13 | select_task_rq_idle(struct task_struct *p, int cpu, int sd_flag, int flags) | ||
| 14 | { | ||
| 15 | return task_cpu(p); /* IDLE tasks as never migrated */ | ||
| 16 | } | ||
| 17 | #endif /* CONFIG_SMP */ | ||
| 18 | |||
| 19 | /* | ||
| 20 | * Idle tasks are unconditionally rescheduled: | ||
| 21 | */ | ||
| 22 | static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int flags) | ||
| 23 | { | ||
| 24 | resched_curr(rq); | ||
| 25 | } | ||
| 26 | |||
| 27 | static struct task_struct * | ||
| 28 | pick_next_task_idle(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) | ||
| 29 | { | ||
| 30 | put_prev_task(rq, prev); | ||
| 31 | update_idle_core(rq); | ||
| 32 | schedstat_inc(rq->sched_goidle); | ||
| 33 | return rq->idle; | ||
| 34 | } | ||
| 35 | |||
| 36 | /* | ||
| 37 | * It is not legal to sleep in the idle task - print a warning | ||
| 38 | * message if some code attempts to do it: | ||
| 39 | */ | ||
| 40 | static void | ||
| 41 | dequeue_task_idle(struct rq *rq, struct task_struct *p, int flags) | ||
| 42 | { | ||
| 43 | raw_spin_unlock_irq(&rq->lock); | ||
| 44 | printk(KERN_ERR "bad: scheduling from the idle thread!\n"); | ||
| 45 | dump_stack(); | ||
| 46 | raw_spin_lock_irq(&rq->lock); | ||
| 47 | } | ||
| 48 | |||
| 49 | static void put_prev_task_idle(struct rq *rq, struct task_struct *prev) | ||
| 50 | { | ||
| 51 | rq_last_tick_reset(rq); | ||
| 52 | } | ||
| 53 | |||
| 54 | static void task_tick_idle(struct rq *rq, struct task_struct *curr, int queued) | ||
| 55 | { | ||
| 56 | } | ||
| 57 | |||
| 58 | static void set_curr_task_idle(struct rq *rq) | ||
| 59 | { | ||
| 60 | } | ||
| 61 | |||
| 62 | static void switched_to_idle(struct rq *rq, struct task_struct *p) | ||
| 63 | { | ||
| 64 | BUG(); | ||
| 65 | } | ||
| 66 | |||
| 67 | static void | ||
| 68 | prio_changed_idle(struct rq *rq, struct task_struct *p, int oldprio) | ||
| 69 | { | ||
| 70 | BUG(); | ||
| 71 | } | ||
| 72 | |||
| 73 | static unsigned int get_rr_interval_idle(struct rq *rq, struct task_struct *task) | ||
| 74 | { | ||
| 75 | return 0; | ||
| 76 | } | ||
| 77 | |||
| 78 | static void update_curr_idle(struct rq *rq) | ||
| 79 | { | ||
| 80 | } | ||
| 81 | |||
| 82 | /* | ||
| 83 | * Simple, special scheduling class for the per-CPU idle tasks: | ||
| 84 | */ | ||
| 85 | const struct sched_class idle_sched_class = { | ||
| 86 | /* .next is NULL */ | ||
| 87 | /* no enqueue/yield_task for idle tasks */ | ||
| 88 | |||
| 89 | /* dequeue is not valid, we print a debug message there: */ | ||
| 90 | .dequeue_task = dequeue_task_idle, | ||
| 91 | |||
| 92 | .check_preempt_curr = check_preempt_curr_idle, | ||
| 93 | |||
| 94 | .pick_next_task = pick_next_task_idle, | ||
| 95 | .put_prev_task = put_prev_task_idle, | ||
| 96 | |||
| 97 | #ifdef CONFIG_SMP | ||
| 98 | .select_task_rq = select_task_rq_idle, | ||
| 99 | .set_cpus_allowed = set_cpus_allowed_common, | ||
| 100 | #endif | ||
| 101 | |||
| 102 | .set_curr_task = set_curr_task_idle, | ||
| 103 | .task_tick = task_tick_idle, | ||
| 104 | |||
| 105 | .get_rr_interval = get_rr_interval_idle, | ||
| 106 | |||
| 107 | .prio_changed = prio_changed_idle, | ||
| 108 | .switched_to = switched_to_idle, | ||
| 109 | .update_curr = update_curr_idle, | ||
| 110 | }; | ||
diff --git a/kernel/sched/isolation.c b/kernel/sched/isolation.c index b71b436f59f2..e6802181900f 100644 --- a/kernel/sched/isolation.c +++ b/kernel/sched/isolation.c | |||
| @@ -3,15 +3,10 @@ | |||
| 3 | * any CPU: unbound workqueues, timers, kthreads and any offloadable work. | 3 | * any CPU: unbound workqueues, timers, kthreads and any offloadable work. |
| 4 | * | 4 | * |
| 5 | * Copyright (C) 2017 Red Hat, Inc., Frederic Weisbecker | 5 | * Copyright (C) 2017 Red Hat, Inc., Frederic Weisbecker |
| 6 | * Copyright (C) 2017-2018 SUSE, Frederic Weisbecker | ||
| 6 | * | 7 | * |
| 7 | */ | 8 | */ |
| 8 | 9 | #include "sched.h" | |
| 9 | #include <linux/sched/isolation.h> | ||
| 10 | #include <linux/tick.h> | ||
| 11 | #include <linux/init.h> | ||
| 12 | #include <linux/kernel.h> | ||
| 13 | #include <linux/static_key.h> | ||
| 14 | #include <linux/ctype.h> | ||
| 15 | 10 | ||
| 16 | DEFINE_STATIC_KEY_FALSE(housekeeping_overriden); | 11 | DEFINE_STATIC_KEY_FALSE(housekeeping_overriden); |
| 17 | EXPORT_SYMBOL_GPL(housekeeping_overriden); | 12 | EXPORT_SYMBOL_GPL(housekeeping_overriden); |
| @@ -60,6 +55,9 @@ void __init housekeeping_init(void) | |||
| 60 | 55 | ||
| 61 | static_branch_enable(&housekeeping_overriden); | 56 | static_branch_enable(&housekeeping_overriden); |
| 62 | 57 | ||
| 58 | if (housekeeping_flags & HK_FLAG_TICK) | ||
| 59 | sched_tick_offload_init(); | ||
| 60 | |||
| 63 | /* We need at least one CPU to handle housekeeping work */ | 61 | /* We need at least one CPU to handle housekeeping work */ |
| 64 | WARN_ON_ONCE(cpumask_empty(housekeeping_mask)); | 62 | WARN_ON_ONCE(cpumask_empty(housekeeping_mask)); |
| 65 | } | 63 | } |
| @@ -119,7 +117,7 @@ static int __init housekeeping_nohz_full_setup(char *str) | |||
| 119 | { | 117 | { |
| 120 | unsigned int flags; | 118 | unsigned int flags; |
| 121 | 119 | ||
| 122 | flags = HK_FLAG_TICK | HK_FLAG_TIMER | HK_FLAG_RCU | HK_FLAG_MISC; | 120 | flags = HK_FLAG_TICK | HK_FLAG_WQ | HK_FLAG_TIMER | HK_FLAG_RCU | HK_FLAG_MISC; |
| 123 | 121 | ||
| 124 | return housekeeping_setup(str, flags); | 122 | return housekeeping_setup(str, flags); |
| 125 | } | 123 | } |
diff --git a/kernel/sched/loadavg.c b/kernel/sched/loadavg.c index 89a989e4d758..a171c1258109 100644 --- a/kernel/sched/loadavg.c +++ b/kernel/sched/loadavg.c | |||
| @@ -6,10 +6,6 @@ | |||
| 6 | * figure. Its a silly number but people think its important. We go through | 6 | * figure. Its a silly number but people think its important. We go through |
| 7 | * great pains to make it work on big machines and tickless kernels. | 7 | * great pains to make it work on big machines and tickless kernels. |
| 8 | */ | 8 | */ |
| 9 | |||
| 10 | #include <linux/export.h> | ||
| 11 | #include <linux/sched/loadavg.h> | ||
| 12 | |||
| 13 | #include "sched.h" | 9 | #include "sched.h" |
| 14 | 10 | ||
| 15 | /* | 11 | /* |
| @@ -32,29 +28,29 @@ | |||
| 32 | * Due to a number of reasons the above turns in the mess below: | 28 | * Due to a number of reasons the above turns in the mess below: |
| 33 | * | 29 | * |
| 34 | * - for_each_possible_cpu() is prohibitively expensive on machines with | 30 | * - for_each_possible_cpu() is prohibitively expensive on machines with |
| 35 | * serious number of cpus, therefore we need to take a distributed approach | 31 | * serious number of CPUs, therefore we need to take a distributed approach |
| 36 | * to calculating nr_active. | 32 | * to calculating nr_active. |
| 37 | * | 33 | * |
| 38 | * \Sum_i x_i(t) = \Sum_i x_i(t) - x_i(t_0) | x_i(t_0) := 0 | 34 | * \Sum_i x_i(t) = \Sum_i x_i(t) - x_i(t_0) | x_i(t_0) := 0 |
| 39 | * = \Sum_i { \Sum_j=1 x_i(t_j) - x_i(t_j-1) } | 35 | * = \Sum_i { \Sum_j=1 x_i(t_j) - x_i(t_j-1) } |
| 40 | * | 36 | * |
| 41 | * So assuming nr_active := 0 when we start out -- true per definition, we | 37 | * So assuming nr_active := 0 when we start out -- true per definition, we |
| 42 | * can simply take per-cpu deltas and fold those into a global accumulate | 38 | * can simply take per-CPU deltas and fold those into a global accumulate |
| 43 | * to obtain the same result. See calc_load_fold_active(). | 39 | * to obtain the same result. See calc_load_fold_active(). |
| 44 | * | 40 | * |
| 45 | * Furthermore, in order to avoid synchronizing all per-cpu delta folding | 41 | * Furthermore, in order to avoid synchronizing all per-CPU delta folding |
| 46 | * across the machine, we assume 10 ticks is sufficient time for every | 42 | * across the machine, we assume 10 ticks is sufficient time for every |
| 47 | * cpu to have completed this task. | 43 | * CPU to have completed this task. |
| 48 | * | 44 | * |
| 49 | * This places an upper-bound on the IRQ-off latency of the machine. Then | 45 | * This places an upper-bound on the IRQ-off latency of the machine. Then |
| 50 | * again, being late doesn't loose the delta, just wrecks the sample. | 46 | * again, being late doesn't loose the delta, just wrecks the sample. |
| 51 | * | 47 | * |
| 52 | * - cpu_rq()->nr_uninterruptible isn't accurately tracked per-cpu because | 48 | * - cpu_rq()->nr_uninterruptible isn't accurately tracked per-CPU because |
| 53 | * this would add another cross-cpu cacheline miss and atomic operation | 49 | * this would add another cross-CPU cacheline miss and atomic operation |
| 54 | * to the wakeup path. Instead we increment on whatever cpu the task ran | 50 | * to the wakeup path. Instead we increment on whatever CPU the task ran |
| 55 | * when it went into uninterruptible state and decrement on whatever cpu | 51 | * when it went into uninterruptible state and decrement on whatever CPU |
| 56 | * did the wakeup. This means that only the sum of nr_uninterruptible over | 52 | * did the wakeup. This means that only the sum of nr_uninterruptible over |
| 57 | * all cpus yields the correct result. | 53 | * all CPUs yields the correct result. |
| 58 | * | 54 | * |
| 59 | * This covers the NO_HZ=n code, for extra head-aches, see the comment below. | 55 | * This covers the NO_HZ=n code, for extra head-aches, see the comment below. |
| 60 | */ | 56 | */ |
| @@ -115,11 +111,11 @@ calc_load(unsigned long load, unsigned long exp, unsigned long active) | |||
| 115 | * Handle NO_HZ for the global load-average. | 111 | * Handle NO_HZ for the global load-average. |
| 116 | * | 112 | * |
| 117 | * Since the above described distributed algorithm to compute the global | 113 | * Since the above described distributed algorithm to compute the global |
| 118 | * load-average relies on per-cpu sampling from the tick, it is affected by | 114 | * load-average relies on per-CPU sampling from the tick, it is affected by |
| 119 | * NO_HZ. | 115 | * NO_HZ. |
| 120 | * | 116 | * |
| 121 | * The basic idea is to fold the nr_active delta into a global NO_HZ-delta upon | 117 | * The basic idea is to fold the nr_active delta into a global NO_HZ-delta upon |
| 122 | * entering NO_HZ state such that we can include this as an 'extra' cpu delta | 118 | * entering NO_HZ state such that we can include this as an 'extra' CPU delta |
| 123 | * when we read the global state. | 119 | * when we read the global state. |
| 124 | * | 120 | * |
| 125 | * Obviously reality has to ruin such a delightfully simple scheme: | 121 | * Obviously reality has to ruin such a delightfully simple scheme: |
| @@ -146,9 +142,9 @@ calc_load(unsigned long load, unsigned long exp, unsigned long active) | |||
| 146 | * busy state. | 142 | * busy state. |
| 147 | * | 143 | * |
| 148 | * This is solved by pushing the window forward, and thus skipping the | 144 | * This is solved by pushing the window forward, and thus skipping the |
| 149 | * sample, for this cpu (effectively using the NO_HZ-delta for this cpu which | 145 | * sample, for this CPU (effectively using the NO_HZ-delta for this CPU which |
| 150 | * was in effect at the time the window opened). This also solves the issue | 146 | * was in effect at the time the window opened). This also solves the issue |
| 151 | * of having to deal with a cpu having been in NO_HZ for multiple LOAD_FREQ | 147 | * of having to deal with a CPU having been in NO_HZ for multiple LOAD_FREQ |
| 152 | * intervals. | 148 | * intervals. |
| 153 | * | 149 | * |
| 154 | * When making the ILB scale, we should try to pull this in as well. | 150 | * When making the ILB scale, we should try to pull this in as well. |
| @@ -299,7 +295,7 @@ calc_load_n(unsigned long load, unsigned long exp, | |||
| 299 | } | 295 | } |
| 300 | 296 | ||
| 301 | /* | 297 | /* |
| 302 | * NO_HZ can leave us missing all per-cpu ticks calling | 298 | * NO_HZ can leave us missing all per-CPU ticks calling |
| 303 | * calc_load_fold_active(), but since a NO_HZ CPU folds its delta into | 299 | * calc_load_fold_active(), but since a NO_HZ CPU folds its delta into |
| 304 | * calc_load_nohz per calc_load_nohz_start(), all we need to do is fold | 300 | * calc_load_nohz per calc_load_nohz_start(), all we need to do is fold |
| 305 | * in the pending NO_HZ delta if our NO_HZ period crossed a load cycle boundary. | 301 | * in the pending NO_HZ delta if our NO_HZ period crossed a load cycle boundary. |
| @@ -363,7 +359,7 @@ void calc_global_load(unsigned long ticks) | |||
| 363 | return; | 359 | return; |
| 364 | 360 | ||
| 365 | /* | 361 | /* |
| 366 | * Fold the 'old' NO_HZ-delta to include all NO_HZ cpus. | 362 | * Fold the 'old' NO_HZ-delta to include all NO_HZ CPUs. |
| 367 | */ | 363 | */ |
| 368 | delta = calc_load_nohz_fold(); | 364 | delta = calc_load_nohz_fold(); |
| 369 | if (delta) | 365 | if (delta) |
diff --git a/kernel/sched/membarrier.c b/kernel/sched/membarrier.c index 5d0762633639..76e0eaf4654e 100644 --- a/kernel/sched/membarrier.c +++ b/kernel/sched/membarrier.c | |||
| @@ -13,32 +13,25 @@ | |||
| 13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | 13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
| 14 | * GNU General Public License for more details. | 14 | * GNU General Public License for more details. |
| 15 | */ | 15 | */ |
| 16 | 16 | #include "sched.h" | |
| 17 | #include <linux/syscalls.h> | ||
| 18 | #include <linux/membarrier.h> | ||
| 19 | #include <linux/tick.h> | ||
| 20 | #include <linux/cpumask.h> | ||
| 21 | #include <linux/atomic.h> | ||
| 22 | |||
| 23 | #include "sched.h" /* for cpu_rq(). */ | ||
| 24 | 17 | ||
| 25 | /* | 18 | /* |
| 26 | * Bitmask made from a "or" of all commands within enum membarrier_cmd, | 19 | * Bitmask made from a "or" of all commands within enum membarrier_cmd, |
| 27 | * except MEMBARRIER_CMD_QUERY. | 20 | * except MEMBARRIER_CMD_QUERY. |
| 28 | */ | 21 | */ |
| 29 | #ifdef CONFIG_ARCH_HAS_MEMBARRIER_SYNC_CORE | 22 | #ifdef CONFIG_ARCH_HAS_MEMBARRIER_SYNC_CORE |
| 30 | #define MEMBARRIER_PRIVATE_EXPEDITED_SYNC_CORE_BITMASK \ | 23 | #define MEMBARRIER_PRIVATE_EXPEDITED_SYNC_CORE_BITMASK \ |
| 31 | (MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE \ | 24 | (MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE \ |
| 32 | | MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_SYNC_CORE) | 25 | | MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_SYNC_CORE) |
| 33 | #else | 26 | #else |
| 34 | #define MEMBARRIER_PRIVATE_EXPEDITED_SYNC_CORE_BITMASK 0 | 27 | #define MEMBARRIER_PRIVATE_EXPEDITED_SYNC_CORE_BITMASK 0 |
| 35 | #endif | 28 | #endif |
| 36 | 29 | ||
| 37 | #define MEMBARRIER_CMD_BITMASK \ | 30 | #define MEMBARRIER_CMD_BITMASK \ |
| 38 | (MEMBARRIER_CMD_GLOBAL | MEMBARRIER_CMD_GLOBAL_EXPEDITED \ | 31 | (MEMBARRIER_CMD_GLOBAL | MEMBARRIER_CMD_GLOBAL_EXPEDITED \ |
| 39 | | MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED \ | 32 | | MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED \ |
| 40 | | MEMBARRIER_CMD_PRIVATE_EXPEDITED \ | 33 | | MEMBARRIER_CMD_PRIVATE_EXPEDITED \ |
| 41 | | MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED \ | 34 | | MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED \ |
| 42 | | MEMBARRIER_PRIVATE_EXPEDITED_SYNC_CORE_BITMASK) | 35 | | MEMBARRIER_PRIVATE_EXPEDITED_SYNC_CORE_BITMASK) |
| 43 | 36 | ||
| 44 | static void ipi_mb(void *info) | 37 | static void ipi_mb(void *info) |
| @@ -85,6 +78,7 @@ static int membarrier_global_expedited(void) | |||
| 85 | */ | 78 | */ |
| 86 | if (cpu == raw_smp_processor_id()) | 79 | if (cpu == raw_smp_processor_id()) |
| 87 | continue; | 80 | continue; |
| 81 | |||
| 88 | rcu_read_lock(); | 82 | rcu_read_lock(); |
| 89 | p = task_rcu_dereference(&cpu_rq(cpu)->curr); | 83 | p = task_rcu_dereference(&cpu_rq(cpu)->curr); |
| 90 | if (p && p->mm && (atomic_read(&p->mm->membarrier_state) & | 84 | if (p && p->mm && (atomic_read(&p->mm->membarrier_state) & |
| @@ -188,6 +182,7 @@ static int membarrier_private_expedited(int flags) | |||
| 188 | * rq->curr modification in scheduler. | 182 | * rq->curr modification in scheduler. |
| 189 | */ | 183 | */ |
| 190 | smp_mb(); /* exit from system call is not a mb */ | 184 | smp_mb(); /* exit from system call is not a mb */ |
| 185 | |||
| 191 | return 0; | 186 | return 0; |
| 192 | } | 187 | } |
| 193 | 188 | ||
| @@ -219,6 +214,7 @@ static int membarrier_register_global_expedited(void) | |||
| 219 | } | 214 | } |
| 220 | atomic_or(MEMBARRIER_STATE_GLOBAL_EXPEDITED_READY, | 215 | atomic_or(MEMBARRIER_STATE_GLOBAL_EXPEDITED_READY, |
| 221 | &mm->membarrier_state); | 216 | &mm->membarrier_state); |
| 217 | |||
| 222 | return 0; | 218 | return 0; |
| 223 | } | 219 | } |
| 224 | 220 | ||
| @@ -253,6 +249,7 @@ static int membarrier_register_private_expedited(int flags) | |||
| 253 | synchronize_sched(); | 249 | synchronize_sched(); |
| 254 | } | 250 | } |
| 255 | atomic_or(state, &mm->membarrier_state); | 251 | atomic_or(state, &mm->membarrier_state); |
| 252 | |||
| 256 | return 0; | 253 | return 0; |
| 257 | } | 254 | } |
| 258 | 255 | ||
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index aad49451584e..86b77987435e 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c | |||
| @@ -3,12 +3,8 @@ | |||
| 3 | * Real-Time Scheduling Class (mapped to the SCHED_FIFO and SCHED_RR | 3 | * Real-Time Scheduling Class (mapped to the SCHED_FIFO and SCHED_RR |
| 4 | * policies) | 4 | * policies) |
| 5 | */ | 5 | */ |
| 6 | |||
| 7 | #include "sched.h" | 6 | #include "sched.h" |
| 8 | 7 | ||
| 9 | #include <linux/slab.h> | ||
| 10 | #include <linux/irq_work.h> | ||
| 11 | |||
| 12 | int sched_rr_timeslice = RR_TIMESLICE; | 8 | int sched_rr_timeslice = RR_TIMESLICE; |
| 13 | int sysctl_sched_rr_timeslice = (MSEC_PER_SEC / HZ) * RR_TIMESLICE; | 9 | int sysctl_sched_rr_timeslice = (MSEC_PER_SEC / HZ) * RR_TIMESLICE; |
| 14 | 10 | ||
| @@ -359,7 +355,7 @@ static DEFINE_PER_CPU(struct callback_head, rt_pull_head); | |||
| 359 | static void push_rt_tasks(struct rq *); | 355 | static void push_rt_tasks(struct rq *); |
| 360 | static void pull_rt_task(struct rq *); | 356 | static void pull_rt_task(struct rq *); |
| 361 | 357 | ||
| 362 | static inline void queue_push_tasks(struct rq *rq) | 358 | static inline void rt_queue_push_tasks(struct rq *rq) |
| 363 | { | 359 | { |
| 364 | if (!has_pushable_tasks(rq)) | 360 | if (!has_pushable_tasks(rq)) |
| 365 | return; | 361 | return; |
| @@ -367,7 +363,7 @@ static inline void queue_push_tasks(struct rq *rq) | |||
| 367 | queue_balance_callback(rq, &per_cpu(rt_push_head, rq->cpu), push_rt_tasks); | 363 | queue_balance_callback(rq, &per_cpu(rt_push_head, rq->cpu), push_rt_tasks); |
| 368 | } | 364 | } |
| 369 | 365 | ||
| 370 | static inline void queue_pull_task(struct rq *rq) | 366 | static inline void rt_queue_pull_task(struct rq *rq) |
| 371 | { | 367 | { |
| 372 | queue_balance_callback(rq, &per_cpu(rt_pull_head, rq->cpu), pull_rt_task); | 368 | queue_balance_callback(rq, &per_cpu(rt_pull_head, rq->cpu), pull_rt_task); |
| 373 | } | 369 | } |
| @@ -425,7 +421,7 @@ static inline void pull_rt_task(struct rq *this_rq) | |||
| 425 | { | 421 | { |
| 426 | } | 422 | } |
| 427 | 423 | ||
| 428 | static inline void queue_push_tasks(struct rq *rq) | 424 | static inline void rt_queue_push_tasks(struct rq *rq) |
| 429 | { | 425 | { |
| 430 | } | 426 | } |
| 431 | #endif /* CONFIG_SMP */ | 427 | #endif /* CONFIG_SMP */ |
| @@ -961,9 +957,6 @@ static void update_curr_rt(struct rq *rq) | |||
| 961 | if (unlikely((s64)delta_exec <= 0)) | 957 | if (unlikely((s64)delta_exec <= 0)) |
| 962 | return; | 958 | return; |
| 963 | 959 | ||
| 964 | /* Kick cpufreq (see the comment in kernel/sched/sched.h). */ | ||
| 965 | cpufreq_update_util(rq, SCHED_CPUFREQ_RT); | ||
| 966 | |||
| 967 | schedstat_set(curr->se.statistics.exec_max, | 960 | schedstat_set(curr->se.statistics.exec_max, |
| 968 | max(curr->se.statistics.exec_max, delta_exec)); | 961 | max(curr->se.statistics.exec_max, delta_exec)); |
| 969 | 962 | ||
| @@ -1005,6 +998,9 @@ dequeue_top_rt_rq(struct rt_rq *rt_rq) | |||
| 1005 | 998 | ||
| 1006 | sub_nr_running(rq, rt_rq->rt_nr_running); | 999 | sub_nr_running(rq, rt_rq->rt_nr_running); |
| 1007 | rt_rq->rt_queued = 0; | 1000 | rt_rq->rt_queued = 0; |
| 1001 | |||
| 1002 | /* Kick cpufreq (see the comment in kernel/sched/sched.h). */ | ||
| 1003 | cpufreq_update_util(rq, 0); | ||
| 1008 | } | 1004 | } |
| 1009 | 1005 | ||
| 1010 | static void | 1006 | static void |
| @@ -1021,6 +1017,9 @@ enqueue_top_rt_rq(struct rt_rq *rt_rq) | |||
| 1021 | 1017 | ||
| 1022 | add_nr_running(rq, rt_rq->rt_nr_running); | 1018 | add_nr_running(rq, rt_rq->rt_nr_running); |
| 1023 | rt_rq->rt_queued = 1; | 1019 | rt_rq->rt_queued = 1; |
| 1020 | |||
| 1021 | /* Kick cpufreq (see the comment in kernel/sched/sched.h). */ | ||
| 1022 | cpufreq_update_util(rq, 0); | ||
| 1024 | } | 1023 | } |
| 1025 | 1024 | ||
| 1026 | #if defined CONFIG_SMP | 1025 | #if defined CONFIG_SMP |
| @@ -1453,9 +1452,9 @@ static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p) | |||
| 1453 | return; | 1452 | return; |
| 1454 | 1453 | ||
| 1455 | /* | 1454 | /* |
| 1456 | * There appears to be other cpus that can accept | 1455 | * There appear to be other CPUs that can accept |
| 1457 | * current and none to run 'p', so lets reschedule | 1456 | * the current task but none can run 'p', so lets reschedule |
| 1458 | * to try and push current away: | 1457 | * to try and push the current task away: |
| 1459 | */ | 1458 | */ |
| 1460 | requeue_task_rt(rq, p, 1); | 1459 | requeue_task_rt(rq, p, 1); |
| 1461 | resched_curr(rq); | 1460 | resched_curr(rq); |
| @@ -1569,7 +1568,7 @@ pick_next_task_rt(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) | |||
| 1569 | /* The running task is never eligible for pushing */ | 1568 | /* The running task is never eligible for pushing */ |
| 1570 | dequeue_pushable_task(rq, p); | 1569 | dequeue_pushable_task(rq, p); |
| 1571 | 1570 | ||
| 1572 | queue_push_tasks(rq); | 1571 | rt_queue_push_tasks(rq); |
| 1573 | 1572 | ||
| 1574 | return p; | 1573 | return p; |
| 1575 | } | 1574 | } |
| @@ -1596,12 +1595,13 @@ static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu) | |||
| 1596 | if (!task_running(rq, p) && | 1595 | if (!task_running(rq, p) && |
| 1597 | cpumask_test_cpu(cpu, &p->cpus_allowed)) | 1596 | cpumask_test_cpu(cpu, &p->cpus_allowed)) |
| 1598 | return 1; | 1597 | return 1; |
| 1598 | |||
| 1599 | return 0; | 1599 | return 0; |
| 1600 | } | 1600 | } |
| 1601 | 1601 | ||
| 1602 | /* | 1602 | /* |
| 1603 | * Return the highest pushable rq's task, which is suitable to be executed | 1603 | * Return the highest pushable rq's task, which is suitable to be executed |
| 1604 | * on the cpu, NULL otherwise | 1604 | * on the CPU, NULL otherwise |
| 1605 | */ | 1605 | */ |
| 1606 | static struct task_struct *pick_highest_pushable_task(struct rq *rq, int cpu) | 1606 | static struct task_struct *pick_highest_pushable_task(struct rq *rq, int cpu) |
| 1607 | { | 1607 | { |
| @@ -1639,11 +1639,11 @@ static int find_lowest_rq(struct task_struct *task) | |||
| 1639 | return -1; /* No targets found */ | 1639 | return -1; /* No targets found */ |
| 1640 | 1640 | ||
| 1641 | /* | 1641 | /* |
| 1642 | * At this point we have built a mask of cpus representing the | 1642 | * At this point we have built a mask of CPUs representing the |
| 1643 | * lowest priority tasks in the system. Now we want to elect | 1643 | * lowest priority tasks in the system. Now we want to elect |
| 1644 | * the best one based on our affinity and topology. | 1644 | * the best one based on our affinity and topology. |
| 1645 | * | 1645 | * |
| 1646 | * We prioritize the last cpu that the task executed on since | 1646 | * We prioritize the last CPU that the task executed on since |
| 1647 | * it is most likely cache-hot in that location. | 1647 | * it is most likely cache-hot in that location. |
| 1648 | */ | 1648 | */ |
| 1649 | if (cpumask_test_cpu(cpu, lowest_mask)) | 1649 | if (cpumask_test_cpu(cpu, lowest_mask)) |
| @@ -1651,7 +1651,7 @@ static int find_lowest_rq(struct task_struct *task) | |||
| 1651 | 1651 | ||
| 1652 | /* | 1652 | /* |
| 1653 | * Otherwise, we consult the sched_domains span maps to figure | 1653 | * Otherwise, we consult the sched_domains span maps to figure |
| 1654 | * out which cpu is logically closest to our hot cache data. | 1654 | * out which CPU is logically closest to our hot cache data. |
| 1655 | */ | 1655 | */ |
| 1656 | if (!cpumask_test_cpu(this_cpu, lowest_mask)) | 1656 | if (!cpumask_test_cpu(this_cpu, lowest_mask)) |
| 1657 | this_cpu = -1; /* Skip this_cpu opt if not among lowest */ | 1657 | this_cpu = -1; /* Skip this_cpu opt if not among lowest */ |
| @@ -1692,6 +1692,7 @@ static int find_lowest_rq(struct task_struct *task) | |||
| 1692 | cpu = cpumask_any(lowest_mask); | 1692 | cpu = cpumask_any(lowest_mask); |
| 1693 | if (cpu < nr_cpu_ids) | 1693 | if (cpu < nr_cpu_ids) |
| 1694 | return cpu; | 1694 | return cpu; |
| 1695 | |||
| 1695 | return -1; | 1696 | return -1; |
| 1696 | } | 1697 | } |
| 1697 | 1698 | ||
| @@ -1827,7 +1828,7 @@ retry: | |||
| 1827 | * The task hasn't migrated, and is still the next | 1828 | * The task hasn't migrated, and is still the next |
| 1828 | * eligible task, but we failed to find a run-queue | 1829 | * eligible task, but we failed to find a run-queue |
| 1829 | * to push it to. Do not retry in this case, since | 1830 | * to push it to. Do not retry in this case, since |
| 1830 | * other cpus will pull from us when ready. | 1831 | * other CPUs will pull from us when ready. |
| 1831 | */ | 1832 | */ |
| 1832 | goto out; | 1833 | goto out; |
| 1833 | } | 1834 | } |
| @@ -1919,7 +1920,7 @@ static int rto_next_cpu(struct root_domain *rd) | |||
| 1919 | * rto_next_cpu() will simply return the first CPU found in | 1920 | * rto_next_cpu() will simply return the first CPU found in |
| 1920 | * the rto_mask. | 1921 | * the rto_mask. |
| 1921 | * | 1922 | * |
| 1922 | * If rto_next_cpu() is called while rto_cpu is a valid cpu, it | 1923 | * If rto_next_cpu() is called while rto_cpu is a valid CPU, it |
| 1923 | * will return the next CPU found in the rto_mask. | 1924 | * will return the next CPU found in the rto_mask. |
| 1924 | * | 1925 | * |
| 1925 | * If there are no more CPUs left in the rto_mask, then a check is made | 1926 | * If there are no more CPUs left in the rto_mask, then a check is made |
| @@ -1980,7 +1981,7 @@ static void tell_cpu_to_push(struct rq *rq) | |||
| 1980 | raw_spin_lock(&rq->rd->rto_lock); | 1981 | raw_spin_lock(&rq->rd->rto_lock); |
| 1981 | 1982 | ||
| 1982 | /* | 1983 | /* |
| 1983 | * The rto_cpu is updated under the lock, if it has a valid cpu | 1984 | * The rto_cpu is updated under the lock, if it has a valid CPU |
| 1984 | * then the IPI is still running and will continue due to the | 1985 | * then the IPI is still running and will continue due to the |
| 1985 | * update to loop_next, and nothing needs to be done here. | 1986 | * update to loop_next, and nothing needs to be done here. |
| 1986 | * Otherwise it is finishing up and an IPI needs to be sent. | 1987 | * Otherwise it is finishing up and an IPI needs to be sent. |
| @@ -2105,7 +2106,7 @@ static void pull_rt_task(struct rq *this_rq) | |||
| 2105 | 2106 | ||
| 2106 | /* | 2107 | /* |
| 2107 | * There's a chance that p is higher in priority | 2108 | * There's a chance that p is higher in priority |
| 2108 | * than what's currently running on its cpu. | 2109 | * than what's currently running on its CPU. |
| 2109 | * This is just that p is waking up and hasn't | 2110 | * This is just that p is waking up and hasn't |
| 2110 | * had a chance to schedule. We only pull | 2111 | * had a chance to schedule. We only pull |
| 2111 | * p if it is lower in priority than the | 2112 | * p if it is lower in priority than the |
| @@ -2187,7 +2188,7 @@ static void switched_from_rt(struct rq *rq, struct task_struct *p) | |||
| 2187 | if (!task_on_rq_queued(p) || rq->rt.rt_nr_running) | 2188 | if (!task_on_rq_queued(p) || rq->rt.rt_nr_running) |
| 2188 | return; | 2189 | return; |
| 2189 | 2190 | ||
| 2190 | queue_pull_task(rq); | 2191 | rt_queue_pull_task(rq); |
| 2191 | } | 2192 | } |
| 2192 | 2193 | ||
| 2193 | void __init init_sched_rt_class(void) | 2194 | void __init init_sched_rt_class(void) |
| @@ -2218,7 +2219,7 @@ static void switched_to_rt(struct rq *rq, struct task_struct *p) | |||
| 2218 | if (task_on_rq_queued(p) && rq->curr != p) { | 2219 | if (task_on_rq_queued(p) && rq->curr != p) { |
| 2219 | #ifdef CONFIG_SMP | 2220 | #ifdef CONFIG_SMP |
| 2220 | if (p->nr_cpus_allowed > 1 && rq->rt.overloaded) | 2221 | if (p->nr_cpus_allowed > 1 && rq->rt.overloaded) |
| 2221 | queue_push_tasks(rq); | 2222 | rt_queue_push_tasks(rq); |
| 2222 | #endif /* CONFIG_SMP */ | 2223 | #endif /* CONFIG_SMP */ |
| 2223 | if (p->prio < rq->curr->prio && cpu_online(cpu_of(rq))) | 2224 | if (p->prio < rq->curr->prio && cpu_online(cpu_of(rq))) |
| 2224 | resched_curr(rq); | 2225 | resched_curr(rq); |
| @@ -2242,7 +2243,7 @@ prio_changed_rt(struct rq *rq, struct task_struct *p, int oldprio) | |||
| 2242 | * may need to pull tasks to this runqueue. | 2243 | * may need to pull tasks to this runqueue. |
| 2243 | */ | 2244 | */ |
| 2244 | if (oldprio < p->prio) | 2245 | if (oldprio < p->prio) |
| 2245 | queue_pull_task(rq); | 2246 | rt_queue_pull_task(rq); |
| 2246 | 2247 | ||
| 2247 | /* | 2248 | /* |
| 2248 | * If there's a higher priority task waiting to run | 2249 | * If there's a higher priority task waiting to run |
| @@ -2292,6 +2293,14 @@ static void watchdog(struct rq *rq, struct task_struct *p) | |||
| 2292 | static inline void watchdog(struct rq *rq, struct task_struct *p) { } | 2293 | static inline void watchdog(struct rq *rq, struct task_struct *p) { } |
| 2293 | #endif | 2294 | #endif |
| 2294 | 2295 | ||
| 2296 | /* | ||
| 2297 | * scheduler tick hitting a task of our scheduling class. | ||
| 2298 | * | ||
| 2299 | * NOTE: This function can be called remotely by the tick offload that | ||
| 2300 | * goes along full dynticks. Therefore no local assumption can be made | ||
| 2301 | * and everything must be accessed through the @rq and @p passed in | ||
| 2302 | * parameters. | ||
| 2303 | */ | ||
| 2295 | static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued) | 2304 | static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued) |
| 2296 | { | 2305 | { |
| 2297 | struct sched_rt_entity *rt_se = &p->rt; | 2306 | struct sched_rt_entity *rt_se = &p->rt; |
| @@ -2685,6 +2694,7 @@ int sched_rr_handler(struct ctl_table *table, int write, | |||
| 2685 | msecs_to_jiffies(sysctl_sched_rr_timeslice); | 2694 | msecs_to_jiffies(sysctl_sched_rr_timeslice); |
| 2686 | } | 2695 | } |
| 2687 | mutex_unlock(&mutex); | 2696 | mutex_unlock(&mutex); |
| 2697 | |||
| 2688 | return ret; | 2698 | return ret; |
| 2689 | } | 2699 | } |
| 2690 | 2700 | ||
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index fb5fc458547f..c3deaee7a7a2 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h | |||
| @@ -1,39 +1,73 @@ | |||
| 1 | /* SPDX-License-Identifier: GPL-2.0 */ | 1 | /* SPDX-License-Identifier: GPL-2.0 */ |
| 2 | 2 | /* | |
| 3 | * Scheduler internal types and methods: | ||
| 4 | */ | ||
| 3 | #include <linux/sched.h> | 5 | #include <linux/sched.h> |
| 6 | |||
| 4 | #include <linux/sched/autogroup.h> | 7 | #include <linux/sched/autogroup.h> |
| 5 | #include <linux/sched/sysctl.h> | ||
| 6 | #include <linux/sched/topology.h> | ||
| 7 | #include <linux/sched/rt.h> | ||
| 8 | #include <linux/sched/deadline.h> | ||
| 9 | #include <linux/sched/clock.h> | 8 | #include <linux/sched/clock.h> |
| 10 | #include <linux/sched/wake_q.h> | 9 | #include <linux/sched/coredump.h> |
| 11 | #include <linux/sched/signal.h> | ||
| 12 | #include <linux/sched/numa_balancing.h> | ||
| 13 | #include <linux/sched/mm.h> | ||
| 14 | #include <linux/sched/cpufreq.h> | 10 | #include <linux/sched/cpufreq.h> |
| 15 | #include <linux/sched/stat.h> | 11 | #include <linux/sched/cputime.h> |
| 16 | #include <linux/sched/nohz.h> | 12 | #include <linux/sched/deadline.h> |
| 17 | #include <linux/sched/debug.h> | 13 | #include <linux/sched/debug.h> |
| 18 | #include <linux/sched/hotplug.h> | 14 | #include <linux/sched/hotplug.h> |
| 15 | #include <linux/sched/idle.h> | ||
| 16 | #include <linux/sched/init.h> | ||
| 17 | #include <linux/sched/isolation.h> | ||
| 18 | #include <linux/sched/jobctl.h> | ||
| 19 | #include <linux/sched/loadavg.h> | ||
| 20 | #include <linux/sched/mm.h> | ||
| 21 | #include <linux/sched/nohz.h> | ||
| 22 | #include <linux/sched/numa_balancing.h> | ||
| 23 | #include <linux/sched/prio.h> | ||
| 24 | #include <linux/sched/rt.h> | ||
| 25 | #include <linux/sched/signal.h> | ||
| 26 | #include <linux/sched/stat.h> | ||
| 27 | #include <linux/sched/sysctl.h> | ||
| 19 | #include <linux/sched/task.h> | 28 | #include <linux/sched/task.h> |
| 20 | #include <linux/sched/task_stack.h> | 29 | #include <linux/sched/task_stack.h> |
| 21 | #include <linux/sched/cputime.h> | 30 | #include <linux/sched/topology.h> |
| 22 | #include <linux/sched/init.h> | 31 | #include <linux/sched/user.h> |
| 32 | #include <linux/sched/wake_q.h> | ||
| 33 | #include <linux/sched/xacct.h> | ||
| 34 | |||
| 35 | #include <uapi/linux/sched/types.h> | ||
| 23 | 36 | ||
| 24 | #include <linux/u64_stats_sync.h> | ||
| 25 | #include <linux/kernel_stat.h> | ||
| 26 | #include <linux/binfmts.h> | 37 | #include <linux/binfmts.h> |
| 27 | #include <linux/mutex.h> | 38 | #include <linux/blkdev.h> |
| 28 | #include <linux/spinlock.h> | 39 | #include <linux/compat.h> |
| 40 | #include <linux/context_tracking.h> | ||
| 41 | #include <linux/cpufreq.h> | ||
| 42 | #include <linux/cpuidle.h> | ||
| 43 | #include <linux/cpuset.h> | ||
| 44 | #include <linux/ctype.h> | ||
| 45 | #include <linux/debugfs.h> | ||
| 46 | #include <linux/delayacct.h> | ||
| 47 | #include <linux/init_task.h> | ||
| 48 | #include <linux/kprobes.h> | ||
| 49 | #include <linux/kthread.h> | ||
| 50 | #include <linux/membarrier.h> | ||
| 51 | #include <linux/migrate.h> | ||
| 52 | #include <linux/mmu_context.h> | ||
| 53 | #include <linux/nmi.h> | ||
| 54 | #include <linux/proc_fs.h> | ||
| 55 | #include <linux/prefetch.h> | ||
| 56 | #include <linux/profile.h> | ||
| 57 | #include <linux/rcupdate_wait.h> | ||
| 58 | #include <linux/security.h> | ||
| 59 | #include <linux/stackprotector.h> | ||
| 29 | #include <linux/stop_machine.h> | 60 | #include <linux/stop_machine.h> |
| 30 | #include <linux/irq_work.h> | 61 | #include <linux/suspend.h> |
| 31 | #include <linux/tick.h> | 62 | #include <linux/swait.h> |
| 32 | #include <linux/slab.h> | 63 | #include <linux/syscalls.h> |
| 33 | #include <linux/cgroup.h> | 64 | #include <linux/task_work.h> |
| 65 | #include <linux/tsacct_kern.h> | ||
| 66 | |||
| 67 | #include <asm/tlb.h> | ||
| 34 | 68 | ||
| 35 | #ifdef CONFIG_PARAVIRT | 69 | #ifdef CONFIG_PARAVIRT |
| 36 | #include <asm/paravirt.h> | 70 | # include <asm/paravirt.h> |
| 37 | #endif | 71 | #endif |
| 38 | 72 | ||
| 39 | #include "cpupri.h" | 73 | #include "cpupri.h" |
| @@ -79,11 +113,11 @@ static inline void cpu_load_update_active(struct rq *this_rq) { } | |||
| 79 | * and does not change the user-interface for setting shares/weights. | 113 | * and does not change the user-interface for setting shares/weights. |
| 80 | * | 114 | * |
| 81 | * We increase resolution only if we have enough bits to allow this increased | 115 | * We increase resolution only if we have enough bits to allow this increased |
| 82 | * resolution (i.e. 64bit). The costs for increasing resolution when 32bit are | 116 | * resolution (i.e. 64-bit). The costs for increasing resolution when 32-bit |
| 83 | * pretty high and the returns do not justify the increased costs. | 117 | * are pretty high and the returns do not justify the increased costs. |
| 84 | * | 118 | * |
| 85 | * Really only required when CONFIG_FAIR_GROUP_SCHED is also set, but to | 119 | * Really only required when CONFIG_FAIR_GROUP_SCHED=y is also set, but to |
| 86 | * increase coverage and consistency always enable it on 64bit platforms. | 120 | * increase coverage and consistency always enable it on 64-bit platforms. |
| 87 | */ | 121 | */ |
| 88 | #ifdef CONFIG_64BIT | 122 | #ifdef CONFIG_64BIT |
| 89 | # define NICE_0_LOAD_SHIFT (SCHED_FIXEDPOINT_SHIFT + SCHED_FIXEDPOINT_SHIFT) | 123 | # define NICE_0_LOAD_SHIFT (SCHED_FIXEDPOINT_SHIFT + SCHED_FIXEDPOINT_SHIFT) |
| @@ -111,16 +145,12 @@ static inline void cpu_load_update_active(struct rq *this_rq) { } | |||
| 111 | * 10 -> just above 1us | 145 | * 10 -> just above 1us |
| 112 | * 9 -> just above 0.5us | 146 | * 9 -> just above 0.5us |
| 113 | */ | 147 | */ |
| 114 | #define DL_SCALE (10) | 148 | #define DL_SCALE 10 |
| 115 | 149 | ||
| 116 | /* | 150 | /* |
| 117 | * These are the 'tuning knobs' of the scheduler: | 151 | * Single value that denotes runtime == period, ie unlimited time. |
| 118 | */ | 152 | */ |
| 119 | 153 | #define RUNTIME_INF ((u64)~0ULL) | |
| 120 | /* | ||
| 121 | * single value that denotes runtime == period, ie unlimited time. | ||
| 122 | */ | ||
| 123 | #define RUNTIME_INF ((u64)~0ULL) | ||
| 124 | 154 | ||
| 125 | static inline int idle_policy(int policy) | 155 | static inline int idle_policy(int policy) |
| 126 | { | 156 | { |
| @@ -235,9 +265,9 @@ void __dl_clear_params(struct task_struct *p); | |||
| 235 | * control. | 265 | * control. |
| 236 | */ | 266 | */ |
| 237 | struct dl_bandwidth { | 267 | struct dl_bandwidth { |
| 238 | raw_spinlock_t dl_runtime_lock; | 268 | raw_spinlock_t dl_runtime_lock; |
| 239 | u64 dl_runtime; | 269 | u64 dl_runtime; |
| 240 | u64 dl_period; | 270 | u64 dl_period; |
| 241 | }; | 271 | }; |
| 242 | 272 | ||
| 243 | static inline int dl_bandwidth_enabled(void) | 273 | static inline int dl_bandwidth_enabled(void) |
| @@ -246,8 +276,9 @@ static inline int dl_bandwidth_enabled(void) | |||
| 246 | } | 276 | } |
| 247 | 277 | ||
| 248 | struct dl_bw { | 278 | struct dl_bw { |
| 249 | raw_spinlock_t lock; | 279 | raw_spinlock_t lock; |
| 250 | u64 bw, total_bw; | 280 | u64 bw; |
| 281 | u64 total_bw; | ||
| 251 | }; | 282 | }; |
| 252 | 283 | ||
| 253 | static inline void __dl_update(struct dl_bw *dl_b, s64 bw); | 284 | static inline void __dl_update(struct dl_bw *dl_b, s64 bw); |
| @@ -273,20 +304,17 @@ bool __dl_overflow(struct dl_bw *dl_b, int cpus, u64 old_bw, u64 new_bw) | |||
| 273 | dl_b->bw * cpus < dl_b->total_bw - old_bw + new_bw; | 304 | dl_b->bw * cpus < dl_b->total_bw - old_bw + new_bw; |
| 274 | } | 305 | } |
| 275 | 306 | ||
| 276 | void dl_change_utilization(struct task_struct *p, u64 new_bw); | 307 | extern void dl_change_utilization(struct task_struct *p, u64 new_bw); |
| 277 | extern void init_dl_bw(struct dl_bw *dl_b); | 308 | extern void init_dl_bw(struct dl_bw *dl_b); |
| 278 | extern int sched_dl_global_validate(void); | 309 | extern int sched_dl_global_validate(void); |
| 279 | extern void sched_dl_do_global(void); | 310 | extern void sched_dl_do_global(void); |
| 280 | extern int sched_dl_overflow(struct task_struct *p, int policy, | 311 | extern int sched_dl_overflow(struct task_struct *p, int policy, const struct sched_attr *attr); |
| 281 | const struct sched_attr *attr); | ||
| 282 | extern void __setparam_dl(struct task_struct *p, const struct sched_attr *attr); | 312 | extern void __setparam_dl(struct task_struct *p, const struct sched_attr *attr); |
| 283 | extern void __getparam_dl(struct task_struct *p, struct sched_attr *attr); | 313 | extern void __getparam_dl(struct task_struct *p, struct sched_attr *attr); |
| 284 | extern bool __checkparam_dl(const struct sched_attr *attr); | 314 | extern bool __checkparam_dl(const struct sched_attr *attr); |
| 285 | extern bool dl_param_changed(struct task_struct *p, const struct sched_attr *attr); | 315 | extern bool dl_param_changed(struct task_struct *p, const struct sched_attr *attr); |
| 286 | extern int dl_task_can_attach(struct task_struct *p, | 316 | extern int dl_task_can_attach(struct task_struct *p, const struct cpumask *cs_cpus_allowed); |
| 287 | const struct cpumask *cs_cpus_allowed); | 317 | extern int dl_cpuset_cpumask_can_shrink(const struct cpumask *cur, const struct cpumask *trial); |
| 288 | extern int dl_cpuset_cpumask_can_shrink(const struct cpumask *cur, | ||
| 289 | const struct cpumask *trial); | ||
| 290 | extern bool dl_cpu_busy(unsigned int cpu); | 318 | extern bool dl_cpu_busy(unsigned int cpu); |
| 291 | 319 | ||
| 292 | #ifdef CONFIG_CGROUP_SCHED | 320 | #ifdef CONFIG_CGROUP_SCHED |
| @@ -300,32 +328,36 @@ extern struct list_head task_groups; | |||
| 300 | 328 | ||
| 301 | struct cfs_bandwidth { | 329 | struct cfs_bandwidth { |
| 302 | #ifdef CONFIG_CFS_BANDWIDTH | 330 | #ifdef CONFIG_CFS_BANDWIDTH |
| 303 | raw_spinlock_t lock; | 331 | raw_spinlock_t lock; |
| 304 | ktime_t period; | 332 | ktime_t period; |
| 305 | u64 quota, runtime; | 333 | u64 quota; |
| 306 | s64 hierarchical_quota; | 334 | u64 runtime; |
| 307 | u64 runtime_expires; | 335 | s64 hierarchical_quota; |
| 308 | 336 | u64 runtime_expires; | |
| 309 | int idle, period_active; | 337 | |
| 310 | struct hrtimer period_timer, slack_timer; | 338 | int idle; |
| 311 | struct list_head throttled_cfs_rq; | 339 | int period_active; |
| 312 | 340 | struct hrtimer period_timer; | |
| 313 | /* statistics */ | 341 | struct hrtimer slack_timer; |
| 314 | int nr_periods, nr_throttled; | 342 | struct list_head throttled_cfs_rq; |
| 315 | u64 throttled_time; | 343 | |
| 344 | /* Statistics: */ | ||
| 345 | int nr_periods; | ||
| 346 | int nr_throttled; | ||
| 347 | u64 throttled_time; | ||
| 316 | #endif | 348 | #endif |
| 317 | }; | 349 | }; |
| 318 | 350 | ||
| 319 | /* task group related information */ | 351 | /* Task group related information */ |
| 320 | struct task_group { | 352 | struct task_group { |
| 321 | struct cgroup_subsys_state css; | 353 | struct cgroup_subsys_state css; |
| 322 | 354 | ||
| 323 | #ifdef CONFIG_FAIR_GROUP_SCHED | 355 | #ifdef CONFIG_FAIR_GROUP_SCHED |
| 324 | /* schedulable entities of this group on each cpu */ | 356 | /* schedulable entities of this group on each CPU */ |
| 325 | struct sched_entity **se; | 357 | struct sched_entity **se; |
| 326 | /* runqueue "owned" by this group on each cpu */ | 358 | /* runqueue "owned" by this group on each CPU */ |
| 327 | struct cfs_rq **cfs_rq; | 359 | struct cfs_rq **cfs_rq; |
| 328 | unsigned long shares; | 360 | unsigned long shares; |
| 329 | 361 | ||
| 330 | #ifdef CONFIG_SMP | 362 | #ifdef CONFIG_SMP |
| 331 | /* | 363 | /* |
| @@ -333,29 +365,29 @@ struct task_group { | |||
| 333 | * it in its own cacheline separated from the fields above which | 365 | * it in its own cacheline separated from the fields above which |
| 334 | * will also be accessed at each tick. | 366 | * will also be accessed at each tick. |
| 335 | */ | 367 | */ |
| 336 | atomic_long_t load_avg ____cacheline_aligned; | 368 | atomic_long_t load_avg ____cacheline_aligned; |
| 337 | #endif | 369 | #endif |
| 338 | #endif | 370 | #endif |
| 339 | 371 | ||
| 340 | #ifdef CONFIG_RT_GROUP_SCHED | 372 | #ifdef CONFIG_RT_GROUP_SCHED |
| 341 | struct sched_rt_entity **rt_se; | 373 | struct sched_rt_entity **rt_se; |
| 342 | struct rt_rq **rt_rq; | 374 | struct rt_rq **rt_rq; |
| 343 | 375 | ||
| 344 | struct rt_bandwidth rt_bandwidth; | 376 | struct rt_bandwidth rt_bandwidth; |
| 345 | #endif | 377 | #endif |
| 346 | 378 | ||
| 347 | struct rcu_head rcu; | 379 | struct rcu_head rcu; |
| 348 | struct list_head list; | 380 | struct list_head list; |
| 349 | 381 | ||
| 350 | struct task_group *parent; | 382 | struct task_group *parent; |
| 351 | struct list_head siblings; | 383 | struct list_head siblings; |
| 352 | struct list_head children; | 384 | struct list_head children; |
| 353 | 385 | ||
| 354 | #ifdef CONFIG_SCHED_AUTOGROUP | 386 | #ifdef CONFIG_SCHED_AUTOGROUP |
| 355 | struct autogroup *autogroup; | 387 | struct autogroup *autogroup; |
| 356 | #endif | 388 | #endif |
| 357 | 389 | ||
| 358 | struct cfs_bandwidth cfs_bandwidth; | 390 | struct cfs_bandwidth cfs_bandwidth; |
| 359 | }; | 391 | }; |
| 360 | 392 | ||
| 361 | #ifdef CONFIG_FAIR_GROUP_SCHED | 393 | #ifdef CONFIG_FAIR_GROUP_SCHED |
| @@ -369,8 +401,8 @@ struct task_group { | |||
| 369 | * (The default weight is 1024 - so there's no practical | 401 | * (The default weight is 1024 - so there's no practical |
| 370 | * limitation from this.) | 402 | * limitation from this.) |
| 371 | */ | 403 | */ |
| 372 | #define MIN_SHARES (1UL << 1) | 404 | #define MIN_SHARES (1UL << 1) |
| 373 | #define MAX_SHARES (1UL << 18) | 405 | #define MAX_SHARES (1UL << 18) |
| 374 | #endif | 406 | #endif |
| 375 | 407 | ||
| 376 | typedef int (*tg_visitor)(struct task_group *, void *); | 408 | typedef int (*tg_visitor)(struct task_group *, void *); |
| @@ -443,35 +475,39 @@ struct cfs_bandwidth { }; | |||
| 443 | 475 | ||
| 444 | /* CFS-related fields in a runqueue */ | 476 | /* CFS-related fields in a runqueue */ |
| 445 | struct cfs_rq { | 477 | struct cfs_rq { |
| 446 | struct load_weight load; | 478 | struct load_weight load; |
| 447 | unsigned long runnable_weight; | 479 | unsigned long runnable_weight; |
| 448 | unsigned int nr_running, h_nr_running; | 480 | unsigned int nr_running; |
| 481 | unsigned int h_nr_running; | ||
| 449 | 482 | ||
| 450 | u64 exec_clock; | 483 | u64 exec_clock; |
| 451 | u64 min_vruntime; | 484 | u64 min_vruntime; |
| 452 | #ifndef CONFIG_64BIT | 485 | #ifndef CONFIG_64BIT |
| 453 | u64 min_vruntime_copy; | 486 | u64 min_vruntime_copy; |
| 454 | #endif | 487 | #endif |
| 455 | 488 | ||
| 456 | struct rb_root_cached tasks_timeline; | 489 | struct rb_root_cached tasks_timeline; |
| 457 | 490 | ||
| 458 | /* | 491 | /* |
| 459 | * 'curr' points to currently running entity on this cfs_rq. | 492 | * 'curr' points to currently running entity on this cfs_rq. |
| 460 | * It is set to NULL otherwise (i.e. when none are currently running). | 493 | * It is set to NULL otherwise (i.e. when none are currently running). |
| 461 | */ | 494 | */ |
| 462 | struct sched_entity *curr, *next, *last, *skip; | 495 | struct sched_entity *curr; |
| 496 | struct sched_entity *next; | ||
| 497 | struct sched_entity *last; | ||
| 498 | struct sched_entity *skip; | ||
| 463 | 499 | ||
| 464 | #ifdef CONFIG_SCHED_DEBUG | 500 | #ifdef CONFIG_SCHED_DEBUG |
| 465 | unsigned int nr_spread_over; | 501 | unsigned int nr_spread_over; |
| 466 | #endif | 502 | #endif |
| 467 | 503 | ||
| 468 | #ifdef CONFIG_SMP | 504 | #ifdef CONFIG_SMP |
| 469 | /* | 505 | /* |
| 470 | * CFS load tracking | 506 | * CFS load tracking |
| 471 | */ | 507 | */ |
| 472 | struct sched_avg avg; | 508 | struct sched_avg avg; |
| 473 | #ifndef CONFIG_64BIT | 509 | #ifndef CONFIG_64BIT |
| 474 | u64 load_last_update_time_copy; | 510 | u64 load_last_update_time_copy; |
| 475 | #endif | 511 | #endif |
| 476 | struct { | 512 | struct { |
| 477 | raw_spinlock_t lock ____cacheline_aligned; | 513 | raw_spinlock_t lock ____cacheline_aligned; |
| @@ -482,9 +518,9 @@ struct cfs_rq { | |||
| 482 | } removed; | 518 | } removed; |
| 483 | 519 | ||
| 484 | #ifdef CONFIG_FAIR_GROUP_SCHED | 520 | #ifdef CONFIG_FAIR_GROUP_SCHED |
| 485 | unsigned long tg_load_avg_contrib; | 521 | unsigned long tg_load_avg_contrib; |
| 486 | long propagate; | 522 | long propagate; |
| 487 | long prop_runnable_sum; | 523 | long prop_runnable_sum; |
| 488 | 524 | ||
| 489 | /* | 525 | /* |
| 490 | * h_load = weight * f(tg) | 526 | * h_load = weight * f(tg) |
| @@ -492,36 +528,38 @@ struct cfs_rq { | |||
| 492 | * Where f(tg) is the recursive weight fraction assigned to | 528 | * Where f(tg) is the recursive weight fraction assigned to |
| 493 | * this group. | 529 | * this group. |
| 494 | */ | 530 | */ |
| 495 | unsigned long h_load; | 531 | unsigned long h_load; |
| 496 | u64 last_h_load_update; | 532 | u64 last_h_load_update; |
| 497 | struct sched_entity *h_load_next; | 533 | struct sched_entity *h_load_next; |
| 498 | #endif /* CONFIG_FAIR_GROUP_SCHED */ | 534 | #endif /* CONFIG_FAIR_GROUP_SCHED */ |
| 499 | #endif /* CONFIG_SMP */ | 535 | #endif /* CONFIG_SMP */ |
| 500 | 536 | ||
| 501 | #ifdef CONFIG_FAIR_GROUP_SCHED | 537 | #ifdef CONFIG_FAIR_GROUP_SCHED |
| 502 | struct rq *rq; /* cpu runqueue to which this cfs_rq is attached */ | 538 | struct rq *rq; /* CPU runqueue to which this cfs_rq is attached */ |
| 503 | 539 | ||
| 504 | /* | 540 | /* |
| 505 | * leaf cfs_rqs are those that hold tasks (lowest schedulable entity in | 541 | * leaf cfs_rqs are those that hold tasks (lowest schedulable entity in |
| 506 | * a hierarchy). Non-leaf lrqs hold other higher schedulable entities | 542 | * a hierarchy). Non-leaf lrqs hold other higher schedulable entities |
| 507 | * (like users, containers etc.) | 543 | * (like users, containers etc.) |
| 508 | * | 544 | * |
| 509 | * leaf_cfs_rq_list ties together list of leaf cfs_rq's in a cpu. This | 545 | * leaf_cfs_rq_list ties together list of leaf cfs_rq's in a CPU. |
| 510 | * list is used during load balance. | 546 | * This list is used during load balance. |
| 511 | */ | 547 | */ |
| 512 | int on_list; | 548 | int on_list; |
| 513 | struct list_head leaf_cfs_rq_list; | 549 | struct list_head leaf_cfs_rq_list; |
| 514 | struct task_group *tg; /* group that "owns" this runqueue */ | 550 | struct task_group *tg; /* group that "owns" this runqueue */ |
| 515 | 551 | ||
| 516 | #ifdef CONFIG_CFS_BANDWIDTH | 552 | #ifdef CONFIG_CFS_BANDWIDTH |
| 517 | int runtime_enabled; | 553 | int runtime_enabled; |
| 518 | u64 runtime_expires; | 554 | u64 runtime_expires; |
| 519 | s64 runtime_remaining; | 555 | s64 runtime_remaining; |
| 520 | 556 | ||
| 521 | u64 throttled_clock, throttled_clock_task; | 557 | u64 throttled_clock; |
| 522 | u64 throttled_clock_task_time; | 558 | u64 throttled_clock_task; |
| 523 | int throttled, throttle_count; | 559 | u64 throttled_clock_task_time; |
| 524 | struct list_head throttled_list; | 560 | int throttled; |
| 561 | int throttle_count; | ||
| 562 | struct list_head throttled_list; | ||
| 525 | #endif /* CONFIG_CFS_BANDWIDTH */ | 563 | #endif /* CONFIG_CFS_BANDWIDTH */ |
| 526 | #endif /* CONFIG_FAIR_GROUP_SCHED */ | 564 | #endif /* CONFIG_FAIR_GROUP_SCHED */ |
| 527 | }; | 565 | }; |
| @@ -538,45 +576,45 @@ static inline int rt_bandwidth_enabled(void) | |||
| 538 | 576 | ||
| 539 | /* Real-Time classes' related field in a runqueue: */ | 577 | /* Real-Time classes' related field in a runqueue: */ |
| 540 | struct rt_rq { | 578 | struct rt_rq { |
| 541 | struct rt_prio_array active; | 579 | struct rt_prio_array active; |
| 542 | unsigned int rt_nr_running; | 580 | unsigned int rt_nr_running; |
| 543 | unsigned int rr_nr_running; | 581 | unsigned int rr_nr_running; |
| 544 | #if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED | 582 | #if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED |
| 545 | struct { | 583 | struct { |
| 546 | int curr; /* highest queued rt task prio */ | 584 | int curr; /* highest queued rt task prio */ |
| 547 | #ifdef CONFIG_SMP | 585 | #ifdef CONFIG_SMP |
| 548 | int next; /* next highest */ | 586 | int next; /* next highest */ |
| 549 | #endif | 587 | #endif |
| 550 | } highest_prio; | 588 | } highest_prio; |
| 551 | #endif | 589 | #endif |
| 552 | #ifdef CONFIG_SMP | 590 | #ifdef CONFIG_SMP |
| 553 | unsigned long rt_nr_migratory; | 591 | unsigned long rt_nr_migratory; |
| 554 | unsigned long rt_nr_total; | 592 | unsigned long rt_nr_total; |
| 555 | int overloaded; | 593 | int overloaded; |
| 556 | struct plist_head pushable_tasks; | 594 | struct plist_head pushable_tasks; |
| 557 | #endif /* CONFIG_SMP */ | 595 | #endif /* CONFIG_SMP */ |
| 558 | int rt_queued; | 596 | int rt_queued; |
| 559 | 597 | ||
| 560 | int rt_throttled; | 598 | int rt_throttled; |
| 561 | u64 rt_time; | 599 | u64 rt_time; |
| 562 | u64 rt_runtime; | 600 | u64 rt_runtime; |
| 563 | /* Nests inside the rq lock: */ | 601 | /* Nests inside the rq lock: */ |
| 564 | raw_spinlock_t rt_runtime_lock; | 602 | raw_spinlock_t rt_runtime_lock; |
| 565 | 603 | ||
| 566 | #ifdef CONFIG_RT_GROUP_SCHED | 604 | #ifdef CONFIG_RT_GROUP_SCHED |
| 567 | unsigned long rt_nr_boosted; | 605 | unsigned long rt_nr_boosted; |
| 568 | 606 | ||
| 569 | struct rq *rq; | 607 | struct rq *rq; |
| 570 | struct task_group *tg; | 608 | struct task_group *tg; |
| 571 | #endif | 609 | #endif |
| 572 | }; | 610 | }; |
| 573 | 611 | ||
| 574 | /* Deadline class' related fields in a runqueue */ | 612 | /* Deadline class' related fields in a runqueue */ |
| 575 | struct dl_rq { | 613 | struct dl_rq { |
| 576 | /* runqueue is an rbtree, ordered by deadline */ | 614 | /* runqueue is an rbtree, ordered by deadline */ |
| 577 | struct rb_root_cached root; | 615 | struct rb_root_cached root; |
| 578 | 616 | ||
| 579 | unsigned long dl_nr_running; | 617 | unsigned long dl_nr_running; |
| 580 | 618 | ||
| 581 | #ifdef CONFIG_SMP | 619 | #ifdef CONFIG_SMP |
| 582 | /* | 620 | /* |
| @@ -586,28 +624,28 @@ struct dl_rq { | |||
| 586 | * should migrate somewhere else. | 624 | * should migrate somewhere else. |
| 587 | */ | 625 | */ |
| 588 | struct { | 626 | struct { |
| 589 | u64 curr; | 627 | u64 curr; |
| 590 | u64 next; | 628 | u64 next; |
| 591 | } earliest_dl; | 629 | } earliest_dl; |
| 592 | 630 | ||
| 593 | unsigned long dl_nr_migratory; | 631 | unsigned long dl_nr_migratory; |
| 594 | int overloaded; | 632 | int overloaded; |
| 595 | 633 | ||
| 596 | /* | 634 | /* |
| 597 | * Tasks on this rq that can be pushed away. They are kept in | 635 | * Tasks on this rq that can be pushed away. They are kept in |
| 598 | * an rb-tree, ordered by tasks' deadlines, with caching | 636 | * an rb-tree, ordered by tasks' deadlines, with caching |
| 599 | * of the leftmost (earliest deadline) element. | 637 | * of the leftmost (earliest deadline) element. |
| 600 | */ | 638 | */ |
| 601 | struct rb_root_cached pushable_dl_tasks_root; | 639 | struct rb_root_cached pushable_dl_tasks_root; |
| 602 | #else | 640 | #else |
| 603 | struct dl_bw dl_bw; | 641 | struct dl_bw dl_bw; |
| 604 | #endif | 642 | #endif |
| 605 | /* | 643 | /* |
| 606 | * "Active utilization" for this runqueue: increased when a | 644 | * "Active utilization" for this runqueue: increased when a |
| 607 | * task wakes up (becomes TASK_RUNNING) and decreased when a | 645 | * task wakes up (becomes TASK_RUNNING) and decreased when a |
| 608 | * task blocks | 646 | * task blocks |
| 609 | */ | 647 | */ |
| 610 | u64 running_bw; | 648 | u64 running_bw; |
| 611 | 649 | ||
| 612 | /* | 650 | /* |
| 613 | * Utilization of the tasks "assigned" to this runqueue (including | 651 | * Utilization of the tasks "assigned" to this runqueue (including |
| @@ -618,14 +656,14 @@ struct dl_rq { | |||
| 618 | * This is needed to compute the "inactive utilization" for the | 656 | * This is needed to compute the "inactive utilization" for the |
| 619 | * runqueue (inactive utilization = this_bw - running_bw). | 657 | * runqueue (inactive utilization = this_bw - running_bw). |
| 620 | */ | 658 | */ |
| 621 | u64 this_bw; | 659 | u64 this_bw; |
| 622 | u64 extra_bw; | 660 | u64 extra_bw; |
| 623 | 661 | ||
| 624 | /* | 662 | /* |
| 625 | * Inverse of the fraction of CPU utilization that can be reclaimed | 663 | * Inverse of the fraction of CPU utilization that can be reclaimed |
| 626 | * by the GRUB algorithm. | 664 | * by the GRUB algorithm. |
| 627 | */ | 665 | */ |
| 628 | u64 bw_ratio; | 666 | u64 bw_ratio; |
| 629 | }; | 667 | }; |
| 630 | 668 | ||
| 631 | #ifdef CONFIG_SMP | 669 | #ifdef CONFIG_SMP |
| @@ -638,51 +676,51 @@ static inline bool sched_asym_prefer(int a, int b) | |||
| 638 | /* | 676 | /* |
| 639 | * We add the notion of a root-domain which will be used to define per-domain | 677 | * We add the notion of a root-domain which will be used to define per-domain |
| 640 | * variables. Each exclusive cpuset essentially defines an island domain by | 678 | * variables. Each exclusive cpuset essentially defines an island domain by |
| 641 | * fully partitioning the member cpus from any other cpuset. Whenever a new | 679 | * fully partitioning the member CPUs from any other cpuset. Whenever a new |
| 642 | * exclusive cpuset is created, we also create and attach a new root-domain | 680 | * exclusive cpuset is created, we also create and attach a new root-domain |
| 643 | * object. | 681 | * object. |
| 644 | * | 682 | * |
| 645 | */ | 683 | */ |
| 646 | struct root_domain { | 684 | struct root_domain { |
| 647 | atomic_t refcount; | 685 | atomic_t refcount; |
| 648 | atomic_t rto_count; | 686 | atomic_t rto_count; |
| 649 | struct rcu_head rcu; | 687 | struct rcu_head rcu; |
| 650 | cpumask_var_t span; | 688 | cpumask_var_t span; |
| 651 | cpumask_var_t online; | 689 | cpumask_var_t online; |
| 652 | 690 | ||
| 653 | /* Indicate more than one runnable task for any CPU */ | 691 | /* Indicate more than one runnable task for any CPU */ |
| 654 | bool overload; | 692 | bool overload; |
| 655 | 693 | ||
| 656 | /* | 694 | /* |
| 657 | * The bit corresponding to a CPU gets set here if such CPU has more | 695 | * The bit corresponding to a CPU gets set here if such CPU has more |
| 658 | * than one runnable -deadline task (as it is below for RT tasks). | 696 | * than one runnable -deadline task (as it is below for RT tasks). |
| 659 | */ | 697 | */ |
| 660 | cpumask_var_t dlo_mask; | 698 | cpumask_var_t dlo_mask; |
| 661 | atomic_t dlo_count; | 699 | atomic_t dlo_count; |
| 662 | struct dl_bw dl_bw; | 700 | struct dl_bw dl_bw; |
| 663 | struct cpudl cpudl; | 701 | struct cpudl cpudl; |
| 664 | 702 | ||
| 665 | #ifdef HAVE_RT_PUSH_IPI | 703 | #ifdef HAVE_RT_PUSH_IPI |
| 666 | /* | 704 | /* |
| 667 | * For IPI pull requests, loop across the rto_mask. | 705 | * For IPI pull requests, loop across the rto_mask. |
| 668 | */ | 706 | */ |
| 669 | struct irq_work rto_push_work; | 707 | struct irq_work rto_push_work; |
| 670 | raw_spinlock_t rto_lock; | 708 | raw_spinlock_t rto_lock; |
| 671 | /* These are only updated and read within rto_lock */ | 709 | /* These are only updated and read within rto_lock */ |
| 672 | int rto_loop; | 710 | int rto_loop; |
| 673 | int rto_cpu; | 711 | int rto_cpu; |
| 674 | /* These atomics are updated outside of a lock */ | 712 | /* These atomics are updated outside of a lock */ |
| 675 | atomic_t rto_loop_next; | 713 | atomic_t rto_loop_next; |
| 676 | atomic_t rto_loop_start; | 714 | atomic_t rto_loop_start; |
| 677 | #endif | 715 | #endif |
| 678 | /* | 716 | /* |
| 679 | * The "RT overload" flag: it gets set if a CPU has more than | 717 | * The "RT overload" flag: it gets set if a CPU has more than |
| 680 | * one runnable RT task. | 718 | * one runnable RT task. |
| 681 | */ | 719 | */ |
| 682 | cpumask_var_t rto_mask; | 720 | cpumask_var_t rto_mask; |
| 683 | struct cpupri cpupri; | 721 | struct cpupri cpupri; |
| 684 | 722 | ||
| 685 | unsigned long max_cpu_capacity; | 723 | unsigned long max_cpu_capacity; |
| 686 | }; | 724 | }; |
| 687 | 725 | ||
| 688 | extern struct root_domain def_root_domain; | 726 | extern struct root_domain def_root_domain; |
| @@ -708,41 +746,42 @@ extern void rto_push_irq_work_func(struct irq_work *work); | |||
| 708 | */ | 746 | */ |
| 709 | struct rq { | 747 | struct rq { |
| 710 | /* runqueue lock: */ | 748 | /* runqueue lock: */ |
| 711 | raw_spinlock_t lock; | 749 | raw_spinlock_t lock; |
| 712 | 750 | ||
| 713 | /* | 751 | /* |
| 714 | * nr_running and cpu_load should be in the same cacheline because | 752 | * nr_running and cpu_load should be in the same cacheline because |
| 715 | * remote CPUs use both these fields when doing load calculation. | 753 | * remote CPUs use both these fields when doing load calculation. |
| 716 | */ | 754 | */ |
| 717 | unsigned int nr_running; | 755 | unsigned int nr_running; |
| 718 | #ifdef CONFIG_NUMA_BALANCING | 756 | #ifdef CONFIG_NUMA_BALANCING |
| 719 | unsigned int nr_numa_running; | 757 | unsigned int nr_numa_running; |
| 720 | unsigned int nr_preferred_running; | 758 | unsigned int nr_preferred_running; |
| 721 | #endif | 759 | #endif |
| 722 | #define CPU_LOAD_IDX_MAX 5 | 760 | #define CPU_LOAD_IDX_MAX 5 |
| 723 | unsigned long cpu_load[CPU_LOAD_IDX_MAX]; | 761 | unsigned long cpu_load[CPU_LOAD_IDX_MAX]; |
| 724 | #ifdef CONFIG_NO_HZ_COMMON | 762 | #ifdef CONFIG_NO_HZ_COMMON |
| 725 | #ifdef CONFIG_SMP | 763 | #ifdef CONFIG_SMP |
| 726 | unsigned long last_load_update_tick; | 764 | unsigned long last_load_update_tick; |
| 765 | unsigned long last_blocked_load_update_tick; | ||
| 766 | unsigned int has_blocked_load; | ||
| 727 | #endif /* CONFIG_SMP */ | 767 | #endif /* CONFIG_SMP */ |
| 728 | unsigned long nohz_flags; | 768 | unsigned int nohz_tick_stopped; |
| 769 | atomic_t nohz_flags; | ||
| 729 | #endif /* CONFIG_NO_HZ_COMMON */ | 770 | #endif /* CONFIG_NO_HZ_COMMON */ |
| 730 | #ifdef CONFIG_NO_HZ_FULL | ||
| 731 | unsigned long last_sched_tick; | ||
| 732 | #endif | ||
| 733 | /* capture load from *all* tasks on this cpu: */ | ||
| 734 | struct load_weight load; | ||
| 735 | unsigned long nr_load_updates; | ||
| 736 | u64 nr_switches; | ||
| 737 | 771 | ||
| 738 | struct cfs_rq cfs; | 772 | /* capture load from *all* tasks on this CPU: */ |
| 739 | struct rt_rq rt; | 773 | struct load_weight load; |
| 740 | struct dl_rq dl; | 774 | unsigned long nr_load_updates; |
| 775 | u64 nr_switches; | ||
| 776 | |||
| 777 | struct cfs_rq cfs; | ||
| 778 | struct rt_rq rt; | ||
| 779 | struct dl_rq dl; | ||
| 741 | 780 | ||
| 742 | #ifdef CONFIG_FAIR_GROUP_SCHED | 781 | #ifdef CONFIG_FAIR_GROUP_SCHED |
| 743 | /* list of leaf cfs_rq on this cpu: */ | 782 | /* list of leaf cfs_rq on this CPU: */ |
| 744 | struct list_head leaf_cfs_rq_list; | 783 | struct list_head leaf_cfs_rq_list; |
| 745 | struct list_head *tmp_alone_branch; | 784 | struct list_head *tmp_alone_branch; |
| 746 | #endif /* CONFIG_FAIR_GROUP_SCHED */ | 785 | #endif /* CONFIG_FAIR_GROUP_SCHED */ |
| 747 | 786 | ||
| 748 | /* | 787 | /* |
| @@ -751,94 +790,98 @@ struct rq { | |||
| 751 | * one CPU and if it got migrated afterwards it may decrease | 790 | * one CPU and if it got migrated afterwards it may decrease |
| 752 | * it on another CPU. Always updated under the runqueue lock: | 791 | * it on another CPU. Always updated under the runqueue lock: |
| 753 | */ | 792 | */ |
| 754 | unsigned long nr_uninterruptible; | 793 | unsigned long nr_uninterruptible; |
| 755 | 794 | ||
| 756 | struct task_struct *curr, *idle, *stop; | 795 | struct task_struct *curr; |
| 757 | unsigned long next_balance; | 796 | struct task_struct *idle; |
| 758 | struct mm_struct *prev_mm; | 797 | struct task_struct *stop; |
| 798 | unsigned long next_balance; | ||
| 799 | struct mm_struct *prev_mm; | ||
| 759 | 800 | ||
| 760 | unsigned int clock_update_flags; | 801 | unsigned int clock_update_flags; |
| 761 | u64 clock; | 802 | u64 clock; |
| 762 | u64 clock_task; | 803 | u64 clock_task; |
| 763 | 804 | ||
| 764 | atomic_t nr_iowait; | 805 | atomic_t nr_iowait; |
| 765 | 806 | ||
| 766 | #ifdef CONFIG_SMP | 807 | #ifdef CONFIG_SMP |
| 767 | struct root_domain *rd; | 808 | struct root_domain *rd; |
| 768 | struct sched_domain *sd; | 809 | struct sched_domain *sd; |
| 769 | 810 | ||
| 770 | unsigned long cpu_capacity; | 811 | unsigned long cpu_capacity; |
| 771 | unsigned long cpu_capacity_orig; | 812 | unsigned long cpu_capacity_orig; |
| 772 | 813 | ||
| 773 | struct callback_head *balance_callback; | 814 | struct callback_head *balance_callback; |
| 815 | |||
| 816 | unsigned char idle_balance; | ||
| 774 | 817 | ||
| 775 | unsigned char idle_balance; | ||
| 776 | /* For active balancing */ | 818 | /* For active balancing */ |
| 777 | int active_balance; | 819 | int active_balance; |
| 778 | int push_cpu; | 820 | int push_cpu; |
| 779 | struct cpu_stop_work active_balance_work; | 821 | struct cpu_stop_work active_balance_work; |
| 780 | /* cpu of this runqueue: */ | 822 | |
| 781 | int cpu; | 823 | /* CPU of this runqueue: */ |
| 782 | int online; | 824 | int cpu; |
| 825 | int online; | ||
| 783 | 826 | ||
| 784 | struct list_head cfs_tasks; | 827 | struct list_head cfs_tasks; |
| 785 | 828 | ||
| 786 | u64 rt_avg; | 829 | u64 rt_avg; |
| 787 | u64 age_stamp; | 830 | u64 age_stamp; |
| 788 | u64 idle_stamp; | 831 | u64 idle_stamp; |
| 789 | u64 avg_idle; | 832 | u64 avg_idle; |
| 790 | 833 | ||
| 791 | /* This is used to determine avg_idle's max value */ | 834 | /* This is used to determine avg_idle's max value */ |
| 792 | u64 max_idle_balance_cost; | 835 | u64 max_idle_balance_cost; |
| 793 | #endif | 836 | #endif |
| 794 | 837 | ||
| 795 | #ifdef CONFIG_IRQ_TIME_ACCOUNTING | 838 | #ifdef CONFIG_IRQ_TIME_ACCOUNTING |
| 796 | u64 prev_irq_time; | 839 | u64 prev_irq_time; |
| 797 | #endif | 840 | #endif |
| 798 | #ifdef CONFIG_PARAVIRT | 841 | #ifdef CONFIG_PARAVIRT |
| 799 | u64 prev_steal_time; | 842 | u64 prev_steal_time; |
| 800 | #endif | 843 | #endif |
| 801 | #ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING | 844 | #ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING |
| 802 | u64 prev_steal_time_rq; | 845 | u64 prev_steal_time_rq; |
| 803 | #endif | 846 | #endif |
| 804 | 847 | ||
| 805 | /* calc_load related fields */ | 848 | /* calc_load related fields */ |
| 806 | unsigned long calc_load_update; | 849 | unsigned long calc_load_update; |
| 807 | long calc_load_active; | 850 | long calc_load_active; |
| 808 | 851 | ||
| 809 | #ifdef CONFIG_SCHED_HRTICK | 852 | #ifdef CONFIG_SCHED_HRTICK |
| 810 | #ifdef CONFIG_SMP | 853 | #ifdef CONFIG_SMP |
| 811 | int hrtick_csd_pending; | 854 | int hrtick_csd_pending; |
| 812 | call_single_data_t hrtick_csd; | 855 | call_single_data_t hrtick_csd; |
| 813 | #endif | 856 | #endif |
| 814 | struct hrtimer hrtick_timer; | 857 | struct hrtimer hrtick_timer; |
| 815 | #endif | 858 | #endif |
| 816 | 859 | ||
| 817 | #ifdef CONFIG_SCHEDSTATS | 860 | #ifdef CONFIG_SCHEDSTATS |
| 818 | /* latency stats */ | 861 | /* latency stats */ |
| 819 | struct sched_info rq_sched_info; | 862 | struct sched_info rq_sched_info; |
| 820 | unsigned long long rq_cpu_time; | 863 | unsigned long long rq_cpu_time; |
| 821 | /* could above be rq->cfs_rq.exec_clock + rq->rt_rq.rt_runtime ? */ | 864 | /* could above be rq->cfs_rq.exec_clock + rq->rt_rq.rt_runtime ? */ |
| 822 | 865 | ||
| 823 | /* sys_sched_yield() stats */ | 866 | /* sys_sched_yield() stats */ |
| 824 | unsigned int yld_count; | 867 | unsigned int yld_count; |
| 825 | 868 | ||
| 826 | /* schedule() stats */ | 869 | /* schedule() stats */ |
| 827 | unsigned int sched_count; | 870 | unsigned int sched_count; |
| 828 | unsigned int sched_goidle; | 871 | unsigned int sched_goidle; |
| 829 | 872 | ||
| 830 | /* try_to_wake_up() stats */ | 873 | /* try_to_wake_up() stats */ |
| 831 | unsigned int ttwu_count; | 874 | unsigned int ttwu_count; |
| 832 | unsigned int ttwu_local; | 875 | unsigned int ttwu_local; |
| 833 | #endif | 876 | #endif |
| 834 | 877 | ||
| 835 | #ifdef CONFIG_SMP | 878 | #ifdef CONFIG_SMP |
| 836 | struct llist_head wake_list; | 879 | struct llist_head wake_list; |
| 837 | #endif | 880 | #endif |
| 838 | 881 | ||
| 839 | #ifdef CONFIG_CPU_IDLE | 882 | #ifdef CONFIG_CPU_IDLE |
| 840 | /* Must be inspected within a rcu lock section */ | 883 | /* Must be inspected within a rcu lock section */ |
| 841 | struct cpuidle_state *idle_state; | 884 | struct cpuidle_state *idle_state; |
| 842 | #endif | 885 | #endif |
| 843 | }; | 886 | }; |
| 844 | 887 | ||
| @@ -904,9 +947,9 @@ static inline u64 __rq_clock_broken(struct rq *rq) | |||
| 904 | * one position though, because the next rq_unpin_lock() will shift it | 947 | * one position though, because the next rq_unpin_lock() will shift it |
| 905 | * back. | 948 | * back. |
| 906 | */ | 949 | */ |
| 907 | #define RQCF_REQ_SKIP 0x01 | 950 | #define RQCF_REQ_SKIP 0x01 |
| 908 | #define RQCF_ACT_SKIP 0x02 | 951 | #define RQCF_ACT_SKIP 0x02 |
| 909 | #define RQCF_UPDATED 0x04 | 952 | #define RQCF_UPDATED 0x04 |
| 910 | 953 | ||
| 911 | static inline void assert_clock_updated(struct rq *rq) | 954 | static inline void assert_clock_updated(struct rq *rq) |
| 912 | { | 955 | { |
| @@ -1059,12 +1102,12 @@ extern void sched_ttwu_pending(void); | |||
| 1059 | 1102 | ||
| 1060 | /** | 1103 | /** |
| 1061 | * highest_flag_domain - Return highest sched_domain containing flag. | 1104 | * highest_flag_domain - Return highest sched_domain containing flag. |
| 1062 | * @cpu: The cpu whose highest level of sched domain is to | 1105 | * @cpu: The CPU whose highest level of sched domain is to |
| 1063 | * be returned. | 1106 | * be returned. |
| 1064 | * @flag: The flag to check for the highest sched_domain | 1107 | * @flag: The flag to check for the highest sched_domain |
| 1065 | * for the given cpu. | 1108 | * for the given CPU. |
| 1066 | * | 1109 | * |
| 1067 | * Returns the highest sched_domain of a cpu which contains the given flag. | 1110 | * Returns the highest sched_domain of a CPU which contains the given flag. |
| 1068 | */ | 1111 | */ |
| 1069 | static inline struct sched_domain *highest_flag_domain(int cpu, int flag) | 1112 | static inline struct sched_domain *highest_flag_domain(int cpu, int flag) |
| 1070 | { | 1113 | { |
| @@ -1099,30 +1142,30 @@ DECLARE_PER_CPU(struct sched_domain *, sd_numa); | |||
| 1099 | DECLARE_PER_CPU(struct sched_domain *, sd_asym); | 1142 | DECLARE_PER_CPU(struct sched_domain *, sd_asym); |
| 1100 | 1143 | ||
| 1101 | struct sched_group_capacity { | 1144 | struct sched_group_capacity { |
| 1102 | atomic_t ref; | 1145 | atomic_t ref; |
| 1103 | /* | 1146 | /* |
| 1104 | * CPU capacity of this group, SCHED_CAPACITY_SCALE being max capacity | 1147 | * CPU capacity of this group, SCHED_CAPACITY_SCALE being max capacity |
| 1105 | * for a single CPU. | 1148 | * for a single CPU. |
| 1106 | */ | 1149 | */ |
| 1107 | unsigned long capacity; | 1150 | unsigned long capacity; |
| 1108 | unsigned long min_capacity; /* Min per-CPU capacity in group */ | 1151 | unsigned long min_capacity; /* Min per-CPU capacity in group */ |
| 1109 | unsigned long next_update; | 1152 | unsigned long next_update; |
| 1110 | int imbalance; /* XXX unrelated to capacity but shared group state */ | 1153 | int imbalance; /* XXX unrelated to capacity but shared group state */ |
| 1111 | 1154 | ||
| 1112 | #ifdef CONFIG_SCHED_DEBUG | 1155 | #ifdef CONFIG_SCHED_DEBUG |
| 1113 | int id; | 1156 | int id; |
| 1114 | #endif | 1157 | #endif |
| 1115 | 1158 | ||
| 1116 | unsigned long cpumask[0]; /* balance mask */ | 1159 | unsigned long cpumask[0]; /* Balance mask */ |
| 1117 | }; | 1160 | }; |
| 1118 | 1161 | ||
| 1119 | struct sched_group { | 1162 | struct sched_group { |
| 1120 | struct sched_group *next; /* Must be a circular list */ | 1163 | struct sched_group *next; /* Must be a circular list */ |
| 1121 | atomic_t ref; | 1164 | atomic_t ref; |
| 1122 | 1165 | ||
| 1123 | unsigned int group_weight; | 1166 | unsigned int group_weight; |
| 1124 | struct sched_group_capacity *sgc; | 1167 | struct sched_group_capacity *sgc; |
| 1125 | int asym_prefer_cpu; /* cpu of highest priority in group */ | 1168 | int asym_prefer_cpu; /* CPU of highest priority in group */ |
| 1126 | 1169 | ||
| 1127 | /* | 1170 | /* |
| 1128 | * The CPUs this group covers. | 1171 | * The CPUs this group covers. |
| @@ -1131,7 +1174,7 @@ struct sched_group { | |||
| 1131 | * by attaching extra space to the end of the structure, | 1174 | * by attaching extra space to the end of the structure, |
| 1132 | * depending on how many CPUs the kernel has booted up with) | 1175 | * depending on how many CPUs the kernel has booted up with) |
| 1133 | */ | 1176 | */ |
| 1134 | unsigned long cpumask[0]; | 1177 | unsigned long cpumask[0]; |
| 1135 | }; | 1178 | }; |
| 1136 | 1179 | ||
| 1137 | static inline struct cpumask *sched_group_span(struct sched_group *sg) | 1180 | static inline struct cpumask *sched_group_span(struct sched_group *sg) |
| @@ -1148,8 +1191,8 @@ static inline struct cpumask *group_balance_mask(struct sched_group *sg) | |||
| 1148 | } | 1191 | } |
| 1149 | 1192 | ||
| 1150 | /** | 1193 | /** |
| 1151 | * group_first_cpu - Returns the first cpu in the cpumask of a sched_group. | 1194 | * group_first_cpu - Returns the first CPU in the cpumask of a sched_group. |
| 1152 | * @group: The group whose first cpu is to be returned. | 1195 | * @group: The group whose first CPU is to be returned. |
| 1153 | */ | 1196 | */ |
| 1154 | static inline unsigned int group_first_cpu(struct sched_group *group) | 1197 | static inline unsigned int group_first_cpu(struct sched_group *group) |
| 1155 | { | 1198 | { |
| @@ -1349,19 +1392,12 @@ static inline int task_on_rq_migrating(struct task_struct *p) | |||
| 1349 | return p->on_rq == TASK_ON_RQ_MIGRATING; | 1392 | return p->on_rq == TASK_ON_RQ_MIGRATING; |
| 1350 | } | 1393 | } |
| 1351 | 1394 | ||
| 1352 | #ifndef prepare_arch_switch | ||
| 1353 | # define prepare_arch_switch(next) do { } while (0) | ||
| 1354 | #endif | ||
| 1355 | #ifndef finish_arch_post_lock_switch | ||
| 1356 | # define finish_arch_post_lock_switch() do { } while (0) | ||
| 1357 | #endif | ||
| 1358 | |||
| 1359 | /* | 1395 | /* |
| 1360 | * wake flags | 1396 | * wake flags |
| 1361 | */ | 1397 | */ |
| 1362 | #define WF_SYNC 0x01 /* waker goes to sleep after wakeup */ | 1398 | #define WF_SYNC 0x01 /* Waker goes to sleep after wakeup */ |
| 1363 | #define WF_FORK 0x02 /* child wakeup after fork */ | 1399 | #define WF_FORK 0x02 /* Child wakeup after fork */ |
| 1364 | #define WF_MIGRATED 0x4 /* internal use, task got migrated */ | 1400 | #define WF_MIGRATED 0x4 /* Internal use, task got migrated */ |
| 1365 | 1401 | ||
| 1366 | /* | 1402 | /* |
| 1367 | * To aid in avoiding the subversion of "niceness" due to uneven distribution | 1403 | * To aid in avoiding the subversion of "niceness" due to uneven distribution |
| @@ -1372,11 +1408,11 @@ static inline int task_on_rq_migrating(struct task_struct *p) | |||
| 1372 | * slice expiry etc. | 1408 | * slice expiry etc. |
| 1373 | */ | 1409 | */ |
| 1374 | 1410 | ||
| 1375 | #define WEIGHT_IDLEPRIO 3 | 1411 | #define WEIGHT_IDLEPRIO 3 |
| 1376 | #define WMULT_IDLEPRIO 1431655765 | 1412 | #define WMULT_IDLEPRIO 1431655765 |
| 1377 | 1413 | ||
| 1378 | extern const int sched_prio_to_weight[40]; | 1414 | extern const int sched_prio_to_weight[40]; |
| 1379 | extern const u32 sched_prio_to_wmult[40]; | 1415 | extern const u32 sched_prio_to_wmult[40]; |
| 1380 | 1416 | ||
| 1381 | /* | 1417 | /* |
| 1382 | * {de,en}queue flags: | 1418 | * {de,en}queue flags: |
| @@ -1398,9 +1434,9 @@ extern const u32 sched_prio_to_wmult[40]; | |||
| 1398 | */ | 1434 | */ |
| 1399 | 1435 | ||
| 1400 | #define DEQUEUE_SLEEP 0x01 | 1436 | #define DEQUEUE_SLEEP 0x01 |
| 1401 | #define DEQUEUE_SAVE 0x02 /* matches ENQUEUE_RESTORE */ | 1437 | #define DEQUEUE_SAVE 0x02 /* Matches ENQUEUE_RESTORE */ |
| 1402 | #define DEQUEUE_MOVE 0x04 /* matches ENQUEUE_MOVE */ | 1438 | #define DEQUEUE_MOVE 0x04 /* Matches ENQUEUE_MOVE */ |
| 1403 | #define DEQUEUE_NOCLOCK 0x08 /* matches ENQUEUE_NOCLOCK */ | 1439 | #define DEQUEUE_NOCLOCK 0x08 /* Matches ENQUEUE_NOCLOCK */ |
| 1404 | 1440 | ||
| 1405 | #define ENQUEUE_WAKEUP 0x01 | 1441 | #define ENQUEUE_WAKEUP 0x01 |
| 1406 | #define ENQUEUE_RESTORE 0x02 | 1442 | #define ENQUEUE_RESTORE 0x02 |
| @@ -1422,10 +1458,10 @@ struct sched_class { | |||
| 1422 | 1458 | ||
| 1423 | void (*enqueue_task) (struct rq *rq, struct task_struct *p, int flags); | 1459 | void (*enqueue_task) (struct rq *rq, struct task_struct *p, int flags); |
| 1424 | void (*dequeue_task) (struct rq *rq, struct task_struct *p, int flags); | 1460 | void (*dequeue_task) (struct rq *rq, struct task_struct *p, int flags); |
| 1425 | void (*yield_task) (struct rq *rq); | 1461 | void (*yield_task) (struct rq *rq); |
| 1426 | bool (*yield_to_task) (struct rq *rq, struct task_struct *p, bool preempt); | 1462 | bool (*yield_to_task)(struct rq *rq, struct task_struct *p, bool preempt); |
| 1427 | 1463 | ||
| 1428 | void (*check_preempt_curr) (struct rq *rq, struct task_struct *p, int flags); | 1464 | void (*check_preempt_curr)(struct rq *rq, struct task_struct *p, int flags); |
| 1429 | 1465 | ||
| 1430 | /* | 1466 | /* |
| 1431 | * It is the responsibility of the pick_next_task() method that will | 1467 | * It is the responsibility of the pick_next_task() method that will |
| @@ -1435,16 +1471,16 @@ struct sched_class { | |||
| 1435 | * May return RETRY_TASK when it finds a higher prio class has runnable | 1471 | * May return RETRY_TASK when it finds a higher prio class has runnable |
| 1436 | * tasks. | 1472 | * tasks. |
| 1437 | */ | 1473 | */ |
| 1438 | struct task_struct * (*pick_next_task) (struct rq *rq, | 1474 | struct task_struct * (*pick_next_task)(struct rq *rq, |
| 1439 | struct task_struct *prev, | 1475 | struct task_struct *prev, |
| 1440 | struct rq_flags *rf); | 1476 | struct rq_flags *rf); |
| 1441 | void (*put_prev_task) (struct rq *rq, struct task_struct *p); | 1477 | void (*put_prev_task)(struct rq *rq, struct task_struct *p); |
| 1442 | 1478 | ||
| 1443 | #ifdef CONFIG_SMP | 1479 | #ifdef CONFIG_SMP |
| 1444 | int (*select_task_rq)(struct task_struct *p, int task_cpu, int sd_flag, int flags); | 1480 | int (*select_task_rq)(struct task_struct *p, int task_cpu, int sd_flag, int flags); |
| 1445 | void (*migrate_task_rq)(struct task_struct *p); | 1481 | void (*migrate_task_rq)(struct task_struct *p); |
| 1446 | 1482 | ||
| 1447 | void (*task_woken) (struct rq *this_rq, struct task_struct *task); | 1483 | void (*task_woken)(struct rq *this_rq, struct task_struct *task); |
| 1448 | 1484 | ||
| 1449 | void (*set_cpus_allowed)(struct task_struct *p, | 1485 | void (*set_cpus_allowed)(struct task_struct *p, |
| 1450 | const struct cpumask *newmask); | 1486 | const struct cpumask *newmask); |
| @@ -1453,31 +1489,31 @@ struct sched_class { | |||
| 1453 | void (*rq_offline)(struct rq *rq); | 1489 | void (*rq_offline)(struct rq *rq); |
| 1454 | #endif | 1490 | #endif |
| 1455 | 1491 | ||
| 1456 | void (*set_curr_task) (struct rq *rq); | 1492 | void (*set_curr_task)(struct rq *rq); |
| 1457 | void (*task_tick) (struct rq *rq, struct task_struct *p, int queued); | 1493 | void (*task_tick)(struct rq *rq, struct task_struct *p, int queued); |
| 1458 | void (*task_fork) (struct task_struct *p); | 1494 | void (*task_fork)(struct task_struct *p); |
| 1459 | void (*task_dead) (struct task_struct *p); | 1495 | void (*task_dead)(struct task_struct *p); |
| 1460 | 1496 | ||
| 1461 | /* | 1497 | /* |
| 1462 | * The switched_from() call is allowed to drop rq->lock, therefore we | 1498 | * The switched_from() call is allowed to drop rq->lock, therefore we |
| 1463 | * cannot assume the switched_from/switched_to pair is serliazed by | 1499 | * cannot assume the switched_from/switched_to pair is serliazed by |
| 1464 | * rq->lock. They are however serialized by p->pi_lock. | 1500 | * rq->lock. They are however serialized by p->pi_lock. |
| 1465 | */ | 1501 | */ |
| 1466 | void (*switched_from) (struct rq *this_rq, struct task_struct *task); | 1502 | void (*switched_from)(struct rq *this_rq, struct task_struct *task); |
| 1467 | void (*switched_to) (struct rq *this_rq, struct task_struct *task); | 1503 | void (*switched_to) (struct rq *this_rq, struct task_struct *task); |
| 1468 | void (*prio_changed) (struct rq *this_rq, struct task_struct *task, | 1504 | void (*prio_changed) (struct rq *this_rq, struct task_struct *task, |
| 1469 | int oldprio); | 1505 | int oldprio); |
| 1470 | 1506 | ||
| 1471 | unsigned int (*get_rr_interval) (struct rq *rq, | 1507 | unsigned int (*get_rr_interval)(struct rq *rq, |
| 1472 | struct task_struct *task); | 1508 | struct task_struct *task); |
| 1473 | 1509 | ||
| 1474 | void (*update_curr) (struct rq *rq); | 1510 | void (*update_curr)(struct rq *rq); |
| 1475 | 1511 | ||
| 1476 | #define TASK_SET_GROUP 0 | 1512 | #define TASK_SET_GROUP 0 |
| 1477 | #define TASK_MOVE_GROUP 1 | 1513 | #define TASK_MOVE_GROUP 1 |
| 1478 | 1514 | ||
| 1479 | #ifdef CONFIG_FAIR_GROUP_SCHED | 1515 | #ifdef CONFIG_FAIR_GROUP_SCHED |
| 1480 | void (*task_change_group) (struct task_struct *p, int type); | 1516 | void (*task_change_group)(struct task_struct *p, int type); |
| 1481 | #endif | 1517 | #endif |
| 1482 | }; | 1518 | }; |
| 1483 | 1519 | ||
| @@ -1526,6 +1562,7 @@ static inline void idle_set_state(struct rq *rq, | |||
| 1526 | static inline struct cpuidle_state *idle_get_state(struct rq *rq) | 1562 | static inline struct cpuidle_state *idle_get_state(struct rq *rq) |
| 1527 | { | 1563 | { |
| 1528 | SCHED_WARN_ON(!rcu_read_lock_held()); | 1564 | SCHED_WARN_ON(!rcu_read_lock_held()); |
| 1565 | |||
| 1529 | return rq->idle_state; | 1566 | return rq->idle_state; |
| 1530 | } | 1567 | } |
| 1531 | #else | 1568 | #else |
| @@ -1564,9 +1601,9 @@ extern void init_dl_task_timer(struct sched_dl_entity *dl_se); | |||
| 1564 | extern void init_dl_inactive_task_timer(struct sched_dl_entity *dl_se); | 1601 | extern void init_dl_inactive_task_timer(struct sched_dl_entity *dl_se); |
| 1565 | extern void init_dl_rq_bw_ratio(struct dl_rq *dl_rq); | 1602 | extern void init_dl_rq_bw_ratio(struct dl_rq *dl_rq); |
| 1566 | 1603 | ||
| 1567 | #define BW_SHIFT 20 | 1604 | #define BW_SHIFT 20 |
| 1568 | #define BW_UNIT (1 << BW_SHIFT) | 1605 | #define BW_UNIT (1 << BW_SHIFT) |
| 1569 | #define RATIO_SHIFT 8 | 1606 | #define RATIO_SHIFT 8 |
| 1570 | unsigned long to_ratio(u64 period, u64 runtime); | 1607 | unsigned long to_ratio(u64 period, u64 runtime); |
| 1571 | 1608 | ||
| 1572 | extern void init_entity_runnable_average(struct sched_entity *se); | 1609 | extern void init_entity_runnable_average(struct sched_entity *se); |
| @@ -1574,6 +1611,7 @@ extern void post_init_entity_util_avg(struct sched_entity *se); | |||
| 1574 | 1611 | ||
| 1575 | #ifdef CONFIG_NO_HZ_FULL | 1612 | #ifdef CONFIG_NO_HZ_FULL |
| 1576 | extern bool sched_can_stop_tick(struct rq *rq); | 1613 | extern bool sched_can_stop_tick(struct rq *rq); |
| 1614 | extern int __init sched_tick_offload_init(void); | ||
| 1577 | 1615 | ||
| 1578 | /* | 1616 | /* |
| 1579 | * Tick may be needed by tasks in the runqueue depending on their policy and | 1617 | * Tick may be needed by tasks in the runqueue depending on their policy and |
| @@ -1598,6 +1636,7 @@ static inline void sched_update_tick_dependency(struct rq *rq) | |||
| 1598 | tick_nohz_dep_set_cpu(cpu, TICK_DEP_BIT_SCHED); | 1636 | tick_nohz_dep_set_cpu(cpu, TICK_DEP_BIT_SCHED); |
| 1599 | } | 1637 | } |
| 1600 | #else | 1638 | #else |
| 1639 | static inline int sched_tick_offload_init(void) { return 0; } | ||
| 1601 | static inline void sched_update_tick_dependency(struct rq *rq) { } | 1640 | static inline void sched_update_tick_dependency(struct rq *rq) { } |
| 1602 | #endif | 1641 | #endif |
| 1603 | 1642 | ||
| @@ -1624,13 +1663,6 @@ static inline void sub_nr_running(struct rq *rq, unsigned count) | |||
| 1624 | sched_update_tick_dependency(rq); | 1663 | sched_update_tick_dependency(rq); |
| 1625 | } | 1664 | } |
| 1626 | 1665 | ||
| 1627 | static inline void rq_last_tick_reset(struct rq *rq) | ||
| 1628 | { | ||
| 1629 | #ifdef CONFIG_NO_HZ_FULL | ||
| 1630 | rq->last_sched_tick = jiffies; | ||
| 1631 | #endif | ||
| 1632 | } | ||
| 1633 | |||
| 1634 | extern void update_rq_clock(struct rq *rq); | 1666 | extern void update_rq_clock(struct rq *rq); |
| 1635 | 1667 | ||
| 1636 | extern void activate_task(struct rq *rq, struct task_struct *p, int flags); | 1668 | extern void activate_task(struct rq *rq, struct task_struct *p, int flags); |
| @@ -1821,8 +1853,8 @@ static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest) | |||
| 1821 | /* | 1853 | /* |
| 1822 | * Unfair double_lock_balance: Optimizes throughput at the expense of | 1854 | * Unfair double_lock_balance: Optimizes throughput at the expense of |
| 1823 | * latency by eliminating extra atomic operations when the locks are | 1855 | * latency by eliminating extra atomic operations when the locks are |
| 1824 | * already in proper order on entry. This favors lower cpu-ids and will | 1856 | * already in proper order on entry. This favors lower CPU-ids and will |
| 1825 | * grant the double lock to lower cpus over higher ids under contention, | 1857 | * grant the double lock to lower CPUs over higher ids under contention, |
| 1826 | * regardless of entry order into the function. | 1858 | * regardless of entry order into the function. |
| 1827 | */ | 1859 | */ |
| 1828 | static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest) | 1860 | static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest) |
| @@ -1854,7 +1886,7 @@ static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest) | |||
| 1854 | static inline int double_lock_balance(struct rq *this_rq, struct rq *busiest) | 1886 | static inline int double_lock_balance(struct rq *this_rq, struct rq *busiest) |
| 1855 | { | 1887 | { |
| 1856 | if (unlikely(!irqs_disabled())) { | 1888 | if (unlikely(!irqs_disabled())) { |
| 1857 | /* printk() doesn't work good under rq->lock */ | 1889 | /* printk() doesn't work well under rq->lock */ |
| 1858 | raw_spin_unlock(&this_rq->lock); | 1890 | raw_spin_unlock(&this_rq->lock); |
| 1859 | BUG_ON(1); | 1891 | BUG_ON(1); |
| 1860 | } | 1892 | } |
| @@ -2005,16 +2037,19 @@ extern void cfs_bandwidth_usage_inc(void); | |||
| 2005 | extern void cfs_bandwidth_usage_dec(void); | 2037 | extern void cfs_bandwidth_usage_dec(void); |
| 2006 | 2038 | ||
| 2007 | #ifdef CONFIG_NO_HZ_COMMON | 2039 | #ifdef CONFIG_NO_HZ_COMMON |
| 2008 | enum rq_nohz_flag_bits { | 2040 | #define NOHZ_BALANCE_KICK_BIT 0 |
| 2009 | NOHZ_TICK_STOPPED, | 2041 | #define NOHZ_STATS_KICK_BIT 1 |
| 2010 | NOHZ_BALANCE_KICK, | 2042 | |
| 2011 | }; | 2043 | #define NOHZ_BALANCE_KICK BIT(NOHZ_BALANCE_KICK_BIT) |
| 2044 | #define NOHZ_STATS_KICK BIT(NOHZ_STATS_KICK_BIT) | ||
| 2045 | |||
| 2046 | #define NOHZ_KICK_MASK (NOHZ_BALANCE_KICK | NOHZ_STATS_KICK) | ||
| 2012 | 2047 | ||
| 2013 | #define nohz_flags(cpu) (&cpu_rq(cpu)->nohz_flags) | 2048 | #define nohz_flags(cpu) (&cpu_rq(cpu)->nohz_flags) |
| 2014 | 2049 | ||
| 2015 | extern void nohz_balance_exit_idle(unsigned int cpu); | 2050 | extern void nohz_balance_exit_idle(struct rq *rq); |
| 2016 | #else | 2051 | #else |
| 2017 | static inline void nohz_balance_exit_idle(unsigned int cpu) { } | 2052 | static inline void nohz_balance_exit_idle(struct rq *rq) { } |
| 2018 | #endif | 2053 | #endif |
| 2019 | 2054 | ||
| 2020 | 2055 | ||
| @@ -2113,15 +2148,14 @@ static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) {} | |||
| 2113 | #endif /* CONFIG_CPU_FREQ */ | 2148 | #endif /* CONFIG_CPU_FREQ */ |
| 2114 | 2149 | ||
| 2115 | #ifdef arch_scale_freq_capacity | 2150 | #ifdef arch_scale_freq_capacity |
| 2116 | #ifndef arch_scale_freq_invariant | 2151 | # ifndef arch_scale_freq_invariant |
| 2117 | #define arch_scale_freq_invariant() (true) | 2152 | # define arch_scale_freq_invariant() true |
| 2118 | #endif | 2153 | # endif |
| 2119 | #else /* arch_scale_freq_capacity */ | 2154 | #else |
| 2120 | #define arch_scale_freq_invariant() (false) | 2155 | # define arch_scale_freq_invariant() false |
| 2121 | #endif | 2156 | #endif |
| 2122 | 2157 | ||
| 2123 | #ifdef CONFIG_CPU_FREQ_GOV_SCHEDUTIL | 2158 | #ifdef CONFIG_CPU_FREQ_GOV_SCHEDUTIL |
| 2124 | |||
| 2125 | static inline unsigned long cpu_util_dl(struct rq *rq) | 2159 | static inline unsigned long cpu_util_dl(struct rq *rq) |
| 2126 | { | 2160 | { |
| 2127 | return (rq->dl.running_bw * SCHED_CAPACITY_SCALE) >> BW_SHIFT; | 2161 | return (rq->dl.running_bw * SCHED_CAPACITY_SCALE) >> BW_SHIFT; |
| @@ -2129,7 +2163,13 @@ static inline unsigned long cpu_util_dl(struct rq *rq) | |||
| 2129 | 2163 | ||
| 2130 | static inline unsigned long cpu_util_cfs(struct rq *rq) | 2164 | static inline unsigned long cpu_util_cfs(struct rq *rq) |
| 2131 | { | 2165 | { |
| 2132 | return rq->cfs.avg.util_avg; | 2166 | unsigned long util = READ_ONCE(rq->cfs.avg.util_avg); |
| 2133 | } | 2167 | |
| 2168 | if (sched_feat(UTIL_EST)) { | ||
| 2169 | util = max_t(unsigned long, util, | ||
| 2170 | READ_ONCE(rq->cfs.avg.util_est.enqueued)); | ||
| 2171 | } | ||
| 2134 | 2172 | ||
| 2173 | return util; | ||
| 2174 | } | ||
| 2135 | #endif | 2175 | #endif |
diff --git a/kernel/sched/stats.c b/kernel/sched/stats.c index 940b1fa1d2ce..ab112cbfd7c8 100644 --- a/kernel/sched/stats.c +++ b/kernel/sched/stats.c | |||
| @@ -1,14 +1,13 @@ | |||
| 1 | // SPDX-License-Identifier: GPL-2.0 | 1 | // SPDX-License-Identifier: GPL-2.0 |
| 2 | 2 | /* | |
| 3 | #include <linux/slab.h> | 3 | * /proc/schedstat implementation |
| 4 | #include <linux/fs.h> | 4 | */ |
| 5 | #include <linux/seq_file.h> | ||
| 6 | #include <linux/proc_fs.h> | ||
| 7 | |||
| 8 | #include "sched.h" | 5 | #include "sched.h" |
| 9 | 6 | ||
| 10 | /* | 7 | /* |
| 11 | * bump this up when changing the output format or the meaning of an existing | 8 | * Current schedstat API version. |
| 9 | * | ||
| 10 | * Bump this up when changing the output format or the meaning of an existing | ||
| 12 | * format, so that tools can adapt (or abort) | 11 | * format, so that tools can adapt (or abort) |
| 13 | */ | 12 | */ |
| 14 | #define SCHEDSTAT_VERSION 15 | 13 | #define SCHEDSTAT_VERSION 15 |
| @@ -78,8 +77,8 @@ static int show_schedstat(struct seq_file *seq, void *v) | |||
| 78 | * This itererator needs some explanation. | 77 | * This itererator needs some explanation. |
| 79 | * It returns 1 for the header position. | 78 | * It returns 1 for the header position. |
| 80 | * This means 2 is cpu 0. | 79 | * This means 2 is cpu 0. |
| 81 | * In a hotplugged system some cpus, including cpu 0, may be missing so we have | 80 | * In a hotplugged system some CPUs, including cpu 0, may be missing so we have |
| 82 | * to use cpumask_* to iterate over the cpus. | 81 | * to use cpumask_* to iterate over the CPUs. |
| 83 | */ | 82 | */ |
| 84 | static void *schedstat_start(struct seq_file *file, loff_t *offset) | 83 | static void *schedstat_start(struct seq_file *file, loff_t *offset) |
| 85 | { | 84 | { |
| @@ -99,12 +98,14 @@ static void *schedstat_start(struct seq_file *file, loff_t *offset) | |||
| 99 | 98 | ||
| 100 | if (n < nr_cpu_ids) | 99 | if (n < nr_cpu_ids) |
| 101 | return (void *)(unsigned long)(n + 2); | 100 | return (void *)(unsigned long)(n + 2); |
| 101 | |||
| 102 | return NULL; | 102 | return NULL; |
| 103 | } | 103 | } |
| 104 | 104 | ||
| 105 | static void *schedstat_next(struct seq_file *file, void *data, loff_t *offset) | 105 | static void *schedstat_next(struct seq_file *file, void *data, loff_t *offset) |
| 106 | { | 106 | { |
| 107 | (*offset)++; | 107 | (*offset)++; |
| 108 | |||
| 108 | return schedstat_start(file, offset); | 109 | return schedstat_start(file, offset); |
| 109 | } | 110 | } |
| 110 | 111 | ||
| @@ -134,6 +135,7 @@ static const struct file_operations proc_schedstat_operations = { | |||
| 134 | static int __init proc_schedstat_init(void) | 135 | static int __init proc_schedstat_init(void) |
| 135 | { | 136 | { |
| 136 | proc_create("schedstat", 0, NULL, &proc_schedstat_operations); | 137 | proc_create("schedstat", 0, NULL, &proc_schedstat_operations); |
| 138 | |||
| 137 | return 0; | 139 | return 0; |
| 138 | } | 140 | } |
| 139 | subsys_initcall(proc_schedstat_init); | 141 | subsys_initcall(proc_schedstat_init); |
diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h index 8e7b58de61e7..8aea199a39b4 100644 --- a/kernel/sched/stats.h +++ b/kernel/sched/stats.h | |||
| @@ -30,35 +30,29 @@ rq_sched_info_dequeued(struct rq *rq, unsigned long long delta) | |||
| 30 | if (rq) | 30 | if (rq) |
| 31 | rq->rq_sched_info.run_delay += delta; | 31 | rq->rq_sched_info.run_delay += delta; |
| 32 | } | 32 | } |
| 33 | #define schedstat_enabled() static_branch_unlikely(&sched_schedstats) | 33 | #define schedstat_enabled() static_branch_unlikely(&sched_schedstats) |
| 34 | #define __schedstat_inc(var) do { var++; } while (0) | 34 | #define __schedstat_inc(var) do { var++; } while (0) |
| 35 | #define schedstat_inc(var) do { if (schedstat_enabled()) { var++; } } while (0) | 35 | #define schedstat_inc(var) do { if (schedstat_enabled()) { var++; } } while (0) |
| 36 | #define __schedstat_add(var, amt) do { var += (amt); } while (0) | 36 | #define __schedstat_add(var, amt) do { var += (amt); } while (0) |
| 37 | #define schedstat_add(var, amt) do { if (schedstat_enabled()) { var += (amt); } } while (0) | 37 | #define schedstat_add(var, amt) do { if (schedstat_enabled()) { var += (amt); } } while (0) |
| 38 | #define __schedstat_set(var, val) do { var = (val); } while (0) | 38 | #define __schedstat_set(var, val) do { var = (val); } while (0) |
| 39 | #define schedstat_set(var, val) do { if (schedstat_enabled()) { var = (val); } } while (0) | 39 | #define schedstat_set(var, val) do { if (schedstat_enabled()) { var = (val); } } while (0) |
| 40 | #define schedstat_val(var) (var) | 40 | #define schedstat_val(var) (var) |
| 41 | #define schedstat_val_or_zero(var) ((schedstat_enabled()) ? (var) : 0) | 41 | #define schedstat_val_or_zero(var) ((schedstat_enabled()) ? (var) : 0) |
| 42 | 42 | ||
| 43 | #else /* !CONFIG_SCHEDSTATS */ | 43 | #else /* !CONFIG_SCHEDSTATS: */ |
| 44 | static inline void | 44 | static inline void rq_sched_info_arrive (struct rq *rq, unsigned long long delta) { } |
| 45 | rq_sched_info_arrive(struct rq *rq, unsigned long long delta) | 45 | static inline void rq_sched_info_dequeued(struct rq *rq, unsigned long long delta) { } |
| 46 | {} | 46 | static inline void rq_sched_info_depart (struct rq *rq, unsigned long long delta) { } |
| 47 | static inline void | 47 | # define schedstat_enabled() 0 |
| 48 | rq_sched_info_dequeued(struct rq *rq, unsigned long long delta) | 48 | # define __schedstat_inc(var) do { } while (0) |
| 49 | {} | 49 | # define schedstat_inc(var) do { } while (0) |
| 50 | static inline void | 50 | # define __schedstat_add(var, amt) do { } while (0) |
| 51 | rq_sched_info_depart(struct rq *rq, unsigned long long delta) | 51 | # define schedstat_add(var, amt) do { } while (0) |
| 52 | {} | 52 | # define __schedstat_set(var, val) do { } while (0) |
| 53 | #define schedstat_enabled() 0 | 53 | # define schedstat_set(var, val) do { } while (0) |
| 54 | #define __schedstat_inc(var) do { } while (0) | 54 | # define schedstat_val(var) 0 |
| 55 | #define schedstat_inc(var) do { } while (0) | 55 | # define schedstat_val_or_zero(var) 0 |
| 56 | #define __schedstat_add(var, amt) do { } while (0) | ||
| 57 | #define schedstat_add(var, amt) do { } while (0) | ||
| 58 | #define __schedstat_set(var, val) do { } while (0) | ||
| 59 | #define schedstat_set(var, val) do { } while (0) | ||
| 60 | #define schedstat_val(var) 0 | ||
| 61 | #define schedstat_val_or_zero(var) 0 | ||
| 62 | #endif /* CONFIG_SCHEDSTATS */ | 56 | #endif /* CONFIG_SCHEDSTATS */ |
| 63 | 57 | ||
| 64 | #ifdef CONFIG_SCHED_INFO | 58 | #ifdef CONFIG_SCHED_INFO |
| @@ -69,9 +63,9 @@ static inline void sched_info_reset_dequeued(struct task_struct *t) | |||
| 69 | 63 | ||
| 70 | /* | 64 | /* |
| 71 | * We are interested in knowing how long it was from the *first* time a | 65 | * We are interested in knowing how long it was from the *first* time a |
| 72 | * task was queued to the time that it finally hit a cpu, we call this routine | 66 | * task was queued to the time that it finally hit a CPU, we call this routine |
| 73 | * from dequeue_task() to account for possible rq->clock skew across cpus. The | 67 | * from dequeue_task() to account for possible rq->clock skew across CPUs. The |
| 74 | * delta taken on each cpu would annul the skew. | 68 | * delta taken on each CPU would annul the skew. |
| 75 | */ | 69 | */ |
| 76 | static inline void sched_info_dequeued(struct rq *rq, struct task_struct *t) | 70 | static inline void sched_info_dequeued(struct rq *rq, struct task_struct *t) |
| 77 | { | 71 | { |
| @@ -87,7 +81,7 @@ static inline void sched_info_dequeued(struct rq *rq, struct task_struct *t) | |||
| 87 | } | 81 | } |
| 88 | 82 | ||
| 89 | /* | 83 | /* |
| 90 | * Called when a task finally hits the cpu. We can now calculate how | 84 | * Called when a task finally hits the CPU. We can now calculate how |
| 91 | * long it was waiting to run. We also note when it began so that we | 85 | * long it was waiting to run. We also note when it began so that we |
| 92 | * can keep stats on how long its timeslice is. | 86 | * can keep stats on how long its timeslice is. |
| 93 | */ | 87 | */ |
| @@ -112,9 +106,10 @@ static void sched_info_arrive(struct rq *rq, struct task_struct *t) | |||
| 112 | */ | 106 | */ |
| 113 | static inline void sched_info_queued(struct rq *rq, struct task_struct *t) | 107 | static inline void sched_info_queued(struct rq *rq, struct task_struct *t) |
| 114 | { | 108 | { |
| 115 | if (unlikely(sched_info_on())) | 109 | if (unlikely(sched_info_on())) { |
| 116 | if (!t->sched_info.last_queued) | 110 | if (!t->sched_info.last_queued) |
| 117 | t->sched_info.last_queued = rq_clock(rq); | 111 | t->sched_info.last_queued = rq_clock(rq); |
| 112 | } | ||
| 118 | } | 113 | } |
| 119 | 114 | ||
| 120 | /* | 115 | /* |
| @@ -127,8 +122,7 @@ static inline void sched_info_queued(struct rq *rq, struct task_struct *t) | |||
| 127 | */ | 122 | */ |
| 128 | static inline void sched_info_depart(struct rq *rq, struct task_struct *t) | 123 | static inline void sched_info_depart(struct rq *rq, struct task_struct *t) |
| 129 | { | 124 | { |
| 130 | unsigned long long delta = rq_clock(rq) - | 125 | unsigned long long delta = rq_clock(rq) - t->sched_info.last_arrival; |
| 131 | t->sched_info.last_arrival; | ||
| 132 | 126 | ||
| 133 | rq_sched_info_depart(rq, delta); | 127 | rq_sched_info_depart(rq, delta); |
| 134 | 128 | ||
| @@ -142,11 +136,10 @@ static inline void sched_info_depart(struct rq *rq, struct task_struct *t) | |||
| 142 | * the idle task.) We are only called when prev != next. | 136 | * the idle task.) We are only called when prev != next. |
| 143 | */ | 137 | */ |
| 144 | static inline void | 138 | static inline void |
| 145 | __sched_info_switch(struct rq *rq, | 139 | __sched_info_switch(struct rq *rq, struct task_struct *prev, struct task_struct *next) |
| 146 | struct task_struct *prev, struct task_struct *next) | ||
| 147 | { | 140 | { |
| 148 | /* | 141 | /* |
| 149 | * prev now departs the cpu. It's not interesting to record | 142 | * prev now departs the CPU. It's not interesting to record |
| 150 | * stats about how efficient we were at scheduling the idle | 143 | * stats about how efficient we were at scheduling the idle |
| 151 | * process, however. | 144 | * process, however. |
| 152 | */ | 145 | */ |
| @@ -156,18 +149,19 @@ __sched_info_switch(struct rq *rq, | |||
| 156 | if (next != rq->idle) | 149 | if (next != rq->idle) |
| 157 | sched_info_arrive(rq, next); | 150 | sched_info_arrive(rq, next); |
| 158 | } | 151 | } |
| 152 | |||
| 159 | static inline void | 153 | static inline void |
| 160 | sched_info_switch(struct rq *rq, | 154 | sched_info_switch(struct rq *rq, struct task_struct *prev, struct task_struct *next) |
| 161 | struct task_struct *prev, struct task_struct *next) | ||
| 162 | { | 155 | { |
| 163 | if (unlikely(sched_info_on())) | 156 | if (unlikely(sched_info_on())) |
| 164 | __sched_info_switch(rq, prev, next); | 157 | __sched_info_switch(rq, prev, next); |
| 165 | } | 158 | } |
| 166 | #else | 159 | |
| 167 | #define sched_info_queued(rq, t) do { } while (0) | 160 | #else /* !CONFIG_SCHED_INFO: */ |
| 168 | #define sched_info_reset_dequeued(t) do { } while (0) | 161 | # define sched_info_queued(rq, t) do { } while (0) |
| 169 | #define sched_info_dequeued(rq, t) do { } while (0) | 162 | # define sched_info_reset_dequeued(t) do { } while (0) |
| 170 | #define sched_info_depart(rq, t) do { } while (0) | 163 | # define sched_info_dequeued(rq, t) do { } while (0) |
| 171 | #define sched_info_arrive(rq, next) do { } while (0) | 164 | # define sched_info_depart(rq, t) do { } while (0) |
| 172 | #define sched_info_switch(rq, t, next) do { } while (0) | 165 | # define sched_info_arrive(rq, next) do { } while (0) |
| 166 | # define sched_info_switch(rq, t, next) do { } while (0) | ||
| 173 | #endif /* CONFIG_SCHED_INFO */ | 167 | #endif /* CONFIG_SCHED_INFO */ |
diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c index 210b1f2146ff..c183b790ca54 100644 --- a/kernel/sched/stop_task.c +++ b/kernel/sched/stop_task.c | |||
| @@ -1,6 +1,4 @@ | |||
| 1 | // SPDX-License-Identifier: GPL-2.0 | 1 | // SPDX-License-Identifier: GPL-2.0 |
| 2 | #include "sched.h" | ||
| 3 | |||
| 4 | /* | 2 | /* |
| 5 | * stop-task scheduling class. | 3 | * stop-task scheduling class. |
| 6 | * | 4 | * |
| @@ -9,6 +7,7 @@ | |||
| 9 | * | 7 | * |
| 10 | * See kernel/stop_machine.c | 8 | * See kernel/stop_machine.c |
| 11 | */ | 9 | */ |
| 10 | #include "sched.h" | ||
| 12 | 11 | ||
| 13 | #ifdef CONFIG_SMP | 12 | #ifdef CONFIG_SMP |
| 14 | static int | 13 | static int |
| @@ -75,6 +74,14 @@ static void put_prev_task_stop(struct rq *rq, struct task_struct *prev) | |||
| 75 | cgroup_account_cputime(curr, delta_exec); | 74 | cgroup_account_cputime(curr, delta_exec); |
| 76 | } | 75 | } |
| 77 | 76 | ||
| 77 | /* | ||
| 78 | * scheduler tick hitting a task of our scheduling class. | ||
| 79 | * | ||
| 80 | * NOTE: This function can be called remotely by the tick offload that | ||
| 81 | * goes along full dynticks. Therefore no local assumption can be made | ||
| 82 | * and everything must be accessed through the @rq and @curr passed in | ||
| 83 | * parameters. | ||
| 84 | */ | ||
| 78 | static void task_tick_stop(struct rq *rq, struct task_struct *curr, int queued) | 85 | static void task_tick_stop(struct rq *rq, struct task_struct *curr, int queued) |
| 79 | { | 86 | { |
| 80 | } | 87 | } |
diff --git a/kernel/sched/swait.c b/kernel/sched/swait.c index 9ff1555341ed..b6fb2c3b3ff7 100644 --- a/kernel/sched/swait.c +++ b/kernel/sched/swait.c | |||
| @@ -1,6 +1,8 @@ | |||
| 1 | // SPDX-License-Identifier: GPL-2.0 | 1 | // SPDX-License-Identifier: GPL-2.0 |
| 2 | #include <linux/sched/signal.h> | 2 | /* |
| 3 | #include <linux/swait.h> | 3 | * <linux/swait.h> (simple wait queues ) implementation: |
| 4 | */ | ||
| 5 | #include "sched.h" | ||
| 4 | 6 | ||
| 5 | void __init_swait_queue_head(struct swait_queue_head *q, const char *name, | 7 | void __init_swait_queue_head(struct swait_queue_head *q, const char *name, |
| 6 | struct lock_class_key *key) | 8 | struct lock_class_key *key) |
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index 519b024f4e94..64cc564f5255 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c | |||
| @@ -2,10 +2,6 @@ | |||
| 2 | /* | 2 | /* |
| 3 | * Scheduler topology setup/handling methods | 3 | * Scheduler topology setup/handling methods |
| 4 | */ | 4 | */ |
| 5 | #include <linux/sched.h> | ||
| 6 | #include <linux/mutex.h> | ||
| 7 | #include <linux/sched/isolation.h> | ||
| 8 | |||
| 9 | #include "sched.h" | 5 | #include "sched.h" |
| 10 | 6 | ||
| 11 | DEFINE_MUTEX(sched_domains_mutex); | 7 | DEFINE_MUTEX(sched_domains_mutex); |
| @@ -41,8 +37,7 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, | |||
| 41 | if (!(sd->flags & SD_LOAD_BALANCE)) { | 37 | if (!(sd->flags & SD_LOAD_BALANCE)) { |
| 42 | printk("does not load-balance\n"); | 38 | printk("does not load-balance\n"); |
| 43 | if (sd->parent) | 39 | if (sd->parent) |
| 44 | printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain" | 40 | printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain has parent"); |
| 45 | " has parent"); | ||
| 46 | return -1; | 41 | return -1; |
| 47 | } | 42 | } |
| 48 | 43 | ||
| @@ -50,12 +45,10 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, | |||
| 50 | cpumask_pr_args(sched_domain_span(sd)), sd->name); | 45 | cpumask_pr_args(sched_domain_span(sd)), sd->name); |
| 51 | 46 | ||
| 52 | if (!cpumask_test_cpu(cpu, sched_domain_span(sd))) { | 47 | if (!cpumask_test_cpu(cpu, sched_domain_span(sd))) { |
| 53 | printk(KERN_ERR "ERROR: domain->span does not contain " | 48 | printk(KERN_ERR "ERROR: domain->span does not contain CPU%d\n", cpu); |
| 54 | "CPU%d\n", cpu); | ||
| 55 | } | 49 | } |
| 56 | if (!cpumask_test_cpu(cpu, sched_group_span(group))) { | 50 | if (!cpumask_test_cpu(cpu, sched_group_span(group))) { |
| 57 | printk(KERN_ERR "ERROR: domain->groups does not contain" | 51 | printk(KERN_ERR "ERROR: domain->groups does not contain CPU%d\n", cpu); |
| 58 | " CPU%d\n", cpu); | ||
| 59 | } | 52 | } |
| 60 | 53 | ||
| 61 | printk(KERN_DEBUG "%*s groups:", level + 1, ""); | 54 | printk(KERN_DEBUG "%*s groups:", level + 1, ""); |
| @@ -115,8 +108,7 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, | |||
| 115 | 108 | ||
| 116 | if (sd->parent && | 109 | if (sd->parent && |
| 117 | !cpumask_subset(groupmask, sched_domain_span(sd->parent))) | 110 | !cpumask_subset(groupmask, sched_domain_span(sd->parent))) |
| 118 | printk(KERN_ERR "ERROR: parent span is not a superset " | 111 | printk(KERN_ERR "ERROR: parent span is not a superset of domain->span\n"); |
| 119 | "of domain->span\n"); | ||
| 120 | return 0; | 112 | return 0; |
| 121 | } | 113 | } |
| 122 | 114 | ||
| @@ -595,7 +587,7 @@ int group_balance_cpu(struct sched_group *sg) | |||
| 595 | * are not. | 587 | * are not. |
| 596 | * | 588 | * |
| 597 | * This leads to a few particularly weird cases where the sched_domain's are | 589 | * This leads to a few particularly weird cases where the sched_domain's are |
| 598 | * not of the same number for each cpu. Consider: | 590 | * not of the same number for each CPU. Consider: |
| 599 | * | 591 | * |
| 600 | * NUMA-2 0-3 0-3 | 592 | * NUMA-2 0-3 0-3 |
| 601 | * groups: {0-2},{1-3} {1-3},{0-2} | 593 | * groups: {0-2},{1-3} {1-3},{0-2} |
| @@ -780,7 +772,7 @@ fail: | |||
| 780 | * ^ ^ ^ ^ | 772 | * ^ ^ ^ ^ |
| 781 | * `-' `-' | 773 | * `-' `-' |
| 782 | * | 774 | * |
| 783 | * The sched_domains are per-cpu and have a two way link (parent & child) and | 775 | * The sched_domains are per-CPU and have a two way link (parent & child) and |
| 784 | * denote the ever growing mask of CPUs belonging to that level of topology. | 776 | * denote the ever growing mask of CPUs belonging to that level of topology. |
| 785 | * | 777 | * |
| 786 | * Each sched_domain has a circular (double) linked list of sched_group's, each | 778 | * Each sched_domain has a circular (double) linked list of sched_group's, each |
| @@ -1021,6 +1013,7 @@ __visit_domain_allocation_hell(struct s_data *d, const struct cpumask *cpu_map) | |||
| 1021 | d->rd = alloc_rootdomain(); | 1013 | d->rd = alloc_rootdomain(); |
| 1022 | if (!d->rd) | 1014 | if (!d->rd) |
| 1023 | return sa_sd; | 1015 | return sa_sd; |
| 1016 | |||
| 1024 | return sa_rootdomain; | 1017 | return sa_rootdomain; |
| 1025 | } | 1018 | } |
| 1026 | 1019 | ||
| @@ -1047,12 +1040,14 @@ static void claim_allocations(int cpu, struct sched_domain *sd) | |||
| 1047 | } | 1040 | } |
| 1048 | 1041 | ||
| 1049 | #ifdef CONFIG_NUMA | 1042 | #ifdef CONFIG_NUMA |
| 1050 | static int sched_domains_numa_levels; | ||
| 1051 | enum numa_topology_type sched_numa_topology_type; | 1043 | enum numa_topology_type sched_numa_topology_type; |
| 1052 | static int *sched_domains_numa_distance; | 1044 | |
| 1053 | int sched_max_numa_distance; | 1045 | static int sched_domains_numa_levels; |
| 1054 | static struct cpumask ***sched_domains_numa_masks; | 1046 | static int sched_domains_curr_level; |
| 1055 | static int sched_domains_curr_level; | 1047 | |
| 1048 | int sched_max_numa_distance; | ||
| 1049 | static int *sched_domains_numa_distance; | ||
| 1050 | static struct cpumask ***sched_domains_numa_masks; | ||
| 1056 | #endif | 1051 | #endif |
| 1057 | 1052 | ||
| 1058 | /* | 1053 | /* |
| @@ -1074,11 +1069,11 @@ static int sched_domains_curr_level; | |||
| 1074 | * SD_ASYM_PACKING - describes SMT quirks | 1069 | * SD_ASYM_PACKING - describes SMT quirks |
| 1075 | */ | 1070 | */ |
| 1076 | #define TOPOLOGY_SD_FLAGS \ | 1071 | #define TOPOLOGY_SD_FLAGS \ |
| 1077 | (SD_SHARE_CPUCAPACITY | \ | 1072 | (SD_SHARE_CPUCAPACITY | \ |
| 1078 | SD_SHARE_PKG_RESOURCES | \ | 1073 | SD_SHARE_PKG_RESOURCES | \ |
| 1079 | SD_NUMA | \ | 1074 | SD_NUMA | \ |
| 1080 | SD_ASYM_PACKING | \ | 1075 | SD_ASYM_PACKING | \ |
| 1081 | SD_ASYM_CPUCAPACITY | \ | 1076 | SD_ASYM_CPUCAPACITY | \ |
| 1082 | SD_SHARE_POWERDOMAIN) | 1077 | SD_SHARE_POWERDOMAIN) |
| 1083 | 1078 | ||
| 1084 | static struct sched_domain * | 1079 | static struct sched_domain * |
| @@ -1628,7 +1623,7 @@ static struct sched_domain *build_sched_domain(struct sched_domain_topology_leve | |||
| 1628 | pr_err(" the %s domain not a subset of the %s domain\n", | 1623 | pr_err(" the %s domain not a subset of the %s domain\n", |
| 1629 | child->name, sd->name); | 1624 | child->name, sd->name); |
| 1630 | #endif | 1625 | #endif |
| 1631 | /* Fixup, ensure @sd has at least @child cpus. */ | 1626 | /* Fixup, ensure @sd has at least @child CPUs. */ |
| 1632 | cpumask_or(sched_domain_span(sd), | 1627 | cpumask_or(sched_domain_span(sd), |
| 1633 | sched_domain_span(sd), | 1628 | sched_domain_span(sd), |
| 1634 | sched_domain_span(child)); | 1629 | sched_domain_span(child)); |
| @@ -1720,6 +1715,7 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att | |||
| 1720 | ret = 0; | 1715 | ret = 0; |
| 1721 | error: | 1716 | error: |
| 1722 | __free_domain_allocs(&d, alloc_state, cpu_map); | 1717 | __free_domain_allocs(&d, alloc_state, cpu_map); |
| 1718 | |||
| 1723 | return ret; | 1719 | return ret; |
| 1724 | } | 1720 | } |
| 1725 | 1721 | ||
| @@ -1824,6 +1820,7 @@ static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur, | |||
| 1824 | return 1; | 1820 | return 1; |
| 1825 | 1821 | ||
| 1826 | tmp = SD_ATTR_INIT; | 1822 | tmp = SD_ATTR_INIT; |
| 1823 | |||
| 1827 | return !memcmp(cur ? (cur + idx_cur) : &tmp, | 1824 | return !memcmp(cur ? (cur + idx_cur) : &tmp, |
| 1828 | new ? (new + idx_new) : &tmp, | 1825 | new ? (new + idx_new) : &tmp, |
| 1829 | sizeof(struct sched_domain_attr)); | 1826 | sizeof(struct sched_domain_attr)); |
| @@ -1929,4 +1926,3 @@ match2: | |||
| 1929 | 1926 | ||
| 1930 | mutex_unlock(&sched_domains_mutex); | 1927 | mutex_unlock(&sched_domains_mutex); |
| 1931 | } | 1928 | } |
| 1932 | |||
diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c index 929ecb7d6b78..928be527477e 100644 --- a/kernel/sched/wait.c +++ b/kernel/sched/wait.c | |||
| @@ -3,14 +3,7 @@ | |||
| 3 | * | 3 | * |
| 4 | * (C) 2004 Nadia Yvette Chambers, Oracle | 4 | * (C) 2004 Nadia Yvette Chambers, Oracle |
| 5 | */ | 5 | */ |
| 6 | #include <linux/init.h> | 6 | #include "sched.h" |
| 7 | #include <linux/export.h> | ||
| 8 | #include <linux/sched/signal.h> | ||
| 9 | #include <linux/sched/debug.h> | ||
| 10 | #include <linux/mm.h> | ||
| 11 | #include <linux/wait.h> | ||
| 12 | #include <linux/hash.h> | ||
| 13 | #include <linux/kthread.h> | ||
| 14 | 7 | ||
| 15 | void __init_waitqueue_head(struct wait_queue_head *wq_head, const char *name, struct lock_class_key *key) | 8 | void __init_waitqueue_head(struct wait_queue_head *wq_head, const char *name, struct lock_class_key *key) |
| 16 | { | 9 | { |
| @@ -107,6 +100,7 @@ static int __wake_up_common(struct wait_queue_head *wq_head, unsigned int mode, | |||
| 107 | break; | 100 | break; |
| 108 | } | 101 | } |
| 109 | } | 102 | } |
| 103 | |||
| 110 | return nr_exclusive; | 104 | return nr_exclusive; |
| 111 | } | 105 | } |
| 112 | 106 | ||
| @@ -317,6 +311,7 @@ int do_wait_intr(wait_queue_head_t *wq, wait_queue_entry_t *wait) | |||
| 317 | spin_unlock(&wq->lock); | 311 | spin_unlock(&wq->lock); |
| 318 | schedule(); | 312 | schedule(); |
| 319 | spin_lock(&wq->lock); | 313 | spin_lock(&wq->lock); |
| 314 | |||
| 320 | return 0; | 315 | return 0; |
| 321 | } | 316 | } |
| 322 | EXPORT_SYMBOL(do_wait_intr); | 317 | EXPORT_SYMBOL(do_wait_intr); |
| @@ -333,6 +328,7 @@ int do_wait_intr_irq(wait_queue_head_t *wq, wait_queue_entry_t *wait) | |||
| 333 | spin_unlock_irq(&wq->lock); | 328 | spin_unlock_irq(&wq->lock); |
| 334 | schedule(); | 329 | schedule(); |
| 335 | spin_lock_irq(&wq->lock); | 330 | spin_lock_irq(&wq->lock); |
| 331 | |||
| 336 | return 0; | 332 | return 0; |
| 337 | } | 333 | } |
| 338 | EXPORT_SYMBOL(do_wait_intr_irq); | 334 | EXPORT_SYMBOL(do_wait_intr_irq); |
| @@ -378,6 +374,7 @@ int autoremove_wake_function(struct wait_queue_entry *wq_entry, unsigned mode, i | |||
| 378 | 374 | ||
| 379 | if (ret) | 375 | if (ret) |
| 380 | list_del_init(&wq_entry->entry); | 376 | list_del_init(&wq_entry->entry); |
| 377 | |||
| 381 | return ret; | 378 | return ret; |
| 382 | } | 379 | } |
| 383 | EXPORT_SYMBOL(autoremove_wake_function); | 380 | EXPORT_SYMBOL(autoremove_wake_function); |
diff --git a/kernel/sched/wait_bit.c b/kernel/sched/wait_bit.c index 84cb3acd9260..c67c6d24adc2 100644 --- a/kernel/sched/wait_bit.c +++ b/kernel/sched/wait_bit.c | |||
| @@ -1,10 +1,7 @@ | |||
| 1 | /* | 1 | /* |
| 2 | * The implementation of the wait_bit*() and related waiting APIs: | 2 | * The implementation of the wait_bit*() and related waiting APIs: |
| 3 | */ | 3 | */ |
| 4 | #include <linux/wait_bit.h> | 4 | #include "sched.h" |
| 5 | #include <linux/sched/signal.h> | ||
| 6 | #include <linux/sched/debug.h> | ||
| 7 | #include <linux/hash.h> | ||
| 8 | 5 | ||
| 9 | #define WAIT_TABLE_BITS 8 | 6 | #define WAIT_TABLE_BITS 8 |
| 10 | #define WAIT_TABLE_SIZE (1 << WAIT_TABLE_BITS) | 7 | #define WAIT_TABLE_SIZE (1 << WAIT_TABLE_BITS) |
| @@ -29,8 +26,8 @@ int wake_bit_function(struct wait_queue_entry *wq_entry, unsigned mode, int sync | |||
| 29 | wait_bit->key.bit_nr != key->bit_nr || | 26 | wait_bit->key.bit_nr != key->bit_nr || |
| 30 | test_bit(key->bit_nr, key->flags)) | 27 | test_bit(key->bit_nr, key->flags)) |
| 31 | return 0; | 28 | return 0; |
| 32 | else | 29 | |
| 33 | return autoremove_wake_function(wq_entry, mode, sync, key); | 30 | return autoremove_wake_function(wq_entry, mode, sync, key); |
| 34 | } | 31 | } |
| 35 | EXPORT_SYMBOL(wake_bit_function); | 32 | EXPORT_SYMBOL(wake_bit_function); |
| 36 | 33 | ||
| @@ -50,7 +47,9 @@ __wait_on_bit(struct wait_queue_head *wq_head, struct wait_bit_queue_entry *wbq_ | |||
| 50 | if (test_bit(wbq_entry->key.bit_nr, wbq_entry->key.flags)) | 47 | if (test_bit(wbq_entry->key.bit_nr, wbq_entry->key.flags)) |
| 51 | ret = (*action)(&wbq_entry->key, mode); | 48 | ret = (*action)(&wbq_entry->key, mode); |
| 52 | } while (test_bit(wbq_entry->key.bit_nr, wbq_entry->key.flags) && !ret); | 49 | } while (test_bit(wbq_entry->key.bit_nr, wbq_entry->key.flags) && !ret); |
| 50 | |||
| 53 | finish_wait(wq_head, &wbq_entry->wq_entry); | 51 | finish_wait(wq_head, &wbq_entry->wq_entry); |
| 52 | |||
| 54 | return ret; | 53 | return ret; |
| 55 | } | 54 | } |
| 56 | EXPORT_SYMBOL(__wait_on_bit); | 55 | EXPORT_SYMBOL(__wait_on_bit); |
| @@ -73,6 +72,7 @@ int __sched out_of_line_wait_on_bit_timeout( | |||
| 73 | DEFINE_WAIT_BIT(wq_entry, word, bit); | 72 | DEFINE_WAIT_BIT(wq_entry, word, bit); |
| 74 | 73 | ||
| 75 | wq_entry.key.timeout = jiffies + timeout; | 74 | wq_entry.key.timeout = jiffies + timeout; |
| 75 | |||
| 76 | return __wait_on_bit(wq_head, &wq_entry, action, mode); | 76 | return __wait_on_bit(wq_head, &wq_entry, action, mode); |
| 77 | } | 77 | } |
| 78 | EXPORT_SYMBOL_GPL(out_of_line_wait_on_bit_timeout); | 78 | EXPORT_SYMBOL_GPL(out_of_line_wait_on_bit_timeout); |
| @@ -120,6 +120,7 @@ EXPORT_SYMBOL(out_of_line_wait_on_bit_lock); | |||
| 120 | void __wake_up_bit(struct wait_queue_head *wq_head, void *word, int bit) | 120 | void __wake_up_bit(struct wait_queue_head *wq_head, void *word, int bit) |
| 121 | { | 121 | { |
| 122 | struct wait_bit_key key = __WAIT_BIT_KEY_INITIALIZER(word, bit); | 122 | struct wait_bit_key key = __WAIT_BIT_KEY_INITIALIZER(word, bit); |
| 123 | |||
| 123 | if (waitqueue_active(wq_head)) | 124 | if (waitqueue_active(wq_head)) |
| 124 | __wake_up(wq_head, TASK_NORMAL, 1, &key); | 125 | __wake_up(wq_head, TASK_NORMAL, 1, &key); |
| 125 | } | 126 | } |
| @@ -148,108 +149,55 @@ void wake_up_bit(void *word, int bit) | |||
| 148 | } | 149 | } |
| 149 | EXPORT_SYMBOL(wake_up_bit); | 150 | EXPORT_SYMBOL(wake_up_bit); |
| 150 | 151 | ||
| 151 | /* | 152 | wait_queue_head_t *__var_waitqueue(void *p) |
| 152 | * Manipulate the atomic_t address to produce a better bit waitqueue table hash | ||
| 153 | * index (we're keying off bit -1, but that would produce a horrible hash | ||
| 154 | * value). | ||
| 155 | */ | ||
| 156 | static inline wait_queue_head_t *atomic_t_waitqueue(atomic_t *p) | ||
| 157 | { | 153 | { |
| 158 | if (BITS_PER_LONG == 64) { | 154 | return bit_wait_table + hash_ptr(p, WAIT_TABLE_BITS); |
| 159 | unsigned long q = (unsigned long)p; | ||
| 160 | return bit_waitqueue((void *)(q & ~1), q & 1); | ||
| 161 | } | ||
| 162 | return bit_waitqueue(p, 0); | ||
| 163 | } | 155 | } |
| 156 | EXPORT_SYMBOL(__var_waitqueue); | ||
| 164 | 157 | ||
| 165 | static int wake_atomic_t_function(struct wait_queue_entry *wq_entry, unsigned mode, int sync, | 158 | static int |
| 166 | void *arg) | 159 | var_wake_function(struct wait_queue_entry *wq_entry, unsigned int mode, |
| 160 | int sync, void *arg) | ||
| 167 | { | 161 | { |
| 168 | struct wait_bit_key *key = arg; | 162 | struct wait_bit_key *key = arg; |
| 169 | struct wait_bit_queue_entry *wait_bit = container_of(wq_entry, struct wait_bit_queue_entry, wq_entry); | 163 | struct wait_bit_queue_entry *wbq_entry = |
| 170 | atomic_t *val = key->flags; | 164 | container_of(wq_entry, struct wait_bit_queue_entry, wq_entry); |
| 171 | 165 | ||
| 172 | if (wait_bit->key.flags != key->flags || | 166 | if (wbq_entry->key.flags != key->flags || |
| 173 | wait_bit->key.bit_nr != key->bit_nr || | 167 | wbq_entry->key.bit_nr != key->bit_nr) |
| 174 | atomic_read(val) != 0) | ||
| 175 | return 0; | 168 | return 0; |
| 176 | return autoremove_wake_function(wq_entry, mode, sync, key); | ||
| 177 | } | ||
| 178 | 169 | ||
| 179 | /* | 170 | return autoremove_wake_function(wq_entry, mode, sync, key); |
| 180 | * To allow interruptible waiting and asynchronous (i.e. nonblocking) waiting, | ||
| 181 | * the actions of __wait_on_atomic_t() are permitted return codes. Nonzero | ||
| 182 | * return codes halt waiting and return. | ||
| 183 | */ | ||
| 184 | static __sched | ||
| 185 | int __wait_on_atomic_t(struct wait_queue_head *wq_head, struct wait_bit_queue_entry *wbq_entry, | ||
| 186 | wait_atomic_t_action_f action, unsigned int mode) | ||
| 187 | { | ||
| 188 | atomic_t *val; | ||
| 189 | int ret = 0; | ||
| 190 | |||
| 191 | do { | ||
| 192 | prepare_to_wait(wq_head, &wbq_entry->wq_entry, mode); | ||
| 193 | val = wbq_entry->key.flags; | ||
| 194 | if (atomic_read(val) == 0) | ||
| 195 | break; | ||
| 196 | ret = (*action)(val, mode); | ||
| 197 | } while (!ret && atomic_read(val) != 0); | ||
| 198 | finish_wait(wq_head, &wbq_entry->wq_entry); | ||
| 199 | return ret; | ||
| 200 | } | 171 | } |
| 201 | 172 | ||
| 202 | #define DEFINE_WAIT_ATOMIC_T(name, p) \ | 173 | void init_wait_var_entry(struct wait_bit_queue_entry *wbq_entry, void *var, int flags) |
| 203 | struct wait_bit_queue_entry name = { \ | ||
| 204 | .key = __WAIT_ATOMIC_T_KEY_INITIALIZER(p), \ | ||
| 205 | .wq_entry = { \ | ||
| 206 | .private = current, \ | ||
| 207 | .func = wake_atomic_t_function, \ | ||
| 208 | .entry = \ | ||
| 209 | LIST_HEAD_INIT((name).wq_entry.entry), \ | ||
| 210 | }, \ | ||
| 211 | } | ||
| 212 | |||
| 213 | __sched int out_of_line_wait_on_atomic_t(atomic_t *p, | ||
| 214 | wait_atomic_t_action_f action, | ||
| 215 | unsigned int mode) | ||
| 216 | { | 174 | { |
| 217 | struct wait_queue_head *wq_head = atomic_t_waitqueue(p); | 175 | *wbq_entry = (struct wait_bit_queue_entry){ |
| 218 | DEFINE_WAIT_ATOMIC_T(wq_entry, p); | 176 | .key = { |
| 219 | 177 | .flags = (var), | |
| 220 | return __wait_on_atomic_t(wq_head, &wq_entry, action, mode); | 178 | .bit_nr = -1, |
| 179 | }, | ||
| 180 | .wq_entry = { | ||
| 181 | .private = current, | ||
| 182 | .func = var_wake_function, | ||
| 183 | .entry = LIST_HEAD_INIT(wbq_entry->wq_entry.entry), | ||
| 184 | }, | ||
| 185 | }; | ||
| 221 | } | 186 | } |
| 222 | EXPORT_SYMBOL(out_of_line_wait_on_atomic_t); | 187 | EXPORT_SYMBOL(init_wait_var_entry); |
| 223 | 188 | ||
| 224 | __sched int atomic_t_wait(atomic_t *counter, unsigned int mode) | 189 | void wake_up_var(void *var) |
| 225 | { | 190 | { |
| 226 | schedule(); | 191 | __wake_up_bit(__var_waitqueue(var), var, -1); |
| 227 | if (signal_pending_state(mode, current)) | ||
| 228 | return -EINTR; | ||
| 229 | return 0; | ||
| 230 | } | 192 | } |
| 231 | EXPORT_SYMBOL(atomic_t_wait); | 193 | EXPORT_SYMBOL(wake_up_var); |
| 232 | |||
| 233 | /** | ||
| 234 | * wake_up_atomic_t - Wake up a waiter on a atomic_t | ||
| 235 | * @p: The atomic_t being waited on, a kernel virtual address | ||
| 236 | * | ||
| 237 | * Wake up anyone waiting for the atomic_t to go to zero. | ||
| 238 | * | ||
| 239 | * Abuse the bit-waker function and its waitqueue hash table set (the atomic_t | ||
| 240 | * check is done by the waiter's wake function, not the by the waker itself). | ||
| 241 | */ | ||
| 242 | void wake_up_atomic_t(atomic_t *p) | ||
| 243 | { | ||
| 244 | __wake_up_bit(atomic_t_waitqueue(p), p, WAIT_ATOMIC_T_BIT_NR); | ||
| 245 | } | ||
| 246 | EXPORT_SYMBOL(wake_up_atomic_t); | ||
| 247 | 194 | ||
| 248 | __sched int bit_wait(struct wait_bit_key *word, int mode) | 195 | __sched int bit_wait(struct wait_bit_key *word, int mode) |
| 249 | { | 196 | { |
| 250 | schedule(); | 197 | schedule(); |
| 251 | if (signal_pending_state(mode, current)) | 198 | if (signal_pending_state(mode, current)) |
| 252 | return -EINTR; | 199 | return -EINTR; |
| 200 | |||
| 253 | return 0; | 201 | return 0; |
| 254 | } | 202 | } |
| 255 | EXPORT_SYMBOL(bit_wait); | 203 | EXPORT_SYMBOL(bit_wait); |
| @@ -259,6 +207,7 @@ __sched int bit_wait_io(struct wait_bit_key *word, int mode) | |||
| 259 | io_schedule(); | 207 | io_schedule(); |
| 260 | if (signal_pending_state(mode, current)) | 208 | if (signal_pending_state(mode, current)) |
| 261 | return -EINTR; | 209 | return -EINTR; |
| 210 | |||
| 262 | return 0; | 211 | return 0; |
| 263 | } | 212 | } |
| 264 | EXPORT_SYMBOL(bit_wait_io); | 213 | EXPORT_SYMBOL(bit_wait_io); |
| @@ -266,11 +215,13 @@ EXPORT_SYMBOL(bit_wait_io); | |||
| 266 | __sched int bit_wait_timeout(struct wait_bit_key *word, int mode) | 215 | __sched int bit_wait_timeout(struct wait_bit_key *word, int mode) |
| 267 | { | 216 | { |
| 268 | unsigned long now = READ_ONCE(jiffies); | 217 | unsigned long now = READ_ONCE(jiffies); |
| 218 | |||
| 269 | if (time_after_eq(now, word->timeout)) | 219 | if (time_after_eq(now, word->timeout)) |
| 270 | return -EAGAIN; | 220 | return -EAGAIN; |
| 271 | schedule_timeout(word->timeout - now); | 221 | schedule_timeout(word->timeout - now); |
| 272 | if (signal_pending_state(mode, current)) | 222 | if (signal_pending_state(mode, current)) |
| 273 | return -EINTR; | 223 | return -EINTR; |
| 224 | |||
| 274 | return 0; | 225 | return 0; |
| 275 | } | 226 | } |
| 276 | EXPORT_SYMBOL_GPL(bit_wait_timeout); | 227 | EXPORT_SYMBOL_GPL(bit_wait_timeout); |
| @@ -278,11 +229,13 @@ EXPORT_SYMBOL_GPL(bit_wait_timeout); | |||
| 278 | __sched int bit_wait_io_timeout(struct wait_bit_key *word, int mode) | 229 | __sched int bit_wait_io_timeout(struct wait_bit_key *word, int mode) |
| 279 | { | 230 | { |
| 280 | unsigned long now = READ_ONCE(jiffies); | 231 | unsigned long now = READ_ONCE(jiffies); |
| 232 | |||
| 281 | if (time_after_eq(now, word->timeout)) | 233 | if (time_after_eq(now, word->timeout)) |
| 282 | return -EAGAIN; | 234 | return -EAGAIN; |
| 283 | io_schedule_timeout(word->timeout - now); | 235 | io_schedule_timeout(word->timeout - now); |
| 284 | if (signal_pending_state(mode, current)) | 236 | if (signal_pending_state(mode, current)) |
| 285 | return -EINTR; | 237 | return -EINTR; |
| 238 | |||
| 286 | return 0; | 239 | return 0; |
| 287 | } | 240 | } |
| 288 | EXPORT_SYMBOL_GPL(bit_wait_io_timeout); | 241 | EXPORT_SYMBOL_GPL(bit_wait_io_timeout); |
diff --git a/kernel/signal.c b/kernel/signal.c index c6e4c83dc090..f04466655238 100644 --- a/kernel/signal.c +++ b/kernel/signal.c | |||
| @@ -3573,9 +3573,8 @@ int __save_altstack(stack_t __user *uss, unsigned long sp) | |||
| 3573 | } | 3573 | } |
| 3574 | 3574 | ||
| 3575 | #ifdef CONFIG_COMPAT | 3575 | #ifdef CONFIG_COMPAT |
| 3576 | COMPAT_SYSCALL_DEFINE2(sigaltstack, | 3576 | static int do_compat_sigaltstack(const compat_stack_t __user *uss_ptr, |
| 3577 | const compat_stack_t __user *, uss_ptr, | 3577 | compat_stack_t __user *uoss_ptr) |
| 3578 | compat_stack_t __user *, uoss_ptr) | ||
| 3579 | { | 3578 | { |
| 3580 | stack_t uss, uoss; | 3579 | stack_t uss, uoss; |
| 3581 | int ret; | 3580 | int ret; |
| @@ -3602,9 +3601,16 @@ COMPAT_SYSCALL_DEFINE2(sigaltstack, | |||
| 3602 | return ret; | 3601 | return ret; |
| 3603 | } | 3602 | } |
| 3604 | 3603 | ||
| 3604 | COMPAT_SYSCALL_DEFINE2(sigaltstack, | ||
| 3605 | const compat_stack_t __user *, uss_ptr, | ||
| 3606 | compat_stack_t __user *, uoss_ptr) | ||
| 3607 | { | ||
| 3608 | return do_compat_sigaltstack(uss_ptr, uoss_ptr); | ||
| 3609 | } | ||
| 3610 | |||
| 3605 | int compat_restore_altstack(const compat_stack_t __user *uss) | 3611 | int compat_restore_altstack(const compat_stack_t __user *uss) |
| 3606 | { | 3612 | { |
| 3607 | int err = compat_sys_sigaltstack(uss, NULL); | 3613 | int err = do_compat_sigaltstack(uss, NULL); |
| 3608 | /* squash all but -EFAULT for now */ | 3614 | /* squash all but -EFAULT for now */ |
| 3609 | return err == -EFAULT ? err : 0; | 3615 | return err == -EFAULT ? err : 0; |
| 3610 | } | 3616 | } |
| @@ -3629,11 +3635,20 @@ int __compat_save_altstack(compat_stack_t __user *uss, unsigned long sp) | |||
| 3629 | 3635 | ||
| 3630 | /** | 3636 | /** |
| 3631 | * sys_sigpending - examine pending signals | 3637 | * sys_sigpending - examine pending signals |
| 3632 | * @set: where mask of pending signal is returned | 3638 | * @uset: where mask of pending signal is returned |
| 3633 | */ | 3639 | */ |
| 3634 | SYSCALL_DEFINE1(sigpending, old_sigset_t __user *, set) | 3640 | SYSCALL_DEFINE1(sigpending, old_sigset_t __user *, uset) |
| 3635 | { | 3641 | { |
| 3636 | return sys_rt_sigpending((sigset_t __user *)set, sizeof(old_sigset_t)); | 3642 | sigset_t set; |
| 3643 | int err; | ||
| 3644 | |||
| 3645 | if (sizeof(old_sigset_t) > sizeof(*uset)) | ||
| 3646 | return -EINVAL; | ||
| 3647 | |||
| 3648 | err = do_sigpending(&set); | ||
| 3649 | if (!err && copy_to_user(uset, &set, sizeof(old_sigset_t))) | ||
| 3650 | err = -EFAULT; | ||
| 3651 | return err; | ||
| 3637 | } | 3652 | } |
| 3638 | 3653 | ||
| 3639 | #ifdef CONFIG_COMPAT | 3654 | #ifdef CONFIG_COMPAT |
diff --git a/kernel/sys.c b/kernel/sys.c index f2289de20e19..ad692183dfe9 100644 --- a/kernel/sys.c +++ b/kernel/sys.c | |||
| @@ -69,6 +69,8 @@ | |||
| 69 | #include <asm/io.h> | 69 | #include <asm/io.h> |
| 70 | #include <asm/unistd.h> | 70 | #include <asm/unistd.h> |
| 71 | 71 | ||
| 72 | #include "uid16.h" | ||
| 73 | |||
| 72 | #ifndef SET_UNALIGN_CTL | 74 | #ifndef SET_UNALIGN_CTL |
| 73 | # define SET_UNALIGN_CTL(a, b) (-EINVAL) | 75 | # define SET_UNALIGN_CTL(a, b) (-EINVAL) |
| 74 | #endif | 76 | #endif |
| @@ -340,7 +342,7 @@ out_unlock: | |||
| 340 | * operations (as far as semantic preservation is concerned). | 342 | * operations (as far as semantic preservation is concerned). |
| 341 | */ | 343 | */ |
| 342 | #ifdef CONFIG_MULTIUSER | 344 | #ifdef CONFIG_MULTIUSER |
| 343 | SYSCALL_DEFINE2(setregid, gid_t, rgid, gid_t, egid) | 345 | long __sys_setregid(gid_t rgid, gid_t egid) |
| 344 | { | 346 | { |
| 345 | struct user_namespace *ns = current_user_ns(); | 347 | struct user_namespace *ns = current_user_ns(); |
| 346 | const struct cred *old; | 348 | const struct cred *old; |
| @@ -392,12 +394,17 @@ error: | |||
| 392 | return retval; | 394 | return retval; |
| 393 | } | 395 | } |
| 394 | 396 | ||
| 397 | SYSCALL_DEFINE2(setregid, gid_t, rgid, gid_t, egid) | ||
| 398 | { | ||
| 399 | return __sys_setregid(rgid, egid); | ||
| 400 | } | ||
| 401 | |||
| 395 | /* | 402 | /* |
| 396 | * setgid() is implemented like SysV w/ SAVED_IDS | 403 | * setgid() is implemented like SysV w/ SAVED_IDS |
| 397 | * | 404 | * |
| 398 | * SMP: Same implicit races as above. | 405 | * SMP: Same implicit races as above. |
| 399 | */ | 406 | */ |
| 400 | SYSCALL_DEFINE1(setgid, gid_t, gid) | 407 | long __sys_setgid(gid_t gid) |
| 401 | { | 408 | { |
| 402 | struct user_namespace *ns = current_user_ns(); | 409 | struct user_namespace *ns = current_user_ns(); |
| 403 | const struct cred *old; | 410 | const struct cred *old; |
| @@ -429,6 +436,11 @@ error: | |||
| 429 | return retval; | 436 | return retval; |
| 430 | } | 437 | } |
| 431 | 438 | ||
| 439 | SYSCALL_DEFINE1(setgid, gid_t, gid) | ||
| 440 | { | ||
| 441 | return __sys_setgid(gid); | ||
| 442 | } | ||
| 443 | |||
| 432 | /* | 444 | /* |
| 433 | * change the user struct in a credentials set to match the new UID | 445 | * change the user struct in a credentials set to match the new UID |
| 434 | */ | 446 | */ |
| @@ -473,7 +485,7 @@ static int set_user(struct cred *new) | |||
| 473 | * 100% compatible with BSD. A program which uses just setuid() will be | 485 | * 100% compatible with BSD. A program which uses just setuid() will be |
| 474 | * 100% compatible with POSIX with saved IDs. | 486 | * 100% compatible with POSIX with saved IDs. |
| 475 | */ | 487 | */ |
| 476 | SYSCALL_DEFINE2(setreuid, uid_t, ruid, uid_t, euid) | 488 | long __sys_setreuid(uid_t ruid, uid_t euid) |
| 477 | { | 489 | { |
| 478 | struct user_namespace *ns = current_user_ns(); | 490 | struct user_namespace *ns = current_user_ns(); |
| 479 | const struct cred *old; | 491 | const struct cred *old; |
| @@ -533,6 +545,11 @@ error: | |||
| 533 | return retval; | 545 | return retval; |
| 534 | } | 546 | } |
| 535 | 547 | ||
| 548 | SYSCALL_DEFINE2(setreuid, uid_t, ruid, uid_t, euid) | ||
| 549 | { | ||
| 550 | return __sys_setreuid(ruid, euid); | ||
| 551 | } | ||
| 552 | |||
| 536 | /* | 553 | /* |
| 537 | * setuid() is implemented like SysV with SAVED_IDS | 554 | * setuid() is implemented like SysV with SAVED_IDS |
| 538 | * | 555 | * |
| @@ -544,7 +561,7 @@ error: | |||
| 544 | * will allow a root program to temporarily drop privileges and be able to | 561 | * will allow a root program to temporarily drop privileges and be able to |
| 545 | * regain them by swapping the real and effective uid. | 562 | * regain them by swapping the real and effective uid. |
| 546 | */ | 563 | */ |
| 547 | SYSCALL_DEFINE1(setuid, uid_t, uid) | 564 | long __sys_setuid(uid_t uid) |
| 548 | { | 565 | { |
| 549 | struct user_namespace *ns = current_user_ns(); | 566 | struct user_namespace *ns = current_user_ns(); |
| 550 | const struct cred *old; | 567 | const struct cred *old; |
| @@ -586,12 +603,17 @@ error: | |||
| 586 | return retval; | 603 | return retval; |
| 587 | } | 604 | } |
| 588 | 605 | ||
| 606 | SYSCALL_DEFINE1(setuid, uid_t, uid) | ||
| 607 | { | ||
| 608 | return __sys_setuid(uid); | ||
| 609 | } | ||
| 610 | |||
| 589 | 611 | ||
| 590 | /* | 612 | /* |
| 591 | * This function implements a generic ability to update ruid, euid, | 613 | * This function implements a generic ability to update ruid, euid, |
| 592 | * and suid. This allows you to implement the 4.4 compatible seteuid(). | 614 | * and suid. This allows you to implement the 4.4 compatible seteuid(). |
| 593 | */ | 615 | */ |
| 594 | SYSCALL_DEFINE3(setresuid, uid_t, ruid, uid_t, euid, uid_t, suid) | 616 | long __sys_setresuid(uid_t ruid, uid_t euid, uid_t suid) |
| 595 | { | 617 | { |
| 596 | struct user_namespace *ns = current_user_ns(); | 618 | struct user_namespace *ns = current_user_ns(); |
| 597 | const struct cred *old; | 619 | const struct cred *old; |
| @@ -656,6 +678,11 @@ error: | |||
| 656 | return retval; | 678 | return retval; |
| 657 | } | 679 | } |
| 658 | 680 | ||
| 681 | SYSCALL_DEFINE3(setresuid, uid_t, ruid, uid_t, euid, uid_t, suid) | ||
| 682 | { | ||
| 683 | return __sys_setresuid(ruid, euid, suid); | ||
| 684 | } | ||
| 685 | |||
| 659 | SYSCALL_DEFINE3(getresuid, uid_t __user *, ruidp, uid_t __user *, euidp, uid_t __user *, suidp) | 686 | SYSCALL_DEFINE3(getresuid, uid_t __user *, ruidp, uid_t __user *, euidp, uid_t __user *, suidp) |
| 660 | { | 687 | { |
| 661 | const struct cred *cred = current_cred(); | 688 | const struct cred *cred = current_cred(); |
| @@ -678,7 +705,7 @@ SYSCALL_DEFINE3(getresuid, uid_t __user *, ruidp, uid_t __user *, euidp, uid_t _ | |||
| 678 | /* | 705 | /* |
| 679 | * Same as above, but for rgid, egid, sgid. | 706 | * Same as above, but for rgid, egid, sgid. |
| 680 | */ | 707 | */ |
| 681 | SYSCALL_DEFINE3(setresgid, gid_t, rgid, gid_t, egid, gid_t, sgid) | 708 | long __sys_setresgid(gid_t rgid, gid_t egid, gid_t sgid) |
| 682 | { | 709 | { |
| 683 | struct user_namespace *ns = current_user_ns(); | 710 | struct user_namespace *ns = current_user_ns(); |
| 684 | const struct cred *old; | 711 | const struct cred *old; |
| @@ -730,6 +757,11 @@ error: | |||
| 730 | return retval; | 757 | return retval; |
| 731 | } | 758 | } |
| 732 | 759 | ||
| 760 | SYSCALL_DEFINE3(setresgid, gid_t, rgid, gid_t, egid, gid_t, sgid) | ||
| 761 | { | ||
| 762 | return __sys_setresgid(rgid, egid, sgid); | ||
| 763 | } | ||
| 764 | |||
| 733 | SYSCALL_DEFINE3(getresgid, gid_t __user *, rgidp, gid_t __user *, egidp, gid_t __user *, sgidp) | 765 | SYSCALL_DEFINE3(getresgid, gid_t __user *, rgidp, gid_t __user *, egidp, gid_t __user *, sgidp) |
| 734 | { | 766 | { |
| 735 | const struct cred *cred = current_cred(); | 767 | const struct cred *cred = current_cred(); |
| @@ -757,7 +789,7 @@ SYSCALL_DEFINE3(getresgid, gid_t __user *, rgidp, gid_t __user *, egidp, gid_t _ | |||
| 757 | * whatever uid it wants to). It normally shadows "euid", except when | 789 | * whatever uid it wants to). It normally shadows "euid", except when |
| 758 | * explicitly set by setfsuid() or for access.. | 790 | * explicitly set by setfsuid() or for access.. |
| 759 | */ | 791 | */ |
| 760 | SYSCALL_DEFINE1(setfsuid, uid_t, uid) | 792 | long __sys_setfsuid(uid_t uid) |
| 761 | { | 793 | { |
| 762 | const struct cred *old; | 794 | const struct cred *old; |
| 763 | struct cred *new; | 795 | struct cred *new; |
| @@ -793,10 +825,15 @@ change_okay: | |||
| 793 | return old_fsuid; | 825 | return old_fsuid; |
| 794 | } | 826 | } |
| 795 | 827 | ||
| 828 | SYSCALL_DEFINE1(setfsuid, uid_t, uid) | ||
| 829 | { | ||
| 830 | return __sys_setfsuid(uid); | ||
| 831 | } | ||
| 832 | |||
| 796 | /* | 833 | /* |
| 797 | * Samma på svenska.. | 834 | * Samma på svenska.. |
| 798 | */ | 835 | */ |
| 799 | SYSCALL_DEFINE1(setfsgid, gid_t, gid) | 836 | long __sys_setfsgid(gid_t gid) |
| 800 | { | 837 | { |
| 801 | const struct cred *old; | 838 | const struct cred *old; |
| 802 | struct cred *new; | 839 | struct cred *new; |
| @@ -830,6 +867,11 @@ change_okay: | |||
| 830 | commit_creds(new); | 867 | commit_creds(new); |
| 831 | return old_fsgid; | 868 | return old_fsgid; |
| 832 | } | 869 | } |
| 870 | |||
| 871 | SYSCALL_DEFINE1(setfsgid, gid_t, gid) | ||
| 872 | { | ||
| 873 | return __sys_setfsgid(gid); | ||
| 874 | } | ||
| 833 | #endif /* CONFIG_MULTIUSER */ | 875 | #endif /* CONFIG_MULTIUSER */ |
| 834 | 876 | ||
| 835 | /** | 877 | /** |
| @@ -1027,7 +1069,7 @@ out: | |||
| 1027 | return err; | 1069 | return err; |
| 1028 | } | 1070 | } |
| 1029 | 1071 | ||
| 1030 | SYSCALL_DEFINE1(getpgid, pid_t, pid) | 1072 | static int do_getpgid(pid_t pid) |
| 1031 | { | 1073 | { |
| 1032 | struct task_struct *p; | 1074 | struct task_struct *p; |
| 1033 | struct pid *grp; | 1075 | struct pid *grp; |
| @@ -1055,11 +1097,16 @@ out: | |||
| 1055 | return retval; | 1097 | return retval; |
| 1056 | } | 1098 | } |
| 1057 | 1099 | ||
| 1100 | SYSCALL_DEFINE1(getpgid, pid_t, pid) | ||
| 1101 | { | ||
| 1102 | return do_getpgid(pid); | ||
| 1103 | } | ||
| 1104 | |||
| 1058 | #ifdef __ARCH_WANT_SYS_GETPGRP | 1105 | #ifdef __ARCH_WANT_SYS_GETPGRP |
| 1059 | 1106 | ||
| 1060 | SYSCALL_DEFINE0(getpgrp) | 1107 | SYSCALL_DEFINE0(getpgrp) |
| 1061 | { | 1108 | { |
| 1062 | return sys_getpgid(0); | 1109 | return do_getpgid(0); |
| 1063 | } | 1110 | } |
| 1064 | 1111 | ||
| 1065 | #endif | 1112 | #endif |
| @@ -1103,7 +1150,7 @@ static void set_special_pids(struct pid *pid) | |||
| 1103 | change_pid(curr, PIDTYPE_PGID, pid); | 1150 | change_pid(curr, PIDTYPE_PGID, pid); |
| 1104 | } | 1151 | } |
| 1105 | 1152 | ||
| 1106 | SYSCALL_DEFINE0(setsid) | 1153 | int ksys_setsid(void) |
| 1107 | { | 1154 | { |
| 1108 | struct task_struct *group_leader = current->group_leader; | 1155 | struct task_struct *group_leader = current->group_leader; |
| 1109 | struct pid *sid = task_pid(group_leader); | 1156 | struct pid *sid = task_pid(group_leader); |
| @@ -1136,6 +1183,11 @@ out: | |||
| 1136 | return err; | 1183 | return err; |
| 1137 | } | 1184 | } |
| 1138 | 1185 | ||
| 1186 | SYSCALL_DEFINE0(setsid) | ||
| 1187 | { | ||
| 1188 | return ksys_setsid(); | ||
| 1189 | } | ||
| 1190 | |||
| 1139 | DECLARE_RWSEM(uts_sem); | 1191 | DECLARE_RWSEM(uts_sem); |
| 1140 | 1192 | ||
| 1141 | #ifdef COMPAT_UTS_MACHINE | 1193 | #ifdef COMPAT_UTS_MACHINE |
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index b5189762d275..6cafc008f6db 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c | |||
| @@ -17,245 +17,406 @@ asmlinkage long sys_ni_syscall(void) | |||
| 17 | return -ENOSYS; | 17 | return -ENOSYS; |
| 18 | } | 18 | } |
| 19 | 19 | ||
| 20 | cond_syscall(sys_quotactl); | 20 | #define COND_SYSCALL(name) cond_syscall(sys_##name) |
| 21 | cond_syscall(sys32_quotactl); | 21 | #define COND_SYSCALL_COMPAT(name) cond_syscall(compat_sys_##name) |
| 22 | cond_syscall(sys_acct); | 22 | |
| 23 | cond_syscall(sys_lookup_dcookie); | 23 | /* |
| 24 | cond_syscall(compat_sys_lookup_dcookie); | 24 | * This list is kept in the same order as include/uapi/asm-generic/unistd.h. |
| 25 | cond_syscall(sys_swapon); | 25 | * Architecture specific entries go below, followed by deprecated or obsolete |
| 26 | cond_syscall(sys_swapoff); | 26 | * system calls. |
| 27 | cond_syscall(sys_kexec_load); | 27 | */ |
| 28 | cond_syscall(compat_sys_kexec_load); | 28 | |
| 29 | cond_syscall(sys_kexec_file_load); | 29 | COND_SYSCALL(io_setup); |
| 30 | cond_syscall(sys_init_module); | 30 | COND_SYSCALL_COMPAT(io_setup); |
| 31 | cond_syscall(sys_finit_module); | 31 | COND_SYSCALL(io_destroy); |
| 32 | cond_syscall(sys_delete_module); | 32 | COND_SYSCALL(io_submit); |
| 33 | cond_syscall(sys_socketpair); | 33 | COND_SYSCALL_COMPAT(io_submit); |
| 34 | cond_syscall(sys_bind); | 34 | COND_SYSCALL(io_cancel); |
| 35 | cond_syscall(sys_listen); | 35 | COND_SYSCALL(io_getevents); |
| 36 | cond_syscall(sys_accept); | 36 | COND_SYSCALL_COMPAT(io_getevents); |
| 37 | cond_syscall(sys_accept4); | 37 | |
| 38 | cond_syscall(sys_connect); | 38 | /* fs/xattr.c */ |
| 39 | cond_syscall(sys_getsockname); | 39 | |
| 40 | cond_syscall(sys_getpeername); | 40 | /* fs/dcache.c */ |
| 41 | cond_syscall(sys_sendto); | 41 | |
| 42 | cond_syscall(sys_send); | 42 | /* fs/cookies.c */ |
| 43 | cond_syscall(sys_recvfrom); | 43 | COND_SYSCALL(lookup_dcookie); |
| 44 | cond_syscall(sys_recv); | 44 | COND_SYSCALL_COMPAT(lookup_dcookie); |
| 45 | cond_syscall(sys_socket); | 45 | |
| 46 | cond_syscall(sys_setsockopt); | 46 | /* fs/eventfd.c */ |
| 47 | cond_syscall(compat_sys_setsockopt); | 47 | COND_SYSCALL(eventfd2); |
| 48 | cond_syscall(sys_getsockopt); | 48 | |
| 49 | cond_syscall(compat_sys_getsockopt); | 49 | /* fs/eventfd.c */ |
| 50 | cond_syscall(sys_shutdown); | 50 | COND_SYSCALL(epoll_create1); |
| 51 | cond_syscall(sys_sendmsg); | 51 | COND_SYSCALL(epoll_ctl); |
| 52 | cond_syscall(sys_sendmmsg); | 52 | COND_SYSCALL(epoll_pwait); |
| 53 | cond_syscall(compat_sys_sendmsg); | 53 | COND_SYSCALL_COMPAT(epoll_pwait); |
| 54 | cond_syscall(compat_sys_sendmmsg); | 54 | |
| 55 | cond_syscall(sys_recvmsg); | 55 | /* fs/fcntl.c */ |
| 56 | cond_syscall(sys_recvmmsg); | 56 | |
| 57 | cond_syscall(compat_sys_recvmsg); | 57 | /* fs/inotify_user.c */ |
| 58 | cond_syscall(compat_sys_recv); | 58 | COND_SYSCALL(inotify_init1); |
| 59 | cond_syscall(compat_sys_recvfrom); | 59 | COND_SYSCALL(inotify_add_watch); |
| 60 | cond_syscall(compat_sys_recvmmsg); | 60 | COND_SYSCALL(inotify_rm_watch); |
| 61 | cond_syscall(sys_socketcall); | 61 | |
| 62 | cond_syscall(sys_futex); | 62 | /* fs/ioctl.c */ |
| 63 | cond_syscall(compat_sys_futex); | 63 | |
| 64 | cond_syscall(sys_set_robust_list); | 64 | /* fs/ioprio.c */ |
| 65 | cond_syscall(compat_sys_set_robust_list); | 65 | COND_SYSCALL(ioprio_set); |
| 66 | cond_syscall(sys_get_robust_list); | 66 | COND_SYSCALL(ioprio_get); |
| 67 | cond_syscall(compat_sys_get_robust_list); | 67 | |
| 68 | cond_syscall(sys_epoll_create); | 68 | /* fs/locks.c */ |
| 69 | cond_syscall(sys_epoll_create1); | 69 | COND_SYSCALL(flock); |
| 70 | cond_syscall(sys_epoll_ctl); | 70 | |
| 71 | cond_syscall(sys_epoll_wait); | 71 | /* fs/namei.c */ |
| 72 | cond_syscall(sys_epoll_pwait); | 72 | |
| 73 | cond_syscall(compat_sys_epoll_pwait); | 73 | /* fs/namespace.c */ |
| 74 | cond_syscall(sys_semget); | 74 | |
| 75 | cond_syscall(sys_semop); | 75 | /* fs/nfsctl.c */ |
| 76 | cond_syscall(sys_semtimedop); | 76 | |
| 77 | cond_syscall(compat_sys_semtimedop); | 77 | /* fs/open.c */ |
| 78 | cond_syscall(sys_semctl); | 78 | |
| 79 | cond_syscall(compat_sys_semctl); | 79 | /* fs/pipe.c */ |
| 80 | cond_syscall(sys_msgget); | 80 | |
| 81 | cond_syscall(sys_msgsnd); | 81 | /* fs/quota.c */ |
| 82 | cond_syscall(compat_sys_msgsnd); | 82 | COND_SYSCALL(quotactl); |
| 83 | cond_syscall(sys_msgrcv); | 83 | |
| 84 | cond_syscall(compat_sys_msgrcv); | 84 | /* fs/readdir.c */ |
| 85 | cond_syscall(sys_msgctl); | 85 | |
| 86 | cond_syscall(compat_sys_msgctl); | 86 | /* fs/read_write.c */ |
| 87 | cond_syscall(sys_shmget); | 87 | |
| 88 | cond_syscall(sys_shmat); | 88 | /* fs/sendfile.c */ |
| 89 | cond_syscall(compat_sys_shmat); | 89 | |
| 90 | cond_syscall(sys_shmdt); | 90 | /* fs/select.c */ |
| 91 | cond_syscall(sys_shmctl); | 91 | |
| 92 | cond_syscall(compat_sys_shmctl); | 92 | /* fs/signalfd.c */ |
| 93 | cond_syscall(sys_mq_open); | 93 | COND_SYSCALL(signalfd4); |
| 94 | cond_syscall(sys_mq_unlink); | 94 | COND_SYSCALL_COMPAT(signalfd4); |
| 95 | cond_syscall(sys_mq_timedsend); | 95 | |
| 96 | cond_syscall(sys_mq_timedreceive); | 96 | /* fs/splice.c */ |
| 97 | cond_syscall(sys_mq_notify); | 97 | |
| 98 | cond_syscall(sys_mq_getsetattr); | 98 | /* fs/stat.c */ |
| 99 | cond_syscall(compat_sys_mq_open); | 99 | |
| 100 | cond_syscall(compat_sys_mq_timedsend); | 100 | /* fs/sync.c */ |
| 101 | cond_syscall(compat_sys_mq_timedreceive); | 101 | |
| 102 | cond_syscall(compat_sys_mq_notify); | 102 | /* fs/timerfd.c */ |
| 103 | cond_syscall(compat_sys_mq_getsetattr); | 103 | COND_SYSCALL(timerfd_create); |
| 104 | cond_syscall(sys_mbind); | 104 | COND_SYSCALL(timerfd_settime); |
| 105 | cond_syscall(sys_get_mempolicy); | 105 | COND_SYSCALL_COMPAT(timerfd_settime); |
| 106 | cond_syscall(sys_set_mempolicy); | 106 | COND_SYSCALL(timerfd_gettime); |
| 107 | cond_syscall(compat_sys_mbind); | 107 | COND_SYSCALL_COMPAT(timerfd_gettime); |
| 108 | cond_syscall(compat_sys_get_mempolicy); | 108 | |
| 109 | cond_syscall(compat_sys_set_mempolicy); | 109 | /* fs/utimes.c */ |
| 110 | cond_syscall(sys_add_key); | 110 | |
| 111 | cond_syscall(sys_request_key); | 111 | /* kernel/acct.c */ |
| 112 | cond_syscall(sys_keyctl); | 112 | COND_SYSCALL(acct); |
| 113 | cond_syscall(compat_sys_keyctl); | 113 | |
| 114 | cond_syscall(compat_sys_socketcall); | 114 | /* kernel/capability.c */ |
| 115 | cond_syscall(sys_inotify_init); | 115 | COND_SYSCALL(capget); |
| 116 | cond_syscall(sys_inotify_init1); | 116 | COND_SYSCALL(capset); |
| 117 | cond_syscall(sys_inotify_add_watch); | 117 | |
| 118 | cond_syscall(sys_inotify_rm_watch); | 118 | /* kernel/exec_domain.c */ |
| 119 | cond_syscall(sys_migrate_pages); | 119 | |
| 120 | cond_syscall(sys_move_pages); | 120 | /* kernel/exit.c */ |
| 121 | cond_syscall(sys_chown16); | 121 | |
| 122 | cond_syscall(sys_fchown16); | 122 | /* kernel/fork.c */ |
| 123 | cond_syscall(sys_getegid16); | 123 | |
| 124 | cond_syscall(sys_geteuid16); | 124 | /* kernel/futex.c */ |
| 125 | cond_syscall(sys_getgid16); | 125 | COND_SYSCALL(futex); |
| 126 | cond_syscall(sys_getgroups16); | 126 | COND_SYSCALL_COMPAT(futex); |
| 127 | cond_syscall(sys_getresgid16); | 127 | COND_SYSCALL(set_robust_list); |
| 128 | cond_syscall(sys_getresuid16); | 128 | COND_SYSCALL_COMPAT(set_robust_list); |
| 129 | cond_syscall(sys_getuid16); | 129 | COND_SYSCALL(get_robust_list); |
| 130 | cond_syscall(sys_lchown16); | 130 | COND_SYSCALL_COMPAT(get_robust_list); |
| 131 | cond_syscall(sys_setfsgid16); | 131 | |
| 132 | cond_syscall(sys_setfsuid16); | 132 | /* kernel/hrtimer.c */ |
| 133 | cond_syscall(sys_setgid16); | 133 | |
| 134 | cond_syscall(sys_setgroups16); | 134 | /* kernel/itimer.c */ |
| 135 | cond_syscall(sys_setregid16); | 135 | |
| 136 | cond_syscall(sys_setresgid16); | 136 | /* kernel/kexec.c */ |
| 137 | cond_syscall(sys_setresuid16); | 137 | COND_SYSCALL(kexec_load); |
| 138 | cond_syscall(sys_setreuid16); | 138 | COND_SYSCALL_COMPAT(kexec_load); |
| 139 | cond_syscall(sys_setuid16); | 139 | |
| 140 | cond_syscall(sys_sgetmask); | 140 | /* kernel/module.c */ |
| 141 | cond_syscall(sys_ssetmask); | 141 | COND_SYSCALL(init_module); |
| 142 | cond_syscall(sys_vm86old); | 142 | COND_SYSCALL(delete_module); |
| 143 | cond_syscall(sys_vm86); | 143 | |
| 144 | cond_syscall(sys_modify_ldt); | 144 | /* kernel/posix-timers.c */ |
| 145 | cond_syscall(sys_ipc); | 145 | |
| 146 | cond_syscall(compat_sys_ipc); | 146 | /* kernel/printk.c */ |
| 147 | cond_syscall(compat_sys_sysctl); | 147 | COND_SYSCALL(syslog); |
| 148 | cond_syscall(sys_flock); | 148 | |
| 149 | cond_syscall(sys_io_setup); | 149 | /* kernel/ptrace.c */ |
| 150 | cond_syscall(sys_io_destroy); | 150 | |
| 151 | cond_syscall(sys_io_submit); | 151 | /* kernel/sched/core.c */ |
| 152 | cond_syscall(sys_io_cancel); | 152 | |
| 153 | cond_syscall(sys_io_getevents); | 153 | /* kernel/signal.c */ |
| 154 | cond_syscall(compat_sys_io_setup); | 154 | |
| 155 | cond_syscall(compat_sys_io_submit); | 155 | /* kernel/sys.c */ |
| 156 | cond_syscall(compat_sys_io_getevents); | 156 | COND_SYSCALL(setregid); |
| 157 | cond_syscall(sys_sysfs); | 157 | COND_SYSCALL(setgid); |
| 158 | cond_syscall(sys_syslog); | 158 | COND_SYSCALL(setreuid); |
| 159 | cond_syscall(sys_process_vm_readv); | 159 | COND_SYSCALL(setuid); |
| 160 | cond_syscall(sys_process_vm_writev); | 160 | COND_SYSCALL(setresuid); |
| 161 | cond_syscall(compat_sys_process_vm_readv); | 161 | COND_SYSCALL(getresuid); |
| 162 | cond_syscall(compat_sys_process_vm_writev); | 162 | COND_SYSCALL(setresgid); |
| 163 | cond_syscall(sys_uselib); | 163 | COND_SYSCALL(getresgid); |
| 164 | cond_syscall(sys_fadvise64); | 164 | COND_SYSCALL(setfsuid); |
| 165 | cond_syscall(sys_fadvise64_64); | 165 | COND_SYSCALL(setfsgid); |
| 166 | cond_syscall(sys_madvise); | 166 | COND_SYSCALL(setgroups); |
| 167 | cond_syscall(sys_setuid); | 167 | COND_SYSCALL(getgroups); |
| 168 | cond_syscall(sys_setregid); | 168 | |
| 169 | cond_syscall(sys_setgid); | 169 | /* kernel/time.c */ |
| 170 | cond_syscall(sys_setreuid); | 170 | |
| 171 | cond_syscall(sys_setresuid); | 171 | /* kernel/timer.c */ |
| 172 | cond_syscall(sys_getresuid); | 172 | |
| 173 | cond_syscall(sys_setresgid); | 173 | /* ipc/mqueue.c */ |
| 174 | cond_syscall(sys_getresgid); | 174 | COND_SYSCALL(mq_open); |
| 175 | cond_syscall(sys_setgroups); | 175 | COND_SYSCALL_COMPAT(mq_open); |
| 176 | cond_syscall(sys_getgroups); | 176 | COND_SYSCALL(mq_unlink); |
| 177 | cond_syscall(sys_setfsuid); | 177 | COND_SYSCALL(mq_timedsend); |
| 178 | cond_syscall(sys_setfsgid); | 178 | COND_SYSCALL_COMPAT(mq_timedsend); |
| 179 | cond_syscall(sys_capget); | 179 | COND_SYSCALL(mq_timedreceive); |
| 180 | cond_syscall(sys_capset); | 180 | COND_SYSCALL_COMPAT(mq_timedreceive); |
| 181 | cond_syscall(sys_copy_file_range); | 181 | COND_SYSCALL(mq_notify); |
| 182 | 182 | COND_SYSCALL_COMPAT(mq_notify); | |
| 183 | /* arch-specific weak syscall entries */ | 183 | COND_SYSCALL(mq_getsetattr); |
| 184 | cond_syscall(sys_pciconfig_read); | 184 | COND_SYSCALL_COMPAT(mq_getsetattr); |
| 185 | cond_syscall(sys_pciconfig_write); | 185 | |
| 186 | cond_syscall(sys_pciconfig_iobase); | 186 | /* ipc/msg.c */ |
| 187 | cond_syscall(compat_sys_s390_ipc); | 187 | COND_SYSCALL(msgget); |
| 188 | cond_syscall(ppc_rtas); | 188 | COND_SYSCALL(msgctl); |
| 189 | cond_syscall(sys_spu_run); | 189 | COND_SYSCALL_COMPAT(msgctl); |
| 190 | cond_syscall(sys_spu_create); | 190 | COND_SYSCALL(msgrcv); |
| 191 | cond_syscall(sys_subpage_prot); | 191 | COND_SYSCALL_COMPAT(msgrcv); |
| 192 | cond_syscall(sys_s390_pci_mmio_read); | 192 | COND_SYSCALL(msgsnd); |
| 193 | cond_syscall(sys_s390_pci_mmio_write); | 193 | COND_SYSCALL_COMPAT(msgsnd); |
| 194 | 194 | ||
| 195 | /* mmu depending weak syscall entries */ | 195 | /* ipc/sem.c */ |
| 196 | cond_syscall(sys_mprotect); | 196 | COND_SYSCALL(semget); |
| 197 | cond_syscall(sys_msync); | 197 | COND_SYSCALL(semctl); |
| 198 | cond_syscall(sys_mlock); | 198 | COND_SYSCALL_COMPAT(semctl); |
| 199 | cond_syscall(sys_munlock); | 199 | COND_SYSCALL(semtimedop); |
| 200 | cond_syscall(sys_mlockall); | 200 | COND_SYSCALL_COMPAT(semtimedop); |
| 201 | cond_syscall(sys_munlockall); | 201 | COND_SYSCALL(semop); |
| 202 | cond_syscall(sys_mlock2); | 202 | |
| 203 | cond_syscall(sys_mincore); | 203 | /* ipc/shm.c */ |
| 204 | cond_syscall(sys_madvise); | 204 | COND_SYSCALL(shmget); |
| 205 | cond_syscall(sys_mremap); | 205 | COND_SYSCALL(shmctl); |
| 206 | cond_syscall(sys_remap_file_pages); | 206 | COND_SYSCALL_COMPAT(shmctl); |
| 207 | cond_syscall(compat_sys_move_pages); | 207 | COND_SYSCALL(shmat); |
| 208 | cond_syscall(compat_sys_migrate_pages); | 208 | COND_SYSCALL_COMPAT(shmat); |
| 209 | 209 | COND_SYSCALL(shmdt); | |
| 210 | /* block-layer dependent */ | 210 | |
| 211 | cond_syscall(sys_bdflush); | 211 | /* net/socket.c */ |
| 212 | cond_syscall(sys_ioprio_set); | 212 | COND_SYSCALL(socket); |
| 213 | cond_syscall(sys_ioprio_get); | 213 | COND_SYSCALL(socketpair); |
| 214 | 214 | COND_SYSCALL(bind); | |
| 215 | /* New file descriptors */ | 215 | COND_SYSCALL(listen); |
| 216 | cond_syscall(sys_signalfd); | 216 | COND_SYSCALL(accept); |
| 217 | cond_syscall(sys_signalfd4); | 217 | COND_SYSCALL(connect); |
| 218 | cond_syscall(compat_sys_signalfd); | 218 | COND_SYSCALL(getsockname); |
| 219 | cond_syscall(compat_sys_signalfd4); | 219 | COND_SYSCALL(getpeername); |
| 220 | cond_syscall(sys_timerfd_create); | 220 | COND_SYSCALL(setsockopt); |
| 221 | cond_syscall(sys_timerfd_settime); | 221 | COND_SYSCALL_COMPAT(setsockopt); |
| 222 | cond_syscall(sys_timerfd_gettime); | 222 | COND_SYSCALL(getsockopt); |
| 223 | cond_syscall(compat_sys_timerfd_settime); | 223 | COND_SYSCALL_COMPAT(getsockopt); |
| 224 | cond_syscall(compat_sys_timerfd_gettime); | 224 | COND_SYSCALL(sendto); |
| 225 | cond_syscall(sys_eventfd); | 225 | COND_SYSCALL(shutdown); |
| 226 | cond_syscall(sys_eventfd2); | 226 | COND_SYSCALL(recvfrom); |
| 227 | cond_syscall(sys_memfd_create); | 227 | COND_SYSCALL_COMPAT(recvfrom); |
| 228 | cond_syscall(sys_userfaultfd); | 228 | COND_SYSCALL(sendmsg); |
| 229 | 229 | COND_SYSCALL_COMPAT(sendmsg); | |
| 230 | /* performance counters: */ | 230 | COND_SYSCALL(recvmsg); |
| 231 | cond_syscall(sys_perf_event_open); | 231 | COND_SYSCALL_COMPAT(recvmsg); |
| 232 | 232 | ||
| 233 | /* fanotify! */ | 233 | /* mm/filemap.c */ |
| 234 | cond_syscall(sys_fanotify_init); | 234 | |
| 235 | cond_syscall(sys_fanotify_mark); | 235 | /* mm/nommu.c, also with MMU */ |
| 236 | cond_syscall(compat_sys_fanotify_mark); | 236 | COND_SYSCALL(mremap); |
| 237 | |||
| 238 | /* security/keys/keyctl.c */ | ||
| 239 | COND_SYSCALL(add_key); | ||
| 240 | COND_SYSCALL(request_key); | ||
| 241 | COND_SYSCALL(keyctl); | ||
| 242 | COND_SYSCALL_COMPAT(keyctl); | ||
| 243 | |||
| 244 | /* arch/example/kernel/sys_example.c */ | ||
| 245 | |||
| 246 | /* mm/fadvise.c */ | ||
| 247 | COND_SYSCALL(fadvise64_64); | ||
| 248 | |||
| 249 | /* mm/, CONFIG_MMU only */ | ||
| 250 | COND_SYSCALL(swapon); | ||
| 251 | COND_SYSCALL(swapoff); | ||
| 252 | COND_SYSCALL(mprotect); | ||
| 253 | COND_SYSCALL(msync); | ||
| 254 | COND_SYSCALL(mlock); | ||
| 255 | COND_SYSCALL(munlock); | ||
| 256 | COND_SYSCALL(mlockall); | ||
| 257 | COND_SYSCALL(munlockall); | ||
| 258 | COND_SYSCALL(mincore); | ||
| 259 | COND_SYSCALL(madvise); | ||
| 260 | COND_SYSCALL(remap_file_pages); | ||
| 261 | COND_SYSCALL(mbind); | ||
| 262 | COND_SYSCALL_COMPAT(mbind); | ||
| 263 | COND_SYSCALL(get_mempolicy); | ||
| 264 | COND_SYSCALL_COMPAT(get_mempolicy); | ||
| 265 | COND_SYSCALL(set_mempolicy); | ||
| 266 | COND_SYSCALL_COMPAT(set_mempolicy); | ||
| 267 | COND_SYSCALL(migrate_pages); | ||
| 268 | COND_SYSCALL_COMPAT(migrate_pages); | ||
| 269 | COND_SYSCALL(move_pages); | ||
| 270 | COND_SYSCALL_COMPAT(move_pages); | ||
| 271 | |||
| 272 | COND_SYSCALL(perf_event_open); | ||
| 273 | COND_SYSCALL(accept4); | ||
| 274 | COND_SYSCALL(recvmmsg); | ||
| 275 | COND_SYSCALL_COMPAT(recvmmsg); | ||
| 276 | |||
| 277 | /* | ||
| 278 | * Architecture specific syscalls: see further below | ||
| 279 | */ | ||
| 280 | |||
| 281 | /* fanotify */ | ||
| 282 | COND_SYSCALL(fanotify_init); | ||
| 283 | COND_SYSCALL(fanotify_mark); | ||
| 237 | 284 | ||
| 238 | /* open by handle */ | 285 | /* open by handle */ |
| 239 | cond_syscall(sys_name_to_handle_at); | 286 | COND_SYSCALL(name_to_handle_at); |
| 240 | cond_syscall(sys_open_by_handle_at); | 287 | COND_SYSCALL(open_by_handle_at); |
| 241 | cond_syscall(compat_sys_open_by_handle_at); | 288 | COND_SYSCALL_COMPAT(open_by_handle_at); |
| 289 | |||
| 290 | COND_SYSCALL(sendmmsg); | ||
| 291 | COND_SYSCALL_COMPAT(sendmmsg); | ||
| 292 | COND_SYSCALL(process_vm_readv); | ||
| 293 | COND_SYSCALL_COMPAT(process_vm_readv); | ||
| 294 | COND_SYSCALL(process_vm_writev); | ||
| 295 | COND_SYSCALL_COMPAT(process_vm_writev); | ||
| 242 | 296 | ||
| 243 | /* compare kernel pointers */ | 297 | /* compare kernel pointers */ |
| 244 | cond_syscall(sys_kcmp); | 298 | COND_SYSCALL(kcmp); |
| 299 | |||
| 300 | COND_SYSCALL(finit_module); | ||
| 245 | 301 | ||
| 246 | /* operate on Secure Computing state */ | 302 | /* operate on Secure Computing state */ |
| 247 | cond_syscall(sys_seccomp); | 303 | COND_SYSCALL(seccomp); |
| 304 | |||
| 305 | COND_SYSCALL(memfd_create); | ||
| 248 | 306 | ||
| 249 | /* access BPF programs and maps */ | 307 | /* access BPF programs and maps */ |
| 250 | cond_syscall(sys_bpf); | 308 | COND_SYSCALL(bpf); |
| 251 | 309 | ||
| 252 | /* execveat */ | 310 | /* execveat */ |
| 253 | cond_syscall(sys_execveat); | 311 | COND_SYSCALL(execveat); |
| 312 | |||
| 313 | COND_SYSCALL(userfaultfd); | ||
| 254 | 314 | ||
| 255 | /* membarrier */ | 315 | /* membarrier */ |
| 256 | cond_syscall(sys_membarrier); | 316 | COND_SYSCALL(membarrier); |
| 317 | |||
| 318 | COND_SYSCALL(mlock2); | ||
| 319 | |||
| 320 | COND_SYSCALL(copy_file_range); | ||
| 257 | 321 | ||
| 258 | /* memory protection keys */ | 322 | /* memory protection keys */ |
| 259 | cond_syscall(sys_pkey_mprotect); | 323 | COND_SYSCALL(pkey_mprotect); |
| 260 | cond_syscall(sys_pkey_alloc); | 324 | COND_SYSCALL(pkey_alloc); |
| 261 | cond_syscall(sys_pkey_free); | 325 | COND_SYSCALL(pkey_free); |
| 326 | |||
| 327 | |||
| 328 | /* | ||
| 329 | * Architecture specific weak syscall entries. | ||
| 330 | */ | ||
| 331 | |||
| 332 | /* pciconfig: alpha, arm, arm64, ia64, sparc */ | ||
| 333 | COND_SYSCALL(pciconfig_read); | ||
| 334 | COND_SYSCALL(pciconfig_write); | ||
| 335 | COND_SYSCALL(pciconfig_iobase); | ||
| 336 | |||
| 337 | /* sys_socketcall: arm, mips, x86, ... */ | ||
| 338 | COND_SYSCALL(socketcall); | ||
| 339 | COND_SYSCALL_COMPAT(socketcall); | ||
| 340 | |||
| 341 | /* compat syscalls for arm64, x86, ... */ | ||
| 342 | COND_SYSCALL_COMPAT(sysctl); | ||
| 343 | COND_SYSCALL_COMPAT(fanotify_mark); | ||
| 344 | |||
| 345 | /* x86 */ | ||
| 346 | COND_SYSCALL(vm86old); | ||
| 347 | COND_SYSCALL(modify_ldt); | ||
| 348 | COND_SYSCALL_COMPAT(quotactl32); | ||
| 349 | COND_SYSCALL(vm86); | ||
| 350 | COND_SYSCALL(kexec_file_load); | ||
| 351 | |||
| 352 | /* s390 */ | ||
| 353 | COND_SYSCALL(s390_pci_mmio_read); | ||
| 354 | COND_SYSCALL(s390_pci_mmio_write); | ||
| 355 | COND_SYSCALL_COMPAT(s390_ipc); | ||
| 356 | |||
| 357 | /* powerpc */ | ||
| 358 | cond_syscall(ppc_rtas); | ||
| 359 | COND_SYSCALL(spu_run); | ||
| 360 | COND_SYSCALL(spu_create); | ||
| 361 | COND_SYSCALL(subpage_prot); | ||
| 362 | |||
| 363 | |||
| 364 | /* | ||
| 365 | * Deprecated system calls which are still defined in | ||
| 366 | * include/uapi/asm-generic/unistd.h and wanted by >= 1 arch | ||
| 367 | */ | ||
| 368 | |||
| 369 | /* __ARCH_WANT_SYSCALL_NO_FLAGS */ | ||
| 370 | COND_SYSCALL(epoll_create); | ||
| 371 | COND_SYSCALL(inotify_init); | ||
| 372 | COND_SYSCALL(eventfd); | ||
| 373 | COND_SYSCALL(signalfd); | ||
| 374 | COND_SYSCALL_COMPAT(signalfd); | ||
| 375 | |||
| 376 | /* __ARCH_WANT_SYSCALL_OFF_T */ | ||
| 377 | COND_SYSCALL(fadvise64); | ||
| 378 | |||
| 379 | /* __ARCH_WANT_SYSCALL_DEPRECATED */ | ||
| 380 | COND_SYSCALL(epoll_wait); | ||
| 381 | COND_SYSCALL(recv); | ||
| 382 | COND_SYSCALL_COMPAT(recv); | ||
| 383 | COND_SYSCALL(send); | ||
| 384 | COND_SYSCALL(bdflush); | ||
| 385 | COND_SYSCALL(uselib); | ||
| 386 | |||
| 387 | |||
| 388 | /* | ||
| 389 | * The syscalls below are not found in include/uapi/asm-generic/unistd.h | ||
| 390 | */ | ||
| 391 | |||
| 392 | /* obsolete: SGETMASK_SYSCALL */ | ||
| 393 | COND_SYSCALL(sgetmask); | ||
| 394 | COND_SYSCALL(ssetmask); | ||
| 395 | |||
| 396 | /* obsolete: SYSFS_SYSCALL */ | ||
| 397 | COND_SYSCALL(sysfs); | ||
| 398 | |||
| 399 | /* obsolete: __ARCH_WANT_SYS_IPC */ | ||
| 400 | COND_SYSCALL(ipc); | ||
| 401 | COND_SYSCALL_COMPAT(ipc); | ||
| 402 | |||
| 403 | /* obsolete: UID16 */ | ||
| 404 | COND_SYSCALL(chown16); | ||
| 405 | COND_SYSCALL(fchown16); | ||
| 406 | COND_SYSCALL(getegid16); | ||
| 407 | COND_SYSCALL(geteuid16); | ||
| 408 | COND_SYSCALL(getgid16); | ||
| 409 | COND_SYSCALL(getgroups16); | ||
| 410 | COND_SYSCALL(getresgid16); | ||
| 411 | COND_SYSCALL(getresuid16); | ||
| 412 | COND_SYSCALL(getuid16); | ||
| 413 | COND_SYSCALL(lchown16); | ||
| 414 | COND_SYSCALL(setfsgid16); | ||
| 415 | COND_SYSCALL(setfsuid16); | ||
| 416 | COND_SYSCALL(setgid16); | ||
| 417 | COND_SYSCALL(setgroups16); | ||
| 418 | COND_SYSCALL(setregid16); | ||
| 419 | COND_SYSCALL(setresgid16); | ||
| 420 | COND_SYSCALL(setresuid16); | ||
| 421 | COND_SYSCALL(setreuid16); | ||
| 422 | COND_SYSCALL(setuid16); | ||
diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig index f6b5f19223d6..78eabc41eaa6 100644 --- a/kernel/time/Kconfig +++ b/kernel/time/Kconfig | |||
| @@ -113,16 +113,6 @@ config NO_HZ_FULL | |||
| 113 | 113 | ||
| 114 | endchoice | 114 | endchoice |
| 115 | 115 | ||
| 116 | config NO_HZ_FULL_ALL | ||
| 117 | bool "Full dynticks system on all CPUs by default (except CPU 0)" | ||
| 118 | depends on NO_HZ_FULL | ||
| 119 | help | ||
| 120 | If the user doesn't pass the nohz_full boot option to | ||
| 121 | define the range of full dynticks CPUs, consider that all | ||
| 122 | CPUs in the system are full dynticks by default. | ||
| 123 | Note the boot CPU will still be kept outside the range to | ||
| 124 | handle the timekeeping duty. | ||
| 125 | |||
| 126 | config NO_HZ | 116 | config NO_HZ |
| 127 | bool "Old Idle dynticks config" | 117 | bool "Old Idle dynticks config" |
| 128 | depends on !ARCH_USES_GETTIMEOFFSET && GENERIC_CLOCKEVENTS | 118 | depends on !ARCH_USES_GETTIMEOFFSET && GENERIC_CLOCKEVENTS |
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 29a5733eff83..5d4a0342f934 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c | |||
| @@ -405,30 +405,12 @@ static int tick_nohz_cpu_down(unsigned int cpu) | |||
| 405 | return 0; | 405 | return 0; |
| 406 | } | 406 | } |
| 407 | 407 | ||
| 408 | static int tick_nohz_init_all(void) | ||
| 409 | { | ||
| 410 | int err = -1; | ||
| 411 | |||
| 412 | #ifdef CONFIG_NO_HZ_FULL_ALL | ||
| 413 | if (!alloc_cpumask_var(&tick_nohz_full_mask, GFP_KERNEL)) { | ||
| 414 | WARN(1, "NO_HZ: Can't allocate full dynticks cpumask\n"); | ||
| 415 | return err; | ||
| 416 | } | ||
| 417 | err = 0; | ||
| 418 | cpumask_setall(tick_nohz_full_mask); | ||
| 419 | tick_nohz_full_running = true; | ||
| 420 | #endif | ||
| 421 | return err; | ||
| 422 | } | ||
| 423 | |||
| 424 | void __init tick_nohz_init(void) | 408 | void __init tick_nohz_init(void) |
| 425 | { | 409 | { |
| 426 | int cpu, ret; | 410 | int cpu, ret; |
| 427 | 411 | ||
| 428 | if (!tick_nohz_full_running) { | 412 | if (!tick_nohz_full_running) |
| 429 | if (tick_nohz_init_all() < 0) | 413 | return; |
| 430 | return; | ||
| 431 | } | ||
| 432 | 414 | ||
| 433 | /* | 415 | /* |
| 434 | * Full dynticks uses irq work to drive the tick rescheduling on safe | 416 | * Full dynticks uses irq work to drive the tick rescheduling on safe |
| @@ -481,11 +463,18 @@ static int __init setup_tick_nohz(char *str) | |||
| 481 | 463 | ||
| 482 | __setup("nohz=", setup_tick_nohz); | 464 | __setup("nohz=", setup_tick_nohz); |
| 483 | 465 | ||
| 484 | int tick_nohz_tick_stopped(void) | 466 | bool tick_nohz_tick_stopped(void) |
| 485 | { | 467 | { |
| 486 | return __this_cpu_read(tick_cpu_sched.tick_stopped); | 468 | return __this_cpu_read(tick_cpu_sched.tick_stopped); |
| 487 | } | 469 | } |
| 488 | 470 | ||
| 471 | bool tick_nohz_tick_stopped_cpu(int cpu) | ||
| 472 | { | ||
| 473 | struct tick_sched *ts = per_cpu_ptr(&tick_cpu_sched, cpu); | ||
| 474 | |||
| 475 | return ts->tick_stopped; | ||
| 476 | } | ||
| 477 | |||
| 489 | /** | 478 | /** |
| 490 | * tick_nohz_update_jiffies - update jiffies when idle was interrupted | 479 | * tick_nohz_update_jiffies - update jiffies when idle was interrupted |
| 491 | * | 480 | * |
| @@ -741,12 +730,6 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts, | |||
| 741 | delta = KTIME_MAX; | 730 | delta = KTIME_MAX; |
| 742 | } | 731 | } |
| 743 | 732 | ||
| 744 | #ifdef CONFIG_NO_HZ_FULL | ||
| 745 | /* Limit the tick delta to the maximum scheduler deferment */ | ||
| 746 | if (!ts->inidle) | ||
| 747 | delta = min(delta, scheduler_tick_max_deferment()); | ||
| 748 | #endif | ||
| 749 | |||
| 750 | /* Calculate the next expiry time */ | 733 | /* Calculate the next expiry time */ |
| 751 | if (delta < (KTIME_MAX - basemono)) | 734 | if (delta < (KTIME_MAX - basemono)) |
| 752 | expires = basemono + delta; | 735 | expires = basemono + delta; |
| @@ -953,13 +936,6 @@ void tick_nohz_idle_enter(void) | |||
| 953 | struct tick_sched *ts; | 936 | struct tick_sched *ts; |
| 954 | 937 | ||
| 955 | lockdep_assert_irqs_enabled(); | 938 | lockdep_assert_irqs_enabled(); |
| 956 | /* | ||
| 957 | * Update the idle state in the scheduler domain hierarchy | ||
| 958 | * when tick_nohz_stop_sched_tick() is called from the idle loop. | ||
| 959 | * State will be updated to busy during the first busy tick after | ||
| 960 | * exiting idle. | ||
| 961 | */ | ||
| 962 | set_cpu_sd_state_idle(); | ||
| 963 | 939 | ||
| 964 | local_irq_disable(); | 940 | local_irq_disable(); |
| 965 | 941 | ||
diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c index 55d6dff37daf..2c416509b834 100644 --- a/kernel/trace/trace_event_perf.c +++ b/kernel/trace/trace_event_perf.c | |||
| @@ -8,6 +8,7 @@ | |||
| 8 | #include <linux/module.h> | 8 | #include <linux/module.h> |
| 9 | #include <linux/kprobes.h> | 9 | #include <linux/kprobes.h> |
| 10 | #include "trace.h" | 10 | #include "trace.h" |
| 11 | #include "trace_probe.h" | ||
| 11 | 12 | ||
| 12 | static char __percpu *perf_trace_buf[PERF_NR_CONTEXTS]; | 13 | static char __percpu *perf_trace_buf[PERF_NR_CONTEXTS]; |
| 13 | 14 | ||
| @@ -237,6 +238,107 @@ void perf_trace_destroy(struct perf_event *p_event) | |||
| 237 | mutex_unlock(&event_mutex); | 238 | mutex_unlock(&event_mutex); |
| 238 | } | 239 | } |
| 239 | 240 | ||
| 241 | #ifdef CONFIG_KPROBE_EVENTS | ||
| 242 | int perf_kprobe_init(struct perf_event *p_event, bool is_retprobe) | ||
| 243 | { | ||
| 244 | int ret; | ||
| 245 | char *func = NULL; | ||
| 246 | struct trace_event_call *tp_event; | ||
| 247 | |||
| 248 | if (p_event->attr.kprobe_func) { | ||
| 249 | func = kzalloc(KSYM_NAME_LEN, GFP_KERNEL); | ||
| 250 | if (!func) | ||
| 251 | return -ENOMEM; | ||
| 252 | ret = strncpy_from_user( | ||
| 253 | func, u64_to_user_ptr(p_event->attr.kprobe_func), | ||
| 254 | KSYM_NAME_LEN); | ||
| 255 | if (ret < 0) | ||
| 256 | goto out; | ||
| 257 | |||
| 258 | if (func[0] == '\0') { | ||
| 259 | kfree(func); | ||
| 260 | func = NULL; | ||
| 261 | } | ||
| 262 | } | ||
| 263 | |||
| 264 | tp_event = create_local_trace_kprobe( | ||
| 265 | func, (void *)(unsigned long)(p_event->attr.kprobe_addr), | ||
| 266 | p_event->attr.probe_offset, is_retprobe); | ||
| 267 | if (IS_ERR(tp_event)) { | ||
| 268 | ret = PTR_ERR(tp_event); | ||
| 269 | goto out; | ||
| 270 | } | ||
| 271 | |||
| 272 | ret = perf_trace_event_init(tp_event, p_event); | ||
| 273 | if (ret) | ||
| 274 | destroy_local_trace_kprobe(tp_event); | ||
| 275 | out: | ||
| 276 | kfree(func); | ||
| 277 | return ret; | ||
| 278 | } | ||
| 279 | |||
| 280 | void perf_kprobe_destroy(struct perf_event *p_event) | ||
| 281 | { | ||
| 282 | perf_trace_event_close(p_event); | ||
| 283 | perf_trace_event_unreg(p_event); | ||
| 284 | |||
| 285 | destroy_local_trace_kprobe(p_event->tp_event); | ||
| 286 | } | ||
| 287 | #endif /* CONFIG_KPROBE_EVENTS */ | ||
| 288 | |||
| 289 | #ifdef CONFIG_UPROBE_EVENTS | ||
| 290 | int perf_uprobe_init(struct perf_event *p_event, bool is_retprobe) | ||
| 291 | { | ||
| 292 | int ret; | ||
| 293 | char *path = NULL; | ||
| 294 | struct trace_event_call *tp_event; | ||
| 295 | |||
| 296 | if (!p_event->attr.uprobe_path) | ||
| 297 | return -EINVAL; | ||
| 298 | path = kzalloc(PATH_MAX, GFP_KERNEL); | ||
| 299 | if (!path) | ||
| 300 | return -ENOMEM; | ||
| 301 | ret = strncpy_from_user( | ||
| 302 | path, u64_to_user_ptr(p_event->attr.uprobe_path), PATH_MAX); | ||
| 303 | if (ret < 0) | ||
| 304 | goto out; | ||
| 305 | if (path[0] == '\0') { | ||
| 306 | ret = -EINVAL; | ||
| 307 | goto out; | ||
| 308 | } | ||
| 309 | |||
| 310 | tp_event = create_local_trace_uprobe( | ||
| 311 | path, p_event->attr.probe_offset, is_retprobe); | ||
| 312 | if (IS_ERR(tp_event)) { | ||
| 313 | ret = PTR_ERR(tp_event); | ||
| 314 | goto out; | ||
| 315 | } | ||
| 316 | |||
| 317 | /* | ||
| 318 | * local trace_uprobes need to hold event_mutex to call | ||
| 319 | * uprobe_buffer_enable() and uprobe_buffer_disable(). | ||
| 320 | * event_mutex is not required for local trace_kprobes. | ||
| 321 | */ | ||
| 322 | mutex_lock(&event_mutex); | ||
| 323 | ret = perf_trace_event_init(tp_event, p_event); | ||
| 324 | if (ret) | ||
| 325 | destroy_local_trace_uprobe(tp_event); | ||
| 326 | mutex_unlock(&event_mutex); | ||
| 327 | out: | ||
| 328 | kfree(path); | ||
| 329 | return ret; | ||
| 330 | } | ||
| 331 | |||
| 332 | void perf_uprobe_destroy(struct perf_event *p_event) | ||
| 333 | { | ||
| 334 | mutex_lock(&event_mutex); | ||
| 335 | perf_trace_event_close(p_event); | ||
| 336 | perf_trace_event_unreg(p_event); | ||
| 337 | mutex_unlock(&event_mutex); | ||
| 338 | destroy_local_trace_uprobe(p_event->tp_event); | ||
| 339 | } | ||
| 340 | #endif /* CONFIG_UPROBE_EVENTS */ | ||
| 341 | |||
| 240 | int perf_trace_add(struct perf_event *p_event, int flags) | 342 | int perf_trace_add(struct perf_event *p_event, int flags) |
| 241 | { | 343 | { |
| 242 | struct trace_event_call *tp_event = p_event->tp_event; | 344 | struct trace_event_call *tp_event = p_event->tp_event; |
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index ae4147eaebd4..1cd3fb4d70f8 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c | |||
| @@ -462,6 +462,14 @@ disable_trace_kprobe(struct trace_kprobe *tk, struct trace_event_file *file) | |||
| 462 | disable_kprobe(&tk->rp.kp); | 462 | disable_kprobe(&tk->rp.kp); |
| 463 | wait = 1; | 463 | wait = 1; |
| 464 | } | 464 | } |
| 465 | |||
| 466 | /* | ||
| 467 | * If tk is not added to any list, it must be a local trace_kprobe | ||
| 468 | * created with perf_event_open. We don't need to wait for these | ||
| 469 | * trace_kprobes. | ||
| 470 | */ | ||
| 471 | if (list_empty(&tk->list)) | ||
| 472 | wait = 0; | ||
| 465 | out: | 473 | out: |
| 466 | if (wait) { | 474 | if (wait) { |
| 467 | /* | 475 | /* |
| @@ -1358,12 +1366,9 @@ static struct trace_event_functions kprobe_funcs = { | |||
| 1358 | .trace = print_kprobe_event | 1366 | .trace = print_kprobe_event |
| 1359 | }; | 1367 | }; |
| 1360 | 1368 | ||
| 1361 | static int register_kprobe_event(struct trace_kprobe *tk) | 1369 | static inline void init_trace_event_call(struct trace_kprobe *tk, |
| 1370 | struct trace_event_call *call) | ||
| 1362 | { | 1371 | { |
| 1363 | struct trace_event_call *call = &tk->tp.call; | ||
| 1364 | int ret; | ||
| 1365 | |||
| 1366 | /* Initialize trace_event_call */ | ||
| 1367 | INIT_LIST_HEAD(&call->class->fields); | 1372 | INIT_LIST_HEAD(&call->class->fields); |
| 1368 | if (trace_kprobe_is_return(tk)) { | 1373 | if (trace_kprobe_is_return(tk)) { |
| 1369 | call->event.funcs = &kretprobe_funcs; | 1374 | call->event.funcs = &kretprobe_funcs; |
| @@ -1372,6 +1377,19 @@ static int register_kprobe_event(struct trace_kprobe *tk) | |||
| 1372 | call->event.funcs = &kprobe_funcs; | 1377 | call->event.funcs = &kprobe_funcs; |
| 1373 | call->class->define_fields = kprobe_event_define_fields; | 1378 | call->class->define_fields = kprobe_event_define_fields; |
| 1374 | } | 1379 | } |
| 1380 | |||
| 1381 | call->flags = TRACE_EVENT_FL_KPROBE; | ||
| 1382 | call->class->reg = kprobe_register; | ||
| 1383 | call->data = tk; | ||
| 1384 | } | ||
| 1385 | |||
| 1386 | static int register_kprobe_event(struct trace_kprobe *tk) | ||
| 1387 | { | ||
| 1388 | struct trace_event_call *call = &tk->tp.call; | ||
| 1389 | int ret = 0; | ||
| 1390 | |||
| 1391 | init_trace_event_call(tk, call); | ||
| 1392 | |||
| 1375 | if (set_print_fmt(&tk->tp, trace_kprobe_is_return(tk)) < 0) | 1393 | if (set_print_fmt(&tk->tp, trace_kprobe_is_return(tk)) < 0) |
| 1376 | return -ENOMEM; | 1394 | return -ENOMEM; |
| 1377 | ret = register_trace_event(&call->event); | 1395 | ret = register_trace_event(&call->event); |
| @@ -1379,9 +1397,6 @@ static int register_kprobe_event(struct trace_kprobe *tk) | |||
| 1379 | kfree(call->print_fmt); | 1397 | kfree(call->print_fmt); |
| 1380 | return -ENODEV; | 1398 | return -ENODEV; |
| 1381 | } | 1399 | } |
| 1382 | call->flags = TRACE_EVENT_FL_KPROBE; | ||
| 1383 | call->class->reg = kprobe_register; | ||
| 1384 | call->data = tk; | ||
| 1385 | ret = trace_add_event_call(call); | 1400 | ret = trace_add_event_call(call); |
| 1386 | if (ret) { | 1401 | if (ret) { |
| 1387 | pr_info("Failed to register kprobe event: %s\n", | 1402 | pr_info("Failed to register kprobe event: %s\n", |
| @@ -1403,6 +1418,66 @@ static int unregister_kprobe_event(struct trace_kprobe *tk) | |||
| 1403 | return ret; | 1418 | return ret; |
| 1404 | } | 1419 | } |
| 1405 | 1420 | ||
| 1421 | #ifdef CONFIG_PERF_EVENTS | ||
| 1422 | /* create a trace_kprobe, but don't add it to global lists */ | ||
| 1423 | struct trace_event_call * | ||
| 1424 | create_local_trace_kprobe(char *func, void *addr, unsigned long offs, | ||
| 1425 | bool is_return) | ||
| 1426 | { | ||
| 1427 | struct trace_kprobe *tk; | ||
| 1428 | int ret; | ||
| 1429 | char *event; | ||
| 1430 | |||
| 1431 | /* | ||
| 1432 | * local trace_kprobes are not added to probe_list, so they are never | ||
| 1433 | * searched in find_trace_kprobe(). Therefore, there is no concern about | ||
| 1434 | * duplicate names here. | ||
| 1435 | */ | ||
| 1436 | event = func ? func : "DUMMY_EVENT"; | ||
| 1437 | |||
| 1438 | tk = alloc_trace_kprobe(KPROBE_EVENT_SYSTEM, event, (void *)addr, func, | ||
| 1439 | offs, 0 /* maxactive */, 0 /* nargs */, | ||
| 1440 | is_return); | ||
| 1441 | |||
| 1442 | if (IS_ERR(tk)) { | ||
| 1443 | pr_info("Failed to allocate trace_probe.(%d)\n", | ||
| 1444 | (int)PTR_ERR(tk)); | ||
| 1445 | return ERR_CAST(tk); | ||
| 1446 | } | ||
| 1447 | |||
| 1448 | init_trace_event_call(tk, &tk->tp.call); | ||
| 1449 | |||
| 1450 | if (set_print_fmt(&tk->tp, trace_kprobe_is_return(tk)) < 0) { | ||
| 1451 | ret = -ENOMEM; | ||
| 1452 | goto error; | ||
| 1453 | } | ||
| 1454 | |||
| 1455 | ret = __register_trace_kprobe(tk); | ||
| 1456 | if (ret < 0) | ||
| 1457 | goto error; | ||
| 1458 | |||
| 1459 | return &tk->tp.call; | ||
| 1460 | error: | ||
| 1461 | free_trace_kprobe(tk); | ||
| 1462 | return ERR_PTR(ret); | ||
| 1463 | } | ||
| 1464 | |||
| 1465 | void destroy_local_trace_kprobe(struct trace_event_call *event_call) | ||
| 1466 | { | ||
| 1467 | struct trace_kprobe *tk; | ||
| 1468 | |||
| 1469 | tk = container_of(event_call, struct trace_kprobe, tp.call); | ||
| 1470 | |||
| 1471 | if (trace_probe_is_enabled(&tk->tp)) { | ||
| 1472 | WARN_ON(1); | ||
| 1473 | return; | ||
| 1474 | } | ||
| 1475 | |||
| 1476 | __unregister_trace_kprobe(tk); | ||
| 1477 | free_trace_kprobe(tk); | ||
| 1478 | } | ||
| 1479 | #endif /* CONFIG_PERF_EVENTS */ | ||
| 1480 | |||
| 1406 | /* Make a tracefs interface for controlling probe points */ | 1481 | /* Make a tracefs interface for controlling probe points */ |
| 1407 | static __init int init_kprobe_trace(void) | 1482 | static __init int init_kprobe_trace(void) |
| 1408 | { | 1483 | { |
diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h index 6a4d3fa94042..75daff22ccea 100644 --- a/kernel/trace/trace_probe.h +++ b/kernel/trace/trace_probe.h | |||
| @@ -416,3 +416,14 @@ store_trace_args(int ent_size, struct trace_probe *tp, struct pt_regs *regs, | |||
| 416 | } | 416 | } |
| 417 | 417 | ||
| 418 | extern int set_print_fmt(struct trace_probe *tp, bool is_return); | 418 | extern int set_print_fmt(struct trace_probe *tp, bool is_return); |
| 419 | |||
| 420 | #ifdef CONFIG_PERF_EVENTS | ||
| 421 | extern struct trace_event_call * | ||
| 422 | create_local_trace_kprobe(char *func, void *addr, unsigned long offs, | ||
| 423 | bool is_return); | ||
| 424 | extern void destroy_local_trace_kprobe(struct trace_event_call *event_call); | ||
| 425 | |||
| 426 | extern struct trace_event_call * | ||
| 427 | create_local_trace_uprobe(char *name, unsigned long offs, bool is_return); | ||
| 428 | extern void destroy_local_trace_uprobe(struct trace_event_call *event_call); | ||
| 429 | #endif | ||
diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index 268029ae1be6..2014f4351ae0 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c | |||
| @@ -1292,16 +1292,25 @@ static struct trace_event_functions uprobe_funcs = { | |||
| 1292 | .trace = print_uprobe_event | 1292 | .trace = print_uprobe_event |
| 1293 | }; | 1293 | }; |
| 1294 | 1294 | ||
| 1295 | static int register_uprobe_event(struct trace_uprobe *tu) | 1295 | static inline void init_trace_event_call(struct trace_uprobe *tu, |
| 1296 | struct trace_event_call *call) | ||
| 1296 | { | 1297 | { |
| 1297 | struct trace_event_call *call = &tu->tp.call; | ||
| 1298 | int ret; | ||
| 1299 | |||
| 1300 | /* Initialize trace_event_call */ | ||
| 1301 | INIT_LIST_HEAD(&call->class->fields); | 1298 | INIT_LIST_HEAD(&call->class->fields); |
| 1302 | call->event.funcs = &uprobe_funcs; | 1299 | call->event.funcs = &uprobe_funcs; |
| 1303 | call->class->define_fields = uprobe_event_define_fields; | 1300 | call->class->define_fields = uprobe_event_define_fields; |
| 1304 | 1301 | ||
| 1302 | call->flags = TRACE_EVENT_FL_UPROBE; | ||
| 1303 | call->class->reg = trace_uprobe_register; | ||
| 1304 | call->data = tu; | ||
| 1305 | } | ||
| 1306 | |||
| 1307 | static int register_uprobe_event(struct trace_uprobe *tu) | ||
| 1308 | { | ||
| 1309 | struct trace_event_call *call = &tu->tp.call; | ||
| 1310 | int ret = 0; | ||
| 1311 | |||
| 1312 | init_trace_event_call(tu, call); | ||
| 1313 | |||
| 1305 | if (set_print_fmt(&tu->tp, is_ret_probe(tu)) < 0) | 1314 | if (set_print_fmt(&tu->tp, is_ret_probe(tu)) < 0) |
| 1306 | return -ENOMEM; | 1315 | return -ENOMEM; |
| 1307 | 1316 | ||
| @@ -1311,9 +1320,6 @@ static int register_uprobe_event(struct trace_uprobe *tu) | |||
| 1311 | return -ENODEV; | 1320 | return -ENODEV; |
| 1312 | } | 1321 | } |
| 1313 | 1322 | ||
| 1314 | call->flags = TRACE_EVENT_FL_UPROBE; | ||
| 1315 | call->class->reg = trace_uprobe_register; | ||
| 1316 | call->data = tu; | ||
| 1317 | ret = trace_add_event_call(call); | 1323 | ret = trace_add_event_call(call); |
| 1318 | 1324 | ||
| 1319 | if (ret) { | 1325 | if (ret) { |
| @@ -1339,6 +1345,70 @@ static int unregister_uprobe_event(struct trace_uprobe *tu) | |||
| 1339 | return 0; | 1345 | return 0; |
| 1340 | } | 1346 | } |
| 1341 | 1347 | ||
| 1348 | #ifdef CONFIG_PERF_EVENTS | ||
| 1349 | struct trace_event_call * | ||
| 1350 | create_local_trace_uprobe(char *name, unsigned long offs, bool is_return) | ||
| 1351 | { | ||
| 1352 | struct trace_uprobe *tu; | ||
| 1353 | struct inode *inode; | ||
| 1354 | struct path path; | ||
| 1355 | int ret; | ||
| 1356 | |||
| 1357 | ret = kern_path(name, LOOKUP_FOLLOW, &path); | ||
| 1358 | if (ret) | ||
| 1359 | return ERR_PTR(ret); | ||
| 1360 | |||
| 1361 | inode = igrab(d_inode(path.dentry)); | ||
| 1362 | path_put(&path); | ||
| 1363 | |||
| 1364 | if (!inode || !S_ISREG(inode->i_mode)) { | ||
| 1365 | iput(inode); | ||
| 1366 | return ERR_PTR(-EINVAL); | ||
| 1367 | } | ||
| 1368 | |||
| 1369 | /* | ||
| 1370 | * local trace_uprobes are not added to the global uprobe list, so they | ||
| 1371 | * are never found by an event-name lookup. Therefore, there is no | ||
| 1372 | * concern about the duplicated name "DUMMY_EVENT" here. | ||
| 1373 | */ | ||
| 1374 | tu = alloc_trace_uprobe(UPROBE_EVENT_SYSTEM, "DUMMY_EVENT", 0, | ||
| 1375 | is_return); | ||
| 1376 | |||
| 1377 | if (IS_ERR(tu)) { | ||
| 1378 | pr_info("Failed to allocate trace_uprobe.(%d)\n", | ||
| 1379 | (int)PTR_ERR(tu)); | ||
| 1380 | return ERR_CAST(tu); | ||
| 1381 | } | ||
| 1382 | |||
| 1383 | tu->offset = offs; | ||
| 1384 | tu->inode = inode; | ||
| 1385 | tu->filename = kstrdup(name, GFP_KERNEL); | ||
| 1386 | init_trace_event_call(tu, &tu->tp.call); | ||
| 1387 | |||
| 1388 | if (set_print_fmt(&tu->tp, is_ret_probe(tu)) < 0) { | ||
| 1389 | ret = -ENOMEM; | ||
| 1390 | goto error; | ||
| 1391 | } | ||
| 1392 | |||
| 1393 | return &tu->tp.call; | ||
| 1394 | error: | ||
| 1395 | free_trace_uprobe(tu); | ||
| 1396 | return ERR_PTR(ret); | ||
| 1397 | } | ||
| 1398 | |||
| 1399 | void destroy_local_trace_uprobe(struct trace_event_call *event_call) | ||
| 1400 | { | ||
| 1401 | struct trace_uprobe *tu; | ||
| 1402 | |||
| 1403 | tu = container_of(event_call, struct trace_uprobe, tp.call); | ||
| 1404 | |||
| 1405 | kfree(tu->tp.call.print_fmt); | ||
| 1406 | tu->tp.call.print_fmt = NULL; | ||
| 1407 | |||
| 1408 | free_trace_uprobe(tu); | ||
| 1409 | } | ||
| 1410 | #endif /* CONFIG_PERF_EVENTS */ | ||
| 1411 | |||
| 1342 | /* Make a trace interface for controlling probe points */ | 1412 | /* Make a trace interface for controlling probe points */ |
| 1343 | static __init int init_uprobe_trace(void) | 1413 | static __init int init_uprobe_trace(void) |
| 1344 | { | 1414 | { |
diff --git a/kernel/uid16.c b/kernel/uid16.c index ef1da2a5f9bd..af6925d8599b 100644 --- a/kernel/uid16.c +++ b/kernel/uid16.c | |||
| @@ -18,44 +18,46 @@ | |||
| 18 | 18 | ||
| 19 | #include <linux/uaccess.h> | 19 | #include <linux/uaccess.h> |
| 20 | 20 | ||
| 21 | #include "uid16.h" | ||
| 22 | |||
| 21 | SYSCALL_DEFINE3(chown16, const char __user *, filename, old_uid_t, user, old_gid_t, group) | 23 | SYSCALL_DEFINE3(chown16, const char __user *, filename, old_uid_t, user, old_gid_t, group) |
| 22 | { | 24 | { |
| 23 | return sys_chown(filename, low2highuid(user), low2highgid(group)); | 25 | return ksys_chown(filename, low2highuid(user), low2highgid(group)); |
| 24 | } | 26 | } |
| 25 | 27 | ||
| 26 | SYSCALL_DEFINE3(lchown16, const char __user *, filename, old_uid_t, user, old_gid_t, group) | 28 | SYSCALL_DEFINE3(lchown16, const char __user *, filename, old_uid_t, user, old_gid_t, group) |
| 27 | { | 29 | { |
| 28 | return sys_lchown(filename, low2highuid(user), low2highgid(group)); | 30 | return ksys_lchown(filename, low2highuid(user), low2highgid(group)); |
| 29 | } | 31 | } |
| 30 | 32 | ||
| 31 | SYSCALL_DEFINE3(fchown16, unsigned int, fd, old_uid_t, user, old_gid_t, group) | 33 | SYSCALL_DEFINE3(fchown16, unsigned int, fd, old_uid_t, user, old_gid_t, group) |
| 32 | { | 34 | { |
| 33 | return sys_fchown(fd, low2highuid(user), low2highgid(group)); | 35 | return ksys_fchown(fd, low2highuid(user), low2highgid(group)); |
| 34 | } | 36 | } |
| 35 | 37 | ||
| 36 | SYSCALL_DEFINE2(setregid16, old_gid_t, rgid, old_gid_t, egid) | 38 | SYSCALL_DEFINE2(setregid16, old_gid_t, rgid, old_gid_t, egid) |
| 37 | { | 39 | { |
| 38 | return sys_setregid(low2highgid(rgid), low2highgid(egid)); | 40 | return __sys_setregid(low2highgid(rgid), low2highgid(egid)); |
| 39 | } | 41 | } |
| 40 | 42 | ||
| 41 | SYSCALL_DEFINE1(setgid16, old_gid_t, gid) | 43 | SYSCALL_DEFINE1(setgid16, old_gid_t, gid) |
| 42 | { | 44 | { |
| 43 | return sys_setgid(low2highgid(gid)); | 45 | return __sys_setgid(low2highgid(gid)); |
| 44 | } | 46 | } |
| 45 | 47 | ||
| 46 | SYSCALL_DEFINE2(setreuid16, old_uid_t, ruid, old_uid_t, euid) | 48 | SYSCALL_DEFINE2(setreuid16, old_uid_t, ruid, old_uid_t, euid) |
| 47 | { | 49 | { |
| 48 | return sys_setreuid(low2highuid(ruid), low2highuid(euid)); | 50 | return __sys_setreuid(low2highuid(ruid), low2highuid(euid)); |
| 49 | } | 51 | } |
| 50 | 52 | ||
| 51 | SYSCALL_DEFINE1(setuid16, old_uid_t, uid) | 53 | SYSCALL_DEFINE1(setuid16, old_uid_t, uid) |
| 52 | { | 54 | { |
| 53 | return sys_setuid(low2highuid(uid)); | 55 | return __sys_setuid(low2highuid(uid)); |
| 54 | } | 56 | } |
| 55 | 57 | ||
| 56 | SYSCALL_DEFINE3(setresuid16, old_uid_t, ruid, old_uid_t, euid, old_uid_t, suid) | 58 | SYSCALL_DEFINE3(setresuid16, old_uid_t, ruid, old_uid_t, euid, old_uid_t, suid) |
| 57 | { | 59 | { |
| 58 | return sys_setresuid(low2highuid(ruid), low2highuid(euid), | 60 | return __sys_setresuid(low2highuid(ruid), low2highuid(euid), |
| 59 | low2highuid(suid)); | 61 | low2highuid(suid)); |
| 60 | } | 62 | } |
| 61 | 63 | ||
| @@ -78,11 +80,10 @@ SYSCALL_DEFINE3(getresuid16, old_uid_t __user *, ruidp, old_uid_t __user *, euid | |||
| 78 | 80 | ||
| 79 | SYSCALL_DEFINE3(setresgid16, old_gid_t, rgid, old_gid_t, egid, old_gid_t, sgid) | 81 | SYSCALL_DEFINE3(setresgid16, old_gid_t, rgid, old_gid_t, egid, old_gid_t, sgid) |
| 80 | { | 82 | { |
| 81 | return sys_setresgid(low2highgid(rgid), low2highgid(egid), | 83 | return __sys_setresgid(low2highgid(rgid), low2highgid(egid), |
| 82 | low2highgid(sgid)); | 84 | low2highgid(sgid)); |
| 83 | } | 85 | } |
| 84 | 86 | ||
| 85 | |||
| 86 | SYSCALL_DEFINE3(getresgid16, old_gid_t __user *, rgidp, old_gid_t __user *, egidp, old_gid_t __user *, sgidp) | 87 | SYSCALL_DEFINE3(getresgid16, old_gid_t __user *, rgidp, old_gid_t __user *, egidp, old_gid_t __user *, sgidp) |
| 87 | { | 88 | { |
| 88 | const struct cred *cred = current_cred(); | 89 | const struct cred *cred = current_cred(); |
| @@ -102,12 +103,12 @@ SYSCALL_DEFINE3(getresgid16, old_gid_t __user *, rgidp, old_gid_t __user *, egid | |||
| 102 | 103 | ||
| 103 | SYSCALL_DEFINE1(setfsuid16, old_uid_t, uid) | 104 | SYSCALL_DEFINE1(setfsuid16, old_uid_t, uid) |
| 104 | { | 105 | { |
| 105 | return sys_setfsuid(low2highuid(uid)); | 106 | return __sys_setfsuid(low2highuid(uid)); |
| 106 | } | 107 | } |
| 107 | 108 | ||
| 108 | SYSCALL_DEFINE1(setfsgid16, old_gid_t, gid) | 109 | SYSCALL_DEFINE1(setfsgid16, old_gid_t, gid) |
| 109 | { | 110 | { |
| 110 | return sys_setfsgid(low2highgid(gid)); | 111 | return __sys_setfsgid(low2highgid(gid)); |
| 111 | } | 112 | } |
| 112 | 113 | ||
| 113 | static int groups16_to_user(old_gid_t __user *grouplist, | 114 | static int groups16_to_user(old_gid_t __user *grouplist, |
diff --git a/kernel/uid16.h b/kernel/uid16.h new file mode 100644 index 000000000000..cdca040f7602 --- /dev/null +++ b/kernel/uid16.h | |||
| @@ -0,0 +1,14 @@ | |||
| 1 | /* SPDX-License-Identifier: GPL-2.0 */ | ||
| 2 | #ifndef LINUX_UID16_H | ||
| 3 | #define LINUX_UID16_H | ||
| 4 | |||
| 5 | long __sys_setuid(uid_t uid); | ||
| 6 | long __sys_setgid(gid_t gid); | ||
| 7 | long __sys_setreuid(uid_t ruid, uid_t euid); | ||
| 8 | long __sys_setregid(gid_t rgid, gid_t egid); | ||
| 9 | long __sys_setresuid(uid_t ruid, uid_t euid, uid_t suid); | ||
| 10 | long __sys_setresgid(gid_t rgid, gid_t egid, gid_t sgid); | ||
| 11 | long __sys_setfsuid(uid_t uid); | ||
| 12 | long __sys_setfsgid(gid_t gid); | ||
| 13 | |||
| 14 | #endif /* LINUX_UID16_H */ | ||
diff --git a/kernel/umh.c b/kernel/umh.c index 18e5fa4b0e71..f76b3ff876cf 100644 --- a/kernel/umh.c +++ b/kernel/umh.c | |||
| @@ -118,7 +118,7 @@ static void call_usermodehelper_exec_sync(struct subprocess_info *sub_info) | |||
| 118 | { | 118 | { |
| 119 | pid_t pid; | 119 | pid_t pid; |
| 120 | 120 | ||
| 121 | /* If SIGCLD is ignored sys_wait4 won't populate the status. */ | 121 | /* If SIGCLD is ignored kernel_wait4 won't populate the status. */ |
| 122 | kernel_sigaction(SIGCHLD, SIG_DFL); | 122 | kernel_sigaction(SIGCHLD, SIG_DFL); |
| 123 | pid = kernel_thread(call_usermodehelper_exec_async, sub_info, SIGCHLD); | 123 | pid = kernel_thread(call_usermodehelper_exec_async, sub_info, SIGCHLD); |
| 124 | if (pid < 0) { | 124 | if (pid < 0) { |
| @@ -135,7 +135,7 @@ static void call_usermodehelper_exec_sync(struct subprocess_info *sub_info) | |||
| 135 | * | 135 | * |
| 136 | * Thus the __user pointer cast is valid here. | 136 | * Thus the __user pointer cast is valid here. |
| 137 | */ | 137 | */ |
| 138 | sys_wait4(pid, (int __user *)&ret, 0, NULL); | 138 | kernel_wait4(pid, (int __user *)&ret, 0, NULL); |
| 139 | 139 | ||
| 140 | /* | 140 | /* |
| 141 | * If ret is 0, either call_usermodehelper_exec_async failed and | 141 | * If ret is 0, either call_usermodehelper_exec_async failed and |
diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 6ec6ba65127b..254e636a3d6b 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c | |||
| @@ -5573,12 +5573,13 @@ static void __init wq_numa_init(void) | |||
| 5573 | int __init workqueue_init_early(void) | 5573 | int __init workqueue_init_early(void) |
| 5574 | { | 5574 | { |
| 5575 | int std_nice[NR_STD_WORKER_POOLS] = { 0, HIGHPRI_NICE_LEVEL }; | 5575 | int std_nice[NR_STD_WORKER_POOLS] = { 0, HIGHPRI_NICE_LEVEL }; |
| 5576 | int hk_flags = HK_FLAG_DOMAIN | HK_FLAG_WQ; | ||
| 5576 | int i, cpu; | 5577 | int i, cpu; |
| 5577 | 5578 | ||
| 5578 | WARN_ON(__alignof__(struct pool_workqueue) < __alignof__(long long)); | 5579 | WARN_ON(__alignof__(struct pool_workqueue) < __alignof__(long long)); |
| 5579 | 5580 | ||
| 5580 | BUG_ON(!alloc_cpumask_var(&wq_unbound_cpumask, GFP_KERNEL)); | 5581 | BUG_ON(!alloc_cpumask_var(&wq_unbound_cpumask, GFP_KERNEL)); |
| 5581 | cpumask_copy(wq_unbound_cpumask, housekeeping_cpumask(HK_FLAG_DOMAIN)); | 5582 | cpumask_copy(wq_unbound_cpumask, housekeeping_cpumask(hk_flags)); |
| 5582 | 5583 | ||
| 5583 | pwq_cache = KMEM_CACHE(pool_workqueue, SLAB_PANIC); | 5584 | pwq_cache = KMEM_CACHE(pool_workqueue, SLAB_PANIC); |
| 5584 | 5585 | ||
