Diffstat (limited to 'kernel')
-rw-r--r-- kernel/events/core.c          |  64
-rw-r--r-- kernel/events/hw_breakpoint.c |  11
-rw-r--r-- kernel/sched/core.c           |  73
-rw-r--r-- kernel/sched/fair.c           |  37
-rw-r--r-- kernel/sched/rt.c             |   1
-rw-r--r-- kernel/sched/sched.h          |   1
-rw-r--r-- kernel/time/tick-sched.c      |   1
-rw-r--r-- kernel/workqueue.c            | 110
8 files changed, 172 insertions, 126 deletions
diff --git a/kernel/events/core.c b/kernel/events/core.c
index b7935fcec7d9..7fee567153f0 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -1253,7 +1253,7 @@ retry:
 /*
  * Cross CPU call to disable a performance event
  */
-static int __perf_event_disable(void *info)
+int __perf_event_disable(void *info)
 {
 	struct perf_event *event = info;
 	struct perf_event_context *ctx = event->ctx;
@@ -2935,12 +2935,12 @@ EXPORT_SYMBOL_GPL(perf_event_release_kernel);
 /*
  * Called when the last reference to the file is gone.
  */
-static int perf_release(struct inode *inode, struct file *file)
+static void put_event(struct perf_event *event)
 {
-	struct perf_event *event = file->private_data;
 	struct task_struct *owner;
 
-	file->private_data = NULL;
+	if (!atomic_long_dec_and_test(&event->refcount))
+		return;
 
 	rcu_read_lock();
 	owner = ACCESS_ONCE(event->owner);
@@ -2975,7 +2975,13 @@ static int perf_release(struct inode *inode, struct file *file)
 		put_task_struct(owner);
 	}
 
-	return perf_event_release_kernel(event);
+	perf_event_release_kernel(event);
+}
+
+static int perf_release(struct inode *inode, struct file *file)
+{
+	put_event(file->private_data);
+	return 0;
 }
 
 u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running)
@@ -3227,7 +3233,7 @@ unlock:
 
 static const struct file_operations perf_fops;
 
-static struct perf_event *perf_fget_light(int fd, int *fput_needed)
+static struct file *perf_fget_light(int fd, int *fput_needed)
 {
 	struct file *file;
 
@@ -3241,7 +3247,7 @@ static struct perf_event *perf_fget_light(int fd, int *fput_needed)
 		return ERR_PTR(-EBADF);
 	}
 
-	return file->private_data;
+	return file;
 }
 
 static int perf_event_set_output(struct perf_event *event,
@@ -3273,19 +3279,21 @@ static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
 
 	case PERF_EVENT_IOC_SET_OUTPUT:
 	{
+		struct file *output_file = NULL;
 		struct perf_event *output_event = NULL;
 		int fput_needed = 0;
 		int ret;
 
 		if (arg != -1) {
-			output_event = perf_fget_light(arg, &fput_needed);
-			if (IS_ERR(output_event))
-				return PTR_ERR(output_event);
+			output_file = perf_fget_light(arg, &fput_needed);
+			if (IS_ERR(output_file))
+				return PTR_ERR(output_file);
+			output_event = output_file->private_data;
 		}
 
 		ret = perf_event_set_output(event, output_event);
 		if (output_event)
-			fput_light(output_event->filp, fput_needed);
+			fput_light(output_file, fput_needed);
 
 		return ret;
 	}
@@ -5950,6 +5958,7 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
 
 	mutex_init(&event->mmap_mutex);
 
+	atomic_long_set(&event->refcount, 1);
 	event->cpu = cpu;
 	event->attr = *attr;
 	event->group_leader = group_leader;
@@ -6260,12 +6269,12 @@ SYSCALL_DEFINE5(perf_event_open,
 		return event_fd;
 
 	if (group_fd != -1) {
-		group_leader = perf_fget_light(group_fd, &fput_needed);
-		if (IS_ERR(group_leader)) {
-			err = PTR_ERR(group_leader);
+		group_file = perf_fget_light(group_fd, &fput_needed);
+		if (IS_ERR(group_file)) {
+			err = PTR_ERR(group_file);
 			goto err_fd;
 		}
-		group_file = group_leader->filp;
+		group_leader = group_file->private_data;
 		if (flags & PERF_FLAG_FD_OUTPUT)
 			output_event = group_leader;
 		if (flags & PERF_FLAG_FD_NO_GROUP)
@@ -6402,7 +6411,6 @@ SYSCALL_DEFINE5(perf_event_open,
 		put_ctx(gctx);
 	}
 
-	event->filp = event_file;
 	WARN_ON_ONCE(ctx->parent_ctx);
 	mutex_lock(&ctx->mutex);
 
@@ -6496,7 +6504,6 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
 		goto err_free;
 	}
 
-	event->filp = NULL;
 	WARN_ON_ONCE(ctx->parent_ctx);
 	mutex_lock(&ctx->mutex);
 	perf_install_in_context(ctx, event, cpu);
@@ -6578,7 +6585,7 @@ static void sync_child_event(struct perf_event *child_event,
 	 * Release the parent event, if this was the last
 	 * reference to it.
 	 */
-	fput(parent_event->filp);
+	put_event(parent_event);
 }
 
 static void
@@ -6654,9 +6661,8 @@ static void perf_event_exit_task_context(struct task_struct *child, int ctxn)
 	 *
 	 *   __perf_event_exit_task()
 	 *     sync_child_event()
-	 *       fput(parent_event->filp)
-	 *         perf_release()
-	 *           mutex_lock(&ctx->mutex)
+	 *       put_event()
+	 *         mutex_lock(&ctx->mutex)
 	 *
 	 * But since its the parent context it won't be the same instance.
 	 */
@@ -6724,7 +6730,7 @@ static void perf_free_event(struct perf_event *event,
 	list_del_init(&event->child_list);
 	mutex_unlock(&parent->child_mutex);
 
-	fput(parent->filp);
+	put_event(parent);
 
 	perf_group_detach(event);
 	list_del_event(event, ctx);
@@ -6804,6 +6810,12 @@ inherit_event(struct perf_event *parent_event,
 					   NULL, NULL);
 	if (IS_ERR(child_event))
 		return child_event;
+
+	if (!atomic_long_inc_not_zero(&parent_event->refcount)) {
+		free_event(child_event);
+		return NULL;
+	}
+
 	get_ctx(child_ctx);
 
 	/*
@@ -6845,14 +6857,6 @@ inherit_event(struct perf_event *parent_event,
 	raw_spin_unlock_irqrestore(&child_ctx->lock, flags);
 
 	/*
-	 * Get a reference to the parent filp - we will fput it
-	 * when the child event exits. This is safe to do because
-	 * we are in the parent and we know that the filp still
-	 * exists and has a nonzero count:
-	 */
-	atomic_long_inc(&parent_event->filp->f_count);
-
-	/*
 	 * Link this into the parent event's child list
 	 */
 	WARN_ON_ONCE(parent_event->ctx->parent_ctx);
diff --git a/kernel/events/hw_breakpoint.c b/kernel/events/hw_breakpoint.c
index bb38c4d3ee12..9a7b487c6fe2 100644
--- a/kernel/events/hw_breakpoint.c
+++ b/kernel/events/hw_breakpoint.c
@@ -453,7 +453,16 @@ int modify_user_hw_breakpoint(struct perf_event *bp, struct perf_event_attr *att
 	int old_type = bp->attr.bp_type;
 	int err = 0;
 
-	perf_event_disable(bp);
+	/*
+	 * modify_user_hw_breakpoint can be invoked with IRQs disabled and hence it
+	 * will not be possible to raise IPIs that invoke __perf_event_disable.
+	 * So call the function directly after making sure we are targeting the
+	 * current task.
+	 */
+	if (irqs_disabled() && bp->ctx && bp->ctx->task == current)
+		__perf_event_disable(bp);
+	else
+		perf_event_disable(bp);
 
 	bp->attr.bp_addr = attr->bp_addr;
 	bp->attr.bp_type = attr->bp_type;
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index fbf1fd098dc6..649c9f876cb1 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -5304,27 +5304,17 @@ void idle_task_exit(void)
 }
 
 /*
- * While a dead CPU has no uninterruptible tasks queued at this point,
- * it might still have a nonzero ->nr_uninterruptible counter, because
- * for performance reasons the counter is not stricly tracking tasks to
- * their home CPUs. So we just add the counter to another CPU's counter,
- * to keep the global sum constant after CPU-down:
- */
-static void migrate_nr_uninterruptible(struct rq *rq_src)
-{
-	struct rq *rq_dest = cpu_rq(cpumask_any(cpu_active_mask));
-
-	rq_dest->nr_uninterruptible += rq_src->nr_uninterruptible;
-	rq_src->nr_uninterruptible = 0;
-}
-
-/*
- * remove the tasks which were accounted by rq from calc_load_tasks.
+ * Since this CPU is going 'away' for a while, fold any nr_active delta
+ * we might have. Assumes we're called after migrate_tasks() so that the
+ * nr_active count is stable.
+ *
+ * Also see the comment "Global load-average calculations".
  */
-static void calc_global_load_remove(struct rq *rq)
+static void calc_load_migrate(struct rq *rq)
 {
-	atomic_long_sub(rq->calc_load_active, &calc_load_tasks);
-	rq->calc_load_active = 0;
+	long delta = calc_load_fold_active(rq);
+	if (delta)
+		atomic_long_add(delta, &calc_load_tasks);
 }
 
 /*
@@ -5352,9 +5342,6 @@ static void migrate_tasks(unsigned int dead_cpu)
 	 */
 	rq->stop = NULL;
 
-	/* Ensure any throttled groups are reachable by pick_next_task */
-	unthrottle_offline_cfs_rqs(rq);
-
 	for ( ; ; ) {
 		/*
 		 * There's this thread running, bail when that's the only
@@ -5618,8 +5605,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
 		BUG_ON(rq->nr_running != 1); /* the migration thread */
 		raw_spin_unlock_irqrestore(&rq->lock, flags);
 
-		migrate_nr_uninterruptible(rq);
-		calc_global_load_remove(rq);
+		calc_load_migrate(rq);
 		break;
 #endif
 	}
@@ -6028,11 +6014,6 @@ static void destroy_sched_domains(struct sched_domain *sd, int cpu)
  * SD_SHARE_PKG_RESOURCE set (Last Level Cache Domain) for this
  * allows us to avoid some pointer chasing select_idle_sibling().
  *
- * Iterate domains and sched_groups downward, assigning CPUs to be
- * select_idle_sibling() hw buddy. Cross-wiring hw makes bouncing
- * due to random perturbation self canceling, ie sw buddies pull
- * their counterpart to their CPU's hw counterpart.
- *
  * Also keep a unique ID per domain (we use the first cpu number in
  * the cpumask of the domain), this allows us to quickly tell if
  * two cpus are in the same cache domain, see cpus_share_cache().
@@ -6046,40 +6027,8 @@ static void update_top_cache_domain(int cpu)
 	int id = cpu;
 
 	sd = highest_flag_domain(cpu, SD_SHARE_PKG_RESOURCES);
-	if (sd) {
-		struct sched_domain *tmp = sd;
-		struct sched_group *sg, *prev;
-		bool right;
-
-		/*
-		 * Traverse to first CPU in group, and count hops
-		 * to cpu from there, switching direction on each
-		 * hop, never ever pointing the last CPU rightward.
-		 */
-		do {
-			id = cpumask_first(sched_domain_span(tmp));
-			prev = sg = tmp->groups;
-			right = 1;
-
-			while (cpumask_first(sched_group_cpus(sg)) != id)
-				sg = sg->next;
-
-			while (!cpumask_test_cpu(cpu, sched_group_cpus(sg))) {
-				prev = sg;
-				sg = sg->next;
-				right = !right;
-			}
-
-			/* A CPU went down, never point back to domain start. */
-			if (right && cpumask_first(sched_group_cpus(sg->next)) == id)
-				right = false;
-
-			sg = right ? sg->next : prev;
-			tmp->idle_buddy = cpumask_first(sched_group_cpus(sg));
-		} while ((tmp = tmp->child));
-
+	if (sd)
 		id = cpumask_first(sched_domain_span(sd));
-	}
 
 	rcu_assign_pointer(per_cpu(sd_llc, cpu), sd);
 	per_cpu(sd_llc_id, cpu) = id;
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index c219bf8d704c..96e2b18b6283 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -2052,7 +2052,7 @@ static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
 	hrtimer_cancel(&cfs_b->slack_timer);
 }
 
-void unthrottle_offline_cfs_rqs(struct rq *rq)
+static void unthrottle_offline_cfs_rqs(struct rq *rq)
 {
 	struct cfs_rq *cfs_rq;
 
@@ -2106,7 +2106,7 @@ static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
 	return NULL;
 }
 static inline void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
-void unthrottle_offline_cfs_rqs(struct rq *rq) {}
+static inline void unthrottle_offline_cfs_rqs(struct rq *rq) {}
 
 #endif /* CONFIG_CFS_BANDWIDTH */
 
@@ -2637,6 +2637,8 @@ static int select_idle_sibling(struct task_struct *p, int target)
 	int cpu = smp_processor_id();
 	int prev_cpu = task_cpu(p);
 	struct sched_domain *sd;
+	struct sched_group *sg;
+	int i;
 
 	/*
 	 * If the task is going to be woken-up on this cpu and if it is
@@ -2653,17 +2655,29 @@ static int select_idle_sibling(struct task_struct *p, int target)
 		return prev_cpu;
 
 	/*
-	 * Otherwise, check assigned siblings to find an elegible idle cpu.
+	 * Otherwise, iterate the domains and find an elegible idle cpu.
 	 */
 	sd = rcu_dereference(per_cpu(sd_llc, target));
-
 	for_each_lower_domain(sd) {
-		if (!cpumask_test_cpu(sd->idle_buddy, tsk_cpus_allowed(p)))
-			continue;
-		if (idle_cpu(sd->idle_buddy))
-			return sd->idle_buddy;
-	}
+		sg = sd->groups;
+		do {
+			if (!cpumask_intersects(sched_group_cpus(sg),
+						tsk_cpus_allowed(p)))
+				goto next;
 
+			for_each_cpu(i, sched_group_cpus(sg)) {
+				if (!idle_cpu(i))
+					goto next;
+			}
+
+			target = cpumask_first_and(sched_group_cpus(sg),
+					tsk_cpus_allowed(p));
+			goto done;
+next:
+			sg = sg->next;
+		} while (sg != sd->groups);
+	}
+done:
 	return target;
 }
 
@@ -3658,7 +3672,6 @@ fix_small_capacity(struct sched_domain *sd, struct sched_group *group)
  * @group: sched_group whose statistics are to be updated.
  * @load_idx: Load index of sched_domain of this_cpu for load calc.
 * @local_group: Does group contain this_cpu.
- * @cpus: Set of cpus considered for load balancing.
 * @balance: Should we balance.
 * @sgs: variable to hold the statistics for this group.
 */
@@ -3805,7 +3818,6 @@ static bool update_sd_pick_busiest(struct lb_env *env,
 /**
 * update_sd_lb_stats - Update sched_domain's statistics for load balancing.
 * @env: The load balancing environment.
- * @cpus: Set of cpus considered for load balancing.
 * @balance: Should we balance.
 * @sds: variable to hold the statistics for this sched_domain.
 */
@@ -4956,6 +4968,9 @@ static void rq_online_fair(struct rq *rq)
 static void rq_offline_fair(struct rq *rq)
 {
 	update_sysctl();
+
+	/* Ensure any throttled groups are reachable by pick_next_task */
+	unthrottle_offline_cfs_rqs(rq);
 }
 
 #endif /* CONFIG_SMP */
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 944cb68420e9..e0b7ba9c040f 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -691,6 +691,7 @@ balanced:
 		 * runtime - in which case borrowing doesn't make sense.
 		 */
 		rt_rq->rt_runtime = RUNTIME_INF;
+		rt_rq->rt_throttled = 0;
 		raw_spin_unlock(&rt_rq->rt_runtime_lock);
 		raw_spin_unlock(&rt_b->rt_runtime_lock);
 	}
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index f6714d009e77..0848fa36c383 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1144,7 +1144,6 @@ extern void print_rt_stats(struct seq_file *m, int cpu);
 
 extern void init_cfs_rq(struct cfs_rq *cfs_rq);
 extern void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq);
-extern void unthrottle_offline_cfs_rqs(struct rq *rq);
 
 extern void account_cfs_bandwidth_used(int enabled, int was_enabled);
 
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 024540f97f74..3a9e5d5c1091 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -573,6 +573,7 @@ static void tick_nohz_restart_sched_tick(struct tick_sched *ts, ktime_t now)
 	tick_do_update_jiffies64(now);
 	update_cpu_load_nohz();
 
+	calc_load_exit_idle();
 	touch_softlockup_watchdog();
 	/*
 	 * Cancel the scheduled timer and restore the tick
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 692d97628a10..1e1373bcb3e3 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -66,6 +66,7 @@ enum {
 
 	/* pool flags */
 	POOL_MANAGE_WORKERS = 1 << 0,	/* need to manage workers */
+	POOL_MANAGING_WORKERS = 1 << 1,	/* managing workers */
 
 	/* worker flags */
 	WORKER_STARTED = 1 << 0,	/* started */
@@ -652,7 +653,7 @@ static bool need_to_manage_workers(struct worker_pool *pool)
 /* Do we have too many workers and should some go away? */
 static bool too_many_workers(struct worker_pool *pool)
 {
-	bool managing = mutex_is_locked(&pool->manager_mutex);
+	bool managing = pool->flags & POOL_MANAGING_WORKERS;
 	int nr_idle = pool->nr_idle + managing; /* manager is considered idle */
 	int nr_busy = pool->nr_workers - nr_idle;
 
@@ -1326,6 +1327,15 @@ static void idle_worker_rebind(struct worker *worker)
 
 	/* we did our part, wait for rebind_workers() to finish up */
 	wait_event(gcwq->rebind_hold, !(worker->flags & WORKER_REBIND));
+
+	/*
+	 * rebind_workers() shouldn't finish until all workers passed the
+	 * above WORKER_REBIND wait. Tell it when done.
+	 */
+	spin_lock_irq(&worker->pool->gcwq->lock);
+	if (!--worker->idle_rebind->cnt)
+		complete(&worker->idle_rebind->done);
+	spin_unlock_irq(&worker->pool->gcwq->lock);
 }
 
 /*
@@ -1396,12 +1406,15 @@ retry:
 	/* set REBIND and kick idle ones, we'll wait for these later */
 	for_each_worker_pool(pool, gcwq) {
 		list_for_each_entry(worker, &pool->idle_list, entry) {
+			unsigned long worker_flags = worker->flags;
+
 			if (worker->flags & WORKER_REBIND)
 				continue;
 
-			/* morph UNBOUND to REBIND */
-			worker->flags &= ~WORKER_UNBOUND;
-			worker->flags |= WORKER_REBIND;
+			/* morph UNBOUND to REBIND atomically */
+			worker_flags &= ~WORKER_UNBOUND;
+			worker_flags |= WORKER_REBIND;
+			ACCESS_ONCE(worker->flags) = worker_flags;
 
 			idle_rebind.cnt++;
 			worker->idle_rebind = &idle_rebind;
@@ -1419,25 +1432,15 @@ retry:
 		goto retry;
 	}
 
-	/*
-	 * All idle workers are rebound and waiting for %WORKER_REBIND to
-	 * be cleared inside idle_worker_rebind(). Clear and release.
-	 * Clearing %WORKER_REBIND from this foreign context is safe
-	 * because these workers are still guaranteed to be idle.
-	 */
-	for_each_worker_pool(pool, gcwq)
-		list_for_each_entry(worker, &pool->idle_list, entry)
-			worker->flags &= ~WORKER_REBIND;
-
-	wake_up_all(&gcwq->rebind_hold);
-
-	/* rebind busy workers */
+	/* all idle workers are rebound, rebind busy workers */
 	for_each_busy_worker(worker, i, pos, gcwq) {
 		struct work_struct *rebind_work = &worker->rebind_work;
+		unsigned long worker_flags = worker->flags;
 
-		/* morph UNBOUND to REBIND */
-		worker->flags &= ~WORKER_UNBOUND;
-		worker->flags |= WORKER_REBIND;
+		/* morph UNBOUND to REBIND atomically */
+		worker_flags &= ~WORKER_UNBOUND;
+		worker_flags |= WORKER_REBIND;
+		ACCESS_ONCE(worker->flags) = worker_flags;
 
 		if (test_and_set_bit(WORK_STRUCT_PENDING_BIT,
 				     work_data_bits(rebind_work)))
@@ -1449,6 +1452,34 @@ retry:
 			    worker->scheduled.next,
 			    work_color_to_flags(WORK_NO_COLOR));
 	}
+
+	/*
+	 * All idle workers are rebound and waiting for %WORKER_REBIND to
+	 * be cleared inside idle_worker_rebind(). Clear and release.
+	 * Clearing %WORKER_REBIND from this foreign context is safe
+	 * because these workers are still guaranteed to be idle.
+	 *
+	 * We need to make sure all idle workers passed WORKER_REBIND wait
+	 * in idle_worker_rebind() before returning; otherwise, workers can
+	 * get stuck at the wait if hotplug cycle repeats.
+	 */
+	idle_rebind.cnt = 1;
+	INIT_COMPLETION(idle_rebind.done);
+
+	for_each_worker_pool(pool, gcwq) {
+		list_for_each_entry(worker, &pool->idle_list, entry) {
+			worker->flags &= ~WORKER_REBIND;
+			idle_rebind.cnt++;
+		}
+	}
+
+	wake_up_all(&gcwq->rebind_hold);
+
+	if (--idle_rebind.cnt) {
+		spin_unlock_irq(&gcwq->lock);
+		wait_for_completion(&idle_rebind.done);
+		spin_lock_irq(&gcwq->lock);
+	}
 }
 
 static struct worker *alloc_worker(void)
@@ -1794,9 +1825,45 @@ static bool manage_workers(struct worker *worker)
 	struct worker_pool *pool = worker->pool;
 	bool ret = false;
 
-	if (!mutex_trylock(&pool->manager_mutex))
+	if (pool->flags & POOL_MANAGING_WORKERS)
 		return ret;
 
+	pool->flags |= POOL_MANAGING_WORKERS;
+
+	/*
+	 * To simplify both worker management and CPU hotplug, hold off
+	 * management while hotplug is in progress. CPU hotplug path can't
+	 * grab %POOL_MANAGING_WORKERS to achieve this because that can
+	 * lead to idle worker depletion (all become busy thinking someone
+	 * else is managing) which in turn can result in deadlock under
+	 * extreme circumstances. Use @pool->manager_mutex to synchronize
+	 * manager against CPU hotplug.
+	 *
+	 * manager_mutex would always be free unless CPU hotplug is in
+	 * progress. trylock first without dropping @gcwq->lock.
+	 */
+	if (unlikely(!mutex_trylock(&pool->manager_mutex))) {
+		spin_unlock_irq(&pool->gcwq->lock);
+		mutex_lock(&pool->manager_mutex);
+		/*
+		 * CPU hotplug could have happened while we were waiting
+		 * for manager_mutex. Hotplug itself can't handle us
+		 * because manager isn't either on idle or busy list, and
+		 * @gcwq's state and ours could have deviated.
+		 *
+		 * As hotplug is now excluded via manager_mutex, we can
+		 * simply try to bind. It will succeed or fail depending
+		 * on @gcwq's current state. Try it and adjust
+		 * %WORKER_UNBOUND accordingly.
+		 */
+		if (worker_maybe_bind_and_lock(worker))
+			worker->flags &= ~WORKER_UNBOUND;
+		else
+			worker->flags |= WORKER_UNBOUND;
+
+		ret = true;
+	}
+
 	pool->flags &= ~POOL_MANAGE_WORKERS;
 
 	/*
@@ -1806,6 +1873,7 @@ static bool manage_workers(struct worker *worker)
 	ret |= maybe_destroy_workers(pool);
 	ret |= maybe_create_worker(pool);
 
+	pool->flags &= ~POOL_MANAGING_WORKERS;
 	mutex_unlock(&pool->manager_mutex);
 	return ret;
 }