Diffstat (limited to 'kernel')

 kernel/audit_tree.c           |  19
 kernel/events/core.c          |  64
 kernel/events/hw_breakpoint.c |  11
 kernel/fork.c                 |   4
 kernel/sched/core.c           | 108
 kernel/sched/fair.c           |  48
 kernel/sched/rt.c             |  14
 kernel/sched/sched.h          |   9
 kernel/sched/stop_task.c      |  22
 kernel/task_work.c            |   1
 kernel/time/tick-sched.c      |   1
 kernel/time/timekeeping.c     |  39
 kernel/timer.c                |   9
 kernel/trace/trace_syscalls.c |   4
 kernel/workqueue.c            | 110

 15 files changed, 294 insertions, 169 deletions
diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c
index 3a5ca582ba1e..ed206fd88cca 100644
--- a/kernel/audit_tree.c
+++ b/kernel/audit_tree.c
@@ -250,7 +250,6 @@ static void untag_chunk(struct node *p) | |||
250 | spin_unlock(&hash_lock); | 250 | spin_unlock(&hash_lock); |
251 | spin_unlock(&entry->lock); | 251 | spin_unlock(&entry->lock); |
252 | fsnotify_destroy_mark(entry); | 252 | fsnotify_destroy_mark(entry); |
253 | fsnotify_put_mark(entry); | ||
254 | goto out; | 253 | goto out; |
255 | } | 254 | } |
256 | 255 | ||
@@ -259,7 +258,7 @@ static void untag_chunk(struct node *p) | |||
259 | 258 | ||
260 | fsnotify_duplicate_mark(&new->mark, entry); | 259 | fsnotify_duplicate_mark(&new->mark, entry); |
261 | if (fsnotify_add_mark(&new->mark, new->mark.group, new->mark.i.inode, NULL, 1)) { | 260 | if (fsnotify_add_mark(&new->mark, new->mark.group, new->mark.i.inode, NULL, 1)) { |
262 | free_chunk(new); | 261 | fsnotify_put_mark(&new->mark); |
263 | goto Fallback; | 262 | goto Fallback; |
264 | } | 263 | } |
265 | 264 | ||
@@ -293,7 +292,7 @@ static void untag_chunk(struct node *p) | |||
293 | spin_unlock(&hash_lock); | 292 | spin_unlock(&hash_lock); |
294 | spin_unlock(&entry->lock); | 293 | spin_unlock(&entry->lock); |
295 | fsnotify_destroy_mark(entry); | 294 | fsnotify_destroy_mark(entry); |
296 | fsnotify_put_mark(entry); | 295 | fsnotify_put_mark(&new->mark); /* drop initial reference */ |
297 | goto out; | 296 | goto out; |
298 | 297 | ||
299 | Fallback: | 298 | Fallback: |
@@ -322,7 +321,7 @@ static int create_chunk(struct inode *inode, struct audit_tree *tree) | |||
322 | 321 | ||
323 | entry = &chunk->mark; | 322 | entry = &chunk->mark; |
324 | if (fsnotify_add_mark(entry, audit_tree_group, inode, NULL, 0)) { | 323 | if (fsnotify_add_mark(entry, audit_tree_group, inode, NULL, 0)) { |
325 | free_chunk(chunk); | 324 | fsnotify_put_mark(entry); |
326 | return -ENOSPC; | 325 | return -ENOSPC; |
327 | } | 326 | } |
328 | 327 | ||
@@ -347,6 +346,7 @@ static int create_chunk(struct inode *inode, struct audit_tree *tree) | |||
347 | insert_hash(chunk); | 346 | insert_hash(chunk); |
348 | spin_unlock(&hash_lock); | 347 | spin_unlock(&hash_lock); |
349 | spin_unlock(&entry->lock); | 348 | spin_unlock(&entry->lock); |
349 | fsnotify_put_mark(entry); /* drop initial reference */ | ||
350 | return 0; | 350 | return 0; |
351 | } | 351 | } |
352 | 352 | ||
@@ -396,7 +396,7 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree) | |||
396 | fsnotify_duplicate_mark(chunk_entry, old_entry); | 396 | fsnotify_duplicate_mark(chunk_entry, old_entry); |
397 | if (fsnotify_add_mark(chunk_entry, chunk_entry->group, chunk_entry->i.inode, NULL, 1)) { | 397 | if (fsnotify_add_mark(chunk_entry, chunk_entry->group, chunk_entry->i.inode, NULL, 1)) { |
398 | spin_unlock(&old_entry->lock); | 398 | spin_unlock(&old_entry->lock); |
399 | free_chunk(chunk); | 399 | fsnotify_put_mark(chunk_entry); |
400 | fsnotify_put_mark(old_entry); | 400 | fsnotify_put_mark(old_entry); |
401 | return -ENOSPC; | 401 | return -ENOSPC; |
402 | } | 402 | } |
@@ -444,8 +444,8 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree) | |||
444 | spin_unlock(&chunk_entry->lock); | 444 | spin_unlock(&chunk_entry->lock); |
445 | spin_unlock(&old_entry->lock); | 445 | spin_unlock(&old_entry->lock); |
446 | fsnotify_destroy_mark(old_entry); | 446 | fsnotify_destroy_mark(old_entry); |
447 | fsnotify_put_mark(chunk_entry); /* drop initial reference */ | ||
447 | fsnotify_put_mark(old_entry); /* pair to fsnotify_find mark_entry */ | 448 | fsnotify_put_mark(old_entry); /* pair to fsnotify_find mark_entry */ |
448 | fsnotify_put_mark(old_entry); /* and kill it */ | ||
449 | return 0; | 449 | return 0; |
450 | } | 450 | } |
451 | 451 | ||
@@ -916,7 +916,12 @@ static void audit_tree_freeing_mark(struct fsnotify_mark *entry, struct fsnotify | |||
916 | struct audit_chunk *chunk = container_of(entry, struct audit_chunk, mark); | 916 | struct audit_chunk *chunk = container_of(entry, struct audit_chunk, mark); |
917 | 917 | ||
918 | evict_chunk(chunk); | 918 | evict_chunk(chunk); |
919 | fsnotify_put_mark(entry); | 919 | |
920 | /* | ||
921 | * We are guaranteed to have at least one reference to the mark from | ||
922 | * either the inode or the caller of fsnotify_destroy_mark(). | ||
923 | */ | ||
924 | BUG_ON(atomic_read(&entry->refcnt) < 1); | ||
920 | } | 925 | } |
921 | 926 | ||
922 | static bool audit_tree_send_event(struct fsnotify_group *group, struct inode *inode, | 927 | static bool audit_tree_send_event(struct fsnotify_group *group, struct inode *inode, |
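The audit_tree hunks above convert the failure paths from free_chunk() to fsnotify_put_mark() and add explicit puts of the initial mark reference once fsnotify_add_mark() has installed it, so the last reference holder, not the caller, ends up freeing the chunk. Below is a minimal userspace sketch of that ownership convention, not kernel code; mark_new/mark_get/mark_put and group_add_mark are invented names standing in for the fsnotify helpers.

```c
#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

struct mark {
	atomic_int refcnt;		/* starts at 1: the creator's reference */
	char payload[32];
};

static struct mark *mark_new(void)
{
	struct mark *m = calloc(1, sizeof(*m));

	if (m)
		atomic_init(&m->refcnt, 1);
	return m;
}

static void mark_get(struct mark *m)
{
	atomic_fetch_add(&m->refcnt, 1);
}

static void mark_put(struct mark *m)
{
	/* Only the last reference holder frees. Calling free() directly
	 * (the old free_chunk() pattern) double-frees whenever someone
	 * else still holds a reference. */
	if (atomic_fetch_sub(&m->refcnt, 1) == 1)
		free(m);
}

/* Stand-in for fsnotify_add_mark(): on success the "group" takes its
 * own reference; on failure it takes none. Either way the caller still
 * owns its initial reference and must drop it with mark_put(). */
static int group_add_mark(struct mark *m, int fail)
{
	if (fail)
		return -1;
	mark_get(m);
	return 0;
}

int main(void)
{
	struct mark *m = mark_new();

	if (!m)
		return 1;
	if (group_add_mark(m, 0) == 0)
		printf("mark installed, refcnt is now 2\n");

	mark_put(m);	/* drop the creator's initial reference: 2 -> 1 */
	mark_put(m);	/* the group's put on teardown: 1 -> 0, object freed */
	return 0;
}
```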
diff --git a/kernel/events/core.c b/kernel/events/core.c
index b7935fcec7d9..7fee567153f0 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -1253,7 +1253,7 @@ retry: | |||
1253 | /* | 1253 | /* |
1254 | * Cross CPU call to disable a performance event | 1254 | * Cross CPU call to disable a performance event |
1255 | */ | 1255 | */ |
1256 | static int __perf_event_disable(void *info) | 1256 | int __perf_event_disable(void *info) |
1257 | { | 1257 | { |
1258 | struct perf_event *event = info; | 1258 | struct perf_event *event = info; |
1259 | struct perf_event_context *ctx = event->ctx; | 1259 | struct perf_event_context *ctx = event->ctx; |
@@ -2935,12 +2935,12 @@ EXPORT_SYMBOL_GPL(perf_event_release_kernel); | |||
2935 | /* | 2935 | /* |
2936 | * Called when the last reference to the file is gone. | 2936 | * Called when the last reference to the file is gone. |
2937 | */ | 2937 | */ |
2938 | static int perf_release(struct inode *inode, struct file *file) | 2938 | static void put_event(struct perf_event *event) |
2939 | { | 2939 | { |
2940 | struct perf_event *event = file->private_data; | ||
2941 | struct task_struct *owner; | 2940 | struct task_struct *owner; |
2942 | 2941 | ||
2943 | file->private_data = NULL; | 2942 | if (!atomic_long_dec_and_test(&event->refcount)) |
2943 | return; | ||
2944 | 2944 | ||
2945 | rcu_read_lock(); | 2945 | rcu_read_lock(); |
2946 | owner = ACCESS_ONCE(event->owner); | 2946 | owner = ACCESS_ONCE(event->owner); |
@@ -2975,7 +2975,13 @@ static int perf_release(struct inode *inode, struct file *file) | |||
2975 | put_task_struct(owner); | 2975 | put_task_struct(owner); |
2976 | } | 2976 | } |
2977 | 2977 | ||
2978 | return perf_event_release_kernel(event); | 2978 | perf_event_release_kernel(event); |
2979 | } | ||
2980 | |||
2981 | static int perf_release(struct inode *inode, struct file *file) | ||
2982 | { | ||
2983 | put_event(file->private_data); | ||
2984 | return 0; | ||
2979 | } | 2985 | } |
2980 | 2986 | ||
2981 | u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running) | 2987 | u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running) |
@@ -3227,7 +3233,7 @@ unlock: | |||
3227 | 3233 | ||
3228 | static const struct file_operations perf_fops; | 3234 | static const struct file_operations perf_fops; |
3229 | 3235 | ||
3230 | static struct perf_event *perf_fget_light(int fd, int *fput_needed) | 3236 | static struct file *perf_fget_light(int fd, int *fput_needed) |
3231 | { | 3237 | { |
3232 | struct file *file; | 3238 | struct file *file; |
3233 | 3239 | ||
@@ -3241,7 +3247,7 @@ static struct perf_event *perf_fget_light(int fd, int *fput_needed) | |||
3241 | return ERR_PTR(-EBADF); | 3247 | return ERR_PTR(-EBADF); |
3242 | } | 3248 | } |
3243 | 3249 | ||
3244 | return file->private_data; | 3250 | return file; |
3245 | } | 3251 | } |
3246 | 3252 | ||
3247 | static int perf_event_set_output(struct perf_event *event, | 3253 | static int perf_event_set_output(struct perf_event *event, |
@@ -3273,19 +3279,21 @@ static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg) | |||
3273 | 3279 | ||
3274 | case PERF_EVENT_IOC_SET_OUTPUT: | 3280 | case PERF_EVENT_IOC_SET_OUTPUT: |
3275 | { | 3281 | { |
3282 | struct file *output_file = NULL; | ||
3276 | struct perf_event *output_event = NULL; | 3283 | struct perf_event *output_event = NULL; |
3277 | int fput_needed = 0; | 3284 | int fput_needed = 0; |
3278 | int ret; | 3285 | int ret; |
3279 | 3286 | ||
3280 | if (arg != -1) { | 3287 | if (arg != -1) { |
3281 | output_event = perf_fget_light(arg, &fput_needed); | 3288 | output_file = perf_fget_light(arg, &fput_needed); |
3282 | if (IS_ERR(output_event)) | 3289 | if (IS_ERR(output_file)) |
3283 | return PTR_ERR(output_event); | 3290 | return PTR_ERR(output_file); |
3291 | output_event = output_file->private_data; | ||
3284 | } | 3292 | } |
3285 | 3293 | ||
3286 | ret = perf_event_set_output(event, output_event); | 3294 | ret = perf_event_set_output(event, output_event); |
3287 | if (output_event) | 3295 | if (output_event) |
3288 | fput_light(output_event->filp, fput_needed); | 3296 | fput_light(output_file, fput_needed); |
3289 | 3297 | ||
3290 | return ret; | 3298 | return ret; |
3291 | } | 3299 | } |
@@ -5950,6 +5958,7 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, | |||
5950 | 5958 | ||
5951 | mutex_init(&event->mmap_mutex); | 5959 | mutex_init(&event->mmap_mutex); |
5952 | 5960 | ||
5961 | atomic_long_set(&event->refcount, 1); | ||
5953 | event->cpu = cpu; | 5962 | event->cpu = cpu; |
5954 | event->attr = *attr; | 5963 | event->attr = *attr; |
5955 | event->group_leader = group_leader; | 5964 | event->group_leader = group_leader; |
@@ -6260,12 +6269,12 @@ SYSCALL_DEFINE5(perf_event_open, | |||
6260 | return event_fd; | 6269 | return event_fd; |
6261 | 6270 | ||
6262 | if (group_fd != -1) { | 6271 | if (group_fd != -1) { |
6263 | group_leader = perf_fget_light(group_fd, &fput_needed); | 6272 | group_file = perf_fget_light(group_fd, &fput_needed); |
6264 | if (IS_ERR(group_leader)) { | 6273 | if (IS_ERR(group_file)) { |
6265 | err = PTR_ERR(group_leader); | 6274 | err = PTR_ERR(group_file); |
6266 | goto err_fd; | 6275 | goto err_fd; |
6267 | } | 6276 | } |
6268 | group_file = group_leader->filp; | 6277 | group_leader = group_file->private_data; |
6269 | if (flags & PERF_FLAG_FD_OUTPUT) | 6278 | if (flags & PERF_FLAG_FD_OUTPUT) |
6270 | output_event = group_leader; | 6279 | output_event = group_leader; |
6271 | if (flags & PERF_FLAG_FD_NO_GROUP) | 6280 | if (flags & PERF_FLAG_FD_NO_GROUP) |
@@ -6402,7 +6411,6 @@ SYSCALL_DEFINE5(perf_event_open, | |||
6402 | put_ctx(gctx); | 6411 | put_ctx(gctx); |
6403 | } | 6412 | } |
6404 | 6413 | ||
6405 | event->filp = event_file; | ||
6406 | WARN_ON_ONCE(ctx->parent_ctx); | 6414 | WARN_ON_ONCE(ctx->parent_ctx); |
6407 | mutex_lock(&ctx->mutex); | 6415 | mutex_lock(&ctx->mutex); |
6408 | 6416 | ||
@@ -6496,7 +6504,6 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu, | |||
6496 | goto err_free; | 6504 | goto err_free; |
6497 | } | 6505 | } |
6498 | 6506 | ||
6499 | event->filp = NULL; | ||
6500 | WARN_ON_ONCE(ctx->parent_ctx); | 6507 | WARN_ON_ONCE(ctx->parent_ctx); |
6501 | mutex_lock(&ctx->mutex); | 6508 | mutex_lock(&ctx->mutex); |
6502 | perf_install_in_context(ctx, event, cpu); | 6509 | perf_install_in_context(ctx, event, cpu); |
@@ -6578,7 +6585,7 @@ static void sync_child_event(struct perf_event *child_event, | |||
6578 | * Release the parent event, if this was the last | 6585 | * Release the parent event, if this was the last |
6579 | * reference to it. | 6586 | * reference to it. |
6580 | */ | 6587 | */ |
6581 | fput(parent_event->filp); | 6588 | put_event(parent_event); |
6582 | } | 6589 | } |
6583 | 6590 | ||
6584 | static void | 6591 | static void |
@@ -6654,9 +6661,8 @@ static void perf_event_exit_task_context(struct task_struct *child, int ctxn) | |||
6654 | * | 6661 | * |
6655 | * __perf_event_exit_task() | 6662 | * __perf_event_exit_task() |
6656 | * sync_child_event() | 6663 | * sync_child_event() |
6657 | * fput(parent_event->filp) | 6664 | * put_event() |
6658 | * perf_release() | 6665 | * mutex_lock(&ctx->mutex) |
6659 | * mutex_lock(&ctx->mutex) | ||
6660 | * | 6666 | * |
6661 | * But since its the parent context it won't be the same instance. | 6667 | * But since its the parent context it won't be the same instance. |
6662 | */ | 6668 | */ |
@@ -6724,7 +6730,7 @@ static void perf_free_event(struct perf_event *event, | |||
6724 | list_del_init(&event->child_list); | 6730 | list_del_init(&event->child_list); |
6725 | mutex_unlock(&parent->child_mutex); | 6731 | mutex_unlock(&parent->child_mutex); |
6726 | 6732 | ||
6727 | fput(parent->filp); | 6733 | put_event(parent); |
6728 | 6734 | ||
6729 | perf_group_detach(event); | 6735 | perf_group_detach(event); |
6730 | list_del_event(event, ctx); | 6736 | list_del_event(event, ctx); |
@@ -6804,6 +6810,12 @@ inherit_event(struct perf_event *parent_event, | |||
6804 | NULL, NULL); | 6810 | NULL, NULL); |
6805 | if (IS_ERR(child_event)) | 6811 | if (IS_ERR(child_event)) |
6806 | return child_event; | 6812 | return child_event; |
6813 | |||
6814 | if (!atomic_long_inc_not_zero(&parent_event->refcount)) { | ||
6815 | free_event(child_event); | ||
6816 | return NULL; | ||
6817 | } | ||
6818 | |||
6807 | get_ctx(child_ctx); | 6819 | get_ctx(child_ctx); |
6808 | 6820 | ||
6809 | /* | 6821 | /* |
@@ -6845,14 +6857,6 @@ inherit_event(struct perf_event *parent_event, | |||
6845 | raw_spin_unlock_irqrestore(&child_ctx->lock, flags); | 6857 | raw_spin_unlock_irqrestore(&child_ctx->lock, flags); |
6846 | 6858 | ||
6847 | /* | 6859 | /* |
6848 | * Get a reference to the parent filp - we will fput it | ||
6849 | * when the child event exits. This is safe to do because | ||
6850 | * we are in the parent and we know that the filp still | ||
6851 | * exists and has a nonzero count: | ||
6852 | */ | ||
6853 | atomic_long_inc(&parent_event->filp->f_count); | ||
6854 | |||
6855 | /* | ||
6856 | * Link this into the parent event's child list | 6860 | * Link this into the parent event's child list |
6857 | */ | 6861 | */ |
6858 | WARN_ON_ONCE(parent_event->ctx->parent_ctx); | 6862 | WARN_ON_ONCE(parent_event->ctx->parent_ctx); |
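The perf changes above move event lifetime from the backing file's f_count to an embedded refcount: perf_event_alloc() starts it at 1, perf_release() and the exit paths drop it through put_event(), and inherit_event() only attaches a child after atomic_long_inc_not_zero() confirms the parent is still live. A hedged userspace sketch of the inc-not-zero idiom with C11 atomics follows; the helper names are invented for illustration.

```c
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

/* Take a reference only if the object is still alive (count > 0).
 * Returns false if the count already reached zero, meaning the object
 * is being torn down and must not be used. */
static bool ref_inc_not_zero(atomic_long *ref)
{
	long old = atomic_load(ref);

	do {
		if (old == 0)
			return false;
	} while (!atomic_compare_exchange_weak(ref, &old, old + 1));

	return true;
}

/* Drop a reference; returns true when this was the last one. */
static bool ref_put(atomic_long *ref)
{
	return atomic_fetch_sub(ref, 1) == 1;
}

int main(void)
{
	atomic_long refcount = 1;	/* the allocation's initial reference */

	if (ref_inc_not_zero(&refcount))
		printf("child attached, refcount is now 2\n");

	ref_put(&refcount);		/* child side drops its reference */
	if (ref_put(&refcount))		/* last put: safe to free here */
		printf("last reference gone, object can be freed\n");
	return 0;
}
```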
diff --git a/kernel/events/hw_breakpoint.c b/kernel/events/hw_breakpoint.c
index bb38c4d3ee12..9a7b487c6fe2 100644
--- a/kernel/events/hw_breakpoint.c
+++ b/kernel/events/hw_breakpoint.c
@@ -453,7 +453,16 @@ int modify_user_hw_breakpoint(struct perf_event *bp, struct perf_event_attr *att | |||
453 | int old_type = bp->attr.bp_type; | 453 | int old_type = bp->attr.bp_type; |
454 | int err = 0; | 454 | int err = 0; |
455 | 455 | ||
456 | perf_event_disable(bp); | 456 | /* |
457 | * modify_user_hw_breakpoint can be invoked with IRQs disabled and hence it | ||
458 | * will not be possible to raise IPIs that invoke __perf_event_disable. | ||
459 | * So call the function directly after making sure we are targeting the | ||
460 | * current task. | ||
461 | */ | ||
462 | if (irqs_disabled() && bp->ctx && bp->ctx->task == current) | ||
463 | __perf_event_disable(bp); | ||
464 | else | ||
465 | perf_event_disable(bp); | ||
457 | 466 | ||
458 | bp->attr.bp_addr = attr->bp_addr; | 467 | bp->attr.bp_addr = attr->bp_addr; |
459 | bp->attr.bp_type = attr->bp_type; | 468 | bp->attr.bp_type = attr->bp_type; |
diff --git a/kernel/fork.c b/kernel/fork.c
index 3bd2280d79f6..2c8857e12855 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -455,8 +455,8 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) | |||
455 | if (retval) | 455 | if (retval) |
456 | goto out; | 456 | goto out; |
457 | 457 | ||
458 | if (file && uprobe_mmap(tmp)) | 458 | if (file) |
459 | goto out; | 459 | uprobe_mmap(tmp); |
460 | } | 460 | } |
461 | /* a new mm has just been created */ | 461 | /* a new mm has just been created */ |
462 | arch_dup_mmap(oldmm, mm); | 462 | arch_dup_mmap(oldmm, mm); |
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 82ad284f823b..649c9f876cb1 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -3142,6 +3142,20 @@ void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st) | |||
3142 | # define nsecs_to_cputime(__nsecs) nsecs_to_jiffies(__nsecs) | 3142 | # define nsecs_to_cputime(__nsecs) nsecs_to_jiffies(__nsecs) |
3143 | #endif | 3143 | #endif |
3144 | 3144 | ||
3145 | static cputime_t scale_utime(cputime_t utime, cputime_t rtime, cputime_t total) | ||
3146 | { | ||
3147 | u64 temp = (__force u64) rtime; | ||
3148 | |||
3149 | temp *= (__force u64) utime; | ||
3150 | |||
3151 | if (sizeof(cputime_t) == 4) | ||
3152 | temp = div_u64(temp, (__force u32) total); | ||
3153 | else | ||
3154 | temp = div64_u64(temp, (__force u64) total); | ||
3155 | |||
3156 | return (__force cputime_t) temp; | ||
3157 | } | ||
3158 | |||
3145 | void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st) | 3159 | void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st) |
3146 | { | 3160 | { |
3147 | cputime_t rtime, utime = p->utime, total = utime + p->stime; | 3161 | cputime_t rtime, utime = p->utime, total = utime + p->stime; |
@@ -3151,13 +3165,9 @@ void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st) | |||
3151 | */ | 3165 | */ |
3152 | rtime = nsecs_to_cputime(p->se.sum_exec_runtime); | 3166 | rtime = nsecs_to_cputime(p->se.sum_exec_runtime); |
3153 | 3167 | ||
3154 | if (total) { | 3168 | if (total) |
3155 | u64 temp = (__force u64) rtime; | 3169 | utime = scale_utime(utime, rtime, total); |
3156 | 3170 | else | |
3157 | temp *= (__force u64) utime; | ||
3158 | do_div(temp, (__force u32) total); | ||
3159 | utime = (__force cputime_t) temp; | ||
3160 | } else | ||
3161 | utime = rtime; | 3171 | utime = rtime; |
3162 | 3172 | ||
3163 | /* | 3173 | /* |
@@ -3184,13 +3194,9 @@ void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st) | |||
3184 | total = cputime.utime + cputime.stime; | 3194 | total = cputime.utime + cputime.stime; |
3185 | rtime = nsecs_to_cputime(cputime.sum_exec_runtime); | 3195 | rtime = nsecs_to_cputime(cputime.sum_exec_runtime); |
3186 | 3196 | ||
3187 | if (total) { | 3197 | if (total) |
3188 | u64 temp = (__force u64) rtime; | 3198 | utime = scale_utime(cputime.utime, rtime, total); |
3189 | 3199 | else | |
3190 | temp *= (__force u64) cputime.utime; | ||
3191 | do_div(temp, (__force u32) total); | ||
3192 | utime = (__force cputime_t) temp; | ||
3193 | } else | ||
3194 | utime = rtime; | 3200 | utime = rtime; |
3195 | 3201 | ||
3196 | sig->prev_utime = max(sig->prev_utime, utime); | 3202 | sig->prev_utime = max(sig->prev_utime, utime); |
@@ -5298,27 +5304,17 @@ void idle_task_exit(void) | |||
5298 | } | 5304 | } |
5299 | 5305 | ||
5300 | /* | 5306 | /* |
5301 | * While a dead CPU has no uninterruptible tasks queued at this point, | 5307 | * Since this CPU is going 'away' for a while, fold any nr_active delta |
5302 | * it might still have a nonzero ->nr_uninterruptible counter, because | 5308 | * we might have. Assumes we're called after migrate_tasks() so that the |
5303 | * for performance reasons the counter is not stricly tracking tasks to | 5309 | * nr_active count is stable. |
5304 | * their home CPUs. So we just add the counter to another CPU's counter, | 5310 | * |
5305 | * to keep the global sum constant after CPU-down: | 5311 | * Also see the comment "Global load-average calculations". |
5306 | */ | ||
5307 | static void migrate_nr_uninterruptible(struct rq *rq_src) | ||
5308 | { | ||
5309 | struct rq *rq_dest = cpu_rq(cpumask_any(cpu_active_mask)); | ||
5310 | |||
5311 | rq_dest->nr_uninterruptible += rq_src->nr_uninterruptible; | ||
5312 | rq_src->nr_uninterruptible = 0; | ||
5313 | } | ||
5314 | |||
5315 | /* | ||
5316 | * remove the tasks which were accounted by rq from calc_load_tasks. | ||
5317 | */ | 5312 | */ |
5318 | static void calc_global_load_remove(struct rq *rq) | 5313 | static void calc_load_migrate(struct rq *rq) |
5319 | { | 5314 | { |
5320 | atomic_long_sub(rq->calc_load_active, &calc_load_tasks); | 5315 | long delta = calc_load_fold_active(rq); |
5321 | rq->calc_load_active = 0; | 5316 | if (delta) |
5317 | atomic_long_add(delta, &calc_load_tasks); | ||
5322 | } | 5318 | } |
5323 | 5319 | ||
5324 | /* | 5320 | /* |
@@ -5346,9 +5342,6 @@ static void migrate_tasks(unsigned int dead_cpu) | |||
5346 | */ | 5342 | */ |
5347 | rq->stop = NULL; | 5343 | rq->stop = NULL; |
5348 | 5344 | ||
5349 | /* Ensure any throttled groups are reachable by pick_next_task */ | ||
5350 | unthrottle_offline_cfs_rqs(rq); | ||
5351 | |||
5352 | for ( ; ; ) { | 5345 | for ( ; ; ) { |
5353 | /* | 5346 | /* |
5354 | * There's this thread running, bail when that's the only | 5347 | * There's this thread running, bail when that's the only |
@@ -5612,8 +5605,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) | |||
5612 | BUG_ON(rq->nr_running != 1); /* the migration thread */ | 5605 | BUG_ON(rq->nr_running != 1); /* the migration thread */ |
5613 | raw_spin_unlock_irqrestore(&rq->lock, flags); | 5606 | raw_spin_unlock_irqrestore(&rq->lock, flags); |
5614 | 5607 | ||
5615 | migrate_nr_uninterruptible(rq); | 5608 | calc_load_migrate(rq); |
5616 | calc_global_load_remove(rq); | ||
5617 | break; | 5609 | break; |
5618 | #endif | 5610 | #endif |
5619 | } | 5611 | } |
@@ -6022,11 +6014,6 @@ static void destroy_sched_domains(struct sched_domain *sd, int cpu) | |||
6022 | * SD_SHARE_PKG_RESOURCE set (Last Level Cache Domain) for this | 6014 | * SD_SHARE_PKG_RESOURCE set (Last Level Cache Domain) for this |
6023 | * allows us to avoid some pointer chasing select_idle_sibling(). | 6015 | * allows us to avoid some pointer chasing select_idle_sibling(). |
6024 | * | 6016 | * |
6025 | * Iterate domains and sched_groups downward, assigning CPUs to be | ||
6026 | * select_idle_sibling() hw buddy. Cross-wiring hw makes bouncing | ||
6027 | * due to random perturbation self canceling, ie sw buddies pull | ||
6028 | * their counterpart to their CPU's hw counterpart. | ||
6029 | * | ||
6030 | * Also keep a unique ID per domain (we use the first cpu number in | 6017 | * Also keep a unique ID per domain (we use the first cpu number in |
6031 | * the cpumask of the domain), this allows us to quickly tell if | 6018 | * the cpumask of the domain), this allows us to quickly tell if |
6032 | * two cpus are in the same cache domain, see cpus_share_cache(). | 6019 | * two cpus are in the same cache domain, see cpus_share_cache(). |
@@ -6040,40 +6027,8 @@ static void update_top_cache_domain(int cpu) | |||
6040 | int id = cpu; | 6027 | int id = cpu; |
6041 | 6028 | ||
6042 | sd = highest_flag_domain(cpu, SD_SHARE_PKG_RESOURCES); | 6029 | sd = highest_flag_domain(cpu, SD_SHARE_PKG_RESOURCES); |
6043 | if (sd) { | 6030 | if (sd) |
6044 | struct sched_domain *tmp = sd; | ||
6045 | struct sched_group *sg, *prev; | ||
6046 | bool right; | ||
6047 | |||
6048 | /* | ||
6049 | * Traverse to first CPU in group, and count hops | ||
6050 | * to cpu from there, switching direction on each | ||
6051 | * hop, never ever pointing the last CPU rightward. | ||
6052 | */ | ||
6053 | do { | ||
6054 | id = cpumask_first(sched_domain_span(tmp)); | ||
6055 | prev = sg = tmp->groups; | ||
6056 | right = 1; | ||
6057 | |||
6058 | while (cpumask_first(sched_group_cpus(sg)) != id) | ||
6059 | sg = sg->next; | ||
6060 | |||
6061 | while (!cpumask_test_cpu(cpu, sched_group_cpus(sg))) { | ||
6062 | prev = sg; | ||
6063 | sg = sg->next; | ||
6064 | right = !right; | ||
6065 | } | ||
6066 | |||
6067 | /* A CPU went down, never point back to domain start. */ | ||
6068 | if (right && cpumask_first(sched_group_cpus(sg->next)) == id) | ||
6069 | right = false; | ||
6070 | |||
6071 | sg = right ? sg->next : prev; | ||
6072 | tmp->idle_buddy = cpumask_first(sched_group_cpus(sg)); | ||
6073 | } while ((tmp = tmp->child)); | ||
6074 | |||
6075 | id = cpumask_first(sched_domain_span(sd)); | 6031 | id = cpumask_first(sched_domain_span(sd)); |
6076 | } | ||
6077 | 6032 | ||
6078 | rcu_assign_pointer(per_cpu(sd_llc, cpu), sd); | 6033 | rcu_assign_pointer(per_cpu(sd_llc, cpu), sd); |
6079 | per_cpu(sd_llc_id, cpu) = id; | 6034 | per_cpu(sd_llc_id, cpu) = id; |
@@ -7246,6 +7201,7 @@ int in_sched_functions(unsigned long addr) | |||
7246 | 7201 | ||
7247 | #ifdef CONFIG_CGROUP_SCHED | 7202 | #ifdef CONFIG_CGROUP_SCHED |
7248 | struct task_group root_task_group; | 7203 | struct task_group root_task_group; |
7204 | LIST_HEAD(task_groups); | ||
7249 | #endif | 7205 | #endif |
7250 | 7206 | ||
7251 | DECLARE_PER_CPU(cpumask_var_t, load_balance_tmpmask); | 7207 | DECLARE_PER_CPU(cpumask_var_t, load_balance_tmpmask); |
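Among the sched/core.c changes, the new scale_utime() helper computes rtime * utime / total entirely in 64 bits, choosing div_u64() or div64_u64() according to the width of cputime_t, because the old do_div()-based code truncated the divisor to 32 bits. The standalone program below (plain C types and arbitrary example values, not the kernel's) shows how badly a truncated divisor goes wrong once total exceeds UINT32_MAX.

```c
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	/* cputime in nanoseconds: anything past ~4.29s no longer fits in
	 * 32 bits, which is exactly when a truncated divisor bites. */
	uint64_t utime = 3000000000ULL;		/* 3s of user time */
	uint64_t stime = 5000000000ULL;		/* 5s of system time */
	uint64_t total = utime + stime;		/* 8s > UINT32_MAX ns */
	uint64_t rtime = 4000000000ULL;		/* 4s of measured runtime */

	uint64_t good = rtime * utime / total;		 /* full 64-bit divide */
	uint64_t bad  = rtime * utime / (uint32_t)total; /* divisor truncated */

	printf("64-bit divisor: %llu ns (3/8 of rtime, as expected)\n",
	       (unsigned long long)good);
	printf("32-bit divisor: %llu ns (larger than rtime itself)\n",
	       (unsigned long long)bad);
	return 0;
}
```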
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index d0cc03b3e70b..96e2b18b6283 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -2052,7 +2052,7 @@ static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) | |||
2052 | hrtimer_cancel(&cfs_b->slack_timer); | 2052 | hrtimer_cancel(&cfs_b->slack_timer); |
2053 | } | 2053 | } |
2054 | 2054 | ||
2055 | void unthrottle_offline_cfs_rqs(struct rq *rq) | 2055 | static void unthrottle_offline_cfs_rqs(struct rq *rq) |
2056 | { | 2056 | { |
2057 | struct cfs_rq *cfs_rq; | 2057 | struct cfs_rq *cfs_rq; |
2058 | 2058 | ||
@@ -2106,7 +2106,7 @@ static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg) | |||
2106 | return NULL; | 2106 | return NULL; |
2107 | } | 2107 | } |
2108 | static inline void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {} | 2108 | static inline void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {} |
2109 | void unthrottle_offline_cfs_rqs(struct rq *rq) {} | 2109 | static inline void unthrottle_offline_cfs_rqs(struct rq *rq) {} |
2110 | 2110 | ||
2111 | #endif /* CONFIG_CFS_BANDWIDTH */ | 2111 | #endif /* CONFIG_CFS_BANDWIDTH */ |
2112 | 2112 | ||
@@ -2637,6 +2637,8 @@ static int select_idle_sibling(struct task_struct *p, int target) | |||
2637 | int cpu = smp_processor_id(); | 2637 | int cpu = smp_processor_id(); |
2638 | int prev_cpu = task_cpu(p); | 2638 | int prev_cpu = task_cpu(p); |
2639 | struct sched_domain *sd; | 2639 | struct sched_domain *sd; |
2640 | struct sched_group *sg; | ||
2641 | int i; | ||
2640 | 2642 | ||
2641 | /* | 2643 | /* |
2642 | * If the task is going to be woken-up on this cpu and if it is | 2644 | * If the task is going to be woken-up on this cpu and if it is |
@@ -2653,17 +2655,29 @@ static int select_idle_sibling(struct task_struct *p, int target) | |||
2653 | return prev_cpu; | 2655 | return prev_cpu; |
2654 | 2656 | ||
2655 | /* | 2657 | /* |
2656 | * Otherwise, check assigned siblings to find an elegible idle cpu. | 2658 | * Otherwise, iterate the domains and find an elegible idle cpu. |
2657 | */ | 2659 | */ |
2658 | sd = rcu_dereference(per_cpu(sd_llc, target)); | 2660 | sd = rcu_dereference(per_cpu(sd_llc, target)); |
2659 | |||
2660 | for_each_lower_domain(sd) { | 2661 | for_each_lower_domain(sd) { |
2661 | if (!cpumask_test_cpu(sd->idle_buddy, tsk_cpus_allowed(p))) | 2662 | sg = sd->groups; |
2662 | continue; | 2663 | do { |
2663 | if (idle_cpu(sd->idle_buddy)) | 2664 | if (!cpumask_intersects(sched_group_cpus(sg), |
2664 | return sd->idle_buddy; | 2665 | tsk_cpus_allowed(p))) |
2665 | } | 2666 | goto next; |
2666 | 2667 | ||
2668 | for_each_cpu(i, sched_group_cpus(sg)) { | ||
2669 | if (!idle_cpu(i)) | ||
2670 | goto next; | ||
2671 | } | ||
2672 | |||
2673 | target = cpumask_first_and(sched_group_cpus(sg), | ||
2674 | tsk_cpus_allowed(p)); | ||
2675 | goto done; | ||
2676 | next: | ||
2677 | sg = sg->next; | ||
2678 | } while (sg != sd->groups); | ||
2679 | } | ||
2680 | done: | ||
2667 | return target; | 2681 | return target; |
2668 | } | 2682 | } |
2669 | 2683 | ||
@@ -3387,6 +3401,14 @@ static int tg_load_down(struct task_group *tg, void *data) | |||
3387 | 3401 | ||
3388 | static void update_h_load(long cpu) | 3402 | static void update_h_load(long cpu) |
3389 | { | 3403 | { |
3404 | struct rq *rq = cpu_rq(cpu); | ||
3405 | unsigned long now = jiffies; | ||
3406 | |||
3407 | if (rq->h_load_throttle == now) | ||
3408 | return; | ||
3409 | |||
3410 | rq->h_load_throttle = now; | ||
3411 | |||
3390 | rcu_read_lock(); | 3412 | rcu_read_lock(); |
3391 | walk_tg_tree(tg_load_down, tg_nop, (void *)cpu); | 3413 | walk_tg_tree(tg_load_down, tg_nop, (void *)cpu); |
3392 | rcu_read_unlock(); | 3414 | rcu_read_unlock(); |
@@ -3650,7 +3672,6 @@ fix_small_capacity(struct sched_domain *sd, struct sched_group *group) | |||
3650 | * @group: sched_group whose statistics are to be updated. | 3672 | * @group: sched_group whose statistics are to be updated. |
3651 | * @load_idx: Load index of sched_domain of this_cpu for load calc. | 3673 | * @load_idx: Load index of sched_domain of this_cpu for load calc. |
3652 | * @local_group: Does group contain this_cpu. | 3674 | * @local_group: Does group contain this_cpu. |
3653 | * @cpus: Set of cpus considered for load balancing. | ||
3654 | * @balance: Should we balance. | 3675 | * @balance: Should we balance. |
3655 | * @sgs: variable to hold the statistics for this group. | 3676 | * @sgs: variable to hold the statistics for this group. |
3656 | */ | 3677 | */ |
@@ -3797,7 +3818,6 @@ static bool update_sd_pick_busiest(struct lb_env *env, | |||
3797 | /** | 3818 | /** |
3798 | * update_sd_lb_stats - Update sched_domain's statistics for load balancing. | 3819 | * update_sd_lb_stats - Update sched_domain's statistics for load balancing. |
3799 | * @env: The load balancing environment. | 3820 | * @env: The load balancing environment. |
3800 | * @cpus: Set of cpus considered for load balancing. | ||
3801 | * @balance: Should we balance. | 3821 | * @balance: Should we balance. |
3802 | * @sds: variable to hold the statistics for this sched_domain. | 3822 | * @sds: variable to hold the statistics for this sched_domain. |
3803 | */ | 3823 | */ |
@@ -4293,11 +4313,10 @@ redo: | |||
4293 | env.src_rq = busiest; | 4313 | env.src_rq = busiest; |
4294 | env.loop_max = min(sysctl_sched_nr_migrate, busiest->nr_running); | 4314 | env.loop_max = min(sysctl_sched_nr_migrate, busiest->nr_running); |
4295 | 4315 | ||
4316 | update_h_load(env.src_cpu); | ||
4296 | more_balance: | 4317 | more_balance: |
4297 | local_irq_save(flags); | 4318 | local_irq_save(flags); |
4298 | double_rq_lock(this_rq, busiest); | 4319 | double_rq_lock(this_rq, busiest); |
4299 | if (!env.loop) | ||
4300 | update_h_load(env.src_cpu); | ||
4301 | 4320 | ||
4302 | /* | 4321 | /* |
4303 | * cur_ld_moved - load moved in current iteration | 4322 | * cur_ld_moved - load moved in current iteration |
@@ -4949,6 +4968,9 @@ static void rq_online_fair(struct rq *rq) | |||
4949 | static void rq_offline_fair(struct rq *rq) | 4968 | static void rq_offline_fair(struct rq *rq) |
4950 | { | 4969 | { |
4951 | update_sysctl(); | 4970 | update_sysctl(); |
4971 | |||
4972 | /* Ensure any throttled groups are reachable by pick_next_task */ | ||
4973 | unthrottle_offline_cfs_rqs(rq); | ||
4952 | } | 4974 | } |
4953 | 4975 | ||
4954 | #endif /* CONFIG_SMP */ | 4976 | #endif /* CONFIG_SMP */ |
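In the fair.c hunks, update_h_load() moves out of the balance retry loop and is rate-limited by the new rq->h_load_throttle jiffies stamp, so the task-group tree walk runs at most once per jiffy per runqueue. The sketch below shows the same memoize-by-timestamp pattern in isolation; the names and the integer "tick" are placeholders, not the kernel's.

```c
#include <stdio.h>

static long cache_stamp = -1;	/* plays the role of rq->h_load_throttle */
static long expensive_calls;

static void recompute_hierarchy_load(void)
{
	expensive_calls++;	/* stands in for the walk_tg_tree() pass */
}

/* Run the expensive recomputation at most once per tick: if we already
 * ran during this tick, the cached result is treated as fresh enough. */
static void update_h_load(long now)
{
	if (cache_stamp == now)
		return;
	cache_stamp = now;
	recompute_hierarchy_load();
}

int main(void)
{
	for (int i = 0; i < 1000; i++)
		update_h_load(i / 100);	/* 1000 calls spread over 10 ticks */

	printf("recomputed %ld times for 1000 calls\n", expensive_calls);
	return 0;
}
```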
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 573e1ca01102..e0b7ba9c040f 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -691,6 +691,7 @@ balanced: | |||
691 | * runtime - in which case borrowing doesn't make sense. | 691 | * runtime - in which case borrowing doesn't make sense. |
692 | */ | 692 | */ |
693 | rt_rq->rt_runtime = RUNTIME_INF; | 693 | rt_rq->rt_runtime = RUNTIME_INF; |
694 | rt_rq->rt_throttled = 0; | ||
694 | raw_spin_unlock(&rt_rq->rt_runtime_lock); | 695 | raw_spin_unlock(&rt_rq->rt_runtime_lock); |
695 | raw_spin_unlock(&rt_b->rt_runtime_lock); | 696 | raw_spin_unlock(&rt_b->rt_runtime_lock); |
696 | } | 697 | } |
@@ -788,6 +789,19 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun) | |||
788 | const struct cpumask *span; | 789 | const struct cpumask *span; |
789 | 790 | ||
790 | span = sched_rt_period_mask(); | 791 | span = sched_rt_period_mask(); |
792 | #ifdef CONFIG_RT_GROUP_SCHED | ||
793 | /* | ||
794 | * FIXME: isolated CPUs should really leave the root task group, | ||
795 | * whether they are isolcpus or were isolated via cpusets, lest | ||
796 | * the timer run on a CPU which does not service all runqueues, | ||
797 | * potentially leaving other CPUs indefinitely throttled. If | ||
798 | * isolation is really required, the user will turn the throttle | ||
799 | * off to kill the perturbations it causes anyway. Meanwhile, | ||
800 | * this maintains functionality for boot and/or troubleshooting. | ||
801 | */ | ||
802 | if (rt_b == &root_task_group.rt_bandwidth) | ||
803 | span = cpu_online_mask; | ||
804 | #endif | ||
791 | for_each_cpu(i, span) { | 805 | for_each_cpu(i, span) { |
792 | int enqueue = 0; | 806 | int enqueue = 0; |
793 | struct rt_rq *rt_rq = sched_rt_period_rt_rq(rt_b, i); | 807 | struct rt_rq *rt_rq = sched_rt_period_rt_rq(rt_b, i); |
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index c35a1a7dd4d6..0848fa36c383 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -80,7 +80,7 @@ extern struct mutex sched_domains_mutex; | |||
80 | struct cfs_rq; | 80 | struct cfs_rq; |
81 | struct rt_rq; | 81 | struct rt_rq; |
82 | 82 | ||
83 | static LIST_HEAD(task_groups); | 83 | extern struct list_head task_groups; |
84 | 84 | ||
85 | struct cfs_bandwidth { | 85 | struct cfs_bandwidth { |
86 | #ifdef CONFIG_CFS_BANDWIDTH | 86 | #ifdef CONFIG_CFS_BANDWIDTH |
@@ -374,7 +374,11 @@ struct rq { | |||
374 | #ifdef CONFIG_FAIR_GROUP_SCHED | 374 | #ifdef CONFIG_FAIR_GROUP_SCHED |
375 | /* list of leaf cfs_rq on this cpu: */ | 375 | /* list of leaf cfs_rq on this cpu: */ |
376 | struct list_head leaf_cfs_rq_list; | 376 | struct list_head leaf_cfs_rq_list; |
377 | #endif | 377 | #ifdef CONFIG_SMP |
378 | unsigned long h_load_throttle; | ||
379 | #endif /* CONFIG_SMP */ | ||
380 | #endif /* CONFIG_FAIR_GROUP_SCHED */ | ||
381 | |||
378 | #ifdef CONFIG_RT_GROUP_SCHED | 382 | #ifdef CONFIG_RT_GROUP_SCHED |
379 | struct list_head leaf_rt_rq_list; | 383 | struct list_head leaf_rt_rq_list; |
380 | #endif | 384 | #endif |
@@ -1140,7 +1144,6 @@ extern void print_rt_stats(struct seq_file *m, int cpu); | |||
1140 | 1144 | ||
1141 | extern void init_cfs_rq(struct cfs_rq *cfs_rq); | 1145 | extern void init_cfs_rq(struct cfs_rq *cfs_rq); |
1142 | extern void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq); | 1146 | extern void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq); |
1143 | extern void unthrottle_offline_cfs_rqs(struct rq *rq); | ||
1144 | 1147 | ||
1145 | extern void account_cfs_bandwidth_used(int enabled, int was_enabled); | 1148 | extern void account_cfs_bandwidth_used(int enabled, int was_enabled); |
1146 | 1149 | ||
diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c
index 7b386e86fd23..da5eb5bed84a 100644
--- a/kernel/sched/stop_task.c
+++ b/kernel/sched/stop_task.c
@@ -27,8 +27,10 @@ static struct task_struct *pick_next_task_stop(struct rq *rq) | |||
27 | { | 27 | { |
28 | struct task_struct *stop = rq->stop; | 28 | struct task_struct *stop = rq->stop; |
29 | 29 | ||
30 | if (stop && stop->on_rq) | 30 | if (stop && stop->on_rq) { |
31 | stop->se.exec_start = rq->clock_task; | ||
31 | return stop; | 32 | return stop; |
33 | } | ||
32 | 34 | ||
33 | return NULL; | 35 | return NULL; |
34 | } | 36 | } |
@@ -52,6 +54,21 @@ static void yield_task_stop(struct rq *rq) | |||
52 | 54 | ||
53 | static void put_prev_task_stop(struct rq *rq, struct task_struct *prev) | 55 | static void put_prev_task_stop(struct rq *rq, struct task_struct *prev) |
54 | { | 56 | { |
57 | struct task_struct *curr = rq->curr; | ||
58 | u64 delta_exec; | ||
59 | |||
60 | delta_exec = rq->clock_task - curr->se.exec_start; | ||
61 | if (unlikely((s64)delta_exec < 0)) | ||
62 | delta_exec = 0; | ||
63 | |||
64 | schedstat_set(curr->se.statistics.exec_max, | ||
65 | max(curr->se.statistics.exec_max, delta_exec)); | ||
66 | |||
67 | curr->se.sum_exec_runtime += delta_exec; | ||
68 | account_group_exec_runtime(curr, delta_exec); | ||
69 | |||
70 | curr->se.exec_start = rq->clock_task; | ||
71 | cpuacct_charge(curr, delta_exec); | ||
55 | } | 72 | } |
56 | 73 | ||
57 | static void task_tick_stop(struct rq *rq, struct task_struct *curr, int queued) | 74 | static void task_tick_stop(struct rq *rq, struct task_struct *curr, int queued) |
@@ -60,6 +77,9 @@ static void task_tick_stop(struct rq *rq, struct task_struct *curr, int queued) | |||
60 | 77 | ||
61 | static void set_curr_task_stop(struct rq *rq) | 78 | static void set_curr_task_stop(struct rq *rq) |
62 | { | 79 | { |
80 | struct task_struct *stop = rq->stop; | ||
81 | |||
82 | stop->se.exec_start = rq->clock_task; | ||
63 | } | 83 | } |
64 | 84 | ||
65 | static void switched_to_stop(struct rq *rq, struct task_struct *p) | 85 | static void switched_to_stop(struct rq *rq, struct task_struct *p) |
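The stop_task.c changes stamp se.exec_start when a stop task goes on CPU and fold the elapsed delta into sum_exec_runtime when it comes off, mirroring what the other scheduling classes already do. A rough userspace analogue of that start-stamp/accumulate-delta bookkeeping, with CLOCK_MONOTONIC standing in for rq->clock_task and invented helper names:

```c
#define _POSIX_C_SOURCE 200809L
#include <stdint.h>
#include <stdio.h>
#include <time.h>

struct sched_entity {
	uint64_t exec_start;		/* ns timestamp when we went on CPU */
	uint64_t sum_exec_runtime;	/* total accounted ns */
};

static uint64_t clock_task_ns(void)
{
	struct timespec ts;

	clock_gettime(CLOCK_MONOTONIC, &ts);
	return (uint64_t)ts.tv_sec * 1000000000ULL + ts.tv_nsec;
}

/* pick_next_task / set_curr_task side: stamp the start of the slice. */
static void slice_begin(struct sched_entity *se)
{
	se->exec_start = clock_task_ns();
}

/* put_prev_task side: account the slice that just ended. */
static void slice_end(struct sched_entity *se)
{
	uint64_t now = clock_task_ns();
	int64_t delta = (int64_t)(now - se->exec_start);

	if (delta < 0)			/* clock anomaly: account nothing */
		delta = 0;
	se->sum_exec_runtime += (uint64_t)delta;
	se->exec_start = now;
}

int main(void)
{
	struct sched_entity se = { 0, 0 };
	struct timespec nap = { 0, 10 * 1000 * 1000 };	/* ~10ms of "runtime" */

	slice_begin(&se);
	nanosleep(&nap, NULL);
	slice_end(&se);

	printf("accounted %llu ns\n", (unsigned long long)se.sum_exec_runtime);
	return 0;
}
```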
diff --git a/kernel/task_work.c b/kernel/task_work.c
index 91d4e1742a0c..d320d44903bd 100644
--- a/kernel/task_work.c
+++ b/kernel/task_work.c
@@ -75,6 +75,7 @@ void task_work_run(void) | |||
75 | p = q->next; | 75 | p = q->next; |
76 | q->func(q); | 76 | q->func(q); |
77 | q = p; | 77 | q = p; |
78 | cond_resched(); | ||
78 | } | 79 | } |
79 | } | 80 | } |
80 | } | 81 | } |
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 024540f97f74..3a9e5d5c1091 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -573,6 +573,7 @@ static void tick_nohz_restart_sched_tick(struct tick_sched *ts, ktime_t now) | |||
573 | tick_do_update_jiffies64(now); | 573 | tick_do_update_jiffies64(now); |
574 | update_cpu_load_nohz(); | 574 | update_cpu_load_nohz(); |
575 | 575 | ||
576 | calc_load_exit_idle(); | ||
576 | touch_softlockup_watchdog(); | 577 | touch_softlockup_watchdog(); |
577 | /* | 578 | /* |
578 | * Cancel the scheduled timer and restore the tick | 579 | * Cancel the scheduled timer and restore the tick |
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index e16af197a2bc..34e5eac81424 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -115,6 +115,7 @@ static void tk_xtime_add(struct timekeeper *tk, const struct timespec *ts) | |||
115 | { | 115 | { |
116 | tk->xtime_sec += ts->tv_sec; | 116 | tk->xtime_sec += ts->tv_sec; |
117 | tk->xtime_nsec += (u64)ts->tv_nsec << tk->shift; | 117 | tk->xtime_nsec += (u64)ts->tv_nsec << tk->shift; |
118 | tk_normalize_xtime(tk); | ||
118 | } | 119 | } |
119 | 120 | ||
120 | static void tk_set_wall_to_mono(struct timekeeper *tk, struct timespec wtm) | 121 | static void tk_set_wall_to_mono(struct timekeeper *tk, struct timespec wtm) |
@@ -276,7 +277,7 @@ static void timekeeping_forward_now(struct timekeeper *tk) | |||
276 | tk->xtime_nsec += cycle_delta * tk->mult; | 277 | tk->xtime_nsec += cycle_delta * tk->mult; |
277 | 278 | ||
278 | /* If arch requires, add in gettimeoffset() */ | 279 | /* If arch requires, add in gettimeoffset() */ |
279 | tk->xtime_nsec += arch_gettimeoffset() << tk->shift; | 280 | tk->xtime_nsec += (u64)arch_gettimeoffset() << tk->shift; |
280 | 281 | ||
281 | tk_normalize_xtime(tk); | 282 | tk_normalize_xtime(tk); |
282 | 283 | ||
@@ -427,7 +428,7 @@ int do_settimeofday(const struct timespec *tv) | |||
427 | struct timespec ts_delta, xt; | 428 | struct timespec ts_delta, xt; |
428 | unsigned long flags; | 429 | unsigned long flags; |
429 | 430 | ||
430 | if ((unsigned long)tv->tv_nsec >= NSEC_PER_SEC) | 431 | if (!timespec_valid_strict(tv)) |
431 | return -EINVAL; | 432 | return -EINVAL; |
432 | 433 | ||
433 | write_seqlock_irqsave(&tk->lock, flags); | 434 | write_seqlock_irqsave(&tk->lock, flags); |
@@ -463,6 +464,8 @@ int timekeeping_inject_offset(struct timespec *ts) | |||
463 | { | 464 | { |
464 | struct timekeeper *tk = &timekeeper; | 465 | struct timekeeper *tk = &timekeeper; |
465 | unsigned long flags; | 466 | unsigned long flags; |
467 | struct timespec tmp; | ||
468 | int ret = 0; | ||
466 | 469 | ||
467 | if ((unsigned long)ts->tv_nsec >= NSEC_PER_SEC) | 470 | if ((unsigned long)ts->tv_nsec >= NSEC_PER_SEC) |
468 | return -EINVAL; | 471 | return -EINVAL; |
@@ -471,10 +474,17 @@ int timekeeping_inject_offset(struct timespec *ts) | |||
471 | 474 | ||
472 | timekeeping_forward_now(tk); | 475 | timekeeping_forward_now(tk); |
473 | 476 | ||
477 | /* Make sure the proposed value is valid */ | ||
478 | tmp = timespec_add(tk_xtime(tk), *ts); | ||
479 | if (!timespec_valid_strict(&tmp)) { | ||
480 | ret = -EINVAL; | ||
481 | goto error; | ||
482 | } | ||
474 | 483 | ||
475 | tk_xtime_add(tk, ts); | 484 | tk_xtime_add(tk, ts); |
476 | tk_set_wall_to_mono(tk, timespec_sub(tk->wall_to_monotonic, *ts)); | 485 | tk_set_wall_to_mono(tk, timespec_sub(tk->wall_to_monotonic, *ts)); |
477 | 486 | ||
487 | error: /* even if we error out, we forwarded the time, so call update */ | ||
478 | timekeeping_update(tk, true); | 488 | timekeeping_update(tk, true); |
479 | 489 | ||
480 | write_sequnlock_irqrestore(&tk->lock, flags); | 490 | write_sequnlock_irqrestore(&tk->lock, flags); |
@@ -482,7 +492,7 @@ int timekeeping_inject_offset(struct timespec *ts) | |||
482 | /* signal hrtimers about time change */ | 492 | /* signal hrtimers about time change */ |
483 | clock_was_set(); | 493 | clock_was_set(); |
484 | 494 | ||
485 | return 0; | 495 | return ret; |
486 | } | 496 | } |
487 | EXPORT_SYMBOL(timekeeping_inject_offset); | 497 | EXPORT_SYMBOL(timekeeping_inject_offset); |
488 | 498 | ||
@@ -649,7 +659,20 @@ void __init timekeeping_init(void) | |||
649 | struct timespec now, boot, tmp; | 659 | struct timespec now, boot, tmp; |
650 | 660 | ||
651 | read_persistent_clock(&now); | 661 | read_persistent_clock(&now); |
662 | if (!timespec_valid_strict(&now)) { | ||
663 | pr_warn("WARNING: Persistent clock returned invalid value!\n" | ||
664 | " Check your CMOS/BIOS settings.\n"); | ||
665 | now.tv_sec = 0; | ||
666 | now.tv_nsec = 0; | ||
667 | } | ||
668 | |||
652 | read_boot_clock(&boot); | 669 | read_boot_clock(&boot); |
670 | if (!timespec_valid_strict(&boot)) { | ||
671 | pr_warn("WARNING: Boot clock returned invalid value!\n" | ||
672 | " Check your CMOS/BIOS settings.\n"); | ||
673 | boot.tv_sec = 0; | ||
674 | boot.tv_nsec = 0; | ||
675 | } | ||
653 | 676 | ||
654 | seqlock_init(&tk->lock); | 677 | seqlock_init(&tk->lock); |
655 | 678 | ||
@@ -690,7 +713,7 @@ static struct timespec timekeeping_suspend_time; | |||
690 | static void __timekeeping_inject_sleeptime(struct timekeeper *tk, | 713 | static void __timekeeping_inject_sleeptime(struct timekeeper *tk, |
691 | struct timespec *delta) | 714 | struct timespec *delta) |
692 | { | 715 | { |
693 | if (!timespec_valid(delta)) { | 716 | if (!timespec_valid_strict(delta)) { |
694 | printk(KERN_WARNING "__timekeeping_inject_sleeptime: Invalid " | 717 | printk(KERN_WARNING "__timekeeping_inject_sleeptime: Invalid " |
695 | "sleep delta value!\n"); | 718 | "sleep delta value!\n"); |
696 | return; | 719 | return; |
@@ -1129,6 +1152,10 @@ static void update_wall_time(void) | |||
1129 | offset = (clock->read(clock) - clock->cycle_last) & clock->mask; | 1152 | offset = (clock->read(clock) - clock->cycle_last) & clock->mask; |
1130 | #endif | 1153 | #endif |
1131 | 1154 | ||
1155 | /* Check if there's really nothing to do */ | ||
1156 | if (offset < tk->cycle_interval) | ||
1157 | goto out; | ||
1158 | |||
1132 | /* | 1159 | /* |
1133 | * With NO_HZ we may have to accumulate many cycle_intervals | 1160 | * With NO_HZ we may have to accumulate many cycle_intervals |
1134 | * (think "ticks") worth of time at once. To do this efficiently, | 1161 | * (think "ticks") worth of time at once. To do this efficiently, |
@@ -1161,9 +1188,9 @@ static void update_wall_time(void) | |||
1161 | * the vsyscall implementations are converted to use xtime_nsec | 1188 | * the vsyscall implementations are converted to use xtime_nsec |
1162 | * (shifted nanoseconds), this can be killed. | 1189 | * (shifted nanoseconds), this can be killed. |
1163 | */ | 1190 | */ |
1164 | remainder = tk->xtime_nsec & ((1 << tk->shift) - 1); | 1191 | remainder = tk->xtime_nsec & ((1ULL << tk->shift) - 1); |
1165 | tk->xtime_nsec -= remainder; | 1192 | tk->xtime_nsec -= remainder; |
1166 | tk->xtime_nsec += 1 << tk->shift; | 1193 | tk->xtime_nsec += 1ULL << tk->shift; |
1167 | tk->ntp_error += remainder << tk->ntp_error_shift; | 1194 | tk->ntp_error += remainder << tk->ntp_error_shift; |
1168 | 1195 | ||
1169 | /* | 1196 | /* |
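Two of the timekeeping fixes above are width fixes: arch_gettimeoffset()'s u32 result is cast to u64 before the << tk->shift, and the xtime_nsec remainder math uses 1ULL << tk->shift, so the shift happens in 64-bit arithmetic instead of truncating first. A small standalone demonstration of the difference, using arbitrary example values rather than the kernel's types:

```c
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint32_t offset_ns = 2000000;	/* 2ms reported by the arch hook */
	unsigned int shift = 12;	/* example clocksource shift */

	/* Wrong: the shift is done in 32 bits, the high bits fall off,
	 * and only then is the already-truncated result widened. */
	uint64_t truncated = (uint64_t)(offset_ns << shift);

	/* Right: widen first, then shift with 64-bit arithmetic. */
	uint64_t correct = (uint64_t)offset_ns << shift;

	printf("shift then widen: %llu\n", (unsigned long long)truncated);
	printf("widen then shift: %llu\n", (unsigned long long)correct);
	return 0;
}
```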
diff --git a/kernel/timer.c b/kernel/timer.c
index a61c09374eba..8c5e7b908c68 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -1407,13 +1407,6 @@ SYSCALL_DEFINE1(alarm, unsigned int, seconds) | |||
1407 | 1407 | ||
1408 | #endif | 1408 | #endif |
1409 | 1409 | ||
1410 | #ifndef __alpha__ | ||
1411 | |||
1412 | /* | ||
1413 | * The Alpha uses getxpid, getxuid, and getxgid instead. Maybe this | ||
1414 | * should be moved into arch/i386 instead? | ||
1415 | */ | ||
1416 | |||
1417 | /** | 1410 | /** |
1418 | * sys_getpid - return the thread group id of the current process | 1411 | * sys_getpid - return the thread group id of the current process |
1419 | * | 1412 | * |
@@ -1469,8 +1462,6 @@ SYSCALL_DEFINE0(getegid) | |||
1469 | return from_kgid_munged(current_user_ns(), current_egid()); | 1462 | return from_kgid_munged(current_user_ns(), current_egid()); |
1470 | } | 1463 | } |
1471 | 1464 | ||
1472 | #endif | ||
1473 | |||
1474 | static void process_timeout(unsigned long __data) | 1465 | static void process_timeout(unsigned long __data) |
1475 | { | 1466 | { |
1476 | wake_up_process((struct task_struct *)__data); | 1467 | wake_up_process((struct task_struct *)__data); |
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index 60e4d7875672..6b245f64c8dd 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -506,6 +506,8 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id) | |||
506 | int size; | 506 | int size; |
507 | 507 | ||
508 | syscall_nr = syscall_get_nr(current, regs); | 508 | syscall_nr = syscall_get_nr(current, regs); |
509 | if (syscall_nr < 0) | ||
510 | return; | ||
509 | if (!test_bit(syscall_nr, enabled_perf_enter_syscalls)) | 511 | if (!test_bit(syscall_nr, enabled_perf_enter_syscalls)) |
510 | return; | 512 | return; |
511 | 513 | ||
@@ -580,6 +582,8 @@ static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret) | |||
580 | int size; | 582 | int size; |
581 | 583 | ||
582 | syscall_nr = syscall_get_nr(current, regs); | 584 | syscall_nr = syscall_get_nr(current, regs); |
585 | if (syscall_nr < 0) | ||
586 | return; | ||
583 | if (!test_bit(syscall_nr, enabled_perf_exit_syscalls)) | 587 | if (!test_bit(syscall_nr, enabled_perf_exit_syscalls)) |
584 | return; | 588 | return; |
585 | 589 | ||
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 692d97628a10..1e1373bcb3e3 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -66,6 +66,7 @@ enum { | |||
66 | 66 | ||
67 | /* pool flags */ | 67 | /* pool flags */ |
68 | POOL_MANAGE_WORKERS = 1 << 0, /* need to manage workers */ | 68 | POOL_MANAGE_WORKERS = 1 << 0, /* need to manage workers */ |
69 | POOL_MANAGING_WORKERS = 1 << 1, /* managing workers */ | ||
69 | 70 | ||
70 | /* worker flags */ | 71 | /* worker flags */ |
71 | WORKER_STARTED = 1 << 0, /* started */ | 72 | WORKER_STARTED = 1 << 0, /* started */ |
@@ -652,7 +653,7 @@ static bool need_to_manage_workers(struct worker_pool *pool) | |||
652 | /* Do we have too many workers and should some go away? */ | 653 | /* Do we have too many workers and should some go away? */ |
653 | static bool too_many_workers(struct worker_pool *pool) | 654 | static bool too_many_workers(struct worker_pool *pool) |
654 | { | 655 | { |
655 | bool managing = mutex_is_locked(&pool->manager_mutex); | 656 | bool managing = pool->flags & POOL_MANAGING_WORKERS; |
656 | int nr_idle = pool->nr_idle + managing; /* manager is considered idle */ | 657 | int nr_idle = pool->nr_idle + managing; /* manager is considered idle */ |
657 | int nr_busy = pool->nr_workers - nr_idle; | 658 | int nr_busy = pool->nr_workers - nr_idle; |
658 | 659 | ||
@@ -1326,6 +1327,15 @@ static void idle_worker_rebind(struct worker *worker) | |||
1326 | 1327 | ||
1327 | /* we did our part, wait for rebind_workers() to finish up */ | 1328 | /* we did our part, wait for rebind_workers() to finish up */ |
1328 | wait_event(gcwq->rebind_hold, !(worker->flags & WORKER_REBIND)); | 1329 | wait_event(gcwq->rebind_hold, !(worker->flags & WORKER_REBIND)); |
1330 | |||
1331 | /* | ||
1332 | * rebind_workers() shouldn't finish until all workers passed the | ||
1333 | * above WORKER_REBIND wait. Tell it when done. | ||
1334 | */ | ||
1335 | spin_lock_irq(&worker->pool->gcwq->lock); | ||
1336 | if (!--worker->idle_rebind->cnt) | ||
1337 | complete(&worker->idle_rebind->done); | ||
1338 | spin_unlock_irq(&worker->pool->gcwq->lock); | ||
1329 | } | 1339 | } |
1330 | 1340 | ||
1331 | /* | 1341 | /* |
@@ -1396,12 +1406,15 @@ retry: | |||
1396 | /* set REBIND and kick idle ones, we'll wait for these later */ | 1406 | /* set REBIND and kick idle ones, we'll wait for these later */ |
1397 | for_each_worker_pool(pool, gcwq) { | 1407 | for_each_worker_pool(pool, gcwq) { |
1398 | list_for_each_entry(worker, &pool->idle_list, entry) { | 1408 | list_for_each_entry(worker, &pool->idle_list, entry) { |
1409 | unsigned long worker_flags = worker->flags; | ||
1410 | |||
1399 | if (worker->flags & WORKER_REBIND) | 1411 | if (worker->flags & WORKER_REBIND) |
1400 | continue; | 1412 | continue; |
1401 | 1413 | ||
1402 | /* morph UNBOUND to REBIND */ | 1414 | /* morph UNBOUND to REBIND atomically */ |
1403 | worker->flags &= ~WORKER_UNBOUND; | 1415 | worker_flags &= ~WORKER_UNBOUND; |
1404 | worker->flags |= WORKER_REBIND; | 1416 | worker_flags |= WORKER_REBIND; |
1417 | ACCESS_ONCE(worker->flags) = worker_flags; | ||
1405 | 1418 | ||
1406 | idle_rebind.cnt++; | 1419 | idle_rebind.cnt++; |
1407 | worker->idle_rebind = &idle_rebind; | 1420 | worker->idle_rebind = &idle_rebind; |
@@ -1419,25 +1432,15 @@ retry: | |||
1419 | goto retry; | 1432 | goto retry; |
1420 | } | 1433 | } |
1421 | 1434 | ||
1422 | /* | 1435 | /* all idle workers are rebound, rebind busy workers */ |
1423 | * All idle workers are rebound and waiting for %WORKER_REBIND to | ||
1424 | * be cleared inside idle_worker_rebind(). Clear and release. | ||
1425 | * Clearing %WORKER_REBIND from this foreign context is safe | ||
1426 | * because these workers are still guaranteed to be idle. | ||
1427 | */ | ||
1428 | for_each_worker_pool(pool, gcwq) | ||
1429 | list_for_each_entry(worker, &pool->idle_list, entry) | ||
1430 | worker->flags &= ~WORKER_REBIND; | ||
1431 | |||
1432 | wake_up_all(&gcwq->rebind_hold); | ||
1433 | |||
1434 | /* rebind busy workers */ | ||
1435 | for_each_busy_worker(worker, i, pos, gcwq) { | 1436 | for_each_busy_worker(worker, i, pos, gcwq) { |
1436 | struct work_struct *rebind_work = &worker->rebind_work; | 1437 | struct work_struct *rebind_work = &worker->rebind_work; |
1438 | unsigned long worker_flags = worker->flags; | ||
1437 | 1439 | ||
1438 | /* morph UNBOUND to REBIND */ | 1440 | /* morph UNBOUND to REBIND atomically */ |
1439 | worker->flags &= ~WORKER_UNBOUND; | 1441 | worker_flags &= ~WORKER_UNBOUND; |
1440 | worker->flags |= WORKER_REBIND; | 1442 | worker_flags |= WORKER_REBIND; |
1443 | ACCESS_ONCE(worker->flags) = worker_flags; | ||
1441 | 1444 | ||
1442 | if (test_and_set_bit(WORK_STRUCT_PENDING_BIT, | 1445 | if (test_and_set_bit(WORK_STRUCT_PENDING_BIT, |
1443 | work_data_bits(rebind_work))) | 1446 | work_data_bits(rebind_work))) |
@@ -1449,6 +1452,34 @@ retry: | |||
1449 | worker->scheduled.next, | 1452 | worker->scheduled.next, |
1450 | work_color_to_flags(WORK_NO_COLOR)); | 1453 | work_color_to_flags(WORK_NO_COLOR)); |
1451 | } | 1454 | } |
1455 | |||
1456 | /* | ||
1457 | * All idle workers are rebound and waiting for %WORKER_REBIND to | ||
1458 | * be cleared inside idle_worker_rebind(). Clear and release. | ||
1459 | * Clearing %WORKER_REBIND from this foreign context is safe | ||
1460 | * because these workers are still guaranteed to be idle. | ||
1461 | * | ||
1462 | * We need to make sure all idle workers passed WORKER_REBIND wait | ||
1463 | * in idle_worker_rebind() before returning; otherwise, workers can | ||
1464 | * get stuck at the wait if hotplug cycle repeats. | ||
1465 | */ | ||
1466 | idle_rebind.cnt = 1; | ||
1467 | INIT_COMPLETION(idle_rebind.done); | ||
1468 | |||
1469 | for_each_worker_pool(pool, gcwq) { | ||
1470 | list_for_each_entry(worker, &pool->idle_list, entry) { | ||
1471 | worker->flags &= ~WORKER_REBIND; | ||
1472 | idle_rebind.cnt++; | ||
1473 | } | ||
1474 | } | ||
1475 | |||
1476 | wake_up_all(&gcwq->rebind_hold); | ||
1477 | |||
1478 | if (--idle_rebind.cnt) { | ||
1479 | spin_unlock_irq(&gcwq->lock); | ||
1480 | wait_for_completion(&idle_rebind.done); | ||
1481 | spin_lock_irq(&gcwq->lock); | ||
1482 | } | ||
1452 | } | 1483 | } |
1453 | 1484 | ||
1454 | static struct worker *alloc_worker(void) | 1485 | static struct worker *alloc_worker(void) |
@@ -1794,9 +1825,45 @@ static bool manage_workers(struct worker *worker) | |||
1794 | struct worker_pool *pool = worker->pool; | 1825 | struct worker_pool *pool = worker->pool; |
1795 | bool ret = false; | 1826 | bool ret = false; |
1796 | 1827 | ||
1797 | if (!mutex_trylock(&pool->manager_mutex)) | 1828 | if (pool->flags & POOL_MANAGING_WORKERS) |
1798 | return ret; | 1829 | return ret; |
1799 | 1830 | ||
1831 | pool->flags |= POOL_MANAGING_WORKERS; | ||
1832 | |||
1833 | /* | ||
1834 | * To simplify both worker management and CPU hotplug, hold off | ||
1835 | * management while hotplug is in progress. CPU hotplug path can't | ||
1836 | * grab %POOL_MANAGING_WORKERS to achieve this because that can | ||
1837 | * lead to idle worker depletion (all become busy thinking someone | ||
1838 | * else is managing) which in turn can result in deadlock under | ||
1839 | * extreme circumstances. Use @pool->manager_mutex to synchronize | ||
1840 | * manager against CPU hotplug. | ||
1841 | * | ||
1842 | * manager_mutex would always be free unless CPU hotplug is in | ||
1843 | * progress. trylock first without dropping @gcwq->lock. | ||
1844 | */ | ||
1845 | if (unlikely(!mutex_trylock(&pool->manager_mutex))) { | ||
1846 | spin_unlock_irq(&pool->gcwq->lock); | ||
1847 | mutex_lock(&pool->manager_mutex); | ||
1848 | /* | ||
1849 | * CPU hotplug could have happened while we were waiting | ||
1850 | * for manager_mutex. Hotplug itself can't handle us | ||
1851 | * because manager isn't either on idle or busy list, and | ||
1852 | * @gcwq's state and ours could have deviated. | ||
1853 | * | ||
1854 | * As hotplug is now excluded via manager_mutex, we can | ||
1855 | * simply try to bind. It will succeed or fail depending | ||
1856 | * on @gcwq's current state. Try it and adjust | ||
1857 | * %WORKER_UNBOUND accordingly. | ||
1858 | */ | ||
1859 | if (worker_maybe_bind_and_lock(worker)) | ||
1860 | worker->flags &= ~WORKER_UNBOUND; | ||
1861 | else | ||
1862 | worker->flags |= WORKER_UNBOUND; | ||
1863 | |||
1864 | ret = true; | ||
1865 | } | ||
1866 | |||
1800 | pool->flags &= ~POOL_MANAGE_WORKERS; | 1867 | pool->flags &= ~POOL_MANAGE_WORKERS; |
1801 | 1868 | ||
1802 | /* | 1869 | /* |
@@ -1806,6 +1873,7 @@ static bool manage_workers(struct worker *worker) | |||
1806 | ret |= maybe_destroy_workers(pool); | 1873 | ret |= maybe_destroy_workers(pool); |
1807 | ret |= maybe_create_worker(pool); | 1874 | ret |= maybe_create_worker(pool); |
1808 | 1875 | ||
1876 | pool->flags &= ~POOL_MANAGING_WORKERS; | ||
1809 | mutex_unlock(&pool->manager_mutex); | 1877 | mutex_unlock(&pool->manager_mutex); |
1810 | return ret; | 1878 | return ret; |
1811 | } | 1879 | } |
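The workqueue rebind hunks build the new worker->flags value in a local worker_flags variable and publish it with a single ACCESS_ONCE() store, so no observer can catch the window where WORKER_UNBOUND is already cleared but WORKER_REBIND is not yet set. The sketch below illustrates that compose-locally, publish-once idea with C11 atomics; the flag values are arbitrary, and, as in the kernel, concurrent writers are assumed to be excluded by a lock, so the single store only keeps readers from seeing a half-updated word.

```c
#include <stdatomic.h>
#include <stdio.h>

enum {
	WORKER_UNBOUND = 1 << 0,	/* example values, not the kernel's */
	WORKER_REBIND  = 1 << 1,
};

/* Compose the new flags privately, then publish with one store, so a
 * reader never observes the intermediate state with both bits clear. */
static void morph_unbound_to_rebind(atomic_uint *flags)
{
	unsigned int val = atomic_load(flags);	/* snapshot current flags */

	val &= ~WORKER_UNBOUND;
	val |= WORKER_REBIND;

	atomic_store(flags, val);		/* one visible transition */
}

int main(void)
{
	atomic_uint flags = WORKER_UNBOUND;

	morph_unbound_to_rebind(&flags);
	printf("flags after morph: %#x (REBIND set, UNBOUND clear)\n",
	       atomic_load(&flags));
	return 0;
}
```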