path: root/kernel
author	James Morris <james.l.morris@oracle.com>	2012-09-27 23:37:32 -0400
committer	James Morris <james.l.morris@oracle.com>	2012-09-27 23:37:32 -0400
commit	bf5308344527d015ac9a6d2bda4ad4d40fd7d943 (patch)
tree	566e61e2cfc648c374d15cfc8c661b73e1a471f8 /kernel
parent	3585e96cd1049682b8a19a0b699422156e9d735b (diff)
parent	979570e02981d4a8fc20b3cc8fd651856c98ee9d (diff)
Merge tag 'v3.6-rc7' into next
Linux 3.6-rc7. Requested by David Howells so he can merge his key subsystem work into my tree with requisite -linus changesets.
Diffstat (limited to 'kernel')
-rw-r--r--	kernel/audit_tree.c	19
-rw-r--r--	kernel/events/core.c	64
-rw-r--r--	kernel/events/hw_breakpoint.c	11
-rw-r--r--	kernel/fork.c	4
-rw-r--r--	kernel/pid_namespace.c	6
-rw-r--r--	kernel/sched/core.c	108
-rw-r--r--	kernel/sched/fair.c	48
-rw-r--r--	kernel/sched/rt.c	14
-rw-r--r--	kernel/sched/sched.h	9
-rw-r--r--	kernel/sched/stop_task.c	22
-rw-r--r--	kernel/task_work.c	1
-rw-r--r--	kernel/time/tick-sched.c	1
-rw-r--r--	kernel/time/timekeeping.c	58
-rw-r--r--	kernel/timer.c	9
-rw-r--r--	kernel/trace/trace_syscalls.c	4
-rw-r--r--	kernel/workqueue.c	147
16 files changed, 329 insertions(+), 196 deletions(-)
diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c
index 3a5ca582ba1e..ed206fd88cca 100644
--- a/kernel/audit_tree.c
+++ b/kernel/audit_tree.c
@@ -250,7 +250,6 @@ static void untag_chunk(struct node *p)
 		spin_unlock(&hash_lock);
 		spin_unlock(&entry->lock);
 		fsnotify_destroy_mark(entry);
-		fsnotify_put_mark(entry);
 		goto out;
 	}
 
@@ -259,7 +258,7 @@ static void untag_chunk(struct node *p)
 
 	fsnotify_duplicate_mark(&new->mark, entry);
 	if (fsnotify_add_mark(&new->mark, new->mark.group, new->mark.i.inode, NULL, 1)) {
-		free_chunk(new);
+		fsnotify_put_mark(&new->mark);
 		goto Fallback;
 	}
 
@@ -293,7 +292,7 @@ static void untag_chunk(struct node *p)
 	spin_unlock(&hash_lock);
 	spin_unlock(&entry->lock);
 	fsnotify_destroy_mark(entry);
-	fsnotify_put_mark(entry);
+	fsnotify_put_mark(&new->mark);	/* drop initial reference */
 	goto out;
 
 Fallback:
@@ -322,7 +321,7 @@ static int create_chunk(struct inode *inode, struct audit_tree *tree)
 
 	entry = &chunk->mark;
 	if (fsnotify_add_mark(entry, audit_tree_group, inode, NULL, 0)) {
-		free_chunk(chunk);
+		fsnotify_put_mark(entry);
 		return -ENOSPC;
 	}
 
@@ -347,6 +346,7 @@ static int create_chunk(struct inode *inode, struct audit_tree *tree)
 	insert_hash(chunk);
 	spin_unlock(&hash_lock);
 	spin_unlock(&entry->lock);
+	fsnotify_put_mark(entry);	/* drop initial reference */
 	return 0;
 }
 
@@ -396,7 +396,7 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree)
 	fsnotify_duplicate_mark(chunk_entry, old_entry);
 	if (fsnotify_add_mark(chunk_entry, chunk_entry->group, chunk_entry->i.inode, NULL, 1)) {
 		spin_unlock(&old_entry->lock);
-		free_chunk(chunk);
+		fsnotify_put_mark(chunk_entry);
 		fsnotify_put_mark(old_entry);
 		return -ENOSPC;
 	}
@@ -444,8 +444,8 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree)
 	spin_unlock(&chunk_entry->lock);
 	spin_unlock(&old_entry->lock);
 	fsnotify_destroy_mark(old_entry);
+	fsnotify_put_mark(chunk_entry);	/* drop initial reference */
 	fsnotify_put_mark(old_entry); /* pair to fsnotify_find mark_entry */
-	fsnotify_put_mark(old_entry); /* and kill it */
 	return 0;
 }
 
@@ -916,7 +916,12 @@ static void audit_tree_freeing_mark(struct fsnotify_mark *entry, struct fsnotify
 	struct audit_chunk *chunk = container_of(entry, struct audit_chunk, mark);
 
 	evict_chunk(chunk);
-	fsnotify_put_mark(entry);
+
+	/*
+	 * We are guaranteed to have at least one reference to the mark from
+	 * either the inode or the caller of fsnotify_destroy_mark().
+	 */
+	BUG_ON(atomic_read(&entry->refcnt) < 1);
 }
 
 static bool audit_tree_send_event(struct fsnotify_group *group, struct inode *inode,
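
The audit_tree hunks above consistently replace free_chunk() on failure paths with fsnotify_put_mark(), and add a put of the initial reference once the mark is registered, so the mark's reference count alone decides when the chunk dies. Below is a minimal userspace sketch of that convention; struct mark and its helpers are hypothetical stand-ins, not the kernel API.

#include <stdio.h>
#include <stdlib.h>

/* Hypothetical refcounted object standing in for an fsnotify mark. */
struct mark {
	int refcnt;
};

static struct mark *mark_alloc(void)
{
	struct mark *m = calloc(1, sizeof(*m));
	if (m)
		m->refcnt = 1;		/* caller owns the initial reference */
	return m;
}

static void mark_put(struct mark *m)
{
	if (--m->refcnt == 0)		/* the last put frees the object */
		free(m);
}

/* Pretend "registration" that takes its own reference and can fail. */
static int mark_register(struct mark *m, int fail)
{
	if (fail)
		return -1;
	m->refcnt++;
	return 0;
}

int main(void)
{
	struct mark *m = mark_alloc();
	if (!m)
		return 1;

	if (mark_register(m, 0) != 0) {
		mark_put(m);	/* error path: drop the reference, never free() directly */
		return 1;
	}
	mark_put(m);		/* drop the initial reference */
	mark_put(m);		/* registry teardown drops the last reference */
	return 0;
}

The point of the pattern is that once an object is reference counted, every path, including error paths, goes through put; freeing behind the counter's back is exactly what the removed free_chunk() calls were doing.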
diff --git a/kernel/events/core.c b/kernel/events/core.c
index b7935fcec7d9..7fee567153f0 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -1253,7 +1253,7 @@ retry:
 /*
  * Cross CPU call to disable a performance event
  */
-static int __perf_event_disable(void *info)
+int __perf_event_disable(void *info)
 {
 	struct perf_event *event = info;
 	struct perf_event_context *ctx = event->ctx;
@@ -2935,12 +2935,12 @@ EXPORT_SYMBOL_GPL(perf_event_release_kernel);
 /*
  * Called when the last reference to the file is gone.
  */
-static int perf_release(struct inode *inode, struct file *file)
+static void put_event(struct perf_event *event)
 {
-	struct perf_event *event = file->private_data;
 	struct task_struct *owner;
 
-	file->private_data = NULL;
+	if (!atomic_long_dec_and_test(&event->refcount))
+		return;
 
 	rcu_read_lock();
 	owner = ACCESS_ONCE(event->owner);
@@ -2975,7 +2975,13 @@ static int perf_release(struct inode *inode, struct file *file)
 		put_task_struct(owner);
 	}
 
-	return perf_event_release_kernel(event);
+	perf_event_release_kernel(event);
+}
+
+static int perf_release(struct inode *inode, struct file *file)
+{
+	put_event(file->private_data);
+	return 0;
 }
 
 u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running)
@@ -3227,7 +3233,7 @@ unlock:
 
 static const struct file_operations perf_fops;
 
-static struct perf_event *perf_fget_light(int fd, int *fput_needed)
+static struct file *perf_fget_light(int fd, int *fput_needed)
 {
 	struct file *file;
 
@@ -3241,7 +3247,7 @@ static struct perf_event *perf_fget_light(int fd, int *fput_needed)
 		return ERR_PTR(-EBADF);
 	}
 
-	return file->private_data;
+	return file;
 }
 
 static int perf_event_set_output(struct perf_event *event,
@@ -3273,19 +3279,21 @@ static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
 
 	case PERF_EVENT_IOC_SET_OUTPUT:
 	{
+		struct file *output_file = NULL;
 		struct perf_event *output_event = NULL;
 		int fput_needed = 0;
 		int ret;
 
 		if (arg != -1) {
-			output_event = perf_fget_light(arg, &fput_needed);
-			if (IS_ERR(output_event))
-				return PTR_ERR(output_event);
+			output_file = perf_fget_light(arg, &fput_needed);
+			if (IS_ERR(output_file))
+				return PTR_ERR(output_file);
+			output_event = output_file->private_data;
 		}
 
 		ret = perf_event_set_output(event, output_event);
 		if (output_event)
-			fput_light(output_event->filp, fput_needed);
+			fput_light(output_file, fput_needed);
 
 		return ret;
 	}
@@ -5950,6 +5958,7 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
 
 	mutex_init(&event->mmap_mutex);
 
+	atomic_long_set(&event->refcount, 1);
 	event->cpu		= cpu;
 	event->attr		= *attr;
 	event->group_leader	= group_leader;
@@ -6260,12 +6269,12 @@ SYSCALL_DEFINE5(perf_event_open,
 		return event_fd;
 
 	if (group_fd != -1) {
-		group_leader = perf_fget_light(group_fd, &fput_needed);
-		if (IS_ERR(group_leader)) {
-			err = PTR_ERR(group_leader);
+		group_file = perf_fget_light(group_fd, &fput_needed);
+		if (IS_ERR(group_file)) {
+			err = PTR_ERR(group_file);
 			goto err_fd;
 		}
-		group_file = group_leader->filp;
+		group_leader = group_file->private_data;
 		if (flags & PERF_FLAG_FD_OUTPUT)
 			output_event = group_leader;
 		if (flags & PERF_FLAG_FD_NO_GROUP)
@@ -6402,7 +6411,6 @@ SYSCALL_DEFINE5(perf_event_open,
 		put_ctx(gctx);
 	}
 
-	event->filp = event_file;
 	WARN_ON_ONCE(ctx->parent_ctx);
 	mutex_lock(&ctx->mutex);
 
@@ -6496,7 +6504,6 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
 		goto err_free;
 	}
 
-	event->filp = NULL;
 	WARN_ON_ONCE(ctx->parent_ctx);
 	mutex_lock(&ctx->mutex);
 	perf_install_in_context(ctx, event, cpu);
@@ -6578,7 +6585,7 @@ static void sync_child_event(struct perf_event *child_event,
 	 * Release the parent event, if this was the last
 	 * reference to it.
 	 */
-	fput(parent_event->filp);
+	put_event(parent_event);
 }
 
 static void
@@ -6654,9 +6661,8 @@ static void perf_event_exit_task_context(struct task_struct *child, int ctxn)
 	 *
 	 *   __perf_event_exit_task()
 	 *     sync_child_event()
-	 *       fput(parent_event->filp)
-	 *         perf_release()
-	 *           mutex_lock(&ctx->mutex)
+	 *       put_event()
+	 *         mutex_lock(&ctx->mutex)
 	 *
 	 * But since its the parent context it won't be the same instance.
 	 */
@@ -6724,7 +6730,7 @@ static void perf_free_event(struct perf_event *event,
 	list_del_init(&event->child_list);
 	mutex_unlock(&parent->child_mutex);
 
-	fput(parent->filp);
+	put_event(parent);
 
 	perf_group_detach(event);
 	list_del_event(event, ctx);
@@ -6804,6 +6810,12 @@ inherit_event(struct perf_event *parent_event,
 					   NULL, NULL);
 	if (IS_ERR(child_event))
 		return child_event;
+
+	if (!atomic_long_inc_not_zero(&parent_event->refcount)) {
+		free_event(child_event);
+		return NULL;
+	}
+
 	get_ctx(child_ctx);
 
 	/*
@@ -6845,14 +6857,6 @@ inherit_event(struct perf_event *parent_event,
 	raw_spin_unlock_irqrestore(&child_ctx->lock, flags);
 
 	/*
-	 * Get a reference to the parent filp - we will fput it
-	 * when the child event exits. This is safe to do because
-	 * we are in the parent and we know that the filp still
-	 * exists and has a nonzero count:
-	 */
-	atomic_long_inc(&parent_event->filp->f_count);
-
-	/*
 	 * Link this into the parent event's child list
 	 */
 	WARN_ON_ONCE(parent_event->ctx->parent_ctx);
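
The perf changes above move event lifetime off the file's refcount (the removed event->filp uses) onto a dedicated event->refcount: perf_event_alloc() starts it at 1, put_event() frees on the final drop, and inherit_event() only takes a reference with atomic_long_inc_not_zero() so a parent that already hit zero is never resurrected. Here is a compact userspace sketch of the same get/put discipline using C11 atomics; the names are hypothetical stand-ins, not the perf implementation.

#include <stdatomic.h>
#include <stdlib.h>

/* Hypothetical stand-in for a perf_event with a lifetime refcount. */
struct event {
	atomic_long refcount;
};

static struct event *event_alloc(void)
{
	struct event *e = malloc(sizeof(*e));
	if (e)
		atomic_init(&e->refcount, 1);	/* like atomic_long_set(&event->refcount, 1) */
	return e;
}

static void put_event(struct event *e)
{
	/* free only when the last reference goes away */
	if (atomic_fetch_sub(&e->refcount, 1) == 1)
		free(e);
}

/* Take a reference only if the object is not already on its way out. */
static int get_event_not_zero(struct event *e)
{
	long old = atomic_load(&e->refcount);

	while (old != 0) {
		if (atomic_compare_exchange_weak(&e->refcount, &old, old + 1))
			return 1;
	}
	return 0;	/* refcount already reached zero: do not resurrect */
}

int main(void)
{
	struct event *parent = event_alloc();
	if (!parent)
		return 1;

	if (get_event_not_zero(parent))
		put_event(parent);	/* "child" reference, dropped at exit */

	put_event(parent);		/* file release drops the last reference */
	return 0;
}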
diff --git a/kernel/events/hw_breakpoint.c b/kernel/events/hw_breakpoint.c
index bb38c4d3ee12..9a7b487c6fe2 100644
--- a/kernel/events/hw_breakpoint.c
+++ b/kernel/events/hw_breakpoint.c
@@ -453,7 +453,16 @@ int modify_user_hw_breakpoint(struct perf_event *bp, struct perf_event_attr *att
 	int old_type = bp->attr.bp_type;
 	int err = 0;
 
-	perf_event_disable(bp);
+	/*
+	 * modify_user_hw_breakpoint can be invoked with IRQs disabled and hence it
+	 * will not be possible to raise IPIs that invoke __perf_event_disable.
+	 * So call the function directly after making sure we are targeting the
+	 * current task.
+	 */
+	if (irqs_disabled() && bp->ctx && bp->ctx->task == current)
+		__perf_event_disable(bp);
+	else
+		perf_event_disable(bp);
 
 	bp->attr.bp_addr = attr->bp_addr;
 	bp->attr.bp_type = attr->bp_type;
diff --git a/kernel/fork.c b/kernel/fork.c
index 3bd2280d79f6..2c8857e12855 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -455,8 +455,8 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
 		if (retval)
 			goto out;
 
-		if (file && uprobe_mmap(tmp))
-			goto out;
+		if (file)
+			uprobe_mmap(tmp);
 	}
 	/* a new mm has just been created */
 	arch_dup_mmap(oldmm, mm);
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
index b3c7fd554250..6144bab8fd8e 100644
--- a/kernel/pid_namespace.c
+++ b/kernel/pid_namespace.c
@@ -232,15 +232,19 @@ static int pid_ns_ctl_handler(struct ctl_table *table, int write,
 	 */
 
 	tmp.data = &current->nsproxy->pid_ns->last_pid;
-	return proc_dointvec(&tmp, write, buffer, lenp, ppos);
+	return proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos);
 }
 
+extern int pid_max;
+static int zero = 0;
 static struct ctl_table pid_ns_ctl_table[] = {
 	{
 		.procname = "ns_last_pid",
 		.maxlen = sizeof(int),
 		.mode = 0666, /* permissions are checked in the handler */
 		.proc_handler = pid_ns_ctl_handler,
+		.extra1 = &zero,
+		.extra2 = &pid_max,
 	},
 	{ }
 };
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 82ad284f823b..649c9f876cb1 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -3142,6 +3142,20 @@ void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
 # define nsecs_to_cputime(__nsecs)	nsecs_to_jiffies(__nsecs)
 #endif
 
+static cputime_t scale_utime(cputime_t utime, cputime_t rtime, cputime_t total)
+{
+	u64 temp = (__force u64) rtime;
+
+	temp *= (__force u64) utime;
+
+	if (sizeof(cputime_t) == 4)
+		temp = div_u64(temp, (__force u32) total);
+	else
+		temp = div64_u64(temp, (__force u64) total);
+
+	return (__force cputime_t) temp;
+}
+
 void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
 {
 	cputime_t rtime, utime = p->utime, total = utime + p->stime;
@@ -3151,13 +3165,9 @@ void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
 	 */
 	rtime = nsecs_to_cputime(p->se.sum_exec_runtime);
 
-	if (total) {
-		u64 temp = (__force u64) rtime;
-
-		temp *= (__force u64) utime;
-		do_div(temp, (__force u32) total);
-		utime = (__force cputime_t) temp;
-	} else
+	if (total)
+		utime = scale_utime(utime, rtime, total);
+	else
 		utime = rtime;
 
 	/*
@@ -3184,13 +3194,9 @@ void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
 	total = cputime.utime + cputime.stime;
 	rtime = nsecs_to_cputime(cputime.sum_exec_runtime);
 
-	if (total) {
-		u64 temp = (__force u64) rtime;
-
-		temp *= (__force u64) cputime.utime;
-		do_div(temp, (__force u32) total);
-		utime = (__force cputime_t) temp;
-	} else
+	if (total)
+		utime = scale_utime(cputime.utime, rtime, total);
+	else
 		utime = rtime;
 
 	sig->prev_utime = max(sig->prev_utime, utime);
@@ -5298,27 +5304,17 @@ void idle_task_exit(void)
 }
 
 /*
- * While a dead CPU has no uninterruptible tasks queued at this point,
- * it might still have a nonzero ->nr_uninterruptible counter, because
- * for performance reasons the counter is not stricly tracking tasks to
- * their home CPUs. So we just add the counter to another CPU's counter,
- * to keep the global sum constant after CPU-down:
- */
-static void migrate_nr_uninterruptible(struct rq *rq_src)
-{
-	struct rq *rq_dest = cpu_rq(cpumask_any(cpu_active_mask));
-
-	rq_dest->nr_uninterruptible += rq_src->nr_uninterruptible;
-	rq_src->nr_uninterruptible = 0;
-}
-
-/*
- * remove the tasks which were accounted by rq from calc_load_tasks.
+ * Since this CPU is going 'away' for a while, fold any nr_active delta
+ * we might have. Assumes we're called after migrate_tasks() so that the
+ * nr_active count is stable.
+ *
+ * Also see the comment "Global load-average calculations".
  */
-static void calc_global_load_remove(struct rq *rq)
+static void calc_load_migrate(struct rq *rq)
 {
-	atomic_long_sub(rq->calc_load_active, &calc_load_tasks);
-	rq->calc_load_active = 0;
+	long delta = calc_load_fold_active(rq);
+	if (delta)
+		atomic_long_add(delta, &calc_load_tasks);
 }
 
 /*
@@ -5346,9 +5342,6 @@ static void migrate_tasks(unsigned int dead_cpu)
 	 */
 	rq->stop = NULL;
 
-	/* Ensure any throttled groups are reachable by pick_next_task */
-	unthrottle_offline_cfs_rqs(rq);
-
 	for ( ; ; ) {
 		/*
 		 * There's this thread running, bail when that's the only
@@ -5612,8 +5605,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
 		BUG_ON(rq->nr_running != 1); /* the migration thread */
 		raw_spin_unlock_irqrestore(&rq->lock, flags);
 
-		migrate_nr_uninterruptible(rq);
-		calc_global_load_remove(rq);
+		calc_load_migrate(rq);
 		break;
 #endif
 	}
@@ -6022,11 +6014,6 @@ static void destroy_sched_domains(struct sched_domain *sd, int cpu)
  * SD_SHARE_PKG_RESOURCE set (Last Level Cache Domain) for this
  * allows us to avoid some pointer chasing select_idle_sibling().
  *
- * Iterate domains and sched_groups downward, assigning CPUs to be
- * select_idle_sibling() hw buddy. Cross-wiring hw makes bouncing
- * due to random perturbation self canceling, ie sw buddies pull
- * their counterpart to their CPU's hw counterpart.
- *
  * Also keep a unique ID per domain (we use the first cpu number in
 * the cpumask of the domain), this allows us to quickly tell if
 * two cpus are in the same cache domain, see cpus_share_cache().
@@ -6040,40 +6027,8 @@ static void update_top_cache_domain(int cpu)
 	int id = cpu;
 
 	sd = highest_flag_domain(cpu, SD_SHARE_PKG_RESOURCES);
-	if (sd) {
-		struct sched_domain *tmp = sd;
-		struct sched_group *sg, *prev;
-		bool right;
-
-		/*
-		 * Traverse to first CPU in group, and count hops
-		 * to cpu from there, switching direction on each
-		 * hop, never ever pointing the last CPU rightward.
-		 */
-		do {
-			id = cpumask_first(sched_domain_span(tmp));
-			prev = sg = tmp->groups;
-			right = 1;
-
-			while (cpumask_first(sched_group_cpus(sg)) != id)
-				sg = sg->next;
-
-			while (!cpumask_test_cpu(cpu, sched_group_cpus(sg))) {
-				prev = sg;
-				sg = sg->next;
-				right = !right;
-			}
-
-			/* A CPU went down, never point back to domain start. */
-			if (right && cpumask_first(sched_group_cpus(sg->next)) == id)
-				right = false;
-
-			sg = right ? sg->next : prev;
-			tmp->idle_buddy = cpumask_first(sched_group_cpus(sg));
-		} while ((tmp = tmp->child));
-
+	if (sd)
 		id = cpumask_first(sched_domain_span(sd));
-	}
 
 	rcu_assign_pointer(per_cpu(sd_llc, cpu), sd);
 	per_cpu(sd_llc_id, cpu) = id;
@@ -7246,6 +7201,7 @@ int in_sched_functions(unsigned long addr)
 
 #ifdef CONFIG_CGROUP_SCHED
 struct task_group root_task_group;
+LIST_HEAD(task_groups);
 #endif
 
 DECLARE_PER_CPU(cpumask_var_t, load_balance_tmpmask);
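
The new scale_utime() helper exists because the old inline code fed a potentially 64-bit "total" to do_div(), which takes only a 32-bit divisor, so a large total was silently truncated; the helper picks div_u64() or div64_u64() based on sizeof(cputime_t). The following is a userspace sketch of the same scaling done entirely in 64-bit arithmetic, with illustrative values rather than kernel types.

#include <inttypes.h>
#include <stdio.h>

/*
 * Sketch of the scale_utime() idea: utime is scaled by rtime/total in
 * 64-bit arithmetic so a 'total' wider than 32 bits is not truncated
 * the way a 32-bit-divisor do_div() would truncate it.
 */
static uint64_t scale_utime(uint64_t utime, uint64_t rtime, uint64_t total)
{
	uint64_t temp = rtime;

	temp *= utime;		/* product assumed to fit in 64 bits for this sketch */
	return temp / total;	/* full 64-bit divide, like div64_u64() */
}

int main(void)
{
	/* total larger than 32 bits: a 32-bit divisor would drop the top bits */
	uint64_t utime = 3ULL << 32;
	uint64_t total = 4ULL << 32;
	uint64_t rtime = 1000000;

	printf("scaled utime = %" PRIu64 "\n", scale_utime(utime, rtime, total));
	return 0;
}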
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index d0cc03b3e70b..96e2b18b6283 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -2052,7 +2052,7 @@ static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
 	hrtimer_cancel(&cfs_b->slack_timer);
 }
 
-void unthrottle_offline_cfs_rqs(struct rq *rq)
+static void unthrottle_offline_cfs_rqs(struct rq *rq)
 {
 	struct cfs_rq *cfs_rq;
 
@@ -2106,7 +2106,7 @@ static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
 	return NULL;
 }
 static inline void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
-void unthrottle_offline_cfs_rqs(struct rq *rq) {}
+static inline void unthrottle_offline_cfs_rqs(struct rq *rq) {}
 
 #endif /* CONFIG_CFS_BANDWIDTH */
 
@@ -2637,6 +2637,8 @@ static int select_idle_sibling(struct task_struct *p, int target)
 	int cpu = smp_processor_id();
 	int prev_cpu = task_cpu(p);
 	struct sched_domain *sd;
+	struct sched_group *sg;
+	int i;
 
 	/*
 	 * If the task is going to be woken-up on this cpu and if it is
@@ -2653,17 +2655,29 @@
 		return prev_cpu;
 
 	/*
-	 * Otherwise, check assigned siblings to find an elegible idle cpu.
+	 * Otherwise, iterate the domains and find an elegible idle cpu.
 	 */
 	sd = rcu_dereference(per_cpu(sd_llc, target));
-
 	for_each_lower_domain(sd) {
-		if (!cpumask_test_cpu(sd->idle_buddy, tsk_cpus_allowed(p)))
-			continue;
-		if (idle_cpu(sd->idle_buddy))
-			return sd->idle_buddy;
-	}
-
+		sg = sd->groups;
+		do {
+			if (!cpumask_intersects(sched_group_cpus(sg),
+						tsk_cpus_allowed(p)))
+				goto next;
+
+			for_each_cpu(i, sched_group_cpus(sg)) {
+				if (!idle_cpu(i))
+					goto next;
+			}
+
+			target = cpumask_first_and(sched_group_cpus(sg),
+					tsk_cpus_allowed(p));
+			goto done;
+next:
+			sg = sg->next;
+		} while (sg != sd->groups);
+	}
+done:
 	return target;
 }
 
@@ -3387,6 +3401,14 @@ static int tg_load_down(struct task_group *tg, void *data)
 
 static void update_h_load(long cpu)
 {
+	struct rq *rq = cpu_rq(cpu);
+	unsigned long now = jiffies;
+
+	if (rq->h_load_throttle == now)
+		return;
+
+	rq->h_load_throttle = now;
+
 	rcu_read_lock();
 	walk_tg_tree(tg_load_down, tg_nop, (void *)cpu);
 	rcu_read_unlock();
@@ -3650,7 +3672,6 @@ fix_small_capacity(struct sched_domain *sd, struct sched_group *group)
 * @group: sched_group whose statistics are to be updated.
 * @load_idx: Load index of sched_domain of this_cpu for load calc.
 * @local_group: Does group contain this_cpu.
- * @cpus: Set of cpus considered for load balancing.
 * @balance: Should we balance.
 * @sgs: variable to hold the statistics for this group.
 */
@@ -3797,7 +3818,6 @@ static bool update_sd_pick_busiest(struct lb_env *env,
 /**
 * update_sd_lb_stats - Update sched_domain's statistics for load balancing.
 * @env: The load balancing environment.
- * @cpus: Set of cpus considered for load balancing.
 * @balance: Should we balance.
 * @sds: variable to hold the statistics for this sched_domain.
 */
@@ -4293,11 +4313,10 @@ redo:
 	env.src_rq = busiest;
 	env.loop_max = min(sysctl_sched_nr_migrate, busiest->nr_running);
 
+	update_h_load(env.src_cpu);
 more_balance:
 	local_irq_save(flags);
 	double_rq_lock(this_rq, busiest);
-	if (!env.loop)
-		update_h_load(env.src_cpu);
 
 	/*
 	 * cur_ld_moved - load moved in current iteration
@@ -4949,6 +4968,9 @@ static void rq_online_fair(struct rq *rq)
 static void rq_offline_fair(struct rq *rq)
 {
 	update_sysctl();
+
+	/* Ensure any throttled groups are reachable by pick_next_task */
+	unthrottle_offline_cfs_rqs(rq);
 }
 
 #endif /* CONFIG_SMP */
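
update_h_load() now records the jiffy of its last task-group tree walk in rq->h_load_throttle and returns early if called again in the same tick, which is why the caller in load_balance() can invoke it unconditionally before the more_balance loop. Below is a userspace sketch of that once-per-tick memoization; the rq_sketch type is hypothetical and time(NULL) merely stands in for jiffies.

#include <stdio.h>
#include <time.h>

/* Hypothetical per-runqueue state standing in for rq->h_load_throttle. */
struct rq_sketch {
	unsigned long h_load_throttle;	/* timestamp of the last recomputation */
	unsigned long h_load;		/* cached result */
};

static unsigned long jiffies_now(void)
{
	return (unsigned long)time(NULL);	/* crude stand-in for jiffies */
}

static unsigned long expensive_recompute(void)
{
	return 42;	/* placeholder for the walk_tg_tree() traversal */
}

/* Recompute at most once per "tick", mirroring the update_h_load() guard. */
static void update_h_load(struct rq_sketch *rq)
{
	unsigned long now = jiffies_now();

	if (rq->h_load_throttle == now)
		return;			/* already refreshed during this tick */

	rq->h_load_throttle = now;
	rq->h_load = expensive_recompute();
}

int main(void)
{
	struct rq_sketch rq = { 0, 0 };

	update_h_load(&rq);	/* recomputes */
	update_h_load(&rq);	/* hits the throttle, returns immediately */
	printf("h_load = %lu\n", rq.h_load);
	return 0;
}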
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 573e1ca01102..e0b7ba9c040f 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -691,6 +691,7 @@ balanced:
 		 * runtime - in which case borrowing doesn't make sense.
 		 */
 		rt_rq->rt_runtime = RUNTIME_INF;
+		rt_rq->rt_throttled = 0;
 		raw_spin_unlock(&rt_rq->rt_runtime_lock);
 		raw_spin_unlock(&rt_b->rt_runtime_lock);
 	}
@@ -788,6 +789,19 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun)
 	const struct cpumask *span;
 
 	span = sched_rt_period_mask();
+#ifdef CONFIG_RT_GROUP_SCHED
+	/*
+	 * FIXME: isolated CPUs should really leave the root task group,
+	 * whether they are isolcpus or were isolated via cpusets, lest
+	 * the timer run on a CPU which does not service all runqueues,
+	 * potentially leaving other CPUs indefinitely throttled. If
+	 * isolation is really required, the user will turn the throttle
+	 * off to kill the perturbations it causes anyway. Meanwhile,
+	 * this maintains functionality for boot and/or troubleshooting.
+	 */
+	if (rt_b == &root_task_group.rt_bandwidth)
+		span = cpu_online_mask;
+#endif
 	for_each_cpu(i, span) {
 		int enqueue = 0;
 		struct rt_rq *rt_rq = sched_rt_period_rt_rq(rt_b, i);
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index c35a1a7dd4d6..0848fa36c383 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -80,7 +80,7 @@ extern struct mutex sched_domains_mutex;
 struct cfs_rq;
 struct rt_rq;
 
-static LIST_HEAD(task_groups);
+extern struct list_head task_groups;
 
 struct cfs_bandwidth {
 #ifdef CONFIG_CFS_BANDWIDTH
@@ -374,7 +374,11 @@ struct rq {
 #ifdef CONFIG_FAIR_GROUP_SCHED
 	/* list of leaf cfs_rq on this cpu: */
 	struct list_head leaf_cfs_rq_list;
-#endif
+#ifdef CONFIG_SMP
+	unsigned long h_load_throttle;
+#endif /* CONFIG_SMP */
+#endif /* CONFIG_FAIR_GROUP_SCHED */
+
 #ifdef CONFIG_RT_GROUP_SCHED
 	struct list_head leaf_rt_rq_list;
 #endif
@@ -1140,7 +1144,6 @@ extern void print_rt_stats(struct seq_file *m, int cpu);
 
 extern void init_cfs_rq(struct cfs_rq *cfs_rq);
 extern void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq);
-extern void unthrottle_offline_cfs_rqs(struct rq *rq);
 
 extern void account_cfs_bandwidth_used(int enabled, int was_enabled);
 
diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c
index 7b386e86fd23..da5eb5bed84a 100644
--- a/kernel/sched/stop_task.c
+++ b/kernel/sched/stop_task.c
@@ -27,8 +27,10 @@ static struct task_struct *pick_next_task_stop(struct rq *rq)
 {
 	struct task_struct *stop = rq->stop;
 
-	if (stop && stop->on_rq)
+	if (stop && stop->on_rq) {
+		stop->se.exec_start = rq->clock_task;
 		return stop;
+	}
 
 	return NULL;
 }
@@ -52,6 +54,21 @@ static void yield_task_stop(struct rq *rq)
 
 static void put_prev_task_stop(struct rq *rq, struct task_struct *prev)
 {
+	struct task_struct *curr = rq->curr;
+	u64 delta_exec;
+
+	delta_exec = rq->clock_task - curr->se.exec_start;
+	if (unlikely((s64)delta_exec < 0))
+		delta_exec = 0;
+
+	schedstat_set(curr->se.statistics.exec_max,
+			max(curr->se.statistics.exec_max, delta_exec));
+
+	curr->se.sum_exec_runtime += delta_exec;
+	account_group_exec_runtime(curr, delta_exec);
+
+	curr->se.exec_start = rq->clock_task;
+	cpuacct_charge(curr, delta_exec);
 }
 
 static void task_tick_stop(struct rq *rq, struct task_struct *curr, int queued)
@@ -60,6 +77,9 @@ static void task_tick_stop(struct rq *rq, struct task_struct *curr, int queued)
 
 static void set_curr_task_stop(struct rq *rq)
 {
+	struct task_struct *stop = rq->stop;
+
+	stop->se.exec_start = rq->clock_task;
 }
 
 static void switched_to_stop(struct rq *rq, struct task_struct *p)
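
The stop-task hunks add the usual exec-runtime bookkeeping: stamp se.exec_start when the task is picked or set as current, and on put_prev charge clock_task minus exec_start, clamping a negative delta to zero. A small userspace sketch of that accounting step follows; the task_sketch fields are hypothetical stand-ins for the scheduler entities.

#include <stdint.h>
#include <stdio.h>

/* Hypothetical stand-ins for se.exec_start / se.sum_exec_runtime. */
struct task_sketch {
	uint64_t exec_start;
	uint64_t sum_exec_runtime;
};

static void account_runtime(struct task_sketch *t, uint64_t clock_task)
{
	int64_t delta_exec = (int64_t)(clock_task - t->exec_start);

	if (delta_exec < 0)		/* clock moved "backwards": charge nothing */
		delta_exec = 0;

	t->sum_exec_runtime += (uint64_t)delta_exec;
	t->exec_start = clock_task;	/* restamp for the next slice */
}

int main(void)
{
	struct task_sketch t = { .exec_start = 100, .sum_exec_runtime = 0 };

	account_runtime(&t, 250);
	printf("runtime = %llu\n", (unsigned long long)t.sum_exec_runtime);
	return 0;
}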
diff --git a/kernel/task_work.c b/kernel/task_work.c
index 91d4e1742a0c..d320d44903bd 100644
--- a/kernel/task_work.c
+++ b/kernel/task_work.c
@@ -75,6 +75,7 @@ void task_work_run(void)
 			p = q->next;
 			q->func(q);
 			q = p;
+			cond_resched();
 		}
 	}
 }
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 024540f97f74..3a9e5d5c1091 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -573,6 +573,7 @@ static void tick_nohz_restart_sched_tick(struct tick_sched *ts, ktime_t now)
 	tick_do_update_jiffies64(now);
 	update_cpu_load_nohz();
 
+	calc_load_exit_idle();
 	touch_softlockup_watchdog();
 	/*
 	 * Cancel the scheduled timer and restore the tick
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index e16af197a2bc..d3b91e75cecd 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -115,6 +115,7 @@ static void tk_xtime_add(struct timekeeper *tk, const struct timespec *ts)
 {
 	tk->xtime_sec += ts->tv_sec;
 	tk->xtime_nsec += (u64)ts->tv_nsec << tk->shift;
+	tk_normalize_xtime(tk);
 }
 
 static void tk_set_wall_to_mono(struct timekeeper *tk, struct timespec wtm)
@@ -276,7 +277,7 @@ static void timekeeping_forward_now(struct timekeeper *tk)
 	tk->xtime_nsec += cycle_delta * tk->mult;
 
 	/* If arch requires, add in gettimeoffset() */
-	tk->xtime_nsec += arch_gettimeoffset() << tk->shift;
+	tk->xtime_nsec += (u64)arch_gettimeoffset() << tk->shift;
 
 	tk_normalize_xtime(tk);
 
@@ -302,10 +303,11 @@ void getnstimeofday(struct timespec *ts)
 		seq = read_seqbegin(&tk->lock);
 
 		ts->tv_sec = tk->xtime_sec;
-		ts->tv_nsec = timekeeping_get_ns(tk);
+		nsecs = timekeeping_get_ns(tk);
 
 	} while (read_seqretry(&tk->lock, seq));
 
+	ts->tv_nsec = 0;
 	timespec_add_ns(ts, nsecs);
 }
 EXPORT_SYMBOL(getnstimeofday);
@@ -344,6 +346,7 @@ void ktime_get_ts(struct timespec *ts)
 {
 	struct timekeeper *tk = &timekeeper;
 	struct timespec tomono;
+	s64 nsec;
 	unsigned int seq;
 
 	WARN_ON(timekeeping_suspended);
@@ -351,13 +354,14 @@ void ktime_get_ts(struct timespec *ts)
 	do {
 		seq = read_seqbegin(&tk->lock);
 		ts->tv_sec = tk->xtime_sec;
-		ts->tv_nsec = timekeeping_get_ns(tk);
+		nsec = timekeeping_get_ns(tk);
 		tomono = tk->wall_to_monotonic;
 
 	} while (read_seqretry(&tk->lock, seq));
 
-	set_normalized_timespec(ts, ts->tv_sec + tomono.tv_sec,
-				ts->tv_nsec + tomono.tv_nsec);
+	ts->tv_sec += tomono.tv_sec;
+	ts->tv_nsec = 0;
+	timespec_add_ns(ts, nsec + tomono.tv_nsec);
 }
 EXPORT_SYMBOL_GPL(ktime_get_ts);
 
@@ -427,7 +431,7 @@ int do_settimeofday(const struct timespec *tv)
 	struct timespec ts_delta, xt;
 	unsigned long flags;
 
-	if ((unsigned long)tv->tv_nsec >= NSEC_PER_SEC)
+	if (!timespec_valid_strict(tv))
 		return -EINVAL;
 
 	write_seqlock_irqsave(&tk->lock, flags);
@@ -463,6 +467,8 @@ int timekeeping_inject_offset(struct timespec *ts)
 {
 	struct timekeeper *tk = &timekeeper;
 	unsigned long flags;
+	struct timespec tmp;
+	int ret = 0;
 
 	if ((unsigned long)ts->tv_nsec >= NSEC_PER_SEC)
 		return -EINVAL;
@@ -471,10 +477,17 @@ int timekeeping_inject_offset(struct timespec *ts)
 
 	timekeeping_forward_now(tk);
 
+	/* Make sure the proposed value is valid */
+	tmp = timespec_add(tk_xtime(tk), *ts);
+	if (!timespec_valid_strict(&tmp)) {
+		ret = -EINVAL;
+		goto error;
+	}
 
 	tk_xtime_add(tk, ts);
 	tk_set_wall_to_mono(tk, timespec_sub(tk->wall_to_monotonic, *ts));
 
+error: /* even if we error out, we forwarded the time, so call update */
 	timekeeping_update(tk, true);
 
 	write_sequnlock_irqrestore(&tk->lock, flags);
@@ -482,7 +495,7 @@ int timekeeping_inject_offset(struct timespec *ts)
 	/* signal hrtimers about time change */
 	clock_was_set();
 
-	return 0;
+	return ret;
 }
 EXPORT_SYMBOL(timekeeping_inject_offset);
 
@@ -649,7 +662,20 @@ void __init timekeeping_init(void)
 	struct timespec now, boot, tmp;
 
 	read_persistent_clock(&now);
+	if (!timespec_valid_strict(&now)) {
+		pr_warn("WARNING: Persistent clock returned invalid value!\n"
+			" Check your CMOS/BIOS settings.\n");
+		now.tv_sec = 0;
+		now.tv_nsec = 0;
+	}
+
 	read_boot_clock(&boot);
+	if (!timespec_valid_strict(&boot)) {
+		pr_warn("WARNING: Boot clock returned invalid value!\n"
+			" Check your CMOS/BIOS settings.\n");
+		boot.tv_sec = 0;
+		boot.tv_nsec = 0;
+	}
 
 	seqlock_init(&tk->lock);
 
@@ -690,7 +716,7 @@ static struct timespec timekeeping_suspend_time;
 static void __timekeeping_inject_sleeptime(struct timekeeper *tk,
 						struct timespec *delta)
 {
-	if (!timespec_valid(delta)) {
+	if (!timespec_valid_strict(delta)) {
 		printk(KERN_WARNING "__timekeeping_inject_sleeptime: Invalid "
 					"sleep delta value!\n");
 		return;
@@ -1129,6 +1155,10 @@ static void update_wall_time(void)
 	offset = (clock->read(clock) - clock->cycle_last) & clock->mask;
 #endif
 
+	/* Check if there's really nothing to do */
+	if (offset < tk->cycle_interval)
+		goto out;
+
 	/*
 	 * With NO_HZ we may have to accumulate many cycle_intervals
 	 * (think "ticks") worth of time at once. To do this efficiently,
@@ -1161,9 +1191,9 @@ static void update_wall_time(void)
 	 * the vsyscall implementations are converted to use xtime_nsec
 	 * (shifted nanoseconds), this can be killed.
 	 */
-	remainder = tk->xtime_nsec & ((1 << tk->shift) - 1);
+	remainder = tk->xtime_nsec & ((1ULL << tk->shift) - 1);
 	tk->xtime_nsec -= remainder;
-	tk->xtime_nsec += 1 << tk->shift;
+	tk->xtime_nsec += 1ULL << tk->shift;
 	tk->ntp_error += remainder << tk->ntp_error_shift;
 
 	/*
@@ -1217,6 +1247,7 @@ void get_monotonic_boottime(struct timespec *ts)
 {
 	struct timekeeper *tk = &timekeeper;
 	struct timespec tomono, sleep;
+	s64 nsec;
 	unsigned int seq;
 
 	WARN_ON(timekeeping_suspended);
@@ -1224,14 +1255,15 @@ void get_monotonic_boottime(struct timespec *ts)
 	do {
 		seq = read_seqbegin(&tk->lock);
 		ts->tv_sec = tk->xtime_sec;
-		ts->tv_nsec = timekeeping_get_ns(tk);
+		nsec = timekeeping_get_ns(tk);
 		tomono = tk->wall_to_monotonic;
 		sleep = tk->total_sleep_time;
 
 	} while (read_seqretry(&tk->lock, seq));
 
-	set_normalized_timespec(ts, ts->tv_sec + tomono.tv_sec + sleep.tv_sec,
-				ts->tv_nsec + tomono.tv_nsec + sleep.tv_nsec);
+	ts->tv_sec += tomono.tv_sec + sleep.tv_sec;
+	ts->tv_nsec = 0;
+	timespec_add_ns(ts, nsec + tomono.tv_nsec + sleep.tv_nsec);
}
 EXPORT_SYMBOL_GPL(get_monotonic_boottime);
 
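
Several readers above stop writing raw nanoseconds into ts->tv_nsec inside the seqlock loop and instead zero tv_nsec and fold everything through timespec_add_ns(), so the result stays normalized even when the summed nanoseconds exceed a second. Below is a userspace sketch of that normalization step; it is a simplified signed variant, not the kernel helper.

#include <stdint.h>
#include <stdio.h>

#define NSEC_PER_SEC 1000000000LL

struct timespec_sketch {
	int64_t tv_sec;
	int64_t tv_nsec;
};

/*
 * Sketch of the timespec_add_ns() idea: fold an arbitrary nanosecond
 * count into a timespec so tv_nsec always ends up in [0, NSEC_PER_SEC).
 */
static void timespec_add_ns(struct timespec_sketch *ts, int64_t ns)
{
	ns += ts->tv_nsec;
	ts->tv_sec += ns / NSEC_PER_SEC;
	ts->tv_nsec = ns % NSEC_PER_SEC;
	if (ts->tv_nsec < 0) {			/* keep the remainder non-negative */
		ts->tv_nsec += NSEC_PER_SEC;
		ts->tv_sec -= 1;
	}
}

int main(void)
{
	struct timespec_sketch ts = { .tv_sec = 5, .tv_nsec = 0 };

	timespec_add_ns(&ts, 2500000000LL);	/* 2.5 seconds worth of nanoseconds */
	printf("%lld.%09lld\n", (long long)ts.tv_sec, (long long)ts.tv_nsec);
	return 0;
}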
diff --git a/kernel/timer.c b/kernel/timer.c
index a61c09374eba..8c5e7b908c68 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -1407,13 +1407,6 @@ SYSCALL_DEFINE1(alarm, unsigned int, seconds)
 
 #endif
 
-#ifndef __alpha__
-
-/*
- * The Alpha uses getxpid, getxuid, and getxgid instead. Maybe this
- * should be moved into arch/i386 instead?
- */
-
 /**
  * sys_getpid - return the thread group id of the current process
  *
@@ -1469,8 +1462,6 @@ SYSCALL_DEFINE0(getegid)
 	return from_kgid_munged(current_user_ns(), current_egid());
 }
 
-#endif
-
 static void process_timeout(unsigned long __data)
 {
 	wake_up_process((struct task_struct *)__data);
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index 60e4d7875672..6b245f64c8dd 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -506,6 +506,8 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id)
 	int size;
 
 	syscall_nr = syscall_get_nr(current, regs);
+	if (syscall_nr < 0)
+		return;
 	if (!test_bit(syscall_nr, enabled_perf_enter_syscalls))
 		return;
 
@@ -580,6 +582,8 @@ static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret)
 	int size;
 
 	syscall_nr = syscall_get_nr(current, regs);
+	if (syscall_nr < 0)
+		return;
 	if (!test_bit(syscall_nr, enabled_perf_exit_syscalls))
 		return;
 
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 692d97628a10..3c5a79e2134c 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -66,6 +66,7 @@ enum {
 
 	/* pool flags */
 	POOL_MANAGE_WORKERS = 1 << 0, /* need to manage workers */
+	POOL_MANAGING_WORKERS = 1 << 1, /* managing workers */
 
 	/* worker flags */
 	WORKER_STARTED = 1 << 0, /* started */
@@ -652,7 +653,7 @@ static bool need_to_manage_workers(struct worker_pool *pool)
 /* Do we have too many workers and should some go away? */
 static bool too_many_workers(struct worker_pool *pool)
 {
-	bool managing = mutex_is_locked(&pool->manager_mutex);
+	bool managing = pool->flags & POOL_MANAGING_WORKERS;
 	int nr_idle = pool->nr_idle + managing; /* manager is considered idle */
 	int nr_busy = pool->nr_workers - nr_idle;
 
@@ -1326,6 +1327,15 @@ static void idle_worker_rebind(struct worker *worker)
 
 	/* we did our part, wait for rebind_workers() to finish up */
 	wait_event(gcwq->rebind_hold, !(worker->flags & WORKER_REBIND));
+
+	/*
+	 * rebind_workers() shouldn't finish until all workers passed the
+	 * above WORKER_REBIND wait. Tell it when done.
+	 */
+	spin_lock_irq(&worker->pool->gcwq->lock);
+	if (!--worker->idle_rebind->cnt)
+		complete(&worker->idle_rebind->done);
+	spin_unlock_irq(&worker->pool->gcwq->lock);
 }
 
 /*
@@ -1339,8 +1349,16 @@ static void busy_worker_rebind_fn(struct work_struct *work)
 	struct worker *worker = container_of(work, struct worker, rebind_work);
 	struct global_cwq *gcwq = worker->pool->gcwq;
 
-	if (worker_maybe_bind_and_lock(worker))
-		worker_clr_flags(worker, WORKER_REBIND);
+	worker_maybe_bind_and_lock(worker);
+
+	/*
+	 * %WORKER_REBIND must be cleared even if the above binding failed;
+	 * otherwise, we may confuse the next CPU_UP cycle or oops / get
+	 * stuck by calling idle_worker_rebind() prematurely. If CPU went
+	 * down again inbetween, %WORKER_UNBOUND would be set, so clearing
+	 * %WORKER_REBIND is always safe.
+	 */
+	worker_clr_flags(worker, WORKER_REBIND);
 
 	spin_unlock_irq(&gcwq->lock);
 }
@@ -1396,12 +1414,15 @@ retry:
 	/* set REBIND and kick idle ones, we'll wait for these later */
 	for_each_worker_pool(pool, gcwq) {
 		list_for_each_entry(worker, &pool->idle_list, entry) {
+			unsigned long worker_flags = worker->flags;
+
 			if (worker->flags & WORKER_REBIND)
 				continue;
 
-			/* morph UNBOUND to REBIND */
-			worker->flags &= ~WORKER_UNBOUND;
-			worker->flags |= WORKER_REBIND;
+			/* morph UNBOUND to REBIND atomically */
+			worker_flags &= ~WORKER_UNBOUND;
+			worker_flags |= WORKER_REBIND;
+			ACCESS_ONCE(worker->flags) = worker_flags;
 
 			idle_rebind.cnt++;
 			worker->idle_rebind = &idle_rebind;
@@ -1419,25 +1440,15 @@ retry:
 		goto retry;
 	}
 
-	/*
-	 * All idle workers are rebound and waiting for %WORKER_REBIND to
-	 * be cleared inside idle_worker_rebind(). Clear and release.
-	 * Clearing %WORKER_REBIND from this foreign context is safe
-	 * because these workers are still guaranteed to be idle.
-	 */
-	for_each_worker_pool(pool, gcwq)
-		list_for_each_entry(worker, &pool->idle_list, entry)
-			worker->flags &= ~WORKER_REBIND;
-
-	wake_up_all(&gcwq->rebind_hold);
-
-	/* rebind busy workers */
+	/* all idle workers are rebound, rebind busy workers */
 	for_each_busy_worker(worker, i, pos, gcwq) {
 		struct work_struct *rebind_work = &worker->rebind_work;
+		unsigned long worker_flags = worker->flags;
 
-		/* morph UNBOUND to REBIND */
-		worker->flags &= ~WORKER_UNBOUND;
-		worker->flags |= WORKER_REBIND;
+		/* morph UNBOUND to REBIND atomically */
+		worker_flags &= ~WORKER_UNBOUND;
+		worker_flags |= WORKER_REBIND;
+		ACCESS_ONCE(worker->flags) = worker_flags;
 
 		if (test_and_set_bit(WORK_STRUCT_PENDING_BIT,
 				      work_data_bits(rebind_work)))
@@ -1449,6 +1460,34 @@ retry:
 				    worker->scheduled.next,
 				    work_color_to_flags(WORK_NO_COLOR));
 	}
+
+	/*
+	 * All idle workers are rebound and waiting for %WORKER_REBIND to
+	 * be cleared inside idle_worker_rebind(). Clear and release.
+	 * Clearing %WORKER_REBIND from this foreign context is safe
+	 * because these workers are still guaranteed to be idle.
+	 *
+	 * We need to make sure all idle workers passed WORKER_REBIND wait
+	 * in idle_worker_rebind() before returning; otherwise, workers can
+	 * get stuck at the wait if hotplug cycle repeats.
+	 */
+	idle_rebind.cnt = 1;
+	INIT_COMPLETION(idle_rebind.done);
+
+	for_each_worker_pool(pool, gcwq) {
+		list_for_each_entry(worker, &pool->idle_list, entry) {
+			worker->flags &= ~WORKER_REBIND;
+			idle_rebind.cnt++;
+		}
+	}
+
+	wake_up_all(&gcwq->rebind_hold);
+
+	if (--idle_rebind.cnt) {
+		spin_unlock_irq(&gcwq->lock);
+		wait_for_completion(&idle_rebind.done);
+		spin_lock_irq(&gcwq->lock);
+	}
 }
 
 static struct worker *alloc_worker(void)
@@ -1794,9 +1833,45 @@ static bool manage_workers(struct worker *worker)
 	struct worker_pool *pool = worker->pool;
 	bool ret = false;
 
-	if (!mutex_trylock(&pool->manager_mutex))
+	if (pool->flags & POOL_MANAGING_WORKERS)
 		return ret;
 
+	pool->flags |= POOL_MANAGING_WORKERS;
+
+	/*
+	 * To simplify both worker management and CPU hotplug, hold off
+	 * management while hotplug is in progress. CPU hotplug path can't
+	 * grab %POOL_MANAGING_WORKERS to achieve this because that can
+	 * lead to idle worker depletion (all become busy thinking someone
+	 * else is managing) which in turn can result in deadlock under
+	 * extreme circumstances. Use @pool->manager_mutex to synchronize
+	 * manager against CPU hotplug.
+	 *
+	 * manager_mutex would always be free unless CPU hotplug is in
+	 * progress. trylock first without dropping @gcwq->lock.
+	 */
+	if (unlikely(!mutex_trylock(&pool->manager_mutex))) {
+		spin_unlock_irq(&pool->gcwq->lock);
+		mutex_lock(&pool->manager_mutex);
+		/*
+		 * CPU hotplug could have happened while we were waiting
+		 * for manager_mutex. Hotplug itself can't handle us
+		 * because manager isn't either on idle or busy list, and
+		 * @gcwq's state and ours could have deviated.
+		 *
+		 * As hotplug is now excluded via manager_mutex, we can
+		 * simply try to bind. It will succeed or fail depending
+		 * on @gcwq's current state. Try it and adjust
+		 * %WORKER_UNBOUND accordingly.
		 */
+		if (worker_maybe_bind_and_lock(worker))
+			worker->flags &= ~WORKER_UNBOUND;
+		else
+			worker->flags |= WORKER_UNBOUND;
+
+		ret = true;
+	}
+
 	pool->flags &= ~POOL_MANAGE_WORKERS;
 
 	/*
@@ -1806,6 +1881,7 @@ static bool manage_workers(struct worker *worker)
 	ret |= maybe_destroy_workers(pool);
 	ret |= maybe_create_worker(pool);
 
+	pool->flags &= ~POOL_MANAGING_WORKERS;
 	mutex_unlock(&pool->manager_mutex);
 	return ret;
 }
@@ -3500,18 +3576,17 @@ static int __devinit workqueue_cpu_down_callback(struct notifier_block *nfb,
 #ifdef CONFIG_SMP
 
 struct work_for_cpu {
-	struct completion completion;
+	struct work_struct work;
 	long (*fn)(void *);
 	void *arg;
 	long ret;
 };
 
-static int do_work_for_cpu(void *_wfc)
+static void work_for_cpu_fn(struct work_struct *work)
 {
-	struct work_for_cpu *wfc = _wfc;
+	struct work_for_cpu *wfc = container_of(work, struct work_for_cpu, work);
+
 	wfc->ret = wfc->fn(wfc->arg);
-	complete(&wfc->completion);
-	return 0;
 }
 
 /**
@@ -3526,19 +3601,11 @@ static int do_work_for_cpu(void *_wfc)
  */
 long work_on_cpu(unsigned int cpu, long (*fn)(void *), void *arg)
 {
-	struct task_struct *sub_thread;
-	struct work_for_cpu wfc = {
-		.completion = COMPLETION_INITIALIZER_ONSTACK(wfc.completion),
-		.fn = fn,
-		.arg = arg,
-	};
+	struct work_for_cpu wfc = { .fn = fn, .arg = arg };
 
-	sub_thread = kthread_create(do_work_for_cpu, &wfc, "work_for_cpu");
-	if (IS_ERR(sub_thread))
-		return PTR_ERR(sub_thread);
-	kthread_bind(sub_thread, cpu);
-	wake_up_process(sub_thread);
-	wait_for_completion(&wfc.completion);
+	INIT_WORK_ONSTACK(&wfc.work, work_for_cpu_fn);
+	schedule_work_on(cpu, &wfc.work);
+	flush_work(&wfc.work);
 	return wfc.ret;
 }
 EXPORT_SYMBOL_GPL(work_on_cpu);
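
The rebind_workers()/busy_worker_rebind_fn() hunks replace two separate read-modify-write stores to worker->flags with a value composed on the stack and published by a single store through ACCESS_ONCE(), so concurrent readers never observe the intermediate state between clearing UNBOUND and setting REBIND. Here is a userspace sketch of that publish-with-one-store idea using C11 atomics; the flag values and names are illustrative only.

#include <stdatomic.h>
#include <stdio.h>

#define WORKER_UNBOUND	(1u << 7)	/* illustrative bit values, not the kernel's */
#define WORKER_REBIND	(1u << 5)

/* Hypothetical stand-in for a worker whose ->flags may be read concurrently. */
struct worker_sketch {
	_Atomic unsigned int flags;
};

/*
 * Compute the new flag word locally, then publish it with a single
 * store, so readers see either the old word or the new word but never
 * a half-updated combination.
 */
static void morph_unbound_to_rebind(struct worker_sketch *w)
{
	unsigned int worker_flags = atomic_load(&w->flags);

	worker_flags &= ~WORKER_UNBOUND;
	worker_flags |= WORKER_REBIND;
	atomic_store(&w->flags, worker_flags);	/* one visible transition */
}

int main(void)
{
	struct worker_sketch w;

	atomic_init(&w.flags, WORKER_UNBOUND);
	morph_unbound_to_rebind(&w);
	printf("flags = %#x\n", atomic_load(&w.flags));
	return 0;
}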