aboutsummaryrefslogtreecommitdiffstats
path: root/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'kernel')
-rw-r--r--kernel/fork.c1
-rw-r--r--kernel/kthread.c11
-rw-r--r--kernel/perf_event.c37
-rw-r--r--kernel/power/swap.c2
-rw-r--r--kernel/power/user.c2
-rw-r--r--kernel/resource.c104
-rw-r--r--kernel/sched.c287
-rw-r--r--kernel/taskstats.c57
-rw-r--r--kernel/timer.c8
-rw-r--r--kernel/trace/ring_buffer.c9
-rw-r--r--kernel/trace/trace.c10
-rw-r--r--kernel/user.c1
-rw-r--r--kernel/watchdog.c3
13 files changed, 361 insertions, 171 deletions
diff --git a/kernel/fork.c b/kernel/fork.c
index 067244495966..7d164e25b0f0 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -275,6 +275,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig)
275 275
276 setup_thread_stack(tsk, orig); 276 setup_thread_stack(tsk, orig);
277 clear_user_return_notifier(tsk); 277 clear_user_return_notifier(tsk);
278 clear_tsk_need_resched(tsk);
278 stackend = end_of_stack(tsk); 279 stackend = end_of_stack(tsk);
279 *stackend = STACK_END_MAGIC; /* for overflow detection */ 280 *stackend = STACK_END_MAGIC; /* for overflow detection */
280 281
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 74cf6f5e7ade..5355cfd44a3f 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -265,6 +265,17 @@ int kthreadd(void *unused)
265 return 0; 265 return 0;
266} 266}
267 267
268void __init_kthread_worker(struct kthread_worker *worker,
269 const char *name,
270 struct lock_class_key *key)
271{
272 spin_lock_init(&worker->lock);
273 lockdep_set_class_and_name(&worker->lock, key, name);
274 INIT_LIST_HEAD(&worker->work_list);
275 worker->task = NULL;
276}
277EXPORT_SYMBOL_GPL(__init_kthread_worker);
278
268/** 279/**
269 * kthread_worker_fn - kthread function to process kthread_worker 280 * kthread_worker_fn - kthread function to process kthread_worker
270 * @worker_ptr: pointer to initialized kthread_worker 281 * @worker_ptr: pointer to initialized kthread_worker
diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index eac7e3364335..2870feee81dd 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -3824,6 +3824,8 @@ static void perf_event_task_event(struct perf_task_event *task_event)
3824 rcu_read_lock(); 3824 rcu_read_lock();
3825 list_for_each_entry_rcu(pmu, &pmus, entry) { 3825 list_for_each_entry_rcu(pmu, &pmus, entry) {
3826 cpuctx = get_cpu_ptr(pmu->pmu_cpu_context); 3826 cpuctx = get_cpu_ptr(pmu->pmu_cpu_context);
3827 if (cpuctx->active_pmu != pmu)
3828 goto next;
3827 perf_event_task_ctx(&cpuctx->ctx, task_event); 3829 perf_event_task_ctx(&cpuctx->ctx, task_event);
3828 3830
3829 ctx = task_event->task_ctx; 3831 ctx = task_event->task_ctx;
@@ -3959,6 +3961,8 @@ static void perf_event_comm_event(struct perf_comm_event *comm_event)
3959 rcu_read_lock(); 3961 rcu_read_lock();
3960 list_for_each_entry_rcu(pmu, &pmus, entry) { 3962 list_for_each_entry_rcu(pmu, &pmus, entry) {
3961 cpuctx = get_cpu_ptr(pmu->pmu_cpu_context); 3963 cpuctx = get_cpu_ptr(pmu->pmu_cpu_context);
3964 if (cpuctx->active_pmu != pmu)
3965 goto next;
3962 perf_event_comm_ctx(&cpuctx->ctx, comm_event); 3966 perf_event_comm_ctx(&cpuctx->ctx, comm_event);
3963 3967
3964 ctxn = pmu->task_ctx_nr; 3968 ctxn = pmu->task_ctx_nr;
@@ -4144,6 +4148,8 @@ got_name:
4144 rcu_read_lock(); 4148 rcu_read_lock();
4145 list_for_each_entry_rcu(pmu, &pmus, entry) { 4149 list_for_each_entry_rcu(pmu, &pmus, entry) {
4146 cpuctx = get_cpu_ptr(pmu->pmu_cpu_context); 4150 cpuctx = get_cpu_ptr(pmu->pmu_cpu_context);
4151 if (cpuctx->active_pmu != pmu)
4152 goto next;
4147 perf_event_mmap_ctx(&cpuctx->ctx, mmap_event, 4153 perf_event_mmap_ctx(&cpuctx->ctx, mmap_event,
4148 vma->vm_flags & VM_EXEC); 4154 vma->vm_flags & VM_EXEC);
4149 4155
@@ -4713,7 +4719,7 @@ static int perf_swevent_init(struct perf_event *event)
4713 break; 4719 break;
4714 } 4720 }
4715 4721
4716 if (event_id > PERF_COUNT_SW_MAX) 4722 if (event_id >= PERF_COUNT_SW_MAX)
4717 return -ENOENT; 4723 return -ENOENT;
4718 4724
4719 if (!event->parent) { 4725 if (!event->parent) {
@@ -5145,20 +5151,36 @@ static void *find_pmu_context(int ctxn)
5145 return NULL; 5151 return NULL;
5146} 5152}
5147 5153
5148static void free_pmu_context(void * __percpu cpu_context) 5154static void update_pmu_context(struct pmu *pmu, struct pmu *old_pmu)
5149{ 5155{
5150 struct pmu *pmu; 5156 int cpu;
5157
5158 for_each_possible_cpu(cpu) {
5159 struct perf_cpu_context *cpuctx;
5160
5161 cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
5162
5163 if (cpuctx->active_pmu == old_pmu)
5164 cpuctx->active_pmu = pmu;
5165 }
5166}
5167
5168static void free_pmu_context(struct pmu *pmu)
5169{
5170 struct pmu *i;
5151 5171
5152 mutex_lock(&pmus_lock); 5172 mutex_lock(&pmus_lock);
5153 /* 5173 /*
5154 * Like a real lame refcount. 5174 * Like a real lame refcount.
5155 */ 5175 */
5156 list_for_each_entry(pmu, &pmus, entry) { 5176 list_for_each_entry(i, &pmus, entry) {
5157 if (pmu->pmu_cpu_context == cpu_context) 5177 if (i->pmu_cpu_context == pmu->pmu_cpu_context) {
5178 update_pmu_context(i, pmu);
5158 goto out; 5179 goto out;
5180 }
5159 } 5181 }
5160 5182
5161 free_percpu(cpu_context); 5183 free_percpu(pmu->pmu_cpu_context);
5162out: 5184out:
5163 mutex_unlock(&pmus_lock); 5185 mutex_unlock(&pmus_lock);
5164} 5186}
@@ -5190,6 +5212,7 @@ int perf_pmu_register(struct pmu *pmu)
5190 cpuctx->ctx.pmu = pmu; 5212 cpuctx->ctx.pmu = pmu;
5191 cpuctx->jiffies_interval = 1; 5213 cpuctx->jiffies_interval = 1;
5192 INIT_LIST_HEAD(&cpuctx->rotation_list); 5214 INIT_LIST_HEAD(&cpuctx->rotation_list);
5215 cpuctx->active_pmu = pmu;
5193 } 5216 }
5194 5217
5195got_cpu_context: 5218got_cpu_context:
@@ -5241,7 +5264,7 @@ void perf_pmu_unregister(struct pmu *pmu)
5241 synchronize_rcu(); 5264 synchronize_rcu();
5242 5265
5243 free_percpu(pmu->pmu_disable_count); 5266 free_percpu(pmu->pmu_disable_count);
5244 free_pmu_context(pmu->pmu_cpu_context); 5267 free_pmu_context(pmu);
5245} 5268}
5246 5269
5247struct pmu *perf_init_event(struct perf_event *event) 5270struct pmu *perf_init_event(struct perf_event *event)
diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index baf667bb2794..8c7e4832b9be 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -30,7 +30,7 @@
30 30
31#include "power.h" 31#include "power.h"
32 32
33#define HIBERNATE_SIG "LINHIB0001" 33#define HIBERNATE_SIG "S1SUSPEND"
34 34
35/* 35/*
36 * The swap map is a data structure used for keeping track of each page 36 * The swap map is a data structure used for keeping track of each page
diff --git a/kernel/power/user.c b/kernel/power/user.c
index 1b2ea31e6bd8..c36c3b9e8a84 100644
--- a/kernel/power/user.c
+++ b/kernel/power/user.c
@@ -137,7 +137,7 @@ static int snapshot_release(struct inode *inode, struct file *filp)
137 free_all_swap_pages(data->swap); 137 free_all_swap_pages(data->swap);
138 if (data->frozen) 138 if (data->frozen)
139 thaw_processes(); 139 thaw_processes();
140 pm_notifier_call_chain(data->mode == O_WRONLY ? 140 pm_notifier_call_chain(data->mode == O_RDONLY ?
141 PM_POST_HIBERNATION : PM_POST_RESTORE); 141 PM_POST_HIBERNATION : PM_POST_RESTORE);
142 atomic_inc(&snapshot_device_available); 142 atomic_inc(&snapshot_device_available);
143 143
diff --git a/kernel/resource.c b/kernel/resource.c
index 9fad33efd0db..798e2fae2a06 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -40,23 +40,6 @@ EXPORT_SYMBOL(iomem_resource);
40 40
41static DEFINE_RWLOCK(resource_lock); 41static DEFINE_RWLOCK(resource_lock);
42 42
43/*
44 * By default, we allocate free space bottom-up. The architecture can request
45 * top-down by clearing this flag. The user can override the architecture's
46 * choice with the "resource_alloc_from_bottom" kernel boot option, but that
47 * should only be a debugging tool.
48 */
49int resource_alloc_from_bottom = 1;
50
51static __init int setup_alloc_from_bottom(char *s)
52{
53 printk(KERN_INFO
54 "resource: allocating from bottom-up; please report a bug\n");
55 resource_alloc_from_bottom = 1;
56 return 0;
57}
58early_param("resource_alloc_from_bottom", setup_alloc_from_bottom);
59
60static void *r_next(struct seq_file *m, void *v, loff_t *pos) 43static void *r_next(struct seq_file *m, void *v, loff_t *pos)
61{ 44{
62 struct resource *p = v; 45 struct resource *p = v;
@@ -374,6 +357,10 @@ int __weak page_is_ram(unsigned long pfn)
374 return walk_system_ram_range(pfn, 1, NULL, __is_ram) == 1; 357 return walk_system_ram_range(pfn, 1, NULL, __is_ram) == 1;
375} 358}
376 359
360void __weak arch_remove_reservations(struct resource *avail)
361{
362}
363
377static resource_size_t simple_align_resource(void *data, 364static resource_size_t simple_align_resource(void *data,
378 const struct resource *avail, 365 const struct resource *avail,
379 resource_size_t size, 366 resource_size_t size,
@@ -397,74 +384,7 @@ static bool resource_contains(struct resource *res1, struct resource *res2)
397} 384}
398 385
399/* 386/*
400 * Find the resource before "child" in the sibling list of "root" children.
401 */
402static struct resource *find_sibling_prev(struct resource *root, struct resource *child)
403{
404 struct resource *this;
405
406 for (this = root->child; this; this = this->sibling)
407 if (this->sibling == child)
408 return this;
409
410 return NULL;
411}
412
413/*
414 * Find empty slot in the resource tree given range and alignment. 387 * Find empty slot in the resource tree given range and alignment.
415 * This version allocates from the end of the root resource first.
416 */
417static int find_resource_from_top(struct resource *root, struct resource *new,
418 resource_size_t size, resource_size_t min,
419 resource_size_t max, resource_size_t align,
420 resource_size_t (*alignf)(void *,
421 const struct resource *,
422 resource_size_t,
423 resource_size_t),
424 void *alignf_data)
425{
426 struct resource *this;
427 struct resource tmp, avail, alloc;
428
429 tmp.start = root->end;
430 tmp.end = root->end;
431
432 this = find_sibling_prev(root, NULL);
433 for (;;) {
434 if (this) {
435 if (this->end < root->end)
436 tmp.start = this->end + 1;
437 } else
438 tmp.start = root->start;
439
440 resource_clip(&tmp, min, max);
441
442 /* Check for overflow after ALIGN() */
443 avail = *new;
444 avail.start = ALIGN(tmp.start, align);
445 avail.end = tmp.end;
446 if (avail.start >= tmp.start) {
447 alloc.start = alignf(alignf_data, &avail, size, align);
448 alloc.end = alloc.start + size - 1;
449 if (resource_contains(&avail, &alloc)) {
450 new->start = alloc.start;
451 new->end = alloc.end;
452 return 0;
453 }
454 }
455
456 if (!this || this->start == root->start)
457 break;
458
459 tmp.end = this->start - 1;
460 this = find_sibling_prev(root, this);
461 }
462 return -EBUSY;
463}
464
465/*
466 * Find empty slot in the resource tree given range and alignment.
467 * This version allocates from the beginning of the root resource first.
468 */ 388 */
469static int find_resource(struct resource *root, struct resource *new, 389static int find_resource(struct resource *root, struct resource *new,
470 resource_size_t size, resource_size_t min, 390 resource_size_t size, resource_size_t min,
@@ -478,23 +398,24 @@ static int find_resource(struct resource *root, struct resource *new,
478 struct resource *this = root->child; 398 struct resource *this = root->child;
479 struct resource tmp = *new, avail, alloc; 399 struct resource tmp = *new, avail, alloc;
480 400
401 tmp.flags = new->flags;
481 tmp.start = root->start; 402 tmp.start = root->start;
482 /* 403 /*
483 * Skip past an allocated resource that starts at 0, since the 404 * Skip past an allocated resource that starts at 0, since the assignment
484 * assignment of this->start - 1 to tmp->end below would cause an 405 * of this->start - 1 to tmp->end below would cause an underflow.
485 * underflow.
486 */ 406 */
487 if (this && this->start == 0) { 407 if (this && this->start == 0) {
488 tmp.start = this->end + 1; 408 tmp.start = this->end + 1;
489 this = this->sibling; 409 this = this->sibling;
490 } 410 }
491 for (;;) { 411 for(;;) {
492 if (this) 412 if (this)
493 tmp.end = this->start - 1; 413 tmp.end = this->start - 1;
494 else 414 else
495 tmp.end = root->end; 415 tmp.end = root->end;
496 416
497 resource_clip(&tmp, min, max); 417 resource_clip(&tmp, min, max);
418 arch_remove_reservations(&tmp);
498 419
499 /* Check for overflow after ALIGN() */ 420 /* Check for overflow after ALIGN() */
500 avail = *new; 421 avail = *new;
@@ -509,10 +430,8 @@ static int find_resource(struct resource *root, struct resource *new,
509 return 0; 430 return 0;
510 } 431 }
511 } 432 }
512
513 if (!this) 433 if (!this)
514 break; 434 break;
515
516 tmp.start = this->end + 1; 435 tmp.start = this->end + 1;
517 this = this->sibling; 436 this = this->sibling;
518 } 437 }
@@ -545,10 +464,7 @@ int allocate_resource(struct resource *root, struct resource *new,
545 alignf = simple_align_resource; 464 alignf = simple_align_resource;
546 465
547 write_lock(&resource_lock); 466 write_lock(&resource_lock);
548 if (resource_alloc_from_bottom) 467 err = find_resource(root, new, size, min, max, align, alignf, alignf_data);
549 err = find_resource(root, new, size, min, max, align, alignf, alignf_data);
550 else
551 err = find_resource_from_top(root, new, size, min, max, align, alignf, alignf_data);
552 if (err >= 0 && __request_resource(root, new)) 468 if (err >= 0 && __request_resource(root, new))
553 err = -EBUSY; 469 err = -EBUSY;
554 write_unlock(&resource_lock); 470 write_unlock(&resource_lock);
diff --git a/kernel/sched.c b/kernel/sched.c
index 9f9dd8dda53c..f2f914e0c47c 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -642,22 +642,18 @@ static inline struct task_group *task_group(struct task_struct *p)
642 642
643#endif /* CONFIG_CGROUP_SCHED */ 643#endif /* CONFIG_CGROUP_SCHED */
644 644
645static u64 irq_time_cpu(int cpu); 645static void update_rq_clock_task(struct rq *rq, s64 delta);
646static void sched_irq_time_avg_update(struct rq *rq, u64 irq_time);
647 646
648inline void update_rq_clock(struct rq *rq) 647static void update_rq_clock(struct rq *rq)
649{ 648{
650 if (!rq->skip_clock_update) { 649 s64 delta;
651 int cpu = cpu_of(rq);
652 u64 irq_time;
653 650
654 rq->clock = sched_clock_cpu(cpu); 651 if (rq->skip_clock_update)
655 irq_time = irq_time_cpu(cpu); 652 return;
656 if (rq->clock - irq_time > rq->clock_task)
657 rq->clock_task = rq->clock - irq_time;
658 653
659 sched_irq_time_avg_update(rq, irq_time); 654 delta = sched_clock_cpu(cpu_of(rq)) - rq->clock;
660 } 655 rq->clock += delta;
656 update_rq_clock_task(rq, delta);
661} 657}
662 658
663/* 659/*
@@ -1795,10 +1791,9 @@ static void deactivate_task(struct rq *rq, struct task_struct *p, int flags)
1795 * They are read and saved off onto struct rq in update_rq_clock(). 1791 * They are read and saved off onto struct rq in update_rq_clock().
1796 * This may result in other CPU reading this CPU's irq time and can 1792 * This may result in other CPU reading this CPU's irq time and can
1797 * race with irq/account_system_vtime on this CPU. We would either get old 1793 * race with irq/account_system_vtime on this CPU. We would either get old
1798 * or new value (or semi updated value on 32 bit) with a side effect of 1794 * or new value with a side effect of accounting a slice of irq time to wrong
1799 * accounting a slice of irq time to wrong task when irq is in progress 1795 * task when irq is in progress while we read rq->clock. That is a worthy
1800 * while we read rq->clock. That is a worthy compromise in place of having 1796 * compromise in place of having locks on each irq in account_system_time.
1801 * locks on each irq in account_system_time.
1802 */ 1797 */
1803static DEFINE_PER_CPU(u64, cpu_hardirq_time); 1798static DEFINE_PER_CPU(u64, cpu_hardirq_time);
1804static DEFINE_PER_CPU(u64, cpu_softirq_time); 1799static DEFINE_PER_CPU(u64, cpu_softirq_time);
@@ -1816,19 +1811,58 @@ void disable_sched_clock_irqtime(void)
1816 sched_clock_irqtime = 0; 1811 sched_clock_irqtime = 0;
1817} 1812}
1818 1813
1819static u64 irq_time_cpu(int cpu) 1814#ifndef CONFIG_64BIT
1815static DEFINE_PER_CPU(seqcount_t, irq_time_seq);
1816
1817static inline void irq_time_write_begin(void)
1820{ 1818{
1821 if (!sched_clock_irqtime) 1819 __this_cpu_inc(irq_time_seq.sequence);
1822 return 0; 1820 smp_wmb();
1821}
1822
1823static inline void irq_time_write_end(void)
1824{
1825 smp_wmb();
1826 __this_cpu_inc(irq_time_seq.sequence);
1827}
1828
1829static inline u64 irq_time_read(int cpu)
1830{
1831 u64 irq_time;
1832 unsigned seq;
1823 1833
1834 do {
1835 seq = read_seqcount_begin(&per_cpu(irq_time_seq, cpu));
1836 irq_time = per_cpu(cpu_softirq_time, cpu) +
1837 per_cpu(cpu_hardirq_time, cpu);
1838 } while (read_seqcount_retry(&per_cpu(irq_time_seq, cpu), seq));
1839
1840 return irq_time;
1841}
1842#else /* CONFIG_64BIT */
1843static inline void irq_time_write_begin(void)
1844{
1845}
1846
1847static inline void irq_time_write_end(void)
1848{
1849}
1850
1851static inline u64 irq_time_read(int cpu)
1852{
1824 return per_cpu(cpu_softirq_time, cpu) + per_cpu(cpu_hardirq_time, cpu); 1853 return per_cpu(cpu_softirq_time, cpu) + per_cpu(cpu_hardirq_time, cpu);
1825} 1854}
1855#endif /* CONFIG_64BIT */
1826 1856
1857/*
1858 * Called before incrementing preempt_count on {soft,}irq_enter
1859 * and before decrementing preempt_count on {soft,}irq_exit.
1860 */
1827void account_system_vtime(struct task_struct *curr) 1861void account_system_vtime(struct task_struct *curr)
1828{ 1862{
1829 unsigned long flags; 1863 unsigned long flags;
1864 s64 delta;
1830 int cpu; 1865 int cpu;
1831 u64 now, delta;
1832 1866
1833 if (!sched_clock_irqtime) 1867 if (!sched_clock_irqtime)
1834 return; 1868 return;
@@ -1836,9 +1870,10 @@ void account_system_vtime(struct task_struct *curr)
1836 local_irq_save(flags); 1870 local_irq_save(flags);
1837 1871
1838 cpu = smp_processor_id(); 1872 cpu = smp_processor_id();
1839 now = sched_clock_cpu(cpu); 1873 delta = sched_clock_cpu(cpu) - __this_cpu_read(irq_start_time);
1840 delta = now - per_cpu(irq_start_time, cpu); 1874 __this_cpu_add(irq_start_time, delta);
1841 per_cpu(irq_start_time, cpu) = now; 1875
1876 irq_time_write_begin();
1842 /* 1877 /*
1843 * We do not account for softirq time from ksoftirqd here. 1878 * We do not account for softirq time from ksoftirqd here.
1844 * We want to continue accounting softirq time to ksoftirqd thread 1879 * We want to continue accounting softirq time to ksoftirqd thread
@@ -1846,33 +1881,55 @@ void account_system_vtime(struct task_struct *curr)
1846 * that do not consume any time, but still wants to run. 1881 * that do not consume any time, but still wants to run.
1847 */ 1882 */
1848 if (hardirq_count()) 1883 if (hardirq_count())
1849 per_cpu(cpu_hardirq_time, cpu) += delta; 1884 __this_cpu_add(cpu_hardirq_time, delta);
1850 else if (in_serving_softirq() && !(curr->flags & PF_KSOFTIRQD)) 1885 else if (in_serving_softirq() && !(curr->flags & PF_KSOFTIRQD))
1851 per_cpu(cpu_softirq_time, cpu) += delta; 1886 __this_cpu_add(cpu_softirq_time, delta);
1852 1887
1888 irq_time_write_end();
1853 local_irq_restore(flags); 1889 local_irq_restore(flags);
1854} 1890}
1855EXPORT_SYMBOL_GPL(account_system_vtime); 1891EXPORT_SYMBOL_GPL(account_system_vtime);
1856 1892
1857static void sched_irq_time_avg_update(struct rq *rq, u64 curr_irq_time) 1893static void update_rq_clock_task(struct rq *rq, s64 delta)
1858{ 1894{
1859 if (sched_clock_irqtime && sched_feat(NONIRQ_POWER)) { 1895 s64 irq_delta;
1860 u64 delta_irq = curr_irq_time - rq->prev_irq_time; 1896
1861 rq->prev_irq_time = curr_irq_time; 1897 irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time;
1862 sched_rt_avg_update(rq, delta_irq); 1898
1863 } 1899 /*
1900 * Since irq_time is only updated on {soft,}irq_exit, we might run into
1901 * this case when a previous update_rq_clock() happened inside a
1902 * {soft,}irq region.
1903 *
1904 * When this happens, we stop ->clock_task and only update the
1905 * prev_irq_time stamp to account for the part that fit, so that a next
1906 * update will consume the rest. This ensures ->clock_task is
1907 * monotonic.
1908 *
1909 * It does however cause some slight miss-attribution of {soft,}irq
1910 * time, a more accurate solution would be to update the irq_time using
1911 * the current rq->clock timestamp, except that would require using
1912 * atomic ops.
1913 */
1914 if (irq_delta > delta)
1915 irq_delta = delta;
1916
1917 rq->prev_irq_time += irq_delta;
1918 delta -= irq_delta;
1919 rq->clock_task += delta;
1920
1921 if (irq_delta && sched_feat(NONIRQ_POWER))
1922 sched_rt_avg_update(rq, irq_delta);
1864} 1923}
1865 1924
1866#else 1925#else /* CONFIG_IRQ_TIME_ACCOUNTING */
1867 1926
1868static u64 irq_time_cpu(int cpu) 1927static void update_rq_clock_task(struct rq *rq, s64 delta)
1869{ 1928{
1870 return 0; 1929 rq->clock_task += delta;
1871} 1930}
1872 1931
1873static void sched_irq_time_avg_update(struct rq *rq, u64 curr_irq_time) { } 1932#endif /* CONFIG_IRQ_TIME_ACCOUNTING */
1874
1875#endif
1876 1933
1877#include "sched_idletask.c" 1934#include "sched_idletask.c"
1878#include "sched_fair.c" 1935#include "sched_fair.c"
@@ -2001,7 +2058,7 @@ static void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
2001 * A queue event has occurred, and we're going to schedule. In 2058 * A queue event has occurred, and we're going to schedule. In
2002 * this case, we can save a useless back to back clock update. 2059 * this case, we can save a useless back to back clock update.
2003 */ 2060 */
2004 if (test_tsk_need_resched(rq->curr)) 2061 if (rq->curr->se.on_rq && test_tsk_need_resched(rq->curr))
2005 rq->skip_clock_update = 1; 2062 rq->skip_clock_update = 1;
2006} 2063}
2007 2064
@@ -2988,6 +3045,15 @@ static long calc_load_fold_active(struct rq *this_rq)
2988 return delta; 3045 return delta;
2989} 3046}
2990 3047
3048static unsigned long
3049calc_load(unsigned long load, unsigned long exp, unsigned long active)
3050{
3051 load *= exp;
3052 load += active * (FIXED_1 - exp);
3053 load += 1UL << (FSHIFT - 1);
3054 return load >> FSHIFT;
3055}
3056
2991#ifdef CONFIG_NO_HZ 3057#ifdef CONFIG_NO_HZ
2992/* 3058/*
2993 * For NO_HZ we delay the active fold to the next LOAD_FREQ update. 3059 * For NO_HZ we delay the active fold to the next LOAD_FREQ update.
@@ -3017,6 +3083,128 @@ static long calc_load_fold_idle(void)
3017 3083
3018 return delta; 3084 return delta;
3019} 3085}
3086
3087/**
3088 * fixed_power_int - compute: x^n, in O(log n) time
3089 *
3090 * @x: base of the power
3091 * @frac_bits: fractional bits of @x
3092 * @n: power to raise @x to.
3093 *
3094 * By exploiting the relation between the definition of the natural power
3095 * function: x^n := x*x*...*x (x multiplied by itself for n times), and
3096 * the binary encoding of numbers used by computers: n := \Sum n_i * 2^i,
3097 * (where: n_i \elem {0, 1}, the binary vector representing n),
3098 * we find: x^n := x^(\Sum n_i * 2^i) := \Prod x^(n_i * 2^i), which is
3099 * of course trivially computable in O(log_2 n), the length of our binary
3100 * vector.
3101 */
3102static unsigned long
3103fixed_power_int(unsigned long x, unsigned int frac_bits, unsigned int n)
3104{
3105 unsigned long result = 1UL << frac_bits;
3106
3107 if (n) for (;;) {
3108 if (n & 1) {
3109 result *= x;
3110 result += 1UL << (frac_bits - 1);
3111 result >>= frac_bits;
3112 }
3113 n >>= 1;
3114 if (!n)
3115 break;
3116 x *= x;
3117 x += 1UL << (frac_bits - 1);
3118 x >>= frac_bits;
3119 }
3120
3121 return result;
3122}
3123
3124/*
3125 * a1 = a0 * e + a * (1 - e)
3126 *
3127 * a2 = a1 * e + a * (1 - e)
3128 * = (a0 * e + a * (1 - e)) * e + a * (1 - e)
3129 * = a0 * e^2 + a * (1 - e) * (1 + e)
3130 *
3131 * a3 = a2 * e + a * (1 - e)
3132 * = (a0 * e^2 + a * (1 - e) * (1 + e)) * e + a * (1 - e)
3133 * = a0 * e^3 + a * (1 - e) * (1 + e + e^2)
3134 *
3135 * ...
3136 *
3137 * an = a0 * e^n + a * (1 - e) * (1 + e + ... + e^n-1) [1]
3138 * = a0 * e^n + a * (1 - e) * (1 - e^n)/(1 - e)
3139 * = a0 * e^n + a * (1 - e^n)
3140 *
3141 * [1] application of the geometric series:
3142 *
3143 * n 1 - x^(n+1)
3144 * S_n := \Sum x^i = -------------
3145 * i=0 1 - x
3146 */
3147static unsigned long
3148calc_load_n(unsigned long load, unsigned long exp,
3149 unsigned long active, unsigned int n)
3150{
3151
3152 return calc_load(load, fixed_power_int(exp, FSHIFT, n), active);
3153}
3154
3155/*
3156 * NO_HZ can leave us missing all per-cpu ticks calling
3157 * calc_load_account_active(), but since an idle CPU folds its delta into
3158 * calc_load_tasks_idle per calc_load_account_idle(), all we need to do is fold
3159 * in the pending idle delta if our idle period crossed a load cycle boundary.
3160 *
3161 * Once we've updated the global active value, we need to apply the exponential
3162 * weights adjusted to the number of cycles missed.
3163 */
3164static void calc_global_nohz(unsigned long ticks)
3165{
3166 long delta, active, n;
3167
3168 if (time_before(jiffies, calc_load_update))
3169 return;
3170
3171 /*
3172 * If we crossed a calc_load_update boundary, make sure to fold
3173 * any pending idle changes, the respective CPUs might have
3174 * missed the tick driven calc_load_account_active() update
3175 * due to NO_HZ.
3176 */
3177 delta = calc_load_fold_idle();
3178 if (delta)
3179 atomic_long_add(delta, &calc_load_tasks);
3180
3181 /*
3182 * If we were idle for multiple load cycles, apply them.
3183 */
3184 if (ticks >= LOAD_FREQ) {
3185 n = ticks / LOAD_FREQ;
3186
3187 active = atomic_long_read(&calc_load_tasks);
3188 active = active > 0 ? active * FIXED_1 : 0;
3189
3190 avenrun[0] = calc_load_n(avenrun[0], EXP_1, active, n);
3191 avenrun[1] = calc_load_n(avenrun[1], EXP_5, active, n);
3192 avenrun[2] = calc_load_n(avenrun[2], EXP_15, active, n);
3193
3194 calc_load_update += n * LOAD_FREQ;
3195 }
3196
3197 /*
3198 * Its possible the remainder of the above division also crosses
3199 * a LOAD_FREQ period, the regular check in calc_global_load()
3200 * which comes after this will take care of that.
3201 *
3202 * Consider us being 11 ticks before a cycle completion, and us
3203 * sleeping for 4*LOAD_FREQ + 22 ticks, then the above code will
3204 * age us 4 cycles, and the test in calc_global_load() will
3205 * pick up the final one.
3206 */
3207}
3020#else 3208#else
3021static void calc_load_account_idle(struct rq *this_rq) 3209static void calc_load_account_idle(struct rq *this_rq)
3022{ 3210{
@@ -3026,6 +3214,10 @@ static inline long calc_load_fold_idle(void)
3026{ 3214{
3027 return 0; 3215 return 0;
3028} 3216}
3217
3218static void calc_global_nohz(unsigned long ticks)
3219{
3220}
3029#endif 3221#endif
3030 3222
3031/** 3223/**
@@ -3043,24 +3235,17 @@ void get_avenrun(unsigned long *loads, unsigned long offset, int shift)
3043 loads[2] = (avenrun[2] + offset) << shift; 3235 loads[2] = (avenrun[2] + offset) << shift;
3044} 3236}
3045 3237
3046static unsigned long
3047calc_load(unsigned long load, unsigned long exp, unsigned long active)
3048{
3049 load *= exp;
3050 load += active * (FIXED_1 - exp);
3051 return load >> FSHIFT;
3052}
3053
3054/* 3238/*
3055 * calc_load - update the avenrun load estimates 10 ticks after the 3239 * calc_load - update the avenrun load estimates 10 ticks after the
3056 * CPUs have updated calc_load_tasks. 3240 * CPUs have updated calc_load_tasks.
3057 */ 3241 */
3058void calc_global_load(void) 3242void calc_global_load(unsigned long ticks)
3059{ 3243{
3060 unsigned long upd = calc_load_update + 10;
3061 long active; 3244 long active;
3062 3245
3063 if (time_before(jiffies, upd)) 3246 calc_global_nohz(ticks);
3247
3248 if (time_before(jiffies, calc_load_update + 10))
3064 return; 3249 return;
3065 3250
3066 active = atomic_long_read(&calc_load_tasks); 3251 active = atomic_long_read(&calc_load_tasks);
@@ -3714,7 +3899,6 @@ static void put_prev_task(struct rq *rq, struct task_struct *prev)
3714{ 3899{
3715 if (prev->se.on_rq) 3900 if (prev->se.on_rq)
3716 update_rq_clock(rq); 3901 update_rq_clock(rq);
3717 rq->skip_clock_update = 0;
3718 prev->sched_class->put_prev_task(rq, prev); 3902 prev->sched_class->put_prev_task(rq, prev);
3719} 3903}
3720 3904
@@ -3772,7 +3956,6 @@ need_resched_nonpreemptible:
3772 hrtick_clear(rq); 3956 hrtick_clear(rq);
3773 3957
3774 raw_spin_lock_irq(&rq->lock); 3958 raw_spin_lock_irq(&rq->lock);
3775 clear_tsk_need_resched(prev);
3776 3959
3777 switch_count = &prev->nivcsw; 3960 switch_count = &prev->nivcsw;
3778 if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) { 3961 if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
@@ -3804,6 +3987,8 @@ need_resched_nonpreemptible:
3804 3987
3805 put_prev_task(rq, prev); 3988 put_prev_task(rq, prev);
3806 next = pick_next_task(rq); 3989 next = pick_next_task(rq);
3990 clear_tsk_need_resched(prev);
3991 rq->skip_clock_update = 0;
3807 3992
3808 if (likely(prev != next)) { 3993 if (likely(prev != next)) {
3809 sched_info_switch(prev, next); 3994 sched_info_switch(prev, next);
diff --git a/kernel/taskstats.c b/kernel/taskstats.c
index c8231fb15708..3308fd7f1b52 100644
--- a/kernel/taskstats.c
+++ b/kernel/taskstats.c
@@ -349,25 +349,47 @@ static int parse(struct nlattr *na, struct cpumask *mask)
349 return ret; 349 return ret;
350} 350}
351 351
352#ifdef CONFIG_IA64
353#define TASKSTATS_NEEDS_PADDING 1
354#endif
355
352static struct taskstats *mk_reply(struct sk_buff *skb, int type, u32 pid) 356static struct taskstats *mk_reply(struct sk_buff *skb, int type, u32 pid)
353{ 357{
354 struct nlattr *na, *ret; 358 struct nlattr *na, *ret;
355 int aggr; 359 int aggr;
356 360
357 /* If we don't pad, we end up with alignment on a 4 byte boundary.
358 * This causes lots of runtime warnings on systems requiring 8 byte
359 * alignment */
360 u32 pids[2] = { pid, 0 };
361 int pid_size = ALIGN(sizeof(pid), sizeof(long));
362
363 aggr = (type == TASKSTATS_TYPE_PID) 361 aggr = (type == TASKSTATS_TYPE_PID)
364 ? TASKSTATS_TYPE_AGGR_PID 362 ? TASKSTATS_TYPE_AGGR_PID
365 : TASKSTATS_TYPE_AGGR_TGID; 363 : TASKSTATS_TYPE_AGGR_TGID;
366 364
365 /*
366 * The taskstats structure is internally aligned on 8 byte
367 * boundaries but the layout of the aggregrate reply, with
368 * two NLA headers and the pid (each 4 bytes), actually
369 * force the entire structure to be unaligned. This causes
370 * the kernel to issue unaligned access warnings on some
371 * architectures like ia64. Unfortunately, some software out there
372 * doesn't properly unroll the NLA packet and assumes that the start
373 * of the taskstats structure will always be 20 bytes from the start
374 * of the netlink payload. Aligning the start of the taskstats
375 * structure breaks this software, which we don't want. So, for now
376 * the alignment only happens on architectures that require it
377 * and those users will have to update to fixed versions of those
378 * packages. Space is reserved in the packet only when needed.
379 * This ifdef should be removed in several years e.g. 2012 once
380 * we can be confident that fixed versions are installed on most
381 * systems. We add the padding before the aggregate since the
382 * aggregate is already a defined type.
383 */
384#ifdef TASKSTATS_NEEDS_PADDING
385 if (nla_put(skb, TASKSTATS_TYPE_NULL, 0, NULL) < 0)
386 goto err;
387#endif
367 na = nla_nest_start(skb, aggr); 388 na = nla_nest_start(skb, aggr);
368 if (!na) 389 if (!na)
369 goto err; 390 goto err;
370 if (nla_put(skb, type, pid_size, pids) < 0) 391
392 if (nla_put(skb, type, sizeof(pid), &pid) < 0)
371 goto err; 393 goto err;
372 ret = nla_reserve(skb, TASKSTATS_TYPE_STATS, sizeof(struct taskstats)); 394 ret = nla_reserve(skb, TASKSTATS_TYPE_STATS, sizeof(struct taskstats));
373 if (!ret) 395 if (!ret)
@@ -456,6 +478,18 @@ out:
456 return rc; 478 return rc;
457} 479}
458 480
481static size_t taskstats_packet_size(void)
482{
483 size_t size;
484
485 size = nla_total_size(sizeof(u32)) +
486 nla_total_size(sizeof(struct taskstats)) + nla_total_size(0);
487#ifdef TASKSTATS_NEEDS_PADDING
488 size += nla_total_size(0); /* Padding for alignment */
489#endif
490 return size;
491}
492
459static int cmd_attr_pid(struct genl_info *info) 493static int cmd_attr_pid(struct genl_info *info)
460{ 494{
461 struct taskstats *stats; 495 struct taskstats *stats;
@@ -464,8 +498,7 @@ static int cmd_attr_pid(struct genl_info *info)
464 u32 pid; 498 u32 pid;
465 int rc; 499 int rc;
466 500
467 size = nla_total_size(sizeof(u32)) + 501 size = taskstats_packet_size();
468 nla_total_size(sizeof(struct taskstats)) + nla_total_size(0);
469 502
470 rc = prepare_reply(info, TASKSTATS_CMD_NEW, &rep_skb, size); 503 rc = prepare_reply(info, TASKSTATS_CMD_NEW, &rep_skb, size);
471 if (rc < 0) 504 if (rc < 0)
@@ -494,8 +527,7 @@ static int cmd_attr_tgid(struct genl_info *info)
494 u32 tgid; 527 u32 tgid;
495 int rc; 528 int rc;
496 529
497 size = nla_total_size(sizeof(u32)) + 530 size = taskstats_packet_size();
498 nla_total_size(sizeof(struct taskstats)) + nla_total_size(0);
499 531
500 rc = prepare_reply(info, TASKSTATS_CMD_NEW, &rep_skb, size); 532 rc = prepare_reply(info, TASKSTATS_CMD_NEW, &rep_skb, size);
501 if (rc < 0) 533 if (rc < 0)
@@ -570,8 +602,7 @@ void taskstats_exit(struct task_struct *tsk, int group_dead)
570 /* 602 /*
571 * Size includes space for nested attributes 603 * Size includes space for nested attributes
572 */ 604 */
573 size = nla_total_size(sizeof(u32)) + 605 size = taskstats_packet_size();
574 nla_total_size(sizeof(struct taskstats)) + nla_total_size(0);
575 606
576 is_thread_group = !!taskstats_tgid_alloc(tsk); 607 is_thread_group = !!taskstats_tgid_alloc(tsk);
577 if (is_thread_group) { 608 if (is_thread_group) {
diff --git a/kernel/timer.c b/kernel/timer.c
index 68a9ae7679b7..353b9227c2ec 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -1252,6 +1252,12 @@ unsigned long get_next_timer_interrupt(unsigned long now)
1252 struct tvec_base *base = __get_cpu_var(tvec_bases); 1252 struct tvec_base *base = __get_cpu_var(tvec_bases);
1253 unsigned long expires; 1253 unsigned long expires;
1254 1254
1255 /*
1256 * Pretend that there is no timer pending if the cpu is offline.
1257 * Possible pending timers will be migrated later to an active cpu.
1258 */
1259 if (cpu_is_offline(smp_processor_id()))
1260 return now + NEXT_TIMER_MAX_DELTA;
1255 spin_lock(&base->lock); 1261 spin_lock(&base->lock);
1256 if (time_before_eq(base->next_timer, base->timer_jiffies)) 1262 if (time_before_eq(base->next_timer, base->timer_jiffies))
1257 base->next_timer = __next_timer_interrupt(base); 1263 base->next_timer = __next_timer_interrupt(base);
@@ -1319,7 +1325,7 @@ void do_timer(unsigned long ticks)
1319{ 1325{
1320 jiffies_64 += ticks; 1326 jiffies_64 += ticks;
1321 update_wall_time(); 1327 update_wall_time();
1322 calc_global_load(); 1328 calc_global_load(ticks);
1323} 1329}
1324 1330
1325#ifdef __ARCH_WANT_SYS_ALARM 1331#ifdef __ARCH_WANT_SYS_ALARM
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 9ed509a015d8..bd1c35a4fbcc 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -3853,6 +3853,13 @@ int ring_buffer_read_page(struct ring_buffer *buffer,
3853 3853
3854 /* Need to copy one event at a time */ 3854 /* Need to copy one event at a time */
3855 do { 3855 do {
3856 /* We need the size of one event, because
3857 * rb_advance_reader only advances by one event,
3858 * whereas rb_event_ts_length may include the size of
3859 * one or two events.
3860 * We have already ensured there's enough space if this
3861 * is a time extend. */
3862 size = rb_event_length(event);
3856 memcpy(bpage->data + pos, rpage->data + rpos, size); 3863 memcpy(bpage->data + pos, rpage->data + rpos, size);
3857 3864
3858 len -= size; 3865 len -= size;
@@ -3867,7 +3874,7 @@ int ring_buffer_read_page(struct ring_buffer *buffer,
3867 event = rb_reader_event(cpu_buffer); 3874 event = rb_reader_event(cpu_buffer);
3868 /* Always keep the time extend and data together */ 3875 /* Always keep the time extend and data together */
3869 size = rb_event_ts_length(event); 3876 size = rb_event_ts_length(event);
3870 } while (len > size); 3877 } while (len >= size);
3871 3878
3872 /* update bpage */ 3879 /* update bpage */
3873 local_set(&bpage->commit, pos); 3880 local_set(&bpage->commit, pos);
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index c380612273bf..f8cf959bad45 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -2338,11 +2338,19 @@ tracing_write_stub(struct file *filp, const char __user *ubuf,
2338 return count; 2338 return count;
2339} 2339}
2340 2340
2341static loff_t tracing_seek(struct file *file, loff_t offset, int origin)
2342{
2343 if (file->f_mode & FMODE_READ)
2344 return seq_lseek(file, offset, origin);
2345 else
2346 return 0;
2347}
2348
2341static const struct file_operations tracing_fops = { 2349static const struct file_operations tracing_fops = {
2342 .open = tracing_open, 2350 .open = tracing_open,
2343 .read = seq_read, 2351 .read = seq_read,
2344 .write = tracing_write_stub, 2352 .write = tracing_write_stub,
2345 .llseek = seq_lseek, 2353 .llseek = tracing_seek,
2346 .release = tracing_release, 2354 .release = tracing_release,
2347}; 2355};
2348 2356
diff --git a/kernel/user.c b/kernel/user.c
index 2c7d8d5914b1..5c598ca781df 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -158,6 +158,7 @@ struct user_struct *alloc_uid(struct user_namespace *ns, uid_t uid)
158 spin_lock_irq(&uidhash_lock); 158 spin_lock_irq(&uidhash_lock);
159 up = uid_hash_find(uid, hashent); 159 up = uid_hash_find(uid, hashent);
160 if (up) { 160 if (up) {
161 put_user_ns(ns);
161 key_put(new->uid_keyring); 162 key_put(new->uid_keyring);
162 key_put(new->session_keyring); 163 key_put(new->session_keyring);
163 kmem_cache_free(uid_cachep, new); 164 kmem_cache_free(uid_cachep, new);
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 14b8120d5232..c812c4927cab 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -364,7 +364,8 @@ static int watchdog_nmi_enable(int cpu)
364 goto out_save; 364 goto out_save;
365 } 365 }
366 366
367 printk(KERN_ERR "NMI watchdog failed to create perf event on cpu%i: %p\n", cpu, event); 367 printk(KERN_ERR "NMI watchdog disabled for cpu%i: unable to create perf event: %ld\n",
368 cpu, PTR_ERR(event));
368 return PTR_ERR(event); 369 return PTR_ERR(event);
369 370
370 /* success path */ 371 /* success path */