Diffstat (limited to 'kernel')
-rw-r--r--  kernel/audit.c                   2
-rw-r--r--  kernel/audit_tree.c              1
-rw-r--r--  kernel/auditfilter.c             3
-rw-r--r--  kernel/cgroup.c                 31
-rw-r--r--  kernel/context_tracking.c       41
-rw-r--r--  kernel/cpu.c                    55
-rw-r--r--  kernel/cpu/idle.c               17
-rw-r--r--  kernel/events/core.c           233
-rw-r--r--  kernel/events/hw_breakpoint.c    6
-rw-r--r--  kernel/events/internal.h         4
-rw-r--r--  kernel/exit.c                    2
-rw-r--r--  kernel/irq/irqdomain.c           9
-rw-r--r--  kernel/kmod.c                    5
-rw-r--r--  kernel/kprobes.c                30
-rw-r--r--  kernel/module.c                 21
-rw-r--r--  kernel/posix-cpu-timers.c      395
-rw-r--r--  kernel/printk.c                 91
-rw-r--r--  kernel/ptrace.c                 20
-rw-r--r--  kernel/range.c                  19
-rw-r--r--  kernel/rcutree.c                21
-rw-r--r--  kernel/rcutree.h                 2
-rw-r--r--  kernel/rcutree_plugin.h          4
-rw-r--r--  kernel/sched/core.c             23
-rw-r--r--  kernel/sched/cputime.c           6
-rw-r--r--  kernel/sched/stats.h            39
-rw-r--r--  kernel/softirq.c                13
-rw-r--r--  kernel/sys.c                    29
-rw-r--r--  kernel/time/ntp.c                1
-rw-r--r--  kernel/time/tick-broadcast.c    19
-rw-r--r--  kernel/time/tick-sched.c         2
-rw-r--r--  kernel/time/timekeeping.c        8
-rw-r--r--  kernel/trace/ftrace.c           18
-rw-r--r--  kernel/trace/ring_buffer.c       3
-rw-r--r--  kernel/trace/trace.c            27
-rw-r--r--  kernel/trace/trace.h             2
-rw-r--r--  kernel/trace/trace_events.c      4
-rw-r--r--  kernel/trace/trace_selftest.c    2
-rw-r--r--  kernel/workqueue.c              10
38 files changed, 691 insertions(+), 527 deletions(-)
diff --git a/kernel/audit.c b/kernel/audit.c
index 21c7fa615bd3..91e53d04b6a9 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -1056,7 +1056,7 @@ static inline void audit_get_stamp(struct audit_context *ctx,
1056static void wait_for_auditd(unsigned long sleep_time) 1056static void wait_for_auditd(unsigned long sleep_time)
1057{ 1057{
1058 DECLARE_WAITQUEUE(wait, current); 1058 DECLARE_WAITQUEUE(wait, current);
1059 set_current_state(TASK_INTERRUPTIBLE); 1059 set_current_state(TASK_UNINTERRUPTIBLE);
1060 add_wait_queue(&audit_backlog_wait, &wait); 1060 add_wait_queue(&audit_backlog_wait, &wait);
1061 1061
1062 if (audit_backlog_limit && 1062 if (audit_backlog_limit &&
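
Note: the hunk above only flips the sleep state of wait_for_auditd() from TASK_INTERRUPTIBLE to TASK_UNINTERRUPTIBLE inside the usual open-coded wait-queue sleep. For readers unfamiliar with that idiom, here is a minimal sketch of it; the queue name, function name and timeout are illustrative placeholders, not the audit code itself.

    #include <linux/sched.h>
    #include <linux/wait.h>

    /* Placeholder wait queue; audit uses audit_backlog_wait instead. */
    static DECLARE_WAIT_QUEUE_HEAD(my_wait_queue);

    static void sleep_on_queue(unsigned long timeout)
    {
            DECLARE_WAITQUEUE(wait, current);

            /* Uninterruptible: pending signals must not cut the sleep short. */
            set_current_state(TASK_UNINTERRUPTIBLE);
            add_wait_queue(&my_wait_queue, &wait);

            schedule_timeout(timeout);      /* sleep until timeout or wake_up() */

            remove_wait_queue(&my_wait_queue, &wait);
            __set_current_state(TASK_RUNNING);
    }
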
diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c
index a291aa23fb3f..43c307dc9453 100644
--- a/kernel/audit_tree.c
+++ b/kernel/audit_tree.c
@@ -658,6 +658,7 @@ int audit_add_tree_rule(struct audit_krule *rule)
658 struct vfsmount *mnt; 658 struct vfsmount *mnt;
659 int err; 659 int err;
660 660
661 rule->tree = NULL;
661 list_for_each_entry(tree, &tree_list, list) { 662 list_for_each_entry(tree, &tree_list, list) {
662 if (!strcmp(seed->pathname, tree->pathname)) { 663 if (!strcmp(seed->pathname, tree->pathname)) {
663 put_tree(seed); 664 put_tree(seed);
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index 83a2970295d1..6bd4a90d1991 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -1021,9 +1021,6 @@ static void audit_log_rule_change(char *action, struct audit_krule *rule, int re
1021 * @seq: netlink audit message sequence (serial) number 1021 * @seq: netlink audit message sequence (serial) number
1022 * @data: payload data 1022 * @data: payload data
1023 * @datasz: size of payload data 1023 * @datasz: size of payload data
1024 * @loginuid: loginuid of sender
1025 * @sessionid: sessionid for netlink audit message
1026 * @sid: SE Linux Security ID of sender
1027 */ 1024 */
1028int audit_receive_filter(int type, int pid, int seq, void *data, size_t datasz) 1025int audit_receive_filter(int type, int pid, int seq, void *data, size_t datasz)
1029{ 1026{
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 2a9926275f80..a7c9e6ddb979 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -1686,11 +1686,14 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
1686 */ 1686 */
1687 cgroup_drop_root(opts.new_root); 1687 cgroup_drop_root(opts.new_root);
1688 1688
1689 if (((root->flags | opts.flags) & CGRP_ROOT_SANE_BEHAVIOR) && 1689 if (root->flags != opts.flags) {
1690 root->flags != opts.flags) { 1690 if ((root->flags | opts.flags) & CGRP_ROOT_SANE_BEHAVIOR) {
1691 pr_err("cgroup: sane_behavior: new mount options should match the existing superblock\n"); 1691 pr_err("cgroup: sane_behavior: new mount options should match the existing superblock\n");
1692 ret = -EINVAL; 1692 ret = -EINVAL;
1693 goto drop_new_super; 1693 goto drop_new_super;
1694 } else {
1695 pr_warning("cgroup: new mount options do not match the existing superblock, will be ignored\n");
1696 }
1694 } 1697 }
1695 1698
1696 /* no subsys rebinding, so refcounts don't change */ 1699 /* no subsys rebinding, so refcounts don't change */
@@ -2699,13 +2702,14 @@ static int cgroup_add_file(struct cgroup *cgrp, struct cgroup_subsys *subsys,
2699 goto out; 2702 goto out;
2700 } 2703 }
2701 2704
2705 cfe->type = (void *)cft;
2706 cfe->dentry = dentry;
2707 dentry->d_fsdata = cfe;
2708 simple_xattrs_init(&cfe->xattrs);
2709
2702 mode = cgroup_file_mode(cft); 2710 mode = cgroup_file_mode(cft);
2703 error = cgroup_create_file(dentry, mode | S_IFREG, cgrp->root->sb); 2711 error = cgroup_create_file(dentry, mode | S_IFREG, cgrp->root->sb);
2704 if (!error) { 2712 if (!error) {
2705 cfe->type = (void *)cft;
2706 cfe->dentry = dentry;
2707 dentry->d_fsdata = cfe;
2708 simple_xattrs_init(&cfe->xattrs);
2709 list_add_tail(&cfe->node, &parent->files); 2713 list_add_tail(&cfe->node, &parent->files);
2710 cfe = NULL; 2714 cfe = NULL;
2711 } 2715 }
@@ -2953,11 +2957,8 @@ struct cgroup *cgroup_next_descendant_pre(struct cgroup *pos,
2953 WARN_ON_ONCE(!rcu_read_lock_held()); 2957 WARN_ON_ONCE(!rcu_read_lock_held());
2954 2958
2955 /* if first iteration, pretend we just visited @cgroup */ 2959 /* if first iteration, pretend we just visited @cgroup */
2956 if (!pos) { 2960 if (!pos)
2957 if (list_empty(&cgroup->children))
2958 return NULL;
2959 pos = cgroup; 2961 pos = cgroup;
2960 }
2961 2962
2962 /* visit the first child if exists */ 2963 /* visit the first child if exists */
2963 next = list_first_or_null_rcu(&pos->children, struct cgroup, sibling); 2964 next = list_first_or_null_rcu(&pos->children, struct cgroup, sibling);
@@ -2965,14 +2966,14 @@ struct cgroup *cgroup_next_descendant_pre(struct cgroup *pos,
2965 return next; 2966 return next;
2966 2967
2967 /* no child, visit my or the closest ancestor's next sibling */ 2968 /* no child, visit my or the closest ancestor's next sibling */
2968 do { 2969 while (pos != cgroup) {
2969 next = list_entry_rcu(pos->sibling.next, struct cgroup, 2970 next = list_entry_rcu(pos->sibling.next, struct cgroup,
2970 sibling); 2971 sibling);
2971 if (&next->sibling != &pos->parent->children) 2972 if (&next->sibling != &pos->parent->children)
2972 return next; 2973 return next;
2973 2974
2974 pos = pos->parent; 2975 pos = pos->parent;
2975 } while (pos != cgroup); 2976 }
2976 2977
2977 return NULL; 2978 return NULL;
2978} 2979}
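
The rewritten cgroup_next_descendant_pre() above is a standard pre-order "next" step: visit the first child if there is one, otherwise climb towards the subtree root looking for the closest ancestor's next sibling. A small runnable userspace sketch of the same traversal logic follows; the node type and its parent/first_child/next_sibling fields are hypothetical stand-ins for the RCU-protected cgroup lists.

    #include <stdio.h>
    #include <stddef.h>

    /* Hypothetical tree node; cgroups use RCU-protected sibling lists. */
    struct node {
            const char *name;
            struct node *parent;
            struct node *first_child;
            struct node *next_sibling;
    };

    /* Pre-order successor of @pos among the descendants of @root. */
    static struct node *next_descendant_pre(struct node *pos, struct node *root)
    {
            if (!pos)                       /* first iteration: pretend we visited root */
                    pos = root;

            if (pos->first_child)           /* visit the first child if it exists */
                    return pos->first_child;

            while (pos != root) {           /* else the closest ancestor's next sibling */
                    if (pos->next_sibling)
                            return pos->next_sibling;
                    pos = pos->parent;
            }
            return NULL;                    /* walked back up to the subtree root */
    }

    int main(void)
    {
            struct node a = { "a" }, b = { "b" }, c = { "c" }, d = { "d" };

            /* a -> (b -> d, c) */
            a.first_child = &b;
            b.parent = &a; b.next_sibling = &c; b.first_child = &d;
            c.parent = &a;
            d.parent = &b;

            for (struct node *p = next_descendant_pre(NULL, &a); p;
                 p = next_descendant_pre(p, &a))
                    printf("%s\n", p->name);        /* prints: b d c */
            return 0;
    }
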
diff --git a/kernel/context_tracking.c b/kernel/context_tracking.c
index 65349f07b878..383f8231e436 100644
--- a/kernel/context_tracking.c
+++ b/kernel/context_tracking.c
@@ -15,7 +15,6 @@
15 */ 15 */
16 16
17#include <linux/context_tracking.h> 17#include <linux/context_tracking.h>
18#include <linux/kvm_host.h>
19#include <linux/rcupdate.h> 18#include <linux/rcupdate.h>
20#include <linux/sched.h> 19#include <linux/sched.h>
21#include <linux/hardirq.h> 20#include <linux/hardirq.h>
@@ -71,6 +70,46 @@ void user_enter(void)
71 local_irq_restore(flags); 70 local_irq_restore(flags);
72} 71}
73 72
73#ifdef CONFIG_PREEMPT
74/**
75 * preempt_schedule_context - preempt_schedule called by tracing
76 *
77 * The tracing infrastructure uses preempt_enable_notrace to prevent
78 * recursion and tracing preempt enabling caused by the tracing
79 * infrastructure itself. But as tracing can happen in areas coming
80 * from userspace or just about to enter userspace, a preempt enable
81 * can occur before user_exit() is called. This will cause the scheduler
82 * to be called when the system is still in usermode.
83 *
84 * To prevent this, the preempt_enable_notrace will use this function
85 * instead of preempt_schedule() to exit user context if needed before
86 * calling the scheduler.
87 */
88void __sched notrace preempt_schedule_context(void)
89{
90 struct thread_info *ti = current_thread_info();
91 enum ctx_state prev_ctx;
92
93 if (likely(ti->preempt_count || irqs_disabled()))
94 return;
95
96 /*
97 * Need to disable preemption in case user_exit() is traced
98 * and the tracer calls preempt_enable_notrace() causing
99 * an infinite recursion.
100 */
101 preempt_disable_notrace();
102 prev_ctx = exception_enter();
103 preempt_enable_no_resched_notrace();
104
105 preempt_schedule();
106
107 preempt_disable_notrace();
108 exception_exit(prev_ctx);
109 preempt_enable_notrace();
110}
111EXPORT_SYMBOL_GPL(preempt_schedule_context);
112#endif /* CONFIG_PREEMPT */
74 113
75/** 114/**
76 * user_exit - Inform the context tracking that the CPU is 115 * user_exit - Inform the context tracking that the CPU is
diff --git a/kernel/cpu.c b/kernel/cpu.c
index b5e4ab2d427e..198a38883e64 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -133,6 +133,27 @@ static void cpu_hotplug_done(void)
133 mutex_unlock(&cpu_hotplug.lock); 133 mutex_unlock(&cpu_hotplug.lock);
134} 134}
135 135
136/*
137 * Wait for currently running CPU hotplug operations to complete (if any) and
138 * disable future CPU hotplug (from sysfs). The 'cpu_add_remove_lock' protects
139 * the 'cpu_hotplug_disabled' flag. The same lock is also acquired by the
140 * hotplug path before performing hotplug operations. So acquiring that lock
141 * guarantees mutual exclusion from any currently running hotplug operations.
142 */
143void cpu_hotplug_disable(void)
144{
145 cpu_maps_update_begin();
146 cpu_hotplug_disabled = 1;
147 cpu_maps_update_done();
148}
149
150void cpu_hotplug_enable(void)
151{
152 cpu_maps_update_begin();
153 cpu_hotplug_disabled = 0;
154 cpu_maps_update_done();
155}
156
136#else /* #if CONFIG_HOTPLUG_CPU */ 157#else /* #if CONFIG_HOTPLUG_CPU */
137static void cpu_hotplug_begin(void) {} 158static void cpu_hotplug_begin(void) {}
138static void cpu_hotplug_done(void) {} 159static void cpu_hotplug_done(void) {}
@@ -541,36 +562,6 @@ static int __init alloc_frozen_cpus(void)
541core_initcall(alloc_frozen_cpus); 562core_initcall(alloc_frozen_cpus);
542 563
543/* 564/*
544 * Prevent regular CPU hotplug from racing with the freezer, by disabling CPU
545 * hotplug when tasks are about to be frozen. Also, don't allow the freezer
546 * to continue until any currently running CPU hotplug operation gets
547 * completed.
548 * To modify the 'cpu_hotplug_disabled' flag, we need to acquire the
549 * 'cpu_add_remove_lock'. And this same lock is also taken by the regular
550 * CPU hotplug path and released only after it is complete. Thus, we
551 * (and hence the freezer) will block here until any currently running CPU
552 * hotplug operation gets completed.
553 */
554void cpu_hotplug_disable_before_freeze(void)
555{
556 cpu_maps_update_begin();
557 cpu_hotplug_disabled = 1;
558 cpu_maps_update_done();
559}
560
561
562/*
563 * When tasks have been thawed, re-enable regular CPU hotplug (which had been
564 * disabled while beginning to freeze tasks).
565 */
566void cpu_hotplug_enable_after_thaw(void)
567{
568 cpu_maps_update_begin();
569 cpu_hotplug_disabled = 0;
570 cpu_maps_update_done();
571}
572
573/*
574 * When callbacks for CPU hotplug notifications are being executed, we must 565 * When callbacks for CPU hotplug notifications are being executed, we must
575 * ensure that the state of the system with respect to the tasks being frozen 566 * ensure that the state of the system with respect to the tasks being frozen
576 * or not, as reported by the notification, remains unchanged *throughout the 567 * or not, as reported by the notification, remains unchanged *throughout the
@@ -589,12 +580,12 @@ cpu_hotplug_pm_callback(struct notifier_block *nb,
589 580
590 case PM_SUSPEND_PREPARE: 581 case PM_SUSPEND_PREPARE:
591 case PM_HIBERNATION_PREPARE: 582 case PM_HIBERNATION_PREPARE:
592 cpu_hotplug_disable_before_freeze(); 583 cpu_hotplug_disable();
593 break; 584 break;
594 585
595 case PM_POST_SUSPEND: 586 case PM_POST_SUSPEND:
596 case PM_POST_HIBERNATION: 587 case PM_POST_HIBERNATION:
597 cpu_hotplug_enable_after_thaw(); 588 cpu_hotplug_enable();
598 break; 589 break;
599 590
600 default: 591 default:
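
The hunks above turn the freezer-specific helpers into generic cpu_hotplug_disable()/cpu_hotplug_enable(). A hedged sketch of how some other caller might use the pair to keep the online CPU set stable around a sensitive operation; the function and its body are illustrative only and assume the declarations added to <linux/cpu.h> alongside this change.

    #include <linux/cpu.h>

    /* Illustrative only: keep the set of online CPUs stable around some work. */
    static void do_work_with_stable_cpus(void)
    {
            cpu_hotplug_disable();          /* further hotplug attempts fail with -EBUSY */

            /*
             * ... operate on the current set of online CPUs; none can be
             * brought up or torn down until we re-enable hotplug below ...
             */

            cpu_hotplug_enable();
    }
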
diff --git a/kernel/cpu/idle.c b/kernel/cpu/idle.c
index d5585f5e038e..e695c0a0bcb5 100644
--- a/kernel/cpu/idle.c
+++ b/kernel/cpu/idle.c
@@ -5,6 +5,7 @@
5#include <linux/cpu.h> 5#include <linux/cpu.h>
6#include <linux/tick.h> 6#include <linux/tick.h>
7#include <linux/mm.h> 7#include <linux/mm.h>
8#include <linux/stackprotector.h>
8 9
9#include <asm/tlb.h> 10#include <asm/tlb.h>
10 11
@@ -58,6 +59,7 @@ void __weak arch_cpu_idle_dead(void) { }
58void __weak arch_cpu_idle(void) 59void __weak arch_cpu_idle(void)
59{ 60{
60 cpu_idle_force_poll = 1; 61 cpu_idle_force_poll = 1;
62 local_irq_enable();
61} 63}
62 64
63/* 65/*
@@ -112,6 +114,21 @@ static void cpu_idle_loop(void)
112 114
113void cpu_startup_entry(enum cpuhp_state state) 115void cpu_startup_entry(enum cpuhp_state state)
114{ 116{
117 /*
118 * This #ifdef needs to die, but it's too late in the cycle to
119 * make this generic (arm and sh have never invoked the canary
120 * init for the non boot cpus!). Will be fixed in 3.11
121 */
122#ifdef CONFIG_X86
123 /*
124 * If we're the non-boot CPU, nothing set the stack canary up
125 * for us. The boot CPU already has it initialized but no harm
126 * in doing it again. This is a good place for updating it, as
127 * we wont ever return from this function (so the invalid
128 * canaries already on the stack wont ever trigger).
129 */
130 boot_init_stack_canary();
131#endif
115 current_set_polling(); 132 current_set_polling();
116 arch_cpu_idle_prepare(); 133 arch_cpu_idle_prepare();
117 cpu_idle_loop(); 134 cpu_idle_loop();
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 9dc297faf7c0..b391907d5352 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -196,9 +196,6 @@ static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
196static void update_context_time(struct perf_event_context *ctx); 196static void update_context_time(struct perf_event_context *ctx);
197static u64 perf_event_time(struct perf_event *event); 197static u64 perf_event_time(struct perf_event *event);
198 198
199static void ring_buffer_attach(struct perf_event *event,
200 struct ring_buffer *rb);
201
202void __weak perf_event_print_debug(void) { } 199void __weak perf_event_print_debug(void) { }
203 200
204extern __weak const char *perf_pmu_name(void) 201extern __weak const char *perf_pmu_name(void)
@@ -2918,6 +2915,7 @@ static void free_event_rcu(struct rcu_head *head)
2918} 2915}
2919 2916
2920static void ring_buffer_put(struct ring_buffer *rb); 2917static void ring_buffer_put(struct ring_buffer *rb);
2918static void ring_buffer_detach(struct perf_event *event, struct ring_buffer *rb);
2921 2919
2922static void free_event(struct perf_event *event) 2920static void free_event(struct perf_event *event)
2923{ 2921{
@@ -2942,15 +2940,30 @@ static void free_event(struct perf_event *event)
2942 if (has_branch_stack(event)) { 2940 if (has_branch_stack(event)) {
2943 static_key_slow_dec_deferred(&perf_sched_events); 2941 static_key_slow_dec_deferred(&perf_sched_events);
2944 /* is system-wide event */ 2942 /* is system-wide event */
2945 if (!(event->attach_state & PERF_ATTACH_TASK)) 2943 if (!(event->attach_state & PERF_ATTACH_TASK)) {
2946 atomic_dec(&per_cpu(perf_branch_stack_events, 2944 atomic_dec(&per_cpu(perf_branch_stack_events,
2947 event->cpu)); 2945 event->cpu));
2946 }
2948 } 2947 }
2949 } 2948 }
2950 2949
2951 if (event->rb) { 2950 if (event->rb) {
2952 ring_buffer_put(event->rb); 2951 struct ring_buffer *rb;
2953 event->rb = NULL; 2952
2953 /*
2954 * Can happen when we close an event with re-directed output.
2955 *
2956 * Since we have a 0 refcount, perf_mmap_close() will skip
2957 * over us; possibly making our ring_buffer_put() the last.
2958 */
2959 mutex_lock(&event->mmap_mutex);
2960 rb = event->rb;
2961 if (rb) {
2962 rcu_assign_pointer(event->rb, NULL);
2963 ring_buffer_detach(event, rb);
2964 ring_buffer_put(rb); /* could be last */
2965 }
2966 mutex_unlock(&event->mmap_mutex);
2954 } 2967 }
2955 2968
2956 if (is_cgroup_event(event)) 2969 if (is_cgroup_event(event))
@@ -3188,30 +3201,13 @@ static unsigned int perf_poll(struct file *file, poll_table *wait)
3188 unsigned int events = POLL_HUP; 3201 unsigned int events = POLL_HUP;
3189 3202
3190 /* 3203 /*
3191 * Race between perf_event_set_output() and perf_poll(): perf_poll() 3204 * Pin the event->rb by taking event->mmap_mutex; otherwise
3192 * grabs the rb reference but perf_event_set_output() overrides it. 3205 * perf_event_set_output() can swizzle our rb and make us miss wakeups.
3193 * Here is the timeline for two threads T1, T2:
3194 * t0: T1, rb = rcu_dereference(event->rb)
3195 * t1: T2, old_rb = event->rb
3196 * t2: T2, event->rb = new rb
3197 * t3: T2, ring_buffer_detach(old_rb)
3198 * t4: T1, ring_buffer_attach(rb1)
3199 * t5: T1, poll_wait(event->waitq)
3200 *
3201 * To avoid this problem, we grab mmap_mutex in perf_poll()
3202 * thereby ensuring that the assignment of the new ring buffer
3203 * and the detachment of the old buffer appear atomic to perf_poll()
3204 */ 3206 */
3205 mutex_lock(&event->mmap_mutex); 3207 mutex_lock(&event->mmap_mutex);
3206 3208 rb = event->rb;
3207 rcu_read_lock(); 3209 if (rb)
3208 rb = rcu_dereference(event->rb);
3209 if (rb) {
3210 ring_buffer_attach(event, rb);
3211 events = atomic_xchg(&rb->poll, 0); 3210 events = atomic_xchg(&rb->poll, 0);
3212 }
3213 rcu_read_unlock();
3214
3215 mutex_unlock(&event->mmap_mutex); 3211 mutex_unlock(&event->mmap_mutex);
3216 3212
3217 poll_wait(file, &event->waitq, wait); 3213 poll_wait(file, &event->waitq, wait);
@@ -3521,16 +3517,12 @@ static void ring_buffer_attach(struct perf_event *event,
3521 return; 3517 return;
3522 3518
3523 spin_lock_irqsave(&rb->event_lock, flags); 3519 spin_lock_irqsave(&rb->event_lock, flags);
3524 if (!list_empty(&event->rb_entry)) 3520 if (list_empty(&event->rb_entry))
3525 goto unlock; 3521 list_add(&event->rb_entry, &rb->event_list);
3526
3527 list_add(&event->rb_entry, &rb->event_list);
3528unlock:
3529 spin_unlock_irqrestore(&rb->event_lock, flags); 3522 spin_unlock_irqrestore(&rb->event_lock, flags);
3530} 3523}
3531 3524
3532static void ring_buffer_detach(struct perf_event *event, 3525static void ring_buffer_detach(struct perf_event *event, struct ring_buffer *rb)
3533 struct ring_buffer *rb)
3534{ 3526{
3535 unsigned long flags; 3527 unsigned long flags;
3536 3528
@@ -3549,13 +3541,10 @@ static void ring_buffer_wakeup(struct perf_event *event)
3549 3541
3550 rcu_read_lock(); 3542 rcu_read_lock();
3551 rb = rcu_dereference(event->rb); 3543 rb = rcu_dereference(event->rb);
3552 if (!rb) 3544 if (rb) {
3553 goto unlock; 3545 list_for_each_entry_rcu(event, &rb->event_list, rb_entry)
3554 3546 wake_up_all(&event->waitq);
3555 list_for_each_entry_rcu(event, &rb->event_list, rb_entry) 3547 }
3556 wake_up_all(&event->waitq);
3557
3558unlock:
3559 rcu_read_unlock(); 3548 rcu_read_unlock();
3560} 3549}
3561 3550
@@ -3584,18 +3573,10 @@ static struct ring_buffer *ring_buffer_get(struct perf_event *event)
3584 3573
3585static void ring_buffer_put(struct ring_buffer *rb) 3574static void ring_buffer_put(struct ring_buffer *rb)
3586{ 3575{
3587 struct perf_event *event, *n;
3588 unsigned long flags;
3589
3590 if (!atomic_dec_and_test(&rb->refcount)) 3576 if (!atomic_dec_and_test(&rb->refcount))
3591 return; 3577 return;
3592 3578
3593 spin_lock_irqsave(&rb->event_lock, flags); 3579 WARN_ON_ONCE(!list_empty(&rb->event_list));
3594 list_for_each_entry_safe(event, n, &rb->event_list, rb_entry) {
3595 list_del_init(&event->rb_entry);
3596 wake_up_all(&event->waitq);
3597 }
3598 spin_unlock_irqrestore(&rb->event_lock, flags);
3599 3580
3600 call_rcu(&rb->rcu_head, rb_free_rcu); 3581 call_rcu(&rb->rcu_head, rb_free_rcu);
3601} 3582}
@@ -3605,26 +3586,100 @@ static void perf_mmap_open(struct vm_area_struct *vma)
3605 struct perf_event *event = vma->vm_file->private_data; 3586 struct perf_event *event = vma->vm_file->private_data;
3606 3587
3607 atomic_inc(&event->mmap_count); 3588 atomic_inc(&event->mmap_count);
3589 atomic_inc(&event->rb->mmap_count);
3608} 3590}
3609 3591
3592/*
3593 * A buffer can be mmap()ed multiple times; either directly through the same
3594 * event, or through other events by use of perf_event_set_output().
3595 *
3596 * In order to undo the VM accounting done by perf_mmap() we need to destroy
3597 * the buffer here, where we still have a VM context. This means we need
3598 * to detach all events redirecting to us.
3599 */
3610static void perf_mmap_close(struct vm_area_struct *vma) 3600static void perf_mmap_close(struct vm_area_struct *vma)
3611{ 3601{
3612 struct perf_event *event = vma->vm_file->private_data; 3602 struct perf_event *event = vma->vm_file->private_data;
3613 3603
3614 if (atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex)) { 3604 struct ring_buffer *rb = event->rb;
3615 unsigned long size = perf_data_size(event->rb); 3605 struct user_struct *mmap_user = rb->mmap_user;
3616 struct user_struct *user = event->mmap_user; 3606 int mmap_locked = rb->mmap_locked;
3617 struct ring_buffer *rb = event->rb; 3607 unsigned long size = perf_data_size(rb);
3608
3609 atomic_dec(&rb->mmap_count);
3610
3611 if (!atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex))
3612 return;
3618 3613
3619 atomic_long_sub((size >> PAGE_SHIFT) + 1, &user->locked_vm); 3614 /* Detach current event from the buffer. */
3620 vma->vm_mm->pinned_vm -= event->mmap_locked; 3615 rcu_assign_pointer(event->rb, NULL);
3621 rcu_assign_pointer(event->rb, NULL); 3616 ring_buffer_detach(event, rb);
3622 ring_buffer_detach(event, rb); 3617 mutex_unlock(&event->mmap_mutex);
3618
3619 /* If there's still other mmap()s of this buffer, we're done. */
3620 if (atomic_read(&rb->mmap_count)) {
3621 ring_buffer_put(rb); /* can't be last */
3622 return;
3623 }
3624
3625 /*
3626 * No other mmap()s, detach from all other events that might redirect
3627 * into the now unreachable buffer. Somewhat complicated by the
3628 * fact that rb::event_lock otherwise nests inside mmap_mutex.
3629 */
3630again:
3631 rcu_read_lock();
3632 list_for_each_entry_rcu(event, &rb->event_list, rb_entry) {
3633 if (!atomic_long_inc_not_zero(&event->refcount)) {
3634 /*
3635 * This event is en-route to free_event() which will
3636 * detach it and remove it from the list.
3637 */
3638 continue;
3639 }
3640 rcu_read_unlock();
3641
3642 mutex_lock(&event->mmap_mutex);
3643 /*
3644 * Check we didn't race with perf_event_set_output() which can
3645 * swizzle the rb from under us while we were waiting to
3646 * acquire mmap_mutex.
3647 *
3648 * If we find a different rb; ignore this event, a next
3649 * iteration will no longer find it on the list. We have to
3650 * still restart the iteration to make sure we're not now
3651 * iterating the wrong list.
3652 */
3653 if (event->rb == rb) {
3654 rcu_assign_pointer(event->rb, NULL);
3655 ring_buffer_detach(event, rb);
3656 ring_buffer_put(rb); /* can't be last, we still have one */
3657 }
3623 mutex_unlock(&event->mmap_mutex); 3658 mutex_unlock(&event->mmap_mutex);
3659 put_event(event);
3624 3660
3625 ring_buffer_put(rb); 3661 /*
3626 free_uid(user); 3662 * Restart the iteration; either we're on the wrong list or
3663 * destroyed its integrity by doing a deletion.
3664 */
3665 goto again;
3627 } 3666 }
3667 rcu_read_unlock();
3668
3669 /*
3670 * It could be there's still a few 0-ref events on the list; they'll
3671 * get cleaned up by free_event() -- they'll also still have their
3672 * ref on the rb and will free it whenever they are done with it.
3673 *
3674 * Aside from that, this buffer is 'fully' detached and unmapped,
3675 * undo the VM accounting.
3676 */
3677
3678 atomic_long_sub((size >> PAGE_SHIFT) + 1, &mmap_user->locked_vm);
3679 vma->vm_mm->pinned_vm -= mmap_locked;
3680 free_uid(mmap_user);
3681
3682 ring_buffer_put(rb); /* could be last */
3628} 3683}
3629 3684
3630static const struct vm_operations_struct perf_mmap_vmops = { 3685static const struct vm_operations_struct perf_mmap_vmops = {
@@ -3674,12 +3729,24 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
3674 return -EINVAL; 3729 return -EINVAL;
3675 3730
3676 WARN_ON_ONCE(event->ctx->parent_ctx); 3731 WARN_ON_ONCE(event->ctx->parent_ctx);
3732again:
3677 mutex_lock(&event->mmap_mutex); 3733 mutex_lock(&event->mmap_mutex);
3678 if (event->rb) { 3734 if (event->rb) {
3679 if (event->rb->nr_pages == nr_pages) 3735 if (event->rb->nr_pages != nr_pages) {
3680 atomic_inc(&event->rb->refcount);
3681 else
3682 ret = -EINVAL; 3736 ret = -EINVAL;
3737 goto unlock;
3738 }
3739
3740 if (!atomic_inc_not_zero(&event->rb->mmap_count)) {
3741 /*
3742 * Raced against perf_mmap_close() through
3743 * perf_event_set_output(). Try again, hope for better
3744 * luck.
3745 */
3746 mutex_unlock(&event->mmap_mutex);
3747 goto again;
3748 }
3749
3683 goto unlock; 3750 goto unlock;
3684 } 3751 }
3685 3752
@@ -3720,12 +3787,16 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
3720 ret = -ENOMEM; 3787 ret = -ENOMEM;
3721 goto unlock; 3788 goto unlock;
3722 } 3789 }
3723 rcu_assign_pointer(event->rb, rb); 3790
3791 atomic_set(&rb->mmap_count, 1);
3792 rb->mmap_locked = extra;
3793 rb->mmap_user = get_current_user();
3724 3794
3725 atomic_long_add(user_extra, &user->locked_vm); 3795 atomic_long_add(user_extra, &user->locked_vm);
3726 event->mmap_locked = extra; 3796 vma->vm_mm->pinned_vm += extra;
3727 event->mmap_user = get_current_user(); 3797
3728 vma->vm_mm->pinned_vm += event->mmap_locked; 3798 ring_buffer_attach(event, rb);
3799 rcu_assign_pointer(event->rb, rb);
3729 3800
3730 perf_event_update_userpage(event); 3801 perf_event_update_userpage(event);
3731 3802
@@ -3734,7 +3805,11 @@ unlock:
3734 atomic_inc(&event->mmap_count); 3805 atomic_inc(&event->mmap_count);
3735 mutex_unlock(&event->mmap_mutex); 3806 mutex_unlock(&event->mmap_mutex);
3736 3807
3737 vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP; 3808 /*
3809 * Since pinned accounting is per vm we cannot allow fork() to copy our
3810 * vma.
3811 */
3812 vma->vm_flags |= VM_DONTCOPY | VM_DONTEXPAND | VM_DONTDUMP;
3738 vma->vm_ops = &perf_mmap_vmops; 3813 vma->vm_ops = &perf_mmap_vmops;
3739 3814
3740 return ret; 3815 return ret;
@@ -6412,6 +6487,8 @@ set:
6412 if (atomic_read(&event->mmap_count)) 6487 if (atomic_read(&event->mmap_count))
6413 goto unlock; 6488 goto unlock;
6414 6489
6490 old_rb = event->rb;
6491
6415 if (output_event) { 6492 if (output_event) {
6416 /* get the rb we want to redirect to */ 6493 /* get the rb we want to redirect to */
6417 rb = ring_buffer_get(output_event); 6494 rb = ring_buffer_get(output_event);
@@ -6419,16 +6496,28 @@ set:
6419 goto unlock; 6496 goto unlock;
6420 } 6497 }
6421 6498
6422 old_rb = event->rb;
6423 rcu_assign_pointer(event->rb, rb);
6424 if (old_rb) 6499 if (old_rb)
6425 ring_buffer_detach(event, old_rb); 6500 ring_buffer_detach(event, old_rb);
6501
6502 if (rb)
6503 ring_buffer_attach(event, rb);
6504
6505 rcu_assign_pointer(event->rb, rb);
6506
6507 if (old_rb) {
6508 ring_buffer_put(old_rb);
6509 /*
6510 * Since we detached before setting the new rb, so that we
6511 * could attach the new rb, we could have missed a wakeup.
6512 * Provide it now.
6513 */
6514 wake_up_all(&event->waitq);
6515 }
6516
6426 ret = 0; 6517 ret = 0;
6427unlock: 6518unlock:
6428 mutex_unlock(&event->mmap_mutex); 6519 mutex_unlock(&event->mmap_mutex);
6429 6520
6430 if (old_rb)
6431 ring_buffer_put(old_rb);
6432out: 6521out:
6433 return ret; 6522 return ret;
6434} 6523}
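
Several of the perf_mmap()/perf_mmap_close() changes above revolve around one idiom: take a reference with atomic_inc_not_zero() and, if the object is already headed to zero, back off and retry. Below is a small self-contained userspace sketch of that "pin or retry" pattern using C11 atomics; the struct and function names are made up, the kernel uses atomic_t plus mmap_mutex.

    #include <stdatomic.h>
    #include <stdbool.h>
    #include <stdio.h>

    /* Hypothetical refcounted object standing in for the perf ring buffer. */
    struct buf {
            atomic_int refcount;
    };

    /* Take a reference only if the object is still live (refcount > 0). */
    static bool buf_tryget(struct buf *b)
    {
            int old = atomic_load(&b->refcount);

            while (old != 0) {
                    if (atomic_compare_exchange_weak(&b->refcount, &old, old + 1))
                            return true;    /* pinned: refcount went old -> old + 1 */
            }
            return false;                   /* already zero: caller must retry or give up */
    }

    static void buf_put(struct buf *b)
    {
            if (atomic_fetch_sub(&b->refcount, 1) == 1)
                    printf("last reference dropped, free the buffer here\n");
    }

    int main(void)
    {
            struct buf b = { .refcount = 1 };

            if (buf_tryget(&b))             /* succeeds: 1 -> 2 */
                    buf_put(&b);            /* 2 -> 1 */
            buf_put(&b);                    /* 1 -> 0: prints the free message */

            struct buf dead = { .refcount = 0 };
            printf("tryget on dead buffer: %s\n", buf_tryget(&dead) ? "ok" : "failed");
            return 0;
    }
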
diff --git a/kernel/events/hw_breakpoint.c b/kernel/events/hw_breakpoint.c
index a64f8aeb5c1f..20185ea64aa6 100644
--- a/kernel/events/hw_breakpoint.c
+++ b/kernel/events/hw_breakpoint.c
@@ -120,7 +120,7 @@ static int task_bp_pinned(int cpu, struct perf_event *bp, enum bp_type_idx type)
120 list_for_each_entry(iter, &bp_task_head, hw.bp_list) { 120 list_for_each_entry(iter, &bp_task_head, hw.bp_list) {
121 if (iter->hw.bp_target == tsk && 121 if (iter->hw.bp_target == tsk &&
122 find_slot_idx(iter) == type && 122 find_slot_idx(iter) == type &&
123 cpu == iter->cpu) 123 (iter->cpu < 0 || cpu == iter->cpu))
124 count += hw_breakpoint_weight(iter); 124 count += hw_breakpoint_weight(iter);
125 } 125 }
126 126
@@ -149,7 +149,7 @@ fetch_bp_busy_slots(struct bp_busy_slots *slots, struct perf_event *bp,
149 return; 149 return;
150 } 150 }
151 151
152 for_each_online_cpu(cpu) { 152 for_each_possible_cpu(cpu) {
153 unsigned int nr; 153 unsigned int nr;
154 154
155 nr = per_cpu(nr_cpu_bp_pinned[type], cpu); 155 nr = per_cpu(nr_cpu_bp_pinned[type], cpu);
@@ -235,7 +235,7 @@ toggle_bp_slot(struct perf_event *bp, bool enable, enum bp_type_idx type,
235 if (cpu >= 0) { 235 if (cpu >= 0) {
236 toggle_bp_task_slot(bp, cpu, enable, type, weight); 236 toggle_bp_task_slot(bp, cpu, enable, type, weight);
237 } else { 237 } else {
238 for_each_online_cpu(cpu) 238 for_each_possible_cpu(cpu)
239 toggle_bp_task_slot(bp, cpu, enable, type, weight); 239 toggle_bp_task_slot(bp, cpu, enable, type, weight);
240 } 240 }
241 241
diff --git a/kernel/events/internal.h b/kernel/events/internal.h
index eb675c4d59df..ca6599723be5 100644
--- a/kernel/events/internal.h
+++ b/kernel/events/internal.h
@@ -31,6 +31,10 @@ struct ring_buffer {
31 spinlock_t event_lock; 31 spinlock_t event_lock;
32 struct list_head event_list; 32 struct list_head event_list;
33 33
34 atomic_t mmap_count;
35 unsigned long mmap_locked;
36 struct user_struct *mmap_user;
37
34 struct perf_event_mmap_page *user_page; 38 struct perf_event_mmap_page *user_page;
35 void *data_pages[0]; 39 void *data_pages[0];
36}; 40};
diff --git a/kernel/exit.c b/kernel/exit.c
index af2eb3cbd499..7bb73f9d09db 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -649,7 +649,6 @@ static void exit_notify(struct task_struct *tsk, int group_dead)
649 * jobs, send them a SIGHUP and then a SIGCONT. (POSIX 3.2.2.2) 649 * jobs, send them a SIGHUP and then a SIGCONT. (POSIX 3.2.2.2)
650 */ 650 */
651 forget_original_parent(tsk); 651 forget_original_parent(tsk);
652 exit_task_namespaces(tsk);
653 652
654 write_lock_irq(&tasklist_lock); 653 write_lock_irq(&tasklist_lock);
655 if (group_dead) 654 if (group_dead)
@@ -795,6 +794,7 @@ void do_exit(long code)
795 exit_shm(tsk); 794 exit_shm(tsk);
796 exit_files(tsk); 795 exit_files(tsk);
797 exit_fs(tsk); 796 exit_fs(tsk);
797 exit_task_namespaces(tsk);
798 exit_task_work(tsk); 798 exit_task_work(tsk);
799 check_stack_usage(); 799 check_stack_usage();
800 exit_thread(); 800 exit_thread();
diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c
index 5a83dde8ca0c..54a4d5223238 100644
--- a/kernel/irq/irqdomain.c
+++ b/kernel/irq/irqdomain.c
@@ -143,7 +143,10 @@ static unsigned int irq_domain_legacy_revmap(struct irq_domain *domain,
143 * irq_domain_add_simple() - Allocate and register a simple irq_domain. 143 * irq_domain_add_simple() - Allocate and register a simple irq_domain.
144 * @of_node: pointer to interrupt controller's device tree node. 144 * @of_node: pointer to interrupt controller's device tree node.
145 * @size: total number of irqs in mapping 145 * @size: total number of irqs in mapping
146 * @first_irq: first number of irq block assigned to the domain 146 * @first_irq: first number of irq block assigned to the domain,
147 * pass zero to assign irqs on-the-fly. This will result in a
148 * linear IRQ domain so it is important to use irq_create_mapping()
149 * for each used IRQ, especially when SPARSE_IRQ is enabled.
147 * @ops: map/unmap domain callbacks 150 * @ops: map/unmap domain callbacks
148 * @host_data: Controller private data pointer 151 * @host_data: Controller private data pointer
149 * 152 *
@@ -191,6 +194,7 @@ struct irq_domain *irq_domain_add_simple(struct device_node *of_node,
191 /* A linear domain is the default */ 194 /* A linear domain is the default */
192 return irq_domain_add_linear(of_node, size, ops, host_data); 195 return irq_domain_add_linear(of_node, size, ops, host_data);
193} 196}
197EXPORT_SYMBOL_GPL(irq_domain_add_simple);
194 198
195/** 199/**
196 * irq_domain_add_legacy() - Allocate and register a legacy revmap irq_domain. 200 * irq_domain_add_legacy() - Allocate and register a legacy revmap irq_domain.
@@ -397,11 +401,12 @@ static void irq_domain_disassociate_many(struct irq_domain *domain,
397 while (count--) { 401 while (count--) {
398 int irq = irq_base + count; 402 int irq = irq_base + count;
399 struct irq_data *irq_data = irq_get_irq_data(irq); 403 struct irq_data *irq_data = irq_get_irq_data(irq);
400 irq_hw_number_t hwirq = irq_data->hwirq; 404 irq_hw_number_t hwirq;
401 405
402 if (WARN_ON(!irq_data || irq_data->domain != domain)) 406 if (WARN_ON(!irq_data || irq_data->domain != domain))
403 continue; 407 continue;
404 408
409 hwirq = irq_data->hwirq;
405 irq_set_status_flags(irq, IRQ_NOREQUEST); 410 irq_set_status_flags(irq, IRQ_NOREQUEST);
406 411
407 /* remove chip and handler */ 412 /* remove chip and handler */
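
The expanded kerneldoc above notes that passing first_irq == 0 to irq_domain_add_simple() yields a linear domain, so each hardware IRQ must then be mapped explicitly with irq_create_mapping(). A hedged driver-style sketch of that usage; the ops table, device node pointer, domain size and hwirq number are placeholders, not taken from this patch.

    #include <linux/errno.h>
    #include <linux/irq.h>
    #include <linux/irqdomain.h>
    #include <linux/of.h>

    /* Placeholder ops; a real controller fills in .map/.unmap/.xlate. */
    static const struct irq_domain_ops my_irq_ops = {
            .xlate = irq_domain_xlate_onecell,
    };

    static int my_controller_probe(struct device_node *np)
    {
            struct irq_domain *domain;
            unsigned int virq;

            /* first_irq == 0: a linear domain, virqs are allocated on demand */
            domain = irq_domain_add_simple(np, 32, 0, &my_irq_ops, NULL);
            if (!domain)
                    return -ENOMEM;

            /* ... so every used hwirq needs an explicit mapping, e.g. hwirq 5: */
            virq = irq_create_mapping(domain, 5);
            if (!virq)
                    return -EINVAL;

            return 0;
    }
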
diff --git a/kernel/kmod.c b/kernel/kmod.c
index 1296e72e4161..8241906c4b61 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -569,6 +569,11 @@ int call_usermodehelper_exec(struct subprocess_info *sub_info, int wait)
569 int retval = 0; 569 int retval = 0;
570 570
571 helper_lock(); 571 helper_lock();
572 if (!sub_info->path) {
573 retval = -EINVAL;
574 goto out;
575 }
576
572 if (sub_info->path[0] == '\0') 577 if (sub_info->path[0] == '\0')
573 goto out; 578 goto out;
574 579
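
With the added check, call_usermodehelper_exec() now fails cleanly with -EINVAL when a caller never set the helper path, instead of dereferencing NULL in the '\0' test below. A hedged sketch of a typical caller, to show where that path comes from; the helper binary and its arguments are made up.

    #include <linux/kmod.h>

    /* Illustrative caller; "/sbin/my-helper" and its arguments are made up. */
    static int run_my_helper(const char *event)
    {
            char *argv[] = { "/sbin/my-helper", (char *)event, NULL };
            char *envp[] = { "HOME=/", "PATH=/sbin:/usr/sbin:/bin:/usr/bin", NULL };

            /*
             * The first argument becomes sub_info->path; were it NULL, the new
             * check above would return -EINVAL rather than oops.
             */
            return call_usermodehelper(argv[0], argv, envp, UMH_WAIT_PROC);
    }
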
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 3fed7f0cbcdf..bddf3b201a48 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -467,6 +467,7 @@ static struct kprobe *__kprobes get_optimized_kprobe(unsigned long addr)
467/* Optimization staging list, protected by kprobe_mutex */ 467/* Optimization staging list, protected by kprobe_mutex */
468static LIST_HEAD(optimizing_list); 468static LIST_HEAD(optimizing_list);
469static LIST_HEAD(unoptimizing_list); 469static LIST_HEAD(unoptimizing_list);
470static LIST_HEAD(freeing_list);
470 471
471static void kprobe_optimizer(struct work_struct *work); 472static void kprobe_optimizer(struct work_struct *work);
472static DECLARE_DELAYED_WORK(optimizing_work, kprobe_optimizer); 473static DECLARE_DELAYED_WORK(optimizing_work, kprobe_optimizer);
@@ -504,7 +505,7 @@ static __kprobes void do_optimize_kprobes(void)
504 * Unoptimize (replace a jump with a breakpoint and remove the breakpoint 505 * Unoptimize (replace a jump with a breakpoint and remove the breakpoint
505 * if need) kprobes listed on unoptimizing_list. 506 * if need) kprobes listed on unoptimizing_list.
506 */ 507 */
507static __kprobes void do_unoptimize_kprobes(struct list_head *free_list) 508static __kprobes void do_unoptimize_kprobes(void)
508{ 509{
509 struct optimized_kprobe *op, *tmp; 510 struct optimized_kprobe *op, *tmp;
510 511
@@ -515,9 +516,9 @@ static __kprobes void do_unoptimize_kprobes(struct list_head *free_list)
515 /* Ditto to do_optimize_kprobes */ 516 /* Ditto to do_optimize_kprobes */
516 get_online_cpus(); 517 get_online_cpus();
517 mutex_lock(&text_mutex); 518 mutex_lock(&text_mutex);
518 arch_unoptimize_kprobes(&unoptimizing_list, free_list); 519 arch_unoptimize_kprobes(&unoptimizing_list, &freeing_list);
519 /* Loop free_list for disarming */ 520 /* Loop free_list for disarming */
520 list_for_each_entry_safe(op, tmp, free_list, list) { 521 list_for_each_entry_safe(op, tmp, &freeing_list, list) {
521 /* Disarm probes if marked disabled */ 522 /* Disarm probes if marked disabled */
522 if (kprobe_disabled(&op->kp)) 523 if (kprobe_disabled(&op->kp))
523 arch_disarm_kprobe(&op->kp); 524 arch_disarm_kprobe(&op->kp);
@@ -536,11 +537,11 @@ static __kprobes void do_unoptimize_kprobes(struct list_head *free_list)
536} 537}
537 538
538/* Reclaim all kprobes on the free_list */ 539/* Reclaim all kprobes on the free_list */
539static __kprobes void do_free_cleaned_kprobes(struct list_head *free_list) 540static __kprobes void do_free_cleaned_kprobes(void)
540{ 541{
541 struct optimized_kprobe *op, *tmp; 542 struct optimized_kprobe *op, *tmp;
542 543
543 list_for_each_entry_safe(op, tmp, free_list, list) { 544 list_for_each_entry_safe(op, tmp, &freeing_list, list) {
544 BUG_ON(!kprobe_unused(&op->kp)); 545 BUG_ON(!kprobe_unused(&op->kp));
545 list_del_init(&op->list); 546 list_del_init(&op->list);
546 free_aggr_kprobe(&op->kp); 547 free_aggr_kprobe(&op->kp);
@@ -556,8 +557,6 @@ static __kprobes void kick_kprobe_optimizer(void)
556/* Kprobe jump optimizer */ 557/* Kprobe jump optimizer */
557static __kprobes void kprobe_optimizer(struct work_struct *work) 558static __kprobes void kprobe_optimizer(struct work_struct *work)
558{ 559{
559 LIST_HEAD(free_list);
560
561 mutex_lock(&kprobe_mutex); 560 mutex_lock(&kprobe_mutex);
562 /* Lock modules while optimizing kprobes */ 561 /* Lock modules while optimizing kprobes */
563 mutex_lock(&module_mutex); 562 mutex_lock(&module_mutex);
@@ -566,7 +565,7 @@ static __kprobes void kprobe_optimizer(struct work_struct *work)
566 * Step 1: Unoptimize kprobes and collect cleaned (unused and disarmed) 565 * Step 1: Unoptimize kprobes and collect cleaned (unused and disarmed)
567 * kprobes before waiting for quiesence period. 566 * kprobes before waiting for quiesence period.
568 */ 567 */
569 do_unoptimize_kprobes(&free_list); 568 do_unoptimize_kprobes();
570 569
571 /* 570 /*
572 * Step 2: Wait for quiesence period to ensure all running interrupts 571 * Step 2: Wait for quiesence period to ensure all running interrupts
@@ -581,7 +580,7 @@ static __kprobes void kprobe_optimizer(struct work_struct *work)
581 do_optimize_kprobes(); 580 do_optimize_kprobes();
582 581
583 /* Step 4: Free cleaned kprobes after quiesence period */ 582 /* Step 4: Free cleaned kprobes after quiesence period */
584 do_free_cleaned_kprobes(&free_list); 583 do_free_cleaned_kprobes();
585 584
586 mutex_unlock(&module_mutex); 585 mutex_unlock(&module_mutex);
587 mutex_unlock(&kprobe_mutex); 586 mutex_unlock(&kprobe_mutex);
@@ -723,8 +722,19 @@ static void __kprobes kill_optimized_kprobe(struct kprobe *p)
723 if (!list_empty(&op->list)) 722 if (!list_empty(&op->list))
724 /* Dequeue from the (un)optimization queue */ 723 /* Dequeue from the (un)optimization queue */
725 list_del_init(&op->list); 724 list_del_init(&op->list);
726
727 op->kp.flags &= ~KPROBE_FLAG_OPTIMIZED; 725 op->kp.flags &= ~KPROBE_FLAG_OPTIMIZED;
726
727 if (kprobe_unused(p)) {
728 /* Enqueue if it is unused */
729 list_add(&op->list, &freeing_list);
730 /*
731 * Remove unused probes from the hash list. After waiting
732 * for synchronization, this probe is reclaimed.
733 * (reclaiming is done by do_free_cleaned_kprobes().)
734 */
735 hlist_del_rcu(&op->kp.hlist);
736 }
737
728 /* Don't touch the code, because it is already freed. */ 738 /* Don't touch the code, because it is already freed. */
729 arch_remove_optimized_kprobe(op); 739 arch_remove_optimized_kprobe(op);
730} 740}
diff --git a/kernel/module.c b/kernel/module.c
index b049939177f6..cab4bce49c23 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -2431,10 +2431,10 @@ static void kmemleak_load_module(const struct module *mod,
2431 kmemleak_scan_area(mod, sizeof(struct module), GFP_KERNEL); 2431 kmemleak_scan_area(mod, sizeof(struct module), GFP_KERNEL);
2432 2432
2433 for (i = 1; i < info->hdr->e_shnum; i++) { 2433 for (i = 1; i < info->hdr->e_shnum; i++) {
2434 const char *name = info->secstrings + info->sechdrs[i].sh_name; 2434 /* Scan all writable sections that's not executable */
2435 if (!(info->sechdrs[i].sh_flags & SHF_ALLOC)) 2435 if (!(info->sechdrs[i].sh_flags & SHF_ALLOC) ||
2436 continue; 2436 !(info->sechdrs[i].sh_flags & SHF_WRITE) ||
2437 if (!strstarts(name, ".data") && !strstarts(name, ".bss")) 2437 (info->sechdrs[i].sh_flags & SHF_EXECINSTR))
2438 continue; 2438 continue;
2439 2439
2440 kmemleak_scan_area((void *)info->sechdrs[i].sh_addr, 2440 kmemleak_scan_area((void *)info->sechdrs[i].sh_addr,
@@ -2769,24 +2769,11 @@ static void find_module_sections(struct module *mod, struct load_info *info)
2769 mod->trace_events = section_objs(info, "_ftrace_events", 2769 mod->trace_events = section_objs(info, "_ftrace_events",
2770 sizeof(*mod->trace_events), 2770 sizeof(*mod->trace_events),
2771 &mod->num_trace_events); 2771 &mod->num_trace_events);
2772 /*
2773 * This section contains pointers to allocated objects in the trace
2774 * code and not scanning it leads to false positives.
2775 */
2776 kmemleak_scan_area(mod->trace_events, sizeof(*mod->trace_events) *
2777 mod->num_trace_events, GFP_KERNEL);
2778#endif 2772#endif
2779#ifdef CONFIG_TRACING 2773#ifdef CONFIG_TRACING
2780 mod->trace_bprintk_fmt_start = section_objs(info, "__trace_printk_fmt", 2774 mod->trace_bprintk_fmt_start = section_objs(info, "__trace_printk_fmt",
2781 sizeof(*mod->trace_bprintk_fmt_start), 2775 sizeof(*mod->trace_bprintk_fmt_start),
2782 &mod->num_trace_bprintk_fmt); 2776 &mod->num_trace_bprintk_fmt);
2783 /*
2784 * This section contains pointers to allocated objects in the trace
2785 * code and not scanning it leads to false positives.
2786 */
2787 kmemleak_scan_area(mod->trace_bprintk_fmt_start,
2788 sizeof(*mod->trace_bprintk_fmt_start) *
2789 mod->num_trace_bprintk_fmt, GFP_KERNEL);
2790#endif 2777#endif
2791#ifdef CONFIG_FTRACE_MCOUNT_RECORD 2778#ifdef CONFIG_FTRACE_MCOUNT_RECORD
2792 /* sechdrs[0].sh_size is always zero */ 2779 /* sechdrs[0].sh_size is always zero */
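
The new kmemleak filter above scans every allocated, writable, non-executable module section rather than only those named ".data*" or ".bss*". The predicate itself is plain ELF section-flag arithmetic; here is a tiny runnable userspace illustration using <elf.h>, with fabricated section headers.

    #include <elf.h>
    #include <stdbool.h>
    #include <stdio.h>

    /* Same predicate as the new kmemleak filter: allocated, writable, not code. */
    static bool is_scannable_section(const Elf64_Shdr *shdr)
    {
            return (shdr->sh_flags & SHF_ALLOC) &&
                   (shdr->sh_flags & SHF_WRITE) &&
                   !(shdr->sh_flags & SHF_EXECINSTR);
    }

    int main(void)
    {
            /* Fabricated headers: one .data-like and one .text-like section. */
            Elf64_Shdr data = { .sh_flags = SHF_ALLOC | SHF_WRITE };
            Elf64_Shdr text = { .sh_flags = SHF_ALLOC | SHF_EXECINSTR };

            printf("data-like section scanned: %s\n",
                   is_scannable_section(&data) ? "yes" : "no");   /* yes */
            printf("text-like section scanned: %s\n",
                   is_scannable_section(&text) ? "yes" : "no");   /* no */
            return 0;
    }
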
diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c
index 42670e9b44e0..c7f31aa272f7 100644
--- a/kernel/posix-cpu-timers.c
+++ b/kernel/posix-cpu-timers.c
@@ -51,59 +51,28 @@ static int check_clock(const clockid_t which_clock)
51 return error; 51 return error;
52} 52}
53 53
54static inline union cpu_time_count 54static inline unsigned long long
55timespec_to_sample(const clockid_t which_clock, const struct timespec *tp) 55timespec_to_sample(const clockid_t which_clock, const struct timespec *tp)
56{ 56{
57 union cpu_time_count ret; 57 unsigned long long ret;
58 ret.sched = 0; /* high half always zero when .cpu used */ 58
59 ret = 0; /* high half always zero when .cpu used */
59 if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED) { 60 if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED) {
60 ret.sched = (unsigned long long)tp->tv_sec * NSEC_PER_SEC + tp->tv_nsec; 61 ret = (unsigned long long)tp->tv_sec * NSEC_PER_SEC + tp->tv_nsec;
61 } else { 62 } else {
62 ret.cpu = timespec_to_cputime(tp); 63 ret = cputime_to_expires(timespec_to_cputime(tp));
63 } 64 }
64 return ret; 65 return ret;
65} 66}
66 67
67static void sample_to_timespec(const clockid_t which_clock, 68static void sample_to_timespec(const clockid_t which_clock,
68 union cpu_time_count cpu, 69 unsigned long long expires,
69 struct timespec *tp) 70 struct timespec *tp)
70{ 71{
71 if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED) 72 if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED)
72 *tp = ns_to_timespec(cpu.sched); 73 *tp = ns_to_timespec(expires);
73 else 74 else
74 cputime_to_timespec(cpu.cpu, tp); 75 cputime_to_timespec((__force cputime_t)expires, tp);
75}
76
77static inline int cpu_time_before(const clockid_t which_clock,
78 union cpu_time_count now,
79 union cpu_time_count then)
80{
81 if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED) {
82 return now.sched < then.sched;
83 } else {
84 return now.cpu < then.cpu;
85 }
86}
87static inline void cpu_time_add(const clockid_t which_clock,
88 union cpu_time_count *acc,
89 union cpu_time_count val)
90{
91 if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED) {
92 acc->sched += val.sched;
93 } else {
94 acc->cpu += val.cpu;
95 }
96}
97static inline union cpu_time_count cpu_time_sub(const clockid_t which_clock,
98 union cpu_time_count a,
99 union cpu_time_count b)
100{
101 if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED) {
102 a.sched -= b.sched;
103 } else {
104 a.cpu -= b.cpu;
105 }
106 return a;
107} 76}
108 77
109/* 78/*
@@ -111,47 +80,31 @@ static inline union cpu_time_count cpu_time_sub(const clockid_t which_clock,
111 * given the current clock sample. 80 * given the current clock sample.
112 */ 81 */
113static void bump_cpu_timer(struct k_itimer *timer, 82static void bump_cpu_timer(struct k_itimer *timer,
114 union cpu_time_count now) 83 unsigned long long now)
115{ 84{
116 int i; 85 int i;
86 unsigned long long delta, incr;
117 87
118 if (timer->it.cpu.incr.sched == 0) 88 if (timer->it.cpu.incr == 0)
119 return; 89 return;
120 90
121 if (CPUCLOCK_WHICH(timer->it_clock) == CPUCLOCK_SCHED) { 91 if (now < timer->it.cpu.expires)
122 unsigned long long delta, incr; 92 return;
123 93
124 if (now.sched < timer->it.cpu.expires.sched) 94 incr = timer->it.cpu.incr;
125 return; 95 delta = now + incr - timer->it.cpu.expires;
126 incr = timer->it.cpu.incr.sched;
127 delta = now.sched + incr - timer->it.cpu.expires.sched;
128 /* Don't use (incr*2 < delta), incr*2 might overflow. */
129 for (i = 0; incr < delta - incr; i++)
130 incr = incr << 1;
131 for (; i >= 0; incr >>= 1, i--) {
132 if (delta < incr)
133 continue;
134 timer->it.cpu.expires.sched += incr;
135 timer->it_overrun += 1 << i;
136 delta -= incr;
137 }
138 } else {
139 cputime_t delta, incr;
140 96
141 if (now.cpu < timer->it.cpu.expires.cpu) 97 /* Don't use (incr*2 < delta), incr*2 might overflow. */
142 return; 98 for (i = 0; incr < delta - incr; i++)
143 incr = timer->it.cpu.incr.cpu; 99 incr = incr << 1;
144 delta = now.cpu + incr - timer->it.cpu.expires.cpu; 100
145 /* Don't use (incr*2 < delta), incr*2 might overflow. */ 101 for (; i >= 0; incr >>= 1, i--) {
146 for (i = 0; incr < delta - incr; i++) 102 if (delta < incr)
147 incr += incr; 103 continue;
148 for (; i >= 0; incr = incr >> 1, i--) { 104
149 if (delta < incr) 105 timer->it.cpu.expires += incr;
150 continue; 106 timer->it_overrun += 1 << i;
151 timer->it.cpu.expires.cpu += incr; 107 delta -= incr;
152 timer->it_overrun += 1 << i;
153 delta -= incr;
154 }
155 } 108 }
156} 109}
157 110
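
The consolidated bump_cpu_timer() above keeps the overflow-safe overrun computation: instead of testing incr*2 < delta (which can overflow), it doubles incr while incr < delta - incr, then walks back down, counting skipped periods in powers of two. A runnable userspace rendering of just that arithmetic follows, with hypothetical values; it mirrors the patched logic but is not the kernel function itself.

    #include <stdio.h>

    /*
     * Overflow-safe overrun accounting, mirroring bump_cpu_timer():
     * advance *expires past "now" in whole multiples of incr and return
     * how many periods were skipped.
     */
    static unsigned long long
    bump_expires(unsigned long long *expires, unsigned long long incr,
                 unsigned long long now)
    {
            unsigned long long delta, overruns = 0;
            int i;

            if (incr == 0 || now < *expires)
                    return 0;

            delta = now + incr - *expires;

            /* Don't use (incr*2 < delta): incr*2 might overflow. */
            for (i = 0; incr < delta - incr; i++)
                    incr = incr << 1;

            for (; i >= 0; incr >>= 1, i--) {
                    if (delta < incr)
                            continue;
                    *expires += incr;
                    overruns += 1ULL << i;
                    delta -= incr;
            }
            return overruns;
    }

    int main(void)
    {
            unsigned long long expires = 100;       /* hypothetical: period 30, now 1000 */
            unsigned long long overruns = bump_expires(&expires, 30, 1000);

            /* prints: overruns = 31, new expires = 1030 */
            printf("overruns = %llu, new expires = %llu\n", overruns, expires);
            return 0;
    }
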
@@ -170,21 +123,21 @@ static inline int task_cputime_zero(const struct task_cputime *cputime)
170 return 0; 123 return 0;
171} 124}
172 125
173static inline cputime_t prof_ticks(struct task_struct *p) 126static inline unsigned long long prof_ticks(struct task_struct *p)
174{ 127{
175 cputime_t utime, stime; 128 cputime_t utime, stime;
176 129
177 task_cputime(p, &utime, &stime); 130 task_cputime(p, &utime, &stime);
178 131
179 return utime + stime; 132 return cputime_to_expires(utime + stime);
180} 133}
181static inline cputime_t virt_ticks(struct task_struct *p) 134static inline unsigned long long virt_ticks(struct task_struct *p)
182{ 135{
183 cputime_t utime; 136 cputime_t utime;
184 137
185 task_cputime(p, &utime, NULL); 138 task_cputime(p, &utime, NULL);
186 139
187 return utime; 140 return cputime_to_expires(utime);
188} 141}
189 142
190static int 143static int
@@ -225,19 +178,19 @@ posix_cpu_clock_set(const clockid_t which_clock, const struct timespec *tp)
225 * Sample a per-thread clock for the given task. 178 * Sample a per-thread clock for the given task.
226 */ 179 */
227static int cpu_clock_sample(const clockid_t which_clock, struct task_struct *p, 180static int cpu_clock_sample(const clockid_t which_clock, struct task_struct *p,
228 union cpu_time_count *cpu) 181 unsigned long long *sample)
229{ 182{
230 switch (CPUCLOCK_WHICH(which_clock)) { 183 switch (CPUCLOCK_WHICH(which_clock)) {
231 default: 184 default:
232 return -EINVAL; 185 return -EINVAL;
233 case CPUCLOCK_PROF: 186 case CPUCLOCK_PROF:
234 cpu->cpu = prof_ticks(p); 187 *sample = prof_ticks(p);
235 break; 188 break;
236 case CPUCLOCK_VIRT: 189 case CPUCLOCK_VIRT:
237 cpu->cpu = virt_ticks(p); 190 *sample = virt_ticks(p);
238 break; 191 break;
239 case CPUCLOCK_SCHED: 192 case CPUCLOCK_SCHED:
240 cpu->sched = task_sched_runtime(p); 193 *sample = task_sched_runtime(p);
241 break; 194 break;
242 } 195 }
243 return 0; 196 return 0;
@@ -284,7 +237,7 @@ void thread_group_cputimer(struct task_struct *tsk, struct task_cputime *times)
284 */ 237 */
285static int cpu_clock_sample_group(const clockid_t which_clock, 238static int cpu_clock_sample_group(const clockid_t which_clock,
286 struct task_struct *p, 239 struct task_struct *p,
287 union cpu_time_count *cpu) 240 unsigned long long *sample)
288{ 241{
289 struct task_cputime cputime; 242 struct task_cputime cputime;
290 243
@@ -293,15 +246,15 @@ static int cpu_clock_sample_group(const clockid_t which_clock,
293 return -EINVAL; 246 return -EINVAL;
294 case CPUCLOCK_PROF: 247 case CPUCLOCK_PROF:
295 thread_group_cputime(p, &cputime); 248 thread_group_cputime(p, &cputime);
296 cpu->cpu = cputime.utime + cputime.stime; 249 *sample = cputime_to_expires(cputime.utime + cputime.stime);
297 break; 250 break;
298 case CPUCLOCK_VIRT: 251 case CPUCLOCK_VIRT:
299 thread_group_cputime(p, &cputime); 252 thread_group_cputime(p, &cputime);
300 cpu->cpu = cputime.utime; 253 *sample = cputime_to_expires(cputime.utime);
301 break; 254 break;
302 case CPUCLOCK_SCHED: 255 case CPUCLOCK_SCHED:
303 thread_group_cputime(p, &cputime); 256 thread_group_cputime(p, &cputime);
304 cpu->sched = cputime.sum_exec_runtime; 257 *sample = cputime.sum_exec_runtime;
305 break; 258 break;
306 } 259 }
307 return 0; 260 return 0;
@@ -312,7 +265,7 @@ static int posix_cpu_clock_get(const clockid_t which_clock, struct timespec *tp)
312{ 265{
313 const pid_t pid = CPUCLOCK_PID(which_clock); 266 const pid_t pid = CPUCLOCK_PID(which_clock);
314 int error = -EINVAL; 267 int error = -EINVAL;
315 union cpu_time_count rtn; 268 unsigned long long rtn;
316 269
317 if (pid == 0) { 270 if (pid == 0) {
318 /* 271 /*
@@ -446,6 +399,15 @@ static int posix_cpu_timer_del(struct k_itimer *timer)
446 return ret; 399 return ret;
447} 400}
448 401
402static void cleanup_timers_list(struct list_head *head,
403 unsigned long long curr)
404{
405 struct cpu_timer_list *timer, *next;
406
407 list_for_each_entry_safe(timer, next, head, entry)
408 list_del_init(&timer->entry);
409}
410
449/* 411/*
450 * Clean out CPU timers still ticking when a thread exited. The task 412 * Clean out CPU timers still ticking when a thread exited. The task
451 * pointer is cleared, and the expiry time is replaced with the residual 413 * pointer is cleared, and the expiry time is replaced with the residual
@@ -456,37 +418,12 @@ static void cleanup_timers(struct list_head *head,
456 cputime_t utime, cputime_t stime, 418 cputime_t utime, cputime_t stime,
457 unsigned long long sum_exec_runtime) 419 unsigned long long sum_exec_runtime)
458{ 420{
459 struct cpu_timer_list *timer, *next;
460 cputime_t ptime = utime + stime;
461
462 list_for_each_entry_safe(timer, next, head, entry) {
463 list_del_init(&timer->entry);
464 if (timer->expires.cpu < ptime) {
465 timer->expires.cpu = 0;
466 } else {
467 timer->expires.cpu -= ptime;
468 }
469 }
470 421
471 ++head; 422 cputime_t ptime = utime + stime;
472 list_for_each_entry_safe(timer, next, head, entry) {
473 list_del_init(&timer->entry);
474 if (timer->expires.cpu < utime) {
475 timer->expires.cpu = 0;
476 } else {
477 timer->expires.cpu -= utime;
478 }
479 }
480 423
481 ++head; 424 cleanup_timers_list(head, cputime_to_expires(ptime));
482 list_for_each_entry_safe(timer, next, head, entry) { 425 cleanup_timers_list(++head, cputime_to_expires(utime));
483 list_del_init(&timer->entry); 426 cleanup_timers_list(++head, sum_exec_runtime);
484 if (timer->expires.sched < sum_exec_runtime) {
485 timer->expires.sched = 0;
486 } else {
487 timer->expires.sched -= sum_exec_runtime;
488 }
489 }
490} 427}
491 428
492/* 429/*
@@ -516,17 +453,21 @@ void posix_cpu_timers_exit_group(struct task_struct *tsk)
516 tsk->se.sum_exec_runtime + sig->sum_sched_runtime); 453 tsk->se.sum_exec_runtime + sig->sum_sched_runtime);
517} 454}
518 455
519static void clear_dead_task(struct k_itimer *timer, union cpu_time_count now) 456static void clear_dead_task(struct k_itimer *itimer, unsigned long long now)
520{ 457{
458 struct cpu_timer_list *timer = &itimer->it.cpu;
459
521 /* 460 /*
522 * That's all for this thread or process. 461 * That's all for this thread or process.
523 * We leave our residual in expires to be reported. 462 * We leave our residual in expires to be reported.
524 */ 463 */
525 put_task_struct(timer->it.cpu.task); 464 put_task_struct(timer->task);
526 timer->it.cpu.task = NULL; 465 timer->task = NULL;
527 timer->it.cpu.expires = cpu_time_sub(timer->it_clock, 466 if (timer->expires < now) {
528 timer->it.cpu.expires, 467 timer->expires = 0;
529 now); 468 } else {
469 timer->expires -= now;
470 }
530} 471}
531 472
532static inline int expires_gt(cputime_t expires, cputime_t new_exp) 473static inline int expires_gt(cputime_t expires, cputime_t new_exp)
@@ -558,14 +499,14 @@ static void arm_timer(struct k_itimer *timer)
558 499
559 listpos = head; 500 listpos = head;
560 list_for_each_entry(next, head, entry) { 501 list_for_each_entry(next, head, entry) {
561 if (cpu_time_before(timer->it_clock, nt->expires, next->expires)) 502 if (nt->expires < next->expires)
562 break; 503 break;
563 listpos = &next->entry; 504 listpos = &next->entry;
564 } 505 }
565 list_add(&nt->entry, listpos); 506 list_add(&nt->entry, listpos);
566 507
567 if (listpos == head) { 508 if (listpos == head) {
568 union cpu_time_count *exp = &nt->expires; 509 unsigned long long exp = nt->expires;
569 510
570 /* 511 /*
571 * We are the new earliest-expiring POSIX 1.b timer, hence 512 * We are the new earliest-expiring POSIX 1.b timer, hence
@@ -576,17 +517,17 @@ static void arm_timer(struct k_itimer *timer)
576 517
577 switch (CPUCLOCK_WHICH(timer->it_clock)) { 518 switch (CPUCLOCK_WHICH(timer->it_clock)) {
578 case CPUCLOCK_PROF: 519 case CPUCLOCK_PROF:
579 if (expires_gt(cputime_expires->prof_exp, exp->cpu)) 520 if (expires_gt(cputime_expires->prof_exp, expires_to_cputime(exp)))
580 cputime_expires->prof_exp = exp->cpu; 521 cputime_expires->prof_exp = expires_to_cputime(exp);
581 break; 522 break;
582 case CPUCLOCK_VIRT: 523 case CPUCLOCK_VIRT:
583 if (expires_gt(cputime_expires->virt_exp, exp->cpu)) 524 if (expires_gt(cputime_expires->virt_exp, expires_to_cputime(exp)))
584 cputime_expires->virt_exp = exp->cpu; 525 cputime_expires->virt_exp = expires_to_cputime(exp);
585 break; 526 break;
586 case CPUCLOCK_SCHED: 527 case CPUCLOCK_SCHED:
587 if (cputime_expires->sched_exp == 0 || 528 if (cputime_expires->sched_exp == 0 ||
588 cputime_expires->sched_exp > exp->sched) 529 cputime_expires->sched_exp > exp)
589 cputime_expires->sched_exp = exp->sched; 530 cputime_expires->sched_exp = exp;
590 break; 531 break;
591 } 532 }
592 } 533 }
@@ -601,20 +542,20 @@ static void cpu_timer_fire(struct k_itimer *timer)
601 /* 542 /*
602 * User don't want any signal. 543 * User don't want any signal.
603 */ 544 */
604 timer->it.cpu.expires.sched = 0; 545 timer->it.cpu.expires = 0;
605 } else if (unlikely(timer->sigq == NULL)) { 546 } else if (unlikely(timer->sigq == NULL)) {
606 /* 547 /*
607 * This a special case for clock_nanosleep, 548 * This a special case for clock_nanosleep,
608 * not a normal timer from sys_timer_create. 549 * not a normal timer from sys_timer_create.
609 */ 550 */
610 wake_up_process(timer->it_process); 551 wake_up_process(timer->it_process);
611 timer->it.cpu.expires.sched = 0; 552 timer->it.cpu.expires = 0;
612 } else if (timer->it.cpu.incr.sched == 0) { 553 } else if (timer->it.cpu.incr == 0) {
613 /* 554 /*
614 * One-shot timer. Clear it as soon as it's fired. 555 * One-shot timer. Clear it as soon as it's fired.
615 */ 556 */
616 posix_timer_event(timer, 0); 557 posix_timer_event(timer, 0);
617 timer->it.cpu.expires.sched = 0; 558 timer->it.cpu.expires = 0;
618 } else if (posix_timer_event(timer, ++timer->it_requeue_pending)) { 559 } else if (posix_timer_event(timer, ++timer->it_requeue_pending)) {
619 /* 560 /*
620 * The signal did not get queued because the signal 561 * The signal did not get queued because the signal
@@ -632,7 +573,7 @@ static void cpu_timer_fire(struct k_itimer *timer)
632 */ 573 */
633static int cpu_timer_sample_group(const clockid_t which_clock, 574static int cpu_timer_sample_group(const clockid_t which_clock,
634 struct task_struct *p, 575 struct task_struct *p,
635 union cpu_time_count *cpu) 576 unsigned long long *sample)
636{ 577{
637 struct task_cputime cputime; 578 struct task_cputime cputime;
638 579
@@ -641,13 +582,13 @@ static int cpu_timer_sample_group(const clockid_t which_clock,
641 default: 582 default:
642 return -EINVAL; 583 return -EINVAL;
643 case CPUCLOCK_PROF: 584 case CPUCLOCK_PROF:
644 cpu->cpu = cputime.utime + cputime.stime; 585 *sample = cputime_to_expires(cputime.utime + cputime.stime);
645 break; 586 break;
646 case CPUCLOCK_VIRT: 587 case CPUCLOCK_VIRT:
647 cpu->cpu = cputime.utime; 588 *sample = cputime_to_expires(cputime.utime);
648 break; 589 break;
649 case CPUCLOCK_SCHED: 590 case CPUCLOCK_SCHED:
650 cpu->sched = cputime.sum_exec_runtime + task_delta_exec(p); 591 *sample = cputime.sum_exec_runtime + task_delta_exec(p);
651 break; 592 break;
652 } 593 }
653 return 0; 594 return 0;
@@ -694,7 +635,7 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int flags,
694 struct itimerspec *new, struct itimerspec *old) 635 struct itimerspec *new, struct itimerspec *old)
695{ 636{
696 struct task_struct *p = timer->it.cpu.task; 637 struct task_struct *p = timer->it.cpu.task;
697 union cpu_time_count old_expires, new_expires, old_incr, val; 638 unsigned long long old_expires, new_expires, old_incr, val;
698 int ret; 639 int ret;
699 640
700 if (unlikely(p == NULL)) { 641 if (unlikely(p == NULL)) {
@@ -749,7 +690,7 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int flags,
749 } 690 }
750 691
751 if (old) { 692 if (old) {
752 if (old_expires.sched == 0) { 693 if (old_expires == 0) {
753 old->it_value.tv_sec = 0; 694 old->it_value.tv_sec = 0;
754 old->it_value.tv_nsec = 0; 695 old->it_value.tv_nsec = 0;
755 } else { 696 } else {
@@ -764,11 +705,8 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int flags,
764 * new setting. 705 * new setting.
765 */ 706 */
766 bump_cpu_timer(timer, val); 707 bump_cpu_timer(timer, val);
767 if (cpu_time_before(timer->it_clock, val, 708 if (val < timer->it.cpu.expires) {
768 timer->it.cpu.expires)) { 709 old_expires = timer->it.cpu.expires - val;
769 old_expires = cpu_time_sub(
770 timer->it_clock,
771 timer->it.cpu.expires, val);
772 sample_to_timespec(timer->it_clock, 710 sample_to_timespec(timer->it_clock,
773 old_expires, 711 old_expires,
774 &old->it_value); 712 &old->it_value);
@@ -791,8 +729,8 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int flags,
791 goto out; 729 goto out;
792 } 730 }
793 731
794 if (new_expires.sched != 0 && !(flags & TIMER_ABSTIME)) { 732 if (new_expires != 0 && !(flags & TIMER_ABSTIME)) {
795 cpu_time_add(timer->it_clock, &new_expires, val); 733 new_expires += val;
796 } 734 }
797 735
798 /* 736 /*
@@ -801,8 +739,7 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int flags,
801 * arm the timer (we'll just fake it for timer_gettime). 739 * arm the timer (we'll just fake it for timer_gettime).
802 */ 740 */
803 timer->it.cpu.expires = new_expires; 741 timer->it.cpu.expires = new_expires;
804 if (new_expires.sched != 0 && 742 if (new_expires != 0 && val < new_expires) {
805 cpu_time_before(timer->it_clock, val, new_expires)) {
806 arm_timer(timer); 743 arm_timer(timer);
807 } 744 }
808 745
@@ -826,8 +763,7 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int flags,
826 timer->it_overrun_last = 0; 763 timer->it_overrun_last = 0;
827 timer->it_overrun = -1; 764 timer->it_overrun = -1;
828 765
829 if (new_expires.sched != 0 && 766 if (new_expires != 0 && !(val < new_expires)) {
830 !cpu_time_before(timer->it_clock, val, new_expires)) {
831 /* 767 /*
832 * The designated time already passed, so we notify 768 * The designated time already passed, so we notify
833 * immediately, even if the thread never runs to 769 * immediately, even if the thread never runs to
@@ -849,7 +785,7 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int flags,
849 785
850static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *itp) 786static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *itp)
851{ 787{
852 union cpu_time_count now; 788 unsigned long long now;
853 struct task_struct *p = timer->it.cpu.task; 789 struct task_struct *p = timer->it.cpu.task;
854 int clear_dead; 790 int clear_dead;
855 791
@@ -859,7 +795,7 @@ static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *itp)
859 sample_to_timespec(timer->it_clock, 795 sample_to_timespec(timer->it_clock,
860 timer->it.cpu.incr, &itp->it_interval); 796 timer->it.cpu.incr, &itp->it_interval);
861 797
862 if (timer->it.cpu.expires.sched == 0) { /* Timer not armed at all. */ 798 if (timer->it.cpu.expires == 0) { /* Timer not armed at all. */
863 itp->it_value.tv_sec = itp->it_value.tv_nsec = 0; 799 itp->it_value.tv_sec = itp->it_value.tv_nsec = 0;
864 return; 800 return;
865 } 801 }
@@ -891,7 +827,7 @@ static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *itp)
891 */ 827 */
892 put_task_struct(p); 828 put_task_struct(p);
893 timer->it.cpu.task = NULL; 829 timer->it.cpu.task = NULL;
894 timer->it.cpu.expires.sched = 0; 830 timer->it.cpu.expires = 0;
895 read_unlock(&tasklist_lock); 831 read_unlock(&tasklist_lock);
896 goto dead; 832 goto dead;
897 } else { 833 } else {
@@ -912,10 +848,9 @@ static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *itp)
912 goto dead; 848 goto dead;
913 } 849 }
914 850
915 if (cpu_time_before(timer->it_clock, now, timer->it.cpu.expires)) { 851 if (now < timer->it.cpu.expires) {
916 sample_to_timespec(timer->it_clock, 852 sample_to_timespec(timer->it_clock,
917 cpu_time_sub(timer->it_clock, 853 timer->it.cpu.expires - now,
918 timer->it.cpu.expires, now),
919 &itp->it_value); 854 &itp->it_value);
920 } else { 855 } else {
921 /* 856 /*
@@ -927,6 +862,28 @@ static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *itp)
927 } 862 }
928} 863}
929 864
865static unsigned long long
866check_timers_list(struct list_head *timers,
867 struct list_head *firing,
868 unsigned long long curr)
869{
870 int maxfire = 20;
871
872 while (!list_empty(timers)) {
873 struct cpu_timer_list *t;
874
875 t = list_first_entry(timers, struct cpu_timer_list, entry);
876
877 if (!--maxfire || curr < t->expires)
878 return t->expires;
879
880 t->firing = 1;
881 list_move_tail(&t->entry, firing);
882 }
883
884 return 0;
885}
886
930/* 887/*
931 * Check for any per-thread CPU timers that have fired and move them off 888 * Check for any per-thread CPU timers that have fired and move them off
932 * the tsk->cpu_timers[N] list onto the firing list. Here we update the 889 * the tsk->cpu_timers[N] list onto the firing list. Here we update the
@@ -935,54 +892,20 @@ static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *itp)
935static void check_thread_timers(struct task_struct *tsk, 892static void check_thread_timers(struct task_struct *tsk,
936 struct list_head *firing) 893 struct list_head *firing)
937{ 894{
938 int maxfire;
939 struct list_head *timers = tsk->cpu_timers; 895 struct list_head *timers = tsk->cpu_timers;
940 struct signal_struct *const sig = tsk->signal; 896 struct signal_struct *const sig = tsk->signal;
897 struct task_cputime *tsk_expires = &tsk->cputime_expires;
898 unsigned long long expires;
941 unsigned long soft; 899 unsigned long soft;
942 900
943 maxfire = 20; 901 expires = check_timers_list(timers, firing, prof_ticks(tsk));
944 tsk->cputime_expires.prof_exp = 0; 902 tsk_expires->prof_exp = expires_to_cputime(expires);
945 while (!list_empty(timers)) {
946 struct cpu_timer_list *t = list_first_entry(timers,
947 struct cpu_timer_list,
948 entry);
949 if (!--maxfire || prof_ticks(tsk) < t->expires.cpu) {
950 tsk->cputime_expires.prof_exp = t->expires.cpu;
951 break;
952 }
953 t->firing = 1;
954 list_move_tail(&t->entry, firing);
955 }
956 903
957 ++timers; 904 expires = check_timers_list(++timers, firing, virt_ticks(tsk));
958 maxfire = 20; 905 tsk_expires->virt_exp = expires_to_cputime(expires);
959 tsk->cputime_expires.virt_exp = 0;
960 while (!list_empty(timers)) {
961 struct cpu_timer_list *t = list_first_entry(timers,
962 struct cpu_timer_list,
963 entry);
964 if (!--maxfire || virt_ticks(tsk) < t->expires.cpu) {
965 tsk->cputime_expires.virt_exp = t->expires.cpu;
966 break;
967 }
968 t->firing = 1;
969 list_move_tail(&t->entry, firing);
970 }
971 906
972 ++timers; 907 tsk_expires->sched_exp = check_timers_list(++timers, firing,
973 maxfire = 20; 908 tsk->se.sum_exec_runtime);
974 tsk->cputime_expires.sched_exp = 0;
975 while (!list_empty(timers)) {
976 struct cpu_timer_list *t = list_first_entry(timers,
977 struct cpu_timer_list,
978 entry);
979 if (!--maxfire || tsk->se.sum_exec_runtime < t->expires.sched) {
980 tsk->cputime_expires.sched_exp = t->expires.sched;
981 break;
982 }
983 t->firing = 1;
984 list_move_tail(&t->entry, firing);
985 }
986 909
987 /* 910 /*
988 * Check for the special case thread timers. 911 * Check for the special case thread timers.
@@ -1030,7 +953,8 @@ static void stop_process_timers(struct signal_struct *sig)
1030static u32 onecputick; 953static u32 onecputick;
1031 954
1032static void check_cpu_itimer(struct task_struct *tsk, struct cpu_itimer *it, 955static void check_cpu_itimer(struct task_struct *tsk, struct cpu_itimer *it,
1033 cputime_t *expires, cputime_t cur_time, int signo) 956 unsigned long long *expires,
957 unsigned long long cur_time, int signo)
1034{ 958{
1035 if (!it->expires) 959 if (!it->expires)
1036 return; 960 return;
@@ -1066,9 +990,8 @@ static void check_cpu_itimer(struct task_struct *tsk, struct cpu_itimer *it,
1066static void check_process_timers(struct task_struct *tsk, 990static void check_process_timers(struct task_struct *tsk,
1067 struct list_head *firing) 991 struct list_head *firing)
1068{ 992{
1069 int maxfire;
1070 struct signal_struct *const sig = tsk->signal; 993 struct signal_struct *const sig = tsk->signal;
1071 cputime_t utime, ptime, virt_expires, prof_expires; 994 unsigned long long utime, ptime, virt_expires, prof_expires;
1072 unsigned long long sum_sched_runtime, sched_expires; 995 unsigned long long sum_sched_runtime, sched_expires;
1073 struct list_head *timers = sig->cpu_timers; 996 struct list_head *timers = sig->cpu_timers;
1074 struct task_cputime cputime; 997 struct task_cputime cputime;
@@ -1078,52 +1001,13 @@ static void check_process_timers(struct task_struct *tsk,
1078 * Collect the current process totals. 1001 * Collect the current process totals.
1079 */ 1002 */
1080 thread_group_cputimer(tsk, &cputime); 1003 thread_group_cputimer(tsk, &cputime);
1081 utime = cputime.utime; 1004 utime = cputime_to_expires(cputime.utime);
1082 ptime = utime + cputime.stime; 1005 ptime = utime + cputime_to_expires(cputime.stime);
1083 sum_sched_runtime = cputime.sum_exec_runtime; 1006 sum_sched_runtime = cputime.sum_exec_runtime;
1084 maxfire = 20;
1085 prof_expires = 0;
1086 while (!list_empty(timers)) {
1087 struct cpu_timer_list *tl = list_first_entry(timers,
1088 struct cpu_timer_list,
1089 entry);
1090 if (!--maxfire || ptime < tl->expires.cpu) {
1091 prof_expires = tl->expires.cpu;
1092 break;
1093 }
1094 tl->firing = 1;
1095 list_move_tail(&tl->entry, firing);
1096 }
1097 1007
1098 ++timers; 1008 prof_expires = check_timers_list(timers, firing, ptime);
1099 maxfire = 20; 1009 virt_expires = check_timers_list(++timers, firing, utime);
1100 virt_expires = 0; 1010 sched_expires = check_timers_list(++timers, firing, sum_sched_runtime);
1101 while (!list_empty(timers)) {
1102 struct cpu_timer_list *tl = list_first_entry(timers,
1103 struct cpu_timer_list,
1104 entry);
1105 if (!--maxfire || utime < tl->expires.cpu) {
1106 virt_expires = tl->expires.cpu;
1107 break;
1108 }
1109 tl->firing = 1;
1110 list_move_tail(&tl->entry, firing);
1111 }
1112
1113 ++timers;
1114 maxfire = 20;
1115 sched_expires = 0;
1116 while (!list_empty(timers)) {
1117 struct cpu_timer_list *tl = list_first_entry(timers,
1118 struct cpu_timer_list,
1119 entry);
1120 if (!--maxfire || sum_sched_runtime < tl->expires.sched) {
1121 sched_expires = tl->expires.sched;
1122 break;
1123 }
1124 tl->firing = 1;
1125 list_move_tail(&tl->entry, firing);
1126 }
1127 1011
1128 /* 1012 /*
1129 * Check for the special case process timers. 1013 * Check for the special case process timers.
@@ -1162,8 +1046,8 @@ static void check_process_timers(struct task_struct *tsk,
1162 } 1046 }
1163 } 1047 }
1164 1048
1165 sig->cputime_expires.prof_exp = prof_expires; 1049 sig->cputime_expires.prof_exp = expires_to_cputime(prof_expires);
1166 sig->cputime_expires.virt_exp = virt_expires; 1050 sig->cputime_expires.virt_exp = expires_to_cputime(virt_expires);
1167 sig->cputime_expires.sched_exp = sched_expires; 1051 sig->cputime_expires.sched_exp = sched_expires;
1168 if (task_cputime_zero(&sig->cputime_expires)) 1052 if (task_cputime_zero(&sig->cputime_expires))
1169 stop_process_timers(sig); 1053 stop_process_timers(sig);
@@ -1176,7 +1060,7 @@ static void check_process_timers(struct task_struct *tsk,
1176void posix_cpu_timer_schedule(struct k_itimer *timer) 1060void posix_cpu_timer_schedule(struct k_itimer *timer)
1177{ 1061{
1178 struct task_struct *p = timer->it.cpu.task; 1062 struct task_struct *p = timer->it.cpu.task;
1179 union cpu_time_count now; 1063 unsigned long long now;
1180 1064
1181 if (unlikely(p == NULL)) 1065 if (unlikely(p == NULL))
1182 /* 1066 /*
@@ -1205,7 +1089,7 @@ void posix_cpu_timer_schedule(struct k_itimer *timer)
1205 */ 1089 */
1206 put_task_struct(p); 1090 put_task_struct(p);
1207 timer->it.cpu.task = p = NULL; 1091 timer->it.cpu.task = p = NULL;
1208 timer->it.cpu.expires.sched = 0; 1092 timer->it.cpu.expires = 0;
1209 goto out_unlock; 1093 goto out_unlock;
1210 } else if (unlikely(p->exit_state) && thread_group_empty(p)) { 1094 } else if (unlikely(p->exit_state) && thread_group_empty(p)) {
1211 /* 1095 /*
@@ -1213,6 +1097,7 @@ void posix_cpu_timer_schedule(struct k_itimer *timer)
1213 * not yet reaped. Take this opportunity to 1097 * not yet reaped. Take this opportunity to
1214 * drop our task ref. 1098 * drop our task ref.
1215 */ 1099 */
1100 cpu_timer_sample_group(timer->it_clock, p, &now);
1216 clear_dead_task(timer, now); 1101 clear_dead_task(timer, now);
1217 goto out_unlock; 1102 goto out_unlock;
1218 } 1103 }
@@ -1387,7 +1272,7 @@ void run_posix_cpu_timers(struct task_struct *tsk)
1387void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx, 1272void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx,
1388 cputime_t *newval, cputime_t *oldval) 1273 cputime_t *newval, cputime_t *oldval)
1389{ 1274{
1390 union cpu_time_count now; 1275 unsigned long long now;
1391 1276
1392 BUG_ON(clock_idx == CPUCLOCK_SCHED); 1277 BUG_ON(clock_idx == CPUCLOCK_SCHED);
1393 cpu_timer_sample_group(clock_idx, tsk, &now); 1278 cpu_timer_sample_group(clock_idx, tsk, &now);
@@ -1399,17 +1284,17 @@ void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx,
1399 * it to be absolute. 1284 * it to be absolute.
1400 */ 1285 */
1401 if (*oldval) { 1286 if (*oldval) {
1402 if (*oldval <= now.cpu) { 1287 if (*oldval <= now) {
1403 /* Just about to fire. */ 1288 /* Just about to fire. */
1404 *oldval = cputime_one_jiffy; 1289 *oldval = cputime_one_jiffy;
1405 } else { 1290 } else {
1406 *oldval -= now.cpu; 1291 *oldval -= now;
1407 } 1292 }
1408 } 1293 }
1409 1294
1410 if (!*newval) 1295 if (!*newval)
1411 goto out; 1296 goto out;
1412 *newval += now.cpu; 1297 *newval += now;
1413 } 1298 }
1414 1299
1415 /* 1300 /*
@@ -1459,7 +1344,7 @@ static int do_cpu_nanosleep(const clockid_t which_clock, int flags,
1459 } 1344 }
1460 1345
1461 while (!signal_pending(current)) { 1346 while (!signal_pending(current)) {
1462 if (timer.it.cpu.expires.sched == 0) { 1347 if (timer.it.cpu.expires == 0) {
1463 /* 1348 /*
1464 * Our timer fired and was reset, below 1349 * Our timer fired and was reset, below
1465 * deletion can not fail. 1350 * deletion can not fail.
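
The posix-cpu-timers hunks above fold three nearly identical "walk the sorted timer list, fire what is due, return the next expiry" loops into one helper, check_timers_list(), and replace the union cpu_time_count with a plain unsigned long long sample. Below is a minimal userspace sketch of that consolidation, not kernel code: the my_timer type and the hand-rolled singly linked list are illustration-only stand-ins for struct cpu_timer_list and the kernel list API.

#include <stdio.h>
#include <stdlib.h>

struct my_timer {                       /* stand-in for struct cpu_timer_list */
    unsigned long long expires;
    int firing;
    struct my_timer *next;
};

/*
 * Walk a list sorted by ->expires, mark entries that are already due as
 * firing and splice them onto *firing, and return the expiry of the first
 * entry still in the future (0 if none is left).  The maxfire cap bounds
 * the work done in one pass, mirroring the kernel helper above.  The kernel
 * uses list_move_tail() to keep firing order; this sketch just prepends.
 */
static unsigned long long
check_timers_list(struct my_timer **timers, struct my_timer **firing,
                  unsigned long long curr)
{
    int maxfire = 20;

    while (*timers) {
        struct my_timer *t = *timers;

        if (!--maxfire || curr < t->expires)
            return t->expires;

        t->firing = 1;
        *timers = t->next;              /* unlink from the armed list ... */
        t->next = *firing;              /* ... and push onto the firing list */
        *firing = t;
    }
    return 0;
}

int main(void)
{
    struct my_timer c = { 30, 0, NULL };
    struct my_timer b = { 20, 0, &c };
    struct my_timer a = { 10, 0, &b };
    struct my_timer *armed = &a, *firing = NULL;

    unsigned long long next = check_timers_list(&armed, &firing, 25ULL);
    printf("next expiry: %llu\n", next); /* 30: a and b were moved to firing */
    return 0;
}
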
diff --git a/kernel/printk.c b/kernel/printk.c
index fa36e1494420..8212c1aef125 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -363,6 +363,53 @@ static void log_store(int facility, int level,
363 log_next_seq++; 363 log_next_seq++;
364} 364}
365 365
366#ifdef CONFIG_SECURITY_DMESG_RESTRICT
367int dmesg_restrict = 1;
368#else
369int dmesg_restrict;
370#endif
371
372static int syslog_action_restricted(int type)
373{
374 if (dmesg_restrict)
375 return 1;
376 /*
377 * Unless restricted, we allow "read all" and "get buffer size"
378 * for everybody.
379 */
380 return type != SYSLOG_ACTION_READ_ALL &&
381 type != SYSLOG_ACTION_SIZE_BUFFER;
382}
383
384static int check_syslog_permissions(int type, bool from_file)
385{
386 /*
387 * If this is from /proc/kmsg and we've already opened it, then we've
388 * already done the capabilities checks at open time.
389 */
390 if (from_file && type != SYSLOG_ACTION_OPEN)
391 return 0;
392
393 if (syslog_action_restricted(type)) {
394 if (capable(CAP_SYSLOG))
395 return 0;
396 /*
397 * For historical reasons, accept CAP_SYS_ADMIN too, with
398 * a warning.
399 */
400 if (capable(CAP_SYS_ADMIN)) {
401 pr_warn_once("%s (%d): Attempt to access syslog with "
402 "CAP_SYS_ADMIN but no CAP_SYSLOG "
403 "(deprecated).\n",
404 current->comm, task_pid_nr(current));
405 return 0;
406 }
407 return -EPERM;
408 }
409 return security_syslog(type);
410}
411
412
366/* /dev/kmsg - userspace message inject/listen interface */ 413/* /dev/kmsg - userspace message inject/listen interface */
367struct devkmsg_user { 414struct devkmsg_user {
368 u64 seq; 415 u64 seq;
@@ -620,7 +667,8 @@ static int devkmsg_open(struct inode *inode, struct file *file)
620 if ((file->f_flags & O_ACCMODE) == O_WRONLY) 667 if ((file->f_flags & O_ACCMODE) == O_WRONLY)
621 return 0; 668 return 0;
622 669
623 err = security_syslog(SYSLOG_ACTION_READ_ALL); 670 err = check_syslog_permissions(SYSLOG_ACTION_READ_ALL,
671 SYSLOG_FROM_READER);
624 if (err) 672 if (err)
625 return err; 673 return err;
626 674
@@ -813,45 +861,6 @@ static inline void boot_delay_msec(int level)
813} 861}
814#endif 862#endif
815 863
816#ifdef CONFIG_SECURITY_DMESG_RESTRICT
817int dmesg_restrict = 1;
818#else
819int dmesg_restrict;
820#endif
821
822static int syslog_action_restricted(int type)
823{
824 if (dmesg_restrict)
825 return 1;
826 /* Unless restricted, we allow "read all" and "get buffer size" for everybody */
827 return type != SYSLOG_ACTION_READ_ALL && type != SYSLOG_ACTION_SIZE_BUFFER;
828}
829
830static int check_syslog_permissions(int type, bool from_file)
831{
832 /*
833 * If this is from /proc/kmsg and we've already opened it, then we've
834 * already done the capabilities checks at open time.
835 */
836 if (from_file && type != SYSLOG_ACTION_OPEN)
837 return 0;
838
839 if (syslog_action_restricted(type)) {
840 if (capable(CAP_SYSLOG))
841 return 0;
842 /* For historical reasons, accept CAP_SYS_ADMIN too, with a warning */
843 if (capable(CAP_SYS_ADMIN)) {
844 printk_once(KERN_WARNING "%s (%d): "
845 "Attempt to access syslog with CAP_SYS_ADMIN "
846 "but no CAP_SYSLOG (deprecated).\n",
847 current->comm, task_pid_nr(current));
848 return 0;
849 }
850 return -EPERM;
851 }
852 return 0;
853}
854
855#if defined(CONFIG_PRINTK_TIME) 864#if defined(CONFIG_PRINTK_TIME)
856static bool printk_time = 1; 865static bool printk_time = 1;
857#else 866#else
@@ -1249,7 +1258,7 @@ out:
1249 1258
1250SYSCALL_DEFINE3(syslog, int, type, char __user *, buf, int, len) 1259SYSCALL_DEFINE3(syslog, int, type, char __user *, buf, int, len)
1251{ 1260{
1252 return do_syslog(type, buf, len, SYSLOG_FROM_CALL); 1261 return do_syslog(type, buf, len, SYSLOG_FROM_READER);
1253} 1262}
1254 1263
1255/* 1264/*
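
The printk.c change moves syslog_action_restricted() and check_syslog_permissions() above the /dev/kmsg code so devkmsg_open() can reuse the same gate instead of calling security_syslog() directly, and routes the syslog(2) path through SYSLOG_FROM_READER. A rough userspace sketch of the gating order follows; the capability checks are stubbed out as plain booleans (have_cap_syslog and have_cap_sys_admin are invented stand-ins for capable(), not real APIs), and the LSM hook is only noted in a comment.

#include <stdio.h>
#include <stdbool.h>

enum { ACT_OPEN, ACT_READ_ALL, ACT_SIZE_BUFFER, ACT_CLEAR };

static int dmesg_restrict;              /* 1 if CONFIG_SECURITY_DMESG_RESTRICT */
static bool have_cap_syslog;            /* stand-in for capable(CAP_SYSLOG) */
static bool have_cap_sys_admin;         /* stand-in for capable(CAP_SYS_ADMIN) */

static int action_restricted(int type)
{
    if (dmesg_restrict)
        return 1;
    /* Unless restricted, "read all" and "get size" are open to everybody. */
    return type != ACT_READ_ALL && type != ACT_SIZE_BUFFER;
}

static int check_permissions(int type, bool from_file)
{
    /* /proc/kmsg readers were already checked at open time. */
    if (from_file && type != ACT_OPEN)
        return 0;

    if (action_restricted(type)) {
        if (have_cap_syslog)
            return 0;
        if (have_cap_sys_admin) {
            fprintf(stderr, "CAP_SYS_ADMIN without CAP_SYSLOG (deprecated)\n");
            return 0;
        }
        return -1;                      /* -EPERM in the kernel */
    }
    return 0;                           /* the kernel additionally asks the LSM here */
}

int main(void)
{
    printf("%d\n", check_permissions(ACT_CLEAR, false));    /* -1: restricted */
    printf("%d\n", check_permissions(ACT_READ_ALL, false)); /* 0 unless dmesg_restrict */
    return 0;
}
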
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index aed981a3f69c..335a7ae697f5 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -665,20 +665,22 @@ static int ptrace_peek_siginfo(struct task_struct *child,
665 if (unlikely(is_compat_task())) { 665 if (unlikely(is_compat_task())) {
666 compat_siginfo_t __user *uinfo = compat_ptr(data); 666 compat_siginfo_t __user *uinfo = compat_ptr(data);
667 667
668 ret = copy_siginfo_to_user32(uinfo, &info); 668 if (copy_siginfo_to_user32(uinfo, &info) ||
669 ret |= __put_user(info.si_code, &uinfo->si_code); 669 __put_user(info.si_code, &uinfo->si_code)) {
670 ret = -EFAULT;
671 break;
672 }
673
670 } else 674 } else
671#endif 675#endif
672 { 676 {
673 siginfo_t __user *uinfo = (siginfo_t __user *) data; 677 siginfo_t __user *uinfo = (siginfo_t __user *) data;
674 678
675 ret = copy_siginfo_to_user(uinfo, &info); 679 if (copy_siginfo_to_user(uinfo, &info) ||
676 ret |= __put_user(info.si_code, &uinfo->si_code); 680 __put_user(info.si_code, &uinfo->si_code)) {
677 } 681 ret = -EFAULT;
678 682 break;
679 if (ret) { 683 }
680 ret = -EFAULT;
681 break;
682 } 684 }
683 685
684 data += sizeof(siginfo_t); 686 data += sizeof(siginfo_t);
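
The ptrace_peek_siginfo() hunk collapses the two copy-out steps into a single condition so a failure in either one immediately yields -EFAULT and breaks out of the loop, instead of OR-ing return values with different sign conventions. A tiny sketch of the same error-path shape; copy_main() and copy_extra() are hypothetical fallible helpers standing in for the copy_siginfo_to_user*() calls, and nothing else about the ptrace semantics is modelled.

#include <stdio.h>

/* Stand-ins: return 0 on success, nonzero on failure, like copy_*_to_user(). */
static int copy_main(int fail)  { return fail; }
static int copy_extra(int fail) { return fail; }

static int peek_loop(int n, int fail_at)
{
    int ret = 0;

    for (int i = 0; i < n; i++) {
        if (copy_main(i == fail_at) || copy_extra(0)) {
            ret = -14;                  /* -EFAULT */
            break;                      /* stop on the first fault */
        }
        ret++;                          /* records copied out so far */
    }
    return ret;
}

int main(void)
{
    printf("%d\n", peek_loop(5, -1));   /* 5: all copied */
    printf("%d\n", peek_loop(5, 2));    /* -14: fault on the third record */
    return 0;
}
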
diff --git a/kernel/range.c b/kernel/range.c
index 071b0ab455cb..322ea8e93e4b 100644
--- a/kernel/range.c
+++ b/kernel/range.c
@@ -4,7 +4,7 @@
4#include <linux/kernel.h> 4#include <linux/kernel.h>
5#include <linux/init.h> 5#include <linux/init.h>
6#include <linux/sort.h> 6#include <linux/sort.h>
7 7#include <linux/string.h>
8#include <linux/range.h> 8#include <linux/range.h>
9 9
10int add_range(struct range *range, int az, int nr_range, u64 start, u64 end) 10int add_range(struct range *range, int az, int nr_range, u64 start, u64 end)
@@ -32,9 +32,8 @@ int add_range_with_merge(struct range *range, int az, int nr_range,
32 if (start >= end) 32 if (start >= end)
33 return nr_range; 33 return nr_range;
34 34
35 /* Try to merge it with old one: */ 35 /* get new start/end: */
36 for (i = 0; i < nr_range; i++) { 36 for (i = 0; i < nr_range; i++) {
37 u64 final_start, final_end;
38 u64 common_start, common_end; 37 u64 common_start, common_end;
39 38
40 if (!range[i].end) 39 if (!range[i].end)
@@ -45,12 +44,16 @@ int add_range_with_merge(struct range *range, int az, int nr_range,
45 if (common_start > common_end) 44 if (common_start > common_end)
46 continue; 45 continue;
47 46
48 final_start = min(range[i].start, start); 47 /* new start/end, will add it back at last */
49 final_end = max(range[i].end, end); 48 start = min(range[i].start, start);
49 end = max(range[i].end, end);
50 50
51 range[i].start = final_start; 51 memmove(&range[i], &range[i + 1],
52 range[i].end = final_end; 52 (nr_range - (i + 1)) * sizeof(range[i]));
53 return nr_range; 53 range[nr_range - 1].start = 0;
54 range[nr_range - 1].end = 0;
55 nr_range--;
56 i--;
54 } 57 }
55 58
56 /* Need to add it: */ 59 /* Need to add it: */
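
add_range_with_merge() now folds every existing range that touches the new one into a running [start, end), compacts the array with memmove(), and re-checks the current slot (i--) because a later entry may overlap the widened range; the merged result is appended once at the end. A self-contained userspace version of that loop, assuming u64 is spelled unsigned long long and keeping the same struct shape as kernel/range.h:

#include <stdio.h>
#include <string.h>

struct range { unsigned long long start, end; };

static int add_range(struct range *range, int az, int nr,
                     unsigned long long start, unsigned long long end)
{
    if (start >= end || nr >= az)
        return nr;
    range[nr].start = start;
    range[nr].end = end;
    return nr + 1;
}

static int add_range_with_merge(struct range *range, int az, int nr,
                                unsigned long long start, unsigned long long end)
{
    int i;

    if (start >= end)
        return nr;

    /* Fold every existing range that intersects [start, end) into it. */
    for (i = 0; i < nr; i++) {
        unsigned long long cs, ce;

        if (!range[i].end)
            continue;
        cs = range[i].start > start ? range[i].start : start;
        ce = range[i].end < end ? range[i].end : end;
        if (cs > ce)
            continue;

        /* Widen the new range, drop the old slot, re-check slot i. */
        start = range[i].start < start ? range[i].start : start;
        end = range[i].end > end ? range[i].end : end;
        memmove(&range[i], &range[i + 1], (nr - (i + 1)) * sizeof(range[i]));
        range[nr - 1].start = range[nr - 1].end = 0;
        nr--;
        i--;
    }
    return add_range(range, az, nr, start, end);
}

int main(void)
{
    struct range r[8] = { { 0, 10 }, { 20, 30 }, { 40, 50 } };
    int nr = add_range_with_merge(r, 8, 3, 5, 45);  /* bridges all three */

    for (int i = 0; i < nr; i++)
        printf("[%llu, %llu)\n", r[i].start, r[i].end); /* [0, 50) */
    return 0;
}
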
diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index 16ea67925015..35380019f0fc 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -1451,9 +1451,9 @@ static int rcu_gp_init(struct rcu_state *rsp)
1451 rnp->grphi, rnp->qsmask); 1451 rnp->grphi, rnp->qsmask);
1452 raw_spin_unlock_irq(&rnp->lock); 1452 raw_spin_unlock_irq(&rnp->lock);
1453#ifdef CONFIG_PROVE_RCU_DELAY 1453#ifdef CONFIG_PROVE_RCU_DELAY
1454 if ((prandom_u32() % (rcu_num_nodes * 8)) == 0 && 1454 if ((prandom_u32() % (rcu_num_nodes + 1)) == 0 &&
1455 system_state == SYSTEM_RUNNING) 1455 system_state == SYSTEM_RUNNING)
1456 schedule_timeout_uninterruptible(2); 1456 udelay(200);
1457#endif /* #ifdef CONFIG_PROVE_RCU_DELAY */ 1457#endif /* #ifdef CONFIG_PROVE_RCU_DELAY */
1458 cond_resched(); 1458 cond_resched();
1459 } 1459 }
@@ -1613,6 +1613,14 @@ static int __noreturn rcu_gp_kthread(void *arg)
1613 } 1613 }
1614} 1614}
1615 1615
1616static void rsp_wakeup(struct irq_work *work)
1617{
1618 struct rcu_state *rsp = container_of(work, struct rcu_state, wakeup_work);
1619
1620 /* Wake up rcu_gp_kthread() to start the grace period. */
1621 wake_up(&rsp->gp_wq);
1622}
1623
1616/* 1624/*
1617 * Start a new RCU grace period if warranted, re-initializing the hierarchy 1625 * Start a new RCU grace period if warranted, re-initializing the hierarchy
1618 * in preparation for detecting the next grace period. The caller must hold 1626 * in preparation for detecting the next grace period. The caller must hold
@@ -1637,8 +1645,12 @@ rcu_start_gp_advanced(struct rcu_state *rsp, struct rcu_node *rnp,
1637 } 1645 }
1638 rsp->gp_flags = RCU_GP_FLAG_INIT; 1646 rsp->gp_flags = RCU_GP_FLAG_INIT;
1639 1647
1640 /* Wake up rcu_gp_kthread() to start the grace period. */ 1648 /*
1641 wake_up(&rsp->gp_wq); 1649 * We can't do wakeups while holding the rnp->lock, as that
 1650 * could cause possible deadlocks with the rq->lock. Defer
1651 * the wakeup to interrupt context.
1652 */
1653 irq_work_queue(&rsp->wakeup_work);
1642} 1654}
1643 1655
1644/* 1656/*
@@ -3235,6 +3247,7 @@ static void __init rcu_init_one(struct rcu_state *rsp,
3235 3247
3236 rsp->rda = rda; 3248 rsp->rda = rda;
3237 init_waitqueue_head(&rsp->gp_wq); 3249 init_waitqueue_head(&rsp->gp_wq);
3250 init_irq_work(&rsp->wakeup_work, rsp_wakeup);
3238 rnp = rsp->level[rcu_num_lvls - 1]; 3251 rnp = rsp->level[rcu_num_lvls - 1];
3239 for_each_possible_cpu(i) { 3252 for_each_possible_cpu(i) {
3240 while (i > rnp->grphi) 3253 while (i > rnp->grphi)
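
The rcutree.c change stops calling wake_up(&rsp->gp_wq) while the rnp->lock is held (the wakeup path can take rq->lock and deadlock) and instead queues an irq_work that performs the wakeup later from interrupt context. A kernel-style sketch of that pattern, trimmed to the three pieces the diff adds (the field, the handler, the queueing call); my_state, my_wakeup, my_init and my_kick are hypothetical names, and this is a fragment, not a standalone program. The API calls themselves (init_irq_work, irq_work_queue, init_waitqueue_head, wake_up, container_of) are the ones the diff uses.

#include <linux/kernel.h>
#include <linux/irq_work.h>
#include <linux/wait.h>

struct my_state {                       /* hypothetical stand-in for rcu_state */
    wait_queue_head_t gp_wq;
    struct irq_work wakeup_work;
};

/* Runs later, in a context where the caller's spinlock is no longer held. */
static void my_wakeup(struct irq_work *work)
{
    struct my_state *st = container_of(work, struct my_state, wakeup_work);

    wake_up(&st->gp_wq);
}

static void my_init(struct my_state *st)
{
    init_waitqueue_head(&st->gp_wq);
    init_irq_work(&st->wakeup_work, my_wakeup);
}

/* Called with a spinlock held: defer the wakeup instead of doing it here. */
static void my_kick(struct my_state *st)
{
    irq_work_queue(&st->wakeup_work);
}
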
diff --git a/kernel/rcutree.h b/kernel/rcutree.h
index da77a8f57ff9..4df503470e42 100644
--- a/kernel/rcutree.h
+++ b/kernel/rcutree.h
@@ -27,6 +27,7 @@
27#include <linux/threads.h> 27#include <linux/threads.h>
28#include <linux/cpumask.h> 28#include <linux/cpumask.h>
29#include <linux/seqlock.h> 29#include <linux/seqlock.h>
30#include <linux/irq_work.h>
30 31
31/* 32/*
32 * Define shape of hierarchy based on NR_CPUS, CONFIG_RCU_FANOUT, and 33 * Define shape of hierarchy based on NR_CPUS, CONFIG_RCU_FANOUT, and
@@ -442,6 +443,7 @@ struct rcu_state {
442 char *name; /* Name of structure. */ 443 char *name; /* Name of structure. */
443 char abbr; /* Abbreviated name. */ 444 char abbr; /* Abbreviated name. */
444 struct list_head flavors; /* List of RCU flavors. */ 445 struct list_head flavors; /* List of RCU flavors. */
446 struct irq_work wakeup_work; /* Postponed wakeups */
445}; 447};
446 448
447/* Values for rcu_state structure's gp_flags field. */ 449/* Values for rcu_state structure's gp_flags field. */
diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h
index 170814dc418f..3db5a375d8dd 100644
--- a/kernel/rcutree_plugin.h
+++ b/kernel/rcutree_plugin.h
@@ -88,7 +88,7 @@ static void __init rcu_bootup_announce_oddness(void)
88#ifdef CONFIG_RCU_NOCB_CPU 88#ifdef CONFIG_RCU_NOCB_CPU
89#ifndef CONFIG_RCU_NOCB_CPU_NONE 89#ifndef CONFIG_RCU_NOCB_CPU_NONE
90 if (!have_rcu_nocb_mask) { 90 if (!have_rcu_nocb_mask) {
91 alloc_bootmem_cpumask_var(&rcu_nocb_mask); 91 zalloc_cpumask_var(&rcu_nocb_mask, GFP_KERNEL);
92 have_rcu_nocb_mask = true; 92 have_rcu_nocb_mask = true;
93 } 93 }
94#ifdef CONFIG_RCU_NOCB_CPU_ZERO 94#ifdef CONFIG_RCU_NOCB_CPU_ZERO
@@ -1667,7 +1667,7 @@ int rcu_needs_cpu(int cpu, unsigned long *dj)
1667 rdtp->last_accelerate = jiffies; 1667 rdtp->last_accelerate = jiffies;
1668 1668
1669 /* Request timer delay depending on laziness, and round. */ 1669 /* Request timer delay depending on laziness, and round. */
1670 if (rdtp->all_lazy) { 1670 if (!rdtp->all_lazy) {
1671 *dj = round_up(rcu_idle_gp_delay + jiffies, 1671 *dj = round_up(rcu_idle_gp_delay + jiffies,
1672 rcu_idle_gp_delay) - jiffies; 1672 rcu_idle_gp_delay) - jiffies;
1673 } else { 1673 } else {
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 58453b8272fd..e8b335016c52 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -633,7 +633,19 @@ void wake_up_nohz_cpu(int cpu)
633static inline bool got_nohz_idle_kick(void) 633static inline bool got_nohz_idle_kick(void)
634{ 634{
635 int cpu = smp_processor_id(); 635 int cpu = smp_processor_id();
636 return idle_cpu(cpu) && test_bit(NOHZ_BALANCE_KICK, nohz_flags(cpu)); 636
637 if (!test_bit(NOHZ_BALANCE_KICK, nohz_flags(cpu)))
638 return false;
639
640 if (idle_cpu(cpu) && !need_resched())
641 return true;
642
643 /*
644 * We can't run Idle Load Balance on this CPU for this time so we
645 * cancel it and clear NOHZ_BALANCE_KICK
646 */
647 clear_bit(NOHZ_BALANCE_KICK, nohz_flags(cpu));
648 return false;
637} 649}
638 650
639#else /* CONFIG_NO_HZ_COMMON */ 651#else /* CONFIG_NO_HZ_COMMON */
@@ -1393,8 +1405,9 @@ static void sched_ttwu_pending(void)
1393 1405
1394void scheduler_ipi(void) 1406void scheduler_ipi(void)
1395{ 1407{
1396 if (llist_empty(&this_rq()->wake_list) && !got_nohz_idle_kick() 1408 if (llist_empty(&this_rq()->wake_list)
1397 && !tick_nohz_full_cpu(smp_processor_id())) 1409 && !tick_nohz_full_cpu(smp_processor_id())
1410 && !got_nohz_idle_kick())
1398 return; 1411 return;
1399 1412
1400 /* 1413 /*
@@ -1417,7 +1430,7 @@ void scheduler_ipi(void)
1417 /* 1430 /*
1418 * Check if someone kicked us for doing the nohz idle load balance. 1431 * Check if someone kicked us for doing the nohz idle load balance.
1419 */ 1432 */
1420 if (unlikely(got_nohz_idle_kick() && !need_resched())) { 1433 if (unlikely(got_nohz_idle_kick())) {
1421 this_rq()->idle_balance = 1; 1434 this_rq()->idle_balance = 1;
1422 raise_softirq_irqoff(SCHED_SOFTIRQ); 1435 raise_softirq_irqoff(SCHED_SOFTIRQ);
1423 } 1436 }
@@ -4745,7 +4758,7 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu)
4745 */ 4758 */
4746 idle->sched_class = &idle_sched_class; 4759 idle->sched_class = &idle_sched_class;
4747 ftrace_graph_init_idle_task(idle, cpu); 4760 ftrace_graph_init_idle_task(idle, cpu);
4748 vtime_init_idle(idle); 4761 vtime_init_idle(idle, cpu);
4749#if defined(CONFIG_SMP) 4762#if defined(CONFIG_SMP)
4750 sprintf(idle->comm, "%s/%d", INIT_TASK_COMM, cpu); 4763 sprintf(idle->comm, "%s/%d", INIT_TASK_COMM, cpu);
4751#endif 4764#endif
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index cc2dc3eea8a3..b5ccba22603b 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -747,17 +747,17 @@ void arch_vtime_task_switch(struct task_struct *prev)
747 747
748 write_seqlock(&current->vtime_seqlock); 748 write_seqlock(&current->vtime_seqlock);
749 current->vtime_snap_whence = VTIME_SYS; 749 current->vtime_snap_whence = VTIME_SYS;
750 current->vtime_snap = sched_clock(); 750 current->vtime_snap = sched_clock_cpu(smp_processor_id());
751 write_sequnlock(&current->vtime_seqlock); 751 write_sequnlock(&current->vtime_seqlock);
752} 752}
753 753
754void vtime_init_idle(struct task_struct *t) 754void vtime_init_idle(struct task_struct *t, int cpu)
755{ 755{
756 unsigned long flags; 756 unsigned long flags;
757 757
758 write_seqlock_irqsave(&t->vtime_seqlock, flags); 758 write_seqlock_irqsave(&t->vtime_seqlock, flags);
759 t->vtime_snap_whence = VTIME_SYS; 759 t->vtime_snap_whence = VTIME_SYS;
760 t->vtime_snap = sched_clock(); 760 t->vtime_snap = sched_clock_cpu(cpu);
761 write_sequnlock_irqrestore(&t->vtime_seqlock, flags); 761 write_sequnlock_irqrestore(&t->vtime_seqlock, flags);
762} 762}
763 763
diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h
index 2ef90a51ec5e..71bac979d5ee 100644
--- a/kernel/sched/stats.h
+++ b/kernel/sched/stats.h
@@ -162,6 +162,39 @@ sched_info_switch(struct task_struct *prev, struct task_struct *next)
162 */ 162 */
163 163
164/** 164/**
165 * cputimer_running - return true if cputimer is running
166 *
167 * @tsk: Pointer to target task.
168 */
169static inline bool cputimer_running(struct task_struct *tsk)
170
171{
172 struct thread_group_cputimer *cputimer = &tsk->signal->cputimer;
173
174 if (!cputimer->running)
175 return false;
176
177 /*
178 * After we flush the task's sum_exec_runtime to sig->sum_sched_runtime
179 * in __exit_signal(), we won't account to the signal struct further
180 * cputime consumed by that task, even though the task can still be
181 * ticking after __exit_signal().
182 *
183 * In order to keep a consistent behaviour between thread group cputime
 184 * and thread group cputimer accounting, let's also ignore the cputime
185 * elapsing after __exit_signal() in any thread group timer running.
186 *
187 * This makes sure that POSIX CPU clocks and timers are synchronized, so
188 * that a POSIX CPU timer won't expire while the corresponding POSIX CPU
189 * clock delta is behind the expiring timer value.
190 */
191 if (unlikely(!tsk->sighand))
192 return false;
193
194 return true;
195}
196
197/**
165 * account_group_user_time - Maintain utime for a thread group. 198 * account_group_user_time - Maintain utime for a thread group.
166 * 199 *
167 * @tsk: Pointer to task structure. 200 * @tsk: Pointer to task structure.
@@ -176,7 +209,7 @@ static inline void account_group_user_time(struct task_struct *tsk,
176{ 209{
177 struct thread_group_cputimer *cputimer = &tsk->signal->cputimer; 210 struct thread_group_cputimer *cputimer = &tsk->signal->cputimer;
178 211
179 if (!cputimer->running) 212 if (!cputimer_running(tsk))
180 return; 213 return;
181 214
182 raw_spin_lock(&cputimer->lock); 215 raw_spin_lock(&cputimer->lock);
@@ -199,7 +232,7 @@ static inline void account_group_system_time(struct task_struct *tsk,
199{ 232{
200 struct thread_group_cputimer *cputimer = &tsk->signal->cputimer; 233 struct thread_group_cputimer *cputimer = &tsk->signal->cputimer;
201 234
202 if (!cputimer->running) 235 if (!cputimer_running(tsk))
203 return; 236 return;
204 237
205 raw_spin_lock(&cputimer->lock); 238 raw_spin_lock(&cputimer->lock);
@@ -222,7 +255,7 @@ static inline void account_group_exec_runtime(struct task_struct *tsk,
222{ 255{
223 struct thread_group_cputimer *cputimer = &tsk->signal->cputimer; 256 struct thread_group_cputimer *cputimer = &tsk->signal->cputimer;
224 257
225 if (!cputimer->running) 258 if (!cputimer_running(tsk))
226 return; 259 return;
227 260
228 raw_spin_lock(&cputimer->lock); 261 raw_spin_lock(&cputimer->lock);
diff --git a/kernel/softirq.c b/kernel/softirq.c
index b5197dcb0dad..3d6833f125d3 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -195,8 +195,12 @@ void local_bh_enable_ip(unsigned long ip)
195EXPORT_SYMBOL(local_bh_enable_ip); 195EXPORT_SYMBOL(local_bh_enable_ip);
196 196
197/* 197/*
198 * We restart softirq processing for at most 2 ms, 198 * We restart softirq processing for at most MAX_SOFTIRQ_RESTART times,
199 * and if need_resched() is not set. 199 * but break the loop if need_resched() is set or after 2 ms.
200 * The MAX_SOFTIRQ_TIME provides a nice upper bound in most cases, but in
201 * certain cases, such as stop_machine(), jiffies may cease to
202 * increment and so we need the MAX_SOFTIRQ_RESTART limit as
203 * well to make sure we eventually return from this method.
200 * 204 *
201 * These limits have been established via experimentation. 205 * These limits have been established via experimentation.
202 * The two things to balance is latency against fairness - 206 * The two things to balance is latency against fairness -
@@ -204,6 +208,7 @@ EXPORT_SYMBOL(local_bh_enable_ip);
204 * should not be able to lock up the box. 208 * should not be able to lock up the box.
205 */ 209 */
206#define MAX_SOFTIRQ_TIME msecs_to_jiffies(2) 210#define MAX_SOFTIRQ_TIME msecs_to_jiffies(2)
211#define MAX_SOFTIRQ_RESTART 10
207 212
208asmlinkage void __do_softirq(void) 213asmlinkage void __do_softirq(void)
209{ 214{
@@ -212,6 +217,7 @@ asmlinkage void __do_softirq(void)
212 unsigned long end = jiffies + MAX_SOFTIRQ_TIME; 217 unsigned long end = jiffies + MAX_SOFTIRQ_TIME;
213 int cpu; 218 int cpu;
214 unsigned long old_flags = current->flags; 219 unsigned long old_flags = current->flags;
220 int max_restart = MAX_SOFTIRQ_RESTART;
215 221
216 /* 222 /*
217 * Mask out PF_MEMALLOC s current task context is borrowed for the 223 * Mask out PF_MEMALLOC s current task context is borrowed for the
@@ -265,7 +271,8 @@ restart:
265 271
266 pending = local_softirq_pending(); 272 pending = local_softirq_pending();
267 if (pending) { 273 if (pending) {
268 if (time_before(jiffies, end) && !need_resched()) 274 if (time_before(jiffies, end) && !need_resched() &&
275 --max_restart)
269 goto restart; 276 goto restart;
270 277
271 wakeup_softirqd(); 278 wakeup_softirqd();
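
__do_softirq() previously restarted as long as jiffies stayed within the 2 ms budget, which can spin indefinitely if jiffies stops ticking (e.g. under stop_machine()); the patch adds MAX_SOFTIRQ_RESTART as a second, time-independent bound. A userspace sketch of combining a wall-clock budget with an iteration cap: clock_gettime() is a real POSIX call, while more_work_pending() and process_pending() are invented placeholders for the softirq handling itself.

#include <stdio.h>
#include <stdbool.h>
#include <time.h>

#define MAX_RESTART 10
#define BUDGET_NS   (2 * 1000 * 1000)   /* 2 ms, like MAX_SOFTIRQ_TIME */

static int pending = 25;                /* pretend there is queued work */

static bool more_work_pending(void) { return pending > 0; }
static void process_pending(void)   { pending -= 5; }

static long long now_ns(void)
{
    struct timespec ts;

    clock_gettime(CLOCK_MONOTONIC, &ts);
    return (long long)ts.tv_sec * 1000000000LL + ts.tv_nsec;
}

int main(void)
{
    long long end = now_ns() + BUDGET_NS;
    int max_restart = MAX_RESTART;

restart:
    process_pending();

    if (more_work_pending()) {
        /* Restart only while BOTH the time budget and the restart cap allow it. */
        if (now_ns() < end && --max_restart)
            goto restart;
        printf("deferring remaining work (pending=%d)\n", pending);
        return 0;
    }
    printf("all work done\n");
    return 0;
}
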
diff --git a/kernel/sys.c b/kernel/sys.c
index b95d3c72ba21..2bbd9a73b54c 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -362,6 +362,29 @@ int unregister_reboot_notifier(struct notifier_block *nb)
362} 362}
363EXPORT_SYMBOL(unregister_reboot_notifier); 363EXPORT_SYMBOL(unregister_reboot_notifier);
364 364
365/* Add backwards compatibility for stable trees. */
366#ifndef PF_NO_SETAFFINITY
367#define PF_NO_SETAFFINITY PF_THREAD_BOUND
368#endif
369
370static void migrate_to_reboot_cpu(void)
371{
372 /* The boot cpu is always logical cpu 0 */
373 int cpu = 0;
374
375 cpu_hotplug_disable();
376
377 /* Make certain the cpu I'm about to reboot on is online */
378 if (!cpu_online(cpu))
379 cpu = cpumask_first(cpu_online_mask);
380
381 /* Prevent races with other tasks migrating this task */
382 current->flags |= PF_NO_SETAFFINITY;
383
384 /* Make certain I only run on the appropriate processor */
385 set_cpus_allowed_ptr(current, cpumask_of(cpu));
386}
387
365/** 388/**
366 * kernel_restart - reboot the system 389 * kernel_restart - reboot the system
367 * @cmd: pointer to buffer containing command to execute for restart 390 * @cmd: pointer to buffer containing command to execute for restart
@@ -373,7 +396,7 @@ EXPORT_SYMBOL(unregister_reboot_notifier);
373void kernel_restart(char *cmd) 396void kernel_restart(char *cmd)
374{ 397{
375 kernel_restart_prepare(cmd); 398 kernel_restart_prepare(cmd);
376 disable_nonboot_cpus(); 399 migrate_to_reboot_cpu();
377 syscore_shutdown(); 400 syscore_shutdown();
378 if (!cmd) 401 if (!cmd)
379 printk(KERN_EMERG "Restarting system.\n"); 402 printk(KERN_EMERG "Restarting system.\n");
@@ -400,7 +423,7 @@ static void kernel_shutdown_prepare(enum system_states state)
400void kernel_halt(void) 423void kernel_halt(void)
401{ 424{
402 kernel_shutdown_prepare(SYSTEM_HALT); 425 kernel_shutdown_prepare(SYSTEM_HALT);
403 disable_nonboot_cpus(); 426 migrate_to_reboot_cpu();
404 syscore_shutdown(); 427 syscore_shutdown();
405 printk(KERN_EMERG "System halted.\n"); 428 printk(KERN_EMERG "System halted.\n");
406 kmsg_dump(KMSG_DUMP_HALT); 429 kmsg_dump(KMSG_DUMP_HALT);
@@ -419,7 +442,7 @@ void kernel_power_off(void)
419 kernel_shutdown_prepare(SYSTEM_POWER_OFF); 442 kernel_shutdown_prepare(SYSTEM_POWER_OFF);
420 if (pm_power_off_prepare) 443 if (pm_power_off_prepare)
421 pm_power_off_prepare(); 444 pm_power_off_prepare();
422 disable_nonboot_cpus(); 445 migrate_to_reboot_cpu();
423 syscore_shutdown(); 446 syscore_shutdown();
424 printk(KERN_EMERG "Power down.\n"); 447 printk(KERN_EMERG "Power down.\n");
425 kmsg_dump(KMSG_DUMP_POWEROFF); 448 kmsg_dump(KMSG_DUMP_POWEROFF);
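
kernel_restart(), kernel_halt() and kernel_power_off() now call migrate_to_reboot_cpu() instead of disable_nonboot_cpus(): the rebooting task pins itself to one online CPU (and blocks further affinity changes) rather than hot-unplugging every other CPU. The userspace analogue of "pin the current task to one CPU before a critical step" is sched_setaffinity(); a minimal sketch, assuming CPU 0 is online (in the kernel the code falls back to the first online CPU if it is not):

#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>

int main(void)
{
    cpu_set_t set;

    CPU_ZERO(&set);
    CPU_SET(0, &set);                   /* run only on CPU 0 from now on */

    if (sched_setaffinity(0, sizeof(set), &set)) {
        perror("sched_setaffinity");
        return 1;
    }
    printf("now pinned to CPU %d\n", sched_getcpu());
    /* ... perform the step that must stay on one CPU ... */
    return 0;
}
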
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
index 12ff13a838c6..8f5b3b98577b 100644
--- a/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@ -874,7 +874,6 @@ static void hardpps_update_phase(long error)
874void __hardpps(const struct timespec *phase_ts, const struct timespec *raw_ts) 874void __hardpps(const struct timespec *phase_ts, const struct timespec *raw_ts)
875{ 875{
876 struct pps_normtime pts_norm, freq_norm; 876 struct pps_normtime pts_norm, freq_norm;
877 unsigned long flags;
878 877
879 pts_norm = pps_normalize_ts(*phase_ts); 878 pts_norm = pps_normalize_ts(*phase_ts);
880 879
diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c
index 4430fa695b48..6d3f91631de6 100644
--- a/kernel/time/tick-broadcast.c
+++ b/kernel/time/tick-broadcast.c
@@ -583,6 +583,12 @@ again:
583 } 583 }
584 } 584 }
585 585
586 /*
587 * Remove the current cpu from the pending mask. The event is
588 * delivered immediately in tick_do_broadcast() !
589 */
590 cpumask_clear_cpu(smp_processor_id(), tick_broadcast_pending_mask);
591
586 /* Take care of enforced broadcast requests */ 592 /* Take care of enforced broadcast requests */
587 cpumask_or(tmpmask, tmpmask, tick_broadcast_force_mask); 593 cpumask_or(tmpmask, tmpmask, tick_broadcast_force_mask);
588 cpumask_clear(tick_broadcast_force_mask); 594 cpumask_clear(tick_broadcast_force_mask);
@@ -654,8 +660,8 @@ void tick_broadcast_oneshot_control(unsigned long reason)
654 660
655 raw_spin_lock_irqsave(&tick_broadcast_lock, flags); 661 raw_spin_lock_irqsave(&tick_broadcast_lock, flags);
656 if (reason == CLOCK_EVT_NOTIFY_BROADCAST_ENTER) { 662 if (reason == CLOCK_EVT_NOTIFY_BROADCAST_ENTER) {
657 WARN_ON_ONCE(cpumask_test_cpu(cpu, tick_broadcast_pending_mask));
658 if (!cpumask_test_and_set_cpu(cpu, tick_broadcast_oneshot_mask)) { 663 if (!cpumask_test_and_set_cpu(cpu, tick_broadcast_oneshot_mask)) {
664 WARN_ON_ONCE(cpumask_test_cpu(cpu, tick_broadcast_pending_mask));
659 clockevents_set_mode(dev, CLOCK_EVT_MODE_SHUTDOWN); 665 clockevents_set_mode(dev, CLOCK_EVT_MODE_SHUTDOWN);
660 /* 666 /*
661 * We only reprogram the broadcast timer if we 667 * We only reprogram the broadcast timer if we
@@ -672,8 +678,6 @@ void tick_broadcast_oneshot_control(unsigned long reason)
672 } else { 678 } else {
673 if (cpumask_test_and_clear_cpu(cpu, tick_broadcast_oneshot_mask)) { 679 if (cpumask_test_and_clear_cpu(cpu, tick_broadcast_oneshot_mask)) {
674 clockevents_set_mode(dev, CLOCK_EVT_MODE_ONESHOT); 680 clockevents_set_mode(dev, CLOCK_EVT_MODE_ONESHOT);
675 if (dev->next_event.tv64 == KTIME_MAX)
676 goto out;
677 /* 681 /*
678 * The cpu which was handling the broadcast 682 * The cpu which was handling the broadcast
679 * timer marked this cpu in the broadcast 683 * timer marked this cpu in the broadcast
@@ -688,6 +692,11 @@ void tick_broadcast_oneshot_control(unsigned long reason)
688 goto out; 692 goto out;
689 693
690 /* 694 /*
695 * Bail out if there is no next event.
696 */
697 if (dev->next_event.tv64 == KTIME_MAX)
698 goto out;
699 /*
691 * If the pending bit is not set, then we are 700 * If the pending bit is not set, then we are
692 * either the CPU handling the broadcast 701 * either the CPU handling the broadcast
693 * interrupt or we got woken by something else. 702 * interrupt or we got woken by something else.
@@ -771,10 +780,6 @@ void tick_broadcast_setup_oneshot(struct clock_event_device *bc)
771 780
772 bc->event_handler = tick_handle_oneshot_broadcast; 781 bc->event_handler = tick_handle_oneshot_broadcast;
773 782
774 /* Take the do_timer update */
775 if (!tick_nohz_full_cpu(cpu))
776 tick_do_timer_cpu = cpu;
777
778 /* 783 /*
779 * We must be careful here. There might be other CPUs 784 * We must be careful here. There might be other CPUs
780 * waiting for periodic broadcast. We need to set the 785 * waiting for periodic broadcast. We need to set the
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index f4208138fbf4..0cf1c1453181 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -306,7 +306,7 @@ static int __cpuinit tick_nohz_cpu_down_callback(struct notifier_block *nfb,
306 * we can't safely shutdown that CPU. 306 * we can't safely shutdown that CPU.
307 */ 307 */
308 if (have_nohz_full_mask && tick_do_timer_cpu == cpu) 308 if (have_nohz_full_mask && tick_do_timer_cpu == cpu)
309 return -EINVAL; 309 return NOTIFY_BAD;
310 break; 310 break;
311 } 311 }
312 return NOTIFY_OK; 312 return NOTIFY_OK;
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 846d0a1f235e..48b9fffabdc2 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -991,6 +991,14 @@ static int timekeeping_suspend(void)
991 991
992 read_persistent_clock(&timekeeping_suspend_time); 992 read_persistent_clock(&timekeeping_suspend_time);
993 993
994 /*
995 * On some systems the persistent_clock can not be detected at
996 * timekeeping_init by its return value, so if we see a valid
997 * value returned, update the persistent_clock_exists flag.
998 */
999 if (timekeeping_suspend_time.tv_sec || timekeeping_suspend_time.tv_nsec)
1000 persistent_clock_exist = true;
1001
994 raw_spin_lock_irqsave(&timekeeper_lock, flags); 1002 raw_spin_lock_irqsave(&timekeeper_lock, flags);
995 write_seqcount_begin(&timekeeper_seq); 1003 write_seqcount_begin(&timekeeper_seq);
996 timekeeping_forward_now(tk); 1004 timekeeping_forward_now(tk);
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index b549b0f5b977..6c508ff33c62 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -120,22 +120,22 @@ static void ftrace_ops_no_ops(unsigned long ip, unsigned long parent_ip);
120 120
121/* 121/*
122 * Traverse the ftrace_global_list, invoking all entries. The reason that we 122 * Traverse the ftrace_global_list, invoking all entries. The reason that we
123 * can use rcu_dereference_raw() is that elements removed from this list 123 * can use rcu_dereference_raw_notrace() is that elements removed from this list
124 * are simply leaked, so there is no need to interact with a grace-period 124 * are simply leaked, so there is no need to interact with a grace-period
125 * mechanism. The rcu_dereference_raw() calls are needed to handle 125 * mechanism. The rcu_dereference_raw_notrace() calls are needed to handle
126 * concurrent insertions into the ftrace_global_list. 126 * concurrent insertions into the ftrace_global_list.
127 * 127 *
128 * Silly Alpha and silly pointer-speculation compiler optimizations! 128 * Silly Alpha and silly pointer-speculation compiler optimizations!
129 */ 129 */
130#define do_for_each_ftrace_op(op, list) \ 130#define do_for_each_ftrace_op(op, list) \
131 op = rcu_dereference_raw(list); \ 131 op = rcu_dereference_raw_notrace(list); \
132 do 132 do
133 133
134/* 134/*
135 * Optimized for just a single item in the list (as that is the normal case). 135 * Optimized for just a single item in the list (as that is the normal case).
136 */ 136 */
137#define while_for_each_ftrace_op(op) \ 137#define while_for_each_ftrace_op(op) \
138 while (likely(op = rcu_dereference_raw((op)->next)) && \ 138 while (likely(op = rcu_dereference_raw_notrace((op)->next)) && \
139 unlikely((op) != &ftrace_list_end)) 139 unlikely((op) != &ftrace_list_end))
140 140
141static inline void ftrace_ops_init(struct ftrace_ops *ops) 141static inline void ftrace_ops_init(struct ftrace_ops *ops)
@@ -779,7 +779,7 @@ ftrace_find_profiled_func(struct ftrace_profile_stat *stat, unsigned long ip)
779 if (hlist_empty(hhd)) 779 if (hlist_empty(hhd))
780 return NULL; 780 return NULL;
781 781
782 hlist_for_each_entry_rcu(rec, hhd, node) { 782 hlist_for_each_entry_rcu_notrace(rec, hhd, node) {
783 if (rec->ip == ip) 783 if (rec->ip == ip)
784 return rec; 784 return rec;
785 } 785 }
@@ -1165,7 +1165,7 @@ ftrace_lookup_ip(struct ftrace_hash *hash, unsigned long ip)
1165 1165
1166 hhd = &hash->buckets[key]; 1166 hhd = &hash->buckets[key];
1167 1167
1168 hlist_for_each_entry_rcu(entry, hhd, hlist) { 1168 hlist_for_each_entry_rcu_notrace(entry, hhd, hlist) {
1169 if (entry->ip == ip) 1169 if (entry->ip == ip)
1170 return entry; 1170 return entry;
1171 } 1171 }
@@ -1422,8 +1422,8 @@ ftrace_ops_test(struct ftrace_ops *ops, unsigned long ip)
1422 struct ftrace_hash *notrace_hash; 1422 struct ftrace_hash *notrace_hash;
1423 int ret; 1423 int ret;
1424 1424
1425 filter_hash = rcu_dereference_raw(ops->filter_hash); 1425 filter_hash = rcu_dereference_raw_notrace(ops->filter_hash);
1426 notrace_hash = rcu_dereference_raw(ops->notrace_hash); 1426 notrace_hash = rcu_dereference_raw_notrace(ops->notrace_hash);
1427 1427
1428 if ((ftrace_hash_empty(filter_hash) || 1428 if ((ftrace_hash_empty(filter_hash) ||
1429 ftrace_lookup_ip(filter_hash, ip)) && 1429 ftrace_lookup_ip(filter_hash, ip)) &&
@@ -2920,7 +2920,7 @@ static void function_trace_probe_call(unsigned long ip, unsigned long parent_ip,
2920 * on the hash. rcu_read_lock is too dangerous here. 2920 * on the hash. rcu_read_lock is too dangerous here.
2921 */ 2921 */
2922 preempt_disable_notrace(); 2922 preempt_disable_notrace();
2923 hlist_for_each_entry_rcu(entry, hhd, node) { 2923 hlist_for_each_entry_rcu_notrace(entry, hhd, node) {
2924 if (entry->ip == ip) 2924 if (entry->ip == ip)
2925 entry->ops->func(ip, parent_ip, &entry->data); 2925 entry->ops->func(ip, parent_ip, &entry->data);
2926 } 2926 }
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index b59aea2c48c2..e444ff88f0a4 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -620,6 +620,9 @@ int ring_buffer_poll_wait(struct ring_buffer *buffer, int cpu,
620 if (cpu == RING_BUFFER_ALL_CPUS) 620 if (cpu == RING_BUFFER_ALL_CPUS)
621 work = &buffer->irq_work; 621 work = &buffer->irq_work;
622 else { 622 else {
623 if (!cpumask_test_cpu(cpu, buffer->cpumask))
624 return -EINVAL;
625
623 cpu_buffer = buffer->buffers[cpu]; 626 cpu_buffer = buffer->buffers[cpu];
624 work = &cpu_buffer->irq_work; 627 work = &cpu_buffer->irq_work;
625 } 628 }
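
ring_buffer_poll_wait() gains a cpumask_test_cpu() check so that a caller passing a CPU the buffer was never allocated for gets -EINVAL instead of indexing buffer->buffers[] out of range. The general shape, validate the index against the recorded set before dereferencing, in a small userspace sketch with invented names (struct buffers, present_mask, buffer_read are illustration-only):

#include <stdio.h>

#define NBUF 4

struct buffers {
    unsigned int present_mask;          /* bit n set => slot n was allocated */
    int data[NBUF];
};

static int buffer_read(struct buffers *b, int cpu, int *out)
{
    if (cpu < 0 || cpu >= NBUF || !(b->present_mask & (1u << cpu)))
        return -22;                     /* -EINVAL, as in the patch */

    *out = b->data[cpu];
    return 0;
}

int main(void)
{
    struct buffers b = { .present_mask = 0x5, .data = { 10, 0, 30, 0 } };
    int v;

    printf("%d\n", buffer_read(&b, 2, &v));  /* 0, v == 30 */
    printf("%d\n", buffer_read(&b, 1, &v));  /* -22: slot never allocated */
    return 0;
}
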
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index ae6fa2d1cdf7..e71a8be4a6ee 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -652,8 +652,6 @@ static struct {
652 ARCH_TRACE_CLOCKS 652 ARCH_TRACE_CLOCKS
653}; 653};
654 654
655int trace_clock_id;
656
657/* 655/*
658 * trace_parser_get_init - gets the buffer for trace parser 656 * trace_parser_get_init - gets the buffer for trace parser
659 */ 657 */
@@ -843,7 +841,15 @@ __update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu)
843 841
844 memcpy(max_data->comm, tsk->comm, TASK_COMM_LEN); 842 memcpy(max_data->comm, tsk->comm, TASK_COMM_LEN);
845 max_data->pid = tsk->pid; 843 max_data->pid = tsk->pid;
846 max_data->uid = task_uid(tsk); 844 /*
845 * If tsk == current, then use current_uid(), as that does not use
846 * RCU. The irq tracer can be called out of RCU scope.
847 */
848 if (tsk == current)
849 max_data->uid = current_uid();
850 else
851 max_data->uid = task_uid(tsk);
852
847 max_data->nice = tsk->static_prio - 20 - MAX_RT_PRIO; 853 max_data->nice = tsk->static_prio - 20 - MAX_RT_PRIO;
848 max_data->policy = tsk->policy; 854 max_data->policy = tsk->policy;
849 max_data->rt_priority = tsk->rt_priority; 855 max_data->rt_priority = tsk->rt_priority;
@@ -2818,7 +2824,7 @@ __tracing_open(struct inode *inode, struct file *file, bool snapshot)
2818 iter->iter_flags |= TRACE_FILE_ANNOTATE; 2824 iter->iter_flags |= TRACE_FILE_ANNOTATE;
2819 2825
2820 /* Output in nanoseconds only if we are using a clock in nanoseconds. */ 2826 /* Output in nanoseconds only if we are using a clock in nanoseconds. */
2821 if (trace_clocks[trace_clock_id].in_ns) 2827 if (trace_clocks[tr->clock_id].in_ns)
2822 iter->iter_flags |= TRACE_FILE_TIME_IN_NS; 2828 iter->iter_flags |= TRACE_FILE_TIME_IN_NS;
2823 2829
2824 /* stop the trace while dumping if we are not opening "snapshot" */ 2830 /* stop the trace while dumping if we are not opening "snapshot" */
@@ -3817,7 +3823,7 @@ static int tracing_open_pipe(struct inode *inode, struct file *filp)
3817 iter->iter_flags |= TRACE_FILE_LAT_FMT; 3823 iter->iter_flags |= TRACE_FILE_LAT_FMT;
3818 3824
3819 /* Output in nanoseconds only if we are using a clock in nanoseconds. */ 3825 /* Output in nanoseconds only if we are using a clock in nanoseconds. */
3820 if (trace_clocks[trace_clock_id].in_ns) 3826 if (trace_clocks[tr->clock_id].in_ns)
3821 iter->iter_flags |= TRACE_FILE_TIME_IN_NS; 3827 iter->iter_flags |= TRACE_FILE_TIME_IN_NS;
3822 3828
3823 iter->cpu_file = tc->cpu; 3829 iter->cpu_file = tc->cpu;
@@ -5087,7 +5093,7 @@ tracing_stats_read(struct file *filp, char __user *ubuf,
5087 cnt = ring_buffer_bytes_cpu(trace_buf->buffer, cpu); 5093 cnt = ring_buffer_bytes_cpu(trace_buf->buffer, cpu);
5088 trace_seq_printf(s, "bytes: %ld\n", cnt); 5094 trace_seq_printf(s, "bytes: %ld\n", cnt);
5089 5095
5090 if (trace_clocks[trace_clock_id].in_ns) { 5096 if (trace_clocks[tr->clock_id].in_ns) {
5091 /* local or global for trace_clock */ 5097 /* local or global for trace_clock */
5092 t = ns2usecs(ring_buffer_oldest_event_ts(trace_buf->buffer, cpu)); 5098 t = ns2usecs(ring_buffer_oldest_event_ts(trace_buf->buffer, cpu));
5093 usec_rem = do_div(t, USEC_PER_SEC); 5099 usec_rem = do_div(t, USEC_PER_SEC);
@@ -6216,10 +6222,15 @@ __init static int tracer_alloc_buffers(void)
6216 6222
6217 trace_init_cmdlines(); 6223 trace_init_cmdlines();
6218 6224
6219 register_tracer(&nop_trace); 6225 /*
6220 6226 * register_tracer() might reference current_trace, so it
6227 * needs to be set before we register anything. This is
6228 * just a bootstrap of current_trace anyway.
6229 */
6221 global_trace.current_trace = &nop_trace; 6230 global_trace.current_trace = &nop_trace;
6222 6231
6232 register_tracer(&nop_trace);
6233
6223 /* All seems OK, enable tracing */ 6234 /* All seems OK, enable tracing */
6224 tracing_disabled = 0; 6235 tracing_disabled = 0;
6225 6236
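
trace.c drops the global trace_clock_id and reads the clock from the trace_array instance (tr->clock_id) instead, so separate trace instances can use different clocks; it also sets global_trace.current_trace before register_tracer(), since registration may dereference it. A tiny sketch of the "global knob becomes a per-instance field" refactor, with hypothetical names (clock_desc, trace_instance, output_in_ns):

#include <stdio.h>
#include <stdbool.h>

struct clock_desc { const char *name; bool in_ns; };

static const struct clock_desc clocks[] = {
    { "local",   true  },
    { "jiffies", false },
};

struct trace_instance {
    int clock_id;                       /* per-instance, instead of one global */
};

static bool output_in_ns(const struct trace_instance *tr)
{
    return clocks[tr->clock_id].in_ns;
}

int main(void)
{
    struct trace_instance a = { .clock_id = 0 };
    struct trace_instance b = { .clock_id = 1 };

    printf("%d %d\n", output_in_ns(&a), output_in_ns(&b)); /* 1 0 */
    return 0;
}
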
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 711ca7d3e7f1..20572ed88c5c 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -700,8 +700,6 @@ enum print_line_t print_trace_line(struct trace_iterator *iter);
700 700
701extern unsigned long trace_flags; 701extern unsigned long trace_flags;
702 702
703extern int trace_clock_id;
704
705/* Standard output formatting function used for function return traces */ 703/* Standard output formatting function used for function return traces */
706#ifdef CONFIG_FUNCTION_GRAPH_TRACER 704#ifdef CONFIG_FUNCTION_GRAPH_TRACER
707 705
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 7a0cf68027cc..27963e2bf4bf 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -2072,8 +2072,10 @@ event_enable_func(struct ftrace_hash *hash,
2072 out_reg: 2072 out_reg:
2073 /* Don't let event modules unload while probe registered */ 2073 /* Don't let event modules unload while probe registered */
2074 ret = try_module_get(file->event_call->mod); 2074 ret = try_module_get(file->event_call->mod);
2075 if (!ret) 2075 if (!ret) {
2076 ret = -EBUSY;
2076 goto out_free; 2077 goto out_free;
2078 }
2077 2079
2078 ret = __ftrace_event_enable_disable(file, 1, 1); 2080 ret = __ftrace_event_enable_disable(file, 1, 1);
2079 if (ret < 0) 2081 if (ret < 0)
diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c
index 55e2cf66967b..2901e3b88590 100644
--- a/kernel/trace/trace_selftest.c
+++ b/kernel/trace/trace_selftest.c
@@ -1159,7 +1159,7 @@ trace_selftest_startup_branch(struct tracer *trace, struct trace_array *tr)
1159 /* stop the tracing. */ 1159 /* stop the tracing. */
1160 tracing_stop(); 1160 tracing_stop();
1161 /* check the trace buffer */ 1161 /* check the trace buffer */
1162 ret = trace_test_buffer(tr, &count); 1162 ret = trace_test_buffer(&tr->trace_buffer, &count);
1163 trace->reset(tr); 1163 trace->reset(tr);
1164 tracing_start(); 1164 tracing_start();
1165 1165
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 1ae602809efb..ee8e29a2320c 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -296,7 +296,7 @@ static DEFINE_HASHTABLE(unbound_pool_hash, UNBOUND_POOL_HASH_ORDER);
296static struct workqueue_attrs *unbound_std_wq_attrs[NR_STD_WORKER_POOLS]; 296static struct workqueue_attrs *unbound_std_wq_attrs[NR_STD_WORKER_POOLS];
297 297
298struct workqueue_struct *system_wq __read_mostly; 298struct workqueue_struct *system_wq __read_mostly;
299EXPORT_SYMBOL_GPL(system_wq); 299EXPORT_SYMBOL(system_wq);
300struct workqueue_struct *system_highpri_wq __read_mostly; 300struct workqueue_struct *system_highpri_wq __read_mostly;
301EXPORT_SYMBOL_GPL(system_highpri_wq); 301EXPORT_SYMBOL_GPL(system_highpri_wq);
302struct workqueue_struct *system_long_wq __read_mostly; 302struct workqueue_struct *system_long_wq __read_mostly;
@@ -1411,7 +1411,7 @@ bool queue_work_on(int cpu, struct workqueue_struct *wq,
1411 local_irq_restore(flags); 1411 local_irq_restore(flags);
1412 return ret; 1412 return ret;
1413} 1413}
1414EXPORT_SYMBOL_GPL(queue_work_on); 1414EXPORT_SYMBOL(queue_work_on);
1415 1415
1416void delayed_work_timer_fn(unsigned long __data) 1416void delayed_work_timer_fn(unsigned long __data)
1417{ 1417{
@@ -1485,7 +1485,7 @@ bool queue_delayed_work_on(int cpu, struct workqueue_struct *wq,
1485 local_irq_restore(flags); 1485 local_irq_restore(flags);
1486 return ret; 1486 return ret;
1487} 1487}
1488EXPORT_SYMBOL_GPL(queue_delayed_work_on); 1488EXPORT_SYMBOL(queue_delayed_work_on);
1489 1489
1490/** 1490/**
1491 * mod_delayed_work_on - modify delay of or queue a delayed work on specific CPU 1491 * mod_delayed_work_on - modify delay of or queue a delayed work on specific CPU
@@ -2059,6 +2059,7 @@ static bool manage_workers(struct worker *worker)
2059 if (unlikely(!mutex_trylock(&pool->manager_mutex))) { 2059 if (unlikely(!mutex_trylock(&pool->manager_mutex))) {
2060 spin_unlock_irq(&pool->lock); 2060 spin_unlock_irq(&pool->lock);
2061 mutex_lock(&pool->manager_mutex); 2061 mutex_lock(&pool->manager_mutex);
2062 spin_lock_irq(&pool->lock);
2062 ret = true; 2063 ret = true;
2063 } 2064 }
2064 2065
@@ -4904,7 +4905,8 @@ static void __init wq_numa_init(void)
4904 BUG_ON(!tbl); 4905 BUG_ON(!tbl);
4905 4906
4906 for_each_node(node) 4907 for_each_node(node)
4907 BUG_ON(!alloc_cpumask_var_node(&tbl[node], GFP_KERNEL, node)); 4908 BUG_ON(!alloc_cpumask_var_node(&tbl[node], GFP_KERNEL,
4909 node_online(node) ? node : NUMA_NO_NODE));
4908 4910
4909 for_each_possible_cpu(cpu) { 4911 for_each_possible_cpu(cpu) {
4910 node = cpu_to_node(cpu); 4912 node = cpu_to_node(cpu);