From 438145c7ef5c9445f25bb8fc4d52e2c9d11fdc7c Mon Sep 17 00:00:00 2001 From: Jeremy Erickson Date: Fri, 11 Apr 2014 13:24:45 -0400 Subject: Update from 2.6.36 to 2.6.36.4 --- kernel/exit.c | 17 +++++ kernel/fork.c | 1 + kernel/futex.c | 31 ++++---- kernel/irq/proc.c | 2 +- kernel/latencytop.c | 17 +++-- kernel/perf_event.c | 30 +++++++- kernel/pm_qos_params.c | 4 +- kernel/power/hibernate.c | 22 +++--- kernel/power/suspend.c | 5 +- kernel/power/user.c | 4 +- kernel/printk.c | 4 +- kernel/ptrace.c | 2 +- kernel/sched.c | 179 +++++++++++++++++++++++++++++++++++++++++------ kernel/smp.c | 30 ++++++++ kernel/sys.c | 3 +- kernel/timer.c | 8 ++- kernel/trace/trace.c | 10 ++- kernel/user.c | 1 + kernel/watchdog.c | 26 ++++--- 19 files changed, 313 insertions(+), 83 deletions(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index b9d3bc6c21ec..ad7e51488291 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -96,6 +96,14 @@ static void __exit_signal(struct task_struct *tsk) tty = sig->tty; sig->tty = NULL; } else { + /* + * This can only happen if the caller is de_thread(). + * FIXME: this is the temporary hack, we should teach + * posix-cpu-timers to handle this case correctly. + */ + if (unlikely(has_group_leader_pid(tsk))) + posix_cpu_timers_exit_group(tsk); + /* * If there is any task waiting for the group exit * then notify it: @@ -905,6 +913,15 @@ NORET_TYPE void do_exit(long code) if (unlikely(!tsk->pid)) panic("Attempted to kill the idle task!"); + /* + * If do_exit is called because this processes oopsed, it's possible + * that get_fs() was left as KERNEL_DS, so reset it to USER_DS before + * continuing. Amongst other possible reasons, this is to prevent + * mm_release()->clear_child_tid() from writing to a user-controlled + * kernel address. + */ + set_fs(USER_DS); + tracehook_report_exit(&code); validate_creds_for_do_exit(tsk); diff --git a/kernel/fork.c b/kernel/fork.c index ab7f29d906c7..d01a7514125b 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -279,6 +279,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig) setup_thread_stack(tsk, orig); clear_user_return_notifier(tsk); + clear_tsk_need_resched(tsk); stackend = end_of_stack(tsk); *stackend = STACK_END_MAGIC; /* for overflow detection */ diff --git a/kernel/futex.c b/kernel/futex.c index 6a3a5fa1526d..e328f574c97c 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -1363,7 +1363,6 @@ static inline struct futex_hash_bucket *queue_lock(struct futex_q *q) { struct futex_hash_bucket *hb; - get_futex_key_refs(&q->key); hb = hash_futex(&q->key); q->lock_ptr = &hb->lock; @@ -1375,7 +1374,6 @@ static inline void queue_unlock(struct futex_q *q, struct futex_hash_bucket *hb) { spin_unlock(&hb->lock); - drop_futex_key_refs(&q->key); } /** @@ -1480,8 +1478,6 @@ static void unqueue_me_pi(struct futex_q *q) q->pi_state = NULL; spin_unlock(q->lock_ptr); - - drop_futex_key_refs(&q->key); } /* @@ -1812,7 +1808,10 @@ static int futex_wait(u32 __user *uaddr, int fshared, } retry: - /* Prepare to wait on uaddr. */ + /* + * Prepare to wait on uaddr. On success, holds hb lock and increments + * q.key refs. + */ ret = futex_wait_setup(uaddr, val, fshared, &q, &hb); if (ret) goto out; @@ -1822,24 +1821,23 @@ retry: /* If we were woken (and unqueued), we succeeded, whatever. */ ret = 0; + /* unqueue_me() drops q.key ref */ if (!unqueue_me(&q)) - goto out_put_key; + goto out; ret = -ETIMEDOUT; if (to && !to->task) - goto out_put_key; + goto out; /* * We expect signal_pending(current), but we might be the * victim of a spurious wakeup as well. */ - if (!signal_pending(current)) { - put_futex_key(fshared, &q.key); + if (!signal_pending(current)) goto retry; - } ret = -ERESTARTSYS; if (!abs_time) - goto out_put_key; + goto out; restart = ¤t_thread_info()->restart_block; restart->fn = futex_wait_restart; @@ -1856,8 +1854,6 @@ retry: ret = -ERESTART_RESTARTBLOCK; -out_put_key: - put_futex_key(fshared, &q.key); out: if (to) { hrtimer_cancel(&to->timer); @@ -2236,7 +2232,10 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared, q.rt_waiter = &rt_waiter; q.requeue_pi_key = &key2; - /* Prepare to wait on uaddr. */ + /* + * Prepare to wait on uaddr. On success, increments q.key (key1) ref + * count. + */ ret = futex_wait_setup(uaddr, val, fshared, &q, &hb); if (ret) goto out_key2; @@ -2254,7 +2253,9 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared, * In order for us to be here, we know our q.key == key2, and since * we took the hb->lock above, we also know that futex_requeue() has * completed and we no longer have to concern ourselves with a wakeup - * race with the atomic proxy lock acquition by the requeue code. + * race with the atomic proxy lock acquisition by the requeue code. The + * futex_requeue dropped our key1 reference and incremented our key2 + * reference count. */ /* Check if the requeue code acquired the second futex for us. */ diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c index 09a2ee540bd2..345e0b75fe1e 100644 --- a/kernel/irq/proc.c +++ b/kernel/irq/proc.c @@ -214,7 +214,7 @@ static int irq_spurious_proc_show(struct seq_file *m, void *v) static int irq_spurious_proc_open(struct inode *inode, struct file *file) { - return single_open(file, irq_spurious_proc_show, NULL); + return single_open(file, irq_spurious_proc_show, PDE(inode)->data); } static const struct file_operations irq_spurious_proc_fops = { diff --git a/kernel/latencytop.c b/kernel/latencytop.c index 877fb306d415..17110a4a4fc2 100644 --- a/kernel/latencytop.c +++ b/kernel/latencytop.c @@ -194,14 +194,7 @@ __account_scheduler_latency(struct task_struct *tsk, int usecs, int inter) account_global_scheduler_latency(tsk, &lat); - /* - * short term hack; if we're > 32 we stop; future we recycle: - */ - tsk->latency_record_count++; - if (tsk->latency_record_count >= LT_SAVECOUNT) - goto out_unlock; - - for (i = 0; i < LT_SAVECOUNT; i++) { + for (i = 0; i < tsk->latency_record_count; i++) { struct latency_record *mylat; int same = 1; @@ -227,8 +220,14 @@ __account_scheduler_latency(struct task_struct *tsk, int usecs, int inter) } } + /* + * short term hack; if we're > 32 we stop; future we recycle: + */ + if (tsk->latency_record_count >= LT_SAVECOUNT) + goto out_unlock; + /* Allocated a new one: */ - i = tsk->latency_record_count; + i = tsk->latency_record_count++; memcpy(&tsk->latency_record[i], &lat, sizeof(struct latency_record)); out_unlock: diff --git a/kernel/perf_event.c b/kernel/perf_event.c index b98bed3d8182..65b09a836cc3 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -1620,8 +1620,12 @@ static void rotate_ctx(struct perf_event_context *ctx) { raw_spin_lock(&ctx->lock); - /* Rotate the first entry last of non-pinned groups */ - list_rotate_left(&ctx->flexible_groups); + /* + * Rotate the first entry last of non-pinned groups. Rotation might be + * disabled by the inheritance code. + */ + if (!ctx->rotate_disable) + list_rotate_left(&ctx->flexible_groups); raw_spin_unlock(&ctx->lock); } @@ -1773,7 +1777,13 @@ static u64 perf_event_read(struct perf_event *event) unsigned long flags; raw_spin_lock_irqsave(&ctx->lock, flags); - update_context_time(ctx); + /* + * may read while context is not active + * (e.g., thread is blocked), in that case + * we cannot update context time + */ + if (ctx->is_active) + update_context_time(ctx); update_event_times(event); raw_spin_unlock_irqrestore(&ctx->lock, flags); } @@ -5616,6 +5626,7 @@ int perf_event_init_task(struct task_struct *child) struct perf_event *event; struct task_struct *parent = current; int inherited_all = 1; + unsigned long flags; int ret = 0; child->perf_event_ctxp = NULL; @@ -5656,6 +5667,15 @@ int perf_event_init_task(struct task_struct *child) break; } + /* + * We can't hold ctx->lock when iterating the ->flexible_group list due + * to allocations, but we need to prevent rotation because + * rotate_ctx() will change the list from interrupt context. + */ + raw_spin_lock_irqsave(&parent_ctx->lock, flags); + parent_ctx->rotate_disable = 1; + raw_spin_unlock_irqrestore(&parent_ctx->lock, flags); + list_for_each_entry(event, &parent_ctx->flexible_groups, group_entry) { ret = inherit_task_group(event, parent, parent_ctx, child, &inherited_all); @@ -5663,6 +5683,10 @@ int perf_event_init_task(struct task_struct *child) break; } + raw_spin_lock_irqsave(&parent_ctx->lock, flags); + parent_ctx->rotate_disable = 0; + raw_spin_unlock_irqrestore(&parent_ctx->lock, flags); + child_ctx = child->perf_event_ctxp; if (child_ctx && inherited_all) { diff --git a/kernel/pm_qos_params.c b/kernel/pm_qos_params.c index 645e541a45f6..0da2837416eb 100644 --- a/kernel/pm_qos_params.c +++ b/kernel/pm_qos_params.c @@ -120,10 +120,10 @@ static inline int pm_qos_get_value(struct pm_qos_object *o) switch (o->type) { case PM_QOS_MIN: - return plist_last(&o->requests)->prio; + return plist_first(&o->requests)->prio; case PM_QOS_MAX: - return plist_first(&o->requests)->prio; + return plist_last(&o->requests)->prio; default: /* runtime check for not using enum */ diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index 8dc31e02ae12..7a931a90e4a2 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -326,7 +326,6 @@ static int create_image(int platform_mode) int hibernation_snapshot(int platform_mode) { int error; - gfp_t saved_mask; error = platform_begin(platform_mode); if (error) @@ -338,7 +337,7 @@ int hibernation_snapshot(int platform_mode) goto Close; suspend_console(); - saved_mask = clear_gfp_allowed_mask(GFP_IOFS); + pm_restrict_gfp_mask(); error = dpm_suspend_start(PMSG_FREEZE); if (error) goto Recover_platform; @@ -347,7 +346,10 @@ int hibernation_snapshot(int platform_mode) goto Recover_platform; error = create_image(platform_mode); - /* Control returns here after successful restore */ + /* + * Control returns here (1) after the image has been created or the + * image creation has failed and (2) after a successful restore. + */ Resume_devices: /* We may need to release the preallocated image pages here. */ @@ -356,7 +358,10 @@ int hibernation_snapshot(int platform_mode) dpm_resume_end(in_suspend ? (error ? PMSG_RECOVER : PMSG_THAW) : PMSG_RESTORE); - set_gfp_allowed_mask(saved_mask); + + if (error || !in_suspend) + pm_restore_gfp_mask(); + resume_console(); Close: platform_end(platform_mode); @@ -451,17 +456,16 @@ static int resume_target_kernel(bool platform_mode) int hibernation_restore(int platform_mode) { int error; - gfp_t saved_mask; pm_prepare_console(); suspend_console(); - saved_mask = clear_gfp_allowed_mask(GFP_IOFS); + pm_restrict_gfp_mask(); error = dpm_suspend_start(PMSG_QUIESCE); if (!error) { error = resume_target_kernel(platform_mode); dpm_resume_end(PMSG_RECOVER); } - set_gfp_allowed_mask(saved_mask); + pm_restore_gfp_mask(); resume_console(); pm_restore_console(); return error; @@ -475,7 +479,6 @@ int hibernation_restore(int platform_mode) int hibernation_platform_enter(void) { int error; - gfp_t saved_mask; if (!hibernation_ops) return -ENOSYS; @@ -491,7 +494,6 @@ int hibernation_platform_enter(void) entering_platform_hibernation = true; suspend_console(); - saved_mask = clear_gfp_allowed_mask(GFP_IOFS); error = dpm_suspend_start(PMSG_HIBERNATE); if (error) { if (hibernation_ops->recover) @@ -535,7 +537,6 @@ int hibernation_platform_enter(void) Resume_devices: entering_platform_hibernation = false; dpm_resume_end(PMSG_RESTORE); - set_gfp_allowed_mask(saved_mask); resume_console(); Close: @@ -643,6 +644,7 @@ int hibernate(void) swsusp_free(); if (!error) power_down(); + pm_restore_gfp_mask(); } else { pr_debug("PM: Image restored successfully.\n"); } diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index 7335952ee473..ecf770509d0d 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -197,7 +197,6 @@ static int suspend_enter(suspend_state_t state) int suspend_devices_and_enter(suspend_state_t state) { int error; - gfp_t saved_mask; if (!suspend_ops) return -ENOSYS; @@ -208,7 +207,7 @@ int suspend_devices_and_enter(suspend_state_t state) goto Close; } suspend_console(); - saved_mask = clear_gfp_allowed_mask(GFP_IOFS); + pm_restrict_gfp_mask(); suspend_test_start(); error = dpm_suspend_start(PMSG_SUSPEND); if (error) { @@ -225,7 +224,7 @@ int suspend_devices_and_enter(suspend_state_t state) suspend_test_start(); dpm_resume_end(PMSG_RESUME); suspend_test_finish("resume devices"); - set_gfp_allowed_mask(saved_mask); + pm_restore_gfp_mask(); resume_console(); Close: if (suspend_ops->end) diff --git a/kernel/power/user.c b/kernel/power/user.c index e819e17877ca..c36c3b9e8a84 100644 --- a/kernel/power/user.c +++ b/kernel/power/user.c @@ -137,7 +137,7 @@ static int snapshot_release(struct inode *inode, struct file *filp) free_all_swap_pages(data->swap); if (data->frozen) thaw_processes(); - pm_notifier_call_chain(data->mode == O_WRONLY ? + pm_notifier_call_chain(data->mode == O_RDONLY ? PM_POST_HIBERNATION : PM_POST_RESTORE); atomic_inc(&snapshot_device_available); @@ -263,6 +263,7 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd, case SNAPSHOT_UNFREEZE: if (!data->frozen || data->ready) break; + pm_restore_gfp_mask(); thaw_processes(); usermodehelper_enable(); data->frozen = 0; @@ -275,6 +276,7 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd, error = -EPERM; break; } + pm_restore_gfp_mask(); error = hibernation_snapshot(data->platform_support); if (!error) error = put_user(in_suspend, (int __user *)arg); diff --git a/kernel/printk.c b/kernel/printk.c index 9dc8ea140426..2dc36b49d2d2 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -1072,13 +1072,15 @@ void printk_tick(void) int printk_needs_cpu(int cpu) { + if (unlikely(cpu_is_offline(cpu))) + printk_tick(); return per_cpu(printk_pending, cpu); } void wake_up_klogd(void) { if (!trace_override && waitqueue_active(&log_wait)) - __raw_get_cpu_var(printk_pending) = 1; + this_cpu_write(printk_pending, 1); } /** diff --git a/kernel/ptrace.c b/kernel/ptrace.c index f34d798ef4a2..bf768d739e7d 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -313,7 +313,7 @@ int ptrace_detach(struct task_struct *child, unsigned int data) child->exit_code = data; dead = __ptrace_detach(current, child); if (!child->exit_state) - wake_up_process(child); + wake_up_state(child, TASK_TRACED | TASK_STOPPED); } write_unlock_irq(&tasklist_lock); diff --git a/kernel/sched.c b/kernel/sched.c index 71907dcd8112..297aed09d3ce 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -741,7 +741,7 @@ sched_feat_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) { char buf[64]; - char *cmp = buf; + char *cmp; int neg = 0; int i; @@ -752,6 +752,7 @@ sched_feat_write(struct file *filp, const char __user *ubuf, return -EFAULT; buf[cnt] = 0; + cmp = strstrip(buf); if (strncmp(buf, "NO_", 3) == 0) { neg = 1; @@ -759,9 +760,7 @@ sched_feat_write(struct file *filp, const char __user *ubuf, } for (i = 0; sched_feat_names[i]; i++) { - int len = strlen(sched_feat_names[i]); - - if (strncmp(cmp, sched_feat_names[i], len) == 0) { + if (strcmp(cmp, sched_feat_names[i]) == 0) { if (neg) sysctl_sched_features &= ~(1UL << i); else @@ -1877,12 +1876,6 @@ static void dec_nr_running(struct rq *rq) static void set_load_weight(struct task_struct *p) { - if (task_has_rt_policy(p)) { - p->se.load.weight = 0; - p->se.load.inv_weight = WMULT_CONST; - return; - } - /* * SCHED_IDLE tasks get minimal weight: */ @@ -3011,6 +3004,15 @@ static long calc_load_fold_active(struct rq *this_rq) return delta; } +static unsigned long +calc_load(unsigned long load, unsigned long exp, unsigned long active) +{ + load *= exp; + load += active * (FIXED_1 - exp); + load += 1UL << (FSHIFT - 1); + return load >> FSHIFT; +} + #ifdef CONFIG_NO_HZ /* * For NO_HZ we delay the active fold to the next LOAD_FREQ update. @@ -3040,6 +3042,128 @@ static long calc_load_fold_idle(void) return delta; } + +/** + * fixed_power_int - compute: x^n, in O(log n) time + * + * @x: base of the power + * @frac_bits: fractional bits of @x + * @n: power to raise @x to. + * + * By exploiting the relation between the definition of the natural power + * function: x^n := x*x*...*x (x multiplied by itself for n times), and + * the binary encoding of numbers used by computers: n := \Sum n_i * 2^i, + * (where: n_i \elem {0, 1}, the binary vector representing n), + * we find: x^n := x^(\Sum n_i * 2^i) := \Prod x^(n_i * 2^i), which is + * of course trivially computable in O(log_2 n), the length of our binary + * vector. + */ +static unsigned long +fixed_power_int(unsigned long x, unsigned int frac_bits, unsigned int n) +{ + unsigned long result = 1UL << frac_bits; + + if (n) for (;;) { + if (n & 1) { + result *= x; + result += 1UL << (frac_bits - 1); + result >>= frac_bits; + } + n >>= 1; + if (!n) + break; + x *= x; + x += 1UL << (frac_bits - 1); + x >>= frac_bits; + } + + return result; +} + +/* + * a1 = a0 * e + a * (1 - e) + * + * a2 = a1 * e + a * (1 - e) + * = (a0 * e + a * (1 - e)) * e + a * (1 - e) + * = a0 * e^2 + a * (1 - e) * (1 + e) + * + * a3 = a2 * e + a * (1 - e) + * = (a0 * e^2 + a * (1 - e) * (1 + e)) * e + a * (1 - e) + * = a0 * e^3 + a * (1 - e) * (1 + e + e^2) + * + * ... + * + * an = a0 * e^n + a * (1 - e) * (1 + e + ... + e^n-1) [1] + * = a0 * e^n + a * (1 - e) * (1 - e^n)/(1 - e) + * = a0 * e^n + a * (1 - e^n) + * + * [1] application of the geometric series: + * + * n 1 - x^(n+1) + * S_n := \Sum x^i = ------------- + * i=0 1 - x + */ +static unsigned long +calc_load_n(unsigned long load, unsigned long exp, + unsigned long active, unsigned int n) +{ + + return calc_load(load, fixed_power_int(exp, FSHIFT, n), active); +} + +/* + * NO_HZ can leave us missing all per-cpu ticks calling + * calc_load_account_active(), but since an idle CPU folds its delta into + * calc_load_tasks_idle per calc_load_account_idle(), all we need to do is fold + * in the pending idle delta if our idle period crossed a load cycle boundary. + * + * Once we've updated the global active value, we need to apply the exponential + * weights adjusted to the number of cycles missed. + */ +static void calc_global_nohz(unsigned long ticks) +{ + long delta, active, n; + + if (time_before(jiffies, calc_load_update)) + return; + + /* + * If we crossed a calc_load_update boundary, make sure to fold + * any pending idle changes, the respective CPUs might have + * missed the tick driven calc_load_account_active() update + * due to NO_HZ. + */ + delta = calc_load_fold_idle(); + if (delta) + atomic_long_add(delta, &calc_load_tasks); + + /* + * If we were idle for multiple load cycles, apply them. + */ + if (ticks >= LOAD_FREQ) { + n = ticks / LOAD_FREQ; + + active = atomic_long_read(&calc_load_tasks); + active = active > 0 ? active * FIXED_1 : 0; + + avenrun[0] = calc_load_n(avenrun[0], EXP_1, active, n); + avenrun[1] = calc_load_n(avenrun[1], EXP_5, active, n); + avenrun[2] = calc_load_n(avenrun[2], EXP_15, active, n); + + calc_load_update += n * LOAD_FREQ; + } + + /* + * Its possible the remainder of the above division also crosses + * a LOAD_FREQ period, the regular check in calc_global_load() + * which comes after this will take care of that. + * + * Consider us being 11 ticks before a cycle completion, and us + * sleeping for 4*LOAD_FREQ + 22 ticks, then the above code will + * age us 4 cycles, and the test in calc_global_load() will + * pick up the final one. + */ +} #else static void calc_load_account_idle(struct rq *this_rq) { @@ -3049,6 +3173,10 @@ static inline long calc_load_fold_idle(void) { return 0; } + +static void calc_global_nohz(unsigned long ticks) +{ +} #endif /** @@ -3066,24 +3194,17 @@ void get_avenrun(unsigned long *loads, unsigned long offset, int shift) loads[2] = (avenrun[2] + offset) << shift; } -static unsigned long -calc_load(unsigned long load, unsigned long exp, unsigned long active) -{ - load *= exp; - load += active * (FIXED_1 - exp); - return load >> FSHIFT; -} - /* * calc_load - update the avenrun load estimates 10 ticks after the * CPUs have updated calc_load_tasks. */ -void calc_global_load(void) +void calc_global_load(unsigned long ticks) { - unsigned long upd = calc_load_update + 10; long active; - if (time_before(jiffies, upd)) + calc_global_nohz(ticks); + + if (time_before(jiffies, calc_load_update + 10)) return; active = atomic_long_read(&calc_load_tasks); @@ -3745,7 +3866,6 @@ static void put_prev_task(struct rq *rq, struct task_struct *prev) { if (prev->se.on_rq) update_rq_clock(rq); - rq->skip_clock_update = 0; prev->sched_class->put_prev_task(rq, prev); } @@ -3819,7 +3939,6 @@ need_resched_nonpreemptible: hrtick_clear(rq); raw_spin_lock_irq(&rq->lock); - clear_tsk_need_resched(prev); switch_count = &prev->nivcsw; if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) { @@ -3851,6 +3970,8 @@ need_resched_nonpreemptible: put_prev_task(rq, prev); next = pick_next_task(rq); + clear_tsk_need_resched(prev); + rq->skip_clock_update = 0; if (likely(prev != next)) { sched_info_switch(prev, next); @@ -5435,7 +5556,19 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu) idle->se.exec_start = sched_clock(); cpumask_copy(&idle->cpus_allowed, cpumask_of(cpu)); + /* + * We're having a chicken and egg problem, even though we are + * holding rq->lock, the cpu isn't yet set to this cpu so the + * lockdep check in task_group() will fail. + * + * Similar case to sched_fork(). / Alternatively we could + * use task_rq_lock() here and obtain the other rq->lock. + * + * Silence PROVE_RCU + */ + rcu_read_lock(); __set_task_cpu(idle, cpu); + rcu_read_unlock(); rq->curr = rq->idle = idle; #if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW) diff --git a/kernel/smp.c b/kernel/smp.c index ed6aacfcb7ef..1ba1ba4b42f8 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -194,6 +194,24 @@ void generic_smp_call_function_interrupt(void) list_for_each_entry_rcu(data, &call_function.queue, csd.list) { int refs; + /* + * Since we walk the list without any locks, we might + * see an entry that was completed, removed from the + * list and is in the process of being reused. + * + * We must check that the cpu is in the cpumask before + * checking the refs, and both must be set before + * executing the callback on this cpu. + */ + + if (!cpumask_test_cpu(cpu, data->cpumask)) + continue; + + smp_rmb(); + + if (atomic_read(&data->refs) == 0) + continue; + if (!cpumask_test_and_clear_cpu(cpu, data->cpumask)) continue; @@ -202,6 +220,8 @@ void generic_smp_call_function_interrupt(void) refs = atomic_dec_return(&data->refs); WARN_ON(refs < 0); if (!refs) { + WARN_ON(!cpumask_empty(data->cpumask)); + raw_spin_lock(&call_function.lock); list_del_rcu(&data->csd.list); raw_spin_unlock(&call_function.lock); @@ -453,11 +473,21 @@ void smp_call_function_many(const struct cpumask *mask, data = &__get_cpu_var(cfd_data); csd_lock(&data->csd); + BUG_ON(atomic_read(&data->refs) || !cpumask_empty(data->cpumask)); data->csd.func = func; data->csd.info = info; cpumask_and(data->cpumask, mask, cpu_online_mask); cpumask_clear_cpu(this_cpu, data->cpumask); + + /* + * To ensure the interrupt handler gets an complete view + * we order the cpumask and refs writes and order the read + * of them in the interrupt handler. In addition we may + * only clear our own cpu bit from the mask. + */ + smp_wmb(); + atomic_set(&data->refs, cpumask_weight(data->cpumask)); raw_spin_lock_irqsave(&call_function.lock, flags); diff --git a/kernel/sys.c b/kernel/sys.c index 7f5a0cd296a9..66136ca33a7b 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -1377,7 +1377,8 @@ static int check_prlimit_permission(struct task_struct *task) const struct cred *cred = current_cred(), *tcred; tcred = __task_cred(task); - if ((cred->uid != tcred->euid || + if (current != task && + (cred->uid != tcred->euid || cred->uid != tcred->suid || cred->uid != tcred->uid || cred->gid != tcred->egid || diff --git a/kernel/timer.c b/kernel/timer.c index 97bf05baade7..102ad370dddb 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -1252,6 +1252,12 @@ unsigned long get_next_timer_interrupt(unsigned long now) struct tvec_base *base = __get_cpu_var(tvec_bases); unsigned long expires; + /* + * Pretend that there is no timer pending if the cpu is offline. + * Possible pending timers will be migrated later to an active cpu. + */ + if (cpu_is_offline(smp_processor_id())) + return now + NEXT_TIMER_MAX_DELTA; spin_lock(&base->lock); if (time_before_eq(base->next_timer, base->timer_jiffies)) base->next_timer = __next_timer_interrupt(base); @@ -1316,7 +1322,7 @@ void do_timer(unsigned long ticks) { jiffies_64 += ticks; update_wall_time(); - calc_global_load(); + calc_global_load(ticks); } #ifdef __ARCH_WANT_SYS_ALARM diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 9ec59f541156..7702f5aecd07 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -2320,11 +2320,19 @@ tracing_write_stub(struct file *filp, const char __user *ubuf, return count; } +static loff_t tracing_seek(struct file *file, loff_t offset, int origin) +{ + if (file->f_mode & FMODE_READ) + return seq_lseek(file, offset, origin); + else + return 0; +} + static const struct file_operations tracing_fops = { .open = tracing_open, .read = seq_read, .write = tracing_write_stub, - .llseek = seq_lseek, + .llseek = tracing_seek, .release = tracing_release, }; diff --git a/kernel/user.c b/kernel/user.c index 7e72614b736d..8ce395f74d47 100644 --- a/kernel/user.c +++ b/kernel/user.c @@ -157,6 +157,7 @@ struct user_struct *alloc_uid(struct user_namespace *ns, uid_t uid) spin_lock_irq(&uidhash_lock); up = uid_hash_find(uid, hashent); if (up) { + put_user_ns(ns); key_put(new->uid_keyring); key_put(new->session_keyring); kmem_cache_free(uid_cachep, new); diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 7f9c3c52ecc1..e359b2e7e7d5 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -377,7 +377,8 @@ static int watchdog_nmi_enable(int cpu) goto out_save; } - printk(KERN_ERR "NMI watchdog failed to create perf event on cpu%i: %p\n", cpu, event); + printk(KERN_ERR "NMI watchdog disabled for cpu%i: unable to create perf event: %ld\n", + cpu, PTR_ERR(event)); return -1; /* success path */ @@ -440,9 +441,6 @@ static int watchdog_enable(int cpu) wake_up_process(p); } - /* if any cpu succeeds, watchdog is considered enabled for the system */ - watchdog_enabled = 1; - return 0; } @@ -470,12 +468,16 @@ static void watchdog_disable(int cpu) static void watchdog_enable_all_cpus(void) { int cpu; - int result = 0; + + watchdog_enabled = 0; for_each_online_cpu(cpu) - result += watchdog_enable(cpu); + if (!watchdog_enable(cpu)) + /* if any cpu succeeds, watchdog is considered + enabled for the system */ + watchdog_enabled = 1; - if (result) + if (!watchdog_enabled) printk(KERN_ERR "watchdog: failed to be enabled on some cpus\n"); } @@ -503,10 +505,12 @@ int proc_dowatchdog_enabled(struct ctl_table *table, int write, { proc_dointvec(table, write, buffer, length, ppos); - if (watchdog_enabled) - watchdog_enable_all_cpus(); - else - watchdog_disable_all_cpus(); + if (write) { + if (watchdog_enabled) + watchdog_enable_all_cpus(); + else + watchdog_disable_all_cpus(); + } return 0; } -- cgit v1.2.2