author    Ingo Molnar <mingo@elte.hu>    2009-11-04 05:54:15 -0500
committer Ingo Molnar <mingo@elte.hu>    2009-11-04 05:59:45 -0500
commit    a2e71271535fde493c32803b1f34789f97efcb5e (patch)
tree      90d7139bea2f49e947f27af92614fa6eca50b64d /kernel
parent    6d7aa9d721c8c640066142fd9534afcdf68d7f9d (diff)
parent    b419148e567728f6af0c3b01965c1cc141e3e13a (diff)
Merge commit 'v2.6.32-rc6' into perf/core
Conflicts:
tools/perf/Makefile
Merge reason: Resolve the conflict, merge to upstream and merge in
perf fixes so we can add a dependent patch.
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Diffstat (limited to 'kernel')
-rw-r--r--  kernel/cgroup.c              8
-rw-r--r--  kernel/exit.c                4
-rw-r--r--  kernel/fork.c                2
-rw-r--r--  kernel/futex.c              20
-rw-r--r--  kernel/params.c             17
-rw-r--r--  kernel/perf_event.c         66
-rw-r--r--  kernel/power/hibernate.c    11
-rw-r--r--  kernel/power/suspend_test.c  5
-rw-r--r--  kernel/power/swap.c         43
-rw-r--r--  kernel/rcutree.c            44
-rw-r--r--  kernel/rcutree.h            10
-rw-r--r--  kernel/rcutree_plugin.h     46
-rw-r--r--  kernel/sched.c              22
-rw-r--r--  kernel/sched_fair.c         27
-rw-r--r--  kernel/sys.c                25
-rw-r--r--  kernel/sysctl_check.c        2
-rw-r--r--  kernel/trace/ftrace.c        2
-rw-r--r--  kernel/trace/ring_buffer.c  12
-rw-r--r--  kernel/trace/trace.c         8
-rw-r--r--  kernel/trace/trace_output.c  5
-rw-r--r--  kernel/workqueue.c          23
21 files changed, 266 insertions(+), 136 deletions(-)
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index ca83b73fba19..0249f4be9b5c 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -1710,14 +1710,13 @@ static ssize_t cgroup_write_X64(struct cgroup *cgrp, struct cftype *cft,
                return -EFAULT;

        buffer[nbytes] = 0;     /* nul-terminate */
-       strstrip(buffer);
        if (cft->write_u64) {
-               u64 val = simple_strtoull(buffer, &end, 0);
+               u64 val = simple_strtoull(strstrip(buffer), &end, 0);
                if (*end)
                        return -EINVAL;
                retval = cft->write_u64(cgrp, cft, val);
        } else {
-               s64 val = simple_strtoll(buffer, &end, 0);
+               s64 val = simple_strtoll(strstrip(buffer), &end, 0);
                if (*end)
                        return -EINVAL;
                retval = cft->write_s64(cgrp, cft, val);
@@ -1753,8 +1752,7 @@ static ssize_t cgroup_write_string(struct cgroup *cgrp, struct cftype *cft,
        }

        buffer[nbytes] = 0;     /* nul-terminate */
-       strstrip(buffer);
-       retval = cft->write_string(cgrp, cft, buffer);
+       retval = cft->write_string(cgrp, cft, strstrip(buffer));
        if (!retval)
                retval = nbytes;
 out:
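Note: this cleanup leans on the fact that the kernel's strstrip() trims trailing whitespace in place and returns a pointer past any leading whitespace, which is what lets the separate call above be folded directly into the strtoull/strtoll and write_string() arguments. A minimal userspace sketch of that contract (the name is hypothetical, to avoid clashing with anything real):

    #include <ctype.h>
    #include <stdio.h>
    #include <string.h>

    /* Userspace sketch of strstrip(): trim trailing blanks in place and
     * return a pointer past the leading blanks, so calls can be chained. */
    static char *strstrip_demo(char *s)
    {
            size_t len = strlen(s);

            while (len && isspace((unsigned char)s[len - 1]))
                    s[--len] = '\0';
            while (*s && isspace((unsigned char)*s))
                    s++;
            return s;
    }

    int main(void)
    {
            char buf[] = "  42\n";

            printf("[%s]\n", strstrip_demo(buf));   /* prints [42] */
            return 0;
    }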
diff --git a/kernel/exit.c b/kernel/exit.c
index e61891f80123..f7864ac2ecc1 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -359,10 +359,8 @@ void __set_special_pids(struct pid *pid)
 {
        struct task_struct *curr = current->group_leader;

-       if (task_session(curr) != pid) {
+       if (task_session(curr) != pid)
                change_pid(curr, PIDTYPE_SID, pid);
-               proc_sid_connector(curr);
-       }

        if (task_pgrp(curr) != pid)
                change_pid(curr, PIDTYPE_PGID, pid);
diff --git a/kernel/fork.c b/kernel/fork.c
index 4c20fff8c13a..166b8c49257c 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -91,7 +91,7 @@ int nr_processes(void)
        int cpu;
        int total = 0;

-       for_each_online_cpu(cpu)
+       for_each_possible_cpu(cpu)
                total += per_cpu(process_counts, cpu);

        return total;
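Note: process_counts is a per-CPU counter, and a CPU's contribution does not vanish when that CPU goes offline, so summing only online CPUs undercounts. A toy model of the difference (an array stands in for the per-CPU variable, a mask for the online map):

    #include <stdio.h>

    #define NCPUS 4

    int main(void)
    {
            int counts[NCPUS] = { 3, 5, 2, 4 };     /* per-CPU process counts */
            int online[NCPUS] = { 1, 1, 0, 1 };     /* cpu 2 has gone offline */
            int total_online = 0, total_possible = 0;

            for (int cpu = 0; cpu < NCPUS; cpu++) {
                    total_possible += counts[cpu];
                    if (online[cpu])
                            total_online += counts[cpu];
            }
            /* prints online=12 possible=14 */
            printf("online=%d possible=%d\n", total_online, total_possible);
            return 0;
    }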
diff --git a/kernel/futex.c b/kernel/futex.c
index 4949d336d88d..fb65e822fc41 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -150,7 +150,8 @@ static struct futex_hash_bucket *hash_futex(union futex_key *key)
  */
 static inline int match_futex(union futex_key *key1, union futex_key *key2)
 {
-       return (key1->both.word == key2->both.word
+       return (key1 && key2
+               && key1->both.word == key2->both.word
                && key1->both.ptr == key2->both.ptr
                && key1->both.offset == key2->both.offset);
 }
@@ -1028,7 +1029,6 @@ static inline
 void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key,
                           struct futex_hash_bucket *hb)
 {
-       drop_futex_key_refs(&q->key);
        get_futex_key_refs(key);
        q->key = *key;

@@ -1226,6 +1226,7 @@ retry_private:
         */
        if (ret == 1) {
                WARN_ON(pi_state);
+               drop_count++;
                task_count++;
                ret = get_futex_value_locked(&curval2, uaddr2);
                if (!ret)
@@ -1304,6 +1305,7 @@ retry_private:
                if (ret == 1) {
                        /* We got the lock. */
                        requeue_pi_wake_futex(this, &key2, hb2);
+                       drop_count++;
                        continue;
                } else if (ret) {
                        /* -EDEADLK */
@@ -1791,6 +1793,7 @@ static int futex_wait(u32 __user *uaddr, int fshared,
                                             current->timer_slack_ns);
        }

+retry:
        /* Prepare to wait on uaddr. */
        ret = futex_wait_setup(uaddr, val, fshared, &q, &hb);
        if (ret)
@@ -1808,9 +1811,14 @@ static int futex_wait(u32 __user *uaddr, int fshared,
                goto out_put_key;

        /*
-        * We expect signal_pending(current), but another thread may
-        * have handled it for us already.
+        * We expect signal_pending(current), but we might be the
+        * victim of a spurious wakeup as well.
         */
+       if (!signal_pending(current)) {
+               put_futex_key(fshared, &q.key);
+               goto retry;
+       }
+
        ret = -ERESTARTSYS;
        if (!abs_time)
                goto out_put_key;
@@ -2118,9 +2126,11 @@ int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb,
                 */
                plist_del(&q->list, &q->list.plist);

+               /* Handle spurious wakeups gracefully */
+               ret = -EWOULDBLOCK;
                if (timeout && !timeout->task)
                        ret = -ETIMEDOUT;
-               else
+               else if (signal_pending(current))
                        ret = -ERESTARTNOINTR;
        }
        return ret;
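Note: the futex_wait() and requeue-PI hunks apply the classic rule that any blocking wait may wake spuriously, so the waiter must re-check its wake condition and, if it has not been met, go back to sleep. The same discipline in its familiar userspace form, as a sketch:

    #include <pthread.h>
    #include <stdbool.h>

    static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
    static pthread_cond_t cond = PTHREAD_COND_INITIALIZER;
    static bool ready;      /* the wake condition, set by the waker */

    /* As in the patched futex_wait(): a wakeup that arrives with the
     * condition still false is treated as spurious and we sleep again. */
    static void wait_for_ready(void)
    {
            pthread_mutex_lock(&lock);
            while (!ready)
                    pthread_cond_wait(&cond, &lock);
            pthread_mutex_unlock(&lock);
    }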
diff --git a/kernel/params.c b/kernel/params.c
index 9da58eabdcb2..d656c276508d 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -218,15 +218,11 @@ int param_set_charp(const char *val, struct kernel_param *kp)
                return -ENOSPC;
        }

-       if (kp->flags & KPARAM_KMALLOCED)
-               kfree(*(char **)kp->arg);
-
        /* This is a hack.  We can't need to strdup in early boot, and we
         * don't need to; this mangled commandline is preserved. */
        if (slab_is_available()) {
-               kp->flags |= KPARAM_KMALLOCED;
                *(char **)kp->arg = kstrdup(val, GFP_KERNEL);
-               if (!kp->arg)
+               if (!*(char **)kp->arg)
                        return -ENOMEM;
        } else
                *(const char **)kp->arg = val;
@@ -304,6 +300,7 @@ static int param_array(const char *name,
                       unsigned int min, unsigned int max,
                       void *elem, int elemsize,
                       int (*set)(const char *, struct kernel_param *kp),
+                      u16 flags,
                       unsigned int *num)
 {
        int ret;
@@ -313,6 +310,7 @@ static int param_array(const char *name,
        /* Get the name right for errors. */
        kp.name = name;
        kp.arg = elem;
+       kp.flags = flags;

        /* No equals sign? */
        if (!val) {
@@ -358,7 +356,8 @@ int param_array_set(const char *val, struct kernel_param *kp)
        unsigned int temp_num;

        return param_array(kp->name, val, 1, arr->max, arr->elem,
-                          arr->elemsize, arr->set, arr->num ?: &temp_num);
+                          arr->elemsize, arr->set, kp->flags,
+                          arr->num ?: &temp_num);
 }

 int param_array_get(char *buffer, struct kernel_param *kp)
@@ -605,11 +604,7 @@ void module_param_sysfs_remove(struct module *mod)

 void destroy_params(const struct kernel_param *params, unsigned num)
 {
-       unsigned int i;
-
-       for (i = 0; i < num; i++)
-               if (params[i].flags & KPARAM_KMALLOCED)
-                       kfree(*(char **)params[i].arg);
+       /* FIXME: This should free kmalloced charp parameters.  It doesn't. */
 }

 static void __init kernel_add_sysfs_param(const char *name,
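Note: the `arr->num ?: &temp_num` in param_array_set() uses the GNU C conditional with an omitted middle operand, which the kernel relies on throughout: `a ?: b` evaluates to `a` when `a` is non-zero/non-NULL and to `b` otherwise, evaluating `a` only once. A small illustration (GNU extension, so compile with GCC or Clang):

    #include <stdio.h>

    int main(void)
    {
            int slot = 7;
            int *maybe = NULL;              /* stands in for arr->num */
            int *num = maybe ?: &slot;      /* GNU ?: picks the fallback */

            printf("%d\n", *num);           /* prints 7 */
            return 0;
    }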
diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index 9ecaa45ab6b2..a69d4ed6a666 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -3976,8 +3976,9 @@ static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer)
        regs = task_pt_regs(current);

        if (regs) {
-               if (perf_event_overflow(event, 0, &data, regs))
-                       ret = HRTIMER_NORESTART;
+               if (!(event->attr.exclude_idle && current->pid == 0))
+                       if (perf_event_overflow(event, 0, &data, regs))
+                               ret = HRTIMER_NORESTART;
        }

        period = max_t(u64, 10000, event->hw.sample_period);
@@ -3986,6 +3987,42 @@ static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer)
        return ret;
 }

+static void perf_swevent_start_hrtimer(struct perf_event *event)
+{
+       struct hw_perf_event *hwc = &event->hw;
+
+       hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+       hwc->hrtimer.function = perf_swevent_hrtimer;
+       if (hwc->sample_period) {
+               u64 period;
+
+               if (hwc->remaining) {
+                       if (hwc->remaining < 0)
+                               period = 10000;
+                       else
+                               period = hwc->remaining;
+                       hwc->remaining = 0;
+               } else {
+                       period = max_t(u64, 10000, hwc->sample_period);
+               }
+               __hrtimer_start_range_ns(&hwc->hrtimer,
+                               ns_to_ktime(period), 0,
+                               HRTIMER_MODE_REL, 0);
+       }
+}
+
+static void perf_swevent_cancel_hrtimer(struct perf_event *event)
+{
+       struct hw_perf_event *hwc = &event->hw;
+
+       if (hwc->sample_period) {
+               ktime_t remaining = hrtimer_get_remaining(&hwc->hrtimer);
+               hwc->remaining = ktime_to_ns(remaining);
+
+               hrtimer_cancel(&hwc->hrtimer);
+       }
+}
+
 /*
  * Software event: cpu wall time clock
  */
@@ -4008,22 +4045,14 @@ static int cpu_clock_perf_event_enable(struct perf_event *event)
        int cpu = raw_smp_processor_id();

        atomic64_set(&hwc->prev_count, cpu_clock(cpu));
-       hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
-       hwc->hrtimer.function = perf_swevent_hrtimer;
-       if (hwc->sample_period) {
-               u64 period = max_t(u64, 10000, hwc->sample_period);
-               __hrtimer_start_range_ns(&hwc->hrtimer,
-                               ns_to_ktime(period), 0,
-                               HRTIMER_MODE_REL, 0);
-       }
+       perf_swevent_start_hrtimer(event);

        return 0;
 }

 static void cpu_clock_perf_event_disable(struct perf_event *event)
 {
-       if (event->hw.sample_period)
-               hrtimer_cancel(&event->hw.hrtimer);
+       perf_swevent_cancel_hrtimer(event);
        cpu_clock_perf_event_update(event);
 }

@@ -4060,22 +4089,15 @@ static int task_clock_perf_event_enable(struct perf_event *event)
        now = event->ctx->time;

        atomic64_set(&hwc->prev_count, now);
-       hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
-       hwc->hrtimer.function = perf_swevent_hrtimer;
-       if (hwc->sample_period) {
-               u64 period = max_t(u64, 10000, hwc->sample_period);
-               __hrtimer_start_range_ns(&hwc->hrtimer,
-                               ns_to_ktime(period), 0,
-                               HRTIMER_MODE_REL, 0);
-       }
+
+       perf_swevent_start_hrtimer(event);

        return 0;
 }

 static void task_clock_perf_event_disable(struct perf_event *event)
 {
-       if (event->hw.sample_period)
-               hrtimer_cancel(&event->hw.hrtimer);
+       perf_swevent_cancel_hrtimer(event);
        task_clock_perf_event_update(event, event->ctx->time);

 }
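Note: beyond deduplicating the two enable/disable paths, the new helpers change behaviour: cancel saves how long the timer still had to run, and the next start re-arms with that remainder instead of a full period, so a disable/enable cycle no longer loses the phase of the sampling clock. A compilable sketch of the idiom (the struct here is hypothetical, not the kernel's hw_perf_event):

    #include <stdint.h>

    struct sampler {
            int64_t period_ns;      /* configured sampling period */
            int64_t remaining_ns;   /* saved by cancel, consumed by start */
    };

    /* What perf_swevent_start_hrtimer() computes: prefer the remainder
     * saved at cancel time, fall back to the full period, clamp at 10us. */
    static int64_t next_arm_ns(struct sampler *s)
    {
            int64_t ns = s->remaining_ns ? s->remaining_ns : s->period_ns;

            s->remaining_ns = 0;
            return ns > 10000 ? ns : 10000;
    }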
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index 04b3a83d686f..04a9e90d248f 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -693,21 +693,22 @@ static int software_resume(void)
        /* The snapshot device should not be opened while we're running */
        if (!atomic_add_unless(&snapshot_device_available, -1, 0)) {
                error = -EBUSY;
+               swsusp_close(FMODE_READ);
                goto Unlock;
        }

        pm_prepare_console();
        error = pm_notifier_call_chain(PM_RESTORE_PREPARE);
        if (error)
-               goto Finish;
+               goto close_finish;

        error = usermodehelper_disable();
        if (error)
-               goto Finish;
+               goto close_finish;

        error = create_basic_memory_bitmaps();
        if (error)
-               goto Finish;
+               goto close_finish;

        pr_debug("PM: Preparing processes for restore.\n");
        error = prepare_processes();
@@ -719,6 +720,7 @@ static int software_resume(void)
        pr_debug("PM: Reading hibernation image.\n");

        error = swsusp_read(&flags);
+       swsusp_close(FMODE_READ);
        if (!error)
                hibernation_restore(flags & SF_PLATFORM_MODE);

@@ -737,6 +739,9 @@ static int software_resume(void)
        mutex_unlock(&pm_mutex);
        pr_debug("PM: Resume from disk failed.\n");
        return error;
+ close_finish:
+       swsusp_close(FMODE_READ);
+       goto Finish;
 }

 late_initcall(software_resume);
diff --git a/kernel/power/suspend_test.c b/kernel/power/suspend_test.c
index 17d8bb1acf9c..25596e450ac7 100644
--- a/kernel/power/suspend_test.c
+++ b/kernel/power/suspend_test.c
@@ -19,7 +19,7 @@
  * The time it takes is system-specific though, so when we test this
  * during system bootup we allow a LOT of time.
  */
-#define TEST_SUSPEND_SECONDS   5
+#define TEST_SUSPEND_SECONDS   10

 static unsigned long suspend_test_start_time;

@@ -49,7 +49,8 @@ void suspend_test_finish(const char *label)
         * has some performance issues.  The stack dump of a WARN_ON
         * is more likely to get the right attention than a printk...
         */
-       WARN(msec > (TEST_SUSPEND_SECONDS * 1000), "Component: %s\n", label);
+       WARN(msec > (TEST_SUSPEND_SECONDS * 1000),
+            "Component: %s, time: %u\n", label, msec);
 }

 /*
diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index b101cdc4df3f..890f6b11b1d3 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -314,7 +314,6 @@ static int save_image(struct swap_map_handle *handle,
 {
        unsigned int m;
        int ret;
-       int error = 0;
        int nr_pages;
        int err2;
        struct bio *bio;
@@ -329,26 +328,27 @@ static int save_image(struct swap_map_handle *handle,
        nr_pages = 0;
        bio = NULL;
        do_gettimeofday(&start);
-       do {
+       while (1) {
                ret = snapshot_read_next(snapshot, PAGE_SIZE);
-               if (ret > 0) {
-                       error = swap_write_page(handle, data_of(*snapshot),
-                                               &bio);
-                       if (error)
+               if (ret <= 0)
+                       break;
+               ret = swap_write_page(handle, data_of(*snapshot), &bio);
+               if (ret)
                        break;
                if (!(nr_pages % m))
                        printk("\b\b\b\b%3d%%", nr_pages / m);
                nr_pages++;
        }
-       } while (ret > 0);
        err2 = wait_on_bio_chain(&bio);
        do_gettimeofday(&stop);
-       if (!error)
-               error = err2;
-       if (!error)
+       if (!ret)
+               ret = err2;
+       if (!ret)
                printk("\b\b\b\bdone\n");
+       else
+               printk("\n");
        swsusp_show_speed(&start, &stop, nr_to_write, "Wrote");
-       return error;
+       return ret;
 }

 /**
@@ -536,7 +536,8 @@ static int load_image(struct swap_map_handle *handle,
                snapshot_write_finalize(snapshot);
                if (!snapshot_image_loaded(snapshot))
                        error = -ENODATA;
-       }
+       } else
+               printk("\n");
        swsusp_show_speed(&start, &stop, nr_to_read, "Read");
        return error;
 }
@@ -572,8 +573,6 @@ int swsusp_read(unsigned int *flags_p)
        error = load_image(&handle, &snapshot, header->pages - 1);
        release_swap_reader(&handle);

-       blkdev_put(resume_bdev, FMODE_READ);
-
        if (!error)
                pr_debug("PM: Image successfully loaded\n");
        else
@@ -596,7 +595,7 @@ int swsusp_check(void)
                error = bio_read_page(swsusp_resume_block,
                                        swsusp_header, NULL);
                if (error)
-                       return error;
+                       goto put;

                if (!memcmp(SWSUSP_SIG, swsusp_header->sig, 10)) {
                        memcpy(swsusp_header->sig, swsusp_header->orig_sig, 10);
@@ -604,8 +603,10 @@ int swsusp_check(void)
                        error = bio_write_page(swsusp_resume_block,
                                                swsusp_header, NULL);
                } else {
-                       return -EINVAL;
+                       error = -EINVAL;
                }
+
+put:
        if (error)
                blkdev_put(resume_bdev, FMODE_READ);
        else
diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index 705f02ac7433..0536125b0497 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -913,7 +913,20 @@ static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp)
                        spin_unlock(&rnp->lock); /* irqs remain disabled. */
                        break;
                }
-               rcu_preempt_offline_tasks(rsp, rnp, rdp);
+
+               /*
+                * If there was a task blocking the current grace period,
+                * and if all CPUs have checked in, we need to propagate
+                * the quiescent state up the rcu_node hierarchy.  But that
+                * is inconvenient at the moment due to deadlock issues if
+                * this should end the current grace period.  So set the
+                * offlined CPU's bit in ->qsmask in order to force the
+                * next force_quiescent_state() invocation to clean up this
+                * mess in a deadlock-free manner.
+                */
+               if (rcu_preempt_offline_tasks(rsp, rnp, rdp) && !rnp->qsmask)
+                       rnp->qsmask |= mask;
+
                mask = rnp->grpmask;
                spin_unlock(&rnp->lock); /* irqs remain disabled. */
                rnp = rnp->parent;
@@ -958,7 +971,7 @@ static void rcu_offline_cpu(int cpu)
  * Invoke any RCU callbacks that have made it to the end of their grace
  * period.  Thottle as specified by rdp->blimit.
  */
-static void rcu_do_batch(struct rcu_data *rdp)
+static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp)
 {
        unsigned long flags;
        struct rcu_head *next, *list, **tail;
@@ -1011,6 +1024,13 @@ static void rcu_do_batch(struct rcu_data *rdp)
        if (rdp->blimit == LONG_MAX && rdp->qlen <= qlowmark)
                rdp->blimit = blimit;

+       /* Reset ->qlen_last_fqs_check trigger if enough CBs have drained. */
+       if (rdp->qlen == 0 && rdp->qlen_last_fqs_check != 0) {
+               rdp->qlen_last_fqs_check = 0;
+               rdp->n_force_qs_snap = rsp->n_force_qs;
+       } else if (rdp->qlen < rdp->qlen_last_fqs_check - qhimark)
+               rdp->qlen_last_fqs_check = rdp->qlen;
+
        local_irq_restore(flags);

        /* Re-raise the RCU softirq if there are callbacks remaining. */
@@ -1224,7 +1244,7 @@ __rcu_process_callbacks(struct rcu_state *rsp, struct rcu_data *rdp)
        }

        /* If there are callbacks ready, invoke them. */
-       rcu_do_batch(rdp);
+       rcu_do_batch(rsp, rdp);
 }

 /*
@@ -1288,10 +1308,20 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu),
                        rcu_start_gp(rsp, nestflag);  /* releases rnp_root->lock. */
        }

-       /* Force the grace period if too many callbacks or too long waiting. */
-       if (unlikely(++rdp->qlen > qhimark)) {
+       /*
+        * Force the grace period if too many callbacks or too long waiting.
+        * Enforce hysteresis, and don't invoke force_quiescent_state()
+        * if some other CPU has recently done so.  Also, don't bother
+        * invoking force_quiescent_state() if the newly enqueued callback
+        * is the only one waiting for a grace period to complete.
+        */
+       if (unlikely(++rdp->qlen > rdp->qlen_last_fqs_check + qhimark)) {
                rdp->blimit = LONG_MAX;
-               force_quiescent_state(rsp, 0);
+               if (rsp->n_force_qs == rdp->n_force_qs_snap &&
+                   *rdp->nxttail[RCU_DONE_TAIL] != head)
+                       force_quiescent_state(rsp, 0);
+               rdp->n_force_qs_snap = rsp->n_force_qs;
+               rdp->qlen_last_fqs_check = rdp->qlen;
        } else if ((long)(ACCESS_ONCE(rsp->jiffies_force_qs) - jiffies) < 0)
                force_quiescent_state(rsp, 1);
        local_irq_restore(flags);
@@ -1523,6 +1553,8 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptable)
        rdp->beenonline = 1;     /* We have now been online. */
        rdp->preemptable = preemptable;
        rdp->passed_quiesc_completed = lastcomp - 1;
+       rdp->qlen_last_fqs_check = 0;
+       rdp->n_force_qs_snap = rsp->n_force_qs;
        rdp->blimit = blimit;
        spin_unlock(&rnp->lock);                /* irqs remain disabled. */

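Note: the __call_rcu() change replaces a fixed trigger (qlen > qhimark) with hysteresis: a grace period is forced only when the queue has grown by qhimark since the last check, and the watermark then slides forward, so a long queue does not force on every subsequent enqueue. A compilable toy model of just that trigger logic (the state struct is hypothetical):

    #include <stdbool.h>
    #include <stdio.h>

    #define QHIMARK 10000

    struct cb_state {
            long qlen;                      /* queued callbacks */
            long qlen_last_fqs_check;       /* watermark at last check */
    };

    static bool should_force(struct cb_state *s)
    {
            if (++s->qlen > s->qlen_last_fqs_check + QHIMARK) {
                    s->qlen_last_fqs_check = s->qlen;   /* hysteresis */
                    return true;
            }
            return false;
    }

    int main(void)
    {
            struct cb_state s = { .qlen = 0, .qlen_last_fqs_check = 0 };
            int forced = 0;

            for (int i = 0; i < 25000; i++)
                    forced += should_force(&s);
            /* 2 forces for 25000 enqueues; a fixed qlen > QHIMARK
             * trigger would have fired 15000 times */
            printf("forced %d times\n", forced);
            return 0;
    }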
diff --git a/kernel/rcutree.h b/kernel/rcutree.h
index b40ac5706040..1823c6e20609 100644
--- a/kernel/rcutree.h
+++ b/kernel/rcutree.h
@@ -167,6 +167,10 @@ struct rcu_data {
        struct rcu_head *nxtlist;
        struct rcu_head **nxttail[RCU_NEXT_SIZE];
        long            qlen;           /* # of queued callbacks */
+       long            qlen_last_fqs_check;
+                                       /* qlen at last check for QS forcing */
+       unsigned long   n_force_qs_snap;
+                                       /* did other CPU force QS recently? */
        long            blimit;         /* Upper limit on a processed batch */

 #ifdef CONFIG_NO_HZ
@@ -302,9 +306,9 @@ static void rcu_print_task_stall(struct rcu_node *rnp);
 #endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
 static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp);
 #ifdef CONFIG_HOTPLUG_CPU
-static void rcu_preempt_offline_tasks(struct rcu_state *rsp,
-                                     struct rcu_node *rnp,
-                                     struct rcu_data *rdp);
+static int rcu_preempt_offline_tasks(struct rcu_state *rsp,
+                                    struct rcu_node *rnp,
+                                    struct rcu_data *rdp);
 static void rcu_preempt_offline_cpu(int cpu);
 #endif /* #ifdef CONFIG_HOTPLUG_CPU */
 static void rcu_preempt_check_callbacks(int cpu);
diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h
index c0cb783aa16a..ef2a58c2b9d5 100644
--- a/kernel/rcutree_plugin.h
+++ b/kernel/rcutree_plugin.h
@@ -304,21 +304,25 @@ static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp)
  * parent is to remove the need for rcu_read_unlock_special() to
  * make more than two attempts to acquire the target rcu_node's lock.
  *
+ * Returns 1 if there was previously a task blocking the current grace
+ * period on the specified rcu_node structure.
+ *
  * The caller must hold rnp->lock with irqs disabled.
  */
-static void rcu_preempt_offline_tasks(struct rcu_state *rsp,
-                                     struct rcu_node *rnp,
-                                     struct rcu_data *rdp)
+static int rcu_preempt_offline_tasks(struct rcu_state *rsp,
+                                    struct rcu_node *rnp,
+                                    struct rcu_data *rdp)
 {
        int i;
        struct list_head *lp;
        struct list_head *lp_root;
+       int retval = rcu_preempted_readers(rnp);
        struct rcu_node *rnp_root = rcu_get_root(rsp);
        struct task_struct *tp;

        if (rnp == rnp_root) {
                WARN_ONCE(1, "Last CPU thought to be offlined?");
-               return;  /* Shouldn't happen: at least one CPU online. */
+               return 0;  /* Shouldn't happen: at least one CPU online. */
        }
        WARN_ON_ONCE(rnp != rdp->mynode &&
                     (!list_empty(&rnp->blocked_tasks[0]) ||
@@ -342,6 +346,8 @@ static void rcu_preempt_offline_tasks(struct rcu_state *rsp,
                        spin_unlock(&rnp_root->lock); /* irqs remain disabled */
                }
        }
+
+       return retval;
 }

 /*
@@ -393,6 +399,17 @@ void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
 EXPORT_SYMBOL_GPL(call_rcu);

 /*
+ * Wait for an rcu-preempt grace period.  We are supposed to expedite the
+ * grace period, but this is the crude slow compatability hack, so just
+ * invoke synchronize_rcu().
+ */
+void synchronize_rcu_expedited(void)
+{
+       synchronize_rcu();
+}
+EXPORT_SYMBOL_GPL(synchronize_rcu_expedited);
+
+/*
  * Check to see if there is any immediate preemptable-RCU-related work
  * to be done.
  */
@@ -521,12 +538,15 @@ static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp)

 /*
  * Because preemptable RCU does not exist, it never needs to migrate
- * tasks that were blocked within RCU read-side critical sections.
+ * tasks that were blocked within RCU read-side critical sections, and
+ * such non-existent tasks cannot possibly have been blocking the current
+ * grace period.
  */
-static void rcu_preempt_offline_tasks(struct rcu_state *rsp,
-                                     struct rcu_node *rnp,
-                                     struct rcu_data *rdp)
+static int rcu_preempt_offline_tasks(struct rcu_state *rsp,
+                                    struct rcu_node *rnp,
+                                    struct rcu_data *rdp)
 {
+       return 0;
 }

 /*
@@ -565,6 +585,16 @@ void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
 EXPORT_SYMBOL_GPL(call_rcu);

 /*
+ * Wait for an rcu-preempt grace period, but make it happen quickly.
+ * But because preemptable RCU does not exist, map to rcu-sched.
+ */
+void synchronize_rcu_expedited(void)
+{
+       synchronize_sched_expedited();
+}
+EXPORT_SYMBOL_GPL(synchronize_rcu_expedited);
+
+/*
  * Because preemptable RCU does not exist, it never has any work to do.
  */
 static int rcu_preempt_pending(int cpu)
diff --git a/kernel/sched.c b/kernel/sched.c
index e88689522e66..a455dca884a6 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -1564,11 +1564,7 @@ static unsigned long cpu_avg_load_per_task(int cpu)

 #ifdef CONFIG_FAIR_GROUP_SCHED

-struct update_shares_data {
-       unsigned long rq_weight[NR_CPUS];
-};
-
-static DEFINE_PER_CPU(struct update_shares_data, update_shares_data);
+static __read_mostly unsigned long *update_shares_data;

 static void __set_se_shares(struct sched_entity *se, unsigned long shares);

@@ -1578,12 +1574,12 @@ static void __set_se_shares(struct sched_entity *se, unsigned long shares);
 static void update_group_shares_cpu(struct task_group *tg, int cpu,
                                    unsigned long sd_shares,
                                    unsigned long sd_rq_weight,
-                                   struct update_shares_data *usd)
+                                   unsigned long *usd_rq_weight)
 {
        unsigned long shares, rq_weight;
        int boost = 0;

-       rq_weight = usd->rq_weight[cpu];
+       rq_weight = usd_rq_weight[cpu];
        if (!rq_weight) {
                boost = 1;
                rq_weight = NICE_0_LOAD;
@@ -1618,7 +1614,7 @@ static void update_group_shares_cpu(struct task_group *tg, int cpu,
 static int tg_shares_up(struct task_group *tg, void *data)
 {
        unsigned long weight, rq_weight = 0, shares = 0;
-       struct update_shares_data *usd;
+       unsigned long *usd_rq_weight;
        struct sched_domain *sd = data;
        unsigned long flags;
        int i;
@@ -1627,11 +1623,11 @@ static int tg_shares_up(struct task_group *tg, void *data)
                return 0;

        local_irq_save(flags);
-       usd = &__get_cpu_var(update_shares_data);
+       usd_rq_weight = per_cpu_ptr(update_shares_data, smp_processor_id());

        for_each_cpu(i, sched_domain_span(sd)) {
                weight = tg->cfs_rq[i]->load.weight;
-               usd->rq_weight[i] = weight;
+               usd_rq_weight[i] = weight;

                /*
                 * If there are currently no tasks on the cpu pretend there
@@ -1652,7 +1648,7 @@ static int tg_shares_up(struct task_group *tg, void *data)
                shares = tg->shares;

        for_each_cpu(i, sched_domain_span(sd))
-               update_group_shares_cpu(tg, i, shares, rq_weight, usd);
+               update_group_shares_cpu(tg, i, shares, rq_weight, usd_rq_weight);

        local_irq_restore(flags);

@@ -9407,6 +9403,10 @@ void __init sched_init(void)
 #endif /* CONFIG_USER_SCHED */
 #endif /* CONFIG_GROUP_SCHED */

+#if defined CONFIG_FAIR_GROUP_SCHED && defined CONFIG_SMP
+       update_shares_data = __alloc_percpu(nr_cpu_ids * sizeof(unsigned long),
+                                           __alignof__(unsigned long));
+#endif
        for_each_possible_cpu(i) {
                struct rq *rq;

diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 4e777b47eeda..c32c3e643daa 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -861,12 +861,21 @@ wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se);
 static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq)
 {
        struct sched_entity *se = __pick_next_entity(cfs_rq);
+       struct sched_entity *buddy;

-       if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, se) < 1)
-               return cfs_rq->next;
+       if (cfs_rq->next) {
+               buddy = cfs_rq->next;
+               cfs_rq->next = NULL;
+               if (wakeup_preempt_entity(buddy, se) < 1)
+                       return buddy;
+       }

-       if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, se) < 1)
-               return cfs_rq->last;
+       if (cfs_rq->last) {
+               buddy = cfs_rq->last;
+               cfs_rq->last = NULL;
+               if (wakeup_preempt_entity(buddy, se) < 1)
+                       return buddy;
+       }

        return se;
 }
@@ -1654,16 +1663,6 @@ static struct task_struct *pick_next_task_fair(struct rq *rq)

        do {
                se = pick_next_entity(cfs_rq);
-               /*
-                * If se was a buddy, clear it so that it will have to earn
-                * the favour again.
-                *
-                * If se was not a buddy, clear the buddies because neither
-                * was elegible to run, let them earn it again.
-                *
-                * IOW. unconditionally clear buddies.
-                */
-               __clear_buddies(cfs_rq, NULL);
                set_next_entity(cfs_rq, se);
                cfs_rq = group_cfs_rq(se);
        } while (cfs_rq);
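Note: the pick_next_entity() rewrite moves the buddy clearing out of pick_next_task_fair() and into the pick itself: the next/last hint is consumed the moment it is inspected, win or lose, so a stale hint cannot bias a later pick. The shape of that pattern, as a runnable stand-in (nothing here is real scheduler code):

    #include <stdio.h>

    struct runqueue {
            int next;       /* buddy hint; 0 means none */
    };

    static int pick(struct runqueue *rq, int fallback)
    {
            if (rq->next) {
                    int buddy = rq->next;

                    rq->next = 0;           /* consume the hint unconditionally */
                    if (buddy > fallback)   /* stand-in eligibility test */
                            return buddy;
            }
            return fallback;
    }

    int main(void)
    {
            struct runqueue rq = { .next = 5 };

            printf("%d\n", pick(&rq, 3));   /* 5: buddy wins and is cleared */
            printf("%d\n", pick(&rq, 3));   /* 3: hint gone, must be re-earned */
            return 0;
    }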
diff --git a/kernel/sys.c b/kernel/sys.c
index 255475d163e0..ce17760d9c51 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -1110,6 +1110,8 @@ SYSCALL_DEFINE0(setsid)
        err = session;
 out:
        write_unlock_irq(&tasklist_lock);
+       if (err > 0)
+               proc_sid_connector(group_leader);
        return err;
 }

@@ -1546,24 +1548,37 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
                if (arg4 | arg5)
                        return -EINVAL;
                switch (arg2) {
-               case 0:
+               case PR_MCE_KILL_CLEAR:
                        if (arg3 != 0)
                                return -EINVAL;
                        current->flags &= ~PF_MCE_PROCESS;
                        break;
-               case 1:
+               case PR_MCE_KILL_SET:
                        current->flags |= PF_MCE_PROCESS;
-                       if (arg3 != 0)
+                       if (arg3 == PR_MCE_KILL_EARLY)
                                current->flags |= PF_MCE_EARLY;
-                       else
+                       else if (arg3 == PR_MCE_KILL_LATE)
                                current->flags &= ~PF_MCE_EARLY;
+                       else if (arg3 == PR_MCE_KILL_DEFAULT)
+                               current->flags &=
+                                       ~(PF_MCE_EARLY|PF_MCE_PROCESS);
+                       else
+                               return -EINVAL;
                        break;
                default:
                        return -EINVAL;
                }
                error = 0;
                break;
-
+       case PR_MCE_KILL_GET:
+               if (arg2 | arg3 | arg4 | arg5)
+                       return -EINVAL;
+               if (current->flags & PF_MCE_PROCESS)
+                       error = (current->flags & PF_MCE_EARLY) ?
+                               PR_MCE_KILL_EARLY : PR_MCE_KILL_LATE;
+               else
+                       error = PR_MCE_KILL_DEFAULT;
+               break;
        default:
                error = -EINVAL;
                break;
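Note: besides replacing the magic 0/1 sub-options with named constants, this hunk adds PR_MCE_KILL_GET for reading the policy back. From userspace the interface looks as below; the constants live in linux/prctl.h from 2.6.32 on, and the fallback defines are only for older headers (PR_MCE_KILL_GET itself only works on kernels carrying this patch):

    #include <stdio.h>
    #include <sys/prctl.h>

    #ifndef PR_MCE_KILL
    #define PR_MCE_KILL        33
    #define PR_MCE_KILL_SET     1
    #define PR_MCE_KILL_EARLY   1
    #define PR_MCE_KILL_GET    34
    #endif

    int main(void)
    {
            /* opt this process in to early (synchronous) memory-failure kill */
            if (prctl(PR_MCE_KILL, PR_MCE_KILL_SET, PR_MCE_KILL_EARLY, 0, 0))
                    perror("PR_MCE_KILL");

            /* read the policy back */
            printf("policy = %d\n", prctl(PR_MCE_KILL_GET, 0, 0, 0, 0));
            return 0;
    }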
diff --git a/kernel/sysctl_check.c b/kernel/sysctl_check.c
index b38423ca711a..b6e7aaea4604 100644
--- a/kernel/sysctl_check.c
+++ b/kernel/sysctl_check.c
@@ -1521,7 +1521,7 @@ int sysctl_check_table(struct nsproxy *namespaces, struct ctl_table *table)
                if (!table->ctl_name && table->strategy)
                        set_fail(&fail, table, "Strategy without ctl_name");
 #endif
-#ifdef CONFIG_PROC_FS
+#ifdef CONFIG_PROC_SYSCTL
                if (table->procname && !table->proc_handler)
                        set_fail(&fail, table, "No proc_handler");
 #endif
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index b10c0d90a6ff..1ed514fe3a30 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -751,7 +751,7 @@ ftrace_profile_write(struct file *filp, const char __user *ubuf,
 out:
        mutex_unlock(&ftrace_profile_lock);

-       filp->f_pos += cnt;
+       *ppos += cnt;

        return cnt;
 }
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index e43c928356ee..63446f12e470 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -486,7 +486,7 @@ struct ring_buffer_iter {
 /* Up this if you want to test the TIME_EXTENTS and normalization */
 #define DEBUG_SHIFT 0

-static inline u64 rb_time_stamp(struct ring_buffer *buffer, int cpu)
+static inline u64 rb_time_stamp(struct ring_buffer *buffer)
 {
        /* shift to debug/test normalization and TIME_EXTENTS */
        return buffer->clock() << DEBUG_SHIFT;
@@ -497,7 +497,7 @@ u64 ring_buffer_time_stamp(struct ring_buffer *buffer, int cpu)
        u64 time;

        preempt_disable_notrace();
-       time = rb_time_stamp(buffer, cpu);
+       time = rb_time_stamp(buffer);
        preempt_enable_no_resched_notrace();

        return time;
@@ -602,7 +602,7 @@ static struct list_head *rb_list_head(struct list_head *list)
 }

 /*
- * rb_is_head_page - test if the give page is the head page
+ * rb_is_head_page - test if the given page is the head page
  *
  * Because the reader may move the head_page pointer, we can
  * not trust what the head page is (it may be pointing to
@@ -1871,7 +1871,7 @@ rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer,
                 * Nested commits always have zero deltas, so
                 * just reread the time stamp
                 */
-               *ts = rb_time_stamp(buffer, cpu_buffer->cpu);
+               *ts = rb_time_stamp(buffer);
                next_page->page->time_stamp = *ts;
        }

@@ -2114,7 +2114,7 @@ rb_reserve_next_event(struct ring_buffer *buffer,
        if (RB_WARN_ON(cpu_buffer, ++nr_loops > 1000))
                goto out_fail;

-       ts = rb_time_stamp(cpu_buffer->buffer, cpu_buffer->cpu);
+       ts = rb_time_stamp(cpu_buffer->buffer);

        /*
         * Only the first commit can update the timestamp.
@@ -2684,7 +2684,7 @@ unsigned long ring_buffer_entries(struct ring_buffer *buffer)
 EXPORT_SYMBOL_GPL(ring_buffer_entries);

 /**
- * ring_buffer_overrun_cpu - get the number of overruns in buffer
+ * ring_buffer_overruns - get the number of overruns in buffer
  * @buffer: The ring buffer
  *
  * Returns the total number of overruns in the ring buffer
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 026e715a0c7a..9d3067a62d43 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -2440,7 +2440,7 @@ tracing_trace_options_write(struct file *filp, const char __user *ubuf,
                return ret;
        }

-       filp->f_pos += cnt;
+       *ppos += cnt;

        return cnt;
 }
@@ -2582,7 +2582,7 @@ tracing_ctrl_write(struct file *filp, const char __user *ubuf,
        }
        mutex_unlock(&trace_types_lock);

-       filp->f_pos += cnt;
+       *ppos += cnt;

        return cnt;
 }
@@ -2764,7 +2764,7 @@ tracing_set_trace_write(struct file *filp, const char __user *ubuf,
        if (err)
                return err;

-       filp->f_pos += ret;
+       *ppos += ret;

        return ret;
 }
@@ -3299,7 +3299,7 @@ tracing_entries_write(struct file *filp, const char __user *ubuf,
                }
        }

-       filp->f_pos += cnt;
+       *ppos += cnt;

        /* If check pages failed, return ENOMEM */
        if (tracing_disabled)
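Note: these four hunks and the ftrace.c one above fix the same bug class: a ->write() handler must advance the offset it was handed through *ppos, because pwrite() and friends pass an offset that is not the file's f_pos. Updating filp->f_pos directly both ignores that case and bypasses the offset the VFS hands back to the caller. A skeleton of the rule in kernel form (a sketch, not a complete driver):

    static ssize_t demo_write(struct file *filp, const char __user *ubuf,
                              size_t cnt, loff_t *ppos)
    {
            /* ... consume cnt bytes from ubuf ... */

            *ppos += cnt;           /* right: honours pwrite()'s private offset */
            /* filp->f_pos += cnt;     wrong: bypasses *ppos */

            return cnt;
    }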
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index ed17565826b0..b6c12c6a1bcd 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -69,6 +69,9 @@ enum print_line_t trace_print_printk_msg_only(struct trace_iterator *iter)
  * @s: trace sequence descriptor
  * @fmt: printf format string
  *
+ * It returns 0 if the trace oversizes the buffer's free
+ * space, 1 otherwise.
+ *
  * The tracer may use either sequence operations or its own
  * copy to user routines. To simplify formating of a trace
  * trace_seq_printf is used to store strings into a special
@@ -95,7 +98,7 @@ trace_seq_printf(struct trace_seq *s, const char *fmt, ...)

        s->len += ret;

-       return len;
+       return 1;
 }
 EXPORT_SYMBOL_GPL(trace_seq_printf);

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index ccefe574dcf7..12328147132c 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -647,7 +647,7 @@ EXPORT_SYMBOL(schedule_delayed_work);
  */
 void flush_delayed_work(struct delayed_work *dwork)
 {
-       if (del_timer(&dwork->timer)) {
+       if (del_timer_sync(&dwork->timer)) {
                struct cpu_workqueue_struct *cwq;
                cwq = wq_per_cpu(keventd_wq, get_cpu());
                __queue_work(cwq, &dwork->work);
@@ -685,21 +685,38 @@ EXPORT_SYMBOL(schedule_delayed_work_on);
 int schedule_on_each_cpu(work_func_t func)
 {
        int cpu;
+       int orig = -1;
        struct work_struct *works;

        works = alloc_percpu(struct work_struct);
        if (!works)
                return -ENOMEM;

+       /*
+        * when running in keventd don't schedule a work item on itself.
+        * Can just call directly because the work queue is already bound.
+        * This also is faster.
+        * Make this a generic parameter for other workqueues?
+        */
+       if (current_is_keventd()) {
+               orig = raw_smp_processor_id();
+               INIT_WORK(per_cpu_ptr(works, orig), func);
+               func(per_cpu_ptr(works, orig));
+       }
+
        get_online_cpus();
        for_each_online_cpu(cpu) {
                struct work_struct *work = per_cpu_ptr(works, cpu);

+               if (cpu == orig)
+                       continue;
                INIT_WORK(work, func);
                schedule_work_on(cpu, work);
        }
-       for_each_online_cpu(cpu)
-               flush_work(per_cpu_ptr(works, cpu));
+       for_each_online_cpu(cpu) {
+               if (cpu != orig)
+                       flush_work(per_cpu_ptr(works, cpu));
+       }
        put_online_cpus();
        free_percpu(works);
        return 0;