Merge branch 'tracing/urgent' into tracing/ftrace

Conflicts: kernel/trace/trace.c
author: Ingo Molnar <mingo@elte.hu> 2008-11-11 03:40:18 -0500
committer: Ingo Molnar <mingo@elte.hu> 2008-11-11 03:40:18 -0500
commit: e0cb4ebcd9e5b4ddd8216c20f54445c91b1fa4b9 (patch)
tree: d1c3b22b7e9f02fb56927da530da09c6ee7ce0b9 /kernel
parent: a309720c876d7ad2e224bfd1982c92ae4364c82e (diff)
parent: 45b86a96f17cb2900f291129b0e67287400e45b2 (diff)
10 files changed, 235 insertions, 70 deletions
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 35eebd5510c2..358e77564e6f 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -2497,7 +2497,6 @@ static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry)
        list_del(&cgrp->sibling);
        spin_lock(&cgrp->dentry->d_lock);
        d = dget(cgrp->dentry);
-        cgrp->dentry = NULL;
        spin_unlock(&d->d_lock);
        cgroup_d_remove_dir(d);
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 86d49045daed..5a732c5ef08b 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -499,3 +499,6 @@ const unsigned long cpu_bit_bitmap[BITS_PER_LONG+1][BITS_TO_LONGS(NR_CPUS)] = {
 #endif
 };
 EXPORT_SYMBOL_GPL(cpu_bit_bitmap);
+const DECLARE_BITMAP(cpu_all_bits, NR_CPUS) = CPU_BITS_ALL;
+EXPORT_SYMBOL(cpu_all_bits);
diff --git a/kernel/sched.c b/kernel/sched.c
index e8819bc6f462..57c933ffbee1 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -397,7 +397,7 @@ struct cfs_rq {
         * 'curr' points to currently running entity on this cfs_rq.
         * It is set to NULL otherwise (i.e when none are currently running).
         */
-        struct sched_entity *curr, *next;
+        struct sched_entity *curr, *next, *last;
        unsigned long nr_spread_over;
@@ -1805,7 +1805,9 @@ task_hot(struct task_struct *p, u64 now, struct sched_domain *sd)
        /*
         * Buddy candidates are cache hot:
         */
-        if (sched_feat(CACHE_HOT_BUDDY) && (&p->se == cfs_rq_of(&p->se)->next))
+        if (sched_feat(CACHE_HOT_BUDDY) &&
+                        (&p->se == cfs_rq_of(&p->se)->next ||
+                         &p->se == cfs_rq_of(&p->se)->last))
                return 1;
        if (p->sched_class != &fair_sched_class)
@@ -6875,15 +6877,17 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
        struct sched_domain *tmp;
        /* Remove the sched domains which do not contribute to scheduling. */
-        for (tmp = sd; tmp; tmp = tmp->parent) {
+        for (tmp = sd; tmp; ) {
                struct sched_domain *parent = tmp->parent;
                if (!parent)
                        break;
                if (sd_parent_degenerate(tmp, parent)) {
                        tmp->parent = parent->parent;
                        if (parent->parent)
                                parent->parent->child = tmp;
-                }
+                } else
+                        tmp = tmp->parent;
        }
        if (sd && sd_degenerate(sd)) {
@@ -7672,6 +7676,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
 error:
        free_sched_groups(cpu_map, tmpmask);
        SCHED_CPUMASK_FREE((void *)allmasks);
+        kfree(rd);
        return -ENOMEM;
 #endif
 }
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index ce514afd78ff..51aa3e102acb 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -341,23 +341,20 @@ static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
                cfs_rq->rb_leftmost = next_node;
        }
-        if (cfs_rq->next == se)
-                cfs_rq->next = NULL;
        rb_erase(&se->run_node, &cfs_rq->tasks_timeline);
 }
-static inline struct rb_node *first_fair(struct cfs_rq *cfs_rq)
-{
-        return cfs_rq->rb_leftmost;
-}
 static struct sched_entity *__pick_next_entity(struct cfs_rq *cfs_rq)
 {
-        return rb_entry(first_fair(cfs_rq), struct sched_entity, run_node);
+        struct rb_node *left = cfs_rq->rb_leftmost;
+        if (!left)
+                return NULL;
+        return rb_entry(left, struct sched_entity, run_node);
 }
-static inline struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq)
+static struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq)
 {
        struct rb_node *last = rb_last(&cfs_rq->tasks_timeline);
@@ -741,6 +738,12 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep)
 #endif
        }
+        if (cfs_rq->last == se)
+                cfs_rq->last = NULL;
+        if (cfs_rq->next == se)
+                cfs_rq->next = NULL;
        if (se != cfs_rq->curr)
                __dequeue_entity(cfs_rq, se);
        account_entity_dequeue(cfs_rq, se);
@@ -794,24 +797,15 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
 static int
 wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se);
-static struct sched_entity *
-pick_next(struct cfs_rq *cfs_rq, struct sched_entity *se)
-{
-        if (!cfs_rq->next || wakeup_preempt_entity(cfs_rq->next, se) == 1)
-                return se;
-        return cfs_rq->next;
-}
 static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq)
 {
-        struct sched_entity *se = NULL;
+        struct sched_entity *se = __pick_next_entity(cfs_rq);
-        if (first_fair(cfs_rq)) {
+        if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, se) < 1)
-                se = __pick_next_entity(cfs_rq);
+                return cfs_rq->next;
-                se = pick_next(cfs_rq, se);
-                set_next_entity(cfs_rq, se);
+        if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, se) < 1)
-        }
+                return cfs_rq->last;
        return se;
 }
@@ -1325,26 +1319,53 @@ wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se)
        return 0;
 }
+static void set_last_buddy(struct sched_entity *se)
+{
+        for_each_sched_entity(se)
+                cfs_rq_of(se)->last = se;
+}
+static void set_next_buddy(struct sched_entity *se)
+{
+        for_each_sched_entity(se)
+                cfs_rq_of(se)->next = se;
+}
 /*
 * Preempt the current task with a newly woken task if needed:
 */
 static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int sync)
 {
        struct task_struct *curr = rq->curr;
-        struct cfs_rq *cfs_rq = task_cfs_rq(curr);
        struct sched_entity *se = &curr->se, *pse = &p->se;
        if (unlikely(rt_prio(p->prio))) {
+                struct cfs_rq *cfs_rq = task_cfs_rq(curr);
                update_rq_clock(rq);
                update_curr(cfs_rq);
                resched_task(curr);
                return;
        }
+        if (unlikely(p->sched_class != &fair_sched_class))
+                return;
        if (unlikely(se == pse))
                return;
-        cfs_rq_of(pse)->next = pse;
+        /*
+         * Only set the backward buddy when the current task is still on the
+         * rq. This can happen when a wakeup gets interleaved with schedule on
+         * the ->pre_schedule() or idle_balance() point, either of which can
+         * drop the rq lock.
+         *
+         * Also, during early boot the idle thread is in the fair class, for
+         * obvious reasons it's a bad idea to schedule back to the idle thread.
+         */
+        if (sched_feat(LAST_BUDDY) && likely(se->on_rq && curr != rq->idle))
+                set_last_buddy(se);
+        set_next_buddy(pse);
        /*
         * We can come here with TIF_NEED_RESCHED already set from new task
@@ -1396,6 +1417,7 @@ static struct task_struct *pick_next_task_fair(struct rq *rq)
        do {
                se = pick_next_entity(cfs_rq);
+                set_next_entity(cfs_rq, se);
                cfs_rq = group_cfs_rq(se);
        } while (cfs_rq);
diff --git a/kernel/sched_features.h b/kernel/sched_features.h
index fda016218296..da5d93b5d2c6 100644
--- a/kernel/sched_features.h
+++ b/kernel/sched_features.h
@@ -12,3 +12,4 @@ SCHED_FEAT(LB_BIAS, 1)
 SCHED_FEAT(LB_WAKEUP_UPDATE, 1)
 SCHED_FEAT(ASYM_EFF_LOAD, 1)
 SCHED_FEAT(WAKEUP_OVERLAP, 0)
+SCHED_FEAT(LAST_BUDDY, 1)
diff --git a/kernel/smp.c b/kernel/smp.c
index f362a8553777..75c8dde58c55 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -51,10 +51,6 @@ static void csd_flag_wait(struct call_single_data *data)
 {
        /* Wait for response */
        do {
-                /*
-                 * We need to see the flags store in the IPI handler
-                 */
-                smp_mb();
                if (!(data->flags & CSD_FLAG_WAIT))
                        break;
                cpu_relax();
@@ -76,6 +72,11 @@ static void generic_exec_single(int cpu, struct call_single_data *data)
        list_add_tail(&data->list, &dst->list);
        spin_unlock_irqrestore(&dst->lock, flags);
+        /*
+         * Make the list addition visible before sending the ipi.
+         */
+        smp_mb();
        if (ipi)
                arch_send_call_function_single_ipi(cpu);
@@ -157,7 +158,7 @@ void generic_smp_call_function_single_interrupt(void)
         * Need to see other stores to list head for checking whether
         * list is empty without holding q->lock
         */
-        smp_mb();
+        smp_read_barrier_depends();
        while (!list_empty(&q->list)) {
                unsigned int data_flags;
@@ -191,7 +192,7 @@ void generic_smp_call_function_single_interrupt(void)
                /*
                 * See comment on outer loop
                 */
-                smp_mb();
+                smp_read_barrier_depends();
        }
 }
@@ -370,6 +371,11 @@ int smp_call_function_mask(cpumask_t mask, void (*func)(void *), void *info,
        list_add_tail_rcu(&data->csd.list, &call_function_queue);
        spin_unlock_irqrestore(&call_function_lock, flags);
+        /*
+         * Make the list addition visible before sending the ipi.
+         */
+        smp_mb();
        /* Send a message to all CPUs in the map */
        arch_send_call_function_ipi(mask);
diff --git a/kernel/timer.c b/kernel/timer.c
index 56becf373c58..dbd50fabe4c7 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -112,27 +112,8 @@ timer_set_base(struct timer_list *timer, struct tvec_base *new_base)
                                      tbase_get_deferrable(timer->base));
 }
-/**
+static unsigned long round_jiffies_common(unsigned long j, int cpu,
- * __round_jiffies - function to round jiffies to a full second
+                bool force_up)
- * @j: the time in (absolute) jiffies that should be rounded
- * @cpu: the processor number on which the timeout will happen
- *
- * __round_jiffies() rounds an absolute time in the future (in jiffies)
- * up or down to (approximately) full seconds. This is useful for timers
- * for which the exact time they fire does not matter too much, as long as
- * they fire approximately every X seconds.
- *
- * By rounding these timers to whole seconds, all such timers will fire
- * at the same time, rather than at various times spread out. The goal
- * of this is to have the CPU wake up less, which saves power.
- *
- * The exact rounding is skewed for each processor to avoid all
- * processors firing at the exact same time, which could lead
- * to lock contention or spurious cache line bouncing.
- *
- * The return value is the rounded version of the @j parameter.
- */
-unsigned long __round_jiffies(unsigned long j, int cpu)
 {
        int rem;
        unsigned long original = j;
@@ -154,8 +135,9 @@ unsigned long __round_jiffies(unsigned long j, int cpu)
         * due to delays of the timer irq, long irq off times etc etc) then
         * we should round down to the whole second, not up. Use 1/4th second
         * as cutoff for this rounding as an extreme upper bound for this.
+         * But never round down if @force_up is set.
         */
-        if (rem < HZ/4) /* round down */
+        if (rem < HZ/4 && !force_up) /* round down */
                j = j - rem;
        else /* round up */
                j = j - rem + HZ;
@@ -167,6 +149,31 @@ unsigned long __round_jiffies(unsigned long j, int cpu)
                return original;
        return j;
 }
+/**
+ * __round_jiffies - function to round jiffies to a full second
+ * @j: the time in (absolute) jiffies that should be rounded
+ * @cpu: the processor number on which the timeout will happen
+ *
+ * __round_jiffies() rounds an absolute time in the future (in jiffies)
+ * up or down to (approximately) full seconds. This is useful for timers
+ * for which the exact time they fire does not matter too much, as long as
+ * they fire approximately every X seconds.
+ *
+ * By rounding these timers to whole seconds, all such timers will fire
+ * at the same time, rather than at various times spread out. The goal
+ * of this is to have the CPU wake up less, which saves power.
+ *
+ * The exact rounding is skewed for each processor to avoid all
+ * processors firing at the exact same time, which could lead
+ * to lock contention or spurious cache line bouncing.
+ *
+ * The return value is the rounded version of the @j parameter.
+ */
+unsigned long __round_jiffies(unsigned long j, int cpu)
+{
+        return round_jiffies_common(j, cpu, false);
+}
 EXPORT_SYMBOL_GPL(__round_jiffies);
 /**
@@ -191,13 +198,10 @@ EXPORT_SYMBOL_GPL(__round_jiffies);
 */
 unsigned long __round_jiffies_relative(unsigned long j, int cpu)
 {
-        /*
+        unsigned long j0 = jiffies;
-         * In theory the following code can skip a jiffy in case jiffies
-         * increments right between the addition and the later subtraction.
+        /* Use j0 because jiffies might change while we run */
-         * However since the entire point of this function is to use approximate
+        return round_jiffies_common(j + j0, cpu, false) - j0;
-         * timeouts, it's entirely ok to not handle that.
-         */
-        return  __round_jiffies(j + jiffies, cpu) - jiffies;
 }
 EXPORT_SYMBOL_GPL(__round_jiffies_relative);
@@ -218,7 +222,7 @@ EXPORT_SYMBOL_GPL(__round_jiffies_relative);
 */
 unsigned long round_jiffies(unsigned long j)
 {
-        return __round_jiffies(j, raw_smp_processor_id());
+        return round_jiffies_common(j, raw_smp_processor_id(), false);
 }
 EXPORT_SYMBOL_GPL(round_jiffies);
@@ -243,6 +247,71 @@ unsigned long round_jiffies_relative(unsigned long j)
 }
 EXPORT_SYMBOL_GPL(round_jiffies_relative);
+/**
+ * __round_jiffies_up - function to round jiffies up to a full second
+ * @j: the time in (absolute) jiffies that should be rounded
+ * @cpu: the processor number on which the timeout will happen
+ *
+ * This is the same as __round_jiffies() except that it will never
+ * round down.  This is useful for timeouts for which the exact time
+ * of firing does not matter too much, as long as they don't fire too
+ * early.
+ */
+unsigned long __round_jiffies_up(unsigned long j, int cpu)
+{
+        return round_jiffies_common(j, cpu, true);
+}
+EXPORT_SYMBOL_GPL(__round_jiffies_up);
+/**
+ * __round_jiffies_up_relative - function to round jiffies up to a full second
+ * @j: the time in (relative) jiffies that should be rounded
+ * @cpu: the processor number on which the timeout will happen
+ *
+ * This is the same as __round_jiffies_relative() except that it will never
+ * round down.  This is useful for timeouts for which the exact time
+ * of firing does not matter too much, as long as they don't fire too
+ * early.
+ */
+unsigned long __round_jiffies_up_relative(unsigned long j, int cpu)
+{
+        unsigned long j0 = jiffies;
+        /* Use j0 because jiffies might change while we run */
+        return round_jiffies_common(j + j0, cpu, true) - j0;
+}
+EXPORT_SYMBOL_GPL(__round_jiffies_up_relative);
+/**
+ * round_jiffies_up - function to round jiffies up to a full second
+ * @j: the time in (absolute) jiffies that should be rounded
+ *
+ * This is the same as round_jiffies() except that it will never
+ * round down.  This is useful for timeouts for which the exact time
+ * of firing does not matter too much, as long as they don't fire too
+ * early.
+ */
+unsigned long round_jiffies_up(unsigned long j)
+{
+        return round_jiffies_common(j, raw_smp_processor_id(), true);
+}
+EXPORT_SYMBOL_GPL(round_jiffies_up);
+/**
+ * round_jiffies_up_relative - function to round jiffies up to a full second
+ * @j: the time in (relative) jiffies that should be rounded
+ *
+ * This is the same as round_jiffies_relative() except that it will never
+ * round down.  This is useful for timeouts for which the exact time
+ * of firing does not matter too much, as long as they don't fire too
+ * early.
+ */
+unsigned long round_jiffies_up_relative(unsigned long j)
+{
+        return __round_jiffies_up_relative(j, raw_smp_processor_id());
+}
+EXPORT_SYMBOL_GPL(round_jiffies_up_relative);
 static inline void set_running_timer(struct tvec_base *base,
                                        struct timer_list *timer)
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 6781e9aab2c0..ee9b93d318b9 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -1065,7 +1065,7 @@ rb_reserve_next_event(struct ring_buffer_per_cpu *cpu_buffer,
                /* Did the write stamp get updated already? */
                if (unlikely(ts < cpu_buffer->write_stamp))
-                        goto again;
+                        delta = 0;
                if (test_time_stamp(delta)) {
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index f147f198b9a6..0c22fe2d43a7 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -2835,7 +2835,7 @@ tracing_entries_write(struct file *filp, const char __user *ubuf,
 {
        unsigned long val;
        char buf[64];
-        int ret;
+        int ret, cpu;
        if (cnt >= sizeof(buf))
                return -EINVAL;
@@ -2857,6 +2857,14 @@ tracing_entries_write(struct file *filp, const char __user *ubuf,
        tracing_stop();
+        /* disable all cpu buffers */
+        for_each_tracing_cpu(cpu) {
+                if (global_trace.data[cpu])
+                        atomic_inc(&global_trace.data[cpu]->disabled);
+                if (max_tr.data[cpu])
+                        atomic_inc(&max_tr.data[cpu]->disabled);
+        }
        if (val != global_trace.entries) {
                ret = ring_buffer_resize(global_trace.buffer, val);
                if (ret < 0) {
@@ -2888,6 +2896,13 @@ tracing_entries_write(struct file *filp, const char __user *ubuf,
        if (tracing_disabled)
                cnt = -ENOMEM;
 out:
+        for_each_tracing_cpu(cpu) {
+                if (global_trace.data[cpu])
+                        atomic_dec(&global_trace.data[cpu]->disabled);
+                if (max_tr.data[cpu])
+                        atomic_dec(&max_tr.data[cpu]->disabled);
+        }
        tracing_start();
        max_tr.entries = global_trace.entries;
        mutex_unlock(&trace_types_lock);
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index f928f2a87b9b..d4dc69ddebd7 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -970,6 +970,51 @@ undo:
        return ret;
 }
+#ifdef CONFIG_SMP
+struct work_for_cpu {
+        struct work_struct work;
+        long (*fn)(void *);
+        void *arg;
+        long ret;
+};
+static void do_work_for_cpu(struct work_struct *w)
+{
+        struct work_for_cpu *wfc = container_of(w, struct work_for_cpu, work);
+        wfc->ret = wfc->fn(wfc->arg);
+}
+/**
+ * work_on_cpu - run a function in user context on a particular cpu
+ * @cpu: the cpu to run on
+ * @fn: the function to run
+ * @arg: the function arg
+ *
+ * This will return -EINVAL if the cpu is not online, or the return value
+ * of @fn otherwise.
+ */
+long work_on_cpu(unsigned int cpu, long (*fn)(void *), void *arg)
+{
+        struct work_for_cpu wfc;
+        INIT_WORK(&wfc.work, do_work_for_cpu);
+        wfc.fn = fn;
+        wfc.arg = arg;
+        get_online_cpus();
+        if (unlikely(!cpu_online(cpu)))
+                wfc.ret = -EINVAL;
+        else {
+                schedule_work_on(cpu, &wfc.work);
+                flush_work(&wfc.work);
+        }
+        put_online_cpus();
+        return wfc.ret;
+}
+EXPORT_SYMBOL_GPL(work_on_cpu);
+#endif /* CONFIG_SMP */
 void __init init_workqueues(void)
 {
        cpu_populated_map = cpu_online_map;
author	Ingo Molnar <mingo@elte.hu>	2008-11-11 03:40:18 -0500
committer	Ingo Molnar <mingo@elte.hu>	2008-11-11 03:40:18 -0500
commit	e0cb4ebcd9e5b4ddd8216c20f54445c91b1fa4b9 (patch)
tree	d1c3b22b7e9f02fb56927da530da09c6ee7ce0b9 /kernel
parent	a309720c876d7ad2e224bfd1982c92ae4364c82e (diff)
parent	45b86a96f17cb2900f291129b0e67287400e45b2 (diff)