author	Linus Torvalds <torvalds@linux-foundation.org>	2011-07-22 19:45:02 -0400
committer	Linus Torvalds <torvalds@linux-foundation.org>	2011-07-22 19:45:02 -0400
commit	bdc7ccfc0631797636837b10df7f87bc1e2e4ae3 (patch)
tree	70f09f8ffee07486d41ca254b8abb05692713d1e /kernel
parent	4d4abdcb1dee03a4f9d6d2021622ed07e14dfd17 (diff)
parent	0f3171438fc917b9f6b8b60dbb7a3fff9a0f68fd (diff)
Merge branch 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip
* 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip: (24 commits)
  sched: Cleanup duplicate local variable in [enqueue|dequeue]_task_fair
  sched: Replace use of entity_key()
  sched: Separate group-scheduling code more clearly
  sched: Reorder root_domain to remove 64 bit alignment padding
  sched: Do not attempt to destroy uninitialized rt_bandwidth
  sched: Remove unused function cpu_cfs_rq()
  sched: Fix (harmless) typo 'CONFG_FAIR_GROUP_SCHED'
  sched, cgroup: Optimize load_balance_fair()
  sched: Don't update shares twice on on_rq parent
  sched: update correct entity's runtime in check_preempt_wakeup()
  xtensa: Use generic config PREEMPT definition
  h8300: Use generic config PREEMPT definition
  m32r: Use generic PREEMPT config
  sched: Skip autogroup when looking for all rt sched groups
  sched: Simplify mutex_spin_on_owner()
  sched: Remove rcu_read_lock() from wake_affine()
  sched: Generalize sleep inside spinlock detection
  sched: Make sleeping inside spinlock detection working in !CONFIG_PREEMPT
  sched: Isolate preempt counting in its own config option
  sched: Remove pointless in_atomic() definition check
  ...
Diffstat (limited to 'kernel')
-rw-r--r--	kernel/Kconfig.preempt	3
-rw-r--r--	kernel/sched.c	117
-rw-r--r--	kernel/sched_autogroup.h	1
-rw-r--r--	kernel/sched_fair.c	72
-rw-r--r--	kernel/sched_rt.c	26
5 files changed, 101 insertions(+), 118 deletions(-)
diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt
index bf987b95b356..24e7cb0ba26a 100644
--- a/kernel/Kconfig.preempt
+++ b/kernel/Kconfig.preempt
@@ -35,6 +35,7 @@ config PREEMPT_VOLUNTARY
 
 config PREEMPT
 	bool "Preemptible Kernel (Low-Latency Desktop)"
+	select PREEMPT_COUNT
 	help
 	  This option reduces the latency of the kernel by making
 	  all kernel code (that is not executing in a critical section)
@@ -52,3 +53,5 @@ config PREEMPT
 
 endchoice
 
+config PREEMPT_COUNT
+	bool
\ No newline at end of file
diff --git a/kernel/sched.c b/kernel/sched.c
index 84b9e076812e..9aaf567c5da5 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -124,7 +124,7 @@
 
 static inline int rt_policy(int policy)
 {
-	if (unlikely(policy == SCHED_FIFO || policy == SCHED_RR))
+	if (policy == SCHED_FIFO || policy == SCHED_RR)
 		return 1;
 	return 0;
 }
@@ -422,6 +422,7 @@ struct rt_rq {
  */
 struct root_domain {
 	atomic_t refcount;
+	atomic_t rto_count;
 	struct rcu_head rcu;
 	cpumask_var_t span;
 	cpumask_var_t online;
@@ -431,7 +432,6 @@ struct root_domain {
 	 * one runnable RT task.
 	 */
 	cpumask_var_t rto_mask;
-	atomic_t rto_count;
 	struct cpupri cpupri;
 };
 
@@ -1568,38 +1568,6 @@ static unsigned long cpu_avg_load_per_task(int cpu)
 	return rq->avg_load_per_task;
 }
 
-#ifdef CONFIG_FAIR_GROUP_SCHED
-
-/*
- * Compute the cpu's hierarchical load factor for each task group.
- * This needs to be done in a top-down fashion because the load of a child
- * group is a fraction of its parents load.
- */
-static int tg_load_down(struct task_group *tg, void *data)
-{
-	unsigned long load;
-	long cpu = (long)data;
-
-	if (!tg->parent) {
-		load = cpu_rq(cpu)->load.weight;
-	} else {
-		load = tg->parent->cfs_rq[cpu]->h_load;
-		load *= tg->se[cpu]->load.weight;
-		load /= tg->parent->cfs_rq[cpu]->load.weight + 1;
-	}
-
-	tg->cfs_rq[cpu]->h_load = load;
-
-	return 0;
-}
-
-static void update_h_load(long cpu)
-{
-	walk_tg_tree(tg_load_down, tg_nop, (void *)cpu);
-}
-
-#endif
-
 #ifdef CONFIG_PREEMPT
 
 static void double_rq_lock(struct rq *rq1, struct rq *rq2);
@@ -2497,7 +2465,7 @@ ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)
 	if (p->sched_class->task_woken)
 		p->sched_class->task_woken(rq, p);
 
-	if (unlikely(rq->idle_stamp)) {
+	if (rq->idle_stamp) {
 		u64 delta = rq->clock - rq->idle_stamp;
 		u64 max = 2*sysctl_sched_migration_cost;
 
@@ -2886,7 +2854,7 @@ void sched_fork(struct task_struct *p)
 #if defined(CONFIG_SMP)
 	p->on_cpu = 0;
 #endif
-#ifdef CONFIG_PREEMPT
+#ifdef CONFIG_PREEMPT_COUNT
 	/* Want to start with kernel preemption disabled. */
 	task_thread_info(p)->preempt_count = 1;
 #endif
@@ -4338,11 +4306,8 @@ EXPORT_SYMBOL(schedule);
 
 static inline bool owner_running(struct mutex *lock, struct task_struct *owner)
 {
-	bool ret = false;
-
-	rcu_read_lock();
 	if (lock->owner != owner)
-		goto fail;
+		return false;
 
 	/*
 	 * Ensure we emit the owner->on_cpu, dereference _after_ checking
@@ -4352,11 +4317,7 @@ static inline bool owner_running(struct mutex *lock, struct task_struct *owner)
 	 */
 	barrier();
 
-	ret = owner->on_cpu;
-fail:
-	rcu_read_unlock();
-
-	return ret;
+	return owner->on_cpu;
 }
 
 /*
@@ -4368,21 +4329,21 @@ int mutex_spin_on_owner(struct mutex *lock, struct task_struct *owner)
 	if (!sched_feat(OWNER_SPIN))
 		return 0;
 
+	rcu_read_lock();
 	while (owner_running(lock, owner)) {
 		if (need_resched())
-			return 0;
+			break;
 
 		arch_mutex_cpu_relax();
 	}
+	rcu_read_unlock();
 
 	/*
-	 * If the owner changed to another task there is likely
-	 * heavy contention, stop spinning.
+	 * We break out the loop above on need_resched() and when the
+	 * owner changed, which is a sign for heavy contention. Return
+	 * success only when lock->owner is NULL.
 	 */
-	if (lock->owner)
-		return 0;
-
-	return 1;
+	return lock->owner == NULL;
 }
 #endif
 
@@ -7898,17 +7859,10 @@ int in_sched_functions(unsigned long addr)
 		&& addr < (unsigned long)__sched_text_end);
 }
 
-static void init_cfs_rq(struct cfs_rq *cfs_rq, struct rq *rq)
+static void init_cfs_rq(struct cfs_rq *cfs_rq)
 {
 	cfs_rq->tasks_timeline = RB_ROOT;
 	INIT_LIST_HEAD(&cfs_rq->tasks);
-#ifdef CONFIG_FAIR_GROUP_SCHED
-	cfs_rq->rq = rq;
-	/* allow initial update_cfs_load() to truncate */
-#ifdef CONFIG_SMP
-	cfs_rq->load_stamp = 1;
-#endif
-#endif
 	cfs_rq->min_vruntime = (u64)(-(1LL << 20));
 #ifndef CONFIG_64BIT
 	cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
@@ -7928,13 +7882,9 @@ static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq)
 	/* delimiter for bitsearch: */
 	__set_bit(MAX_RT_PRIO, array->bitmap);
 
-#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
+#if defined CONFIG_SMP
 	rt_rq->highest_prio.curr = MAX_RT_PRIO;
-#ifdef CONFIG_SMP
 	rt_rq->highest_prio.next = MAX_RT_PRIO;
-#endif
-#endif
-#ifdef CONFIG_SMP
 	rt_rq->rt_nr_migratory = 0;
 	rt_rq->overloaded = 0;
 	plist_head_init(&rt_rq->pushable_tasks);
@@ -7944,11 +7894,6 @@ static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq)
 	rt_rq->rt_throttled = 0;
 	rt_rq->rt_runtime = 0;
 	raw_spin_lock_init(&rt_rq->rt_runtime_lock);
-
-#ifdef CONFIG_RT_GROUP_SCHED
-	rt_rq->rt_nr_boosted = 0;
-	rt_rq->rq = rq;
-#endif
 }
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
@@ -7957,11 +7902,17 @@ static void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
 				struct sched_entity *parent)
 {
 	struct rq *rq = cpu_rq(cpu);
-	tg->cfs_rq[cpu] = cfs_rq;
-	init_cfs_rq(cfs_rq, rq);
+
 	cfs_rq->tg = tg;
+	cfs_rq->rq = rq;
+#ifdef CONFIG_SMP
+	/* allow initial update_cfs_load() to truncate */
+	cfs_rq->load_stamp = 1;
+#endif
 
+	tg->cfs_rq[cpu] = cfs_rq;
 	tg->se[cpu] = se;
+
 	/* se could be NULL for root_task_group */
 	if (!se)
 		return;
@@ -7984,12 +7935,14 @@ static void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq,
 {
 	struct rq *rq = cpu_rq(cpu);
 
-	tg->rt_rq[cpu] = rt_rq;
-	init_rt_rq(rt_rq, rq);
+	rt_rq->highest_prio.curr = MAX_RT_PRIO;
+	rt_rq->rt_nr_boosted = 0;
+	rt_rq->rq = rq;
 	rt_rq->tg = tg;
-	rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime;
 
+	tg->rt_rq[cpu] = rt_rq;
 	tg->rt_se[cpu] = rt_se;
+
 	if (!rt_se)
 		return;
 
@@ -8071,7 +8024,7 @@ void __init sched_init(void)
 		rq->nr_running = 0;
 		rq->calc_load_active = 0;
 		rq->calc_load_update = jiffies + LOAD_FREQ;
-		init_cfs_rq(&rq->cfs, rq);
+		init_cfs_rq(&rq->cfs);
 		init_rt_rq(&rq->rt, rq);
 #ifdef CONFIG_FAIR_GROUP_SCHED
 		root_task_group.shares = root_task_group_load;
@@ -8185,7 +8138,7 @@ void __init sched_init(void)
 	scheduler_running = 1;
 }
 
-#ifdef CONFIG_DEBUG_SPINLOCK_SLEEP
+#ifdef CONFIG_DEBUG_ATOMIC_SLEEP
 static inline int preempt_count_equals(int preempt_offset)
 {
 	int nested = (preempt_count() & ~PREEMPT_ACTIVE) + rcu_preempt_depth();
@@ -8195,7 +8148,6 @@ static inline int preempt_count_equals(int preempt_offset)
 
 void __might_sleep(const char *file, int line, int preempt_offset)
 {
-#ifdef in_atomic
 	static unsigned long prev_jiffy;	/* ratelimiting */
 
 	if ((preempt_count_equals(preempt_offset) && !irqs_disabled()) ||
@@ -8217,7 +8169,6 @@ void __might_sleep(const char *file, int line, int preempt_offset)
 	if (irqs_disabled())
 		print_irqtrace_events(current);
 	dump_stack();
-#endif
 }
 EXPORT_SYMBOL(__might_sleep);
 #endif
@@ -8376,6 +8327,7 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
 		if (!se)
 			goto err_free_rq;
 
+		init_cfs_rq(cfs_rq);
 		init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]);
 	}
 
@@ -8403,7 +8355,7 @@ static inline void unregister_fair_sched_group(struct task_group *tg, int cpu)
 		list_del_leaf_cfs_rq(tg->cfs_rq[cpu]);
 	raw_spin_unlock_irqrestore(&rq->lock, flags);
 }
-#else /* !CONFG_FAIR_GROUP_SCHED */
+#else /* !CONFIG_FAIR_GROUP_SCHED */
 static inline void free_fair_sched_group(struct task_group *tg)
 {
 }
@@ -8424,7 +8376,8 @@ static void free_rt_sched_group(struct task_group *tg)
 {
 	int i;
 
-	destroy_rt_bandwidth(&tg->rt_bandwidth);
+	if (tg->rt_se)
+		destroy_rt_bandwidth(&tg->rt_bandwidth);
 
 	for_each_possible_cpu(i) {
 		if (tg->rt_rq)
@@ -8465,6 +8418,8 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
 		if (!rt_se)
 			goto err_free_rq;
 
+		init_rt_rq(rt_rq, cpu_rq(i));
+		rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime;
 		init_tg_rt_entry(tg, rt_rq, rt_se, i, parent->rt_se[i]);
 	}
 
diff --git a/kernel/sched_autogroup.h b/kernel/sched_autogroup.h
index 05577055cfca..c2f0e7248dca 100644
--- a/kernel/sched_autogroup.h
+++ b/kernel/sched_autogroup.h
@@ -13,6 +13,7 @@ struct autogroup {
 	int			nice;
 };
 
+static inline bool task_group_is_autogroup(struct task_group *tg);
 static inline struct task_group *
 autogroup_task_group(struct task_struct *p, struct task_group *tg);
 
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index c768588e180b..bc8ee9993814 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -135,14 +135,6 @@ static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
 	return grp->my_q;
 }
 
-/* Given a group's cfs_rq on one cpu, return its corresponding cfs_rq on
- * another cpu ('this_cpu')
- */
-static inline struct cfs_rq *cpu_cfs_rq(struct cfs_rq *cfs_rq, int this_cpu)
-{
-	return cfs_rq->tg->cfs_rq[this_cpu];
-}
-
 static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
 {
 	if (!cfs_rq->on_list) {
@@ -271,11 +263,6 @@ static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
 	return NULL;
 }
 
-static inline struct cfs_rq *cpu_cfs_rq(struct cfs_rq *cfs_rq, int this_cpu)
-{
-	return &cpu_rq(this_cpu)->cfs;
-}
-
 static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
 {
 }
@@ -334,11 +321,6 @@ static inline int entity_before(struct sched_entity *a,
 	return (s64)(a->vruntime - b->vruntime) < 0;
 }
 
-static inline s64 entity_key(struct cfs_rq *cfs_rq, struct sched_entity *se)
-{
-	return se->vruntime - cfs_rq->min_vruntime;
-}
-
 static void update_min_vruntime(struct cfs_rq *cfs_rq)
 {
 	u64 vruntime = cfs_rq->min_vruntime;
@@ -372,7 +354,6 @@ static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
 	struct rb_node **link = &cfs_rq->tasks_timeline.rb_node;
 	struct rb_node *parent = NULL;
 	struct sched_entity *entry;
-	s64 key = entity_key(cfs_rq, se);
 	int leftmost = 1;
 
 	/*
@@ -385,7 +366,7 @@ static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
 		 * We dont care about collisions. Nodes with
 		 * the same key stay together.
 		 */
-		if (key < entity_key(cfs_rq, entry)) {
+		if (entity_before(se, entry)) {
 			link = &parent->rb_left;
 		} else {
 			link = &parent->rb_right;
@@ -1336,7 +1317,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 	}
 
 	for_each_sched_entity(se) {
-		struct cfs_rq *cfs_rq = cfs_rq_of(se);
+		cfs_rq = cfs_rq_of(se);
 
 		update_cfs_load(cfs_rq, 0);
 		update_cfs_shares(cfs_rq);
@@ -1370,13 +1351,16 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 			 */
 			if (task_sleep && parent_entity(se))
 				set_next_buddy(parent_entity(se));
+
+			/* avoid re-evaluating load for this entity */
+			se = parent_entity(se);
 			break;
 		}
 		flags |= DEQUEUE_SLEEP;
 	}
 
 	for_each_sched_entity(se) {
-		struct cfs_rq *cfs_rq = cfs_rq_of(se);
+		cfs_rq = cfs_rq_of(se);
 
 		update_cfs_load(cfs_rq, 0);
 		update_cfs_shares(cfs_rq);
@@ -1481,7 +1465,6 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
 	 * effect of the currently running task from the load
 	 * of the current CPU:
 	 */
-	rcu_read_lock();
 	if (sync) {
 		tg = task_group(current);
 		weight = current->se.load.weight;
@@ -1517,7 +1500,6 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
 		balanced = this_eff_load <= prev_eff_load;
 	} else
 		balanced = true;
-	rcu_read_unlock();
 
 	/*
 	 * If the currently running task will sleep within
@@ -1921,8 +1903,8 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
 	if (!sched_feat(WAKEUP_PREEMPT))
 		return;
 
-	update_curr(cfs_rq);
 	find_matching_se(&se, &pse);
+	update_curr(cfs_rq_of(se));
 	BUG_ON(!pse);
 	if (wakeup_preempt_entity(se, pse) == 1) {
 		/*
@@ -2231,11 +2213,43 @@ static void update_shares(int cpu)
 	struct rq *rq = cpu_rq(cpu);
 
 	rcu_read_lock();
+	/*
+	 * Iterates the task_group tree in a bottom up fashion, see
+	 * list_add_leaf_cfs_rq() for details.
+	 */
 	for_each_leaf_cfs_rq(rq, cfs_rq)
 		update_shares_cpu(cfs_rq->tg, cpu);
 	rcu_read_unlock();
 }
 
+/*
+ * Compute the cpu's hierarchical load factor for each task group.
+ * This needs to be done in a top-down fashion because the load of a child
+ * group is a fraction of its parents load.
+ */
+static int tg_load_down(struct task_group *tg, void *data)
+{
+	unsigned long load;
+	long cpu = (long)data;
+
+	if (!tg->parent) {
+		load = cpu_rq(cpu)->load.weight;
+	} else {
+		load = tg->parent->cfs_rq[cpu]->h_load;
+		load *= tg->se[cpu]->load.weight;
+		load /= tg->parent->cfs_rq[cpu]->load.weight + 1;
+	}
+
+	tg->cfs_rq[cpu]->h_load = load;
+
+	return 0;
+}
+
+static void update_h_load(long cpu)
+{
+	walk_tg_tree(tg_load_down, tg_nop, (void *)cpu);
+}
+
 static unsigned long
 load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
 		  unsigned long max_load_move,
@@ -2243,14 +2257,12 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
 		  int *all_pinned)
 {
 	long rem_load_move = max_load_move;
-	int busiest_cpu = cpu_of(busiest);
-	struct task_group *tg;
+	struct cfs_rq *busiest_cfs_rq;
 
 	rcu_read_lock();
-	update_h_load(busiest_cpu);
+	update_h_load(cpu_of(busiest));
 
-	list_for_each_entry_rcu(tg, &task_groups, list) {
-		struct cfs_rq *busiest_cfs_rq = tg->cfs_rq[busiest_cpu];
+	for_each_leaf_cfs_rq(busiest, busiest_cfs_rq) {
 		unsigned long busiest_h_load = busiest_cfs_rq->h_load;
 		unsigned long busiest_weight = busiest_cfs_rq->load.weight;
 		u64 rem_load, moved_load;
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index 10d018212bab..97540f0c9e47 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -185,11 +185,23 @@ static inline u64 sched_rt_period(struct rt_rq *rt_rq)
 
 typedef struct task_group *rt_rq_iter_t;
 
-#define for_each_rt_rq(rt_rq, iter, rq) \
-	for (iter = list_entry_rcu(task_groups.next, typeof(*iter), list); \
-	     (&iter->list != &task_groups) && \
-	     (rt_rq = iter->rt_rq[cpu_of(rq)]); \
-	     iter = list_entry_rcu(iter->list.next, typeof(*iter), list))
+static inline struct task_group *next_task_group(struct task_group *tg)
+{
+	do {
+		tg = list_entry_rcu(tg->list.next,
+			typeof(struct task_group), list);
+	} while (&tg->list != &task_groups && task_group_is_autogroup(tg));
+
+	if (&tg->list == &task_groups)
+		tg = NULL;
+
+	return tg;
+}
+
+#define for_each_rt_rq(rt_rq, iter, rq)					\
+	for (iter = container_of(&task_groups, typeof(*iter), list);	\
+		(iter = next_task_group(iter)) &&			\
+		(rt_rq = iter->rt_rq[cpu_of(rq)]);)
 
 static inline void list_add_leaf_rt_rq(struct rt_rq *rt_rq)
 {
@@ -1126,7 +1138,7 @@ static struct task_struct *_pick_next_task_rt(struct rq *rq)
 
 	rt_rq = &rq->rt;
 
-	if (unlikely(!rt_rq->rt_nr_running))
+	if (!rt_rq->rt_nr_running)
 		return NULL;
 
 	if (rt_rq_throttled(rt_rq))
@@ -1548,7 +1560,7 @@ skip:
 static void pre_schedule_rt(struct rq *rq, struct task_struct *prev)
 {
 	/* Try to pull RT tasks here if we lower this rq's prio */
-	if (unlikely(rt_task(prev)) && rq->rt.highest_prio.curr > prev->prio)
+	if (rq->rt.highest_prio.curr > prev->prio)
 		pull_rt_task(rq);
 }
 