[PATCH] sched: fix smt nice lock contention and optimization

Initial report and lock contention fix from Chris Mason: Recent benchmarks showed some performance regressions between 2.6.16 and 2.6.5. We tracked down one of the regressions to lock contention in schedule heavy workloads (~70,000 context switches per second) kernel/sched.c:dependent_sleeper() was responsible for most of the lock contention, hammering on the run queue locks. The patch below is more of a discussion point than a suggested fix (although it does reduce lock contention significantly). The dependent_sleeper code looks very expensive to me, especially for using a spinlock to bounce control between two different siblings in the same cpu. It is further optimized: * perform dependent_sleeper check after next task is determined * convert wake_sleeping_dependent to use trylock * skip smt runqueue check if trylock fails * optimize double_rq_lock now that smt nice is converted to trylock * early exit in searching first SD_SHARE_CPUPOWER domain * speedup fast path of dependent_sleeper [akpm@osdl.org: cleanup] Signed-off-by: Ken Chen <kenneth.w.chen@intel.com> Acked-by: Ingo Molnar <mingo@elte.hu> Acked-by: Con Kolivas <kernel@kolivas.org> Signed-off-by: Nick Piggin <npiggin@suse.de> Acked-by: Chris Mason <mason@suse.com> Signed-off-by: Andrew Morton <akpm@osdl.org> Signed-off-by: Linus Torvalds <torvalds@osdl.org>
author: Chen, Kenneth W <kenneth.w.chen@intel.com> 2006-06-27 05:54:28 -0400
committer: Linus Torvalds <torvalds@g5.osdl.org> 2006-06-27 20:32:44 -0400
commit: c96d145e71c5c84601322d85748512e09d7b325f (patch)
tree: 4762f8aa4c970295a33afbc4ee506c72d7216073 /kernel
parent: 7a8e2a5ea4cf43c0edd6db56a156549edb0eee98 (diff)
1 files changed, 59 insertions, 123 deletions
diff --git a/kernel/sched.c b/kernel/sched.c
index 3e57712aefdf..50a67edc3584 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -239,7 +239,6 @@ struct runqueue {
        task_t *migration_thread;
        struct list_head migration_queue;
-        int cpu;
 #endif
 #ifdef CONFIG_SCHEDSTATS
@@ -1074,9 +1073,10 @@ static int sched_balance_self(int cpu, int flag)
        struct task_struct *t = current;
        struct sched_domain *tmp, *sd = NULL;
-        for_each_domain(cpu, tmp)
+        for_each_domain(cpu, tmp) {
                if (tmp->flags & flag)
                        sd = tmp;
+        }
        while (sd) {
                cpumask_t span;
@@ -1691,9 +1691,6 @@ unsigned long nr_active(void)
 /*
 * double_rq_lock - safely lock two runqueues
 *
- * We must take them in cpu order to match code in
- * dependent_sleeper and wake_dependent_sleeper.
- *
 * Note this does not disable interrupts like task_rq_lock,
 * you need to do so manually before calling.
 */
@@ -1705,7 +1702,7 @@ static void double_rq_lock(runqueue_t *rq1, runqueue_t *rq2)
                spin_lock(&rq1->lock);
                __acquire(rq2->lock);   /* Fake it out ;) */
        } else {
-                if (rq1->cpu < rq2->cpu) {
+                if (rq1 < rq2) {
                        spin_lock(&rq1->lock);
                        spin_lock(&rq2->lock);
                } else {
@@ -1741,7 +1738,7 @@ static void double_lock_balance(runqueue_t *this_rq, runqueue_t *busiest)
        __acquires(this_rq->lock)
 {
        if (unlikely(!spin_trylock(&busiest->lock))) {
-                if (busiest->cpu < this_rq->cpu) {
+                if (busiest < this_rq) {
                        spin_unlock(&this_rq->lock);
                        spin_lock(&busiest->lock);
                        spin_lock(&this_rq->lock);
@@ -2352,10 +2349,11 @@ static void active_load_balance(runqueue_t *busiest_rq, int busiest_cpu)
        double_lock_balance(busiest_rq, target_rq);
        /* Search for an sd spanning us and the target CPU. */
-        for_each_domain(target_cpu, sd)
+        for_each_domain(target_cpu, sd) {
                if ((sd->flags & SD_LOAD_BALANCE) &&
                        cpu_isset(busiest_cpu, sd->span))
                                break;
+        }
        if (unlikely(sd == NULL))
                goto out;
@@ -2691,48 +2689,35 @@ static inline void wakeup_busy_runqueue(runqueue_t *rq)
                resched_task(rq->idle);
 }
-static void wake_sleeping_dependent(int this_cpu, runqueue_t *this_rq)
+/*
+ * Called with interrupt disabled and this_rq's runqueue locked.
+ */
+static void wake_sleeping_dependent(int this_cpu)
 {
        struct sched_domain *tmp, *sd = NULL;
-        cpumask_t sibling_map;
        int i;
-        for_each_domain(this_cpu, tmp)
+        for_each_domain(this_cpu, tmp) {
-                if (tmp->flags & SD_SHARE_CPUPOWER)
+                if (tmp->flags & SD_SHARE_CPUPOWER) {
                        sd = tmp;
+                        break;
+                }
+        }
        if (!sd)
                return;
-        /*
+        for_each_cpu_mask(i, sd->span) {
-         * Unlock the current runqueue because we have to lock in
-         * CPU order to avoid deadlocks. Caller knows that we might
-         * unlock. We keep IRQs disabled.
-         */
-        spin_unlock(&this_rq->lock);
-        sibling_map = sd->span;
-        for_each_cpu_mask(i, sibling_map)
-                spin_lock(&cpu_rq(i)->lock);
-        /*
-         * We clear this CPU from the mask. This both simplifies the
-         * inner loop and keps this_rq locked when we exit:
-         */
-        cpu_clear(this_cpu, sibling_map);
-        for_each_cpu_mask(i, sibling_map) {
                runqueue_t *smt_rq = cpu_rq(i);
+                if (i == this_cpu)
+                        continue;
+                if (unlikely(!spin_trylock(&smt_rq->lock)))
+                        continue;
                wakeup_busy_runqueue(smt_rq);
+                spin_unlock(&smt_rq->lock);
        }
-        for_each_cpu_mask(i, sibling_map)
-                spin_unlock(&cpu_rq(i)->lock);
-        /*
-         * We exit with this_cpu's rq still held and IRQs
-         * still disabled:
-         */
 }
 /*
@@ -2745,52 +2730,46 @@ static inline unsigned long smt_slice(task_t *p, struct sched_domain *sd)
        return p->time_slice * (100 - sd->per_cpu_gain) / 100;
 }
-static int dependent_sleeper(int this_cpu, runqueue_t *this_rq)
+/*
+ * To minimise lock contention and not have to drop this_rq's runlock we only
+ * trylock the sibling runqueues and bypass those runqueues if we fail to
+ * acquire their lock. As we only trylock the normal locking order does not
+ * need to be obeyed.
+ */
+static int dependent_sleeper(int this_cpu, runqueue_t *this_rq, task_t *p)
 {
        struct sched_domain *tmp, *sd = NULL;
-        cpumask_t sibling_map;
-        prio_array_t *array;
        int ret = 0, i;
-        task_t *p;
-        for_each_domain(this_cpu, tmp)
+        /* kernel/rt threads do not participate in dependent sleeping */
-                if (tmp->flags & SD_SHARE_CPUPOWER)
+        if (!p->mm || rt_task(p))
+                return 0;
+        for_each_domain(this_cpu, tmp) {
+                if (tmp->flags & SD_SHARE_CPUPOWER) {
                        sd = tmp;
+                        break;
+                }
+        }
        if (!sd)
                return 0;
-        /*
+        for_each_cpu_mask(i, sd->span) {
-         * The same locking rules and details apply as for
+                runqueue_t *smt_rq;
-         * wake_sleeping_dependent():
+                task_t *smt_curr;
-         */
-        spin_unlock(&this_rq->lock);
-        sibling_map = sd->span;
-        for_each_cpu_mask(i, sibling_map)
-                spin_lock(&cpu_rq(i)->lock);
-        cpu_clear(this_cpu, sibling_map);
-        /*
+                if (i == this_cpu)
-         * Establish next task to be run - it might have gone away because
+                        continue;
-         * we released the runqueue lock above:
-         */
-        if (!this_rq->nr_running)
-                goto out_unlock;
-        array = this_rq->active;
-        if (!array->nr_active)
-                array = this_rq->expired;
-        BUG_ON(!array->nr_active);
-        p = list_entry(array->queue[sched_find_first_bit(array->bitmap)].next,
+                smt_rq = cpu_rq(i);
-                task_t, run_list);
+                if (unlikely(!spin_trylock(&smt_rq->lock)))
+                        continue;
-        for_each_cpu_mask(i, sibling_map) {
+                smt_curr = smt_rq->curr;
-                runqueue_t *smt_rq = cpu_rq(i);
-                task_t *smt_curr = smt_rq->curr;
-                /* Kernel threads do not participate in dependent sleeping */
+                if (!smt_curr->mm)
-                if (!p->mm || !smt_curr->mm || rt_task(p))
+                        goto unlock;
-                        goto check_smt_task;
                /*
                 * If a user task with lower static priority than the
@@ -2808,49 +2787,24 @@ static int dependent_sleeper(int this_cpu, runqueue_t *this_rq)
                        if ((jiffies % DEF_TIMESLICE) >
                                (sd->per_cpu_gain * DEF_TIMESLICE / 100))
                                        ret = 1;
-                } else
+                } else {
                        if (smt_curr->static_prio < p->static_prio &&
                                !TASK_PREEMPTS_CURR(p, smt_rq) &&
                                smt_slice(smt_curr, sd) > task_timeslice(p))
                                        ret = 1;
-check_smt_task:
-                if ((!smt_curr->mm && smt_curr != smt_rq->idle) ||
-                        rt_task(smt_curr))
-                                continue;
-                if (!p->mm) {
-                        wakeup_busy_runqueue(smt_rq);
-                        continue;
-                }
-                /*
-                 * Reschedule a lower priority task on the SMT sibling for
-                 * it to be put to sleep, or wake it up if it has been put to
-                 * sleep for priority reasons to see if it should run now.
-                 */
-                if (rt_task(p)) {
-                        if ((jiffies % DEF_TIMESLICE) >
-                                (sd->per_cpu_gain * DEF_TIMESLICE / 100))
-                                        resched_task(smt_curr);
-                } else {
-                        if (TASK_PREEMPTS_CURR(p, smt_rq) &&
-                                smt_slice(p, sd) > task_timeslice(smt_curr))
-                                        resched_task(smt_curr);
-                        else
-                                wakeup_busy_runqueue(smt_rq);
                }
+unlock:
+                spin_unlock(&smt_rq->lock);
        }
-out_unlock:
-        for_each_cpu_mask(i, sibling_map)
-                spin_unlock(&cpu_rq(i)->lock);
        return ret;
 }
 #else
-static inline void wake_sleeping_dependent(int this_cpu, runqueue_t *this_rq)
+static inline void wake_sleeping_dependent(int this_cpu)
 {
 }
-static inline int dependent_sleeper(int this_cpu, runqueue_t *this_rq)
+static inline int dependent_sleeper(int this_cpu, runqueue_t *this_rq,
+                                        task_t *p)
 {
        return 0;
 }
@@ -2972,32 +2926,13 @@ need_resched_nonpreemptible:
        cpu = smp_processor_id();
        if (unlikely(!rq->nr_running)) {
-go_idle:
                idle_balance(cpu, rq);
                if (!rq->nr_running) {
                        next = rq->idle;
                        rq->expired_timestamp = 0;
-                        wake_sleeping_dependent(cpu, rq);
+                        wake_sleeping_dependent(cpu);
-                        /*
-                         * wake_sleeping_dependent() might have released
-                         * the runqueue, so break out if we got new
-                         * tasks meanwhile:
-                         */
-                        if (!rq->nr_running)
-                                goto switch_tasks;
-                }
-        } else {
-                if (dependent_sleeper(cpu, rq)) {
-                        next = rq->idle;
                        goto switch_tasks;
                }
-                /*
-                 * dependent_sleeper() releases and reacquires the runqueue
-                 * lock, hence go into the idle loop if the rq went
-                 * empty meanwhile:
-                 */
-                if (unlikely(!rq->nr_running))
-                        goto go_idle;
        }
        array = rq->active;
@@ -3035,6 +2970,8 @@ go_idle:
                }
        }
        next->sleep_type = SLEEP_NORMAL;
+        if (dependent_sleeper(cpu, rq, next))
+                next = rq->idle;
 switch_tasks:
        if (next == rq->idle)
                schedstat_inc(rq, sched_goidle);
@@ -6144,7 +6081,6 @@ void __init sched_init(void)
                rq->push_cpu = 0;
                rq->migration_thread = NULL;
                INIT_LIST_HEAD(&rq->migration_queue);
-                rq->cpu = i;
 #endif
                atomic_set(&rq->nr_iowait, 0);
@@ -6205,7 +6141,7 @@ void normalize_rt_tasks(void)
        runqueue_t *rq;
        read_lock_irq(&tasklist_lock);
-        for_each_process (p) {
+        for_each_process(p) {
                if (!rt_task(p))
                        continue;
author	Chen, Kenneth W <kenneth.w.chen@intel.com>	2006-06-27 05:54:28 -0400
committer	Linus Torvalds <torvalds@g5.osdl.org>	2006-06-27 20:32:44 -0400
commit	c96d145e71c5c84601322d85748512e09d7b325f (patch)
tree	4762f8aa4c970295a33afbc4ee506c72d7216073 /kernel
parent	7a8e2a5ea4cf43c0edd6db56a156549edb0eee98 (diff)