aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorChen, Kenneth W <kenneth.w.chen@intel.com>2006-06-27 05:54:28 -0400
committerLinus Torvalds <torvalds@g5.osdl.org>2006-06-27 20:32:44 -0400
commitc96d145e71c5c84601322d85748512e09d7b325f (patch)
tree4762f8aa4c970295a33afbc4ee506c72d7216073
parent7a8e2a5ea4cf43c0edd6db56a156549edb0eee98 (diff)
[PATCH] sched: fix smt nice lock contention and optimization
Initial report and lock contention fix from Chris Mason: Recent benchmarks showed some performance regressions between 2.6.16 and 2.6.5. We tracked down one of the regressions to lock contention in schedule heavy workloads (~70,000 context switches per second) kernel/sched.c:dependent_sleeper() was responsible for most of the lock contention, hammering on the run queue locks. The patch below is more of a discussion point than a suggested fix (although it does reduce lock contention significantly). The dependent_sleeper code looks very expensive to me, especially for using a spinlock to bounce control between two different siblings in the same cpu. It is further optimized: * perform dependent_sleeper check after next task is determined * convert wake_sleeping_dependent to use trylock * skip smt runqueue check if trylock fails * optimize double_rq_lock now that smt nice is converted to trylock * early exit in searching first SD_SHARE_CPUPOWER domain * speedup fast path of dependent_sleeper [akpm@osdl.org: cleanup] Signed-off-by: Ken Chen <kenneth.w.chen@intel.com> Acked-by: Ingo Molnar <mingo@elte.hu> Acked-by: Con Kolivas <kernel@kolivas.org> Signed-off-by: Nick Piggin <npiggin@suse.de> Acked-by: Chris Mason <mason@suse.com> Signed-off-by: Andrew Morton <akpm@osdl.org> Signed-off-by: Linus Torvalds <torvalds@osdl.org>
-rw-r--r--kernel/sched.c182
1 files changed, 59 insertions, 123 deletions
diff --git a/kernel/sched.c b/kernel/sched.c
index 3e57712aefdf..50a67edc3584 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -239,7 +239,6 @@ struct runqueue {
239 239
240 task_t *migration_thread; 240 task_t *migration_thread;
241 struct list_head migration_queue; 241 struct list_head migration_queue;
242 int cpu;
243#endif 242#endif
244 243
245#ifdef CONFIG_SCHEDSTATS 244#ifdef CONFIG_SCHEDSTATS
@@ -1074,9 +1073,10 @@ static int sched_balance_self(int cpu, int flag)
1074 struct task_struct *t = current; 1073 struct task_struct *t = current;
1075 struct sched_domain *tmp, *sd = NULL; 1074 struct sched_domain *tmp, *sd = NULL;
1076 1075
1077 for_each_domain(cpu, tmp) 1076 for_each_domain(cpu, tmp) {
1078 if (tmp->flags & flag) 1077 if (tmp->flags & flag)
1079 sd = tmp; 1078 sd = tmp;
1079 }
1080 1080
1081 while (sd) { 1081 while (sd) {
1082 cpumask_t span; 1082 cpumask_t span;
@@ -1691,9 +1691,6 @@ unsigned long nr_active(void)
1691/* 1691/*
1692 * double_rq_lock - safely lock two runqueues 1692 * double_rq_lock - safely lock two runqueues
1693 * 1693 *
1694 * We must take them in cpu order to match code in
1695 * dependent_sleeper and wake_dependent_sleeper.
1696 *
1697 * Note this does not disable interrupts like task_rq_lock, 1694 * Note this does not disable interrupts like task_rq_lock,
1698 * you need to do so manually before calling. 1695 * you need to do so manually before calling.
1699 */ 1696 */
@@ -1705,7 +1702,7 @@ static void double_rq_lock(runqueue_t *rq1, runqueue_t *rq2)
1705 spin_lock(&rq1->lock); 1702 spin_lock(&rq1->lock);
1706 __acquire(rq2->lock); /* Fake it out ;) */ 1703 __acquire(rq2->lock); /* Fake it out ;) */
1707 } else { 1704 } else {
1708 if (rq1->cpu < rq2->cpu) { 1705 if (rq1 < rq2) {
1709 spin_lock(&rq1->lock); 1706 spin_lock(&rq1->lock);
1710 spin_lock(&rq2->lock); 1707 spin_lock(&rq2->lock);
1711 } else { 1708 } else {
@@ -1741,7 +1738,7 @@ static void double_lock_balance(runqueue_t *this_rq, runqueue_t *busiest)
1741 __acquires(this_rq->lock) 1738 __acquires(this_rq->lock)
1742{ 1739{
1743 if (unlikely(!spin_trylock(&busiest->lock))) { 1740 if (unlikely(!spin_trylock(&busiest->lock))) {
1744 if (busiest->cpu < this_rq->cpu) { 1741 if (busiest < this_rq) {
1745 spin_unlock(&this_rq->lock); 1742 spin_unlock(&this_rq->lock);
1746 spin_lock(&busiest->lock); 1743 spin_lock(&busiest->lock);
1747 spin_lock(&this_rq->lock); 1744 spin_lock(&this_rq->lock);
@@ -2352,10 +2349,11 @@ static void active_load_balance(runqueue_t *busiest_rq, int busiest_cpu)
2352 double_lock_balance(busiest_rq, target_rq); 2349 double_lock_balance(busiest_rq, target_rq);
2353 2350
2354 /* Search for an sd spanning us and the target CPU. */ 2351 /* Search for an sd spanning us and the target CPU. */
2355 for_each_domain(target_cpu, sd) 2352 for_each_domain(target_cpu, sd) {
2356 if ((sd->flags & SD_LOAD_BALANCE) && 2353 if ((sd->flags & SD_LOAD_BALANCE) &&
2357 cpu_isset(busiest_cpu, sd->span)) 2354 cpu_isset(busiest_cpu, sd->span))
2358 break; 2355 break;
2356 }
2359 2357
2360 if (unlikely(sd == NULL)) 2358 if (unlikely(sd == NULL))
2361 goto out; 2359 goto out;
@@ -2691,48 +2689,35 @@ static inline void wakeup_busy_runqueue(runqueue_t *rq)
2691 resched_task(rq->idle); 2689 resched_task(rq->idle);
2692} 2690}
2693 2691
2694static void wake_sleeping_dependent(int this_cpu, runqueue_t *this_rq) 2692/*
2693 * Called with interrupt disabled and this_rq's runqueue locked.
2694 */
2695static void wake_sleeping_dependent(int this_cpu)
2695{ 2696{
2696 struct sched_domain *tmp, *sd = NULL; 2697 struct sched_domain *tmp, *sd = NULL;
2697 cpumask_t sibling_map;
2698 int i; 2698 int i;
2699 2699
2700 for_each_domain(this_cpu, tmp) 2700 for_each_domain(this_cpu, tmp) {
2701 if (tmp->flags & SD_SHARE_CPUPOWER) 2701 if (tmp->flags & SD_SHARE_CPUPOWER) {
2702 sd = tmp; 2702 sd = tmp;
2703 break;
2704 }
2705 }
2703 2706
2704 if (!sd) 2707 if (!sd)
2705 return; 2708 return;
2706 2709
2707 /* 2710 for_each_cpu_mask(i, sd->span) {
2708 * Unlock the current runqueue because we have to lock in
2709 * CPU order to avoid deadlocks. Caller knows that we might
2710 * unlock. We keep IRQs disabled.
2711 */
2712 spin_unlock(&this_rq->lock);
2713
2714 sibling_map = sd->span;
2715
2716 for_each_cpu_mask(i, sibling_map)
2717 spin_lock(&cpu_rq(i)->lock);
2718 /*
2719 * We clear this CPU from the mask. This both simplifies the
2720 * inner loop and keps this_rq locked when we exit:
2721 */
2722 cpu_clear(this_cpu, sibling_map);
2723
2724 for_each_cpu_mask(i, sibling_map) {
2725 runqueue_t *smt_rq = cpu_rq(i); 2711 runqueue_t *smt_rq = cpu_rq(i);
2726 2712
2713 if (i == this_cpu)
2714 continue;
2715 if (unlikely(!spin_trylock(&smt_rq->lock)))
2716 continue;
2717
2727 wakeup_busy_runqueue(smt_rq); 2718 wakeup_busy_runqueue(smt_rq);
2719 spin_unlock(&smt_rq->lock);
2728 } 2720 }
2729
2730 for_each_cpu_mask(i, sibling_map)
2731 spin_unlock(&cpu_rq(i)->lock);
2732 /*
2733 * We exit with this_cpu's rq still held and IRQs
2734 * still disabled:
2735 */
2736} 2721}
2737 2722
2738/* 2723/*
@@ -2745,52 +2730,46 @@ static inline unsigned long smt_slice(task_t *p, struct sched_domain *sd)
2745 return p->time_slice * (100 - sd->per_cpu_gain) / 100; 2730 return p->time_slice * (100 - sd->per_cpu_gain) / 100;
2746} 2731}
2747 2732
2748static int dependent_sleeper(int this_cpu, runqueue_t *this_rq) 2733/*
2734 * To minimise lock contention and not have to drop this_rq's runlock we only
2735 * trylock the sibling runqueues and bypass those runqueues if we fail to
2736 * acquire their lock. As we only trylock the normal locking order does not
2737 * need to be obeyed.
2738 */
2739static int dependent_sleeper(int this_cpu, runqueue_t *this_rq, task_t *p)
2749{ 2740{
2750 struct sched_domain *tmp, *sd = NULL; 2741 struct sched_domain *tmp, *sd = NULL;
2751 cpumask_t sibling_map;
2752 prio_array_t *array;
2753 int ret = 0, i; 2742 int ret = 0, i;
2754 task_t *p;
2755 2743
2756 for_each_domain(this_cpu, tmp) 2744 /* kernel/rt threads do not participate in dependent sleeping */
2757 if (tmp->flags & SD_SHARE_CPUPOWER) 2745 if (!p->mm || rt_task(p))
2746 return 0;
2747
2748 for_each_domain(this_cpu, tmp) {
2749 if (tmp->flags & SD_SHARE_CPUPOWER) {
2758 sd = tmp; 2750 sd = tmp;
2751 break;
2752 }
2753 }
2759 2754
2760 if (!sd) 2755 if (!sd)
2761 return 0; 2756 return 0;
2762 2757
2763 /* 2758 for_each_cpu_mask(i, sd->span) {
2764 * The same locking rules and details apply as for 2759 runqueue_t *smt_rq;
2765 * wake_sleeping_dependent(): 2760 task_t *smt_curr;
2766 */
2767 spin_unlock(&this_rq->lock);
2768 sibling_map = sd->span;
2769 for_each_cpu_mask(i, sibling_map)
2770 spin_lock(&cpu_rq(i)->lock);
2771 cpu_clear(this_cpu, sibling_map);
2772 2761
2773 /* 2762 if (i == this_cpu)
2774 * Establish next task to be run - it might have gone away because 2763 continue;
2775 * we released the runqueue lock above:
2776 */
2777 if (!this_rq->nr_running)
2778 goto out_unlock;
2779 array = this_rq->active;
2780 if (!array->nr_active)
2781 array = this_rq->expired;
2782 BUG_ON(!array->nr_active);
2783 2764
2784 p = list_entry(array->queue[sched_find_first_bit(array->bitmap)].next, 2765 smt_rq = cpu_rq(i);
2785 task_t, run_list); 2766 if (unlikely(!spin_trylock(&smt_rq->lock)))
2767 continue;
2786 2768
2787 for_each_cpu_mask(i, sibling_map) { 2769 smt_curr = smt_rq->curr;
2788 runqueue_t *smt_rq = cpu_rq(i);
2789 task_t *smt_curr = smt_rq->curr;
2790 2770
2791 /* Kernel threads do not participate in dependent sleeping */ 2771 if (!smt_curr->mm)
2792 if (!p->mm || !smt_curr->mm || rt_task(p)) 2772 goto unlock;
2793 goto check_smt_task;
2794 2773
2795 /* 2774 /*
2796 * If a user task with lower static priority than the 2775 * If a user task with lower static priority than the
@@ -2808,49 +2787,24 @@ static int dependent_sleeper(int this_cpu, runqueue_t *this_rq)
2808 if ((jiffies % DEF_TIMESLICE) > 2787 if ((jiffies % DEF_TIMESLICE) >
2809 (sd->per_cpu_gain * DEF_TIMESLICE / 100)) 2788 (sd->per_cpu_gain * DEF_TIMESLICE / 100))
2810 ret = 1; 2789 ret = 1;
2811 } else 2790 } else {
2812 if (smt_curr->static_prio < p->static_prio && 2791 if (smt_curr->static_prio < p->static_prio &&
2813 !TASK_PREEMPTS_CURR(p, smt_rq) && 2792 !TASK_PREEMPTS_CURR(p, smt_rq) &&
2814 smt_slice(smt_curr, sd) > task_timeslice(p)) 2793 smt_slice(smt_curr, sd) > task_timeslice(p))
2815 ret = 1; 2794 ret = 1;
2816
2817check_smt_task:
2818 if ((!smt_curr->mm && smt_curr != smt_rq->idle) ||
2819 rt_task(smt_curr))
2820 continue;
2821 if (!p->mm) {
2822 wakeup_busy_runqueue(smt_rq);
2823 continue;
2824 }
2825
2826 /*
2827 * Reschedule a lower priority task on the SMT sibling for
2828 * it to be put to sleep, or wake it up if it has been put to
2829 * sleep for priority reasons to see if it should run now.
2830 */
2831 if (rt_task(p)) {
2832 if ((jiffies % DEF_TIMESLICE) >
2833 (sd->per_cpu_gain * DEF_TIMESLICE / 100))
2834 resched_task(smt_curr);
2835 } else {
2836 if (TASK_PREEMPTS_CURR(p, smt_rq) &&
2837 smt_slice(p, sd) > task_timeslice(smt_curr))
2838 resched_task(smt_curr);
2839 else
2840 wakeup_busy_runqueue(smt_rq);
2841 } 2795 }
2796unlock:
2797 spin_unlock(&smt_rq->lock);
2842 } 2798 }
2843out_unlock:
2844 for_each_cpu_mask(i, sibling_map)
2845 spin_unlock(&cpu_rq(i)->lock);
2846 return ret; 2799 return ret;
2847} 2800}
2848#else 2801#else
2849static inline void wake_sleeping_dependent(int this_cpu, runqueue_t *this_rq) 2802static inline void wake_sleeping_dependent(int this_cpu)
2850{ 2803{
2851} 2804}
2852 2805
2853static inline int dependent_sleeper(int this_cpu, runqueue_t *this_rq) 2806static inline int dependent_sleeper(int this_cpu, runqueue_t *this_rq,
2807 task_t *p)
2854{ 2808{
2855 return 0; 2809 return 0;
2856} 2810}
@@ -2972,32 +2926,13 @@ need_resched_nonpreemptible:
2972 2926
2973 cpu = smp_processor_id(); 2927 cpu = smp_processor_id();
2974 if (unlikely(!rq->nr_running)) { 2928 if (unlikely(!rq->nr_running)) {
2975go_idle:
2976 idle_balance(cpu, rq); 2929 idle_balance(cpu, rq);
2977 if (!rq->nr_running) { 2930 if (!rq->nr_running) {
2978 next = rq->idle; 2931 next = rq->idle;
2979 rq->expired_timestamp = 0; 2932 rq->expired_timestamp = 0;
2980 wake_sleeping_dependent(cpu, rq); 2933 wake_sleeping_dependent(cpu);
2981 /*
2982 * wake_sleeping_dependent() might have released
2983 * the runqueue, so break out if we got new
2984 * tasks meanwhile:
2985 */
2986 if (!rq->nr_running)
2987 goto switch_tasks;
2988 }
2989 } else {
2990 if (dependent_sleeper(cpu, rq)) {
2991 next = rq->idle;
2992 goto switch_tasks; 2934 goto switch_tasks;
2993 } 2935 }
2994 /*
2995 * dependent_sleeper() releases and reacquires the runqueue
2996 * lock, hence go into the idle loop if the rq went
2997 * empty meanwhile:
2998 */
2999 if (unlikely(!rq->nr_running))
3000 goto go_idle;
3001 } 2936 }
3002 2937
3003 array = rq->active; 2938 array = rq->active;
@@ -3035,6 +2970,8 @@ go_idle:
3035 } 2970 }
3036 } 2971 }
3037 next->sleep_type = SLEEP_NORMAL; 2972 next->sleep_type = SLEEP_NORMAL;
2973 if (dependent_sleeper(cpu, rq, next))
2974 next = rq->idle;
3038switch_tasks: 2975switch_tasks:
3039 if (next == rq->idle) 2976 if (next == rq->idle)
3040 schedstat_inc(rq, sched_goidle); 2977 schedstat_inc(rq, sched_goidle);
@@ -6144,7 +6081,6 @@ void __init sched_init(void)
6144 rq->push_cpu = 0; 6081 rq->push_cpu = 0;
6145 rq->migration_thread = NULL; 6082 rq->migration_thread = NULL;
6146 INIT_LIST_HEAD(&rq->migration_queue); 6083 INIT_LIST_HEAD(&rq->migration_queue);
6147 rq->cpu = i;
6148#endif 6084#endif
6149 atomic_set(&rq->nr_iowait, 0); 6085 atomic_set(&rq->nr_iowait, 0);
6150 6086
@@ -6205,7 +6141,7 @@ void normalize_rt_tasks(void)
6205 runqueue_t *rq; 6141 runqueue_t *rq;
6206 6142
6207 read_lock_irq(&tasklist_lock); 6143 read_lock_irq(&tasklist_lock);
6208 for_each_process (p) { 6144 for_each_process(p) {
6209 if (!rt_task(p)) 6145 if (!rt_task(p))
6210 continue; 6146 continue;
6211 6147