diff options
| author | Chen, Kenneth W <kenneth.w.chen@intel.com> | 2006-06-27 05:54:28 -0400 | 
|---|---|---|
| committer | Linus Torvalds <torvalds@g5.osdl.org> | 2006-06-27 20:32:44 -0400 | 
| commit | c96d145e71c5c84601322d85748512e09d7b325f (patch) | |
| tree | 4762f8aa4c970295a33afbc4ee506c72d7216073 /kernel/sched.c | |
| parent | 7a8e2a5ea4cf43c0edd6db56a156549edb0eee98 (diff) | |
[PATCH] sched: fix smt nice lock contention and optimization
Initial report and lock contention fix from Chris Mason:
Recent benchmarks showed some performance regressions between 2.6.16 and
2.6.5.  We tracked down one of the regressions to lock contention in
schedule heavy workloads (~70,000 context switches per second)
kernel/sched.c:dependent_sleeper() was responsible for most of the lock
contention, hammering on the run queue locks.  The patch below is more of a
discussion point than a suggested fix (although it does reduce lock
contention significantly).  The dependent_sleeper code looks very expensive
to me, especially for using a spinlock to bounce control between two
different siblings in the same cpu.
It is further optimized:
* perform dependent_sleeper check after next task is determined
* convert wake_sleeping_dependent to use trylock
* skip smt runqueue check if trylock fails
* optimize double_rq_lock now that smt nice is converted to trylock
* early exit in searching first SD_SHARE_CPUPOWER domain
* speedup fast path of dependent_sleeper
[akpm@osdl.org: cleanup]
Signed-off-by: Ken Chen <kenneth.w.chen@intel.com>
Acked-by: Ingo Molnar <mingo@elte.hu>
Acked-by: Con Kolivas <kernel@kolivas.org>
Signed-off-by: Nick Piggin <npiggin@suse.de>
Acked-by: Chris Mason <mason@suse.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
Diffstat (limited to 'kernel/sched.c')
| -rw-r--r-- | kernel/sched.c | 182 | 
1 files changed, 59 insertions, 123 deletions
| diff --git a/kernel/sched.c b/kernel/sched.c index 3e57712aefdf..50a67edc3584 100644 --- a/kernel/sched.c +++ b/kernel/sched.c | |||
| @@ -239,7 +239,6 @@ struct runqueue { | |||
| 239 | 239 | ||
| 240 | task_t *migration_thread; | 240 | task_t *migration_thread; | 
| 241 | struct list_head migration_queue; | 241 | struct list_head migration_queue; | 
| 242 | int cpu; | ||
| 243 | #endif | 242 | #endif | 
| 244 | 243 | ||
| 245 | #ifdef CONFIG_SCHEDSTATS | 244 | #ifdef CONFIG_SCHEDSTATS | 
| @@ -1074,9 +1073,10 @@ static int sched_balance_self(int cpu, int flag) | |||
| 1074 | struct task_struct *t = current; | 1073 | struct task_struct *t = current; | 
| 1075 | struct sched_domain *tmp, *sd = NULL; | 1074 | struct sched_domain *tmp, *sd = NULL; | 
| 1076 | 1075 | ||
| 1077 | for_each_domain(cpu, tmp) | 1076 | for_each_domain(cpu, tmp) { | 
| 1078 | if (tmp->flags & flag) | 1077 | if (tmp->flags & flag) | 
| 1079 | sd = tmp; | 1078 | sd = tmp; | 
| 1079 | } | ||
| 1080 | 1080 | ||
| 1081 | while (sd) { | 1081 | while (sd) { | 
| 1082 | cpumask_t span; | 1082 | cpumask_t span; | 
| @@ -1691,9 +1691,6 @@ unsigned long nr_active(void) | |||
| 1691 | /* | 1691 | /* | 
| 1692 | * double_rq_lock - safely lock two runqueues | 1692 | * double_rq_lock - safely lock two runqueues | 
| 1693 | * | 1693 | * | 
| 1694 | * We must take them in cpu order to match code in | ||
| 1695 | * dependent_sleeper and wake_dependent_sleeper. | ||
| 1696 | * | ||
| 1697 | * Note this does not disable interrupts like task_rq_lock, | 1694 | * Note this does not disable interrupts like task_rq_lock, | 
| 1698 | * you need to do so manually before calling. | 1695 | * you need to do so manually before calling. | 
| 1699 | */ | 1696 | */ | 
| @@ -1705,7 +1702,7 @@ static void double_rq_lock(runqueue_t *rq1, runqueue_t *rq2) | |||
| 1705 | spin_lock(&rq1->lock); | 1702 | spin_lock(&rq1->lock); | 
| 1706 | __acquire(rq2->lock); /* Fake it out ;) */ | 1703 | __acquire(rq2->lock); /* Fake it out ;) */ | 
| 1707 | } else { | 1704 | } else { | 
| 1708 | if (rq1->cpu < rq2->cpu) { | 1705 | if (rq1 < rq2) { | 
| 1709 | spin_lock(&rq1->lock); | 1706 | spin_lock(&rq1->lock); | 
| 1710 | spin_lock(&rq2->lock); | 1707 | spin_lock(&rq2->lock); | 
| 1711 | } else { | 1708 | } else { | 
| @@ -1741,7 +1738,7 @@ static void double_lock_balance(runqueue_t *this_rq, runqueue_t *busiest) | |||
| 1741 | __acquires(this_rq->lock) | 1738 | __acquires(this_rq->lock) | 
| 1742 | { | 1739 | { | 
| 1743 | if (unlikely(!spin_trylock(&busiest->lock))) { | 1740 | if (unlikely(!spin_trylock(&busiest->lock))) { | 
| 1744 | if (busiest->cpu < this_rq->cpu) { | 1741 | if (busiest < this_rq) { | 
| 1745 | spin_unlock(&this_rq->lock); | 1742 | spin_unlock(&this_rq->lock); | 
| 1746 | spin_lock(&busiest->lock); | 1743 | spin_lock(&busiest->lock); | 
| 1747 | spin_lock(&this_rq->lock); | 1744 | spin_lock(&this_rq->lock); | 
| @@ -2352,10 +2349,11 @@ static void active_load_balance(runqueue_t *busiest_rq, int busiest_cpu) | |||
| 2352 | double_lock_balance(busiest_rq, target_rq); | 2349 | double_lock_balance(busiest_rq, target_rq); | 
| 2353 | 2350 | ||
| 2354 | /* Search for an sd spanning us and the target CPU. */ | 2351 | /* Search for an sd spanning us and the target CPU. */ | 
| 2355 | for_each_domain(target_cpu, sd) | 2352 | for_each_domain(target_cpu, sd) { | 
| 2356 | if ((sd->flags & SD_LOAD_BALANCE) && | 2353 | if ((sd->flags & SD_LOAD_BALANCE) && | 
| 2357 | cpu_isset(busiest_cpu, sd->span)) | 2354 | cpu_isset(busiest_cpu, sd->span)) | 
| 2358 | break; | 2355 | break; | 
| 2356 | } | ||
| 2359 | 2357 | ||
| 2360 | if (unlikely(sd == NULL)) | 2358 | if (unlikely(sd == NULL)) | 
| 2361 | goto out; | 2359 | goto out; | 
| @@ -2691,48 +2689,35 @@ static inline void wakeup_busy_runqueue(runqueue_t *rq) | |||
| 2691 | resched_task(rq->idle); | 2689 | resched_task(rq->idle); | 
| 2692 | } | 2690 | } | 
| 2693 | 2691 | ||
| 2694 | static void wake_sleeping_dependent(int this_cpu, runqueue_t *this_rq) | 2692 | /* | 
| 2693 | * Called with interrupt disabled and this_rq's runqueue locked. | ||
| 2694 | */ | ||
| 2695 | static void wake_sleeping_dependent(int this_cpu) | ||
| 2695 | { | 2696 | { | 
| 2696 | struct sched_domain *tmp, *sd = NULL; | 2697 | struct sched_domain *tmp, *sd = NULL; | 
| 2697 | cpumask_t sibling_map; | ||
| 2698 | int i; | 2698 | int i; | 
| 2699 | 2699 | ||
| 2700 | for_each_domain(this_cpu, tmp) | 2700 | for_each_domain(this_cpu, tmp) { | 
| 2701 | if (tmp->flags & SD_SHARE_CPUPOWER) | 2701 | if (tmp->flags & SD_SHARE_CPUPOWER) { | 
| 2702 | sd = tmp; | 2702 | sd = tmp; | 
| 2703 | break; | ||
| 2704 | } | ||
| 2705 | } | ||
| 2703 | 2706 | ||
| 2704 | if (!sd) | 2707 | if (!sd) | 
| 2705 | return; | 2708 | return; | 
| 2706 | 2709 | ||
| 2707 | /* | 2710 | for_each_cpu_mask(i, sd->span) { | 
| 2708 | * Unlock the current runqueue because we have to lock in | ||
| 2709 | * CPU order to avoid deadlocks. Caller knows that we might | ||
| 2710 | * unlock. We keep IRQs disabled. | ||
| 2711 | */ | ||
| 2712 | spin_unlock(&this_rq->lock); | ||
| 2713 | |||
| 2714 | sibling_map = sd->span; | ||
| 2715 | |||
| 2716 | for_each_cpu_mask(i, sibling_map) | ||
| 2717 | spin_lock(&cpu_rq(i)->lock); | ||
| 2718 | /* | ||
| 2719 | * We clear this CPU from the mask. This both simplifies the | ||
| 2720 | * inner loop and keps this_rq locked when we exit: | ||
| 2721 | */ | ||
| 2722 | cpu_clear(this_cpu, sibling_map); | ||
| 2723 | |||
| 2724 | for_each_cpu_mask(i, sibling_map) { | ||
| 2725 | runqueue_t *smt_rq = cpu_rq(i); | 2711 | runqueue_t *smt_rq = cpu_rq(i); | 
| 2726 | 2712 | ||
| 2713 | if (i == this_cpu) | ||
| 2714 | continue; | ||
| 2715 | if (unlikely(!spin_trylock(&smt_rq->lock))) | ||
| 2716 | continue; | ||
| 2717 | |||
| 2727 | wakeup_busy_runqueue(smt_rq); | 2718 | wakeup_busy_runqueue(smt_rq); | 
| 2719 | spin_unlock(&smt_rq->lock); | ||
| 2728 | } | 2720 | } | 
| 2729 | |||
| 2730 | for_each_cpu_mask(i, sibling_map) | ||
| 2731 | spin_unlock(&cpu_rq(i)->lock); | ||
| 2732 | /* | ||
| 2733 | * We exit with this_cpu's rq still held and IRQs | ||
| 2734 | * still disabled: | ||
| 2735 | */ | ||
| 2736 | } | 2721 | } | 
| 2737 | 2722 | ||
| 2738 | /* | 2723 | /* | 
| @@ -2745,52 +2730,46 @@ static inline unsigned long smt_slice(task_t *p, struct sched_domain *sd) | |||
| 2745 | return p->time_slice * (100 - sd->per_cpu_gain) / 100; | 2730 | return p->time_slice * (100 - sd->per_cpu_gain) / 100; | 
| 2746 | } | 2731 | } | 
| 2747 | 2732 | ||
| 2748 | static int dependent_sleeper(int this_cpu, runqueue_t *this_rq) | 2733 | /* | 
| 2734 | * To minimise lock contention and not have to drop this_rq's runlock we only | ||
| 2735 | * trylock the sibling runqueues and bypass those runqueues if we fail to | ||
| 2736 | * acquire their lock. As we only trylock the normal locking order does not | ||
| 2737 | * need to be obeyed. | ||
| 2738 | */ | ||
| 2739 | static int dependent_sleeper(int this_cpu, runqueue_t *this_rq, task_t *p) | ||
| 2749 | { | 2740 | { | 
| 2750 | struct sched_domain *tmp, *sd = NULL; | 2741 | struct sched_domain *tmp, *sd = NULL; | 
| 2751 | cpumask_t sibling_map; | ||
| 2752 | prio_array_t *array; | ||
| 2753 | int ret = 0, i; | 2742 | int ret = 0, i; | 
| 2754 | task_t *p; | ||
| 2755 | 2743 | ||
| 2756 | for_each_domain(this_cpu, tmp) | 2744 | /* kernel/rt threads do not participate in dependent sleeping */ | 
| 2757 | if (tmp->flags & SD_SHARE_CPUPOWER) | 2745 | if (!p->mm || rt_task(p)) | 
| 2746 | return 0; | ||
| 2747 | |||
| 2748 | for_each_domain(this_cpu, tmp) { | ||
| 2749 | if (tmp->flags & SD_SHARE_CPUPOWER) { | ||
| 2758 | sd = tmp; | 2750 | sd = tmp; | 
| 2751 | break; | ||
| 2752 | } | ||
| 2753 | } | ||
| 2759 | 2754 | ||
| 2760 | if (!sd) | 2755 | if (!sd) | 
| 2761 | return 0; | 2756 | return 0; | 
| 2762 | 2757 | ||
| 2763 | /* | 2758 | for_each_cpu_mask(i, sd->span) { | 
| 2764 | * The same locking rules and details apply as for | 2759 | runqueue_t *smt_rq; | 
| 2765 | * wake_sleeping_dependent(): | 2760 | task_t *smt_curr; | 
| 2766 | */ | ||
| 2767 | spin_unlock(&this_rq->lock); | ||
| 2768 | sibling_map = sd->span; | ||
| 2769 | for_each_cpu_mask(i, sibling_map) | ||
| 2770 | spin_lock(&cpu_rq(i)->lock); | ||
| 2771 | cpu_clear(this_cpu, sibling_map); | ||
| 2772 | 2761 | ||
| 2773 | /* | 2762 | if (i == this_cpu) | 
| 2774 | * Establish next task to be run - it might have gone away because | 2763 | continue; | 
| 2775 | * we released the runqueue lock above: | ||
| 2776 | */ | ||
| 2777 | if (!this_rq->nr_running) | ||
| 2778 | goto out_unlock; | ||
| 2779 | array = this_rq->active; | ||
| 2780 | if (!array->nr_active) | ||
| 2781 | array = this_rq->expired; | ||
| 2782 | BUG_ON(!array->nr_active); | ||
| 2783 | 2764 | ||
| 2784 | p = list_entry(array->queue[sched_find_first_bit(array->bitmap)].next, | 2765 | smt_rq = cpu_rq(i); | 
| 2785 | task_t, run_list); | 2766 | if (unlikely(!spin_trylock(&smt_rq->lock))) | 
| 2767 | continue; | ||
| 2786 | 2768 | ||
| 2787 | for_each_cpu_mask(i, sibling_map) { | 2769 | smt_curr = smt_rq->curr; | 
| 2788 | runqueue_t *smt_rq = cpu_rq(i); | ||
| 2789 | task_t *smt_curr = smt_rq->curr; | ||
| 2790 | 2770 | ||
| 2791 | /* Kernel threads do not participate in dependent sleeping */ | 2771 | if (!smt_curr->mm) | 
| 2792 | if (!p->mm || !smt_curr->mm || rt_task(p)) | 2772 | goto unlock; | 
| 2793 | goto check_smt_task; | ||
| 2794 | 2773 | ||
| 2795 | /* | 2774 | /* | 
| 2796 | * If a user task with lower static priority than the | 2775 | * If a user task with lower static priority than the | 
| @@ -2808,49 +2787,24 @@ static int dependent_sleeper(int this_cpu, runqueue_t *this_rq) | |||
| 2808 | if ((jiffies % DEF_TIMESLICE) > | 2787 | if ((jiffies % DEF_TIMESLICE) > | 
| 2809 | (sd->per_cpu_gain * DEF_TIMESLICE / 100)) | 2788 | (sd->per_cpu_gain * DEF_TIMESLICE / 100)) | 
| 2810 | ret = 1; | 2789 | ret = 1; | 
| 2811 | } else | 2790 | } else { | 
| 2812 | if (smt_curr->static_prio < p->static_prio && | 2791 | if (smt_curr->static_prio < p->static_prio && | 
| 2813 | !TASK_PREEMPTS_CURR(p, smt_rq) && | 2792 | !TASK_PREEMPTS_CURR(p, smt_rq) && | 
| 2814 | smt_slice(smt_curr, sd) > task_timeslice(p)) | 2793 | smt_slice(smt_curr, sd) > task_timeslice(p)) | 
| 2815 | ret = 1; | 2794 | ret = 1; | 
| 2816 | |||
| 2817 | check_smt_task: | ||
| 2818 | if ((!smt_curr->mm && smt_curr != smt_rq->idle) || | ||
| 2819 | rt_task(smt_curr)) | ||
| 2820 | continue; | ||
| 2821 | if (!p->mm) { | ||
| 2822 | wakeup_busy_runqueue(smt_rq); | ||
| 2823 | continue; | ||
| 2824 | } | ||
| 2825 | |||
| 2826 | /* | ||
| 2827 | * Reschedule a lower priority task on the SMT sibling for | ||
| 2828 | * it to be put to sleep, or wake it up if it has been put to | ||
| 2829 | * sleep for priority reasons to see if it should run now. | ||
| 2830 | */ | ||
| 2831 | if (rt_task(p)) { | ||
| 2832 | if ((jiffies % DEF_TIMESLICE) > | ||
| 2833 | (sd->per_cpu_gain * DEF_TIMESLICE / 100)) | ||
| 2834 | resched_task(smt_curr); | ||
| 2835 | } else { | ||
| 2836 | if (TASK_PREEMPTS_CURR(p, smt_rq) && | ||
| 2837 | smt_slice(p, sd) > task_timeslice(smt_curr)) | ||
| 2838 | resched_task(smt_curr); | ||
| 2839 | else | ||
| 2840 | wakeup_busy_runqueue(smt_rq); | ||
| 2841 | } | 2795 | } | 
| 2796 | unlock: | ||
| 2797 | spin_unlock(&smt_rq->lock); | ||
| 2842 | } | 2798 | } | 
| 2843 | out_unlock: | ||
| 2844 | for_each_cpu_mask(i, sibling_map) | ||
| 2845 | spin_unlock(&cpu_rq(i)->lock); | ||
| 2846 | return ret; | 2799 | return ret; | 
| 2847 | } | 2800 | } | 
| 2848 | #else | 2801 | #else | 
| 2849 | static inline void wake_sleeping_dependent(int this_cpu, runqueue_t *this_rq) | 2802 | static inline void wake_sleeping_dependent(int this_cpu) | 
| 2850 | { | 2803 | { | 
| 2851 | } | 2804 | } | 
| 2852 | 2805 | ||
| 2853 | static inline int dependent_sleeper(int this_cpu, runqueue_t *this_rq) | 2806 | static inline int dependent_sleeper(int this_cpu, runqueue_t *this_rq, | 
| 2807 | task_t *p) | ||
| 2854 | { | 2808 | { | 
| 2855 | return 0; | 2809 | return 0; | 
| 2856 | } | 2810 | } | 
| @@ -2972,32 +2926,13 @@ need_resched_nonpreemptible: | |||
| 2972 | 2926 | ||
| 2973 | cpu = smp_processor_id(); | 2927 | cpu = smp_processor_id(); | 
| 2974 | if (unlikely(!rq->nr_running)) { | 2928 | if (unlikely(!rq->nr_running)) { | 
| 2975 | go_idle: | ||
| 2976 | idle_balance(cpu, rq); | 2929 | idle_balance(cpu, rq); | 
| 2977 | if (!rq->nr_running) { | 2930 | if (!rq->nr_running) { | 
| 2978 | next = rq->idle; | 2931 | next = rq->idle; | 
| 2979 | rq->expired_timestamp = 0; | 2932 | rq->expired_timestamp = 0; | 
| 2980 | wake_sleeping_dependent(cpu, rq); | 2933 | wake_sleeping_dependent(cpu); | 
| 2981 | /* | ||
| 2982 | * wake_sleeping_dependent() might have released | ||
| 2983 | * the runqueue, so break out if we got new | ||
| 2984 | * tasks meanwhile: | ||
| 2985 | */ | ||
| 2986 | if (!rq->nr_running) | ||
| 2987 | goto switch_tasks; | ||
| 2988 | } | ||
| 2989 | } else { | ||
| 2990 | if (dependent_sleeper(cpu, rq)) { | ||
| 2991 | next = rq->idle; | ||
| 2992 | goto switch_tasks; | 2934 | goto switch_tasks; | 
| 2993 | } | 2935 | } | 
| 2994 | /* | ||
| 2995 | * dependent_sleeper() releases and reacquires the runqueue | ||
| 2996 | * lock, hence go into the idle loop if the rq went | ||
| 2997 | * empty meanwhile: | ||
| 2998 | */ | ||
| 2999 | if (unlikely(!rq->nr_running)) | ||
| 3000 | goto go_idle; | ||
| 3001 | } | 2936 | } | 
| 3002 | 2937 | ||
| 3003 | array = rq->active; | 2938 | array = rq->active; | 
| @@ -3035,6 +2970,8 @@ go_idle: | |||
| 3035 | } | 2970 | } | 
| 3036 | } | 2971 | } | 
| 3037 | next->sleep_type = SLEEP_NORMAL; | 2972 | next->sleep_type = SLEEP_NORMAL; | 
| 2973 | if (dependent_sleeper(cpu, rq, next)) | ||
| 2974 | next = rq->idle; | ||
| 3038 | switch_tasks: | 2975 | switch_tasks: | 
| 3039 | if (next == rq->idle) | 2976 | if (next == rq->idle) | 
| 3040 | schedstat_inc(rq, sched_goidle); | 2977 | schedstat_inc(rq, sched_goidle); | 
| @@ -6144,7 +6081,6 @@ void __init sched_init(void) | |||
| 6144 | rq->push_cpu = 0; | 6081 | rq->push_cpu = 0; | 
| 6145 | rq->migration_thread = NULL; | 6082 | rq->migration_thread = NULL; | 
| 6146 | INIT_LIST_HEAD(&rq->migration_queue); | 6083 | INIT_LIST_HEAD(&rq->migration_queue); | 
| 6147 | rq->cpu = i; | ||
| 6148 | #endif | 6084 | #endif | 
| 6149 | atomic_set(&rq->nr_iowait, 0); | 6085 | atomic_set(&rq->nr_iowait, 0); | 
| 6150 | 6086 | ||
| @@ -6205,7 +6141,7 @@ void normalize_rt_tasks(void) | |||
| 6205 | runqueue_t *rq; | 6141 | runqueue_t *rq; | 
| 6206 | 6142 | ||
| 6207 | read_lock_irq(&tasklist_lock); | 6143 | read_lock_irq(&tasklist_lock); | 
| 6208 | for_each_process (p) { | 6144 | for_each_process(p) { | 
| 6209 | if (!rt_task(p)) | 6145 | if (!rt_task(p)) | 
| 6210 | continue; | 6146 | continue; | 
| 6211 | 6147 | ||
