author    Con Kolivas <kernel@kolivas.org>    2007-03-05 03:30:29 -0500
committer Linus Torvalds <torvalds@woody.linux-foundation.org>    2007-03-05 10:57:51 -0500
commit    69f7c0a1be84b10a81b6edcce2dbee0cdec26eba (patch)
tree      a6d4988fda72595ea71ba7e2b4ac11f91fde0159
parent    759b9775c25f5e69aaea8a75c3914019e2dc5539 (diff)
[PATCH] sched: remove SMT nice
Remove the SMT-nice feature which idles sibling cpus on SMT cpus to
facilitate nice working properly where cpu power is shared. The idling of
cpus in the presence of runnable tasks is considered too fragile, easy to
break with outside code, and the complexity of managing this system if an
architecture comes along with many logical cores sharing cpu power will be
unworkable.

Remove the associated per_cpu_gain variable in sched_domains used only by
this code.

Also: the reason is that with dynticks enabled, this code breaks without
yet further tweaks, so dynticks brought on the rapid demise of this code.
So either we tweak this code or kill it off entirely. It was Ingo's
preference to kill it off. Either way this needs to happen for 2.6.21
since dynticks has gone in.

Signed-off-by: Con Kolivas <kernel@kolivas.org>
Acked-by: Ingo Molnar <mingo@elte.hu>
Cc: Nick Piggin <nickpiggin@yahoo.com.au>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
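For readers unfamiliar with the removed heuristic, here is a minimal stand-alone
sketch (plain user-space C, not kernel code; the timeslice value of 100 ticks and
the printf harness are illustrative assumptions) of how per_cpu_gain fed into the
smt_slice() calculation that dependent_sleeper() used when deciding whether to
idle a sibling. The actual removed kernel code appears in the kernel/sched.c hunk
below; the 25% figure comes from the SMT sibling-domain defaults removed from
include/linux/topology.h.

/*
 * Stand-alone sketch of the removed smt_slice() heuristic; not kernel code.
 * per_cpu_gain was the percentage of CPU power assumed to be gained by the
 * extra sibling CPUs in a domain (25 in the SMT sibling defaults removed
 * below); the remaining (100 - per_cpu_gain)% of a task's timeslice was
 * treated as 'lost' to a task running on a sibling.
 */
#include <stdio.h>

static unsigned long smt_slice(unsigned long time_slice,
			       unsigned int per_cpu_gain)
{
	return time_slice * (100 - per_cpu_gain) / 100;
}

int main(void)
{
	unsigned long time_slice = 100;	/* illustrative timeslice, in ticks */
	unsigned int per_cpu_gain = 25;	/* sibling-domain value before this patch */

	/*
	 * dependent_sleeper() compared this 'lost' portion of the sibling's
	 * current timeslice against the full timeslice of an incoming
	 * lower-priority task; if it was larger, the incoming task was held
	 * back by running the idle task instead.
	 */
	printf("lost timeslice: %lu ticks\n",
	       smt_slice(time_slice, per_cpu_gain));
	return 0;
}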
-rw-r--r--  include/asm-i386/topology.h              1
-rw-r--r--  include/asm-ia64/topology.h              2
-rw-r--r--  include/asm-mips/mach-ip27/topology.h    1
-rw-r--r--  include/asm-powerpc/topology.h           1
-rw-r--r--  include/asm-x86_64/topology.h            1
-rw-r--r--  include/linux/sched.h                    1
-rw-r--r--  include/linux/topology.h                 4
-rw-r--r--  kernel/sched.c                           155
8 files changed, 1 insertion(+), 165 deletions(-)
diff --git a/include/asm-i386/topology.h b/include/asm-i386/topology.h
index ac58580ad664..7fc512d90ea8 100644
--- a/include/asm-i386/topology.h
+++ b/include/asm-i386/topology.h
@@ -85,7 +85,6 @@ static inline int node_to_first_cpu(int node)
 	.idle_idx = 1, \
 	.newidle_idx = 2, \
 	.wake_idx = 1, \
-	.per_cpu_gain = 100, \
 	.flags = SD_LOAD_BALANCE \
 		| SD_BALANCE_EXEC \
 		| SD_BALANCE_FORK \
diff --git a/include/asm-ia64/topology.h b/include/asm-ia64/topology.h
index 22ed6749557e..233f1caae048 100644
--- a/include/asm-ia64/topology.h
+++ b/include/asm-ia64/topology.h
@@ -65,7 +65,6 @@ void build_cpu_to_node_map(void);
 	.max_interval = 4, \
 	.busy_factor = 64, \
 	.imbalance_pct = 125, \
-	.per_cpu_gain = 100, \
 	.cache_nice_tries = 2, \
 	.busy_idx = 2, \
 	.idle_idx = 1, \
@@ -97,7 +96,6 @@ void build_cpu_to_node_map(void);
 	.newidle_idx = 0, /* unused */ \
 	.wake_idx = 1, \
 	.forkexec_idx = 1, \
-	.per_cpu_gain = 100, \
 	.flags = SD_LOAD_BALANCE \
 		| SD_BALANCE_EXEC \
 		| SD_BALANCE_FORK \
diff --git a/include/asm-mips/mach-ip27/topology.h b/include/asm-mips/mach-ip27/topology.h
index 44790fdc5d00..61d9be3f3175 100644
--- a/include/asm-mips/mach-ip27/topology.h
+++ b/include/asm-mips/mach-ip27/topology.h
@@ -28,7 +28,6 @@ extern unsigned char __node_distances[MAX_COMPACT_NODES][MAX_COMPACT_NODES];
 	.busy_factor = 32, \
 	.imbalance_pct = 125, \
 	.cache_nice_tries = 1, \
-	.per_cpu_gain = 100, \
 	.flags = SD_LOAD_BALANCE \
 		| SD_BALANCE_EXEC \
 		| SD_WAKE_BALANCE, \
diff --git a/include/asm-powerpc/topology.h b/include/asm-powerpc/topology.h
index 6610495f5f16..0ad21a849b5f 100644
--- a/include/asm-powerpc/topology.h
+++ b/include/asm-powerpc/topology.h
@@ -57,7 +57,6 @@ static inline int pcibus_to_node(struct pci_bus *bus)
 	.busy_factor = 32, \
 	.imbalance_pct = 125, \
 	.cache_nice_tries = 1, \
-	.per_cpu_gain = 100, \
 	.busy_idx = 3, \
 	.idle_idx = 1, \
 	.newidle_idx = 2, \
diff --git a/include/asm-x86_64/topology.h b/include/asm-x86_64/topology.h
index 2facec5914d2..4fd6fb23953e 100644
--- a/include/asm-x86_64/topology.h
+++ b/include/asm-x86_64/topology.h
@@ -43,7 +43,6 @@ extern int __node_distance(int, int);
 	.newidle_idx = 0, \
 	.wake_idx = 1, \
 	.forkexec_idx = 1, \
-	.per_cpu_gain = 100, \
 	.flags = SD_LOAD_BALANCE \
 		| SD_BALANCE_FORK \
 		| SD_BALANCE_EXEC \
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 6f7c9a4d80e5..49fe2997a016 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -684,7 +684,6 @@ struct sched_domain {
 	unsigned int imbalance_pct;	/* No balance until over watermark */
 	unsigned long long cache_hot_time; /* Task considered cache hot (ns) */
 	unsigned int cache_nice_tries;	/* Leave cache hot tasks for # tries */
-	unsigned int per_cpu_gain;	/* CPU % gained by adding domain cpus */
 	unsigned int busy_idx;
 	unsigned int idle_idx;
 	unsigned int newidle_idx;
diff --git a/include/linux/topology.h b/include/linux/topology.h
index 6c5a6e6e813b..a9d1f049cc15 100644
--- a/include/linux/topology.h
+++ b/include/linux/topology.h
@@ -96,7 +96,6 @@
 	.busy_factor = 64, \
 	.imbalance_pct = 110, \
 	.cache_nice_tries = 0, \
-	.per_cpu_gain = 25, \
 	.busy_idx = 0, \
 	.idle_idx = 0, \
 	.newidle_idx = 1, \
@@ -128,7 +127,6 @@
 	.busy_factor = 64, \
 	.imbalance_pct = 125, \
 	.cache_nice_tries = 1, \
-	.per_cpu_gain = 100, \
 	.busy_idx = 2, \
 	.idle_idx = 1, \
 	.newidle_idx = 2, \
@@ -159,7 +157,6 @@
 	.busy_factor = 64, \
 	.imbalance_pct = 125, \
 	.cache_nice_tries = 1, \
-	.per_cpu_gain = 100, \
 	.busy_idx = 2, \
 	.idle_idx = 1, \
 	.newidle_idx = 2, \
@@ -193,7 +190,6 @@
 	.newidle_idx = 0, /* unused */ \
 	.wake_idx = 0, /* unused */ \
 	.forkexec_idx = 0, /* unused */ \
-	.per_cpu_gain = 100, \
 	.flags = SD_LOAD_BALANCE \
 		| SD_SERIALIZE, \
 	.last_balance = jiffies, \
diff --git a/kernel/sched.c b/kernel/sched.c
index 5f102e6c7a4c..a4ca632c477c 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -3006,23 +3006,6 @@ static inline void idle_balance(int cpu, struct rq *rq)
 }
 #endif
 
-static inline void wake_priority_sleeper(struct rq *rq)
-{
-#ifdef CONFIG_SCHED_SMT
-	if (!rq->nr_running)
-		return;
-
-	spin_lock(&rq->lock);
-	/*
-	 * If an SMT sibling task has been put to sleep for priority
-	 * reasons reschedule the idle task to see if it can now run.
-	 */
-	if (rq->nr_running)
-		resched_task(rq->idle);
-	spin_unlock(&rq->lock);
-#endif
-}
-
 DEFINE_PER_CPU(struct kernel_stat, kstat);
 
 EXPORT_PER_CPU_SYMBOL(kstat);
@@ -3239,10 +3222,7 @@ void scheduler_tick(void)
 
 	update_cpu_clock(p, rq, now);
 
-	if (p == rq->idle)
-		/* Task on the idle queue */
-		wake_priority_sleeper(rq);
-	else
+	if (p != rq->idle)
 		task_running_tick(rq, p);
 #ifdef CONFIG_SMP
 	update_load(rq);
@@ -3251,136 +3231,6 @@ void scheduler_tick(void)
 #endif
 }
 
-#ifdef CONFIG_SCHED_SMT
-static inline void wakeup_busy_runqueue(struct rq *rq)
-{
-	/* If an SMT runqueue is sleeping due to priority reasons wake it up */
-	if (rq->curr == rq->idle && rq->nr_running)
-		resched_task(rq->idle);
-}
-
-/*
- * Called with interrupt disabled and this_rq's runqueue locked.
- */
-static void wake_sleeping_dependent(int this_cpu)
-{
-	struct sched_domain *tmp, *sd = NULL;
-	int i;
-
-	for_each_domain(this_cpu, tmp) {
-		if (tmp->flags & SD_SHARE_CPUPOWER) {
-			sd = tmp;
-			break;
-		}
-	}
-
-	if (!sd)
-		return;
-
-	for_each_cpu_mask(i, sd->span) {
-		struct rq *smt_rq = cpu_rq(i);
-
-		if (i == this_cpu)
-			continue;
-		if (unlikely(!spin_trylock(&smt_rq->lock)))
-			continue;
-
-		wakeup_busy_runqueue(smt_rq);
-		spin_unlock(&smt_rq->lock);
-	}
-}
-
-/*
- * number of 'lost' timeslices this task wont be able to fully
- * utilize, if another task runs on a sibling. This models the
- * slowdown effect of other tasks running on siblings:
- */
-static inline unsigned long
-smt_slice(struct task_struct *p, struct sched_domain *sd)
-{
-	return p->time_slice * (100 - sd->per_cpu_gain) / 100;
-}
-
-/*
- * To minimise lock contention and not have to drop this_rq's runlock we only
- * trylock the sibling runqueues and bypass those runqueues if we fail to
- * acquire their lock. As we only trylock the normal locking order does not
- * need to be obeyed.
- */
-static int
-dependent_sleeper(int this_cpu, struct rq *this_rq, struct task_struct *p)
-{
-	struct sched_domain *tmp, *sd = NULL;
-	int ret = 0, i;
-
-	/* kernel/rt threads do not participate in dependent sleeping */
-	if (!p->mm || rt_task(p))
-		return 0;
-
-	for_each_domain(this_cpu, tmp) {
-		if (tmp->flags & SD_SHARE_CPUPOWER) {
-			sd = tmp;
-			break;
-		}
-	}
-
-	if (!sd)
-		return 0;
-
-	for_each_cpu_mask(i, sd->span) {
-		struct task_struct *smt_curr;
-		struct rq *smt_rq;
-
-		if (i == this_cpu)
-			continue;
-
-		smt_rq = cpu_rq(i);
-		if (unlikely(!spin_trylock(&smt_rq->lock)))
-			continue;
-
-		smt_curr = smt_rq->curr;
-
-		if (!smt_curr->mm)
-			goto unlock;
-
-		/*
-		 * If a user task with lower static priority than the
-		 * running task on the SMT sibling is trying to schedule,
-		 * delay it till there is proportionately less timeslice
-		 * left of the sibling task to prevent a lower priority
-		 * task from using an unfair proportion of the
-		 * physical cpu's resources. -ck
-		 */
-		if (rt_task(smt_curr)) {
-			/*
-			 * With real time tasks we run non-rt tasks only
-			 * per_cpu_gain% of the time.
-			 */
-			if ((jiffies % DEF_TIMESLICE) >
-				(sd->per_cpu_gain * DEF_TIMESLICE / 100))
-					ret = 1;
-		} else {
-			if (smt_curr->static_prio < p->static_prio &&
-				!TASK_PREEMPTS_CURR(p, smt_rq) &&
-				smt_slice(smt_curr, sd) > task_timeslice(p))
-					ret = 1;
-		}
-unlock:
-		spin_unlock(&smt_rq->lock);
-	}
-	return ret;
-}
-#else
-static inline void wake_sleeping_dependent(int this_cpu)
-{
-}
-static inline int
-dependent_sleeper(int this_cpu, struct rq *this_rq, struct task_struct *p)
-{
-	return 0;
-}
-#endif
-
 #if defined(CONFIG_PREEMPT) && defined(CONFIG_DEBUG_PREEMPT)
 
 void fastcall add_preempt_count(int val)
@@ -3507,7 +3357,6 @@ need_resched_nonpreemptible:
 		if (!rq->nr_running) {
 			next = rq->idle;
 			rq->expired_timestamp = 0;
-			wake_sleeping_dependent(cpu);
 			goto switch_tasks;
 		}
 	}
@@ -3547,8 +3396,6 @@ need_resched_nonpreemptible:
 		}
 	}
 	next->sleep_type = SLEEP_NORMAL;
-	if (rq->nr_running == 1 && dependent_sleeper(cpu, rq, next))
-		next = rq->idle;
 switch_tasks:
 	if (next == rq->idle)
 		schedstat_inc(rq, sched_goidle);