author		Linus Torvalds <torvalds@woody.linux-foundation.org>	2007-08-24 00:38:39 -0400
committer	Linus Torvalds <torvalds@woody.linux-foundation.org>	2007-08-24 00:38:39 -0400
commit		d0797b39dcd70fe366b114515cb898ac6fecdd99 (patch)
tree		1716f05d10cfe5a52646eda23275a5d773054e81
parent		0542170dec523d50e8bed5515e2f7314e738c8d8 (diff)
parent		505c0efd58031923ae01deac16d896607cafa70e (diff)
Merge git://git.kernel.org/pub/scm/linux/kernel/git/mingo/linux-2.6-sched
* git://git.kernel.org/pub/scm/linux/kernel/git/mingo/linux-2.6-sched:
  sched: tweak the sched_runtime_limit tunable
  sched: skip updating rq's next_balance under null SD
  sched: fix broken SMT/MC optimizations
  sched: accounting regression since rc1
  sched: fix sysctl directory permissions
  sched: sched_clock_idle_[sleep|wakeup]_event()
-rw-r--r--	arch/i386/kernel/tsc.c		  1
-rw-r--r--	drivers/acpi/processor_idle.c	 32
-rw-r--r--	fs/proc/array.c			 44
-rw-r--r--	include/linux/sched.h		  5
-rw-r--r--	kernel/sched.c			 68
-rw-r--r--	kernel/sched_debug.c		  3
6 files changed, 110 insertions(+), 43 deletions(-)
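
The thread running through this merge is the replacement of sched_clock_unstable_event() with a sleep/wakeup event pair, so that an idle driver can tell the scheduler exactly how long the CPU sat in a deep C-state, where the TSC backing sched_clock() may halt. A minimal sketch of the calling convention, with hypothetical userspace stubs standing in for the kernel functions (the real implementations appear in kernel/sched.c below):

#include <stdio.h>
#include <stdint.h>

/* Hypothetical stand-ins for the kernel functions declared in sched.h: */
static void sched_clock_idle_sleep_event(void)
{
	printf("rq clock synced before deep idle\n");
}

static void sched_clock_idle_wakeup_event(uint64_t delta_ns)
{
	printf("rq clock advanced by %llu ns of idle\n",
	       (unsigned long long)delta_ns);
}

int main(void)
{
	/* The idle driver measures sleep time with a clock that keeps
	 * running (the ACPI PM timer) and reports it in nanoseconds: */
	uint64_t slept_ns = 1500000;		/* pretend we idled 1.5 ms */

	sched_clock_idle_sleep_event();		/* before entering C2/C3 */
	/* ... deep C-state here; the TSC may halt ... */
	sched_clock_idle_wakeup_event(slept_ns);
	return 0;
}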
diff --git a/arch/i386/kernel/tsc.c b/arch/i386/kernel/tsc.c
index debd7dbb4158..a39280b4dd3a 100644
--- a/arch/i386/kernel/tsc.c
+++ b/arch/i386/kernel/tsc.c
@@ -292,7 +292,6 @@ static struct clocksource clocksource_tsc = {
 
 void mark_tsc_unstable(char *reason)
 {
-	sched_clock_unstable_event();
 	if (!tsc_unstable) {
 		tsc_unstable = 1;
 		tsc_enabled = 0;
diff --git a/drivers/acpi/processor_idle.c b/drivers/acpi/processor_idle.c
index a8634a0655fc..d9b8af763e1e 100644
--- a/drivers/acpi/processor_idle.c
+++ b/drivers/acpi/processor_idle.c
@@ -63,6 +63,7 @@
 ACPI_MODULE_NAME("processor_idle");
 #define ACPI_PROCESSOR_FILE_POWER	"power"
 #define US_TO_PM_TIMER_TICKS(t)	((t * (PM_TIMER_FREQUENCY/1000)) / 1000)
+#define PM_TIMER_TICK_NS	(1000000000ULL/PM_TIMER_FREQUENCY)
 #define C2_OVERHEAD	4	/* 1us (3.579 ticks per us) */
 #define C3_OVERHEAD	4	/* 1us (3.579 ticks per us) */
 static void (*pm_idle_save) (void) __read_mostly;
@@ -462,6 +463,9 @@ static void acpi_processor_idle(void)
		 * TBD: Can't get time duration while in C1, as resumes
		 *      go to an ISR rather than here.  Need to instrument
		 *      base interrupt handler.
+		 *
+		 * Note: the TSC better not stop in C1, sched_clock() will
+		 *       skew otherwise.
		 */
		sleep_ticks = 0xFFFFFFFF;
		break;
@@ -469,6 +473,8 @@ static void acpi_processor_idle(void)
	case ACPI_STATE_C2:
		/* Get start time (ticks) */
		t1 = inl(acpi_gbl_FADT.xpm_timer_block.address);
+		/* Tell the scheduler that we are going deep-idle: */
+		sched_clock_idle_sleep_event();
		/* Invoke C2 */
		acpi_state_timer_broadcast(pr, cx, 1);
		acpi_cstate_enter(cx);
@@ -479,17 +485,22 @@ static void acpi_processor_idle(void)
		/* TSC halts in C2, so notify users */
		mark_tsc_unstable("possible TSC halt in C2");
 #endif
+		/* Compute time (ticks) that we were actually asleep */
+		sleep_ticks = ticks_elapsed(t1, t2);
+
+		/* Tell the scheduler how much we idled: */
+		sched_clock_idle_wakeup_event(sleep_ticks*PM_TIMER_TICK_NS);
+
		/* Re-enable interrupts */
		local_irq_enable();
+		/* Do not account our idle-switching overhead: */
+		sleep_ticks -= cx->latency_ticks + C2_OVERHEAD;
+
		current_thread_info()->status |= TS_POLLING;
-		/* Compute time (ticks) that we were actually asleep */
-		sleep_ticks =
-		    ticks_elapsed(t1, t2) - cx->latency_ticks - C2_OVERHEAD;
		acpi_state_timer_broadcast(pr, cx, 0);
		break;
 
	case ACPI_STATE_C3:
-
		/*
		 * disable bus master
		 * bm_check implies we need ARB_DIS
@@ -518,6 +529,8 @@ static void acpi_processor_idle(void)
		t1 = inl(acpi_gbl_FADT.xpm_timer_block.address);
		/* Invoke C3 */
		acpi_state_timer_broadcast(pr, cx, 1);
+		/* Tell the scheduler that we are going deep-idle: */
+		sched_clock_idle_sleep_event();
		acpi_cstate_enter(cx);
		/* Get end time (ticks) */
		t2 = inl(acpi_gbl_FADT.xpm_timer_block.address);
@@ -531,12 +544,17 @@ static void acpi_processor_idle(void)
		/* TSC halts in C3, so notify users */
		mark_tsc_unstable("TSC halts in C3");
 #endif
+		/* Compute time (ticks) that we were actually asleep */
+		sleep_ticks = ticks_elapsed(t1, t2);
+		/* Tell the scheduler how much we idled: */
+		sched_clock_idle_wakeup_event(sleep_ticks*PM_TIMER_TICK_NS);
+
		/* Re-enable interrupts */
		local_irq_enable();
+		/* Do not account our idle-switching overhead: */
+		sleep_ticks -= cx->latency_ticks + C3_OVERHEAD;
+
		current_thread_info()->status |= TS_POLLING;
-		/* Compute time (ticks) that we were actually asleep */
-		sleep_ticks =
-		    ticks_elapsed(t1, t2) - cx->latency_ticks - C3_OVERHEAD;
		acpi_state_timer_broadcast(pr, cx, 0);
		break;
 
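
Note the ordering in both the C2 and C3 paths above: sched_clock_idle_wakeup_event() is handed the raw measured idle time, before latency_ticks and the overhead constant are subtracted for the per-C-state usage statistics, because the rq clock has to cover the full wall-clock gap. The ACPI PM timer runs at 3.579545 MHz, so PM_TIMER_TICK_NS comes out to 279 ns; a standalone sanity check of the conversion:

#include <stdio.h>
#include <stdint.h>

#define PM_TIMER_FREQUENCY	3579545ULL	/* Hz, fixed by the ACPI spec */
#define PM_TIMER_TICK_NS	(1000000000ULL / PM_TIMER_FREQUENCY)

int main(void)
{
	uint64_t sleep_ticks = 3580;	/* roughly 1 ms worth of PM-timer ticks */

	/* 279 ns per tick; 3580 ticks is just under 1 ms */
	printf("tick = %llu ns, slept = %llu ns\n",
	       (unsigned long long)PM_TIMER_TICK_NS,
	       (unsigned long long)(sleep_ticks * PM_TIMER_TICK_NS));
	return 0;
}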
diff --git a/fs/proc/array.c b/fs/proc/array.c
index 965625a0977d..ee4814dd98f9 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -320,7 +320,21 @@ int proc_pid_status(struct task_struct *task, char *buffer)
	return buffer - orig;
 }
 
-static clock_t task_utime(struct task_struct *p)
+/*
+ * Use precise platform statistics if available:
+ */
+#ifdef CONFIG_VIRT_CPU_ACCOUNTING
+static cputime_t task_utime(struct task_struct *p)
+{
+	return p->utime;
+}
+
+static cputime_t task_stime(struct task_struct *p)
+{
+	return p->stime;
+}
+#else
+static cputime_t task_utime(struct task_struct *p)
 {
	clock_t utime = cputime_to_clock_t(p->utime),
		total = utime + cputime_to_clock_t(p->stime);
@@ -337,10 +351,10 @@ static clock_t task_utime(struct task_struct *p)
	}
	utime = (clock_t)temp;
 
-	return utime;
+	return clock_t_to_cputime(utime);
 }
 
-static clock_t task_stime(struct task_struct *p)
+static cputime_t task_stime(struct task_struct *p)
 {
	clock_t stime;
 
@@ -349,10 +363,12 @@ static clock_t task_stime(struct task_struct *p)
	 * the total, to make sure the total observed by userspace
	 * grows monotonically - apps rely on that):
	 */
-	stime = nsec_to_clock_t(p->se.sum_exec_runtime) - task_utime(p);
+	stime = nsec_to_clock_t(p->se.sum_exec_runtime) -
+			cputime_to_clock_t(task_utime(p));
 
-	return stime;
+	return clock_t_to_cputime(stime);
 }
+#endif
 
 static int do_task_stat(struct task_struct *task, char *buffer, int whole)
 {
@@ -368,8 +384,7 @@ static int do_task_stat(struct task_struct *task, char *buffer, int whole)
	unsigned long long start_time;
	unsigned long cmin_flt = 0, cmaj_flt = 0;
	unsigned long min_flt = 0, maj_flt = 0;
-	cputime_t cutime, cstime;
-	clock_t utime, stime;
+	cputime_t cutime, cstime, utime, stime;
	unsigned long rsslim = 0;
	char tcomm[sizeof(task->comm)];
	unsigned long flags;
@@ -387,8 +402,7 @@ static int do_task_stat(struct task_struct *task, char *buffer, int whole)
 
	sigemptyset(&sigign);
	sigemptyset(&sigcatch);
-	cutime = cstime = cputime_zero;
-	utime = stime = 0;
+	cutime = cstime = utime = stime = cputime_zero;
 
	rcu_read_lock();
	if (lock_task_sighand(task, &flags)) {
@@ -414,15 +428,15 @@ static int do_task_stat(struct task_struct *task, char *buffer, int whole)
			do {
				min_flt += t->min_flt;
				maj_flt += t->maj_flt;
-				utime += task_utime(t);
-				stime += task_stime(t);
+				utime = cputime_add(utime, task_utime(t));
+				stime = cputime_add(stime, task_stime(t));
				t = next_thread(t);
			} while (t != task);
 
			min_flt += sig->min_flt;
			maj_flt += sig->maj_flt;
-			utime += cputime_to_clock_t(sig->utime);
-			stime += cputime_to_clock_t(sig->stime);
+			utime = cputime_add(utime, sig->utime);
+			stime = cputime_add(stime, sig->stime);
		}
 
		sid = signal_session(sig);
@@ -471,8 +485,8 @@ static int do_task_stat(struct task_struct *task, char *buffer, int whole)
		cmin_flt,
		maj_flt,
		cmaj_flt,
-		utime,
-		stime,
+		cputime_to_clock_t(utime),
+		cputime_to_clock_t(stime),
		cputime_to_clock_t(cutime),
		cputime_to_clock_t(cstime),
		priority,
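
The hunks above elide the middle of the !CONFIG_VIRT_CPU_ACCOUNTING task_utime(); judging from the surrounding context it splits CFS's precise p->se.sum_exec_runtime between user and system time in proportion to the sampled utime/stime ratio, which is why task_stime() can subtract task_utime() and still keep the userspace-visible total monotonic. A standalone sketch of that proportional split, using plain integers in place of clock_t/cputime_t (an assumption about the elided code, not a verbatim copy):

#include <stdio.h>
#include <stdint.h>

/*
 * Sketch of task_utime()'s proportional split: scale the precise CFS
 * runtime by the sampled user/total ratio, so utime + stime always
 * sums to the precise runtime and the total stays monotonic.
 */
static uint64_t split_utime(uint64_t precise_total,
			    uint64_t sampled_utime, uint64_t sampled_stime)
{
	uint64_t total = sampled_utime + sampled_stime;

	if (!total)		/* no samples yet: attribute it all to user */
		return precise_total;
	return precise_total * sampled_utime / total;
}

int main(void)
{
	uint64_t precise = 1000, ut = 300, st = 100;	/* clock_t-like units */
	uint64_t utime = split_utime(precise, ut, st);

	/* 300/400 of 1000 -> utime=750, stime=250 */
	printf("utime=%llu stime=%llu\n",
	       (unsigned long long)utime,
	       (unsigned long long)(precise - utime));
	return 0;
}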
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 682ef87da6eb..ba78807eab91 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -681,7 +681,7 @@ enum cpu_idle_type {
 #define SCHED_LOAD_SHIFT	10
 #define SCHED_LOAD_SCALE	(1L << SCHED_LOAD_SHIFT)
 
-#define SCHED_LOAD_SCALE_FUZZ	(SCHED_LOAD_SCALE >> 1)
+#define SCHED_LOAD_SCALE_FUZZ	SCHED_LOAD_SCALE
 
 #ifdef CONFIG_SMP
 #define SD_LOAD_BALANCE	1	/* Do load balancing on this domain. */
@@ -1388,7 +1388,8 @@ extern void sched_exec(void);
 #define sched_exec()   {}
 #endif
 
-extern void sched_clock_unstable_event(void);
+extern void sched_clock_idle_sleep_event(void);
+extern void sched_clock_idle_wakeup_event(u64 delta_ns);
 
 #ifdef CONFIG_HOTPLUG_CPU
 extern void idle_task_exit(void);
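
Doubling SCHED_LOAD_SCALE_FUZZ here pairs with the imbalance-test change in kernel/sched.c below, where the /2 on busiest_load_per_task is dropped; both belong to the "fix broken SMT/MC optimizations" patch in this merge. The net effect is a test that fires for a wider range of small imbalances; a quick check with illustrative numbers:

#include <stdio.h>

#define SCHED_LOAD_SCALE	1024L

int main(void)
{
	long imbalance = 100, busiest_load_per_task = 1150;

	/* old: fuzz = SCALE/2, tested against half the per-task load */
	int old_fires = imbalance + SCHED_LOAD_SCALE/2 < busiest_load_per_task/2;
	/* new: fuzz = SCALE, tested against the full per-task load */
	int new_fires = imbalance + SCHED_LOAD_SCALE < busiest_load_per_task;

	printf("old=%d new=%d\n", old_fires, new_fires);	/* old=0 new=1 */
	return 0;
}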
diff --git a/kernel/sched.c b/kernel/sched.c
index 45e17b83b7f1..96e9b82246d2 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -262,7 +262,8 @@ struct rq {
	s64 clock_max_delta;
 
	unsigned int clock_warps, clock_overflows;
-	unsigned int clock_unstable_events;
+	u64 idle_clock;
+	unsigned int clock_deep_idle_events;
	u64 tick_timestamp;
 
	atomic_t nr_iowait;
@@ -556,18 +557,40 @@ static inline struct rq *this_rq_lock(void)
 }
 
 /*
- * CPU frequency is/was unstable - start new by setting prev_clock_raw:
+ * We are going deep-idle (irqs are disabled):
  */
-void sched_clock_unstable_event(void)
+void sched_clock_idle_sleep_event(void)
 {
-	unsigned long flags;
-	struct rq *rq;
+	struct rq *rq = cpu_rq(smp_processor_id());
 
-	rq = task_rq_lock(current, &flags);
-	rq->prev_clock_raw = sched_clock();
-	rq->clock_unstable_events++;
-	task_rq_unlock(rq, &flags);
+	spin_lock(&rq->lock);
+	__update_rq_clock(rq);
+	spin_unlock(&rq->lock);
+	rq->clock_deep_idle_events++;
 }
+EXPORT_SYMBOL_GPL(sched_clock_idle_sleep_event);
+
+/*
+ * We just idled delta nanoseconds (called with irqs disabled):
+ */
+void sched_clock_idle_wakeup_event(u64 delta_ns)
+{
+	struct rq *rq = cpu_rq(smp_processor_id());
+	u64 now = sched_clock();
+
+	rq->idle_clock += delta_ns;
+	/*
+	 * Override the previous timestamp and ignore all
+	 * sched_clock() deltas that occured while we idled,
+	 * and use the PM-provided delta_ns to advance the
+	 * rq clock:
+	 */
+	spin_lock(&rq->lock);
+	rq->prev_clock_raw = now;
+	rq->clock += delta_ns;
+	spin_unlock(&rq->lock);
+}
+EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event);
 
 /*
  * resched_task - mark a task 'to be rescheduled now'.
@@ -2494,7 +2517,7 @@ group_next:
	 * a think about bumping its value to force at least one task to be
	 * moved
	 */
-	if (*imbalance + SCHED_LOAD_SCALE_FUZZ < busiest_load_per_task/2) {
+	if (*imbalance + SCHED_LOAD_SCALE_FUZZ < busiest_load_per_task) {
		unsigned long tmp, pwr_now, pwr_move;
		unsigned int imbn;
 
@@ -3020,6 +3043,7 @@ static inline void rebalance_domains(int cpu, enum cpu_idle_type idle)
	struct sched_domain *sd;
	/* Earliest time when we have to do rebalance again */
	unsigned long next_balance = jiffies + 60*HZ;
+	int update_next_balance = 0;
 
	for_each_domain(cpu, sd) {
		if (!(sd->flags & SD_LOAD_BALANCE))
@@ -3056,8 +3080,10 @@ static inline void rebalance_domains(int cpu, enum cpu_idle_type idle)
		if (sd->flags & SD_SERIALIZE)
			spin_unlock(&balancing);
 out:
-		if (time_after(next_balance, sd->last_balance + interval))
+		if (time_after(next_balance, sd->last_balance + interval)) {
			next_balance = sd->last_balance + interval;
+			update_next_balance = 1;
+		}
 
		/*
		 * Stop the load balance at this level. There is another
@@ -3067,7 +3093,14 @@ out:
		if (!balance)
			break;
	}
-	rq->next_balance = next_balance;
+
+	/*
+	 * next_balance will be updated only when there is a need.
+	 * When the cpu is attached to null domain for ex, it will not be
+	 * updated.
+	 */
+	if (likely(update_next_balance))
+		rq->next_balance = next_balance;
 }
 
 /*
@@ -4890,7 +4923,7 @@ static inline void sched_init_granularity(void)
	if (sysctl_sched_granularity > gran_limit)
		sysctl_sched_granularity = gran_limit;
 
-	sysctl_sched_runtime_limit = sysctl_sched_granularity * 4;
+	sysctl_sched_runtime_limit = sysctl_sched_granularity * 8;
	sysctl_sched_wakeup_granularity = sysctl_sched_granularity / 2;
 }
 
@@ -5234,15 +5267,16 @@ static void migrate_dead_tasks(unsigned int dead_cpu)
 static struct ctl_table sd_ctl_dir[] = {
	{
		.procname	= "sched_domain",
-		.mode		= 0755,
+		.mode		= 0555,
	},
	{0,},
 };
 
 static struct ctl_table sd_ctl_root[] = {
	{
+		.ctl_name	= CTL_KERN,
		.procname	= "kernel",
-		.mode		= 0755,
+		.mode		= 0555,
		.child		= sd_ctl_dir,
	},
	{0,},
@@ -5318,7 +5352,7 @@ static ctl_table *sd_alloc_ctl_cpu_table(int cpu)
	for_each_domain(cpu, sd) {
		snprintf(buf, 32, "domain%d", i);
		entry->procname = kstrdup(buf, GFP_KERNEL);
-		entry->mode = 0755;
+		entry->mode = 0555;
		entry->child = sd_alloc_ctl_domain_table(sd);
		entry++;
		i++;
@@ -5338,7 +5372,7 @@ static void init_sched_domain_sysctl(void)
	for (i = 0; i < cpu_num; i++, entry++) {
		snprintf(buf, 32, "cpu%d", i);
		entry->procname = kstrdup(buf, GFP_KERNEL);
-		entry->mode = 0755;
+		entry->mode = 0555;
		entry->child = sd_alloc_ctl_cpu_table(i);
	}
	sd_sysctl_header = register_sysctl_table(sd_ctl_root);
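
The clock bookkeeping in sched_clock_idle_wakeup_event() is easiest to see in isolation: overriding prev_clock_raw throws away whatever sched_clock() deltas accumulated while the CPU idled, and the externally measured delta advances rq->clock instead. A minimal userspace model of that update (fake_rq and all values are illustrative, not kernel code):

#include <stdio.h>
#include <stdint.h>

/*
 * Minimal model of sched_clock_idle_wakeup_event(): raw sched_clock()
 * deltas from the idle period are discarded, and the PM-measured
 * delta_ns advances the per-cpu clock instead.
 */
struct fake_rq {
	uint64_t clock;			/* monotonic per-cpu scheduler clock */
	uint64_t prev_clock_raw;	/* last sched_clock() sample consumed */
	uint64_t idle_clock;		/* total ns spent deep-idle */
};

static void idle_wakeup_event(struct fake_rq *rq, uint64_t now, uint64_t delta_ns)
{
	rq->idle_clock += delta_ns;
	rq->prev_clock_raw = now;	/* ignore raw deltas accrued while idle */
	rq->clock += delta_ns;		/* advance by the measured idle time */
}

int main(void)
{
	struct fake_rq rq = { .clock = 5000, .prev_clock_raw = 5000 };

	/* The TSC halted during idle, so sched_clock() only crept to 5100,
	 * but the PM timer says we really slept 1000000 ns: */
	idle_wakeup_event(&rq, 5100, 1000000);
	printf("clock=%llu idle_clock=%llu\n",
	       (unsigned long long)rq.clock,
	       (unsigned long long)rq.idle_clock);
	return 0;
}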
diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c
index 87e524762b85..ab18f45f2ab2 100644
--- a/kernel/sched_debug.c
+++ b/kernel/sched_debug.c
@@ -154,10 +154,11 @@ static void print_cpu(struct seq_file *m, int cpu)
	P(next_balance);
	P(curr->pid);
	P(clock);
+	P(idle_clock);
	P(prev_clock_raw);
	P(clock_warps);
	P(clock_overflows);
-	P(clock_unstable_events);
+	P(clock_deep_idle_events);
	P(clock_max_delta);
	P(cpu_load[0]);
	P(cpu_load[1]);