From cf82ff7ea7695b0e82ba07bc5e9f1bd03a74e1aa Mon Sep 17 00:00:00 2001 From: "Jayson R. King" Date: Mon, 5 Oct 2009 05:21:26 -0500 Subject: sched: Remove obsolete comment in sched_init() Remove the comment about calling alloc_bootmem() as it is not called here since commit 36b7b6d465489c4754c4fd66fcec6086eba87896. Signed-off-by: Jayson R. King Cc: Peter Zijlstra Cc: Jiri Kosina LKML-Reference: <4AC9C8A6.6010209@jaysonking.com> Signed-off-by: Ingo Molnar --- kernel/sched.c | 4 ---- 1 file changed, 4 deletions(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index 830967e18285..a56446d7fda2 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -9322,10 +9322,6 @@ void __init sched_init(void) #ifdef CONFIG_CPUMASK_OFFSTACK alloc_size += num_possible_cpus() * cpumask_size(); #endif - /* - * As sched_init() is called before page_alloc is setup, - * we use alloc_bootmem(). - */ if (alloc_size) { ptr = (unsigned long)kzalloc(alloc_size, GFP_NOWAIT); -- cgit v1.2.2 From ce0e7b28fb75cb003cfc8d0238613aaf1c55e797 Mon Sep 17 00:00:00 2001 From: Ryota Ozaki Date: Sat, 24 Oct 2009 01:20:10 +0900 Subject: sched, cpuacct: Fix niced guest time accounting CPU time of a guest is always accounted in 'user' time without concern for the nice value of its counterpart process although the guest is scheduled under the nice value. This patch fixes the defect and accounts cpu time of a niced guest in 'nice' time as same as a niced process. And also the patch adds 'guest_nice' to cpuacct. The value provides niced guest cpu time which is like 'nice' to 'user'. 
The original discussions can be found here: http://www.mail-archive.com/kvm@vger.kernel.org/msg23982.html http://www.mail-archive.com/kvm@vger.kernel.org/msg23860.html Signed-off-by: Ryota Ozaki Acked-by: Avi Kivity Cc: Peter Zijlstra LKML-Reference: <1256314810-7897-1-git-send-email-ozaki.ryota@gmail.com> Signed-off-by: Ingo Molnar --- kernel/sched.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index e5205811c19e..67be4d0dddaa 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -5017,8 +5017,13 @@ static void account_guest_time(struct task_struct *p, cputime_t cputime, p->gtime = cputime_add(p->gtime, cputime); /* Add guest time to cpustat. */ - cpustat->user = cputime64_add(cpustat->user, tmp); - cpustat->guest = cputime64_add(cpustat->guest, tmp); + if (TASK_NICE(p) > 0) { + cpustat->nice = cputime64_add(cpustat->nice, tmp); + cpustat->guest_nice = cputime64_add(cpustat->guest_nice, tmp); + } else { + cpustat->user = cputime64_add(cpustat->user, tmp); + cpustat->guest = cputime64_add(cpustat->guest, tmp); + } } /* -- cgit v1.2.2 From 9824a2b728b63e7ff586b9fd9293c819be79f0f3 Mon Sep 17 00:00:00 2001 From: Hiroshi Shimamoto Date: Wed, 4 Nov 2009 16:16:54 +0900 Subject: sched: Remove unused cpu_nr_migrations() cpu_nr_migrations() is not used, remove it. 
Signed-off-by: Hiroshi Shimamoto Cc: Peter Zijlstra LKML-Reference: <4AF12A66.6020609@ct.jp.nec.com> Signed-off-by: Ingo Molnar --- kernel/sched.c | 11 ----------- 1 file changed, 11 deletions(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index 67be4d0dddaa..30fd0ba5f603 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -541,7 +541,6 @@ struct rq { struct load_weight load; unsigned long nr_load_updates; u64 nr_switches; - u64 nr_migrations_in; struct cfs_rq cfs; struct rt_rq rt; @@ -2049,7 +2048,6 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu) #endif if (old_cpu != new_cpu) { p->se.nr_migrations++; - new_rq->nr_migrations_in++; #ifdef CONFIG_SCHEDSTATS if (task_hot(p, old_rq->clock, NULL)) schedstat_inc(p, se.nr_forced2_migrations); @@ -2988,15 +2986,6 @@ static void calc_load_account_active(struct rq *this_rq) } } -/* - * Externally visible per-cpu scheduler statistics: - * cpu_nr_migrations(cpu) - number of migrations into that cpu - */ -u64 cpu_nr_migrations(int cpu) -{ - return cpu_rq(cpu)->nr_migrations_in; -} - /* * Update rq->cpu_load[] statistics. This function is usually called every * scheduler tick (TICK_NSEC). -- cgit v1.2.2 From acc3f5d7cabbfd6cec71f0c1f9900621fa2d6ae7 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Tue, 3 Nov 2009 14:53:40 +1030 Subject: cpumask: Partition_sched_domains takes array of cpumask_var_t Currently partition_sched_domains() takes a 'struct cpumask *doms_new' which is a kmalloc'ed array of cpumask_t. You can't have such an array if 'struct cpumask' is undefined, as we plan for CONFIG_CPUMASK_OFFSTACK=y. So, we make this an array of cpumask_var_t instead: this is the same for the CONFIG_CPUMASK_OFFSTACK=n case, but requires multiple allocations for the CONFIG_CPUMASK_OFFSTACK=y case. Hence we add alloc_sched_domains() and free_sched_domains() functions. 
Signed-off-by: Rusty Russell Cc: Peter Zijlstra LKML-Reference: <200911031453.40668.rusty@rustcorp.com.au> Signed-off-by: Ingo Molnar --- kernel/sched.c | 68 +++++++++++++++++++++++++++++++++++++++------------------- 1 file changed, 46 insertions(+), 22 deletions(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index 30fd0ba5f603..ae026aad145b 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -8846,7 +8846,7 @@ static int build_sched_domains(const struct cpumask *cpu_map) return __build_sched_domains(cpu_map, NULL); } -static struct cpumask *doms_cur; /* current sched domains */ +static cpumask_var_t *doms_cur; /* current sched domains */ static int ndoms_cur; /* number of sched domains in 'doms_cur' */ static struct sched_domain_attr *dattr_cur; /* attribues of custom domains in 'doms_cur' */ @@ -8868,6 +8868,31 @@ int __attribute__((weak)) arch_update_cpu_topology(void) return 0; } +cpumask_var_t *alloc_sched_domains(unsigned int ndoms) +{ + int i; + cpumask_var_t *doms; + + doms = kmalloc(sizeof(*doms) * ndoms, GFP_KERNEL); + if (!doms) + return NULL; + for (i = 0; i < ndoms; i++) { + if (!alloc_cpumask_var(&doms[i], GFP_KERNEL)) { + free_sched_domains(doms, i); + return NULL; + } + } + return doms; +} + +void free_sched_domains(cpumask_var_t doms[], unsigned int ndoms) +{ + unsigned int i; + for (i = 0; i < ndoms; i++) + free_cpumask_var(doms[i]); + kfree(doms); +} + /* * Set up scheduler domains and groups. Callers must hold the hotplug lock. 
* For now this just excludes isolated cpus, but could be used to @@ -8879,12 +8904,12 @@ static int arch_init_sched_domains(const struct cpumask *cpu_map) arch_update_cpu_topology(); ndoms_cur = 1; - doms_cur = kmalloc(cpumask_size(), GFP_KERNEL); + doms_cur = alloc_sched_domains(ndoms_cur); if (!doms_cur) - doms_cur = fallback_doms; - cpumask_andnot(doms_cur, cpu_map, cpu_isolated_map); + doms_cur = &fallback_doms; + cpumask_andnot(doms_cur[0], cpu_map, cpu_isolated_map); dattr_cur = NULL; - err = build_sched_domains(doms_cur); + err = build_sched_domains(doms_cur[0]); register_sched_domain_sysctl(); return err; @@ -8934,19 +8959,19 @@ static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur, * doms_new[] to the current sched domain partitioning, doms_cur[]. * It destroys each deleted domain and builds each new domain. * - * 'doms_new' is an array of cpumask's of length 'ndoms_new'. + * 'doms_new' is an array of cpumask_var_t's of length 'ndoms_new'. * The masks don't intersect (don't overlap.) We should setup one * sched domain for each mask. CPUs not in any of the cpumasks will * not be load balanced. If the same cpumask appears both in the * current 'doms_cur' domains and in the new 'doms_new', we can leave * it as it is. * - * The passed in 'doms_new' should be kmalloc'd. This routine takes - * ownership of it and will kfree it when done with it. If the caller - * failed the kmalloc call, then it can pass in doms_new == NULL && - * ndoms_new == 1, and partition_sched_domains() will fallback to - * the single partition 'fallback_doms', it also forces the domains - * to be rebuilt. + * The passed in 'doms_new' should be allocated using + * alloc_sched_domains. This routine takes ownership of it and will + * free_sched_domains it when done with it. 
If the caller failed the + * alloc call, then it can pass in doms_new == NULL && ndoms_new == 1, + * and partition_sched_domains() will fallback to the single partition + * 'fallback_doms', it also forces the domains to be rebuilt. * * If doms_new == NULL it will be replaced with cpu_online_mask. * ndoms_new == 0 is a special case for destroying existing domains, @@ -8954,8 +8979,7 @@ static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur, * * Call with hotplug lock held */ -/* FIXME: Change to struct cpumask *doms_new[] */ -void partition_sched_domains(int ndoms_new, struct cpumask *doms_new, +void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[], struct sched_domain_attr *dattr_new) { int i, j, n; @@ -8974,40 +8998,40 @@ void partition_sched_domains(int ndoms_new, struct cpumask *doms_new, /* Destroy deleted domains */ for (i = 0; i < ndoms_cur; i++) { for (j = 0; j < n && !new_topology; j++) { - if (cpumask_equal(&doms_cur[i], &doms_new[j]) + if (cpumask_equal(doms_cur[i], doms_new[j]) && dattrs_equal(dattr_cur, i, dattr_new, j)) goto match1; } /* no match - a current sched domain not in new doms_new[] */ - detach_destroy_domains(doms_cur + i); + detach_destroy_domains(doms_cur[i]); match1: ; } if (doms_new == NULL) { ndoms_cur = 0; - doms_new = fallback_doms; - cpumask_andnot(&doms_new[0], cpu_online_mask, cpu_isolated_map); + doms_new = &fallback_doms; + cpumask_andnot(doms_new[0], cpu_online_mask, cpu_isolated_map); WARN_ON_ONCE(dattr_new); } /* Build new domains */ for (i = 0; i < ndoms_new; i++) { for (j = 0; j < ndoms_cur && !new_topology; j++) { - if (cpumask_equal(&doms_new[i], &doms_cur[j]) + if (cpumask_equal(doms_new[i], doms_cur[j]) && dattrs_equal(dattr_new, i, dattr_cur, j)) goto match2; } /* no match - add a new doms_new */ - __build_sched_domains(doms_new + i, + __build_sched_domains(doms_new[i], dattr_new ? 
dattr_new + i : NULL); match2: ; } /* Remember the new sched domains */ - if (doms_cur != fallback_doms) - kfree(doms_cur); + if (doms_cur != &fallback_doms) + free_sched_domains(doms_cur, ndoms_cur); kfree(dattr_cur); /* kfree(NULL) is safe */ doms_cur = doms_new; dattr_cur = dattr_new; -- cgit v1.2.2 From 1b9508f6831e10d53256825de8904caa22d1ca2c Mon Sep 17 00:00:00 2001 From: Mike Galbraith Date: Wed, 4 Nov 2009 17:53:50 +0100 Subject: sched: Rate-limit newidle Rate limit newidle to migration_cost. It's a win for all stages of sysbench oltp tests. Signed-off-by: Mike Galbraith Cc: Peter Zijlstra LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/sched.c | 22 +++++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index ae026aad145b..f8492123b5d1 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -589,6 +589,8 @@ struct rq { u64 rt_avg; u64 age_stamp; + u64 idle_stamp; + u64 avg_idle; #endif /* calc_load related fields */ @@ -2353,6 +2355,17 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, if (rq != orig_rq) update_rq_clock(rq); + if (rq->idle_stamp) { + u64 delta = rq->clock - rq->idle_stamp; + u64 max = 2*sysctl_sched_migration_cost; + + if (delta > max) + rq->avg_idle = max; + else + update_avg(&rq->avg_idle, delta); + rq->idle_stamp = 0; + } + WARN_ON(p->state != TASK_WAKING); cpu = task_cpu(p); @@ -4389,6 +4402,11 @@ static void idle_balance(int this_cpu, struct rq *this_rq) int pulled_task = 0; unsigned long next_balance = jiffies + HZ; + this_rq->idle_stamp = this_rq->clock; + + if (this_rq->avg_idle < sysctl_sched_migration_cost) + return; + for_each_domain(this_cpu, sd) { unsigned long interval; @@ -4403,8 +4421,10 @@ static void idle_balance(int this_cpu, struct rq *this_rq) interval = msecs_to_jiffies(sd->balance_interval); if (time_after(next_balance, sd->last_balance + interval)) next_balance = sd->last_balance + interval; - if 
(pulled_task) + if (pulled_task) { + this_rq->idle_stamp = 0; break; + } } if (pulled_task || time_after(jiffies, this_rq->next_balance)) { /* -- cgit v1.2.2 From d8c80ce091f6ead6710bc71b58f2c32e5bf855e4 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Tue, 27 Oct 2009 15:45:23 +0800 Subject: sched, no_hz: Remove unused rq->last_tick_seen field In 15934a37324f32e0fda633dc7984a671ea81cd75, field last_tick_seen is added to struct rq. But it is unused now. Signed-off-by: Lai Jiangshan Cc: Guillaume Chazarain LKML-Reference: <4AE6A513.6010100@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- kernel/sched.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index f8492123b5d1..23e353568d8e 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -534,7 +534,6 @@ struct rq { #define CPU_LOAD_IDX_MAX 5 unsigned long cpu_load[CPU_LOAD_IDX_MAX]; #ifdef CONFIG_NO_HZ - unsigned long last_tick_seen; unsigned char in_nohz_recently; #endif /* capture load from *all* tasks on this cpu: */ -- cgit v1.2.2 From eae0c9dfb534cb3449888b9601228efa6480fdb5 Mon Sep 17 00:00:00 2001 From: Mike Galbraith Date: Tue, 10 Nov 2009 03:50:02 +0100 Subject: sched: Fix and clean up rate-limit newidle code Commit 1b9508f, "Rate-limit newidle" has been confirmed to fix the netperf UDP loopback regression reported by Alex Shi. This is a cleanup and a fix: - moved to a more out of the way spot - fix to ensure that balancing doesn't try to balance runqueues which haven't gone online yet, which can mess up CPU enumeration during boot. 
Reported-by: Alex Shi Reported-by: Zhang, Yanmin Signed-off-by: Mike Galbraith Acked-by: Peter Zijlstra Cc: # .32.x: a1f84a3: sched: Check for an idle shared cache Cc: # .32.x: 1b9508f: sched: Rate-limit newidle Cc: # .32.x: fd21073: sched: Fix affinity logic Cc: # .32.x LKML-Reference: <1257821402.5648.17.camel@marge.simson.net> Signed-off-by: Ingo Molnar --- kernel/sched.c | 28 +++++++++++++++------------- 1 file changed, 15 insertions(+), 13 deletions(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index 23e353568d8e..ad37776cc39b 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2354,17 +2354,6 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, if (rq != orig_rq) update_rq_clock(rq); - if (rq->idle_stamp) { - u64 delta = rq->clock - rq->idle_stamp; - u64 max = 2*sysctl_sched_migration_cost; - - if (delta > max) - rq->avg_idle = max; - else - update_avg(&rq->avg_idle, delta); - rq->idle_stamp = 0; - } - WARN_ON(p->state != TASK_WAKING); cpu = task_cpu(p); @@ -2421,6 +2410,17 @@ out_running: #ifdef CONFIG_SMP if (p->sched_class->task_wake_up) p->sched_class->task_wake_up(rq, p); + + if (unlikely(rq->idle_stamp)) { + u64 delta = rq->clock - rq->idle_stamp; + u64 max = 2*sysctl_sched_migration_cost; + + if (delta > max) + rq->avg_idle = max; + else + update_avg(&rq->avg_idle, delta); + rq->idle_stamp = 0; + } #endif out: task_rq_unlock(rq, &flags); @@ -4098,7 +4098,7 @@ static int load_balance(int this_cpu, struct rq *this_rq, unsigned long flags; struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask); - cpumask_setall(cpus); + cpumask_copy(cpus, cpu_online_mask); /* * When power savings policy is enabled for the parent domain, idle @@ -4261,7 +4261,7 @@ load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd) int all_pinned = 0; struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask); - cpumask_setall(cpus); + cpumask_copy(cpus, cpu_online_mask); /* * When power savings policy is 
enabled for the parent domain, idle @@ -9522,6 +9522,8 @@ void __init sched_init(void) rq->cpu = i; rq->online = 0; rq->migration_thread = NULL; + rq->idle_stamp = 0; + rq->avg_idle = 2*sysctl_sched_migration_cost; INIT_LIST_HEAD(&rq->migration_queue); rq_attach_root(rq, &def_root_domain); #endif -- cgit v1.2.2 From ffd44db5f02af32bcc25a8eb5981bf02a141cdab Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 10 Nov 2009 20:12:01 +0100 Subject: sched: Make sure task has correct sched_class after policy change From the code in rt_mutex_setprio(), it is evident that the intention is that task's with a RT 'prio' value as a consequence of receiving a PI boost also have their 'sched_class' field set to '&rt_sched_class'. However, Peter noticed that the code in __setscheduler() could result in this intention being frustrated. Fix it. Reported-by: Peter Williams Signed-off-by: Peter Zijlstra Cc: Mike Galbraith LKML-Reference: <1257880321.4108.457.camel@laptop> Signed-off-by: Ingo Molnar --- kernel/sched.c | 16 ++++------------ 1 file changed, 4 insertions(+), 12 deletions(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index ad37776cc39b..43e61fa04dc7 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -6159,22 +6159,14 @@ __setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio) BUG_ON(p->se.on_rq); p->policy = policy; - switch (p->policy) { - case SCHED_NORMAL: - case SCHED_BATCH: - case SCHED_IDLE: - p->sched_class = &fair_sched_class; - break; - case SCHED_FIFO: - case SCHED_RR: - p->sched_class = &rt_sched_class; - break; - } - p->rt_priority = prio; p->normal_prio = normal_prio(p); /* we are holding p->pi_lock already */ p->prio = rt_mutex_getprio(p); + if (rt_prio(p->prio)) + p->sched_class = &rt_sched_class; + else + p->sched_class = &fair_sched_class; set_load_weight(p); } -- cgit v1.2.2 From 956539b75921f561c0956c22d37320780e8b4ba1 Mon Sep 17 00:00:00 2001 From: "Paul E. 
McKenney" Date: Tue, 10 Nov 2009 13:37:20 -0800 Subject: rcu: Enable synchronize_sched_expedited() fastpath This patch adds a counter increment to enable tasks to actually take the synchronize_sched_expedited() function's fastpath. Signed-off-by: Paul E. McKenney Cc: laijs@cn.fujitsu.com Cc: dipankar@in.ibm.com Cc: mathieu.desnoyers@polymtl.ca Cc: josh@joshtriplett.org Cc: dvhltc@us.ibm.com Cc: niv@us.ibm.com Cc: peterz@infradead.org Cc: rostedt@goodmis.org Cc: Valdis.Kletnieks@vt.edu Cc: dhowells@redhat.com LKML-Reference: <1257889042435-git-send-email-> Signed-off-by: Ingo Molnar --- kernel/sched.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index 76c0e9691fc0..e69fee4544bd 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -10865,6 +10865,7 @@ void synchronize_sched_expedited(void) spin_unlock_irqrestore(&rq->lock, flags); } rcu_expedited_state = RCU_EXPEDITED_STATE_IDLE; + synchronize_sched_expedited_count++; mutex_unlock(&rcu_sched_expedited_mutex); put_online_cpus(); if (need_full_sync) -- cgit v1.2.2 From 56992309ccbe71f4321ddd50ee2f76f91b412c1a Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Thu, 5 Nov 2009 15:38:40 -0800 Subject: sysctl kernel: Remove binary sysctl logic Now that sys_sysctl is a generic wrapper around /proc/sys .ctl_name and .strategy members of sysctl tables are dead code. Remove them. Cc: Ingo Molnar Cc: Peter Zijlstra Cc: David Howells Signed-off-by: Eric W. 
Biederman --- kernel/sched.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index a455dca884a6..dbb99d787a41 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -7373,17 +7373,16 @@ static struct ctl_table sd_ctl_dir[] = { .procname = "sched_domain", .mode = 0555, }, - {0, }, + {} }; static struct ctl_table sd_ctl_root[] = { { - .ctl_name = CTL_KERN, .procname = "kernel", .mode = 0555, .child = sd_ctl_dir, }, - {0, }, + {} }; static struct ctl_table *sd_alloc_ctl_entry(int n) -- cgit v1.2.2 From 055a00865dcfc8e61f3cbefbb879c9577bd36ae5 Mon Sep 17 00:00:00 2001 From: Mike Galbraith Date: Thu, 12 Nov 2009 11:07:44 +0100 Subject: sched: Fix/add missing update_rq_clock() calls kthread_bind(), migrate_task() and sched_fork were missing updates, and try_to_wake_up() was updating after having already used the stale clock. Aside from preventing potential latency hits, there's a side benefit in that early boot printk time stamps become monotonic. Signed-off-by: Mike Galbraith Acked-by: Peter Zijlstra LKML-Reference: <1258020464.6491.2.camel@marge.simson.net> Signed-off-by: Ingo Molnar LKML-Reference: --- kernel/sched.c | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index 3c11ae0a948d..701eca4958a2 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2017,6 +2017,7 @@ void kthread_bind(struct task_struct *p, unsigned int cpu) } spin_lock_irqsave(&rq->lock, flags); + update_rq_clock(rq); set_task_cpu(p, cpu); p->cpus_allowed = cpumask_of_cpu(cpu); p->rt.nr_cpus_allowed = 1; @@ -2115,6 +2116,7 @@ migrate_task(struct task_struct *p, int dest_cpu, struct migration_req *req) * it is sufficient to simply update the task's cpu field. 
*/ if (!p->se.on_rq && !task_running(rq, p)) { + update_rq_clock(rq); set_task_cpu(p, dest_cpu); return 0; } @@ -2376,14 +2378,15 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, task_rq_unlock(rq, &flags); cpu = p->sched_class->select_task_rq(p, SD_BALANCE_WAKE, wake_flags); - if (cpu != orig_cpu) + if (cpu != orig_cpu) { + local_irq_save(flags); + rq = cpu_rq(cpu); + update_rq_clock(rq); set_task_cpu(p, cpu); - + local_irq_restore(flags); + } rq = task_rq_lock(p, &flags); - if (rq != orig_rq) - update_rq_clock(rq); - WARN_ON(p->state != TASK_WAKING); cpu = task_cpu(p); @@ -2545,6 +2548,7 @@ static void __sched_fork(struct task_struct *p) void sched_fork(struct task_struct *p, int clone_flags) { int cpu = get_cpu(); + unsigned long flags; __sched_fork(p); @@ -2581,7 +2585,10 @@ void sched_fork(struct task_struct *p, int clone_flags) #ifdef CONFIG_SMP cpu = p->sched_class->select_task_rq(p, SD_BALANCE_FORK, 0); #endif + local_irq_save(flags); + update_rq_clock(cpu_rq(cpu)); set_task_cpu(p, cpu); + local_irq_restore(flags); #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) if (likely(sched_info_on())) -- cgit v1.2.2 From 761b1d26df542fd5eb348837351e4d2f3bc7bffe Mon Sep 17 00:00:00 2001 From: Hidetoshi Seto Date: Thu, 12 Nov 2009 13:33:45 +0900 Subject: sched: Fix granularity of task_u/stime() Originally task_s/utime() were designed to return clock_t but later changed to return cputime_t by following commit: commit efe567fc8281661524ffa75477a7c4ca9b466c63 Author: Christian Borntraeger Date: Thu Aug 23 15:18:02 2007 +0200 It only changed the type of return value, but not the implementation. As the result the granularity of task_s/utime() is still that of clock_t, not that of cputime_t. So using task_s/utime() in __exit_signal() makes values accumulated to the signal struct to be rounded and coarse grained. This patch removes casts to clock_t in task_u/stime(), to keep granularity of cputime_t over the calculation. 
v2: Use div_u64() to avoid error "undefined reference to `__udivdi3`" on some 32bit systems. Signed-off-by: Hidetoshi Seto Acked-by: Peter Zijlstra Cc: xiyou.wangcong@gmail.com Cc: Spencer Candland Cc: Oleg Nesterov Cc: Stanislaw Gruszka LKML-Reference: <4AFB9029.9000208@jp.fujitsu.com> Signed-off-by: Ingo Molnar --- kernel/sched.c | 22 +++++++++++++--------- 1 file changed, 13 insertions(+), 9 deletions(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index 43e61fa04dc7..ab9a034c4a17 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -5156,41 +5156,45 @@ cputime_t task_stime(struct task_struct *p) return p->stime; } #else + +#ifndef nsecs_to_cputime +# define nsecs_to_cputime(__nsecs) \ + msecs_to_cputime(div_u64((__nsecs), NSEC_PER_MSEC)) +#endif + cputime_t task_utime(struct task_struct *p) { - clock_t utime = cputime_to_clock_t(p->utime), - total = utime + cputime_to_clock_t(p->stime); + cputime_t utime = p->utime, total = utime + p->stime; u64 temp; /* * Use CFS's precise accounting: */ - temp = (u64)nsec_to_clock_t(p->se.sum_exec_runtime); + temp = (u64)nsecs_to_cputime(p->se.sum_exec_runtime); if (total) { temp *= utime; do_div(temp, total); } - utime = (clock_t)temp; + utime = (cputime_t)temp; - p->prev_utime = max(p->prev_utime, clock_t_to_cputime(utime)); + p->prev_utime = max(p->prev_utime, utime); return p->prev_utime; } cputime_t task_stime(struct task_struct *p) { - clock_t stime; + cputime_t stime; /* * Use CFS's precise accounting. 
(we subtract utime from * the total, to make sure the total observed by userspace * grows monotonically - apps rely on that): */ - stime = nsec_to_clock_t(p->se.sum_exec_runtime) - - cputime_to_clock_t(task_utime(p)); + stime = nsecs_to_cputime(p->se.sum_exec_runtime) - task_utime(p); if (stime >= 0) - p->prev_stime = max(p->prev_stime, clock_t_to_cputime(stime)); + p->prev_stime = max(p->prev_stime, stime); return p->prev_stime; } -- cgit v1.2.2 From 498657a478c60be092208422fefa9c7b248729c2 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 13 Nov 2009 18:33:53 +0900 Subject: sched, kvm: Fix race condition involving sched_in_preempt_notifers In finish_task_switch(), fire_sched_in_preempt_notifiers() is called after finish_lock_switch(). However, depending on architecture, preemption can be enabled after finish_lock_switch() which breaks the semantics of preempt notifiers. So move it before finish_arch_switch(). This also makes the in- notifiers symmetric to out- notifiers in terms of locking - now both are called under rq lock. 
Signed-off-by: Tejun Heo Acked-by: Avi Kivity Cc: Peter Zijlstra LKML-Reference: <4AFD2801.7020900@kernel.org> Signed-off-by: Ingo Molnar --- kernel/sched.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index 701eca4958a2..cea2beac7909 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2758,9 +2758,9 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev) prev_state = prev->state; finish_arch_switch(prev); perf_event_task_sched_in(current, cpu_of(rq)); + fire_sched_in_preempt_notifiers(current); finish_lock_switch(rq, prev); - fire_sched_in_preempt_notifiers(current); if (mm) mmdrop(mm); if (unlikely(prev_state == TASK_DEAD)) { -- cgit v1.2.2 From 047106adcc85e3023da210143a6ab8a55df9e0fc Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 16 Nov 2009 10:28:09 +0100 Subject: sched: Sched_rt_periodic_timer vs cpu hotplug Heiko reported a case where a timer interrupt managed to reference a root_domain structure that was already freed by a concurrent hot-un-plug operation. Solve this like the regular sched_domain stuff is also synchronized, by adding a synchronize_sched() stmt to the free path, this ensures that a root_domain stays present for any atomic section that could have observed it. 
Reported-by: Heiko Carstens Signed-off-by: Peter Zijlstra Acked-by: Heiko Carstens Cc: Gregory Haskins Cc: Siddha Suresh B Cc: Martin Schwidefsky LKML-Reference: <1258363873.26714.83.camel@laptop> Signed-off-by: Ingo Molnar --- kernel/sched.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index cea2beac7909..3c91f110fc62 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -7912,6 +7912,8 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent) static void free_rootdomain(struct root_domain *rd) { + synchronize_sched(); + cpupri_cleanup(&rd->cpupri); free_cpumask_var(rd->rto_mask); -- cgit v1.2.2 From 429947248f814e90f416ab4f68a871ab628000c3 Mon Sep 17 00:00:00 2001 From: Jan Blunck Date: Fri, 20 Nov 2009 17:40:37 +0100 Subject: sched_feat_write(): Update ppos instead of file->f_pos sched_feat_write() should update ppos instead of file->f_pos. (This reduces some BKL dependencies of this code.) Signed-off-by: Jan Blunck Cc: jkacur@redhat.com Cc: Arnd Bergmann Cc: Frederic Weisbecker Cc: Jamie Lokier Cc: Peter Zijlstra Cc: Christoph Hellwig Cc: Alan Cox LKML-Reference: <1258735245-25826-8-git-send-email-jblunck@suse.de> Signed-off-by: Ingo Molnar --- kernel/sched.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index ab9a034c4a17..93474a7935ae 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -771,7 +771,7 @@ sched_feat_write(struct file *filp, const char __user *ubuf, if (!sched_feat_names[i]) return -EINVAL; - filp->f_pos += cnt; + *ppos += cnt; return cnt; } -- cgit v1.2.2 From 710390d90f143a9ebb87a475215140f426792efd Mon Sep 17 00:00:00 2001 From: Tim Blechmann Date: Tue, 24 Nov 2009 11:55:27 +0100 Subject: sched: Optimize branch hint in context_switch() Branch hint profiling on my nehalem machine showed over 90% incorrect branch hints: 10420275 170645395 94 context_switch sched.c 3043 10408421 
171098521 94 context_switch sched.c 3050 Signed-off-by: Tim Blechmann Cc: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo Cc: Frederic Weisbecker LKML-Reference: <4B0BBB9F.6080304@klingt.org> Signed-off-by: Ingo Molnar --- kernel/sched.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index 93474a7935ae..010d5e16b4c5 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2829,14 +2829,14 @@ context_switch(struct rq *rq, struct task_struct *prev, */ arch_start_context_switch(prev); - if (unlikely(!mm)) { + if (likely(!mm)) { next->active_mm = oldmm; atomic_inc(&oldmm->mm_count); enter_lazy_tlb(oldmm, next); } else switch_mm(oldmm, mm, next); - if (unlikely(!prev->mm)) { + if (likely(!prev->mm)) { prev->active_mm = NULL; rq->prev_mm = oldmm; } -- cgit v1.2.2 From 93335a21557e80f6a99bc2812c634e488139043c Mon Sep 17 00:00:00 2001 From: Shmulik Ladkani Date: Wed, 25 Nov 2009 15:23:41 +0200 Subject: sched.c: Call debug_show_all_locks() when dumping all tasks In commit v2.6.21-691-g39bc89f ("make SysRq-T show all tasks again") the interface of show_state_filter() was changed: zero valued 'state_filter' specifies "dump all tasks" (instead of -1). However, the condition for calling debug_show_all_locks() ("show locks if all tasks are dumped") was not updated accordingly. 
Signed-off-by: Shmulik Ladkani Cc: peterz@infradead.org LKML-Reference: <4b0d2fe4.0ab6660a.6437.3cfc@mx.google.com> Signed-off-by: Ingo Molnar --- kernel/sched.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index 010d5e16b4c5..a57c6aee6d4a 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -6915,7 +6915,7 @@ void show_state_filter(unsigned long state_filter) /* * Only show locks if all tasks are dumped: */ - if (state_filter == -1) + if (!state_filter) debug_show_all_locks(); } -- cgit v1.2.2 From f6630114d9198aa959ac95c131334c020038f253 Mon Sep 17 00:00:00 2001 From: Mike Travis Date: Tue, 17 Nov 2009 18:22:15 -0600 Subject: sched: Limit the number of scheduler debug messages Remove the verbose scheduler debug messages unless kernel parameter "sched_debug" set. /proc/sched_debug unchanged. Signed-off-by: Mike Travis Cc: Heiko Carstens Cc: Roland Dreier Cc: Randy Dunlap Cc: Tejun Heo Cc: Andi Kleen Cc: Greg Kroah-Hartman Cc: Yinghai Lu Cc: David Rientjes Cc: Steven Rostedt Cc: Rusty Russell Cc: Hidetoshi Seto Cc: Jack Steiner Cc: Frederic Weisbecker LKML-Reference: <20091118002221.489305000@alcatraz.americas.sgi.com> Signed-off-by: Ingo Molnar --- kernel/sched.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index a57c6aee6d4a..48ff66a6892d 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -7720,6 +7720,16 @@ early_initcall(migration_init); #ifdef CONFIG_SCHED_DEBUG +static __read_mostly int sched_domain_debug_enabled; + +static int __init sched_domain_debug_setup(char *str) +{ + sched_domain_debug_enabled = 1; + + return 0; +} +early_param("sched_debug", sched_domain_debug_setup); + static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, struct cpumask *groupmask) { @@ -7806,6 +7816,9 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu) cpumask_var_t groupmask; 
int level = 0; + if (!sched_domain_debug_enabled) + return; + if (!sd) { printk(KERN_DEBUG "CPU%d attaching NULL sched-domain.\n", cpu); return; -- cgit v1.2.2 From d180c5bccec02612256fd8076ff3c1fac3429553 Mon Sep 17 00:00:00 2001 From: Hidetoshi Seto Date: Thu, 26 Nov 2009 14:48:30 +0900 Subject: sched: Introduce task_times() to replace task_{u,s}time() pair Functions task_{u,s}time() are called in pair in almost all cases. However task_stime() is implemented to call task_utime() from its inside, so such paired calls run task_utime() twice. It means we do heavy divisions (div_u64 + do_div) twice to get utime and stime which can be obtained at same time by one set of divisions. This patch introduces a function task_times(*tsk, *utime, *stime) to retrieve utime and stime at once in better, optimized way. Signed-off-by: Hidetoshi Seto Acked-by: Peter Zijlstra Cc: Stanislaw Gruszka Cc: Spencer Candland Cc: Oleg Nesterov Cc: Balbir Singh Cc: Americo Wang LKML-Reference: <4B0E16AE.906@jp.fujitsu.com> Signed-off-by: Ingo Molnar --- kernel/sched.c | 55 +++++++++++++++++++++++++++++++++++-------------------- 1 file changed, 35 insertions(+), 20 deletions(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index 315ba4059f93..475a6f2b7158 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -5191,6 +5191,14 @@ cputime_t task_stime(struct task_struct *p) { return p->stime; } + +void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st) +{ + if (ut) + *ut = task_utime(p); + if (st) + *st = task_stime(p); +} #else #ifndef nsecs_to_cputime @@ -5198,41 +5206,48 @@ cputime_t task_stime(struct task_struct *p) msecs_to_cputime(div_u64((__nsecs), NSEC_PER_MSEC)) #endif -cputime_t task_utime(struct task_struct *p) +void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st) { - cputime_t utime = p->utime, total = utime + p->stime; - u64 temp; + cputime_t rtime, utime = p->utime, total = utime + p->stime; /* * Use CFS's precise accounting: 
*/ - temp = (u64)nsecs_to_cputime(p->se.sum_exec_runtime); + rtime = nsecs_to_cputime(p->se.sum_exec_runtime); if (total) { - temp *= utime; + u64 temp; + + temp = (u64)(rtime * utime); do_div(temp, total); - } - utime = (cputime_t)temp; + utime = (cputime_t)temp; + } else + utime = rtime; + /* + * Compare with previous values, to keep monotonicity: + */ p->prev_utime = max(p->prev_utime, utime); - return p->prev_utime; + p->prev_stime = max(p->prev_stime, rtime - p->prev_utime); + + if (ut) + *ut = p->prev_utime; + if (st) + *st = p->prev_stime; +} + +cputime_t task_utime(struct task_struct *p) +{ + cputime_t utime; + task_times(p, &utime, NULL); + return utime; } cputime_t task_stime(struct task_struct *p) { cputime_t stime; - - /* - * Use CFS's precise accounting. (we subtract utime from - * the total, to make sure the total observed by userspace - * grows monotonically - apps rely on that): - */ - stime = nsecs_to_cputime(p->se.sum_exec_runtime) - task_utime(p); - - if (stime >= 0) - p->prev_stime = max(p->prev_stime, stime); - - return p->prev_stime; + task_times(p, NULL, &stime); + return stime; } #endif -- cgit v1.2.2 From d5b7c78e975302a1bab28263266c39ecb71acad4 Mon Sep 17 00:00:00 2001 From: Hidetoshi Seto Date: Thu, 26 Nov 2009 14:49:05 +0900 Subject: sched: Remove task_{u,s,g}time() Now all task_{u,s}time() pairs are replaced by task_times(). And task_gtime() is too simple to be an inline function. Cleanup them all. 
Signed-off-by: Hidetoshi Seto Acked-by: Peter Zijlstra Cc: Stanislaw Gruszka Cc: Spencer Candland Cc: Oleg Nesterov Cc: Balbir Singh Cc: Americo Wang LKML-Reference: <4B0E16D1.70902@jp.fujitsu.com> Signed-off-by: Ingo Molnar --- kernel/sched.c | 33 ++------------------------------- 1 file changed, 2 insertions(+), 31 deletions(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index 475a6f2b7158..82251c21f785 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -5182,22 +5182,12 @@ void account_idle_ticks(unsigned long ticks) * Use precise platform statistics if available: */ #ifdef CONFIG_VIRT_CPU_ACCOUNTING -cputime_t task_utime(struct task_struct *p) -{ - return p->utime; -} - -cputime_t task_stime(struct task_struct *p) -{ - return p->stime; -} - void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st) { if (ut) - *ut = task_utime(p); + *ut = p->utime; if (st) - *st = task_stime(p); + *st = p->stime; } #else @@ -5235,27 +5225,8 @@ void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st) if (st) *st = p->prev_stime; } - -cputime_t task_utime(struct task_struct *p) -{ - cputime_t utime; - task_times(p, &utime, NULL); - return utime; -} - -cputime_t task_stime(struct task_struct *p) -{ - cputime_t stime; - task_times(p, NULL, &stime); - return stime; -} #endif -inline cputime_t task_gtime(struct task_struct *p) -{ - return p->gtime; -} - /* * This function gets called by the timer code, with HZ frequency. * We call it with interrupts disabled. -- cgit v1.2.2 From b7b20df91d43d5e59578b8fc16e895c0c8cbd9b5 Mon Sep 17 00:00:00 2001 From: Hidetoshi Seto Date: Thu, 26 Nov 2009 14:49:27 +0900 Subject: sched, time: Define nsecs_to_jiffies() Use of msecs_to_jiffies() for nsecs_to_cputime() have some problems: - The type of msecs_to_jiffies()'s argument is unsigned int, so it cannot convert msecs greater than UINT_MAX = about 49.7 days. 
- msecs_to_jiffies() returns MAX_JIFFY_OFFSET if MSB of argument is set, assuming that input was a negative value. So it cannot convert msecs greater than INT_MAX = about 24.8 days either. This patch defines a new function nsecs_to_jiffies() that can deal with greater values, and that can deal with all incoming values as unsigned. Signed-off-by: Hidetoshi Seto Acked-by: Peter Zijlstra Cc: Stanislaw Gruszka Cc: Spencer Candland Cc: Oleg Nesterov Cc: Balbir Singh Cc: Americo Wang Cc: Thomas Gleixner Cc: John Stultz LKML-Reference: <4B0E16E7.5070307@jp.fujitsu.com> Signed-off-by: Ingo Molnar --- kernel/sched.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index 82251c21f785..b3d4e2be95aa 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -5192,8 +5192,7 @@ void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st) #else #ifndef nsecs_to_cputime -# define nsecs_to_cputime(__nsecs) \ - msecs_to_cputime(div_u64((__nsecs), NSEC_PER_MSEC)) +# define nsecs_to_cputime(__nsecs) nsecs_to_jiffies(__nsecs) #endif void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st) -- cgit v1.2.2 From 8592e6486a177a02f048567cb928bc3a1f9a86c3 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 2 Dec 2009 12:56:46 +0900 Subject: sched: Revert 498657a478c60be092208422fefa9c7b248729c2 498657a478c60be092208422fefa9c7b248729c2 incorrectly assumed that preempt wasn't disabled around context_switch() and thus was fixing an imaginary problem. It also broke KVM because it depended on ->sched_in() to be called with irq enabled so that it can do smp calls from there. Revert the incorrect commit and add comment describing different contexts under which the two callbacks are invoked. Avi: spotted transposed in/out in the added comment.
Signed-off-by: Tejun Heo Acked-by: Avi Kivity Cc: peterz@infradead.org Cc: efault@gmx.de Cc: rusty@rustcorp.com.au LKML-Reference: <1259726212-30259-2-git-send-email-tj@kernel.org> Signed-off-by: Ingo Molnar --- kernel/sched.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index b3d4e2be95aa..1031cae39c4c 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2768,9 +2768,9 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev) prev_state = prev->state; finish_arch_switch(prev); perf_event_task_sched_in(current, cpu_of(rq)); - fire_sched_in_preempt_notifiers(current); finish_lock_switch(rq, prev); + fire_sched_in_preempt_notifiers(current); if (mm) mmdrop(mm); if (unlikely(prev_state == TASK_DEAD)) { -- cgit v1.2.2 From bdddd2963c0264c56f18043f6fa829d3c1d3d1c0 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Wed, 2 Dec 2009 14:09:16 +1030 Subject: sched: Fix isolcpus boot option Anton Blanchard wrote: > We allocate and zero cpu_isolated_map after the isolcpus > __setup option has run. This means cpu_isolated_map always > ends up empty and if CPUMASK_OFFSTACK is enabled we write to a > cpumask that hasn't been allocated. I introduced this regression in 49557e620339cb13 (sched: Fix boot crash by zalloc()ing most of the cpu masks). Use the bootmem allocator if they set isolcpus=, otherwise allocate and zero like normal. 
Reported-by: Anton Blanchard Signed-off-by: Rusty Russell Cc: peterz@infradead.org Cc: Linus Torvalds Cc: LKML-Reference: <200912021409.17013.rusty@rustcorp.com.au> Signed-off-by: Ingo Molnar Tested-by: Anton Blanchard --- kernel/sched.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index 1031cae39c4c..4883fee99314 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -8061,6 +8061,7 @@ static cpumask_var_t cpu_isolated_map; /* Setup the mask of cpus configured for isolated domains */ static int __init isolated_cpu_setup(char *str) { + alloc_bootmem_cpumask_var(&cpu_isolated_map); cpulist_parse(str, cpu_isolated_map); return 1; } @@ -9609,7 +9610,9 @@ void __init sched_init(void) zalloc_cpumask_var(&nohz.cpu_mask, GFP_NOWAIT); alloc_cpumask_var(&nohz.ilb_grp_nohz_mask, GFP_NOWAIT); #endif - zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT); + /* May be allocated at isolcpus cmdline parse time */ + if (cpu_isolated_map == NULL) + zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT); #endif /* SMP */ perf_event_init(); -- cgit v1.2.2 From d99ca3b977fc5a93141304f571475c2af9e6c1c5 Mon Sep 17 00:00:00 2001 From: Hidetoshi Seto Date: Wed, 2 Dec 2009 17:26:47 +0900 Subject: sched, cputime: Cleanups related to task_times() - Remove if({u,s}t)s because no one call it with NULL now. - Use cputime_{add,sub}(). - Add ifndef-endif for prev_{u,s}time since they are used only when !VIRT_CPU_ACCOUNTING. 
Signed-off-by: Hidetoshi Seto Cc: Peter Zijlstra Cc: Spencer Candland Cc: Americo Wang Cc: Oleg Nesterov Cc: Balbir Singh Cc: Stanislaw Gruszka LKML-Reference: <4B1624C7.7040302@jp.fujitsu.com> Signed-off-by: Ingo Molnar --- kernel/sched.c | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index 4883fee99314..17e2c1db2bde 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -5184,10 +5184,8 @@ void account_idle_ticks(unsigned long ticks) #ifdef CONFIG_VIRT_CPU_ACCOUNTING void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st) { - if (ut) - *ut = p->utime; - if (st) - *st = p->stime; + *ut = p->utime; + *st = p->stime; } #else @@ -5197,7 +5195,7 @@ void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st) void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st) { - cputime_t rtime, utime = p->utime, total = utime + p->stime; + cputime_t rtime, utime = p->utime, total = cputime_add(utime, p->stime); /* * Use CFS's precise accounting: @@ -5217,12 +5215,10 @@ void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st) * Compare with previous values, to keep monotonicity: */ p->prev_utime = max(p->prev_utime, utime); - p->prev_stime = max(p->prev_stime, rtime - p->prev_utime); + p->prev_stime = max(p->prev_stime, cputime_sub(rtime, p->prev_utime)); - if (ut) - *ut = p->prev_utime; - if (st) - *st = p->prev_stime; + *ut = p->prev_utime; + *st = p->prev_stime; } #endif -- cgit v1.2.2 From 0cf55e1ec08bb5a22e068309e2d8ba1180ab4239 Mon Sep 17 00:00:00 2001 From: Hidetoshi Seto Date: Wed, 2 Dec 2009 17:28:07 +0900 Subject: sched, cputime: Introduce thread_group_times() This is a real fix for problem of utime/stime values decreasing described in the thread: http://lkml.org/lkml/2009/11/3/522 Now cputime is accounted in the following way: - {u,s}time in task_struct are increased every time when the thread is interrupted by a tick (timer 
interrupt). - When a thread exits, its {u,s}time are added to signal->{u,s}time, after being adjusted by task_times(). - When all threads in a thread_group exit, accumulated {u,s}time (and also c{u,s}time) in signal struct are added to c{u,s}time in signal struct of the group's parent. So {u,s}time in task struct are "raw" tick counts, while {u,s}time and c{u,s}time in signal struct are "adjusted" values. And accounted values are used by: - task_times(), to get cputime of a thread: This function returns adjusted values that originate from raw {u,s}time and are scaled by sum_exec_runtime, which is accounted by CFS. - thread_group_cputime(), to get cputime of a thread group: This function returns the sum of all {u,s}time of living threads in the group, plus {u,s}time in the signal struct, which is the sum of adjusted cputimes of all exited threads that belonged to the group. The problem is the return value of thread_group_cputime(), because it is a mixed sum of "raw" values and "adjusted" values: group's {u,s}time = foreach(thread){{u,s}time} + exited({u,s}time) This misbehavior can break {u,s}time monotonicity. Assume there is a thread that has raw values greater than adjusted values (e.g. interrupted by 1000Hz ticks 50 times but only runs 45ms): if it exits, cputime will decrease (e.g. -5ms). To fix this, we could do: group's {u,s}time = foreach(t){task_times(t)} + exited({u,s}time) But task_times() contains hard divisions, so applying it for every thread should be avoided. This patch fixes the above problem in the following way: - Modify thread's exit (= __exit_signal()) not to use task_times(). It means {u,s}time in signal struct accumulates raw values instead of adjusted values. As a result, thread_group_cputime() returns a pure sum of "raw" values. - Introduce a new function thread_group_times(*task, *utime, *stime) that converts "raw" values of thread_group_cputime() to "adjusted" values, using the same calculation procedure as task_times().
- Modify group's exit (= wait_task_zombie()) to use the newly introduced thread_group_times(). This makes c{u,s}time in signal struct have adjusted values, as before this patch. - Replace some thread_group_cputime() by thread_group_times(). These replacements are only applied where the "adjusted" cputime is conveyed to users, and where task_times() is already used nearby. (i.e. sys_times(), getrusage(), and /proc/<pid>/stat.) This patch has a positive side effect: - Before this patch, if a group contains many short-life threads (e.g. runs 0.9ms and not interrupted by ticks), the group's cputime could be invisible since thread's cputime was accumulated after being adjusted: imagine adjustment function as adj(ticks, runtime), {adj(0, 0.9) + adj(0, 0.9) + ....} = {0 + 0 + ....} = 0. After this patch it will not happen because the adjustment is applied after accumulation. v2: - remove if()s, put new variables into signal_struct. Signed-off-by: Hidetoshi Seto Acked-by: Peter Zijlstra Cc: Spencer Candland Cc: Americo Wang Cc: Oleg Nesterov Cc: Balbir Singh Cc: Stanislaw Gruszka LKML-Reference: <4B162517.8040909@jp.fujitsu.com> Signed-off-by: Ingo Molnar --- kernel/sched.c | 41 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 41 insertions(+) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index 17e2c1db2bde..e6ba726941ae 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -5187,6 +5187,16 @@ void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st) *ut = p->utime; *st = p->stime; } + +void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st) +{ + struct task_cputime cputime; + + thread_group_cputime(p, &cputime); + + *ut = cputime.utime; + *st = cputime.stime; +} #else #ifndef nsecs_to_cputime @@ -5220,6 +5230,37 @@ void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st) *ut = p->prev_utime; *st = p->prev_stime; } + +/* + * Must be called with siglock held.
+ */ +void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st) +{ + struct signal_struct *sig = p->signal; + struct task_cputime cputime; + cputime_t rtime, utime, total; + + thread_group_cputime(p, &cputime); + + total = cputime_add(cputime.utime, cputime.stime); + rtime = nsecs_to_cputime(cputime.sum_exec_runtime); + + if (total) { + u64 temp; + + temp = (u64)(rtime * cputime.utime); + do_div(temp, total); + utime = (cputime_t)temp; + } else + utime = rtime; + + sig->prev_utime = max(sig->prev_utime, utime); + sig->prev_stime = max(sig->prev_stime, + cputime_sub(rtime, sig->prev_utime)); + + *ut = sig->prev_utime; + *st = sig->prev_stime; +} #endif /* -- cgit v1.2.2 From c08f782985eed9959438368e84ce1d7f2ed03d95 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 2 Dec 2009 20:49:17 +0100 Subject: mutex: Fix missing conditions to build mutex_spin_on_owner() We don't need to build mutex_spin_on_owner() if we have CONFIG_DEBUG_MUTEXES or CONFIG_HAVE_DEFAULT_NO_SPIN_MUTEXES as it won't be used under such configs. Use CONFIG_MUTEX_SPIN_ON_OWNER as it gathers all the necessary checks before building it. Signed-off-by: Frederic Weisbecker Acked-by: Peter Zijlstra LKML-Reference: <1259783357-8542-2-git-send-regression-fweisbec@gmail.com> Signed-off-by: Ingo Molnar Cc: Peter Zijlstra --- kernel/sched.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index 3c11ae0a948d..ec0af1fcb195 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -5481,7 +5481,7 @@ need_resched_nonpreemptible: } EXPORT_SYMBOL(schedule); -#ifdef CONFIG_SMP +#ifdef CONFIG_MUTEX_SPIN_ON_OWNER /* * Look out! "owner" is an entirely speculative pointer * access and not reliable. 
-- cgit v1.2.2 From 6ad4c18884e864cf4c77f9074d3d1816063f99cd Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 25 Nov 2009 13:31:39 +0100 Subject: sched: Fix balance vs hotplug race Since (e761b77: cpu hotplug, sched: Introduce cpu_active_map and redo sched domain managment) we have cpu_active_mask which is suppose to rule scheduler migration and load-balancing, except it never (fully) did. The particular problem being solved here is a crash in try_to_wake_up() where select_task_rq() ends up selecting an offline cpu because select_task_rq_fair() trusts the sched_domain tree to reflect the current state of affairs, similarly select_task_rq_rt() trusts the root_domain. However, the sched_domains are updated from CPU_DEAD, which is after the cpu is taken offline and after stop_machine is done. Therefore it can race perfectly well with code assuming the domains are right. Cure this by building the domains from cpu_active_mask on CPU_DOWN_PREPARE. Signed-off-by: Peter Zijlstra LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/sched.c | 32 +++++++++++++++++--------------- 1 file changed, 17 insertions(+), 15 deletions(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index aa31244caa9f..281da29d0801 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -4134,7 +4134,7 @@ static int load_balance(int this_cpu, struct rq *this_rq, unsigned long flags; struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask); - cpumask_copy(cpus, cpu_online_mask); + cpumask_copy(cpus, cpu_active_mask); /* * When power savings policy is enabled for the parent domain, idle @@ -4297,7 +4297,7 @@ load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd) int all_pinned = 0; struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask); - cpumask_copy(cpus, cpu_online_mask); + cpumask_copy(cpus, cpu_active_mask); /* * When power savings policy is enabled for the parent domain, idle @@ -4694,7 +4694,7 @@ int select_nohz_load_balancer(int 
stop_tick) cpumask_set_cpu(cpu, nohz.cpu_mask); /* time for ilb owner also to sleep */ - if (cpumask_weight(nohz.cpu_mask) == num_online_cpus()) { + if (cpumask_weight(nohz.cpu_mask) == num_active_cpus()) { if (atomic_read(&nohz.load_balancer) == cpu) atomic_set(&nohz.load_balancer, -1); return 0; @@ -7093,7 +7093,7 @@ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask) int ret = 0; rq = task_rq_lock(p, &flags); - if (!cpumask_intersects(new_mask, cpu_online_mask)) { + if (!cpumask_intersects(new_mask, cpu_active_mask)) { ret = -EINVAL; goto out; } @@ -7115,7 +7115,7 @@ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask) if (cpumask_test_cpu(task_cpu(p), new_mask)) goto out; - if (migrate_task(p, cpumask_any_and(cpu_online_mask, new_mask), &req)) { + if (migrate_task(p, cpumask_any_and(cpu_active_mask, new_mask), &req)) { /* Need help from migration thread: drop lock and wait. */ struct task_struct *mt = rq->migration_thread; @@ -7269,19 +7269,19 @@ static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p) again: /* Look for allowed, online CPU in same node. */ - for_each_cpu_and(dest_cpu, nodemask, cpu_online_mask) + for_each_cpu_and(dest_cpu, nodemask, cpu_active_mask) if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed)) goto move; /* Any allowed, online CPU? */ - dest_cpu = cpumask_any_and(&p->cpus_allowed, cpu_online_mask); + dest_cpu = cpumask_any_and(&p->cpus_allowed, cpu_active_mask); if (dest_cpu < nr_cpu_ids) goto move; /* No more Mr. Nice Guy. 
*/ if (dest_cpu >= nr_cpu_ids) { cpuset_cpus_allowed_locked(p, &p->cpus_allowed); - dest_cpu = cpumask_any_and(cpu_online_mask, &p->cpus_allowed); + dest_cpu = cpumask_any_and(cpu_active_mask, &p->cpus_allowed); /* * Don't tell them about moving exiting tasks or @@ -7310,7 +7310,7 @@ move: */ static void migrate_nr_uninterruptible(struct rq *rq_src) { - struct rq *rq_dest = cpu_rq(cpumask_any(cpu_online_mask)); + struct rq *rq_dest = cpu_rq(cpumask_any(cpu_active_mask)); unsigned long flags; local_irq_save(flags); @@ -7564,7 +7564,7 @@ static ctl_table *sd_alloc_ctl_cpu_table(int cpu) static struct ctl_table_header *sd_sysctl_header; static void register_sched_domain_sysctl(void) { - int i, cpu_num = num_online_cpus(); + int i, cpu_num = num_possible_cpus(); struct ctl_table *entry = sd_alloc_ctl_entry(cpu_num + 1); char buf[32]; @@ -7574,7 +7574,7 @@ static void register_sched_domain_sysctl(void) if (entry == NULL) return; - for_each_online_cpu(i) { + for_each_possible_cpu(i) { snprintf(buf, 32, "cpu%d", i); entry->procname = kstrdup(buf, GFP_KERNEL); entry->mode = 0555; @@ -9100,7 +9100,7 @@ match1: if (doms_new == NULL) { ndoms_cur = 0; doms_new = &fallback_doms; - cpumask_andnot(doms_new[0], cpu_online_mask, cpu_isolated_map); + cpumask_andnot(doms_new[0], cpu_active_mask, cpu_isolated_map); WARN_ON_ONCE(dattr_new); } @@ -9231,8 +9231,10 @@ static int update_sched_domains(struct notifier_block *nfb, switch (action) { case CPU_ONLINE: case CPU_ONLINE_FROZEN: - case CPU_DEAD: - case CPU_DEAD_FROZEN: + case CPU_DOWN_PREPARE: + case CPU_DOWN_PREPARE_FROZEN: + case CPU_DOWN_FAILED: + case CPU_DOWN_FAILED_FROZEN: partition_sched_domains(1, NULL, NULL); return NOTIFY_OK; @@ -9279,7 +9281,7 @@ void __init sched_init_smp(void) #endif get_online_cpus(); mutex_lock(&sched_domains_mutex); - arch_init_sched_domains(cpu_online_mask); + arch_init_sched_domains(cpu_active_mask); cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map); if 
(cpumask_empty(non_isolated_cpus)) cpumask_set_cpu(smp_processor_id(), non_isolated_cpus); -- cgit v1.2.2 From 3160568371da441b7f2fb57f2f1225404207e8f2 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 8 Dec 2009 20:24:16 +0000 Subject: sched: Protect task->cpus_allowed access in sched_getaffinity() sched_getaffinity() is not protected against a concurrent modification of the tasks affinity. Serialize the access with task_rq_lock(task). Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra LKML-Reference: <20091208202026.769251187@linutronix.de> Signed-off-by: Ingo Molnar --- kernel/sched.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index 281da29d0801..c4635f74540c 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -6631,6 +6631,8 @@ SYSCALL_DEFINE3(sched_setaffinity, pid_t, pid, unsigned int, len, long sched_getaffinity(pid_t pid, struct cpumask *mask) { struct task_struct *p; + unsigned long flags; + struct rq *rq; int retval; get_online_cpus(); @@ -6645,7 +6647,9 @@ long sched_getaffinity(pid_t pid, struct cpumask *mask) if (retval) goto out_unlock; + rq = task_rq_lock(p, &flags); cpumask_and(mask, &p->cpus_allowed, cpu_online_mask); + task_rq_unlock(rq, &flags); out_unlock: read_unlock(&tasklist_lock); -- cgit v1.2.2 From dba091b9e3522b9d32fc9975e48d3b69633b45f0 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 9 Dec 2009 09:32:03 +0100 Subject: sched: Protect sched_rr_get_param() access to task->sched_class sched_rr_get_param calls task->sched_class->get_rr_interval(task) without protection against a concurrent sched_setscheduler() call which modifies task->sched_class. Serialize the access with task_rq_lock(task) and hand the rq pointer into get_rr_interval() as it's needed at least in the sched_fair implementation. 
Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/sched.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index c4635f74540c..68db5a2e6545 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -6887,6 +6887,8 @@ SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid, { struct task_struct *p; unsigned int time_slice; + unsigned long flags; + struct rq *rq; int retval; struct timespec t; @@ -6903,7 +6905,9 @@ SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid, if (retval) goto out_unlock; - time_slice = p->sched_class->get_rr_interval(p); + rq = task_rq_lock(p, &flags); + time_slice = p->sched_class->get_rr_interval(rq, p); + task_rq_unlock(rq, &flags); read_unlock(&tasklist_lock); jiffies_to_timespec(time_slice, &t); -- cgit v1.2.2 From 970b13bacba14a8cef6f642861947df1d175b0b3 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 25 Nov 2009 13:31:39 +0100 Subject: sched: Consolidate select_task_rq() callers Small cleanup. 
Signed-off-by: Peter Zijlstra LKML-Reference: [ v2: build fix ] Signed-off-by: Ingo Molnar --- kernel/sched.c | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index 68db5a2e6545..01fd131b47a4 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2323,6 +2323,14 @@ void task_oncpu_function_call(struct task_struct *p, preempt_enable(); } +#ifdef CONFIG_SMP +static inline +int select_task_rq(struct task_struct *p, int sd_flags, int wake_flags) +{ + return p->sched_class->select_task_rq(p, sd_flags, wake_flags); +} +#endif + /*** * try_to_wake_up - wake up a thread * @p: the to-be-woken-up thread @@ -2376,7 +2384,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, p->state = TASK_WAKING; task_rq_unlock(rq, &flags); - cpu = p->sched_class->select_task_rq(p, SD_BALANCE_WAKE, wake_flags); + cpu = select_task_rq(p, SD_BALANCE_WAKE, wake_flags); if (cpu != orig_cpu) { local_irq_save(flags); rq = cpu_rq(cpu); @@ -2593,7 +2601,7 @@ void sched_fork(struct task_struct *p, int clone_flags) p->sched_class = &fair_sched_class; #ifdef CONFIG_SMP - cpu = p->sched_class->select_task_rq(p, SD_BALANCE_FORK, 0); + cpu = select_task_rq(p, SD_BALANCE_FORK, 0); #endif local_irq_save(flags); update_rq_clock(cpu_rq(cpu)); @@ -3156,7 +3164,7 @@ out: void sched_exec(void) { int new_cpu, this_cpu = get_cpu(); - new_cpu = current->sched_class->select_task_rq(current, SD_BALANCE_EXEC, 0); + new_cpu = select_task_rq(current, SD_BALANCE_EXEC, 0); put_cpu(); if (new_cpu != this_cpu) sched_migrate_task(current, new_cpu); -- cgit v1.2.2 From 5afcdab706d6002cb02b567ba46e650215e694e8 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 27 Nov 2009 14:12:25 +0100 Subject: sched: Remove rq->clock coupling from set_task_cpu() set_task_cpu() should be rq invariant and only touch task state, it currently fails to do so, which opens up a few races, since not all callers hold both rq->locks. 
Remove the reliance on rq->clock, as any site calling set_task_cpu() should also do a remote clock update, which should ensure the observed time between these two cpus is monotonic, as per kernel/sched_clock.c:sched_clock_remote(). Therefore we can simply remove the clock_offset bits and be happy. Signed-off-by: Peter Zijlstra LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/sched.c | 13 +------------ 1 file changed, 1 insertion(+), 12 deletions(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index 01fd131b47a4..1f9c6d99f15d 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2060,23 +2060,12 @@ task_hot(struct task_struct *p, u64 now, struct sched_domain *sd) void set_task_cpu(struct task_struct *p, unsigned int new_cpu) { int old_cpu = task_cpu(p); - struct rq *old_rq = cpu_rq(old_cpu), *new_rq = cpu_rq(new_cpu); + struct rq *old_rq = cpu_rq(old_cpu); struct cfs_rq *old_cfsrq = task_cfs_rq(p), *new_cfsrq = cpu_cfs_rq(old_cfsrq, new_cpu); - u64 clock_offset; - - clock_offset = old_rq->clock - new_rq->clock; trace_sched_migrate_task(p, new_cpu); -#ifdef CONFIG_SCHEDSTATS - if (p->se.wait_start) - p->se.wait_start -= clock_offset; - if (p->se.sleep_start) - p->se.sleep_start -= clock_offset; - if (p->se.block_start) - p->se.block_start -= clock_offset; -#endif if (old_cpu != new_cpu) { p->se.nr_migrations++; #ifdef CONFIG_SCHEDSTATS -- cgit v1.2.2 From ab19cb23313733c55e0517607844b86720b35f5f Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 27 Nov 2009 15:44:43 +0100 Subject: sched: Clean up ttwu() rq locking Since set_task_cpu() doesn't rely on rq->clock anymore we can simplify the mess in ttwu(). Optimize things a bit by not fiddling with the IRQ state there.
Signed-off-by: Peter Zijlstra LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/sched.c | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index 1f9c6d99f15d..c92670f8e097 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2371,17 +2371,14 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, if (task_contributes_to_load(p)) rq->nr_uninterruptible--; p->state = TASK_WAKING; - task_rq_unlock(rq, &flags); + __task_rq_unlock(rq); cpu = select_task_rq(p, SD_BALANCE_WAKE, wake_flags); - if (cpu != orig_cpu) { - local_irq_save(flags); - rq = cpu_rq(cpu); - update_rq_clock(rq); + if (cpu != orig_cpu) set_task_cpu(p, cpu); - local_irq_restore(flags); - } - rq = task_rq_lock(p, &flags); + + rq = __task_rq_lock(p); + update_rq_clock(rq); WARN_ON(p->state != TASK_WAKING); cpu = task_cpu(p); -- cgit v1.2.2 From cd29fe6f2637cc2ccbda5ac65f5332d6bf5fa3c6 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 27 Nov 2009 17:32:46 +0100 Subject: sched: Sanitize fork() handling Currently we try to do task placement in wake_up_new_task() after we do the load-balance pass in sched_fork(). This yields complicated semantics in that we have to deal with tasks on different RQs and the set_task_cpu() calls in copy_process() and sched_fork() Rename ->task_new() to ->task_fork() and call it from sched_fork() before the balancing, this gives the policy a clear point to place the task. 
Signed-off-by: Peter Zijlstra LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/sched.c | 47 ++++++++++++++++++----------------------------- 1 file changed, 18 insertions(+), 29 deletions(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index c92670f8e097..33c903573132 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1811,6 +1811,20 @@ static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares) static void calc_load_account_active(struct rq *this_rq); +static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu) +{ + set_task_rq(p, cpu); +#ifdef CONFIG_SMP + /* + * After ->cpu is set up to a new value, task_rq_lock(p, ...) can be + * successfuly executed on another CPU. We must ensure that updates of + * per-task data have been completed by this moment. + */ + smp_wmb(); + task_thread_info(p)->cpu = cpu; +#endif +} + #include "sched_stats.h" #include "sched_idletask.c" #include "sched_fair.c" @@ -1967,20 +1981,6 @@ inline int task_curr(const struct task_struct *p) return cpu_curr(task_cpu(p)) == p; } -static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu) -{ - set_task_rq(p, cpu); -#ifdef CONFIG_SMP - /* - * After ->cpu is set up to a new value, task_rq_lock(p, ...) can be - * successfuly executed on another CPU. We must ensure that updates of - * per-task data have been completed by this moment. 
- */ - smp_wmb(); - task_thread_info(p)->cpu = cpu; -#endif -} - static inline void check_class_changed(struct rq *rq, struct task_struct *p, const struct sched_class *prev_class, int oldprio, int running) @@ -2552,7 +2552,6 @@ static void __sched_fork(struct task_struct *p) void sched_fork(struct task_struct *p, int clone_flags) { int cpu = get_cpu(); - unsigned long flags; __sched_fork(p); @@ -2586,13 +2585,13 @@ void sched_fork(struct task_struct *p, int clone_flags) if (!rt_prio(p->prio)) p->sched_class = &fair_sched_class; + if (p->sched_class->task_fork) + p->sched_class->task_fork(p); + #ifdef CONFIG_SMP cpu = select_task_rq(p, SD_BALANCE_FORK, 0); #endif - local_irq_save(flags); - update_rq_clock(cpu_rq(cpu)); set_task_cpu(p, cpu); - local_irq_restore(flags); #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) if (likely(sched_info_on())) @@ -2625,17 +2624,7 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags) rq = task_rq_lock(p, &flags); BUG_ON(p->state != TASK_RUNNING); update_rq_clock(rq); - - if (!p->sched_class->task_new || !current->se.on_rq) { - activate_task(rq, p, 0); - } else { - /* - * Let the scheduling class do new task startup - * management (if any): - */ - p->sched_class->task_new(rq, p); - inc_nr_running(rq); - } + activate_task(rq, p, 0); trace_sched_wakeup_new(rq, p, 1); check_preempt_curr(rq, p, WF_FORK); #ifdef CONFIG_SMP -- cgit v1.2.2 From 6cecd084d0fd27bb1e498e2829fd45846d806856 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 30 Nov 2009 13:00:37 +0100 Subject: sched: Discard some old bits WAKEUP_RUNNING was an experiment, not sure why that ever ended up being merged... 
Signed-off-by: Peter Zijlstra LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/sched.c | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index 33c903573132..0170735bdafc 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2493,7 +2493,6 @@ static void __sched_fork(struct task_struct *p) p->se.avg_overlap = 0; p->se.start_runtime = 0; p->se.avg_wakeup = sysctl_sched_wakeup_granularity; - p->se.avg_running = 0; #ifdef CONFIG_SCHEDSTATS p->se.wait_start = 0; @@ -5379,13 +5378,14 @@ static inline void schedule_debug(struct task_struct *prev) #endif } -static void put_prev_task(struct rq *rq, struct task_struct *p) +static void put_prev_task(struct rq *rq, struct task_struct *prev) { - u64 runtime = p->se.sum_exec_runtime - p->se.prev_sum_exec_runtime; + if (prev->state == TASK_RUNNING) { + u64 runtime = prev->se.sum_exec_runtime; - update_avg(&p->se.avg_running, runtime); + runtime -= prev->se.prev_sum_exec_runtime; + runtime = min_t(u64, runtime, 2*sysctl_sched_migration_cost); - if (p->state == TASK_RUNNING) { /* * In order to avoid avg_overlap growing stale when we are * indeed overlapping and hence not getting put to sleep, grow @@ -5395,12 +5395,9 @@ static void put_prev_task(struct rq *rq, struct task_struct *p) * correlates to the amount of cache footprint a task can * build up. 
*/ - runtime = min_t(u64, runtime, 2*sysctl_sched_migration_cost); - update_avg(&p->se.avg_overlap, runtime); - } else { - update_avg(&p->se.avg_running, 0); + update_avg(&prev->se.avg_overlap, runtime); } - p->sched_class->put_prev_task(rq, p); + prev->sched_class->put_prev_task(rq, prev); } /* -- cgit v1.2.2 From cd8ad40de36c2fe75f3b731bd70198b385895246 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 3 Dec 2009 18:00:07 +0100 Subject: sched: cgroup: Implement different treatment for idle shares When setting the weight for a per-cpu task-group, we have to put in a phantom weight when there is no work on that cpu, otherwise we'll not service that cpu when new work gets placed there until we again update the per-cpu weights. We used to add these phantom weights to the total, so that the idle per-cpu shares don't get inflated; this, however, causes the non-idle parts to get deflated, causing unexpected weight distributions. Reverse this, so that the non-idle shares are correct but the idle shares are inflated. 
Reported-by: Yasunori Goto Tested-by: Yasunori Goto Signed-off-by: Peter Zijlstra LKML-Reference: <1257934048.23203.76.camel@twins> Signed-off-by: Ingo Molnar --- kernel/sched.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index 0170735bdafc..71eb0622f548 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1614,7 +1614,7 @@ static void update_group_shares_cpu(struct task_group *tg, int cpu, */ static int tg_shares_up(struct task_group *tg, void *data) { - unsigned long weight, rq_weight = 0, shares = 0; + unsigned long weight, rq_weight = 0, sum_weight = 0, shares = 0; unsigned long *usd_rq_weight; struct sched_domain *sd = data; unsigned long flags; @@ -1630,6 +1630,7 @@ static int tg_shares_up(struct task_group *tg, void *data) weight = tg->cfs_rq[i]->load.weight; usd_rq_weight[i] = weight; + rq_weight += weight; /* * If there are currently no tasks on the cpu pretend there * is one of average load so that when a new task gets to @@ -1638,10 +1639,13 @@ static int tg_shares_up(struct task_group *tg, void *data) if (!weight) weight = NICE_0_LOAD; - rq_weight += weight; + sum_weight += weight; shares += tg->cfs_rq[i]->shares; } + if (!rq_weight) + rq_weight = sum_weight; + if ((!shares && rq_weight) || shares > tg->shares) shares = tg->shares; -- cgit v1.2.2 From 57785df5ac53c70da9fb53696130f3c551bfe1f9 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 4 Dec 2009 09:59:02 +0100 Subject: sched: Fix task priority bug 83f9ac removed a call to effective_prio() in wake_up_new_task(), which leads to tasks running at MAX_PRIO. This is caused by the idle thread being set to MAX_PRIO before forking off init. O(1) used that to make sure idle was always preempted, CFS uses check_preempt_curr_idle() for that so we can safely remove this bit of legacy code. 
Reported-by: Mike Galbraith Tested-by: Mike Galbraith Signed-off-by: Peter Zijlstra LKML-Reference: <1259754383.4003.610.camel@laptop> Signed-off-by: Ingo Molnar --- kernel/sched.c | 6 ------ 1 file changed, 6 deletions(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index 71eb0622f548..3878f5018007 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -3158,10 +3158,6 @@ static void pull_task(struct rq *src_rq, struct task_struct *p, deactivate_task(src_rq, p, 0); set_task_cpu(p, this_cpu); activate_task(this_rq, p, 0); - /* - * Note that idle threads have a prio of MAX_PRIO, for this test - * to be always true for them. - */ check_preempt_curr(this_rq, p, 0); } @@ -6992,7 +6988,6 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu) __sched_fork(idle); idle->se.exec_start = sched_clock(); - idle->prio = idle->normal_prio = MAX_PRIO; cpumask_copy(&idle->cpus_allowed, cpumask_of(cpu)); __set_task_cpu(idle, cpu); @@ -7696,7 +7691,6 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) spin_lock_irq(&rq->lock); update_rq_clock(rq); deactivate_task(rq, rq->idle, 0); - rq->idle->static_prio = MAX_PRIO; __setscheduler(rq, rq->idle, SCHED_NORMAL, 0); rq->idle->sched_class = &idle_sched_class; migrate_dead_tasks(cpu); -- cgit v1.2.2 From 0bcdcf28c979869f44e05121b96ff2cfb05bd8e6 Mon Sep 17 00:00:00 2001 From: Christian Ehrhardt Date: Mon, 30 Nov 2009 12:16:46 +0100 Subject: sched: Fix missing sched tunable recalculation on cpu add/remove Based on Peter Zijlstra's patch suggestion this enables recalculation of the scheduler tunables in response to a change in the number of cpus. It also adds a max of eight cpus that are considered in that scaling. 
Signed-off-by: Christian Ehrhardt Signed-off-by: Peter Zijlstra LKML-Reference: <1259579808-11357-2-git-send-email-ehrhardt@linux.vnet.ibm.com> Signed-off-by: Ingo Molnar --- kernel/sched.c | 29 ++++++++++++++++------------- 1 file changed, 16 insertions(+), 13 deletions(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index 3878f5018007..b54ecf84b6be 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -814,6 +814,7 @@ const_debug unsigned int sysctl_sched_nr_migrate = 32; * default: 0.25ms */ unsigned int sysctl_sched_shares_ratelimit = 250000; +unsigned int normalized_sysctl_sched_shares_ratelimit = 250000; /* * Inject some fuzzyness into changing the per-cpu group shares @@ -1814,6 +1815,7 @@ static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares) #endif static void calc_load_account_active(struct rq *this_rq); +static void update_sysctl(void); static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu) { @@ -7028,22 +7030,23 @@ cpumask_var_t nohz_cpu_mask; * * This idea comes from the SD scheduler of Con Kolivas: */ -static inline void sched_init_granularity(void) +static void update_sysctl(void) { - unsigned int factor = 1 + ilog2(num_online_cpus()); - const unsigned long limit = 200000000; - - sysctl_sched_min_granularity *= factor; - if (sysctl_sched_min_granularity > limit) - sysctl_sched_min_granularity = limit; - - sysctl_sched_latency *= factor; - if (sysctl_sched_latency > limit) - sysctl_sched_latency = limit; + unsigned int cpus = min(num_online_cpus(), 8U); + unsigned int factor = 1 + ilog2(cpus); - sysctl_sched_wakeup_granularity *= factor; +#define SET_SYSCTL(name) \ + (sysctl_##name = (factor) * normalized_sysctl_##name) + SET_SYSCTL(sched_min_granularity); + SET_SYSCTL(sched_latency); + SET_SYSCTL(sched_wakeup_granularity); + SET_SYSCTL(sched_shares_ratelimit); +#undef SET_SYSCTL +} - sysctl_sched_shares_ratelimit *= factor; +static inline void sched_init_granularity(void) +{ + 
update_sysctl(); } #ifdef CONFIG_SMP -- cgit v1.2.2 From 1983a922a1bc843806b9a36cf3a370b242783140 Mon Sep 17 00:00:00 2001 From: Christian Ehrhardt Date: Mon, 30 Nov 2009 12:16:47 +0100 Subject: sched: Make tunable scaling style configurable As scaling now takes place on all kinds of cpu add/remove events a user that configures values via proc should be able to configure if his set values are still rescaled or kept whatever happens. As the comments state that log2 was just a second guess that worked the interface is not just designed for on/off, but to choose a scaling type. Currently this allows none, log and linear, but more importantly it allows us to keep the interface even if someone has an even better idea how to scale the values. Signed-off-by: Christian Ehrhardt Signed-off-by: Peter Zijlstra LKML-Reference: <1259579808-11357-3-git-send-email-ehrhardt@linux.vnet.ibm.com> Signed-off-by: Ingo Molnar --- kernel/sched.c | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index b54ecf84b6be..116efed962c6 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -7033,7 +7033,20 @@ cpumask_var_t nohz_cpu_mask; static void update_sysctl(void) { unsigned int cpus = min(num_online_cpus(), 8U); - unsigned int factor = 1 + ilog2(cpus); + unsigned int factor; + + switch (sysctl_sched_tunable_scaling) { + case SCHED_TUNABLESCALING_NONE: + factor = 1; + break; + case SCHED_TUNABLESCALING_LINEAR: + factor = cpus; + break; + case SCHED_TUNABLESCALING_LOG: + default: + factor = 1 + ilog2(cpus); + break; + } #define SET_SYSCTL(name) \ (sysctl_##name = (factor) * normalized_sysctl_##name) -- cgit v1.2.2 From acb4a848da821a095ae9e4d8b22ae2d9633ba5cd Mon Sep 17 00:00:00 2001 From: Christian Ehrhardt Date: Mon, 30 Nov 2009 12:16:48 +0100 Subject: sched: Update normalized values on user updates via proc The normalized values are also recalculated in case the scaling factor changes. 
This patch updates the internally used scheduler tuning values that are normalized to one cpu in case a user sets new values via sysfs. Together with patch 2 of this series this allows to let user configured values scale (or not) to cpu add/remove events taking place later. Signed-off-by: Christian Ehrhardt Signed-off-by: Peter Zijlstra LKML-Reference: <1259579808-11357-4-git-send-email-ehrhardt@linux.vnet.ibm.com> [ v2: fix warning ] Signed-off-by: Ingo Molnar --- kernel/sched.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index 116efed962c6..0a60e8e9b094 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1816,6 +1816,7 @@ static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares) static void calc_load_account_active(struct rq *this_rq); static void update_sysctl(void); +static int get_update_sysctl_factor(void); static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu) { @@ -7030,9 +7031,9 @@ cpumask_var_t nohz_cpu_mask; * * This idea comes from the SD scheduler of Con Kolivas: */ -static void update_sysctl(void) +static int get_update_sysctl_factor(void) { - unsigned int cpus = min(num_online_cpus(), 8U); + unsigned int cpus = min(num_online_cpus(), 8); unsigned int factor; switch (sysctl_sched_tunable_scaling) { @@ -7048,6 +7049,13 @@ static void update_sysctl(void) break; } + return factor; +} + +static void update_sysctl(void) +{ + unsigned int factor = get_update_sysctl_factor(); + #define SET_SYSCTL(name) \ (sysctl_##name = (factor) * normalized_sysctl_##name) SET_SYSCTL(sched_min_granularity); -- cgit v1.2.2 From 4ca3ef71f54655af98b66e8ff308a47a2a580a53 Mon Sep 17 00:00:00 2001 From: Mike Galbraith Date: Thu, 10 Dec 2009 09:25:53 +0100 Subject: sched: Fix build warning in get_update_sysctl_factor() Signed-off-by: Mike Galbraith Acked-by: Peter Zijlstra Signed-off-by: Ingo Molnar LKML-Reference: --- kernel/sched.c | 2 +- 1 file 
changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index 0a60e8e9b094..3de3deab8095 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -7033,7 +7033,7 @@ cpumask_var_t nohz_cpu_mask; */ static int get_update_sysctl_factor(void) { - unsigned int cpus = min(num_online_cpus(), 8); + unsigned int cpus = min_t(int, num_online_cpus(), 8); unsigned int factor; switch (sysctl_sched_tunable_scaling) { -- cgit v1.2.2 From dfc12eb26a285df316be68a808af86964f3bff86 Mon Sep 17 00:00:00 2001 From: Phil Carmody Date: Thu, 10 Dec 2009 14:29:37 +0200 Subject: sched: Fix memory leak in two error corner cases If the second in each of these pairs of allocations fails, then the first one will not be freed in the error route out. Found by a static code analysis tool. Signed-off-by: Phil Carmody Acked-by: Peter Zijlstra LKML-Reference: <1260448177-28448-1-git-send-email-ext-phil.2.carmody@nokia.com> Signed-off-by: Ingo Molnar --- kernel/sched.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index 3de3deab8095..36cc05a76947 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -9855,13 +9855,15 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) se = kzalloc_node(sizeof(struct sched_entity), GFP_KERNEL, cpu_to_node(i)); if (!se) - goto err; + goto err_free_rq; init_tg_cfs_entry(tg, cfs_rq, se, i, 0, parent->se[i]); } return 1; + err_free_rq: + kfree(cfs_rq); err: return 0; } @@ -9943,13 +9945,15 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent) rt_se = kzalloc_node(sizeof(struct sched_rt_entity), GFP_KERNEL, cpu_to_node(i)); if (!rt_se) - goto err; + goto err_free_rq; init_tg_rt_entry(tg, rt_rq, rt_se, i, 0, parent->rt_se[i]); } return 1; + err_free_rq: + kfree(rt_rq); err: return 0; } -- cgit v1.2.2 From b9889ed1ddeca5a3f3569c8de7354e9e97d803ae Mon Sep 17 00:00:00 2001 From: Ingo 
Molnar Date: Thu, 10 Dec 2009 20:32:39 +0100 Subject: sched: Remove forced2_migrations stats This build warning: kernel/sched.c: In function 'set_task_cpu': kernel/sched.c:2070: warning: unused variable 'old_rq' Made me realize that the forced2_migrations stat looks pretty pointless (and a misnomer) - remove it. Cc: Peter Zijlstra Cc: Mike Galbraith LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/sched.c | 6 ------ 1 file changed, 6 deletions(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index 36cc05a76947..bc68037f3199 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2067,7 +2067,6 @@ task_hot(struct task_struct *p, u64 now, struct sched_domain *sd) void set_task_cpu(struct task_struct *p, unsigned int new_cpu) { int old_cpu = task_cpu(p); - struct rq *old_rq = cpu_rq(old_cpu); struct cfs_rq *old_cfsrq = task_cfs_rq(p), *new_cfsrq = cpu_cfs_rq(old_cfsrq, new_cpu); @@ -2075,10 +2074,6 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu) if (old_cpu != new_cpu) { p->se.nr_migrations++; -#ifdef CONFIG_SCHEDSTATS - if (task_hot(p, old_rq->clock, NULL)) - schedstat_inc(p, se.nr_forced2_migrations); -#endif perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS, 1, 1, NULL, 0); } @@ -2521,7 +2516,6 @@ static void __sched_fork(struct task_struct *p) p->se.nr_failed_migrations_running = 0; p->se.nr_failed_migrations_hot = 0; p->se.nr_forced_migrations = 0; - p->se.nr_forced2_migrations = 0; p->se.nr_wakeups = 0; p->se.nr_wakeups_sync = 0; -- cgit v1.2.2