From cd79007634854f9e936e2369890f2512f94b8759 Mon Sep 17 00:00:00 2001
From: Milton Miller
Date: Wed, 17 Oct 2007 16:55:11 +0200
Subject: sched: more robust sd-sysctl entry freeing

It occurred to me this morning that the procname field was dynamically
allocated and needed to be freed. I started to put in break statements
when allocation failed, but it was approaching 50% error-handling code.

I came up with this alternative of looping while entry->mode is set and
checking proc_handler instead of ->table. Alternatively, the string
version of the domain name and cpu number could be stored in the structs.

I verified by compiling with CONFIG_DEBUG_SLAB and checking the
allocation counts after taking a cpuset exclusive and back.

Signed-off-by: Ingo Molnar
---
 kernel/sched.c | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

diff --git a/kernel/sched.c b/kernel/sched.c
index 0da2b2635c54..5e220bf7d4c4 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -5272,11 +5272,20 @@ static struct ctl_table *sd_alloc_ctl_entry(int n)
 
 static void sd_free_ctl_entry(struct ctl_table **tablep)
 {
-	struct ctl_table *entry = *tablep;
+	struct ctl_table *entry;
 
-	for (entry = *tablep; entry->procname; entry++)
+	/*
+	 * In the intermediate directories, both the child directory and
+	 * procname are dynamically allocated and could fail but the mode
+	 * will always be set. In the lowest directory the names are
+	 * static strings and all have proc handlers.
+	 */
+	for (entry = *tablep; entry->mode; entry++) {
 		if (entry->child)
 			sd_free_ctl_entry(&entry->child);
+		if (entry->proc_handler == NULL)
+			kfree(entry->procname);
+	}
 
 	kfree(*tablep);
 	*tablep = NULL;
--
cgit v1.2.2
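The freeing rule above is easier to see outside the kernel. Below is a
minimal, self-contained userspace sketch of the same pattern; "struct entry"
and its fields are simplified stand-ins for ctl_table, not the real kernel
API: directory entries own a heap-allocated name and a child table, leaf
entries use static names and carry a handler, and the loop keys off mode
being set, freeing a name only when no handler is present.

#include <stdlib.h>
#include <string.h>

struct entry {
	char *name;            /* heap-allocated for directory entries only */
	int mode;              /* non-zero for every valid entry            */
	struct entry *child;   /* sub-table for directory entries           */
	int (*handler)(void);  /* set only on leaf entries                  */
};

static void free_table(struct entry **tablep)
{
	struct entry *e;

	/* Stop at the zero-filled terminator entry (mode == 0). */
	for (e = *tablep; e->mode; e++) {
		if (e->child)
			free_table(&e->child);
		/* Leaf entries have handlers and static names; skip those. */
		if (e->handler == NULL)
			free(e->name);
	}
	free(*tablep);
	*tablep = NULL;
}

int main(void)
{
	/* One directory entry plus an all-zero terminator, the way the
	 * table is built: calloc() provides the terminating entry. */
	struct entry *t = calloc(2, sizeof(*t));
	if (!t)
		return 1;
	t[0].name = strdup("domain0");
	t[0].mode = 0555;

	free_table(&t);
	return 0;
}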
From 908a7c1b9b80d06708177432020c80d147754691 Mon Sep 17 00:00:00 2001
From: Ken Chen
Date: Wed, 17 Oct 2007 16:55:11 +0200
Subject: sched: fix improper load balance across sched domain

We recently discovered a nasty performance bug in the kernel CPU load
balancer where we were hit by a 50% performance regression.

When tasks are assigned to a subset of CPUs that span across sched_domains
(either a ccNUMA node or the new multi-core domain) via cpu affinity, the
kernel fails to perform proper load balancing at these domains, because
several pieces of logic in find_busiest_group() misidentify the busiest
sched group within a given domain. This leads to inadequate load balancing
and causes the 50% performance hit.

To give a concrete example: on a dual-core, 2-socket numa system, there
are 4 logical cpus, organized as:

CPU0 attaching sched-domain:
 domain 0: span 0003
  groups: 0001 0002
 domain 1: span 000f
  groups: 0003 000c
CPU1 attaching sched-domain:
 domain 0: span 0003
  groups: 0002 0001
 domain 1: span 000f
  groups: 0003 000c
CPU2 attaching sched-domain:
 domain 0: span 000c
  groups: 0004 0008
 domain 1: span 000f
  groups: 000c 0003
CPU3 attaching sched-domain:
 domain 0: span 000c
  groups: 0008 0004
 domain 1: span 000f
  groups: 000c 0003

If I run 2 tasks with CPU affinity set to 0x5, there are situations where
cpu0 ends up with a run queue length of 2 while cpu2 sits idle. The kernel
load balancer is unable to balance these two tasks across cpu0 and cpu2
because at least three pieces of logic in find_busiest_group() heavily bias
load balancing towards power-saving mode. For example, when determining the
"busiest" variable, the kernel only sets it when
"sum_nr_running > group_capacity". This test is flawed because
"sum_nr_running" is not necessarily the same as the number of tasks allowed
to run within the sched group.

The end result is that the kernel "thinks" everything is balanced, but in
reality we have an imbalance, which causes one CPU to be over-subscribed
while another is left idle. Two other pieces of logic in the same function
cause a similar effect. The nastiness of this bug is that the kernel cannot
get unstuck from this unfortunate broken state. From what we've seen in our
environment, the kernel stays stuck in the imbalanced state for extended
periods of time, and it is also very easy to get the kernel into that state
(it's pretty much 100% reproducible for us).

The proposed fix: add additional logic in find_busiest_group() to detect
intrinsic imbalance within the busiest group. When such a condition is
detected, load balancing goes into spread mode instead of the default
grouping mode.

Signed-off-by: Ken Chen
Signed-off-by: Ingo Molnar
---
 kernel/sched.c | 23 +++++++++++++++++++----
 1 file changed, 19 insertions(+), 4 deletions(-)

diff --git a/kernel/sched.c b/kernel/sched.c
index 5e220bf7d4c4..975436435b42 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -2336,7 +2336,7 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
 	unsigned long max_pull;
 	unsigned long busiest_load_per_task, busiest_nr_running;
 	unsigned long this_load_per_task, this_nr_running;
-	int load_idx;
+	int load_idx, group_imb = 0;
 #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
 	int power_savings_balance = 1;
 	unsigned long leader_nr_running = 0, min_load_per_task = 0;
@@ -2355,9 +2355,10 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
 	load_idx = sd->idle_idx;
 
 	do {
-		unsigned long load, group_capacity;
+		unsigned long load, group_capacity, max_cpu_load, min_cpu_load;
 		int local_group;
 		int i;
+		int __group_imb = 0;
 		unsigned int balance_cpu = -1, first_idle_cpu = 0;
 		unsigned long sum_nr_running, sum_weighted_load;
 
@@ -2368,6 +2369,8 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
 
 		/* Tally up the load of all CPUs in the group */
 		sum_weighted_load = sum_nr_running = avg_load = 0;
+		max_cpu_load = 0;
+		min_cpu_load = ~0UL;
 
 		for_each_cpu_mask(i, group->cpumask) {
 			struct rq *rq;
@@ -2388,8 +2391,13 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
 				}
 
 				load = target_load(i, load_idx);
-			} else
+			} else {
 				load = source_load(i, load_idx);
+				if (load > max_cpu_load)
+					max_cpu_load = load;
+				if (min_cpu_load > load)
+					min_cpu_load = load;
+			}
 
 			avg_load += load;
 			sum_nr_running += rq->nr_running;
@@ -2415,6 +2423,9 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
 		avg_load = sg_div_cpu_power(group,
 				avg_load * SCHED_LOAD_SCALE);
 
+		if ((max_cpu_load - min_cpu_load) > SCHED_LOAD_SCALE)
+			__group_imb = 1;
+
 		group_capacity = group->__cpu_power / SCHED_LOAD_SCALE;
 
 		if (local_group) {
@@ -2423,11 +2434,12 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
 			this_nr_running = sum_nr_running;
 			this_load_per_task = sum_weighted_load;
 		} else if (avg_load > max_load &&
-			   sum_nr_running > group_capacity) {
+			   (sum_nr_running > group_capacity || __group_imb)) {
 			max_load = avg_load;
 			busiest = group;
 			busiest_nr_running = sum_nr_running;
 			busiest_load_per_task = sum_weighted_load;
+			group_imb = __group_imb;
 		}
 
 #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
@@ -2499,6 +2511,9 @@ group_next:
 		goto out_balanced;
 
 	busiest_load_per_task /= busiest_nr_running;
+	if (group_imb)
+		busiest_load_per_task = min(busiest_load_per_task, avg_load);
+
 	/*
 	 * We're trying to get all the cpus to the average_load, so we don't
 	 * want to push ourselves above the average load, nor do we wish to
--
cgit v1.2.2
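The core of the new test is small enough to demonstrate on its own. The
sketch below is an illustrative, self-contained userspace program, not the
kernel code: it tracks the maximum and minimum per-CPU load inside one
group and flags the group as internally imbalanced when the spread exceeds
one load-scale unit. LOAD_SCALE and the example load values are made-up
numbers for demonstration only.

#include <stdio.h>

#define LOAD_SCALE 1024UL

int main(void)
{
	/* Example per-CPU loads within one sched group: one busy CPU,
	 * one idle CPU, as in the cpu0/cpu2 scenario described above. */
	unsigned long load[] = { 2 * LOAD_SCALE, 0 };
	unsigned long max_cpu_load = 0, min_cpu_load = ~0UL;
	int group_imb = 0;
	size_t i;

	for (i = 0; i < sizeof(load) / sizeof(load[0]); i++) {
		if (load[i] > max_cpu_load)
			max_cpu_load = load[i];
		if (min_cpu_load > load[i])
			min_cpu_load = load[i];
	}

	/* Same test as the patch: a spread above one scale unit suggests
	 * the group is busy on one CPU and idle on another, so it should
	 * be treated as busiest even if sum_nr_running <= group_capacity. */
	if ((max_cpu_load - min_cpu_load) > LOAD_SCALE)
		group_imb = 1;

	printf("group_imb = %d\n", group_imb);
	return 0;
}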
From b1a8c172c318534b96d0f0f1aecdad3898118b98 Mon Sep 17 00:00:00 2001
From: Dhaval Giani
Date: Wed, 17 Oct 2007 16:55:11 +0200
Subject: sched: fix !SYSFS build breakage

When CONFIG_SYSFS is not set, CONFIG_FAIR_USER_SCHED fails to build with:

kernel/built-in.o: In function `uids_kobject_init':
(.init.text+0x1488): undefined reference to `kernel_subsys'
kernel/built-in.o: In function `uids_kobject_init':
(.init.text+0x1490): undefined reference to `kernel_subsys'
kernel/built-in.o: In function `uids_kobject_init':
(.init.text+0x1480): undefined reference to `kernel_subsys'
kernel/built-in.o: In function `uids_kobject_init':
(.init.text+0x1494): undefined reference to `kernel_subsys'

This patch fixes the build error.

Signed-off-by: Srivatsa Vaddagiri
Signed-off-by: Dhaval Giani
Signed-off-by: Ingo Molnar
---
 include/linux/sched.h |  2 ++
 kernel/user.c         | 23 +++++++++++++++--------
 2 files changed, 17 insertions(+), 8 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 592e3a55f818..4ac7d51ad0e1 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -530,10 +530,12 @@ struct user_struct {
 
 #ifdef CONFIG_FAIR_USER_SCHED
 	struct task_group *tg;
+#ifdef CONFIG_SYSFS
 	struct kset kset;
 	struct subsys_attribute user_attr;
 	struct work_struct work;
 #endif
+#endif
 };
 
 #ifdef CONFIG_FAIR_USER_SCHED
diff --git a/kernel/user.c b/kernel/user.c
index f0e561e6d085..7e8215d87b40 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -87,9 +87,6 @@ static inline struct user_struct *uid_hash_find(uid_t uid,
 
 #ifdef CONFIG_FAIR_USER_SCHED
 
-static struct kobject uids_kobject; /* represents /sys/kernel/uids directory */
-static DEFINE_MUTEX(uids_mutex);
-
 static void sched_destroy_user(struct user_struct *up)
 {
 	sched_destroy_group(up->tg);
@@ -111,6 +108,19 @@ static void sched_switch_user(struct task_struct *p)
 	sched_move_task(p);
 }
 
+#else /* CONFIG_FAIR_USER_SCHED */
+
+static void sched_destroy_user(struct user_struct *up) { }
+static int sched_create_user(struct user_struct *up) { return 0; }
+static void sched_switch_user(struct task_struct *p) { }
+
+#endif /* CONFIG_FAIR_USER_SCHED */
+
+#if defined(CONFIG_FAIR_USER_SCHED) && defined(CONFIG_SYSFS)
+
+static struct kobject uids_kobject; /* represents /sys/kernel/uids directory */
+static DEFINE_MUTEX(uids_mutex);
+
 static inline void uids_mutex_lock(void)
 {
 	mutex_lock(&uids_mutex);
@@ -257,11 +267,8 @@ static inline void free_user(struct user_struct *up, unsigned long flags)
 	schedule_work(&up->work);
 }
 
-#else /* CONFIG_FAIR_USER_SCHED */
+#else /* CONFIG_FAIR_USER_SCHED && CONFIG_SYSFS */
 
-static void sched_destroy_user(struct user_struct *up) { }
-static int sched_create_user(struct user_struct *up) { return 0; }
-static void sched_switch_user(struct task_struct *p) { }
 static inline int user_kobject_create(struct user_struct *up) { return 0; }
 static inline void uids_mutex_lock(void) { }
 static inline void uids_mutex_unlock(void) { }
@@ -280,7 +287,7 @@ static inline void free_user(struct user_struct *up, unsigned long flags)
 	kmem_cache_free(uid_cachep, up);
 }
 
-#endif /* CONFIG_FAIR_USER_SCHED */
+#endif
 
 /*
  * Locate the user_struct for the passed UID. If found, take a ref on it. The
--
cgit v1.2.2
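The shape of this fix is a common pattern: compile the sysfs-specific
pieces only when both options are enabled, and otherwise provide empty
stubs with the same signatures so every call site builds unchanged. Here is
a standalone C sketch of that pattern; FEATURE_FAIR_USER_SCHED and
FEATURE_SYSFS are illustrative macros, not the real kernel config symbols,
and the struct is a placeholder.

#include <stdio.h>

struct user { int uid; };

#if defined(FEATURE_FAIR_USER_SCHED) && defined(FEATURE_SYSFS)
/* Full implementation, built only when both features are enabled. */
int user_kobject_create(struct user *up)
{
	printf("creating sysfs entry for uid %d\n", up->uid);
	return 0;
}
#else
/* Feature compiled out: empty stub keeps the call sites identical. */
static inline int user_kobject_create(struct user *up)
{
	(void)up;
	return 0;
}
#endif

int main(void)
{
	struct user u = { .uid = 1000 };
	return user_kobject_create(&u);
}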
From b9dca1e0fcb696716840a3bc8f20a6941b484dbf Mon Sep 17 00:00:00 2001
From: Srivatsa Vaddagiri
Date: Wed, 17 Oct 2007 16:55:11 +0200
Subject: sched: fix new task startup crash

A child task may be added on a different cpu than the one on which its
parent is running. In that case, task_new_fair() should check whether the
newborn task's parent entity needs to be added to the cfs_rq as well.

The patch below fixes the problem in task_new_fair(); this could fix the
reported put_prev_task_fair() crashes.

Reported-by: Kamalesh Babulal
Reported-by: Andy Whitcroft
Signed-off-by: Srivatsa Vaddagiri
Signed-off-by: Ingo Molnar
---
 kernel/sched.c      | 2 +-
 kernel/sched_fair.c | 6 +-----
 2 files changed, 2 insertions(+), 6 deletions(-)

diff --git a/kernel/sched.c b/kernel/sched.c
index 975436435b42..0ec9521a8e70 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -1712,7 +1712,7 @@ void fastcall wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
 
 	p->prio = effective_prio(p);
 
-	if (!p->sched_class->task_new || !current->se.on_rq || !rq->cfs.curr) {
+	if (!p->sched_class->task_new || !current->se.on_rq) {
 		activate_task(rq, p, 0);
 	} else {
 		/*
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index a17b785d7000..166ed6db600b 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -1031,12 +1031,8 @@ static void task_new_fair(struct rq *rq, struct task_struct *p)
 		swap(curr->vruntime, se->vruntime);
 	}
 
-	update_stats_enqueue(cfs_rq, se);
-	check_spread(cfs_rq, se);
-	check_spread(cfs_rq, curr);
-	__enqueue_entity(cfs_rq, se);
-	account_entity_enqueue(cfs_rq, se);
 	se->peer_preempt = 0;
+	enqueue_task_fair(rq, p, 0);
 	resched_task(rq->curr);
 }
 
--
cgit v1.2.2
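To illustrate why routing the new task through the regular enqueue path
matters, here is a purely illustrative, self-contained sketch; it is not the
scheduler code, and the struct and field names are invented stand-ins for
the scheduling-entity hierarchy. With group scheduling, a task's entity can
have a parent group entity, and when the child lands on a CPU where that
parent entity is not queued yet, every not-yet-queued level has to be
enqueued rather than just the task's own entity, which appears to be what
switching to enqueue_task_fair() achieves here.

#include <stdio.h>

struct entity {
	const char *name;
	int on_rq;              /* already queued on its run queue?    */
	struct entity *parent;  /* group entity one level up, or NULL  */
};

static void enqueue(struct entity *se)
{
	/* Walk up the hierarchy, stopping once a level is already queued. */
	for (; se && !se->on_rq; se = se->parent) {
		se->on_rq = 1;
		printf("enqueued %s\n", se->name);
	}
}

int main(void)
{
	/* The parent group entity is not yet queued on this CPU. */
	struct entity group = { "group entity", 0, NULL };
	struct entity task  = { "task entity",  0, &group };

	/* Enqueueing only `task` would leave `group` missing; walking the
	 * chain queues both levels. */
	enqueue(&task);
	return 0;
}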