From eab172294d5e24464f332dd8e94a57a9819c81c4 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Wed, 29 Oct 2008 17:03:22 +0800 Subject: sched: cleanup for alloc_rt/fair_sched_group() Impact: cleanup Remove checking parent == NULL. It won't be NULLL, because we dynamically create sub task_group only, and sub task_group always has its parent. (root task_group is statically defined) Also replace kmalloc_node(GFP_ZERO) with kzalloc_node(). Signed-off-by: Li Zefan Signed-off-by: Ingo Molnar --- kernel/sched.c | 26 ++++++++++++-------------- 1 file changed, 12 insertions(+), 14 deletions(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index e8819bc6f462..7dd6c860773b 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -8472,7 +8472,7 @@ static int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) { struct cfs_rq *cfs_rq; - struct sched_entity *se, *parent_se; + struct sched_entity *se; struct rq *rq; int i; @@ -8488,18 +8488,17 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) for_each_possible_cpu(i) { rq = cpu_rq(i); - cfs_rq = kmalloc_node(sizeof(struct cfs_rq), - GFP_KERNEL|__GFP_ZERO, cpu_to_node(i)); + cfs_rq = kzalloc_node(sizeof(struct cfs_rq), + GFP_KERNEL, cpu_to_node(i)); if (!cfs_rq) goto err; - se = kmalloc_node(sizeof(struct sched_entity), - GFP_KERNEL|__GFP_ZERO, cpu_to_node(i)); + se = kzalloc_node(sizeof(struct sched_entity), + GFP_KERNEL, cpu_to_node(i)); if (!se) goto err; - parent_se = parent ? parent->se[i] : NULL; - init_tg_cfs_entry(tg, cfs_rq, se, i, 0, parent_se); + init_tg_cfs_entry(tg, cfs_rq, se, i, 0, parent->se[i]); } return 1; @@ -8560,7 +8559,7 @@ static int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent) { struct rt_rq *rt_rq; - struct sched_rt_entity *rt_se, *parent_se; + struct sched_rt_entity *rt_se; struct rq *rq; int i; @@ -8577,18 +8576,17 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent) for_each_possible_cpu(i) { rq = cpu_rq(i); - rt_rq = kmalloc_node(sizeof(struct rt_rq), - GFP_KERNEL|__GFP_ZERO, cpu_to_node(i)); + rt_rq = kzalloc_node(sizeof(struct rt_rq), + GFP_KERNEL, cpu_to_node(i)); if (!rt_rq) goto err; - rt_se = kmalloc_node(sizeof(struct sched_rt_entity), - GFP_KERNEL|__GFP_ZERO, cpu_to_node(i)); + rt_se = kzalloc_node(sizeof(struct sched_rt_entity), + GFP_KERNEL, cpu_to_node(i)); if (!rt_se) goto err; - parent_se = parent ? parent->rt_se[i] : NULL; - init_tg_rt_entry(tg, rt_rq, rt_se, i, 0, parent_se); + init_tg_rt_entry(tg, rt_rq, rt_se, i, 0, parent->rt_se[i]); } return 1; -- cgit v1.2.2 From 34f3a814eef8069a24e5b3ebcf27aba9dabac2ea Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Thu, 30 Oct 2008 15:23:32 +0800 Subject: sched: switch sched_features to seqfile Impact: cleanup So handling of sched_features read is simplified. Signed-off-by: Li Zefan Acked-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/sched.c | 52 ++++++++++++++++------------------------------------ 1 file changed, 16 insertions(+), 36 deletions(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index 7dd6c860773b..5419df9cc5c4 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -703,45 +703,18 @@ static __read_mostly char *sched_feat_names[] = { #undef SCHED_FEAT -static int sched_feat_open(struct inode *inode, struct file *filp) +static int sched_feat_show(struct seq_file *m, void *v) { - filp->private_data = inode->i_private; - return 0; -} - -static ssize_t -sched_feat_read(struct file *filp, char __user *ubuf, - size_t cnt, loff_t *ppos) -{ - char *buf; - int r = 0; - int len = 0; int i; for (i = 0; sched_feat_names[i]; i++) { - len += strlen(sched_feat_names[i]); - len += 4; + if (!(sysctl_sched_features & (1UL << i))) + seq_puts(m, "NO_"); + seq_printf(m, "%s ", sched_feat_names[i]); } + seq_puts(m, "\n"); - buf = kmalloc(len + 2, GFP_KERNEL); - if (!buf) - return -ENOMEM; - - for (i = 0; sched_feat_names[i]; i++) { - if (sysctl_sched_features & (1UL << i)) - r += sprintf(buf + r, "%s ", sched_feat_names[i]); - else - r += sprintf(buf + r, "NO_%s ", sched_feat_names[i]); - } - - r += sprintf(buf + r, "\n"); - WARN_ON(r >= len + 2); - - r = simple_read_from_buffer(ubuf, cnt, ppos, buf, r); - - kfree(buf); - - return r; + return 0; } static ssize_t @@ -786,10 +759,17 @@ sched_feat_write(struct file *filp, const char __user *ubuf, return cnt; } +static int sched_feat_open(struct inode *inode, struct file *filp) +{ + return single_open(filp, sched_feat_show, NULL); +} + static struct file_operations sched_feat_fops = { - .open = sched_feat_open, - .read = sched_feat_read, - .write = sched_feat_write, + .open = sched_feat_open, + .write = sched_feat_write, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, }; static __init int sched_init_debug(void) -- cgit v1.2.2 From eefd796a8e831408ce17e633d73d70430748c47a Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Tue, 4 Nov 2008 16:15:37 +0800 Subject: sched debug: remove sd_level_to_string() Impact: cleanup Just use the newly introduced sd->name. Signed-off-by: Li Zefan Acked-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/sched.c | 25 +------------------------ 1 file changed, 1 insertion(+), 24 deletions(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index 5419df9cc5c4..7ac59bae87d2 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -6602,28 +6602,6 @@ early_initcall(migration_init); #ifdef CONFIG_SCHED_DEBUG -static inline const char *sd_level_to_string(enum sched_domain_level lvl) -{ - switch (lvl) { - case SD_LV_NONE: - return "NONE"; - case SD_LV_SIBLING: - return "SIBLING"; - case SD_LV_MC: - return "MC"; - case SD_LV_CPU: - return "CPU"; - case SD_LV_NODE: - return "NODE"; - case SD_LV_ALLNODES: - return "ALLNODES"; - case SD_LV_MAX: - return "MAX"; - - } - return "MAX"; -} - static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, cpumask_t *groupmask) { @@ -6643,8 +6621,7 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, return -1; } - printk(KERN_CONT "span %s level %s\n", - str, sd_level_to_string(sd->level)); + printk(KERN_CONT "span %s level %s\n", str, sd->name); if (!cpu_isset(cpu, sd->span)) { printk(KERN_ERR "ERROR: domain->span does not contain " -- cgit v1.2.2 From a17e2260926f681a0eb983c1e3cb859ba2064bce Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Tue, 4 Nov 2008 16:19:13 +0800 Subject: sched: remove redundant call to unregister_sched_domain_sysctl() Impact: cleanup The sysctl has been unregistered by partition_sched_domains(). Signed-off-by: Li Zefan Acked-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/sched.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index 7ac59bae87d2..3cb94fad33ca 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -7691,8 +7691,6 @@ static void detach_destroy_domains(const cpumask_t *cpu_map) cpumask_t tmpmask; int i; - unregister_sched_domain_sysctl(); - for_each_cpu_mask_nr(i, *cpu_map) cpu_attach_domain(NULL, &def_root_domain, i); synchronize_sched(); -- cgit v1.2.2 From faa2f98f856e89d1afb6e4a91707284d242e816e Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Tue, 4 Nov 2008 16:20:23 +0800 Subject: sched: add sanity check in partition_sched_domains() Impact: cleanup, add debug check It's wrong to make dattr_new = NULL if doms_new == NULL, it introduces memory leak if dattr_new != NULL. Fortunately dattr_new is always NULL in this case. So remove the code and add a sanity check. Signed-off-by: Li Zefan Acked-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/sched.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index 3cb94fad33ca..213cad5e50aa 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -7767,7 +7767,7 @@ match1: ndoms_cur = 0; doms_new = &fallback_doms; cpus_andnot(doms_new[0], cpu_online_map, cpu_isolated_map); - dattr_new = NULL; + WARN_ON_ONCE(dattr_new); } /* Build new domains */ -- cgit v1.2.2 From cf7f8690e864c6fe11e77202dd847fa60f483418 Mon Sep 17 00:00:00 2001 From: Sripathi Kodi Date: Wed, 5 Nov 2008 18:57:14 +0530 Subject: sched, lockdep: inline double_unlock_balance() We have a test case which measures the variation in the amount of time needed to perform a fixed amount of work on the preempt_rt kernel. We started seeing deterioration in it's performance recently. The test should never take more than 10 microseconds, but we started 5-10% failure rate. Using elimination method, we traced the problem to commit 1b12bbc747560ea68bcc132c3d05699e52271da0 (lockdep: re-annotate scheduler runqueues). When LOCKDEP is disabled, this patch only adds an additional function call to double_unlock_balance(). Hence I inlined double_unlock_balance() and the problem went away. Here is a patch to make this change. Signed-off-by: Sripathi Kodi Acked-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/sched.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index e8819bc6f462..ad10d0aae1d7 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2825,7 +2825,7 @@ static int double_lock_balance(struct rq *this_rq, struct rq *busiest) return ret; } -static void double_unlock_balance(struct rq *this_rq, struct rq *busiest) +static inline void double_unlock_balance(struct rq *this_rq, struct rq *busiest) __releases(busiest->lock) { spin_unlock(&busiest->lock); -- cgit v1.2.2 From 6d21cd62516a9697cb7ec33cc52e6b814fb65a13 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Fri, 7 Nov 2008 17:03:18 +0800 Subject: sched: clean up SCHED_CPUMASK_ALLOC Impact: cleanup The #if/#endif is ugly. Change SCHED_CPUMASK_ALLOC and SCHED_CPUMASK_FREE to static inline functions. Signed-off-by: Li Zefan Acked-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/sched.c | 29 ++++++++++++++++++----------- 1 file changed, 18 insertions(+), 11 deletions(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index b24e57a10f6f..59db86c915f9 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -7279,13 +7279,21 @@ struct allmasks { }; #if NR_CPUS > 128 -#define SCHED_CPUMASK_ALLOC 1 -#define SCHED_CPUMASK_FREE(v) kfree(v) -#define SCHED_CPUMASK_DECLARE(v) struct allmasks *v +#define SCHED_CPUMASK_DECLARE(v) struct allmasks *v +static inline void sched_cpumask_alloc(struct allmasks **masks) +{ + *masks = kmalloc(sizeof(**masks), GFP_KERNEL); +} +static inline void sched_cpumask_free(struct allmasks *masks) +{ + kfree(masks); +} #else -#define SCHED_CPUMASK_ALLOC 0 -#define SCHED_CPUMASK_FREE(v) -#define SCHED_CPUMASK_DECLARE(v) struct allmasks _v, *v = &_v +#define SCHED_CPUMASK_DECLARE(v) struct allmasks _v, *v = &_v +static inline void sched_cpumask_alloc(struct allmasks **masks) +{ } +static inline void sched_cpumask_free(struct allmasks *masks) +{ } #endif #define SCHED_CPUMASK_VAR(v, a) cpumask_t *v = (cpumask_t *) \ @@ -7361,9 +7369,8 @@ static int __build_sched_domains(const cpumask_t *cpu_map, return -ENOMEM; } -#if SCHED_CPUMASK_ALLOC /* get space for all scratch cpumask variables */ - allmasks = kmalloc(sizeof(*allmasks), GFP_KERNEL); + sched_cpumask_alloc(&allmasks); if (!allmasks) { printk(KERN_WARNING "Cannot alloc cpumask array\n"); kfree(rd); @@ -7372,7 +7379,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map, #endif return -ENOMEM; } -#endif + tmpmask = (cpumask_t *)allmasks; @@ -7626,13 +7633,13 @@ static int __build_sched_domains(const cpumask_t *cpu_map, cpu_attach_domain(sd, rd, i); } - SCHED_CPUMASK_FREE((void *)allmasks); + sched_cpumask_free(allmasks); return 0; #ifdef CONFIG_NUMA error: free_sched_groups(cpu_map, tmpmask); - SCHED_CPUMASK_FREE((void *)allmasks); + sched_cpumask_free(allmasks); kfree(rd); return -ENOMEM; #endif -- cgit v1.2.2 From 934352f214b3251eb0793c1209d346595a661d80 Mon Sep 17 00:00:00 2001 From: Bharata B Rao Date: Mon, 10 Nov 2008 20:41:13 +0530 Subject: sched: add hierarchical accounting to cpu accounting controller Impact: improve CPU time accounting of tasks under the cpu accounting controller Add hierarchical accounting to cpu accounting controller and include cpuacct documentation. Currently, while charging the task's cputime to its accounting group, the accounting group hierarchy isn't updated. This patch charges the cputime of a task to its accounting group and all its parent accounting groups. Reported-by: Srivatsa Vaddagiri Signed-off-by: Bharata B Rao Reviewed-by: Paul Menage Acked-by: Balbir Singh Acked-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/sched.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index 59db86c915f9..ebaf432365f6 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -9196,11 +9196,12 @@ struct cgroup_subsys cpu_cgroup_subsys = { * (balbir@in.ibm.com). */ -/* track cpu usage of a group of tasks */ +/* track cpu usage of a group of tasks and its child groups */ struct cpuacct { struct cgroup_subsys_state css; /* cpuusage holds pointer to a u64-type object on every cpu */ u64 *cpuusage; + struct cpuacct *parent; }; struct cgroup_subsys cpuacct_subsys; @@ -9234,6 +9235,9 @@ static struct cgroup_subsys_state *cpuacct_create( return ERR_PTR(-ENOMEM); } + if (cgrp->parent) + ca->parent = cgroup_ca(cgrp->parent); + return &ca->css; } @@ -9313,14 +9317,16 @@ static int cpuacct_populate(struct cgroup_subsys *ss, struct cgroup *cgrp) static void cpuacct_charge(struct task_struct *tsk, u64 cputime) { struct cpuacct *ca; + int cpu; if (!cpuacct_subsys.active) return; + cpu = task_cpu(tsk); ca = task_ca(tsk); - if (ca) { - u64 *cpuusage = percpu_ptr(ca->cpuusage, task_cpu(tsk)); + for (; ca; ca = ca->parent) { + u64 *cpuusage = percpu_ptr(ca->cpuusage, cpu); *cpuusage += cputime; } } -- cgit v1.2.2 From ec4e0e2fe018992d980910db901637c814575914 Mon Sep 17 00:00:00 2001 From: Ken Chen Date: Tue, 18 Nov 2008 22:41:57 -0800 Subject: sched: fix inconsistency when redistribute per-cpu tg->cfs_rq shares Impact: make load-balancing more consistent In the update_shares() path leading to tg_shares_up(), the calculation of per-cpu cfs_rq shares is rather erratic even under moderate task wake up rate. The problem is that the per-cpu tg->cfs_rq load weight used in the sd_rq_weight aggregation and actual redistribution of the cfs_rq->shares are collected at different time. Under moderate system load, we've seen quite a bit of variation on the cfs_rq->shares and ultimately wildly affects sched_entity's load weight. This patch caches the result of initial per-cpu load weight when doing the sum calculation, and then pass it down to update_group_shares_cpu() for redistributing per-cpu cfs_rq shares. This allows consistent total cfs_rq shares across all CPUs. It also simplifies the rounding and zero load weight check. Signed-off-by: Ken Chen Acked-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/sched.c | 41 +++++++++++++++-------------------------- 1 file changed, 15 insertions(+), 26 deletions(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index a4c156d9a4a5..93bfb086e60f 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1453,27 +1453,13 @@ static void update_group_shares_cpu(struct task_group *tg, int cpu, unsigned long sd_shares, unsigned long sd_rq_weight) { - int boost = 0; unsigned long shares; unsigned long rq_weight; if (!tg->se[cpu]) return; - rq_weight = tg->cfs_rq[cpu]->load.weight; - - /* - * If there are currently no tasks on the cpu pretend there is one of - * average load so that when a new task gets to run here it will not - * get delayed by group starvation. - */ - if (!rq_weight) { - boost = 1; - rq_weight = NICE_0_LOAD; - } - - if (unlikely(rq_weight > sd_rq_weight)) - rq_weight = sd_rq_weight; + rq_weight = tg->cfs_rq[cpu]->rq_weight; /* * \Sum shares * rq_weight @@ -1481,7 +1467,7 @@ update_group_shares_cpu(struct task_group *tg, int cpu, * \Sum rq_weight * */ - shares = (sd_shares * rq_weight) / (sd_rq_weight + 1); + shares = (sd_shares * rq_weight) / sd_rq_weight; shares = clamp_t(unsigned long, shares, MIN_SHARES, MAX_SHARES); if (abs(shares - tg->se[cpu]->load.weight) > @@ -1490,11 +1476,7 @@ update_group_shares_cpu(struct task_group *tg, int cpu, unsigned long flags; spin_lock_irqsave(&rq->lock, flags); - /* - * record the actual number of shares, not the boosted amount. - */ - tg->cfs_rq[cpu]->shares = boost ? 0 : shares; - tg->cfs_rq[cpu]->rq_weight = rq_weight; + tg->cfs_rq[cpu]->shares = shares; __set_se_shares(tg->se[cpu], shares); spin_unlock_irqrestore(&rq->lock, flags); @@ -1508,13 +1490,23 @@ update_group_shares_cpu(struct task_group *tg, int cpu, */ static int tg_shares_up(struct task_group *tg, void *data) { - unsigned long rq_weight = 0; + unsigned long weight, rq_weight = 0; unsigned long shares = 0; struct sched_domain *sd = data; int i; for_each_cpu_mask(i, sd->span) { - rq_weight += tg->cfs_rq[i]->load.weight; + /* + * If there are currently no tasks on the cpu pretend there + * is one of average load so that when a new task gets to + * run here it will not get delayed by group starvation. + */ + weight = tg->cfs_rq[i]->load.weight; + if (!weight) + weight = NICE_0_LOAD; + + tg->cfs_rq[i]->rq_weight = weight; + rq_weight += weight; shares += tg->cfs_rq[i]->shares; } @@ -1524,9 +1516,6 @@ static int tg_shares_up(struct task_group *tg, void *data) if (!sd->parent || !(sd->parent->flags & SD_LOAD_BALANCE)) shares = tg->shares; - if (!rq_weight) - rq_weight = cpus_weight(sd->span) * NICE_0_LOAD; - for_each_cpu_mask(i, sd->span) update_group_shares_cpu(tg, i, shares, rq_weight); -- cgit v1.2.2 From 957ad0166e9f76a8561dafa5e14ef5bd3f5e9a3b Mon Sep 17 00:00:00 2001 From: Vegard Nossum Date: Fri, 21 Nov 2008 01:30:36 +0100 Subject: sched: update comment for move_task_off_dead_cpu Impact: cleanup This commit: commit f7b4cddcc5aca03e80e357360c9424dfba5056c2 Author: Oleg Nesterov Date: Tue Oct 16 23:30:56 2007 -0700 do CPU_DEAD migrating under read_lock(tasklist) instead of write_lock_irq(ta Currently move_task_off_dead_cpu() is called under write_lock_irq(tasklist). This means it can't use task_lock() which is needed to improve migrating to take task's ->cpuset into account. Change the code to call move_task_off_dead_cpu() with irqs enabled, and change migrate_live_tasks() to use read_lock(tasklist). ...forgot to update the comment in front of move_task_off_dead_cpu. Reference: http://lkml.org/lkml/2008/6/23/135 Signed-off-by: Vegard Nossum Signed-off-by: Ingo Molnar --- kernel/sched.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index 93bfb086e60f..a6085d5166dd 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -6094,7 +6094,6 @@ static int __migrate_task_irq(struct task_struct *p, int src_cpu, int dest_cpu) /* * Figure out where task on dead CPU should go, use force if necessary. - * NOTE: interrupts should be disabled by the caller */ static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p) { -- cgit v1.2.2 From 70574a996fc7a70c5586eb56bd92a544eccf18b6 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Fri, 28 Nov 2008 22:08:00 +0300 Subject: sched: move double_unlock_balance() higher Move double_lock_balance()/double_unlock_balance() higher to fix the following with gcc-3.4.6: CC kernel/sched.o In file included from kernel/sched.c:1605: kernel/sched_rt.c: In function `find_lock_lowest_rq': kernel/sched_rt.c:914: sorry, unimplemented: inlining failed in call to 'double_unlock_balance': function body not available kernel/sched_rt.c:1077: sorry, unimplemented: called from here make[2]: *** [kernel/sched.o] Error 1 Signed-off-by: Alexey Dobriyan Signed-off-by: Ingo Molnar --- kernel/sched.c | 67 +++++++++++++++++++++++++++++----------------------------- 1 file changed, 33 insertions(+), 34 deletions(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index 3d1ee429219b..6a99703e0eb0 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1581,6 +1581,39 @@ static inline void update_shares_locked(struct rq *rq, struct sched_domain *sd) #endif +/* + * double_lock_balance - lock the busiest runqueue, this_rq is locked already. + */ +static int double_lock_balance(struct rq *this_rq, struct rq *busiest) + __releases(this_rq->lock) + __acquires(busiest->lock) + __acquires(this_rq->lock) +{ + int ret = 0; + + if (unlikely(!irqs_disabled())) { + /* printk() doesn't work good under rq->lock */ + spin_unlock(&this_rq->lock); + BUG_ON(1); + } + if (unlikely(!spin_trylock(&busiest->lock))) { + if (busiest < this_rq) { + spin_unlock(&this_rq->lock); + spin_lock(&busiest->lock); + spin_lock_nested(&this_rq->lock, SINGLE_DEPTH_NESTING); + ret = 1; + } else + spin_lock_nested(&busiest->lock, SINGLE_DEPTH_NESTING); + } + return ret; +} + +static inline void double_unlock_balance(struct rq *this_rq, struct rq *busiest) + __releases(busiest->lock) +{ + spin_unlock(&busiest->lock); + lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_); +} #endif #ifdef CONFIG_FAIR_GROUP_SCHED @@ -2780,40 +2813,6 @@ static void double_rq_unlock(struct rq *rq1, struct rq *rq2) __release(rq2->lock); } -/* - * double_lock_balance - lock the busiest runqueue, this_rq is locked already. - */ -static int double_lock_balance(struct rq *this_rq, struct rq *busiest) - __releases(this_rq->lock) - __acquires(busiest->lock) - __acquires(this_rq->lock) -{ - int ret = 0; - - if (unlikely(!irqs_disabled())) { - /* printk() doesn't work good under rq->lock */ - spin_unlock(&this_rq->lock); - BUG_ON(1); - } - if (unlikely(!spin_trylock(&busiest->lock))) { - if (busiest < this_rq) { - spin_unlock(&this_rq->lock); - spin_lock(&busiest->lock); - spin_lock_nested(&this_rq->lock, SINGLE_DEPTH_NESTING); - ret = 1; - } else - spin_lock_nested(&busiest->lock, SINGLE_DEPTH_NESTING); - } - return ret; -} - -static inline void double_unlock_balance(struct rq *this_rq, struct rq *busiest) - __releases(busiest->lock) -{ - spin_unlock(&busiest->lock); - lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_); -} - /* * If dest_cpu is allowed for this process, migrate the task to it. * This is accomplished by forcing the cpu_allowed mask to only -- cgit v1.2.2 From 6c415b9234a8c71f290e5d4fddc467f103f32719 Mon Sep 17 00:00:00 2001 From: Arun R Bharadwaj Date: Mon, 1 Dec 2008 20:49:05 +0530 Subject: sched: add uid information to sched_debug for CONFIG_USER_SCHED Impact: extend information in /proc/sched_debug This patch adds uid information in sched_debug for CONFIG_USER_SCHED Signed-off-by: Arun R Bharadwaj Acked-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/sched.c | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index 6a99703e0eb0..4c7388ef5be7 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -261,6 +261,10 @@ struct task_group { struct cgroup_subsys_state css; #endif +#ifdef CONFIG_USER_SCHED + uid_t uid; +#endif + #ifdef CONFIG_FAIR_GROUP_SCHED /* schedulable entities of this group on each cpu */ struct sched_entity **se; @@ -286,6 +290,12 @@ struct task_group { #ifdef CONFIG_USER_SCHED +/* Helper function to pass uid information to create_sched_user() */ +void set_tg_uid(struct user_struct *user) +{ + user->tg->uid = user->uid; +} + /* * Root task group. * Every UID task group (including init_task_group aka UID-0) will -- cgit v1.2.2 From 5436499e6098759c2340f8b906ea52f993dc4efb Mon Sep 17 00:00:00 2001 From: Ken Chen Date: Sun, 7 Dec 2008 18:47:37 -0800 Subject: sched: fix sd_parent_degenerate on non-numa smp machine Impact: optimize the sched domains tree some more The addition of SD_SERIALIZE flag added to SD_NODE_INIT prevented top level dummy numa sched_domain to be properly degenerated on non-numa smp machine. The reason is that in sd_parent_degenerate(), it found that the child and parent does not have comon sched_domain flags due to SD_SERIALIZE. However, for non-numa smp box, the top level is a dummy with a single sched_group. Filter out SD_SERIALIZE if it is on non-numa machine to properly degenerate top level node sched_domain. this will cut back some of the sd domain walk in the load balancer code. Signed-off-by: Ken Chen Acked-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/sched.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index 152828239ef0..74498c840f93 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -6768,6 +6768,8 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent) SD_BALANCE_EXEC | SD_SHARE_CPUPOWER | SD_SHARE_PKG_RESOURCES); + if (nr_node_ids == 1) + pflags &= ~SD_SERIALIZE; } if (~cflags & pflags) return 0; -- cgit v1.2.2 From efbe027e95dc13ac343b6130948418d7ead7ddf1 Mon Sep 17 00:00:00 2001 From: Vaidyanathan Srinivasan Date: Mon, 8 Dec 2008 20:52:49 +0530 Subject: sched: idle_balance() does not call load_balance_newidle() Impact: fix SD_BALANCE_NEWIDLEand broaden its use load_balance_newidle() does not get called if SD_BALANCE_NEWIDLE is set at higher level domain (3-CPU) and not in low level domain (2-MC). pulled_task is initialised to -1 and checked for non-zero which is always true if the lowest level sched_domain does not have SD_BALANCE_NEWIDLE flag set. Signed-off-by: Vaidyanathan Srinivasan Signed-off-by: Ingo Molnar --- kernel/sched.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index 74498c840f93..bb9c6384d077 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -3685,7 +3685,7 @@ out_balanced: static void idle_balance(int this_cpu, struct rq *this_rq) { struct sched_domain *sd; - int pulled_task = -1; + int pulled_task = 0; unsigned long next_balance = jiffies + HZ; cpumask_t tmpmask; -- cgit v1.2.2 From ee79d1bdb6a10499e53f80b1e8d14110215178ba Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Tue, 9 Dec 2008 18:49:50 +0100 Subject: sched: let arch_update_cpu_topology indicate if topology changed Change arch_update_cpu_topology so it returns 1 if the cpu topology changed and 0 if it didn't change. This will be useful for the next patch which adds a call to this function in partition_sched_domains. Signed-off-by: Heiko Carstens Signed-off-by: Ingo Molnar --- kernel/sched.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index ef212da928e8..fcfbbd9dbd60 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -7675,8 +7675,14 @@ static struct sched_domain_attr *dattr_cur; */ static cpumask_t fallback_doms; -void __attribute__((weak)) arch_update_cpu_topology(void) +/* + * arch_update_cpu_topology lets virtualized architectures update the + * cpu core maps. It is supposed to return 1 if the topology changed + * or 0 if it stayed the same. + */ +int __attribute__((weak)) arch_update_cpu_topology(void) { + return 0; } /* -- cgit v1.2.2 From d65bd5ecb2bd166cea4952a59b7e16cc3ad6ef6c Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Tue, 9 Dec 2008 18:49:51 +0100 Subject: sched: add missing arch_update_cpu_topology() call arch_reinit_sched_domains() used to call arch_update_cpu_topology() via arch_init_sched_domains(). This call got lost with e761b7725234276a802322549cee5255305a0930 ("cpu hotplug, sched: Introduce cpu_active_map and redo sched domain managment (take 2)". So we might end up with outdated and missing cpus in the cpu core maps (architecture used to call arch_reinit_sched_domains if cpu topology changed). This adds a call to arch_update_cpu_topology in partition_sched_domains which gets called whenever scheduling domains get updated. Which is what is supposed to happen when cpu topology changes. Signed-off-by: Heiko Carstens Signed-off-by: Ingo Molnar --- kernel/sched.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index fcfbbd9dbd60..ad7b93be5691 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -7774,17 +7774,21 @@ void partition_sched_domains(int ndoms_new, cpumask_t *doms_new, struct sched_domain_attr *dattr_new) { int i, j, n; + int new_topology; mutex_lock(&sched_domains_mutex); /* always unregister in case we don't destroy any domains */ unregister_sched_domain_sysctl(); + /* Let architecture update cpu core mappings. */ + new_topology = arch_update_cpu_topology(); + n = doms_new ? ndoms_new : 0; /* Destroy deleted domains */ for (i = 0; i < ndoms_cur; i++) { - for (j = 0; j < n; j++) { + for (j = 0; j < n && !new_topology; j++) { if (cpus_equal(doms_cur[i], doms_new[j]) && dattrs_equal(dattr_cur, i, dattr_new, j)) goto match1; @@ -7804,7 +7808,7 @@ match1: /* Build new domains */ for (i = 0; i < ndoms_new; i++) { - for (j = 0; j < ndoms_cur; j++) { + for (j = 0; j < ndoms_cur && !new_topology; j++) { if (cpus_equal(doms_new[i], doms_cur[j]) && dattrs_equal(dattr_new, i, dattr_cur, j)) goto match2; -- cgit v1.2.2 From 03e89e4574a680af15f59329b061f35d9813aff4 Mon Sep 17 00:00:00 2001 From: Mike Galbraith Date: Tue, 16 Dec 2008 08:45:30 +0100 Subject: sched: fix wakeup preemption clock Impact: sharpen the wakeup-granularity to always be against current scheduler time It was possible to do the preemption check against an old time stamp. Signed-off-by: Mike Galbraith Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/sched.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index ad7b93be5691..88215066efae 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2266,6 +2266,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync) smp_wmb(); rq = task_rq_lock(p, &flags); + update_rq_clock(rq); old_state = p->state; if (!(old_state & state)) goto out; @@ -2323,7 +2324,6 @@ out_activate: schedstat_inc(p, se.nr_wakeups_local); else schedstat_inc(p, se.nr_wakeups_remote); - update_rq_clock(rq); activate_task(rq, p, 1); success = 1; -- cgit v1.2.2 From 720f54988e17b33f3f477010b3a68ee872d20d5a Mon Sep 17 00:00:00 2001 From: Ken Chen Date: Mon, 15 Dec 2008 22:02:01 -0800 Subject: sched, cpuacct: refactoring cpuusage_read / cpuusage_write Impact: micro-optimize the code on 64-bit architectures In the thread regarding to 'export percpu cpuacct cgroup stats' http://lkml.org/lkml/2008/12/7/13 akpm pointed out that current cpuacct code is inefficient. This patch refactoring the following: * make cpu_rq locking only on 32-bit * change iterator to each_present_cpu instead of each_possible_cpu to make it hotplug friendly. It's a bit of code churn, but I was rewarded with 160 byte code size saving on x86-64 arch and zero code size change on i386. Signed-off-by: Ken Chen Cc: Paul Menage Cc: Li Zefan Signed-off-by: Ingo Molnar --- kernel/sched.c | 56 +++++++++++++++++++++++++++++++++++++++----------------- 1 file changed, 39 insertions(+), 17 deletions(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index 88215066efae..41b7e2d524d6 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -9275,6 +9275,41 @@ cpuacct_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp) kfree(ca); } +static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu) +{ + u64 *cpuusage = percpu_ptr(ca->cpuusage, cpu); + u64 data; + +#ifndef CONFIG_64BIT + /* + * Take rq->lock to make 64-bit read safe on 32-bit platforms. + */ + spin_lock_irq(&cpu_rq(cpu)->lock); + data = *cpuusage; + spin_unlock_irq(&cpu_rq(cpu)->lock); +#else + data = *cpuusage; +#endif + + return data; +} + +static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val) +{ + u64 *cpuusage = percpu_ptr(ca->cpuusage, cpu); + +#ifndef CONFIG_64BIT + /* + * Take rq->lock to make 64-bit write safe on 32-bit platforms. + */ + spin_lock_irq(&cpu_rq(cpu)->lock); + *cpuusage = val; + spin_unlock_irq(&cpu_rq(cpu)->lock); +#else + *cpuusage = val; +#endif +} + /* return total cpu usage (in nanoseconds) of a group */ static u64 cpuusage_read(struct cgroup *cgrp, struct cftype *cft) { @@ -9282,17 +9317,8 @@ static u64 cpuusage_read(struct cgroup *cgrp, struct cftype *cft) u64 totalcpuusage = 0; int i; - for_each_possible_cpu(i) { - u64 *cpuusage = percpu_ptr(ca->cpuusage, i); - - /* - * Take rq->lock to make 64-bit addition safe on 32-bit - * platforms. - */ - spin_lock_irq(&cpu_rq(i)->lock); - totalcpuusage += *cpuusage; - spin_unlock_irq(&cpu_rq(i)->lock); - } + for_each_present_cpu(i) + totalcpuusage += cpuacct_cpuusage_read(ca, i); return totalcpuusage; } @@ -9309,13 +9335,9 @@ static int cpuusage_write(struct cgroup *cgrp, struct cftype *cftype, goto out; } - for_each_possible_cpu(i) { - u64 *cpuusage = percpu_ptr(ca->cpuusage, i); + for_each_present_cpu(i) + cpuacct_cpuusage_write(ca, i, 0); - spin_lock_irq(&cpu_rq(i)->lock); - *cpuusage = 0; - spin_unlock_irq(&cpu_rq(i)->lock); - } out: return err; } -- cgit v1.2.2 From e9515c3c9feecd74174c2998add0db51e02abb8d Mon Sep 17 00:00:00 2001 From: Ken Chen Date: Mon, 15 Dec 2008 22:04:15 -0800 Subject: sched, cpuacct: export percpu cpuacct cgroup stats This patch export per-cpu CPU cycle usage for a given cpuacct cgroup. There is a need for a user space monitor daemon to track group CPU usage on per-cpu base. It is also useful for monitoring CFS load balancer behavior by tracking per CPU group usage. Signed-off-by: Ken Chen Reviewed-by: Li Zefan Reviewed-by: Andrew Morton Signed-off-by: Ingo Molnar --- kernel/sched.c | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index 41b7e2d524d6..f53e2b8ef521 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -9342,12 +9342,32 @@ out: return err; } +static int cpuacct_percpu_seq_read(struct cgroup *cgroup, struct cftype *cft, + struct seq_file *m) +{ + struct cpuacct *ca = cgroup_ca(cgroup); + u64 percpu; + int i; + + for_each_present_cpu(i) { + percpu = cpuacct_cpuusage_read(ca, i); + seq_printf(m, "%llu ", (unsigned long long) percpu); + } + seq_printf(m, "\n"); + return 0; +} + static struct cftype files[] = { { .name = "usage", .read_u64 = cpuusage_read, .write_u64 = cpuusage_write, }, + { + .name = "usage_percpu", + .read_seq_string = cpuacct_percpu_seq_read, + }, + }; static int cpuacct_populate(struct cgroup_subsys *ss, struct cgroup *cgrp) -- cgit v1.2.2 From 9c2c48020ec0dd6ecd27e5a1298f73b40d85a595 Mon Sep 17 00:00:00 2001 From: Ken Chen Date: Tue, 16 Dec 2008 23:41:22 -0800 Subject: schedstat: consolidate per-task cpu runtime stats Impact: simplify code When we turn on CONFIG_SCHEDSTATS, per-task cpu runtime is accumulated twice. Once in task->se.sum_exec_runtime and once in sched_info.cpu_time. These two stats are exactly the same. Given that task->se.sum_exec_runtime is always accumulated by the core scheduler, sched_info can reuse that data instead of duplicate the accounting. Signed-off-by: Ken Chen Acked-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/sched.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index f53e2b8ef521..fd835fc320b8 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -596,6 +596,8 @@ struct rq { #ifdef CONFIG_SCHEDSTATS /* latency stats */ struct sched_info rq_sched_info; + unsigned long long rq_cpu_time; + /* could above be rq->cfs_rq.exec_clock + rq->rt_rq.rt_runtime ? */ /* sys_sched_yield() stats */ unsigned int yld_exp_empty; -- cgit v1.2.2