author     Linus Torvalds <torvalds@linux-foundation.org>   2010-10-21 15:55:43 -0400
committer  Linus Torvalds <torvalds@linux-foundation.org>   2010-10-21 15:55:43 -0400
commit     bc4016f48161454a9a8e5eb209b0693c6cde9f62 (patch)
tree       f470f5d711e975b152eec90282f5dd30a1d5dba5 /kernel/sched.c
parent     5d70f79b5ef6ea2de4f72a37b2d96e2601e40a22 (diff)
parent     b7dadc38797584f6203386da1947ed5edf516646 (diff)
Merge branch 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip
* 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip: (29 commits)
sched: Export account_system_vtime()
sched: Call tick_check_idle before __irq_enter
sched: Remove irq time from available CPU power
sched: Do not account irq time to current task
x86: Add IRQ_TIME_ACCOUNTING
sched: Add IRQ_TIME_ACCOUNTING, finer accounting of irq time
sched: Add a PF flag for ksoftirqd identification
sched: Consolidate account_system_vtime extern declaration
sched: Fix softirq time accounting
sched: Drop group_capacity to 1 only if local group has extra capacity
sched: Force balancing on newidle balance if local group has capacity
sched: Set group_imb only a task can be pulled from the busiest cpu
sched: Do not consider SCHED_IDLE tasks to be cache hot
sched: Drop all load weight manipulation for RT tasks
sched: Create special class for stop/migrate work
sched: Unindent labels
sched: Comment updates: fix default latency and granularity numbers
tracing/sched: Add sched_pi_setprio tracepoint
sched: Give CPU bound RT tasks preference
sched: Try not to migrate higher priority RT tasks
...
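
The IRQ_TIME_ACCOUNTING commits above keep per-CPU counters of time spent in hardirq and softirq context and subtract that time from the runqueue clock, so the new rq->clock_task only advances by time the current task could actually use (see the update_rq_clock() hunk below). A minimal stand-alone sketch of that bookkeeping, in plain C with invented names rather than the kernel's per-cpu machinery:

#include <stdint.h>
#include <stdio.h>

/* Toy stand-ins for one CPU's per-cpu counters (names invented here). */
static uint64_t hardirq_time, softirq_time;   /* accumulated irq time, ns */
static uint64_t clock_ns, clock_task_ns;      /* analogues of rq->clock / rq->clock_task */

/*
 * On every clock update, clock_task advances by wall time minus newly
 * accumulated irq time, and is never allowed to go backwards.
 */
static void update_clock(uint64_t now_ns)
{
        uint64_t irq_time = hardirq_time + softirq_time;

        clock_ns = now_ns;
        if (clock_ns - irq_time > clock_task_ns)
                clock_task_ns = clock_ns - irq_time;
}

int main(void)
{
        update_clock(1000);      /* 1000 ns of wall time, no irqs yet          */
        hardirq_time += 300;     /* 300 ns spent servicing hard interrupts     */
        update_clock(2000);      /* task-visible clock advances by only 700 ns */
        printf("clock=%llu clock_task=%llu\n",
               (unsigned long long)clock_ns,
               (unsigned long long)clock_task_ns);
        return 0;
}

Running the sketch prints clock=2000 clock_task=1700: the wall clock advanced by 2000 ns, but only 1700 ns are charged to tasks, which is the effect the "Do not account irq time to current task" commit is after.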
Diffstat (limited to 'kernel/sched.c')
-rw-r--r--   kernel/sched.c   291
1 file changed, 246 insertions, 45 deletions
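
The "Create special class for stop/migrate work" commit adds a stop_sched_class ahead of the RT class and redefines sched_class_highest, so pick_next_task() can simply walk the class list until some class returns a task (both changes appear in the diff below). A rough illustration of that chain-of-classes pattern, using made-up toy types rather than the kernel's struct sched_class:

#include <stddef.h>
#include <stdio.h>

/* Toy version of the sched_class chain: highest-priority class first,
 * each class either returns a "task" or defers to the next class. */
struct toy_class {
        const char *name;
        const char *(*pick_next)(void);
        const struct toy_class *next;
};

static const char *pick_none(void) { return NULL; }
static const char *pick_idle(void) { return "idle task"; }

static const struct toy_class idle_class = { "idle", pick_idle, NULL };
static const struct toy_class fair_class = { "fair", pick_none, &idle_class };
static const struct toy_class rt_class   = { "rt",   pick_none, &fair_class };
static const struct toy_class stop_class = { "stop", pick_none, &rt_class };

#define class_highest (&stop_class)     /* analogue of sched_class_highest */
#define for_each_class(c) \
        for (c = class_highest; c; c = c->next)

int main(void)
{
        const struct toy_class *c;
        const char *p;

        for_each_class(c) {             /* same loop shape as pick_next_task() */
                p = c->pick_next();
                if (p) {
                        printf("%s class picked: %s\n", c->name, p);
                        return 0;
                }
        }
        return 1;                       /* unreachable: the idle class always picks */
}

The stop class sits first so per-CPU stop/migration work is always chosen ahead of RT and fair tasks, and the idle class at the tail guarantees the walk terminates, which is why the rewritten pick_next_task() below can end in a BUG() that should never be reached.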
diff --git a/kernel/sched.c b/kernel/sched.c
index 5a5cc33e4999..d42992bccdfa 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -426,9 +426,7 @@ struct root_domain {
         */
        cpumask_var_t rto_mask;
        atomic_t rto_count;
-#ifdef CONFIG_SMP
        struct cpupri cpupri;
-#endif
 };
 
 /*
@@ -437,7 +435,7 @@ struct root_domain {
  */
 static struct root_domain def_root_domain;
 
-#endif
+#endif /* CONFIG_SMP */
 
 /*
  * This is the main, per-CPU runqueue data structure.
@@ -488,11 +486,12 @@ struct rq {
         */
        unsigned long nr_uninterruptible;
 
-       struct task_struct *curr, *idle;
+       struct task_struct *curr, *idle, *stop;
        unsigned long next_balance;
        struct mm_struct *prev_mm;
 
        u64 clock;
+       u64 clock_task;
 
        atomic_t nr_iowait;
 
@@ -520,6 +519,10 @@ struct rq {
        u64 avg_idle;
 #endif
 
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
+       u64 prev_irq_time;
+#endif
+
        /* calc_load related fields */
        unsigned long calc_load_update;
        long calc_load_active;
@@ -643,10 +646,22 @@ static inline struct task_group *task_group(struct task_struct *p)
 
 #endif /* CONFIG_CGROUP_SCHED */
 
+static u64 irq_time_cpu(int cpu);
+static void sched_irq_time_avg_update(struct rq *rq, u64 irq_time);
+
 inline void update_rq_clock(struct rq *rq)
 {
-       if (!rq->skip_clock_update)
-               rq->clock = sched_clock_cpu(cpu_of(rq));
+       if (!rq->skip_clock_update) {
+               int cpu = cpu_of(rq);
+               u64 irq_time;
+
+               rq->clock = sched_clock_cpu(cpu);
+               irq_time = irq_time_cpu(cpu);
+               if (rq->clock - irq_time > rq->clock_task)
+                       rq->clock_task = rq->clock - irq_time;
+
+               sched_irq_time_avg_update(rq, irq_time);
+       }
 }
 
 /*
@@ -723,7 +738,7 @@ sched_feat_write(struct file *filp, const char __user *ubuf,
                size_t cnt, loff_t *ppos)
 {
        char buf[64];
-       char *cmp = buf;
+       char *cmp;
        int neg = 0;
        int i;
 
@@ -734,6 +749,7 @@ sched_feat_write(struct file *filp, const char __user *ubuf,
                return -EFAULT;
 
        buf[cnt] = 0;
+       cmp = strstrip(buf);
 
        if (strncmp(buf, "NO_", 3) == 0) {
                neg = 1;
@@ -741,9 +757,7 @@ sched_feat_write(struct file *filp, const char __user *ubuf,
        }
 
        for (i = 0; sched_feat_names[i]; i++) {
-               int len = strlen(sched_feat_names[i]);
-
-               if (strncmp(cmp, sched_feat_names[i], len) == 0) {
+               if (strcmp(cmp, sched_feat_names[i]) == 0) {
                        if (neg)
                                sysctl_sched_features &= ~(1UL << i);
                        else
@@ -1840,7 +1854,7 @@ static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
 
 static const struct sched_class rt_sched_class;
 
-#define sched_class_highest (&rt_sched_class)
+#define sched_class_highest (&stop_sched_class)
 #define for_each_class(class) \
    for (class = sched_class_highest; class; class = class->next)
 
@@ -1858,12 +1872,6 @@ static void dec_nr_running(struct rq *rq)
 
 static void set_load_weight(struct task_struct *p)
 {
-       if (task_has_rt_policy(p)) {
-               p->se.load.weight = 0;
-               p->se.load.inv_weight = WMULT_CONST;
-               return;
-       }
-
        /*
         * SCHED_IDLE tasks get minimal weight:
         */
@@ -1917,13 +1925,132 @@ static void deactivate_task(struct rq *rq, struct task_struct *p, int flags)
        dec_nr_running(rq);
 }
 
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
+
+/*
+ * There are no locks covering percpu hardirq/softirq time.
+ * They are only modified in account_system_vtime, on corresponding CPU
+ * with interrupts disabled. So, writes are safe.
+ * They are read and saved off onto struct rq in update_rq_clock().
+ * This may result in other CPU reading this CPU's irq time and can
+ * race with irq/account_system_vtime on this CPU. We would either get old
+ * or new value (or semi updated value on 32 bit) with a side effect of
+ * accounting a slice of irq time to wrong task when irq is in progress
+ * while we read rq->clock. That is a worthy compromise in place of having
+ * locks on each irq in account_system_time.
+ */
+static DEFINE_PER_CPU(u64, cpu_hardirq_time);
+static DEFINE_PER_CPU(u64, cpu_softirq_time);
+
+static DEFINE_PER_CPU(u64, irq_start_time);
+static int sched_clock_irqtime;
+
+void enable_sched_clock_irqtime(void)
+{
+       sched_clock_irqtime = 1;
+}
+
+void disable_sched_clock_irqtime(void)
+{
+       sched_clock_irqtime = 0;
+}
+
+static u64 irq_time_cpu(int cpu)
+{
+       if (!sched_clock_irqtime)
+               return 0;
+
+       return per_cpu(cpu_softirq_time, cpu) + per_cpu(cpu_hardirq_time, cpu);
+}
+
+void account_system_vtime(struct task_struct *curr)
+{
+       unsigned long flags;
+       int cpu;
+       u64 now, delta;
+
+       if (!sched_clock_irqtime)
+               return;
+
+       local_irq_save(flags);
+
+       cpu = smp_processor_id();
+       now = sched_clock_cpu(cpu);
+       delta = now - per_cpu(irq_start_time, cpu);
+       per_cpu(irq_start_time, cpu) = now;
+       /*
+        * We do not account for softirq time from ksoftirqd here.
+        * We want to continue accounting softirq time to ksoftirqd thread
+        * in that case, so as not to confuse scheduler with a special task
+        * that do not consume any time, but still wants to run.
+        */
+       if (hardirq_count())
+               per_cpu(cpu_hardirq_time, cpu) += delta;
+       else if (in_serving_softirq() && !(curr->flags & PF_KSOFTIRQD))
+               per_cpu(cpu_softirq_time, cpu) += delta;
+
+       local_irq_restore(flags);
+}
+EXPORT_SYMBOL_GPL(account_system_vtime);
+
+static void sched_irq_time_avg_update(struct rq *rq, u64 curr_irq_time)
+{
+       if (sched_clock_irqtime && sched_feat(NONIRQ_POWER)) {
+               u64 delta_irq = curr_irq_time - rq->prev_irq_time;
+               rq->prev_irq_time = curr_irq_time;
+               sched_rt_avg_update(rq, delta_irq);
+       }
+}
+
+#else
+
+static u64 irq_time_cpu(int cpu)
+{
+       return 0;
+}
+
+static void sched_irq_time_avg_update(struct rq *rq, u64 curr_irq_time) { }
+
+#endif
+
 #include "sched_idletask.c"
 #include "sched_fair.c"
 #include "sched_rt.c"
+#include "sched_stoptask.c"
 #ifdef CONFIG_SCHED_DEBUG
 # include "sched_debug.c"
 #endif
 
+void sched_set_stop_task(int cpu, struct task_struct *stop)
+{
+       struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 };
+       struct task_struct *old_stop = cpu_rq(cpu)->stop;
+
+       if (stop) {
+               /*
+                * Make it appear like a SCHED_FIFO task, its something
+                * userspace knows about and won't get confused about.
+                *
+                * Also, it will make PI more or less work without too
+                * much confusion -- but then, stop work should not
+                * rely on PI working anyway.
+                */
+               sched_setscheduler_nocheck(stop, SCHED_FIFO, &param);
+
+               stop->sched_class = &stop_sched_class;
+       }
+
+       cpu_rq(cpu)->stop = stop;
+
+       if (old_stop) {
+               /*
+                * Reset it back to a normal scheduling class so that
+                * it can die in pieces.
+                */
+               old_stop->sched_class = &rt_sched_class;
+       }
+}
+
 /*
  * __normal_prio - return the priority that is based on the static prio
  */
@@ -2003,6 +2130,9 @@ task_hot(struct task_struct *p, u64 now, struct sched_domain *sd)
        if (p->sched_class != &fair_sched_class)
                return 0;
 
+       if (unlikely(p->policy == SCHED_IDLE))
+               return 0;
+
        /*
         * Buddy candidates are cache hot:
         */
@@ -2852,14 +2982,14 @@ context_switch(struct rq *rq, struct task_struct *prev,
         */
        arch_start_context_switch(prev);
 
-       if (likely(!mm)) {
+       if (!mm) {
                next->active_mm = oldmm;
                atomic_inc(&oldmm->mm_count);
                enter_lazy_tlb(oldmm, next);
        } else
                switch_mm(oldmm, mm, next);
 
-       if (likely(!prev->mm)) {
+       if (!prev->mm) {
                prev->active_mm = NULL;
                rq->prev_mm = oldmm;
        }
@@ -3248,7 +3378,7 @@ static u64 do_task_delta_exec(struct task_struct *p, struct rq *rq)
 
        if (task_current(rq, p)) {
                update_rq_clock(rq);
-               ns = rq->clock - p->se.exec_start;
+               ns = rq->clock_task - p->se.exec_start;
                if ((s64)ns < 0)
                        ns = 0;
        }
@@ -3397,7 +3527,7 @@ void account_system_time(struct task_struct *p, int hardirq_offset,
        tmp = cputime_to_cputime64(cputime);
        if (hardirq_count() - hardirq_offset)
                cpustat->irq = cputime64_add(cpustat->irq, tmp);
-       else if (softirq_count())
+       else if (in_serving_softirq())
                cpustat->softirq = cputime64_add(cpustat->softirq, tmp);
        else
                cpustat->system = cputime64_add(cpustat->system, tmp);
@@ -3723,17 +3853,13 @@ pick_next_task(struct rq *rq)
                        return p;
        }
 
-       class = sched_class_highest;
-       for ( ; ; ) {
+       for_each_class(class) {
                p = class->pick_next_task(rq);
                if (p)
                        return p;
-               /*
-                * Will never be NULL as the idle class always
-                * returns a non-NULL p:
-                */
-               class = class->next;
        }
+
+       BUG(); /* the idle class will always have a runnable task */
 }
 
 /*
@@ -4358,6 +4484,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
 
        rq = task_rq_lock(p, &flags);
 
+       trace_sched_pi_setprio(p, prio);
        oldprio = p->prio;
        prev_class = p->sched_class;
        on_rq = p->se.on_rq;
@@ -4661,6 +4788,15 @@ recheck:
         */
        rq = __task_rq_lock(p);
 
+       /*
+        * Changing the policy of the stop threads its a very bad idea
+        */
+       if (p == rq->stop) {
+               __task_rq_unlock(rq);
+               raw_spin_unlock_irqrestore(&p->pi_lock, flags);
+               return -EINVAL;
+       }
+
 #ifdef CONFIG_RT_GROUP_SCHED
        if (user) {
                /*
@@ -4893,7 +5029,7 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
 
        cpuset_cpus_allowed(p, cpus_allowed);
        cpumask_and(new_mask, in_mask, cpus_allowed);
- again:
+again:
        retval = set_cpus_allowed_ptr(p, new_mask);
 
        if (!retval) {
@@ -6526,6 +6662,7 @@ struct s_data {
        cpumask_var_t nodemask;
        cpumask_var_t this_sibling_map;
        cpumask_var_t this_core_map;
+       cpumask_var_t this_book_map;
        cpumask_var_t send_covered;
        cpumask_var_t tmpmask;
        struct sched_group **sched_group_nodes;
@@ -6537,6 +6674,7 @@ enum s_alloc {
        sa_rootdomain,
        sa_tmpmask,
        sa_send_covered,
+       sa_this_book_map,
        sa_this_core_map,
        sa_this_sibling_map,
        sa_nodemask,
@@ -6572,31 +6710,48 @@ cpu_to_cpu_group(int cpu, const struct cpumask *cpu_map,
 #ifdef CONFIG_SCHED_MC
 static DEFINE_PER_CPU(struct static_sched_domain, core_domains);
 static DEFINE_PER_CPU(struct static_sched_group, sched_group_core);
-#endif /* CONFIG_SCHED_MC */
 
-#if defined(CONFIG_SCHED_MC) && defined(CONFIG_SCHED_SMT)
 static int
 cpu_to_core_group(int cpu, const struct cpumask *cpu_map,
                  struct sched_group **sg, struct cpumask *mask)
 {
        int group;
-
+#ifdef CONFIG_SCHED_SMT
        cpumask_and(mask, topology_thread_cpumask(cpu), cpu_map);
        group = cpumask_first(mask);
+#else
+       group = cpu;
+#endif
        if (sg)
                *sg = &per_cpu(sched_group_core, group).sg;
        return group;
 }
-#elif defined(CONFIG_SCHED_MC)
+#endif /* CONFIG_SCHED_MC */
+
+/*
+ * book sched-domains:
+ */
+#ifdef CONFIG_SCHED_BOOK
+static DEFINE_PER_CPU(struct static_sched_domain, book_domains);
+static DEFINE_PER_CPU(struct static_sched_group, sched_group_book);
+
 static int
-cpu_to_core_group(int cpu, const struct cpumask *cpu_map,
-                 struct sched_group **sg, struct cpumask *unused)
+cpu_to_book_group(int cpu, const struct cpumask *cpu_map,
+                 struct sched_group **sg, struct cpumask *mask)
 {
+       int group = cpu;
+#ifdef CONFIG_SCHED_MC
+       cpumask_and(mask, cpu_coregroup_mask(cpu), cpu_map);
+       group = cpumask_first(mask);
+#elif defined(CONFIG_SCHED_SMT)
+       cpumask_and(mask, topology_thread_cpumask(cpu), cpu_map);
+       group = cpumask_first(mask);
+#endif
        if (sg)
-               *sg = &per_cpu(sched_group_core, cpu).sg;
-       return cpu;
+               *sg = &per_cpu(sched_group_book, group).sg;
+       return group;
 }
-#endif
+#endif /* CONFIG_SCHED_BOOK */
 
 static DEFINE_PER_CPU(struct static_sched_domain, phys_domains);
 static DEFINE_PER_CPU(struct static_sched_group, sched_group_phys);
@@ -6606,7 +6761,10 @@ cpu_to_phys_group(int cpu, const struct cpumask *cpu_map,
                  struct sched_group **sg, struct cpumask *mask)
 {
        int group;
-#ifdef CONFIG_SCHED_MC
+#ifdef CONFIG_SCHED_BOOK
+       cpumask_and(mask, cpu_book_mask(cpu), cpu_map);
+       group = cpumask_first(mask);
+#elif defined(CONFIG_SCHED_MC)
        cpumask_and(mask, cpu_coregroup_mask(cpu), cpu_map);
        group = cpumask_first(mask);
 #elif defined(CONFIG_SCHED_SMT)
@@ -6867,6 +7025,9 @@ SD_INIT_FUNC(CPU)
 #ifdef CONFIG_SCHED_MC
  SD_INIT_FUNC(MC)
 #endif
+#ifdef CONFIG_SCHED_BOOK
+ SD_INIT_FUNC(BOOK)
+#endif
 
 static int default_relax_domain_level = -1;
 
@@ -6916,6 +7077,8 @@ static void __free_domain_allocs(struct s_data *d, enum s_alloc what,
                free_cpumask_var(d->tmpmask); /* fall through */
        case sa_send_covered:
                free_cpumask_var(d->send_covered); /* fall through */
+       case sa_this_book_map:
+               free_cpumask_var(d->this_book_map); /* fall through */
        case sa_this_core_map:
                free_cpumask_var(d->this_core_map); /* fall through */
        case sa_this_sibling_map:
@@ -6962,8 +7125,10 @@ static enum s_alloc __visit_domain_allocation_hell(struct s_data *d,
                return sa_nodemask;
        if (!alloc_cpumask_var(&d->this_core_map, GFP_KERNEL))
                return sa_this_sibling_map;
-       if (!alloc_cpumask_var(&d->send_covered, GFP_KERNEL))
+       if (!alloc_cpumask_var(&d->this_book_map, GFP_KERNEL))
                return sa_this_core_map;
+       if (!alloc_cpumask_var(&d->send_covered, GFP_KERNEL))
+               return sa_this_book_map;
        if (!alloc_cpumask_var(&d->tmpmask, GFP_KERNEL))
                return sa_send_covered;
        d->rd = alloc_rootdomain();
@@ -7021,6 +7186,23 @@ static struct sched_domain *__build_cpu_sched_domain(struct s_data *d,
        return sd;
 }
 
+static struct sched_domain *__build_book_sched_domain(struct s_data *d,
+       const struct cpumask *cpu_map, struct sched_domain_attr *attr,
+       struct sched_domain *parent, int i)
+{
+       struct sched_domain *sd = parent;
+#ifdef CONFIG_SCHED_BOOK
+       sd = &per_cpu(book_domains, i).sd;
+       SD_INIT(sd, BOOK);
+       set_domain_attribute(sd, attr);
+       cpumask_and(sched_domain_span(sd), cpu_map, cpu_book_mask(i));
+       sd->parent = parent;
+       parent->child = sd;
+       cpu_to_book_group(i, cpu_map, &sd->groups, d->tmpmask);
+#endif
+       return sd;
+}
+
 static struct sched_domain *__build_mc_sched_domain(struct s_data *d,
        const struct cpumask *cpu_map, struct sched_domain_attr *attr,
        struct sched_domain *parent, int i)
@@ -7078,6 +7260,15 @@ static void build_sched_groups(struct s_data *d, enum sched_domain_level l,
                                        d->send_covered, d->tmpmask);
                break;
 #endif
+#ifdef CONFIG_SCHED_BOOK
+       case SD_LV_BOOK: /* set up book groups */
+               cpumask_and(d->this_book_map, cpu_map, cpu_book_mask(cpu));
+               if (cpu == cpumask_first(d->this_book_map))
+                       init_sched_build_groups(d->this_book_map, cpu_map,
+                                               &cpu_to_book_group,
+                                               d->send_covered, d->tmpmask);
+               break;
+#endif
        case SD_LV_CPU: /* set up physical groups */
                cpumask_and(d->nodemask, cpumask_of_node(cpu), cpu_map);
                if (!cpumask_empty(d->nodemask))
@@ -7125,12 +7316,14 @@ static int __build_sched_domains(const struct cpumask *cpu_map,
 
                sd = __build_numa_sched_domains(&d, cpu_map, attr, i);
                sd = __build_cpu_sched_domain(&d, cpu_map, attr, sd, i);
+               sd = __build_book_sched_domain(&d, cpu_map, attr, sd, i);
                sd = __build_mc_sched_domain(&d, cpu_map, attr, sd, i);
                sd = __build_smt_sched_domain(&d, cpu_map, attr, sd, i);
        }
 
        for_each_cpu(i, cpu_map) {
                build_sched_groups(&d, SD_LV_SIBLING, cpu_map, i);
+               build_sched_groups(&d, SD_LV_BOOK, cpu_map, i);
                build_sched_groups(&d, SD_LV_MC, cpu_map, i);
        }
 
@@ -7161,6 +7354,12 @@ static int __build_sched_domains(const struct cpumask *cpu_map,
                init_sched_groups_power(i, sd);
        }
 #endif
+#ifdef CONFIG_SCHED_BOOK
+       for_each_cpu(i, cpu_map) {
+               sd = &per_cpu(book_domains, i).sd;
+               init_sched_groups_power(i, sd);
+       }
+#endif
 
        for_each_cpu(i, cpu_map) {
                sd = &per_cpu(phys_domains, i).sd;
@@ -7186,6 +7385,8 @@ static int __build_sched_domains(const struct cpumask *cpu_map,
                sd = &per_cpu(cpu_domains, i).sd;
 #elif defined(CONFIG_SCHED_MC)
                sd = &per_cpu(core_domains, i).sd;
+#elif defined(CONFIG_SCHED_BOOK)
+               sd = &per_cpu(book_domains, i).sd;
 #else
                sd = &per_cpu(phys_domains, i).sd;
 #endif
@@ -8090,9 +8291,9 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
 
        return 1;
 
- err_free_rq:
+err_free_rq:
        kfree(cfs_rq);
- err:
+err:
        return 0;
 }
 
@@ -8180,9 +8381,9 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
 
        return 1;
 
- err_free_rq:
+err_free_rq:
        kfree(rt_rq);
- err:
+err:
        return 0;
 }
 
@@ -8540,7 +8741,7 @@ static int tg_set_bandwidth(struct task_group *tg,
                raw_spin_unlock(&rt_rq->rt_runtime_lock);
        }
        raw_spin_unlock_irq(&tg->rt_bandwidth.rt_runtime_lock);
- unlock:
+unlock:
        read_unlock(&tasklist_lock);
        mutex_unlock(&rt_constraints_mutex);
 