From 41acab8851a0408c1d5ad6c21a07456f88b54d40 Mon Sep 17 00:00:00 2001 From: Lucas De Marchi Date: Wed, 10 Mar 2010 23:37:45 -0300 Subject: sched: Implement group scheduler statistics in one struct Put all statistic fields of sched_entity in one struct, sched_statistics, and embed it into sched_entity. This change allows to memset the sched_statistics to 0 when needed (for instance when forking), avoiding bugs of non initialized fields. Signed-off-by: Lucas De Marchi Signed-off-by: Peter Zijlstra LKML-Reference: <1268275065-18542-1-git-send-email-lucas.de.marchi@gmail.com> Signed-off-by: Ingo Molnar --- include/linux/sched.h | 54 ++++++++++++++++++++++++--------------------------- 1 file changed, 25 insertions(+), 29 deletions(-) (limited to 'include') diff --git a/include/linux/sched.h b/include/linux/sched.h index 4b1753f7e48e..8cc863d66477 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1127,36 +1127,8 @@ struct load_weight { unsigned long weight, inv_weight; }; -/* - * CFS stats for a schedulable entity (task, task-group etc) - * - * Current field usage histogram: - * - * 4 se->block_start - * 4 se->run_node - * 4 se->sleep_start - * 6 se->load.weight - */ -struct sched_entity { - struct load_weight load; /* for load-balancing */ - struct rb_node run_node; - struct list_head group_node; - unsigned int on_rq; - - u64 exec_start; - u64 sum_exec_runtime; - u64 vruntime; - u64 prev_sum_exec_runtime; - - u64 last_wakeup; - u64 avg_overlap; - - u64 nr_migrations; - - u64 start_runtime; - u64 avg_wakeup; - #ifdef CONFIG_SCHEDSTATS +struct sched_statistics { u64 wait_start; u64 wait_max; u64 wait_count; @@ -1188,6 +1160,30 @@ struct sched_entity { u64 nr_wakeups_affine_attempts; u64 nr_wakeups_passive; u64 nr_wakeups_idle; +}; +#endif + +struct sched_entity { + struct load_weight load; /* for load-balancing */ + struct rb_node run_node; + struct list_head group_node; + unsigned int on_rq; + + u64 exec_start; + u64 sum_exec_runtime; + u64 vruntime; + u64 prev_sum_exec_runtime; + + u64 last_wakeup; + u64 avg_overlap; + + u64 nr_migrations; + + u64 start_runtime; + u64 avg_wakeup; + +#ifdef CONFIG_SCHEDSTATS + struct sched_statistics statistics; #endif #ifdef CONFIG_FAIR_GROUP_SCHED -- cgit v1.2.2 From 39c0cbe2150cbd848a25ba6cdb271d1ad46818ad Mon Sep 17 00:00:00 2001 From: Mike Galbraith Date: Thu, 11 Mar 2010 17:17:13 +0100 Subject: sched: Rate-limit nohz Entering nohz code on every micro-idle is costing ~10% throughput for netperf TCP_RR when scheduling cross-cpu. Rate limiting entry fixes this, but raises ticks a bit. On my Q6600, an idle box goes from ~85 interrupts/sec to 128. The higher the context switch rate, the more nohz entry costs. With this patch and some cycle recovery patches in my tree, max cross cpu context switch rate is improved by ~16%, a large portion of which of which is this ratelimiting. Signed-off-by: Mike Galbraith Signed-off-by: Peter Zijlstra LKML-Reference: <1268301003.6785.28.camel@marge.simson.net> Signed-off-by: Ingo Molnar --- include/linux/sched.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include') diff --git a/include/linux/sched.h b/include/linux/sched.h index 8cc863d66477..13efe7dac5fa 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -271,11 +271,17 @@ extern cpumask_var_t nohz_cpu_mask; #if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ) extern int select_nohz_load_balancer(int cpu); extern int get_nohz_load_balancer(void); +extern int nohz_ratelimit(int cpu); #else static inline int select_nohz_load_balancer(int cpu) { return 0; } + +static inline int nohz_ratelimit(int cpu) +{ + return 0; +} #endif /* -- cgit v1.2.2 From b42e0c41a422a212ddea0666d5a3a0e3c35206db Mon Sep 17 00:00:00 2001 From: Mike Galbraith Date: Thu, 11 Mar 2010 17:15:38 +0100 Subject: sched: Remove avg_wakeup Testing the load which led to this heuristic (nfs4 kbuild) shows that it has outlived it's usefullness. With intervening load balancing changes, I cannot see any difference with/without, so recover there fastpath cycles. Signed-off-by: Mike Galbraith Signed-off-by: Peter Zijlstra LKML-Reference: <1268301062.6785.29.camel@marge.simson.net> Signed-off-by: Ingo Molnar --- include/linux/sched.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include') diff --git a/include/linux/sched.h b/include/linux/sched.h index 13efe7dac5fa..70c560f5ada0 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1185,9 +1185,6 @@ struct sched_entity { u64 nr_migrations; - u64 start_runtime; - u64 avg_wakeup; - #ifdef CONFIG_SCHEDSTATS struct sched_statistics statistics; #endif -- cgit v1.2.2 From e12f31d3e5d36328c7fbd0fce40a95e70b59152c Mon Sep 17 00:00:00 2001 From: Mike Galbraith Date: Thu, 11 Mar 2010 17:15:51 +0100 Subject: sched: Remove avg_overlap Both avg_overlap and avg_wakeup had an inherent problem in that their accuracy was detrimentally affected by cross-cpu wakeups, this because we are missing the necessary call to update_curr(). This can't be fixed without increasing overhead in our already too fat fastpath. Additionally, with recent load balancing changes making us prefer to place tasks in an idle cache domain (which is good for compute bound loads), communicating tasks suffer when a sync wakeup, which would enable affine placement, is turned into a non-sync wakeup by SYNC_LESS. With one task on the runqueue, wake_affine() rejects the affine wakeup request, leaving the unfortunate where placed, taking frequent cache misses. Remove it, and recover some fastpath cycles. Signed-off-by: Mike Galbraith Signed-off-by: Peter Zijlstra LKML-Reference: <1268301121.6785.30.camel@marge.simson.net> Signed-off-by: Ingo Molnar --- include/linux/sched.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include') diff --git a/include/linux/sched.h b/include/linux/sched.h index 70c560f5ada0..8604884cee87 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1180,9 +1180,6 @@ struct sched_entity { u64 vruntime; u64 prev_sum_exec_runtime; - u64 last_wakeup; - u64 avg_overlap; - u64 nr_migrations; #ifdef CONFIG_SCHEDSTATS -- cgit v1.2.2 From 897f0b3c3ff40b443c84e271bef19bd6ae885195 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 15 Mar 2010 10:10:03 +0100 Subject: sched: Kill the broken and deadlockable cpuset_lock/cpuset_cpus_allowed_locked code This patch just states the fact the cpusets/cpuhotplug interaction is broken and removes the deadlockable code which only pretends to work. - cpuset_lock() doesn't really work. It is needed for cpuset_cpus_allowed_locked() but we can't take this lock in try_to_wake_up()->select_fallback_rq() path. - cpuset_lock() is deadlockable. Suppose that a task T bound to CPU takes callback_mutex. If cpu_down(CPU) happens before T drops callback_mutex stop_machine() preempts T, then migration_call(CPU_DEAD) tries to take cpuset_lock() and hangs forever because CPU is already dead and thus T can't be scheduled. - cpuset_cpus_allowed_locked() is deadlockable too. It takes task_lock() which is not irq-safe, but try_to_wake_up() can be called from irq. Kill them, and change select_fallback_rq() to use cpu_possible_mask, like we currently do without CONFIG_CPUSETS. Also, with or without this patch, with or without CONFIG_CPUSETS, the callers of select_fallback_rq() can race with each other or with set_cpus_allowed() pathes. The subsequent patches try to to fix these problems. Signed-off-by: Oleg Nesterov Signed-off-by: Peter Zijlstra LKML-Reference: <20100315091003.GA9123@redhat.com> Signed-off-by: Ingo Molnar --- include/linux/cpuset.h | 13 ------------- 1 file changed, 13 deletions(-) (limited to 'include') diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h index a5740fc4d04b..eeaaee746bee 100644 --- a/include/linux/cpuset.h +++ b/include/linux/cpuset.h @@ -21,8 +21,6 @@ extern int number_of_cpusets; /* How many cpusets are defined in system? */ extern int cpuset_init(void); extern void cpuset_init_smp(void); extern void cpuset_cpus_allowed(struct task_struct *p, struct cpumask *mask); -extern void cpuset_cpus_allowed_locked(struct task_struct *p, - struct cpumask *mask); extern nodemask_t cpuset_mems_allowed(struct task_struct *p); #define cpuset_current_mems_allowed (current->mems_allowed) void cpuset_init_current_mems_allowed(void); @@ -69,9 +67,6 @@ struct seq_file; extern void cpuset_task_status_allowed(struct seq_file *m, struct task_struct *task); -extern void cpuset_lock(void); -extern void cpuset_unlock(void); - extern int cpuset_mem_spread_node(void); static inline int cpuset_do_page_mem_spread(void) @@ -105,11 +100,6 @@ static inline void cpuset_cpus_allowed(struct task_struct *p, { cpumask_copy(mask, cpu_possible_mask); } -static inline void cpuset_cpus_allowed_locked(struct task_struct *p, - struct cpumask *mask) -{ - cpumask_copy(mask, cpu_possible_mask); -} static inline nodemask_t cpuset_mems_allowed(struct task_struct *p) { @@ -157,9 +147,6 @@ static inline void cpuset_task_status_allowed(struct seq_file *m, { } -static inline void cpuset_lock(void) {} -static inline void cpuset_unlock(void) {} - static inline int cpuset_mem_spread_node(void) { return 0; -- cgit v1.2.2 From 6a1bdc1b577ebcb65f6603c57f8347309bc4ab13 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 15 Mar 2010 10:10:23 +0100 Subject: sched: _cpu_down(): Don't play with current->cpus_allowed _cpu_down() changes the current task's affinity and then recovers it at the end. The problems are well known: we can't restore old_allowed if it was bound to the now-dead-cpu, and we can race with the userspace which can change cpu-affinity during unplug. _cpu_down() should not play with current->cpus_allowed at all. Instead, take_cpu_down() can migrate the caller of _cpu_down() after __cpu_disable() removes the dying cpu from cpu_online_mask. Signed-off-by: Oleg Nesterov Acked-by: Rafael J. Wysocki Signed-off-by: Peter Zijlstra LKML-Reference: <20100315091023.GA9148@redhat.com> Signed-off-by: Ingo Molnar --- include/linux/sched.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/sched.h b/include/linux/sched.h index 43c945152732..8bea40725c76 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1843,6 +1843,7 @@ extern void sched_clock_idle_sleep_event(void); extern void sched_clock_idle_wakeup_event(u64 delta_ns); #ifdef CONFIG_HOTPLUG_CPU +extern void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p); extern void idle_task_exit(void); #else static inline void idle_task_exit(void) {} -- cgit v1.2.2 From 9084bb8246ea935b98320554229e2f371f7f52fa Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 15 Mar 2010 10:10:27 +0100 Subject: sched: Make select_fallback_rq() cpuset friendly Introduce cpuset_cpus_allowed_fallback() helper to fix the cpuset problems with select_fallback_rq(). It can be called from any context and can't use any cpuset locks including task_lock(). It is called when the task doesn't have online cpus in ->cpus_allowed but ttwu/etc must be able to find a suitable cpu. I am not proud of this patch. Everything which needs such a fat comment can't be good even if correct. But I'd prefer to not change the locking rules in the code I hardly understand, and in any case I believe this simple change make the code much more correct compared to deadlocks we currently have. Signed-off-by: Oleg Nesterov Signed-off-by: Peter Zijlstra LKML-Reference: <20100315091027.GA9155@redhat.com> Signed-off-by: Ingo Molnar --- include/linux/cpuset.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include') diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h index eeaaee746bee..a73454aec333 100644 --- a/include/linux/cpuset.h +++ b/include/linux/cpuset.h @@ -21,6 +21,7 @@ extern int number_of_cpusets; /* How many cpusets are defined in system? */ extern int cpuset_init(void); extern void cpuset_init_smp(void); extern void cpuset_cpus_allowed(struct task_struct *p, struct cpumask *mask); +extern int cpuset_cpus_allowed_fallback(struct task_struct *p); extern nodemask_t cpuset_mems_allowed(struct task_struct *p); #define cpuset_current_mems_allowed (current->mems_allowed) void cpuset_init_current_mems_allowed(void); @@ -101,6 +102,12 @@ static inline void cpuset_cpus_allowed(struct task_struct *p, cpumask_copy(mask, cpu_possible_mask); } +static inline int cpuset_cpus_allowed_fallback(struct task_struct *p) +{ + cpumask_copy(&p->cpus_allowed, cpu_possible_mask); + return cpumask_any(cpu_active_mask); +} + static inline nodemask_t cpuset_mems_allowed(struct task_struct *p) { return node_possible_map; -- cgit v1.2.2 From 0017d735092844118bef006696a750a0e4ef6ebd Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 24 Mar 2010 18:34:10 +0100 Subject: sched: Fix TASK_WAKING vs fork deadlock Oleg noticed a few races with the TASK_WAKING usage on fork. - since TASK_WAKING is basically a spinlock, it should be IRQ safe - since we set TASK_WAKING (*) without holding rq->lock it could be there still is a rq->lock holder, thereby not actually providing full serialization. (*) in fact we clear PF_STARTING, which in effect enables TASK_WAKING. Cure the second issue by not setting TASK_WAKING in sched_fork(), but only temporarily in wake_up_new_task() while calling select_task_rq(). Cure the first by holding rq->lock around the select_task_rq() call, this will disable IRQs, this however requires that we push down the rq->lock release into select_task_rq_fair()'s cgroup stuff. Because select_task_rq_fair() still needs to drop the rq->lock we cannot fully get rid of TASK_WAKING. Reported-by: Oleg Nesterov Signed-off-by: Peter Zijlstra LKML-Reference: Signed-off-by: Ingo Molnar --- include/linux/sched.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/sched.h b/include/linux/sched.h index 8bea40725c76..fb6c18843ee8 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1046,7 +1046,8 @@ struct sched_class { void (*put_prev_task) (struct rq *rq, struct task_struct *p); #ifdef CONFIG_SMP - int (*select_task_rq)(struct task_struct *p, int sd_flag, int flags); + int (*select_task_rq)(struct rq *rq, struct task_struct *p, + int sd_flag, int flags); void (*pre_schedule) (struct rq *this_rq, struct task_struct *task); void (*post_schedule) (struct rq *this_rq); -- cgit v1.2.2 From 371fd7e7a56a5c136d31aa980011bd2f131c3ef5 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 24 Mar 2010 16:38:48 +0100 Subject: sched: Add enqueue/dequeue flags In order to reduce the dependency on TASK_WAKING rework the enqueue interface to support a proper flags field. Replace the int wakeup, bool head arguments with an int flags argument and create the following flags: ENQUEUE_WAKEUP - the enqueue is a wakeup of a sleeping task, ENQUEUE_WAKING - the enqueue has relative vruntime due to having sched_class::task_waking() called, ENQUEUE_HEAD - the waking task should be places on the head of the priority queue (where appropriate). For symmetry also convert sched_class::dequeue() to a flags scheme. Signed-off-by: Peter Zijlstra LKML-Reference: Signed-off-by: Ingo Molnar --- include/linux/sched.h | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/sched.h b/include/linux/sched.h index fb6c18843ee8..e3e900f318d7 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1032,12 +1032,17 @@ struct sched_domain; #define WF_SYNC 0x01 /* waker goes to sleep after wakup */ #define WF_FORK 0x02 /* child wakeup after fork */ +#define ENQUEUE_WAKEUP 1 +#define ENQUEUE_WAKING 2 +#define ENQUEUE_HEAD 4 + +#define DEQUEUE_SLEEP 1 + struct sched_class { const struct sched_class *next; - void (*enqueue_task) (struct rq *rq, struct task_struct *p, int wakeup, - bool head); - void (*dequeue_task) (struct rq *rq, struct task_struct *p, int sleep); + void (*enqueue_task) (struct rq *rq, struct task_struct *p, int flags); + void (*dequeue_task) (struct rq *rq, struct task_struct *p, int flags); void (*yield_task) (struct rq *rq); void (*check_preempt_curr) (struct rq *rq, struct task_struct *p, int flags); -- cgit v1.2.2 From 669c55e9f99b90e46eaa0f98a67ec53d46dc969a Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 16 Apr 2010 14:59:29 +0200 Subject: sched: Pre-compute cpumask_weight(sched_domain_span(sd)) Dave reported that his large SPARC machines spend lots of time in hweight64(), try and optimize some of those needless cpumask_weight() invocations (esp. with the large offstack cpumasks these are very expensive indeed). Reported-by: David Miller Signed-off-by: Peter Zijlstra LKML-Reference: Signed-off-by: Ingo Molnar --- include/linux/sched.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/sched.h b/include/linux/sched.h index e3e900f318d7..dfea40574b2a 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -960,6 +960,7 @@ struct sched_domain { char *name; #endif + unsigned int span_weight; /* * Span of all CPUs in this domain. * -- cgit v1.2.2 From 1142d810298e694754498dbb4983fcb6cb7fd884 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 6 May 2010 18:49:20 +0200 Subject: cpu_stop: implement stop_cpu[s]() Implement a simplistic per-cpu maximum priority cpu monopolization mechanism. A non-sleeping callback can be scheduled to run on one or multiple cpus with maximum priority monopolozing those cpus. This is primarily to replace and unify RT workqueue usage in stop_machine and scheduler migration_thread which currently is serving multiple purposes. Four functions are provided - stop_one_cpu(), stop_one_cpu_nowait(), stop_cpus() and try_stop_cpus(). This is to allow clean sharing of resources among stop_cpu and all the migration thread users. One stopper thread per cpu is created which is currently named "stopper/CPU". This will eventually replace the migration thread and take on its name. * This facility was originally named cpuhog and lived in separate files but Peter Zijlstra nacked the name and thus got renamed to cpu_stop and moved into stop_machine.c. * Better reporting of preemption leak as per Peter's suggestion. Signed-off-by: Tejun Heo Acked-by: Peter Zijlstra Cc: Oleg Nesterov Cc: Dimitri Sivanich --- include/linux/stop_machine.h | 39 +++++++++++++++++++++++++++++++++++---- 1 file changed, 35 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/stop_machine.h b/include/linux/stop_machine.h index baba3a23a814..efcbd6c37947 100644 --- a/include/linux/stop_machine.h +++ b/include/linux/stop_machine.h @@ -1,15 +1,46 @@ #ifndef _LINUX_STOP_MACHINE #define _LINUX_STOP_MACHINE -/* "Bogolock": stop the entire machine, disable interrupts. This is a - very heavy lock, which is equivalent to grabbing every spinlock - (and more). So the "read" side to such a lock is anything which - disables preeempt. */ + #include #include +#include #include #if defined(CONFIG_STOP_MACHINE) && defined(CONFIG_SMP) +/* + * stop_cpu[s]() is simplistic per-cpu maximum priority cpu + * monopolization mechanism. The caller can specify a non-sleeping + * function to be executed on a single or multiple cpus preempting all + * other processes and monopolizing those cpus until it finishes. + * + * Resources for this mechanism are preallocated when a cpu is brought + * up and requests are guaranteed to be served as long as the target + * cpus are online. + */ + +typedef int (*cpu_stop_fn_t)(void *arg); + +struct cpu_stop_work { + struct list_head list; /* cpu_stopper->works */ + cpu_stop_fn_t fn; + void *arg; + struct cpu_stop_done *done; +}; + +int stop_one_cpu(unsigned int cpu, cpu_stop_fn_t fn, void *arg); +void stop_one_cpu_nowait(unsigned int cpu, cpu_stop_fn_t fn, void *arg, + struct cpu_stop_work *work_buf); +int stop_cpus(const struct cpumask *cpumask, cpu_stop_fn_t fn, void *arg); +int try_stop_cpus(const struct cpumask *cpumask, cpu_stop_fn_t fn, void *arg); + +/* + * stop_machine "Bogolock": stop the entire machine, disable + * interrupts. This is a very heavy lock, which is equivalent to + * grabbing every spinlock (and more). So the "read" side to such a + * lock is anything which disables preeempt. + */ + /** * stop_machine: freeze the machine on all CPUs and run this function * @fn: the function to run -- cgit v1.2.2 From 3fc1f1e27a5b807791d72e5d992aa33b668a6626 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 6 May 2010 18:49:20 +0200 Subject: stop_machine: reimplement using cpu_stop Reimplement stop_machine using cpu_stop. As cpu stoppers are guaranteed to be available for all online cpus, stop_machine_create/destroy() are no longer necessary and removed. With resource management and synchronization handled by cpu_stop, the new implementation is much simpler. Asking the cpu_stop to execute the stop_cpu() state machine on all online cpus with cpu hotplug disabled is enough. stop_machine itself doesn't need to manage any global resources anymore, so all per-instance information is rolled into struct stop_machine_data and the mutex and all static data variables are removed. The previous implementation created and destroyed RT workqueues as necessary which made stop_machine() calls highly expensive on very large machines. According to Dimitri Sivanich, preventing the dynamic creation/destruction makes booting faster more than twice on very large machines. cpu_stop resources are preallocated for all online cpus and should have the same effect. Signed-off-by: Tejun Heo Acked-by: Rusty Russell Acked-by: Peter Zijlstra Cc: Oleg Nesterov Cc: Dimitri Sivanich --- include/linux/stop_machine.h | 20 -------------------- 1 file changed, 20 deletions(-) (limited to 'include') diff --git a/include/linux/stop_machine.h b/include/linux/stop_machine.h index efcbd6c37947..0e552e72a4c4 100644 --- a/include/linux/stop_machine.h +++ b/include/linux/stop_machine.h @@ -67,23 +67,6 @@ int stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus); */ int __stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus); -/** - * stop_machine_create: create all stop_machine threads - * - * Description: This causes all stop_machine threads to be created before - * stop_machine actually gets called. This can be used by subsystems that - * need a non failing stop_machine infrastructure. - */ -int stop_machine_create(void); - -/** - * stop_machine_destroy: destroy all stop_machine threads - * - * Description: This causes all stop_machine threads which were created with - * stop_machine_create to be destroyed again. - */ -void stop_machine_destroy(void); - #else static inline int stop_machine(int (*fn)(void *), void *data, @@ -96,8 +79,5 @@ static inline int stop_machine(int (*fn)(void *), void *data, return ret; } -static inline int stop_machine_create(void) { return 0; } -static inline void stop_machine_destroy(void) { } - #endif /* CONFIG_SMP */ #endif /* _LINUX_STOP_MACHINE */ -- cgit v1.2.2 From 969c79215a35b06e5e3efe69b9412f858df7856c Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 6 May 2010 18:49:21 +0200 Subject: sched: replace migration_thread with cpu_stop Currently migration_thread is serving three purposes - migration pusher, context to execute active_load_balance() and forced context switcher for expedited RCU synchronize_sched. All three roles are hardcoded into migration_thread() and determining which job is scheduled is slightly messy. This patch kills migration_thread and replaces all three uses with cpu_stop. The three different roles of migration_thread() are splitted into three separate cpu_stop callbacks - migration_cpu_stop(), active_load_balance_cpu_stop() and synchronize_sched_expedited_cpu_stop() - and each use case now simply asks cpu_stop to execute the callback as necessary. synchronize_sched_expedited() was implemented with private preallocated resources and custom multi-cpu queueing and waiting logic, both of which are provided by cpu_stop. synchronize_sched_expedited_count is made atomic and all other shared resources along with the mutex are dropped. synchronize_sched_expedited() also implemented a check to detect cases where not all the callback got executed on their assigned cpus and fall back to synchronize_sched(). If called with cpu hotplug blocked, cpu_stop already guarantees that and the condition cannot happen; otherwise, stop_machine() would break. However, this patch preserves the paranoid check using a cpumask to record on which cpus the stopper ran so that it can serve as a bisection point if something actually goes wrong theree. Because the internal execution state is no longer visible, rcu_expedited_torture_stats() is removed. This patch also renames cpu_stop threads to from "stopper/%d" to "migration/%d". The names of these threads ultimately don't matter and there's no reason to make unnecessary userland visible changes. With this patch applied, stop_machine() and sched now share the same resources. stop_machine() is faster without wasting any resources and sched migration users are much cleaner. Signed-off-by: Tejun Heo Acked-by: Peter Zijlstra Cc: Ingo Molnar Cc: Dipankar Sarma Cc: Josh Triplett Cc: Paul E. McKenney Cc: Oleg Nesterov Cc: Dimitri Sivanich --- include/linux/rcutiny.h | 2 -- include/linux/rcutree.h | 1 - 2 files changed, 3 deletions(-) (limited to 'include') diff --git a/include/linux/rcutiny.h b/include/linux/rcutiny.h index a5195875480a..0006b2df00e1 100644 --- a/include/linux/rcutiny.h +++ b/include/linux/rcutiny.h @@ -60,8 +60,6 @@ static inline long rcu_batches_completed_bh(void) return 0; } -extern int rcu_expedited_torture_stats(char *page); - static inline void rcu_force_quiescent_state(void) { } diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h index 42cc3a04779e..24e467e526b8 100644 --- a/include/linux/rcutree.h +++ b/include/linux/rcutree.h @@ -35,7 +35,6 @@ struct notifier_block; extern void rcu_sched_qs(int cpu); extern void rcu_bh_qs(int cpu); extern int rcu_needs_cpu(int cpu); -extern int rcu_expedited_torture_stats(char *page); #ifdef CONFIG_TREE_PREEMPT_RCU -- cgit v1.2.2 From 27a9da6538ee18046d7bff8e36a9f783542c54c3 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 4 May 2010 20:36:56 +0200 Subject: sched: Remove rq argument to the tracepoints struct rq isn't visible outside of sched.o so its near useless to expose the pointer, also there are no users of it, so remove it. Acked-by: Steven Rostedt Signed-off-by: Peter Zijlstra LKML-Reference: <1272997616.1642.207.camel@laptop> Signed-off-by: Ingo Molnar --- include/trace/events/sched.h | 32 ++++++++++---------------------- 1 file changed, 10 insertions(+), 22 deletions(-) (limited to 'include') diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h index cfceb0b73e20..4f733ecea46e 100644 --- a/include/trace/events/sched.h +++ b/include/trace/events/sched.h @@ -51,15 +51,12 @@ TRACE_EVENT(sched_kthread_stop_ret, /* * Tracepoint for waiting on task to unschedule: - * - * (NOTE: the 'rq' argument is not used by generic trace events, - * but used by the latency tracer plugin. ) */ TRACE_EVENT(sched_wait_task, - TP_PROTO(struct rq *rq, struct task_struct *p), + TP_PROTO(struct task_struct *p), - TP_ARGS(rq, p), + TP_ARGS(p), TP_STRUCT__entry( __array( char, comm, TASK_COMM_LEN ) @@ -79,15 +76,12 @@ TRACE_EVENT(sched_wait_task, /* * Tracepoint for waking up a task: - * - * (NOTE: the 'rq' argument is not used by generic trace events, - * but used by the latency tracer plugin. ) */ DECLARE_EVENT_CLASS(sched_wakeup_template, - TP_PROTO(struct rq *rq, struct task_struct *p, int success), + TP_PROTO(struct task_struct *p, int success), - TP_ARGS(rq, p, success), + TP_ARGS(p, success), TP_STRUCT__entry( __array( char, comm, TASK_COMM_LEN ) @@ -111,31 +105,25 @@ DECLARE_EVENT_CLASS(sched_wakeup_template, ); DEFINE_EVENT(sched_wakeup_template, sched_wakeup, - TP_PROTO(struct rq *rq, struct task_struct *p, int success), - TP_ARGS(rq, p, success)); + TP_PROTO(struct task_struct *p, int success), + TP_ARGS(p, success)); /* * Tracepoint for waking up a new task: - * - * (NOTE: the 'rq' argument is not used by generic trace events, - * but used by the latency tracer plugin. ) */ DEFINE_EVENT(sched_wakeup_template, sched_wakeup_new, - TP_PROTO(struct rq *rq, struct task_struct *p, int success), - TP_ARGS(rq, p, success)); + TP_PROTO(struct task_struct *p, int success), + TP_ARGS(p, success)); /* * Tracepoint for task switches, performed by the scheduler: - * - * (NOTE: the 'rq' argument is not used by generic trace events, - * but used by the latency tracer plugin. ) */ TRACE_EVENT(sched_switch, - TP_PROTO(struct rq *rq, struct task_struct *prev, + TP_PROTO(struct task_struct *prev, struct task_struct *next), - TP_ARGS(rq, prev, next), + TP_ARGS(prev, next), TP_STRUCT__entry( __array( char, prev_comm, TASK_COMM_LEN ) -- cgit v1.2.2 From bbf1bb3eee86f2eef2baa14e600be454d09109ee Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sat, 8 May 2010 16:20:53 +0200 Subject: cpu_stop: add dummy implementation for UP When !CONFIG_SMP, cpu_stop functions weren't defined at all which could lead to build failures if UP code uses cpu_stop facility. Add dummy cpu_stop implementation for UP. The waiting variants execute the work function directly with preempt disabled and stop_one_cpu_nowait() schedules a workqueue work. Makefile and ifdefs around stop_machine implementation are updated to accomodate CONFIG_SMP && !CONFIG_STOP_MACHINE case. Signed-off-by: Tejun Heo Reported-by: Ingo Molnar --- include/linux/stop_machine.h | 69 ++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 63 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/linux/stop_machine.h b/include/linux/stop_machine.h index 0e552e72a4c4..6b524a0d02e4 100644 --- a/include/linux/stop_machine.h +++ b/include/linux/stop_machine.h @@ -6,8 +6,6 @@ #include #include -#if defined(CONFIG_STOP_MACHINE) && defined(CONFIG_SMP) - /* * stop_cpu[s]() is simplistic per-cpu maximum priority cpu * monopolization mechanism. The caller can specify a non-sleeping @@ -18,9 +16,10 @@ * up and requests are guaranteed to be served as long as the target * cpus are online. */ - typedef int (*cpu_stop_fn_t)(void *arg); +#ifdef CONFIG_SMP + struct cpu_stop_work { struct list_head list; /* cpu_stopper->works */ cpu_stop_fn_t fn; @@ -34,12 +33,70 @@ void stop_one_cpu_nowait(unsigned int cpu, cpu_stop_fn_t fn, void *arg, int stop_cpus(const struct cpumask *cpumask, cpu_stop_fn_t fn, void *arg); int try_stop_cpus(const struct cpumask *cpumask, cpu_stop_fn_t fn, void *arg); +#else /* CONFIG_SMP */ + +#include + +struct cpu_stop_work { + struct work_struct work; + cpu_stop_fn_t fn; + void *arg; +}; + +static inline int stop_one_cpu(unsigned int cpu, cpu_stop_fn_t fn, void *arg) +{ + int ret = -ENOENT; + preempt_disable(); + if (cpu == smp_processor_id()) + ret = fn(arg); + preempt_enable(); + return ret; +} + +static void stop_one_cpu_nowait_workfn(struct work_struct *work) +{ + struct cpu_stop_work *stwork = + container_of(work, struct cpu_stop_work, work); + preempt_disable(); + stwork->fn(stwork->arg); + preempt_enable(); +} + +static inline void stop_one_cpu_nowait(unsigned int cpu, + cpu_stop_fn_t fn, void *arg, + struct cpu_stop_work *work_buf) +{ + if (cpu == smp_processor_id()) { + INIT_WORK(&work_buf->work, stop_one_cpu_nowait_workfn); + work_buf->fn = fn; + work_buf->arg = arg; + schedule_work(&work_buf->work); + } +} + +static inline int stop_cpus(const struct cpumask *cpumask, + cpu_stop_fn_t fn, void *arg) +{ + if (cpumask_test_cpu(raw_smp_processor_id(), cpumask)) + return stop_one_cpu(raw_smp_processor_id(), fn, arg); + return -ENOENT; +} + +static inline int try_stop_cpus(const struct cpumask *cpumask, + cpu_stop_fn_t fn, void *arg) +{ + return stop_cpus(cpumask, fn, arg); +} + +#endif /* CONFIG_SMP */ + /* * stop_machine "Bogolock": stop the entire machine, disable * interrupts. This is a very heavy lock, which is equivalent to * grabbing every spinlock (and more). So the "read" side to such a * lock is anything which disables preeempt. */ +#if defined(CONFIG_STOP_MACHINE) && defined(CONFIG_SMP) /** * stop_machine: freeze the machine on all CPUs and run this function @@ -67,7 +124,7 @@ int stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus); */ int __stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus); -#else +#else /* CONFIG_STOP_MACHINE && CONFIG_SMP */ static inline int stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus) @@ -79,5 +136,5 @@ static inline int stop_machine(int (*fn)(void *), void *data, return ret; } -#endif /* CONFIG_SMP */ -#endif /* _LINUX_STOP_MACHINE */ +#endif /* CONFIG_STOP_MACHINE && CONFIG_SMP */ +#endif /* _LINUX_STOP_MACHINE */ -- cgit v1.2.2 From e0e37c200f1357db0dd986edb359c41c57d24f6e Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Sun, 9 May 2010 08:24:39 -0700 Subject: sched: Eliminate the ts->idle_lastupdate field Now that the only user of ts->idle_lastupdate is update_ts_time_stats(), the entire field can be eliminated. In update_ts_time_stats(), idle_lastupdate is first set to "now", and a few lines later, the only user is an if() statement that assigns a variable either to "now" or to ts->idle_lastupdate, which has the value of "now" at that point. Signed-off-by: Arjan van de Ven Signed-off-by: Andrew Morton Reviewed-by: Rik van Riel Acked-by: Peter Zijlstra Cc: davej@redhat.com LKML-Reference: <20100509082439.2fab0b4f@infradead.org> Signed-off-by: Ingo Molnar --- include/linux/tick.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include') diff --git a/include/linux/tick.h b/include/linux/tick.h index d2ae79e21be3..0343eed40619 100644 --- a/include/linux/tick.h +++ b/include/linux/tick.h @@ -60,7 +60,6 @@ struct tick_sched { ktime_t idle_waketime; ktime_t idle_exittime; ktime_t idle_sleeptime; - ktime_t idle_lastupdate; ktime_t sleep_length; unsigned long last_jiffies; unsigned long next_jiffies; -- cgit v1.2.2 From 0224cf4c5ee0d7faec83956b8e21f7d89e3df3bd Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Sun, 9 May 2010 08:25:23 -0700 Subject: sched: Intoduce get_cpu_iowait_time_us() For the ondemand cpufreq governor, it is desired that the iowait time is microaccounted in a similar way as idle time is. This patch introduces the infrastructure to account and expose this information via the get_cpu_iowait_time_us() function. [akpm@linux-foundation.org: fix CONFIG_NO_HZ=n build] Signed-off-by: Arjan van de Ven Signed-off-by: Andrew Morton Reviewed-by: Rik van Riel Acked-by: Peter Zijlstra Cc: davej@redhat.com LKML-Reference: <20100509082523.284feab6@infradead.org> Signed-off-by: Ingo Molnar --- include/linux/tick.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include') diff --git a/include/linux/tick.h b/include/linux/tick.h index 0343eed40619..b232ccc0ee29 100644 --- a/include/linux/tick.h +++ b/include/linux/tick.h @@ -42,6 +42,7 @@ enum tick_nohz_mode { * @idle_waketime: Time when the idle was interrupted * @idle_exittime: Time when the idle state was left * @idle_sleeptime: Sum of the time slept in idle with sched tick stopped + * @iowait_sleeptime: Sum of the time slept in idle with sched tick stopped, with IO outstanding * @sleep_length: Duration of the current idle sleep * @do_timer_lst: CPU was the last one doing do_timer before going idle */ @@ -60,6 +61,7 @@ struct tick_sched { ktime_t idle_waketime; ktime_t idle_exittime; ktime_t idle_sleeptime; + ktime_t iowait_sleeptime; ktime_t sleep_length; unsigned long last_jiffies; unsigned long next_jiffies; @@ -123,6 +125,7 @@ extern void tick_nohz_stop_sched_tick(int inidle); extern void tick_nohz_restart_sched_tick(void); extern ktime_t tick_nohz_get_sleep_length(void); extern u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time); +extern u64 get_cpu_iowait_time_us(int cpu, u64 *last_update_time); # else static inline void tick_nohz_stop_sched_tick(int inidle) { } static inline void tick_nohz_restart_sched_tick(void) { } @@ -133,6 +136,7 @@ static inline ktime_t tick_nohz_get_sleep_length(void) return len; } static inline u64 get_cpu_idle_time_us(int cpu, u64 *unused) { return -1; } +static inline u64 get_cpu_iowait_time_us(int cpu, u64 *unused) { return -1; } # endif /* !NO_HZ */ #endif -- cgit v1.2.2 From a93d2f1744206827ccf416e2cdc5018aa503314e Mon Sep 17 00:00:00 2001 From: Changli Gao Date: Fri, 7 May 2010 14:33:26 +0800 Subject: sched, wait: Use wrapper functions epoll should not touch flags in wait_queue_t. This patch introduces a new function __add_wait_queue_exclusive(), for the users, who use wait queue as a LIFO queue. __add_wait_queue_tail_exclusive() is introduced too instead of add_wait_queue_exclusive_locked(). remove_wait_queue_locked() is removed, as it is a duplicate of __remove_wait_queue(), disliked by users, and with less users. Signed-off-by: Changli Gao Signed-off-by: Peter Zijlstra Cc: Alexander Viro Cc: Paul Menage Cc: Li Zefan Cc: Davide Libenzi Cc: LKML-Reference: <1273214006-2979-1-git-send-email-xiaosuo@gmail.com> Signed-off-by: Ingo Molnar --- include/linux/wait.h | 35 +++++++++++++++-------------------- 1 file changed, 15 insertions(+), 20 deletions(-) (limited to 'include') diff --git a/include/linux/wait.h b/include/linux/wait.h index a48e16b77d5e..76d96d035ea0 100644 --- a/include/linux/wait.h +++ b/include/linux/wait.h @@ -127,12 +127,26 @@ static inline void __add_wait_queue(wait_queue_head_t *head, wait_queue_t *new) /* * Used for wake-one threads: */ +static inline void __add_wait_queue_exclusive(wait_queue_head_t *q, + wait_queue_t *wait) +{ + wait->flags |= WQ_FLAG_EXCLUSIVE; + __add_wait_queue(q, wait); +} + static inline void __add_wait_queue_tail(wait_queue_head_t *head, - wait_queue_t *new) + wait_queue_t *new) { list_add_tail(&new->task_list, &head->task_list); } +static inline void __add_wait_queue_tail_exclusive(wait_queue_head_t *q, + wait_queue_t *wait) +{ + wait->flags |= WQ_FLAG_EXCLUSIVE; + __add_wait_queue_tail(q, wait); +} + static inline void __remove_wait_queue(wait_queue_head_t *head, wait_queue_t *old) { @@ -403,25 +417,6 @@ do { \ __ret; \ }) -/* - * Must be called with the spinlock in the wait_queue_head_t held. - */ -static inline void add_wait_queue_exclusive_locked(wait_queue_head_t *q, - wait_queue_t * wait) -{ - wait->flags |= WQ_FLAG_EXCLUSIVE; - __add_wait_queue_tail(q, wait); -} - -/* - * Must be called with the spinlock in the wait_queue_head_t held. - */ -static inline void remove_wait_queue_locked(wait_queue_head_t *q, - wait_queue_t * wait) -{ - __remove_wait_queue(q, wait); -} - /* * These are the old interfaces to sleep waiting for an event. * They are racy. DO NOT use them, use the wait_event* interfaces above. -- cgit v1.2.2