From 41acab8851a0408c1d5ad6c21a07456f88b54d40 Mon Sep 17 00:00:00 2001
From: Lucas De Marchi <lucas.de.marchi@gmail.com>
Date: Wed, 10 Mar 2010 23:37:45 -0300
Subject: sched: Implement group scheduler statistics in one struct

Put all statistic fields of sched_entity in one struct, sched_statistics,
and embed it into sched_entity.

This change allows to memset the sched_statistics to 0 when needed (for
instance when forking), avoiding bugs of non initialized fields.

Signed-off-by: Lucas De Marchi <lucas.de.marchi@gmail.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <1268275065-18542-1-git-send-email-lucas.de.marchi@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/sched.h | 54 ++++++++++++++++++++++++---------------------------
 1 file changed, 25 insertions(+), 29 deletions(-)

(limited to 'include')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 4b1753f7e48..8cc863d6647 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1127,36 +1127,8 @@ struct load_weight {
 	unsigned long weight, inv_weight;
 };
 
-/*
- * CFS stats for a schedulable entity (task, task-group etc)
- *
- * Current field usage histogram:
- *
- *     4 se->block_start
- *     4 se->run_node
- *     4 se->sleep_start
- *     6 se->load.weight
- */
-struct sched_entity {
-	struct load_weight	load;		/* for load-balancing */
-	struct rb_node		run_node;
-	struct list_head	group_node;
-	unsigned int		on_rq;
-
-	u64			exec_start;
-	u64			sum_exec_runtime;
-	u64			vruntime;
-	u64			prev_sum_exec_runtime;
-
-	u64			last_wakeup;
-	u64			avg_overlap;
-
-	u64			nr_migrations;
-
-	u64			start_runtime;
-	u64			avg_wakeup;
-
 #ifdef CONFIG_SCHEDSTATS
+struct sched_statistics {
 	u64			wait_start;
 	u64			wait_max;
 	u64			wait_count;
@@ -1188,6 +1160,30 @@ struct sched_entity {
 	u64			nr_wakeups_affine_attempts;
 	u64			nr_wakeups_passive;
 	u64			nr_wakeups_idle;
+};
+#endif
+
+struct sched_entity {
+	struct load_weight	load;		/* for load-balancing */
+	struct rb_node		run_node;
+	struct list_head	group_node;
+	unsigned int		on_rq;
+
+	u64			exec_start;
+	u64			sum_exec_runtime;
+	u64			vruntime;
+	u64			prev_sum_exec_runtime;
+
+	u64			last_wakeup;
+	u64			avg_overlap;
+
+	u64			nr_migrations;
+
+	u64			start_runtime;
+	u64			avg_wakeup;
+
+#ifdef CONFIG_SCHEDSTATS
+	struct sched_statistics statistics;
 #endif
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
-- 
cgit v1.2.2


From 39c0cbe2150cbd848a25ba6cdb271d1ad46818ad Mon Sep 17 00:00:00 2001
From: Mike Galbraith <efault@gmx.de>
Date: Thu, 11 Mar 2010 17:17:13 +0100
Subject: sched: Rate-limit nohz

Entering nohz code on every micro-idle is costing ~10% throughput for netperf
TCP_RR when scheduling cross-cpu.  Rate limiting entry fixes this, but raises
ticks a bit.  On my Q6600, an idle box goes from ~85 interrupts/sec to 128.

The higher the context switch rate, the more nohz entry costs.  With this patch
and some cycle recovery patches in my tree, max cross cpu context switch rate is
improved by ~16%, a large portion of which of which is this ratelimiting.

Signed-off-by: Mike Galbraith <efault@gmx.de>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <1268301003.6785.28.camel@marge.simson.net>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/sched.h | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'include')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 8cc863d6647..13efe7dac5f 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -271,11 +271,17 @@ extern cpumask_var_t nohz_cpu_mask;
 #if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ)
 extern int select_nohz_load_balancer(int cpu);
 extern int get_nohz_load_balancer(void);
+extern int nohz_ratelimit(int cpu);
 #else
 static inline int select_nohz_load_balancer(int cpu)
 {
 	return 0;
 }
+
+static inline int nohz_ratelimit(int cpu)
+{
+	return 0;
+}
 #endif
 
 /*
-- 
cgit v1.2.2


From b42e0c41a422a212ddea0666d5a3a0e3c35206db Mon Sep 17 00:00:00 2001
From: Mike Galbraith <efault@gmx.de>
Date: Thu, 11 Mar 2010 17:15:38 +0100
Subject: sched: Remove avg_wakeup

Testing the load which led to this heuristic (nfs4 kbuild) shows that it has
outlived it's usefullness.  With intervening load balancing changes, I cannot
see any difference with/without, so recover there fastpath cycles.

Signed-off-by: Mike Galbraith <efault@gmx.de>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <1268301062.6785.29.camel@marge.simson.net>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/sched.h | 3 ---
 1 file changed, 3 deletions(-)

(limited to 'include')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 13efe7dac5f..70c560f5ada 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1185,9 +1185,6 @@ struct sched_entity {
 
 	u64			nr_migrations;
 
-	u64			start_runtime;
-	u64			avg_wakeup;
-
 #ifdef CONFIG_SCHEDSTATS
 	struct sched_statistics statistics;
 #endif
-- 
cgit v1.2.2


From e12f31d3e5d36328c7fbd0fce40a95e70b59152c Mon Sep 17 00:00:00 2001
From: Mike Galbraith <efault@gmx.de>
Date: Thu, 11 Mar 2010 17:15:51 +0100
Subject: sched: Remove avg_overlap

Both avg_overlap and avg_wakeup had an inherent problem in that their accuracy
was detrimentally affected by cross-cpu wakeups, this because we are missing
the necessary call to update_curr().  This can't be fixed without increasing
overhead in our already too fat fastpath.

Additionally, with recent load balancing changes making us prefer to place tasks
in an idle cache domain (which is good for compute bound loads), communicating
tasks suffer when a sync wakeup, which would enable affine placement, is turned
into a non-sync wakeup by SYNC_LESS.  With one task on the runqueue, wake_affine()
rejects the affine wakeup request, leaving the unfortunate where placed, taking
frequent cache misses.

Remove it, and recover some fastpath cycles.

Signed-off-by: Mike Galbraith <efault@gmx.de>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <1268301121.6785.30.camel@marge.simson.net>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/sched.h | 3 ---
 1 file changed, 3 deletions(-)

(limited to 'include')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 70c560f5ada..8604884cee8 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1180,9 +1180,6 @@ struct sched_entity {
 	u64			vruntime;
 	u64			prev_sum_exec_runtime;
 
-	u64			last_wakeup;
-	u64			avg_overlap;
-
 	u64			nr_migrations;
 
 #ifdef CONFIG_SCHEDSTATS
-- 
cgit v1.2.2


From 897f0b3c3ff40b443c84e271bef19bd6ae885195 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Mon, 15 Mar 2010 10:10:03 +0100
Subject: sched: Kill the broken and deadlockable
 cpuset_lock/cpuset_cpus_allowed_locked code

This patch just states the fact the cpusets/cpuhotplug interaction is
broken and removes the deadlockable code which only pretends to work.

- cpuset_lock() doesn't really work. It is needed for
  cpuset_cpus_allowed_locked() but we can't take this lock in
  try_to_wake_up()->select_fallback_rq() path.

- cpuset_lock() is deadlockable. Suppose that a task T bound to CPU takes
  callback_mutex. If cpu_down(CPU) happens before T drops callback_mutex
  stop_machine() preempts T, then migration_call(CPU_DEAD) tries to take
  cpuset_lock() and hangs forever because CPU is already dead and thus
  T can't be scheduled.

- cpuset_cpus_allowed_locked() is deadlockable too. It takes task_lock()
  which is not irq-safe, but try_to_wake_up() can be called from irq.

Kill them, and change select_fallback_rq() to use cpu_possible_mask, like
we currently do without CONFIG_CPUSETS.

Also, with or without this patch, with or without CONFIG_CPUSETS, the
callers of select_fallback_rq() can race with each other or with
set_cpus_allowed() pathes.

The subsequent patches try to to fix these problems.

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <20100315091003.GA9123@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/cpuset.h | 13 -------------
 1 file changed, 13 deletions(-)

(limited to 'include')

diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h
index a5740fc4d04..eeaaee746be 100644
--- a/include/linux/cpuset.h
+++ b/include/linux/cpuset.h
@@ -21,8 +21,6 @@ extern int number_of_cpusets;	/* How many cpusets are defined in system? */
 extern int cpuset_init(void);
 extern void cpuset_init_smp(void);
 extern void cpuset_cpus_allowed(struct task_struct *p, struct cpumask *mask);
-extern void cpuset_cpus_allowed_locked(struct task_struct *p,
-				       struct cpumask *mask);
 extern nodemask_t cpuset_mems_allowed(struct task_struct *p);
 #define cpuset_current_mems_allowed (current->mems_allowed)
 void cpuset_init_current_mems_allowed(void);
@@ -69,9 +67,6 @@ struct seq_file;
 extern void cpuset_task_status_allowed(struct seq_file *m,
 					struct task_struct *task);
 
-extern void cpuset_lock(void);
-extern void cpuset_unlock(void);
-
 extern int cpuset_mem_spread_node(void);
 
 static inline int cpuset_do_page_mem_spread(void)
@@ -105,11 +100,6 @@ static inline void cpuset_cpus_allowed(struct task_struct *p,
 {
 	cpumask_copy(mask, cpu_possible_mask);
 }
-static inline void cpuset_cpus_allowed_locked(struct task_struct *p,
-					      struct cpumask *mask)
-{
-	cpumask_copy(mask, cpu_possible_mask);
-}
 
 static inline nodemask_t cpuset_mems_allowed(struct task_struct *p)
 {
@@ -157,9 +147,6 @@ static inline void cpuset_task_status_allowed(struct seq_file *m,
 {
 }
 
-static inline void cpuset_lock(void) {}
-static inline void cpuset_unlock(void) {}
-
 static inline int cpuset_mem_spread_node(void)
 {
 	return 0;
-- 
cgit v1.2.2


From 6a1bdc1b577ebcb65f6603c57f8347309bc4ab13 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Mon, 15 Mar 2010 10:10:23 +0100
Subject: sched: _cpu_down(): Don't play with current->cpus_allowed

_cpu_down() changes the current task's affinity and then recovers it at
the end. The problems are well known: we can't restore old_allowed if it
was bound to the now-dead-cpu, and we can race with the userspace which
can change cpu-affinity during unplug.

_cpu_down() should not play with current->cpus_allowed at all. Instead,
take_cpu_down() can migrate the caller of _cpu_down() after __cpu_disable()
removes the dying cpu from cpu_online_mask.

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Acked-by: Rafael J. Wysocki <rjw@sisk.pl>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <20100315091023.GA9148@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/sched.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 43c94515273..8bea40725c7 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1843,6 +1843,7 @@ extern void sched_clock_idle_sleep_event(void);
 extern void sched_clock_idle_wakeup_event(u64 delta_ns);
 
 #ifdef CONFIG_HOTPLUG_CPU
+extern void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p);
 extern void idle_task_exit(void);
 #else
 static inline void idle_task_exit(void) {}
-- 
cgit v1.2.2


From 9084bb8246ea935b98320554229e2f371f7f52fa Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Mon, 15 Mar 2010 10:10:27 +0100
Subject: sched: Make select_fallback_rq() cpuset friendly

Introduce cpuset_cpus_allowed_fallback() helper to fix the cpuset problems
with select_fallback_rq(). It can be called from any context and can't use
any cpuset locks including task_lock(). It is called when the task doesn't
have online cpus in ->cpus_allowed but ttwu/etc must be able to find a
suitable cpu.

I am not proud of this patch. Everything which needs such a fat comment
can't be good even if correct. But I'd prefer to not change the locking
rules in the code I hardly understand, and in any case I believe this
simple change make the code much more correct compared to deadlocks we
currently have.

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <20100315091027.GA9155@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/cpuset.h | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'include')

diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h
index eeaaee746be..a73454aec33 100644
--- a/include/linux/cpuset.h
+++ b/include/linux/cpuset.h
@@ -21,6 +21,7 @@ extern int number_of_cpusets;	/* How many cpusets are defined in system? */
 extern int cpuset_init(void);
 extern void cpuset_init_smp(void);
 extern void cpuset_cpus_allowed(struct task_struct *p, struct cpumask *mask);
+extern int cpuset_cpus_allowed_fallback(struct task_struct *p);
 extern nodemask_t cpuset_mems_allowed(struct task_struct *p);
 #define cpuset_current_mems_allowed (current->mems_allowed)
 void cpuset_init_current_mems_allowed(void);
@@ -101,6 +102,12 @@ static inline void cpuset_cpus_allowed(struct task_struct *p,
 	cpumask_copy(mask, cpu_possible_mask);
 }
 
+static inline int cpuset_cpus_allowed_fallback(struct task_struct *p)
+{
+	cpumask_copy(&p->cpus_allowed, cpu_possible_mask);
+	return cpumask_any(cpu_active_mask);
+}
+
 static inline nodemask_t cpuset_mems_allowed(struct task_struct *p)
 {
 	return node_possible_map;
-- 
cgit v1.2.2


From 0017d735092844118bef006696a750a0e4ef6ebd Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Wed, 24 Mar 2010 18:34:10 +0100
Subject: sched: Fix TASK_WAKING vs fork deadlock

Oleg noticed a few races with the TASK_WAKING usage on fork.

 - since TASK_WAKING is basically a spinlock, it should be IRQ safe
 - since we set TASK_WAKING (*) without holding rq->lock it could
   be there still is a rq->lock holder, thereby not actually
   providing full serialization.

(*) in fact we clear PF_STARTING, which in effect enables TASK_WAKING.

Cure the second issue by not setting TASK_WAKING in sched_fork(), but
only temporarily in wake_up_new_task() while calling select_task_rq().

Cure the first by holding rq->lock around the select_task_rq() call,
this will disable IRQs, this however requires that we push down the
rq->lock release into select_task_rq_fair()'s cgroup stuff.

Because select_task_rq_fair() still needs to drop the rq->lock we
cannot fully get rid of TASK_WAKING.

Reported-by: Oleg Nesterov <oleg@redhat.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/sched.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 8bea40725c7..fb6c18843ee 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1046,7 +1046,8 @@ struct sched_class {
 	void (*put_prev_task) (struct rq *rq, struct task_struct *p);
 
 #ifdef CONFIG_SMP
-	int  (*select_task_rq)(struct task_struct *p, int sd_flag, int flags);
+	int  (*select_task_rq)(struct rq *rq, struct task_struct *p,
+			       int sd_flag, int flags);
 
 	void (*pre_schedule) (struct rq *this_rq, struct task_struct *task);
 	void (*post_schedule) (struct rq *this_rq);
-- 
cgit v1.2.2


From 371fd7e7a56a5c136d31aa980011bd2f131c3ef5 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Wed, 24 Mar 2010 16:38:48 +0100
Subject: sched: Add enqueue/dequeue flags

In order to reduce the dependency on TASK_WAKING rework the enqueue
interface to support a proper flags field.

Replace the int wakeup, bool head arguments with an int flags argument
and create the following flags:

  ENQUEUE_WAKEUP - the enqueue is a wakeup of a sleeping task,
  ENQUEUE_WAKING - the enqueue has relative vruntime due to
                   having sched_class::task_waking() called,
  ENQUEUE_HEAD - the waking task should be places on the head
                 of the priority queue (where appropriate).

For symmetry also convert sched_class::dequeue() to a flags scheme.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/sched.h | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index fb6c18843ee..e3e900f318d 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1032,12 +1032,17 @@ struct sched_domain;
 #define WF_SYNC		0x01		/* waker goes to sleep after wakup */
 #define WF_FORK		0x02		/* child wakeup after fork */
 
+#define ENQUEUE_WAKEUP		1
+#define ENQUEUE_WAKING		2
+#define ENQUEUE_HEAD		4
+
+#define DEQUEUE_SLEEP		1
+
 struct sched_class {
 	const struct sched_class *next;
 
-	void (*enqueue_task) (struct rq *rq, struct task_struct *p, int wakeup,
-			      bool head);
-	void (*dequeue_task) (struct rq *rq, struct task_struct *p, int sleep);
+	void (*enqueue_task) (struct rq *rq, struct task_struct *p, int flags);
+	void (*dequeue_task) (struct rq *rq, struct task_struct *p, int flags);
 	void (*yield_task) (struct rq *rq);
 
 	void (*check_preempt_curr) (struct rq *rq, struct task_struct *p, int flags);
-- 
cgit v1.2.2


From a2612cb16d4d8447793609cbdd2a2f4f156c0020 Mon Sep 17 00:00:00 2001
From: Stefan Richter <stefanr@s5r6.in-berlin.de>
Date: Thu, 15 Apr 2010 22:16:04 +0200
Subject: firewire: cdev: fix cut+paste mistake in disclaimer

This was supposed to be generic "authors or copyright holders";
I mistakenly picked up text from a wrong file.

Reported-by: Daniel K. <dk@uw.no>
Signed-off-by: Stefan Richter <stefanr@s5r6.in-berlin.de>
---
 include/linux/firewire-cdev.h      | 2 +-
 include/linux/firewire-constants.h | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/linux/firewire-cdev.h b/include/linux/firewire-cdev.h
index 81f3b14d5d7..68f883b30a5 100644
--- a/include/linux/firewire-cdev.h
+++ b/include/linux/firewire-cdev.h
@@ -17,7 +17,7 @@
  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * PRECISION INSIGHT AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
  * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
  * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
  * DEALINGS IN THE SOFTWARE.
diff --git a/include/linux/firewire-constants.h b/include/linux/firewire-constants.h
index 9c63f06e67f..9b4bb5fbba4 100644
--- a/include/linux/firewire-constants.h
+++ b/include/linux/firewire-constants.h
@@ -17,7 +17,7 @@
  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * PRECISION INSIGHT AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
  * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
  * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
  * DEALINGS IN THE SOFTWARE.
-- 
cgit v1.2.2


From be1a50d4eba4cdb3ebf9d97a0a8693c153436775 Mon Sep 17 00:00:00 2001
From: Jean Delvare <khali@linux-fr.org>
Date: Sat, 3 Apr 2010 17:37:45 +0200
Subject: regulator: Let drivers know when they use the stub API

Have the stub variant of regulator_get() return NULL, so that drivers
can (but still don't have to) handle this case specifically.

Signed-off-by: Jean Delvare <khali@linux-fr.org>
Cc: Mark Brown <broonie@opensource.wolfsonmicro.com>
Cc: Jerome Oufella <jerome.oufella@savoirfairelinux.com>
Acked-by: Mark Brown <broonie@opensource.wolfsonmicro.com>
Signed-off-by: Liam Girdwood <lrg@slimlogic.co.uk>
---
 include/linux/regulator/consumer.h | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/linux/regulator/consumer.h b/include/linux/regulator/consumer.h
index 28c9fd020d3..ebd74726529 100644
--- a/include/linux/regulator/consumer.h
+++ b/include/linux/regulator/consumer.h
@@ -183,9 +183,13 @@ static inline struct regulator *__must_check regulator_get(struct device *dev,
 {
 	/* Nothing except the stubbed out regulator API should be
 	 * looking at the value except to check if it is an error
-	 * value so the actual return value doesn't matter.
+	 * value. Drivers are free to handle NULL specifically by
+	 * skipping all regulator API calls, but they don't have to.
+	 * Drivers which don't, should make sure they properly handle
+	 * corner cases of the API, such as regulator_get_voltage()
+	 * returning 0.
 	 */
-	return (struct regulator *)id;
+	return NULL;
 }
 static inline void regulator_put(struct regulator *regulator)
 {
-- 
cgit v1.2.2


From 87bf6e7de1134f48681fd2ce4b7c1ec45458cb6d Mon Sep 17 00:00:00 2001
From: Takuya Yoshikawa <yoshikawa.takuya@oss.ntt.co.jp>
Date: Mon, 12 Apr 2010 19:35:35 +0900
Subject: KVM: fix the handling of dirty bitmaps to avoid overflows

Int is not long enough to store the size of a dirty bitmap.

This patch fixes this problem with the introduction of a wrapper
function to calculate the sizes of dirty bitmaps.

Note: in mark_page_dirty(), we have to consider the fact that
  __set_bit() takes the offset as int, not long.

Signed-off-by: Takuya Yoshikawa <yoshikawa.takuya@oss.ntt.co.jp>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 include/linux/kvm_host.h | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'include')

diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index a3fd0f91d94..9ad825e1c79 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -119,6 +119,11 @@ struct kvm_memory_slot {
 	int user_alloc;
 };
 
+static inline unsigned long kvm_dirty_bitmap_bytes(struct kvm_memory_slot *memslot)
+{
+	return ALIGN(memslot->npages, BITS_PER_LONG) / 8;
+}
+
 struct kvm_kernel_irq_routing_entry {
 	u32 gsi;
 	u32 type;
-- 
cgit v1.2.2


From e80e2a60ff7914dae691345a976c80bbbff3ec74 Mon Sep 17 00:00:00 2001
From: Sridhar Samudrala <sri@us.ibm.com>
Date: Tue, 30 Mar 2010 16:48:25 -0700
Subject: KVM: Increase NR_IOBUS_DEVS limit to 200

This patch increases the current hardcoded limit of NR_IOBUS_DEVS
from 6 to 200. We are hitting this limit when creating a guest with more
than 1 virtio-net device using vhost-net backend. Each virtio-net
device requires 2 such devices to service notifications from rx/tx queues.

Signed-off-by: Sridhar Samudrala <sri@us.ibm.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 include/linux/kvm_host.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 9ad825e1c79..169d07758ee 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -54,7 +54,7 @@ extern struct kmem_cache *kvm_vcpu_cache;
  */
 struct kvm_io_bus {
 	int                   dev_count;
-#define NR_IOBUS_DEVS 6
+#define NR_IOBUS_DEVS 200
 	struct kvm_io_device *devs[NR_IOBUS_DEVS];
 };
 
-- 
cgit v1.2.2


From 04de0816173c86948b75da93a6344a0a02bbec4d Mon Sep 17 00:00:00 2001
From: Dominik Brodowski <linux@dominikbrodowski.net>
Date: Tue, 20 Apr 2010 14:49:01 +0200
Subject: pcmcia: pcmcia_dev_present bugfix

pcmcia_dev_present is in and by itself buggy. Add a note specifying
why it is broken, and replace the broken locking -- taking a mutex
is a bad idea in IRQ context, from which this function is rarely
called -- by an atomic_t.

Signed-off-by: Dominik Brodowski <linux@dominikbrodowski.net>
---
 include/pcmcia/ds.h | 7 +++----
 include/pcmcia/ss.h | 8 +++-----
 2 files changed, 6 insertions(+), 9 deletions(-)

(limited to 'include')

diff --git a/include/pcmcia/ds.h b/include/pcmcia/ds.h
index d57847f2f6c..aab3c13dc31 100644
--- a/include/pcmcia/ds.h
+++ b/include/pcmcia/ds.h
@@ -26,6 +26,7 @@
 #ifdef __KERNEL__
 #include <linux/device.h>
 #include <pcmcia/ss.h>
+#include <asm/atomic.h>
 
 /*
  * PCMCIA device drivers (16-bit cards only; 32-bit cards require CardBus
@@ -94,10 +95,8 @@ struct pcmcia_device {
 	config_req_t		conf;
 	window_handle_t		win;
 
-	/* Is the device suspended, or in the process of
-	 * being removed? */
+	/* Is the device suspended? */
 	u16			suspended:1;
-	u16			_removed:1;
 
 	/* Flags whether io, irq, win configurations were
 	 * requested, and whether the configuration is "locked" */
@@ -115,7 +114,7 @@ struct pcmcia_device {
 	u16			has_card_id:1;
 	u16			has_func_id:1;
 
-	u16			reserved:3;
+	u16			reserved:4;
 
 	u8			func_id;
 	u16			manf_id;
diff --git a/include/pcmcia/ss.h b/include/pcmcia/ss.h
index 2e488b60bc7..344705cb42f 100644
--- a/include/pcmcia/ss.h
+++ b/include/pcmcia/ss.h
@@ -224,18 +224,16 @@ struct pcmcia_socket {
 
 	/* 16-bit state: */
 	struct {
-		/* PCMCIA card is present in socket */
-		u8			present:1;
 		/* "master" ioctl is used */
 		u8			busy:1;
-		/* pcmcia module is being unloaded */
-		u8			dead:1;
 		/* the PCMCIA card consists of two pseudo devices */
 		u8			has_pfc:1;
 
-		u8			reserved:4;
+		u8			reserved:6;
 	} pcmcia_state;
 
+	/* non-zero if PCMCIA card is present */
+	atomic_t			present;
 
 #ifdef CONFIG_PCMCIA_IOCTL
 	struct user_info_t		*user;
-- 
cgit v1.2.2


From c3c532061e46156e8aab1268f38d66cfb63aeb2d Mon Sep 17 00:00:00 2001
From: Jens Axboe <jens.axboe@oracle.com>
Date: Thu, 22 Apr 2010 11:37:01 +0200
Subject: bdi: add helper function for doing init and register of a bdi for a
 file system

Pretty trivial helper, just sets up the bdi and registers it. An atomic
sequence count is used to ensure that the registered sysfs names are
unique.

Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 include/linux/backing-dev.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include')

diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h
index fcbc26af00e..e19c677f219 100644
--- a/include/linux/backing-dev.h
+++ b/include/linux/backing-dev.h
@@ -101,6 +101,7 @@ int bdi_register(struct backing_dev_info *bdi, struct device *parent,
 		const char *fmt, ...);
 int bdi_register_dev(struct backing_dev_info *bdi, dev_t dev);
 void bdi_unregister(struct backing_dev_info *bdi);
+int bdi_setup_and_register(struct backing_dev_info *, char *, unsigned int);
 void bdi_start_writeback(struct backing_dev_info *bdi, struct super_block *sb,
 				long nr_pages);
 int bdi_writeback_task(struct bdi_writeback *wb);
-- 
cgit v1.2.2


From 5163d90076729413cb882d3dd5c3d3cfb5b9f035 Mon Sep 17 00:00:00 2001
From: Jens Axboe <jens.axboe@oracle.com>
Date: Thu, 22 Apr 2010 12:12:40 +0200
Subject: coda: add bdi backing to mount session

This ensures that dirty data gets flushed properly.

Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 include/linux/coda_psdev.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include')

diff --git a/include/linux/coda_psdev.h b/include/linux/coda_psdev.h
index 5b5d4731f95..644062e8d85 100644
--- a/include/linux/coda_psdev.h
+++ b/include/linux/coda_psdev.h
@@ -1,6 +1,7 @@
 #ifndef __CODA_PSDEV_H
 #define __CODA_PSDEV_H
 
+#include <linux/backing-dev.h>
 #include <linux/magic.h>
 
 #define CODA_PSDEV_MAJOR 67
@@ -17,6 +18,7 @@ struct venus_comm {
 	struct list_head    vc_processing;
 	int                 vc_inuse;
 	struct super_block *vc_sb;
+	struct backing_dev_info bdi;
 };
 
 
-- 
cgit v1.2.2


From f1970c73cbb6b884152207e4dfe90639f5029905 Mon Sep 17 00:00:00 2001
From: Jens Axboe <jens.axboe@oracle.com>
Date: Thu, 22 Apr 2010 12:31:11 +0200
Subject: ncpfs: add bdi backing to mount session

This ensures that dirty data gets flushed properly.

Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 include/linux/ncp_fs_sb.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include')

diff --git a/include/linux/ncp_fs_sb.h b/include/linux/ncp_fs_sb.h
index 6330fc76b00..5ec9ca67168 100644
--- a/include/linux/ncp_fs_sb.h
+++ b/include/linux/ncp_fs_sb.h
@@ -12,6 +12,7 @@
 #include <linux/ncp_mount.h>
 #include <linux/net.h>
 #include <linux/mutex.h>
+#include <linux/backing-dev.h>
 
 #ifdef __KERNEL__
 
@@ -127,6 +128,7 @@ struct ncp_server {
 		size_t len;
 		__u8 data[128];
 	} unexpected_packet;
+	struct backing_dev_info bdi;
 };
 
 extern void ncp_tcp_rcv_proc(struct work_struct *work);
-- 
cgit v1.2.2


From 424264b7b220e8eee165dc3080ae48692af73dec Mon Sep 17 00:00:00 2001
From: Jens Axboe <jens.axboe@oracle.com>
Date: Thu, 22 Apr 2010 12:37:07 +0200
Subject: smbfs: add bdi backing to mount session

This ensures that dirty data gets flushed properly.

Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 include/linux/smb_fs_sb.h | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'include')

diff --git a/include/linux/smb_fs_sb.h b/include/linux/smb_fs_sb.h
index 8a060a7040d..bb947dd1fba 100644
--- a/include/linux/smb_fs_sb.h
+++ b/include/linux/smb_fs_sb.h
@@ -10,6 +10,7 @@
 #define _SMB_FS_SB
 
 #include <linux/types.h>
+#include <linux/backing-dev.h>
 #include <linux/smb.h>
 
 /*
@@ -74,6 +75,8 @@ struct smb_sb_info {
 	struct smb_ops *ops;
 
 	struct super_block *super_block;
+
+	struct backing_dev_info bdi;
 };
 
 static inline int
-- 
cgit v1.2.2


From 71d0a6112a363e703e383ae5b12c492485c39701 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Thu, 22 Apr 2010 15:35:57 -0400
Subject: NFS: Fix an unstable write data integrity race

Commit 2c61be0a9478258f77b66208a0c4b1f5f8161c3c (NFS: Ensure that the WRITE
and COMMIT RPC calls are always uninterruptible) exposed a race on file
close. In order to ensure correct close-to-open behaviour, we want to wait
for all outstanding background commit operations to complete.

This patch adds an inode flag that indicates if a commit operation is under
way, and provides a mechanism to allow ->write_inode() to wait for its
completion if this is a data integrity flush.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 include/linux/nfs_fs.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include')

diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h
index 1a0b85aa151..07ce4609fe5 100644
--- a/include/linux/nfs_fs.h
+++ b/include/linux/nfs_fs.h
@@ -209,6 +209,7 @@ struct nfs_inode {
 #define NFS_INO_FLUSHING	(4)		/* inode is flushing out data */
 #define NFS_INO_FSCACHE		(5)		/* inode can be cached by FS-Cache */
 #define NFS_INO_FSCACHE_LOCK	(6)		/* FS-Cache cookie management lock */
+#define NFS_INO_COMMIT		(7)		/* inode is committing unstable writes */
 
 static inline struct nfs_inode *NFS_I(const struct inode *inode)
 {
-- 
cgit v1.2.2


From 669c55e9f99b90e46eaa0f98a67ec53d46dc969a Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Fri, 16 Apr 2010 14:59:29 +0200
Subject: sched: Pre-compute cpumask_weight(sched_domain_span(sd))

Dave reported that his large SPARC machines spend lots of time in
hweight64(), try and optimize some of those needless cpumask_weight()
invocations (esp. with the large offstack cpumasks these are very
expensive indeed).

Reported-by: David Miller <davem@davemloft.net>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/sched.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index e3e900f318d..dfea40574b2 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -960,6 +960,7 @@ struct sched_domain {
 	char *name;
 #endif
 
+	unsigned int span_weight;
 	/*
 	 * Span of all CPUs in this domain.
 	 *
-- 
cgit v1.2.2


From 3a3076f4d6e2fa31338a0b007df42a3b32f079e0 Mon Sep 17 00:00:00 2001
From: Josef Bacik <josef@redhat.com>
Date: Fri, 23 Apr 2010 12:17:17 -0400
Subject: Cleanup generic block based fiemap

This cleans up a few of the complaints of __generic_block_fiemap.  I've
fixed all the typing stuff, used inline functions instead of macros,
gotten rid of a couple of variables, and made sure the size and block
requests are all block aligned.  It also fixes a problem where sometimes
FIEMAP_EXTENT_LAST wasn't being set properly.

Signed-off-by: Josef Bacik <josef@redhat.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/fs.h | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/linux/fs.h b/include/linux/fs.h
index 39d57bc6cc7..44f35aea2f1 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2315,8 +2315,9 @@ extern int vfs_fstatat(int , char __user *, struct kstat *, int);
 extern int do_vfs_ioctl(struct file *filp, unsigned int fd, unsigned int cmd,
 		    unsigned long arg);
 extern int __generic_block_fiemap(struct inode *inode,
-				  struct fiemap_extent_info *fieinfo, u64 start,
-				  u64 len, get_block_t *get_block);
+				  struct fiemap_extent_info *fieinfo,
+				  loff_t start, loff_t len,
+				  get_block_t *get_block);
 extern int generic_block_fiemap(struct inode *inode,
 				struct fiemap_extent_info *fieinfo, u64 start,
 				u64 len, get_block_t *get_block);
-- 
cgit v1.2.2


From 23be7468e8802a2ac1de6ee3eecb3ec7f14dc703 Mon Sep 17 00:00:00 2001
From: Mel Gorman <mel@csn.ul.ie>
Date: Fri, 23 Apr 2010 13:17:56 -0400
Subject: hugetlb: fix infinite loop in get_futex_key() when backed by huge
 pages

If a futex key happens to be located within a huge page mapped
MAP_PRIVATE, get_futex_key() can go into an infinite loop waiting for a
page->mapping that will never exist.

See https://bugzilla.redhat.com/show_bug.cgi?id=552257 for more details
about the problem.

This patch makes page->mapping a poisoned value that includes
PAGE_MAPPING_ANON mapped MAP_PRIVATE.  This is enough for futex to
continue but because of PAGE_MAPPING_ANON, the poisoned value is not
dereferenced or used by futex.  No other part of the VM should be
dereferencing the page->mapping of a hugetlbfs page as its page cache is
not on the LRU.

This patch fixes the problem with the test case described in the bugzilla.

[akpm@linux-foundation.org: mel cant spel]
Signed-off-by: Mel Gorman <mel@csn.ul.ie>
Acked-by: Peter Zijlstra <peterz@infradead.org>
Acked-by: Darren Hart <darren@dvhart.com>
Cc: <stable@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/poison.h | 9 +++++++++
 1 file changed, 9 insertions(+)

(limited to 'include')

diff --git a/include/linux/poison.h b/include/linux/poison.h
index 2110a81c5e2..34066ffd893 100644
--- a/include/linux/poison.h
+++ b/include/linux/poison.h
@@ -48,6 +48,15 @@
 #define POISON_FREE	0x6b	/* for use-after-free poisoning */
 #define	POISON_END	0xa5	/* end-byte of poisoning */
 
+/********** mm/hugetlb.c **********/
+/*
+ * Private mappings of hugetlb pages use this poisoned value for
+ * page->mapping. The core VM should not be doing anything with this mapping
+ * but futex requires the existence of some page->mapping value even though it
+ * is unused if PAGE_MAPPING_ANON is set.
+ */
+#define HUGETLB_POISON	((void *)(0x00300300 + POISON_POINTER_DELTA + PAGE_MAPPING_ANON))
+
 /********** arch/$ARCH/mm/init.c **********/
 #define POISON_FREE_INITMEM	0xcc
 
-- 
cgit v1.2.2


From 5129a469a91a91427334c40e29e64c6d0ab68caf Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?J=C3=B6rn=20Engel?= <joern@logfs.org>
Date: Sun, 25 Apr 2010 08:54:42 +0200
Subject: Catch filesystems lacking s_bdi

noop_backing_dev_info is used only as a flag to mark filesystems that
don't have any backing store, like tmpfs, procfs, spufs, etc.

Signed-off-by: Joern Engel <joern@logfs.org>

Changed the BUG_ON() to a WARN_ON(). Note that adding dirty inodes
to the noop_backing_dev_info is not legal and will not result in
them being flushed, but we already catch this condition in
__mark_inode_dirty() when checking for a registered bdi.

Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 include/linux/backing-dev.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include')

diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h
index e19c677f219..bd0e3c6f323 100644
--- a/include/linux/backing-dev.h
+++ b/include/linux/backing-dev.h
@@ -247,6 +247,7 @@ int bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned int max_ratio);
 #endif
 
 extern struct backing_dev_info default_backing_dev_info;
+extern struct backing_dev_info noop_backing_dev_info;
 void default_unplug_io_fn(struct backing_dev_info *bdi, struct page *page);
 
 int writeback_in_progress(struct backing_dev_info *bdi);
-- 
cgit v1.2.2


From 33f60e9640b2f60dde6735293d4aa5ecc5b1d5d5 Mon Sep 17 00:00:00 2001
From: Jens Axboe <jens.axboe@oracle.com>
Date: Wed, 28 Apr 2010 09:20:33 +0200
Subject: coda: move backing-dev.h kernel include inside __KERNEL__

Otherwise we must export backing-dev.h as well, which doesn't make
any sense.

Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 include/linux/coda_psdev.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/coda_psdev.h b/include/linux/coda_psdev.h
index 644062e8d85..8859e2ede9f 100644
--- a/include/linux/coda_psdev.h
+++ b/include/linux/coda_psdev.h
@@ -1,13 +1,14 @@
 #ifndef __CODA_PSDEV_H
 #define __CODA_PSDEV_H
 
-#include <linux/backing-dev.h>
 #include <linux/magic.h>
 
 #define CODA_PSDEV_MAJOR 67
 #define MAX_CODADEVS  5	   /* how many do we allow */
 
 #ifdef __KERNEL__
+#include <linux/backing-dev.h>
+
 struct kstatfs;
 
 /* communication pending/processing queues */
-- 
cgit v1.2.2


From 561b1733a465cf9677356b40c27653dd45f1ac56 Mon Sep 17 00:00:00 2001
From: Wei Yongjun <yjwei@cn.fujitsu.com>
Date: Wed, 28 Apr 2010 08:47:18 +0000
Subject: sctp: avoid irq lock inversion while call sk->sk_data_ready()

sk->sk_data_ready() of sctp socket can be called from both BH and non-BH
contexts, but the default sk->sk_data_ready(), sock_def_readable(), can
not be used in this case. Therefore, we have to make a new function
sctp_data_ready() to grab sk->sk_data_ready() with BH disabling.

=========================================================
[ INFO: possible irq lock inversion dependency detected ]
2.6.33-rc6 #129
---------------------------------------------------------
sctp_darn/1517 just changed the state of lock:
 (clock-AF_INET){++.?..}, at: [<c06aab60>] sock_def_readable+0x20/0x80
but this lock took another, SOFTIRQ-unsafe lock in the past:
 (slock-AF_INET){+.-...}

and interrupts could create inverse lock ordering between them.

other info that might help us debug this:
1 lock held by sctp_darn/1517:
 #0:  (sk_lock-AF_INET){+.+.+.}, at: [<cdfe363d>] sctp_sendmsg+0x23d/0xc00 [sctp]

Signed-off-by: Wei Yongjun <yjwei@cn.fujitsu.com>
Signed-off-by: Vlad Yasevich <vladislav.yasevich@hp.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/sctp/sctp.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include')

diff --git a/include/net/sctp/sctp.h b/include/net/sctp/sctp.h
index 78740ec57d5..fa6cde578a1 100644
--- a/include/net/sctp/sctp.h
+++ b/include/net/sctp/sctp.h
@@ -128,6 +128,7 @@ extern int sctp_register_pf(struct sctp_pf *, sa_family_t);
 int sctp_backlog_rcv(struct sock *sk, struct sk_buff *skb);
 int sctp_inet_listen(struct socket *sock, int backlog);
 void sctp_write_space(struct sock *sk);
+void sctp_data_ready(struct sock *sk, int len);
 unsigned int sctp_poll(struct file *file, struct socket *sock,
 		poll_table *wait);
 void sctp_sock_rfree(struct sk_buff *skb);
-- 
cgit v1.2.2


From c0786693404cffd80ca3cb6e75ee7b35186b2825 Mon Sep 17 00:00:00 2001
From: Vlad Yasevich <vladislav.yasevich@hp.com>
Date: Wed, 28 Apr 2010 08:47:22 +0000
Subject: sctp: Fix oops when sending queued ASCONF chunks

When we finish processing ASCONF_ACK chunk, we try to send
the next queued ASCONF.  This action runs the sctp state
machine recursively and it's not prepared to do so.

kernel BUG at kernel/timer.c:790!
invalid opcode: 0000 [#1] SMP
last sysfs file: /sys/module/ipv6/initstate
Modules linked in: sha256_generic sctp libcrc32c ipv6 dm_multipath
uinput 8139too i2c_piix4 8139cp mii i2c_core pcspkr virtio_net joydev
floppy virtio_blk virtio_pci [last unloaded: scsi_wait_scan]

Pid: 0, comm: swapper Not tainted 2.6.34-rc4 #15 /Bochs
EIP: 0060:[<c044a2ef>] EFLAGS: 00010286 CPU: 0
EIP is at add_timer+0xd/0x1b
EAX: cecbab14 EBX: 000000f0 ECX: c0957b1c EDX: 03595cf4
ESI: cecba800 EDI: cf276f00 EBP: c0957aa0 ESP: c0957aa0
 DS: 007b ES: 007b FS: 00d8 GS: 00e0 SS: 0068
Process swapper (pid: 0, ti=c0956000 task=c0988ba0 task.ti=c0956000)
Stack:
 c0957ae0 d1851214 c0ab62e4 c0ab5f26 0500ffff 00000004 00000005 00000004
<0> 00000000 d18694fd 00000004 1666b892 cecba800 cecba800 c0957b14
00000004
<0> c0957b94 d1851b11 ceda8b00 cecba800 cf276f00 00000001 c0957b14
000000d0
Call Trace:
 [<d1851214>] ? sctp_side_effects+0x607/0xdfc [sctp]
 [<d1851b11>] ? sctp_do_sm+0x108/0x159 [sctp]
 [<d1863386>] ? sctp_pname+0x0/0x1d [sctp]
 [<d1861a56>] ? sctp_primitive_ASCONF+0x36/0x3b [sctp]
 [<d185657c>] ? sctp_process_asconf_ack+0x2a4/0x2d3 [sctp]
 [<d184e35c>] ? sctp_sf_do_asconf_ack+0x1dd/0x2b4 [sctp]
 [<d1851ac1>] ? sctp_do_sm+0xb8/0x159 [sctp]
 [<d1863334>] ? sctp_cname+0x0/0x52 [sctp]
 [<d1854377>] ? sctp_assoc_bh_rcv+0xac/0xe1 [sctp]
 [<d1858f0f>] ? sctp_inq_push+0x2d/0x30 [sctp]
 [<d186329d>] ? sctp_rcv+0x797/0x82e [sctp]

Tested-by: Wei Yongjun <yjwei@cn.fujitsu.com>
Signed-off-by: Yuansong Qiao <ysqiao@research.ait.ie>
Signed-off-by: Shuaijun Zhang <szhang@research.ait.ie>
Signed-off-by: Vlad Yasevich <vladislav.yasevich@hp.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/sctp/command.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include')

diff --git a/include/net/sctp/command.h b/include/net/sctp/command.h
index 8be5135ff7a..2c55a7ea20a 100644
--- a/include/net/sctp/command.h
+++ b/include/net/sctp/command.h
@@ -107,6 +107,7 @@ typedef enum {
 	SCTP_CMD_T1_RETRAN,	 /* Mark for retransmission after T1 timeout  */
 	SCTP_CMD_UPDATE_INITTAG, /* Update peer inittag */
 	SCTP_CMD_SEND_MSG,	 /* Send the whole use message */
+	SCTP_CMD_SEND_NEXT_ASCONF, /* Send the next ASCONF after ACK */
 	SCTP_CMD_LAST
 } sctp_verb_t;
 
-- 
cgit v1.2.2


From 1142d810298e694754498dbb4983fcb6cb7fd884 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Thu, 6 May 2010 18:49:20 +0200
Subject: cpu_stop: implement stop_cpu[s]()

Implement a simplistic per-cpu maximum priority cpu monopolization
mechanism.  A non-sleeping callback can be scheduled to run on one or
multiple cpus with maximum priority monopolozing those cpus.  This is
primarily to replace and unify RT workqueue usage in stop_machine and
scheduler migration_thread which currently is serving multiple
purposes.

Four functions are provided - stop_one_cpu(), stop_one_cpu_nowait(),
stop_cpus() and try_stop_cpus().

This is to allow clean sharing of resources among stop_cpu and all the
migration thread users.  One stopper thread per cpu is created which
is currently named "stopper/CPU".  This will eventually replace the
migration thread and take on its name.

* This facility was originally named cpuhog and lived in separate
  files but Peter Zijlstra nacked the name and thus got renamed to
  cpu_stop and moved into stop_machine.c.

* Better reporting of preemption leak as per Peter's suggestion.

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Peter Zijlstra <peterz@infradead.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Dimitri Sivanich <sivanich@sgi.com>
---
 include/linux/stop_machine.h | 39 +++++++++++++++++++++++++++++++++++----
 1 file changed, 35 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/include/linux/stop_machine.h b/include/linux/stop_machine.h
index baba3a23a81..efcbd6c3794 100644
--- a/include/linux/stop_machine.h
+++ b/include/linux/stop_machine.h
@@ -1,15 +1,46 @@
 #ifndef _LINUX_STOP_MACHINE
 #define _LINUX_STOP_MACHINE
-/* "Bogolock": stop the entire machine, disable interrupts.  This is a
-   very heavy lock, which is equivalent to grabbing every spinlock
-   (and more).  So the "read" side to such a lock is anything which
-   disables preeempt. */
+
 #include <linux/cpu.h>
 #include <linux/cpumask.h>
+#include <linux/list.h>
 #include <asm/system.h>
 
 #if defined(CONFIG_STOP_MACHINE) && defined(CONFIG_SMP)
 
+/*
+ * stop_cpu[s]() is simplistic per-cpu maximum priority cpu
+ * monopolization mechanism.  The caller can specify a non-sleeping
+ * function to be executed on a single or multiple cpus preempting all
+ * other processes and monopolizing those cpus until it finishes.
+ *
+ * Resources for this mechanism are preallocated when a cpu is brought
+ * up and requests are guaranteed to be served as long as the target
+ * cpus are online.
+ */
+
+typedef int (*cpu_stop_fn_t)(void *arg);
+
+struct cpu_stop_work {
+	struct list_head	list;		/* cpu_stopper->works */
+	cpu_stop_fn_t		fn;
+	void			*arg;
+	struct cpu_stop_done	*done;
+};
+
+int stop_one_cpu(unsigned int cpu, cpu_stop_fn_t fn, void *arg);
+void stop_one_cpu_nowait(unsigned int cpu, cpu_stop_fn_t fn, void *arg,
+			 struct cpu_stop_work *work_buf);
+int stop_cpus(const struct cpumask *cpumask, cpu_stop_fn_t fn, void *arg);
+int try_stop_cpus(const struct cpumask *cpumask, cpu_stop_fn_t fn, void *arg);
+
+/*
+ * stop_machine "Bogolock": stop the entire machine, disable
+ * interrupts.  This is a very heavy lock, which is equivalent to
+ * grabbing every spinlock (and more).  So the "read" side to such a
+ * lock is anything which disables preeempt.
+ */
+
 /**
  * stop_machine: freeze the machine on all CPUs and run this function
  * @fn: the function to run
-- 
cgit v1.2.2


From 3fc1f1e27a5b807791d72e5d992aa33b668a6626 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Thu, 6 May 2010 18:49:20 +0200
Subject: stop_machine: reimplement using cpu_stop

Reimplement stop_machine using cpu_stop.  As cpu stoppers are
guaranteed to be available for all online cpus,
stop_machine_create/destroy() are no longer necessary and removed.

With resource management and synchronization handled by cpu_stop, the
new implementation is much simpler.  Asking the cpu_stop to execute
the stop_cpu() state machine on all online cpus with cpu hotplug
disabled is enough.

stop_machine itself doesn't need to manage any global resources
anymore, so all per-instance information is rolled into struct
stop_machine_data and the mutex and all static data variables are
removed.

The previous implementation created and destroyed RT workqueues as
necessary which made stop_machine() calls highly expensive on very
large machines.  According to Dimitri Sivanich, preventing the dynamic
creation/destruction makes booting faster more than twice on very
large machines.  cpu_stop resources are preallocated for all online
cpus and should have the same effect.

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Rusty Russell <rusty@rustcorp.com.au>
Acked-by: Peter Zijlstra <peterz@infradead.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Dimitri Sivanich <sivanich@sgi.com>
---
 include/linux/stop_machine.h | 20 --------------------
 1 file changed, 20 deletions(-)

(limited to 'include')

diff --git a/include/linux/stop_machine.h b/include/linux/stop_machine.h
index efcbd6c3794..0e552e72a4c 100644
--- a/include/linux/stop_machine.h
+++ b/include/linux/stop_machine.h
@@ -67,23 +67,6 @@ int stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus);
  */
 int __stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus);
 
-/**
- * stop_machine_create: create all stop_machine threads
- *
- * Description: This causes all stop_machine threads to be created before
- * stop_machine actually gets called. This can be used by subsystems that
- * need a non failing stop_machine infrastructure.
- */
-int stop_machine_create(void);
-
-/**
- * stop_machine_destroy: destroy all stop_machine threads
- *
- * Description: This causes all stop_machine threads which were created with
- * stop_machine_create to be destroyed again.
- */
-void stop_machine_destroy(void);
-
 #else
 
 static inline int stop_machine(int (*fn)(void *), void *data,
@@ -96,8 +79,5 @@ static inline int stop_machine(int (*fn)(void *), void *data,
 	return ret;
 }
 
-static inline int stop_machine_create(void) { return 0; }
-static inline void stop_machine_destroy(void) { }
-
 #endif /* CONFIG_SMP */
 #endif /* _LINUX_STOP_MACHINE */
-- 
cgit v1.2.2


From 969c79215a35b06e5e3efe69b9412f858df7856c Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Thu, 6 May 2010 18:49:21 +0200
Subject: sched: replace migration_thread with cpu_stop

Currently migration_thread is serving three purposes - migration
pusher, context to execute active_load_balance() and forced context
switcher for expedited RCU synchronize_sched.  All three roles are
hardcoded into migration_thread() and determining which job is
scheduled is slightly messy.

This patch kills migration_thread and replaces all three uses with
cpu_stop.  The three different roles of migration_thread() are
splitted into three separate cpu_stop callbacks -
migration_cpu_stop(), active_load_balance_cpu_stop() and
synchronize_sched_expedited_cpu_stop() - and each use case now simply
asks cpu_stop to execute the callback as necessary.

synchronize_sched_expedited() was implemented with private
preallocated resources and custom multi-cpu queueing and waiting
logic, both of which are provided by cpu_stop.
synchronize_sched_expedited_count is made atomic and all other shared
resources along with the mutex are dropped.

synchronize_sched_expedited() also implemented a check to detect cases
where not all the callback got executed on their assigned cpus and
fall back to synchronize_sched().  If called with cpu hotplug blocked,
cpu_stop already guarantees that and the condition cannot happen;
otherwise, stop_machine() would break.  However, this patch preserves
the paranoid check using a cpumask to record on which cpus the stopper
ran so that it can serve as a bisection point if something actually
goes wrong theree.

Because the internal execution state is no longer visible,
rcu_expedited_torture_stats() is removed.

This patch also renames cpu_stop threads to from "stopper/%d" to
"migration/%d".  The names of these threads ultimately don't matter
and there's no reason to make unnecessary userland visible changes.

With this patch applied, stop_machine() and sched now share the same
resources.  stop_machine() is faster without wasting any resources and
sched migration users are much cleaner.

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Peter Zijlstra <peterz@infradead.org>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Dipankar Sarma <dipankar@in.ibm.com>
Cc: Josh Triplett <josh@freedesktop.org>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Dimitri Sivanich <sivanich@sgi.com>
---
 include/linux/rcutiny.h | 2 --
 include/linux/rcutree.h | 1 -
 2 files changed, 3 deletions(-)

(limited to 'include')

diff --git a/include/linux/rcutiny.h b/include/linux/rcutiny.h
index a5195875480..0006b2df00e 100644
--- a/include/linux/rcutiny.h
+++ b/include/linux/rcutiny.h
@@ -60,8 +60,6 @@ static inline long rcu_batches_completed_bh(void)
 	return 0;
 }
 
-extern int rcu_expedited_torture_stats(char *page);
-
 static inline void rcu_force_quiescent_state(void)
 {
 }
diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h
index 42cc3a04779..24e467e526b 100644
--- a/include/linux/rcutree.h
+++ b/include/linux/rcutree.h
@@ -35,7 +35,6 @@ struct notifier_block;
 extern void rcu_sched_qs(int cpu);
 extern void rcu_bh_qs(int cpu);
 extern int rcu_needs_cpu(int cpu);
-extern int rcu_expedited_torture_stats(char *page);
 
 #ifdef CONFIG_TREE_PREEMPT_RCU
 
-- 
cgit v1.2.2


From 27a9da6538ee18046d7bff8e36a9f783542c54c3 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Tue, 4 May 2010 20:36:56 +0200
Subject: sched: Remove rq argument to the tracepoints

struct rq isn't visible outside of sched.o so its near useless to
expose the pointer, also there are no users of it, so remove it.

Acked-by: Steven Rostedt <rostedt@goodmis.org>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <1272997616.1642.207.camel@laptop>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/trace/events/sched.h | 32 ++++++++++----------------------
 1 file changed, 10 insertions(+), 22 deletions(-)

(limited to 'include')

diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h
index cfceb0b73e2..4f733ecea46 100644
--- a/include/trace/events/sched.h
+++ b/include/trace/events/sched.h
@@ -51,15 +51,12 @@ TRACE_EVENT(sched_kthread_stop_ret,
 
 /*
  * Tracepoint for waiting on task to unschedule:
- *
- * (NOTE: the 'rq' argument is not used by generic trace events,
- *        but used by the latency tracer plugin. )
  */
 TRACE_EVENT(sched_wait_task,
 
-	TP_PROTO(struct rq *rq, struct task_struct *p),
+	TP_PROTO(struct task_struct *p),
 
-	TP_ARGS(rq, p),
+	TP_ARGS(p),
 
 	TP_STRUCT__entry(
 		__array(	char,	comm,	TASK_COMM_LEN	)
@@ -79,15 +76,12 @@ TRACE_EVENT(sched_wait_task,
 
 /*
  * Tracepoint for waking up a task:
- *
- * (NOTE: the 'rq' argument is not used by generic trace events,
- *        but used by the latency tracer plugin. )
  */
 DECLARE_EVENT_CLASS(sched_wakeup_template,
 
-	TP_PROTO(struct rq *rq, struct task_struct *p, int success),
+	TP_PROTO(struct task_struct *p, int success),
 
-	TP_ARGS(rq, p, success),
+	TP_ARGS(p, success),
 
 	TP_STRUCT__entry(
 		__array(	char,	comm,	TASK_COMM_LEN	)
@@ -111,31 +105,25 @@ DECLARE_EVENT_CLASS(sched_wakeup_template,
 );
 
 DEFINE_EVENT(sched_wakeup_template, sched_wakeup,
-	     TP_PROTO(struct rq *rq, struct task_struct *p, int success),
-	     TP_ARGS(rq, p, success));
+	     TP_PROTO(struct task_struct *p, int success),
+	     TP_ARGS(p, success));
 
 /*
  * Tracepoint for waking up a new task:
- *
- * (NOTE: the 'rq' argument is not used by generic trace events,
- *        but used by the latency tracer plugin. )
  */
 DEFINE_EVENT(sched_wakeup_template, sched_wakeup_new,
-	     TP_PROTO(struct rq *rq, struct task_struct *p, int success),
-	     TP_ARGS(rq, p, success));
+	     TP_PROTO(struct task_struct *p, int success),
+	     TP_ARGS(p, success));
 
 /*
  * Tracepoint for task switches, performed by the scheduler:
- *
- * (NOTE: the 'rq' argument is not used by generic trace events,
- *        but used by the latency tracer plugin. )
  */
 TRACE_EVENT(sched_switch,
 
-	TP_PROTO(struct rq *rq, struct task_struct *prev,
+	TP_PROTO(struct task_struct *prev,
 		 struct task_struct *next),
 
-	TP_ARGS(rq, prev, next),
+	TP_ARGS(prev, next),
 
 	TP_STRUCT__entry(
 		__array(	char,	prev_comm,	TASK_COMM_LEN	)
-- 
cgit v1.2.2


From bbf1bb3eee86f2eef2baa14e600be454d09109ee Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Sat, 8 May 2010 16:20:53 +0200
Subject: cpu_stop: add dummy implementation for UP

When !CONFIG_SMP, cpu_stop functions weren't defined at all which
could lead to build failures if UP code uses cpu_stop facility.  Add
dummy cpu_stop implementation for UP.  The waiting variants execute
the work function directly with preempt disabled and
stop_one_cpu_nowait() schedules a workqueue work.

Makefile and ifdefs around stop_machine implementation are updated to
accomodate CONFIG_SMP && !CONFIG_STOP_MACHINE case.

Signed-off-by: Tejun Heo <tj@kernel.org>
Reported-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/stop_machine.h | 69 ++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 63 insertions(+), 6 deletions(-)

(limited to 'include')

diff --git a/include/linux/stop_machine.h b/include/linux/stop_machine.h
index 0e552e72a4c..6b524a0d02e 100644
--- a/include/linux/stop_machine.h
+++ b/include/linux/stop_machine.h
@@ -6,8 +6,6 @@
 #include <linux/list.h>
 #include <asm/system.h>
 
-#if defined(CONFIG_STOP_MACHINE) && defined(CONFIG_SMP)
-
 /*
  * stop_cpu[s]() is simplistic per-cpu maximum priority cpu
  * monopolization mechanism.  The caller can specify a non-sleeping
@@ -18,9 +16,10 @@
  * up and requests are guaranteed to be served as long as the target
  * cpus are online.
  */
-
 typedef int (*cpu_stop_fn_t)(void *arg);
 
+#ifdef CONFIG_SMP
+
 struct cpu_stop_work {
 	struct list_head	list;		/* cpu_stopper->works */
 	cpu_stop_fn_t		fn;
@@ -34,12 +33,70 @@ void stop_one_cpu_nowait(unsigned int cpu, cpu_stop_fn_t fn, void *arg,
 int stop_cpus(const struct cpumask *cpumask, cpu_stop_fn_t fn, void *arg);
 int try_stop_cpus(const struct cpumask *cpumask, cpu_stop_fn_t fn, void *arg);
 
+#else	/* CONFIG_SMP */
+
+#include <linux/workqueue.h>
+
+struct cpu_stop_work {
+	struct work_struct	work;
+	cpu_stop_fn_t		fn;
+	void			*arg;
+};
+
+static inline int stop_one_cpu(unsigned int cpu, cpu_stop_fn_t fn, void *arg)
+{
+	int ret = -ENOENT;
+	preempt_disable();
+	if (cpu == smp_processor_id())
+		ret = fn(arg);
+	preempt_enable();
+	return ret;
+}
+
+static void stop_one_cpu_nowait_workfn(struct work_struct *work)
+{
+	struct cpu_stop_work *stwork =
+		container_of(work, struct cpu_stop_work, work);
+	preempt_disable();
+	stwork->fn(stwork->arg);
+	preempt_enable();
+}
+
+static inline void stop_one_cpu_nowait(unsigned int cpu,
+				       cpu_stop_fn_t fn, void *arg,
+				       struct cpu_stop_work *work_buf)
+{
+	if (cpu == smp_processor_id()) {
+		INIT_WORK(&work_buf->work, stop_one_cpu_nowait_workfn);
+		work_buf->fn = fn;
+		work_buf->arg = arg;
+		schedule_work(&work_buf->work);
+	}
+}
+
+static inline int stop_cpus(const struct cpumask *cpumask,
+			    cpu_stop_fn_t fn, void *arg)
+{
+	if (cpumask_test_cpu(raw_smp_processor_id(), cpumask))
+		return stop_one_cpu(raw_smp_processor_id(), fn, arg);
+	return -ENOENT;
+}
+
+static inline int try_stop_cpus(const struct cpumask *cpumask,
+				cpu_stop_fn_t fn, void *arg)
+{
+	return stop_cpus(cpumask, fn, arg);
+}
+
+#endif	/* CONFIG_SMP */
+
 /*
  * stop_machine "Bogolock": stop the entire machine, disable
  * interrupts.  This is a very heavy lock, which is equivalent to
  * grabbing every spinlock (and more).  So the "read" side to such a
  * lock is anything which disables preeempt.
  */
+#if defined(CONFIG_STOP_MACHINE) && defined(CONFIG_SMP)
 
 /**
  * stop_machine: freeze the machine on all CPUs and run this function
@@ -67,7 +124,7 @@ int stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus);
  */
 int __stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus);
 
-#else
+#else	 /* CONFIG_STOP_MACHINE && CONFIG_SMP */
 
 static inline int stop_machine(int (*fn)(void *), void *data,
 			       const struct cpumask *cpus)
@@ -79,5 +136,5 @@ static inline int stop_machine(int (*fn)(void *), void *data,
 	return ret;
 }
 
-#endif /* CONFIG_SMP */
-#endif /* _LINUX_STOP_MACHINE */
+#endif	/* CONFIG_STOP_MACHINE && CONFIG_SMP */
+#endif	/* _LINUX_STOP_MACHINE */
-- 
cgit v1.2.2


From e0e37c200f1357db0dd986edb359c41c57d24f6e Mon Sep 17 00:00:00 2001
From: Arjan van de Ven <arjan@linux.intel.com>
Date: Sun, 9 May 2010 08:24:39 -0700
Subject: sched: Eliminate the ts->idle_lastupdate field

Now that the only user of ts->idle_lastupdate is
update_ts_time_stats(), the entire field can be eliminated.

In update_ts_time_stats(), idle_lastupdate is first set to
"now", and a few lines later, the only user is an if() statement
that assigns a variable either to "now" or to
ts->idle_lastupdate, which has the value of "now" at that point.

Signed-off-by: Arjan van de Ven <arjan@linux.intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Reviewed-by: Rik van Riel <riel@redhat.com>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: davej@redhat.com
LKML-Reference: <20100509082439.2fab0b4f@infradead.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/tick.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/tick.h b/include/linux/tick.h
index d2ae79e21be..0343eed4061 100644
--- a/include/linux/tick.h
+++ b/include/linux/tick.h
@@ -60,7 +60,6 @@ struct tick_sched {
 	ktime_t				idle_waketime;
 	ktime_t				idle_exittime;
 	ktime_t				idle_sleeptime;
-	ktime_t				idle_lastupdate;
 	ktime_t				sleep_length;
 	unsigned long			last_jiffies;
 	unsigned long			next_jiffies;
-- 
cgit v1.2.2


From 0224cf4c5ee0d7faec83956b8e21f7d89e3df3bd Mon Sep 17 00:00:00 2001
From: Arjan van de Ven <arjan@linux.intel.com>
Date: Sun, 9 May 2010 08:25:23 -0700
Subject: sched: Intoduce get_cpu_iowait_time_us()

For the ondemand cpufreq governor, it is desired that the iowait
time is microaccounted in a similar way as idle time is.

This patch introduces the infrastructure to account and expose
this information via the get_cpu_iowait_time_us() function.

[akpm@linux-foundation.org: fix CONFIG_NO_HZ=n build]
Signed-off-by: Arjan van de Ven <arjan@linux.intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Reviewed-by: Rik van Riel <riel@redhat.com>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: davej@redhat.com
LKML-Reference: <20100509082523.284feab6@infradead.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/tick.h | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'include')

diff --git a/include/linux/tick.h b/include/linux/tick.h
index 0343eed4061..b232ccc0ee2 100644
--- a/include/linux/tick.h
+++ b/include/linux/tick.h
@@ -42,6 +42,7 @@ enum tick_nohz_mode {
  * @idle_waketime:	Time when the idle was interrupted
  * @idle_exittime:	Time when the idle state was left
  * @idle_sleeptime:	Sum of the time slept in idle with sched tick stopped
+ * @iowait_sleeptime:	Sum of the time slept in idle with sched tick stopped, with IO outstanding
  * @sleep_length:	Duration of the current idle sleep
  * @do_timer_lst:	CPU was the last one doing do_timer before going idle
  */
@@ -60,6 +61,7 @@ struct tick_sched {
 	ktime_t				idle_waketime;
 	ktime_t				idle_exittime;
 	ktime_t				idle_sleeptime;
+	ktime_t				iowait_sleeptime;
 	ktime_t				sleep_length;
 	unsigned long			last_jiffies;
 	unsigned long			next_jiffies;
@@ -123,6 +125,7 @@ extern void tick_nohz_stop_sched_tick(int inidle);
 extern void tick_nohz_restart_sched_tick(void);
 extern ktime_t tick_nohz_get_sleep_length(void);
 extern u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time);
+extern u64 get_cpu_iowait_time_us(int cpu, u64 *last_update_time);
 # else
 static inline void tick_nohz_stop_sched_tick(int inidle) { }
 static inline void tick_nohz_restart_sched_tick(void) { }
@@ -133,6 +136,7 @@ static inline ktime_t tick_nohz_get_sleep_length(void)
 	return len;
 }
 static inline u64 get_cpu_idle_time_us(int cpu, u64 *unused) { return -1; }
+static inline u64 get_cpu_iowait_time_us(int cpu, u64 *unused) { return -1; }
 # endif /* !NO_HZ */
 
 #endif
-- 
cgit v1.2.2


From a93d2f1744206827ccf416e2cdc5018aa503314e Mon Sep 17 00:00:00 2001
From: Changli Gao <xiaosuo@gmail.com>
Date: Fri, 7 May 2010 14:33:26 +0800
Subject: sched, wait: Use wrapper functions

epoll should not touch flags in wait_queue_t. This patch introduces a new
function __add_wait_queue_exclusive(), for the users, who use wait queue as a
LIFO queue.

__add_wait_queue_tail_exclusive() is introduced too instead of
add_wait_queue_exclusive_locked(). remove_wait_queue_locked() is removed, as
it is a duplicate of __remove_wait_queue(), disliked by users, and with less
users.

Signed-off-by: Changli Gao <xiaosuo@gmail.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: Paul Menage <menage@google.com>
Cc: Li Zefan <lizf@cn.fujitsu.com>
Cc: Davide Libenzi <davidel@xmailserver.org>
Cc: <containers@lists.linux-foundation.org>
LKML-Reference: <1273214006-2979-1-git-send-email-xiaosuo@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/wait.h | 35 +++++++++++++++--------------------
 1 file changed, 15 insertions(+), 20 deletions(-)

(limited to 'include')

diff --git a/include/linux/wait.h b/include/linux/wait.h
index a48e16b77d5..76d96d035ea 100644
--- a/include/linux/wait.h
+++ b/include/linux/wait.h
@@ -127,12 +127,26 @@ static inline void __add_wait_queue(wait_queue_head_t *head, wait_queue_t *new)
 /*
  * Used for wake-one threads:
  */
+static inline void __add_wait_queue_exclusive(wait_queue_head_t *q,
+					      wait_queue_t *wait)
+{
+	wait->flags |= WQ_FLAG_EXCLUSIVE;
+	__add_wait_queue(q, wait);
+}
+
 static inline void __add_wait_queue_tail(wait_queue_head_t *head,
-						wait_queue_t *new)
+					 wait_queue_t *new)
 {
 	list_add_tail(&new->task_list, &head->task_list);
 }
 
+static inline void __add_wait_queue_tail_exclusive(wait_queue_head_t *q,
+					      wait_queue_t *wait)
+{
+	wait->flags |= WQ_FLAG_EXCLUSIVE;
+	__add_wait_queue_tail(q, wait);
+}
+
 static inline void __remove_wait_queue(wait_queue_head_t *head,
 							wait_queue_t *old)
 {
@@ -403,25 +417,6 @@ do {									\
 	__ret;								\
 })
 
-/*
- * Must be called with the spinlock in the wait_queue_head_t held.
- */
-static inline void add_wait_queue_exclusive_locked(wait_queue_head_t *q,
-						   wait_queue_t * wait)
-{
-	wait->flags |= WQ_FLAG_EXCLUSIVE;
-	__add_wait_queue_tail(q,  wait);
-}
-
-/*
- * Must be called with the spinlock in the wait_queue_head_t held.
- */
-static inline void remove_wait_queue_locked(wait_queue_head_t *q,
-					    wait_queue_t * wait)
-{
-	__remove_wait_queue(q,  wait);
-}
-
 /*
  * These are the old interfaces to sleep waiting for an event.
  * They are racy.  DO NOT use them, use the wait_event* interfaces above.
-- 
cgit v1.2.2