From 4b41308d2d0398409620613c7eaaaf52c738b042 Mon Sep 17 00:00:00 2001
From: John Stultz <john.stultz@linaro.org>
Date: Wed, 10 Aug 2011 10:37:59 -0700
Subject: alarmtimers: Change alarmtimer functions to return alarmtimer_restart
 values

In order to properly fix the denial of service issue with high freq
periodic alarm timers, we need to push the re-arming logic into the
alarm timer handler, much as the hrtimer code does.

This patch introduces alarmtimer_restart enum and changes the
alarmtimer handler declarations to use it as a return value. Further,
to ease following changes, it extends the alarmtimer handler functions
to also take the time at expiration. No logic is yet modified.

CC: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: John Stultz <john.stultz@linaro.org>
---
 kernel/time/alarmtimer.c | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c
index ea5e1a928d5b..9e786053ffa2 100644
--- a/kernel/time/alarmtimer.c
+++ b/kernel/time/alarmtimer.c
@@ -196,7 +196,7 @@ static enum hrtimer_restart alarmtimer_fired(struct hrtimer *timer)
 		}
 		spin_unlock_irqrestore(&base->lock, flags);
 		if (alarm->function)
-			alarm->function(alarm);
+			alarm->function(alarm, now);
 		spin_lock_irqsave(&base->lock, flags);
 	}
 
@@ -299,7 +299,7 @@ static void alarmtimer_freezerset(ktime_t absexp, enum alarmtimer_type type)
  * @function: callback that is run when the alarm fires
  */
 void alarm_init(struct alarm *alarm, enum alarmtimer_type type,
-		void (*function)(struct alarm *))
+		enum alarmtimer_restart (*function)(struct alarm *, ktime_t))
 {
 	timerqueue_init(&alarm->node);
 	alarm->period = ktime_set(0, 0);
@@ -365,12 +365,15 @@ static enum alarmtimer_type clock2alarm(clockid_t clockid)
  *
  * Posix timer callback for expired alarm timers.
  */
-static void alarm_handle_timer(struct alarm *alarm)
+static enum alarmtimer_restart alarm_handle_timer(struct alarm *alarm,
+							ktime_t now)
 {
 	struct k_itimer *ptr = container_of(alarm, struct k_itimer,
 						it.alarmtimer);
 	if (posix_timer_event(ptr, 0) != 0)
 		ptr->it_overrun++;
+
+	return ALARMTIMER_NORESTART;
 }
 
 /**
@@ -509,13 +512,15 @@ static int alarm_timer_set(struct k_itimer *timr, int flags,
  *
  * Wakes up the task that set the alarmtimer
  */
-static void alarmtimer_nsleep_wakeup(struct alarm *alarm)
+static enum alarmtimer_restart alarmtimer_nsleep_wakeup(struct alarm *alarm,
+								ktime_t now)
 {
 	struct task_struct *task = (struct task_struct *)alarm->data;
 
 	alarm->data = NULL;
 	if (task)
 		wake_up_process(task);
+	return ALARMTIMER_NORESTART;
 }
 
 /**
-- 
cgit v1.2.2


From 54da23b720d5d612f8f1669f9ed3744008fb7382 Mon Sep 17 00:00:00 2001
From: John Stultz <john.stultz@linaro.org>
Date: Wed, 10 Aug 2011 11:08:07 -0700
Subject: alarmtimers: Push rearming peroidic timers down into alamrtimer
 handler

This patch pushes the periodic alarmtimer re-arming down into the alarmtimer
handler, mimicking how hrtimers handle this.

CC: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: John Stultz <john.stultz@linaro.org>
---
 kernel/time/alarmtimer.c | 20 +++++++++++++-------
 1 file changed, 13 insertions(+), 7 deletions(-)

(limited to 'kernel')

diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c
index 9e786053ffa2..55e634ea6054 100644
--- a/kernel/time/alarmtimer.c
+++ b/kernel/time/alarmtimer.c
@@ -174,6 +174,7 @@ static enum hrtimer_restart alarmtimer_fired(struct hrtimer *timer)
 	unsigned long flags;
 	ktime_t now;
 	int ret = HRTIMER_NORESTART;
+	int restart = ALARMTIMER_NORESTART;
 
 	spin_lock_irqsave(&base->lock, flags);
 	now = base->gettime();
@@ -188,16 +189,16 @@ static enum hrtimer_restart alarmtimer_fired(struct hrtimer *timer)
 
 		timerqueue_del(&base->timerqueue, &alarm->node);
 		alarm->enabled = 0;
-		/* Re-add periodic timers */
-		if (alarm->period.tv64) {
-			alarm->node.expires = ktime_add(expired, alarm->period);
-			timerqueue_add(&base->timerqueue, &alarm->node);
-			alarm->enabled = 1;
-		}
+
 		spin_unlock_irqrestore(&base->lock, flags);
 		if (alarm->function)
-			alarm->function(alarm, now);
+			restart = alarm->function(alarm, now);
 		spin_lock_irqsave(&base->lock, flags);
+
+		if (restart != ALARMTIMER_NORESTART) {
+			timerqueue_add(&base->timerqueue, &alarm->node);
+			alarm->enabled = 1;
+		}
 	}
 
 	if (next) {
@@ -373,6 +374,11 @@ static enum alarmtimer_restart alarm_handle_timer(struct alarm *alarm,
 	if (posix_timer_event(ptr, 0) != 0)
 		ptr->it_overrun++;
 
+	/* Re-add periodic timers */
+	if (alarm->period.tv64) {
+		alarm->node.expires = ktime_add(now, alarm->period);
+		return ALARMTIMER_RESTART;
+	}
 	return ALARMTIMER_NORESTART;
 }
 
-- 
cgit v1.2.2


From dce75a8c71819ed4c7efdcd53c9b6f6356dc8cb5 Mon Sep 17 00:00:00 2001
From: John Stultz <john.stultz@linaro.org>
Date: Wed, 10 Aug 2011 11:31:03 -0700
Subject: alarmtimers: Add alarm_forward functionality

In order to avoid wasting time expiring and re-adding very high freq
periodic alarmtimers, introduce alarm_forward() which is similar to
hrtimer_forward and moves the timer to the next future expiration time
and returns the number of overruns.

CC: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: John Stultz <john.stultz@linaro.org>
---
 kernel/time/alarmtimer.c | 37 ++++++++++++++++++++++++++++++++++++-
 1 file changed, 36 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c
index 55e634ea6054..f03b04291b6f 100644
--- a/kernel/time/alarmtimer.c
+++ b/kernel/time/alarmtimer.c
@@ -347,6 +347,41 @@ void alarm_cancel(struct alarm *alarm)
 }
 
 
+
+u64 alarm_forward(struct alarm *alarm, ktime_t now, ktime_t interval)
+{
+	u64 overrun = 1;
+	ktime_t delta;
+
+	delta = ktime_sub(now, alarm->node.expires);
+
+	if (delta.tv64 < 0)
+		return 0;
+
+	if (unlikely(delta.tv64 >= interval.tv64)) {
+		s64 incr = ktime_to_ns(interval);
+
+		overrun = ktime_divns(delta, incr);
+
+		alarm->node.expires = ktime_add_ns(alarm->node.expires,
+							incr*overrun);
+
+		if (alarm->node.expires.tv64 > now.tv64)
+			return overrun;
+		/*
+		 * This (and the ktime_add() below) is the
+		 * correction for exact:
+		 */
+		overrun++;
+	}
+
+	alarm->node.expires = ktime_add(alarm->node.expires, interval);
+	return overrun;
+}
+
+
+
+
 /**
  * clock2alarm - helper that converts from clockid to alarmtypes
  * @clockid: clockid.
@@ -376,7 +411,7 @@ static enum alarmtimer_restart alarm_handle_timer(struct alarm *alarm,
 
 	/* Re-add periodic timers */
 	if (alarm->period.tv64) {
-		alarm->node.expires = ktime_add(now, alarm->period);
+		ptr->it_overrun += alarm_forward(alarm, now, alarm->period);
 		return ALARMTIMER_RESTART;
 	}
 	return ALARMTIMER_NORESTART;
-- 
cgit v1.2.2


From d77e23accec56bf2ba12187fe77a2f500a511282 Mon Sep 17 00:00:00 2001
From: John Stultz <john.stultz@linaro.org>
Date: Wed, 10 Aug 2011 11:40:23 -0700
Subject: alarmtimers: Remove interval cap limit hack

Now that the alarmtimers code has been refactored, the interval
cap limit can be removed.

CC: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: John Stultz <john.stultz@linaro.org>
---
 kernel/time/alarmtimer.c | 9 ---------
 1 file changed, 9 deletions(-)

(limited to 'kernel')

diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c
index f03b04291b6f..a522c007e6fd 100644
--- a/kernel/time/alarmtimer.c
+++ b/kernel/time/alarmtimer.c
@@ -525,15 +525,6 @@ static int alarm_timer_set(struct k_itimer *timr, int flags,
 	if (!rtcdev)
 		return -ENOTSUPP;
 
-	/*
-	 * XXX HACK! Currently we can DOS a system if the interval
-	 * period on alarmtimers is too small. Cap the interval here
-	 * to 100us and solve this properly in a future patch! -jstultz
-	 */
-	if ((new_setting->it_interval.tv_sec == 0) &&
-			(new_setting->it_interval.tv_nsec < 100000))
-		new_setting->it_interval.tv_nsec = 100000;
-
 	if (old_setting)
 		alarm_timer_get(timr, old_setting);
 
-- 
cgit v1.2.2


From 9e26476243e438f4534a562660c1296a15a9e202 Mon Sep 17 00:00:00 2001
From: John Stultz <john.stultz@linaro.org>
Date: Wed, 10 Aug 2011 12:09:24 -0700
Subject: alarmtimers: Remove period from alarm structure

Now that periodic alarmtimers are managed by the handler function,
remove the period value from the alarm structure and let the handlers
manage the interval on their own.

CC: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: John Stultz <john.stultz@linaro.org>
---
 kernel/time/alarmtimer.c | 30 ++++++++++++++----------------
 1 file changed, 14 insertions(+), 16 deletions(-)

(limited to 'kernel')

diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c
index a522c007e6fd..90935591dd44 100644
--- a/kernel/time/alarmtimer.c
+++ b/kernel/time/alarmtimer.c
@@ -303,7 +303,6 @@ void alarm_init(struct alarm *alarm, enum alarmtimer_type type,
 		enum alarmtimer_restart (*function)(struct alarm *, ktime_t))
 {
 	timerqueue_init(&alarm->node);
-	alarm->period = ktime_set(0, 0);
 	alarm->function = function;
 	alarm->type = type;
 	alarm->enabled = 0;
@@ -313,9 +312,8 @@ void alarm_init(struct alarm *alarm, enum alarmtimer_type type,
  * alarm_start - Sets an alarm to fire
  * @alarm: ptr to alarm to set
  * @start: time to run the alarm
- * @period: period at which the alarm will recur
  */
-void alarm_start(struct alarm *alarm, ktime_t start, ktime_t period)
+void alarm_start(struct alarm *alarm, ktime_t start)
 {
 	struct alarm_base *base = &alarm_bases[alarm->type];
 	unsigned long flags;
@@ -324,7 +322,6 @@ void alarm_start(struct alarm *alarm, ktime_t start, ktime_t period)
 	if (alarm->enabled)
 		alarmtimer_remove(base, alarm);
 	alarm->node.expires = start;
-	alarm->period = period;
 	alarmtimer_enqueue(base, alarm);
 	alarm->enabled = 1;
 	spin_unlock_irqrestore(&base->lock, flags);
@@ -405,13 +402,14 @@ static enum alarmtimer_restart alarm_handle_timer(struct alarm *alarm,
 							ktime_t now)
 {
 	struct k_itimer *ptr = container_of(alarm, struct k_itimer,
-						it.alarmtimer);
+						it.alarm.alarmtimer);
 	if (posix_timer_event(ptr, 0) != 0)
 		ptr->it_overrun++;
 
 	/* Re-add periodic timers */
-	if (alarm->period.tv64) {
-		ptr->it_overrun += alarm_forward(alarm, now, alarm->period);
+	if (ptr->it.alarm.interval.tv64) {
+		ptr->it_overrun += alarm_forward(alarm, now,
+						ptr->it.alarm.interval);
 		return ALARMTIMER_RESTART;
 	}
 	return ALARMTIMER_NORESTART;
@@ -471,7 +469,7 @@ static int alarm_timer_create(struct k_itimer *new_timer)
 
 	type = clock2alarm(new_timer->it_clock);
 	base = &alarm_bases[type];
-	alarm_init(&new_timer->it.alarmtimer, type, alarm_handle_timer);
+	alarm_init(&new_timer->it.alarm.alarmtimer, type, alarm_handle_timer);
 	return 0;
 }
 
@@ -488,9 +486,9 @@ static void alarm_timer_get(struct k_itimer *timr,
 	memset(cur_setting, 0, sizeof(struct itimerspec));
 
 	cur_setting->it_interval =
-			ktime_to_timespec(timr->it.alarmtimer.period);
+			ktime_to_timespec(timr->it.alarm.interval);
 	cur_setting->it_value =
-			ktime_to_timespec(timr->it.alarmtimer.node.expires);
+		ktime_to_timespec(timr->it.alarm.alarmtimer.node.expires);
 	return;
 }
 
@@ -505,7 +503,7 @@ static int alarm_timer_del(struct k_itimer *timr)
 	if (!rtcdev)
 		return -ENOTSUPP;
 
-	alarm_cancel(&timr->it.alarmtimer);
+	alarm_cancel(&timr->it.alarm.alarmtimer);
 	return 0;
 }
 
@@ -529,12 +527,12 @@ static int alarm_timer_set(struct k_itimer *timr, int flags,
 		alarm_timer_get(timr, old_setting);
 
 	/* If the timer was already set, cancel it */
-	alarm_cancel(&timr->it.alarmtimer);
+	alarm_cancel(&timr->it.alarm.alarmtimer);
 
 	/* start the timer */
-	alarm_start(&timr->it.alarmtimer,
-			timespec_to_ktime(new_setting->it_value),
-			timespec_to_ktime(new_setting->it_interval));
+	timr->it.alarm.interval = timespec_to_ktime(new_setting->it_interval);
+	alarm_start(&timr->it.alarm.alarmtimer,
+			timespec_to_ktime(new_setting->it_value));
 	return 0;
 }
 
@@ -567,7 +565,7 @@ static int alarmtimer_do_nsleep(struct alarm *alarm, ktime_t absexp)
 	alarm->data = (void *)current;
 	do {
 		set_current_state(TASK_INTERRUPTIBLE);
-		alarm_start(alarm, absexp, ktime_set(0, 0));
+		alarm_start(alarm, absexp);
 		if (likely(alarm->data))
 			schedule();
 
-- 
cgit v1.2.2


From a28cde81ab13cc251748a4c4ef06883dd09a10ea Mon Sep 17 00:00:00 2001
From: John Stultz <john.stultz@linaro.org>
Date: Wed, 10 Aug 2011 12:30:21 -0700
Subject: alarmtimers: Add more refined alarm state tracking

In order to allow for functionality like try_to_cancel, add
more refined  state tracking (similar to hrtimers).

CC: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: John Stultz <john.stultz@linaro.org>
---
 kernel/time/alarmtimer.c | 21 ++++++++++++++-------
 1 file changed, 14 insertions(+), 7 deletions(-)

(limited to 'kernel')

diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c
index 90935591dd44..5b14cc29b6a6 100644
--- a/kernel/time/alarmtimer.c
+++ b/kernel/time/alarmtimer.c
@@ -126,6 +126,8 @@ static struct rtc_device *alarmtimer_get_rtcdev(void)
 static void alarmtimer_enqueue(struct alarm_base *base, struct alarm *alarm)
 {
 	timerqueue_add(&base->timerqueue, &alarm->node);
+	alarm->state |= ALARMTIMER_STATE_ENQUEUED;
+
 	if (&alarm->node == timerqueue_getnext(&base->timerqueue)) {
 		hrtimer_try_to_cancel(&base->timer);
 		hrtimer_start(&base->timer, alarm->node.expires,
@@ -147,7 +149,12 @@ static void alarmtimer_remove(struct alarm_base *base, struct alarm *alarm)
 {
 	struct timerqueue_node *next = timerqueue_getnext(&base->timerqueue);
 
+	if (!(alarm->state & ALARMTIMER_STATE_ENQUEUED))
+		return;
+
 	timerqueue_del(&base->timerqueue, &alarm->node);
+	alarm->state &= ~ALARMTIMER_STATE_ENQUEUED;
+
 	if (next == &alarm->node) {
 		hrtimer_try_to_cancel(&base->timer);
 		next = timerqueue_getnext(&base->timerqueue);
@@ -188,16 +195,18 @@ static enum hrtimer_restart alarmtimer_fired(struct hrtimer *timer)
 		alarm = container_of(next, struct alarm, node);
 
 		timerqueue_del(&base->timerqueue, &alarm->node);
-		alarm->enabled = 0;
+		alarm->state &= ~ALARMTIMER_STATE_ENQUEUED;
 
+		alarm->state |= ALARMTIMER_STATE_CALLBACK;
 		spin_unlock_irqrestore(&base->lock, flags);
 		if (alarm->function)
 			restart = alarm->function(alarm, now);
 		spin_lock_irqsave(&base->lock, flags);
+		alarm->state &= ~ALARMTIMER_STATE_CALLBACK;
 
 		if (restart != ALARMTIMER_NORESTART) {
 			timerqueue_add(&base->timerqueue, &alarm->node);
-			alarm->enabled = 1;
+			alarm->state |= ALARMTIMER_STATE_ENQUEUED;
 		}
 	}
 
@@ -305,7 +314,7 @@ void alarm_init(struct alarm *alarm, enum alarmtimer_type type,
 	timerqueue_init(&alarm->node);
 	alarm->function = function;
 	alarm->type = type;
-	alarm->enabled = 0;
+	alarm->state = ALARMTIMER_STATE_INACTIVE;
 }
 
 /**
@@ -319,11 +328,10 @@ void alarm_start(struct alarm *alarm, ktime_t start)
 	unsigned long flags;
 
 	spin_lock_irqsave(&base->lock, flags);
-	if (alarm->enabled)
+	if (alarmtimer_active(alarm))
 		alarmtimer_remove(base, alarm);
 	alarm->node.expires = start;
 	alarmtimer_enqueue(base, alarm);
-	alarm->enabled = 1;
 	spin_unlock_irqrestore(&base->lock, flags);
 }
 
@@ -337,9 +345,8 @@ void alarm_cancel(struct alarm *alarm)
 	unsigned long flags;
 
 	spin_lock_irqsave(&base->lock, flags);
-	if (alarm->enabled)
+	if (alarmtimer_is_queued(alarm))
 		alarmtimer_remove(base, alarm);
-	alarm->enabled = 0;
 	spin_unlock_irqrestore(&base->lock, flags);
 }
 
-- 
cgit v1.2.2


From 9082c465a5403f4a98734193e078552991a2e283 Mon Sep 17 00:00:00 2001
From: John Stultz <john.stultz@linaro.org>
Date: Wed, 10 Aug 2011 12:41:36 -0700
Subject: alarmtimers: Add try_to_cancel functionality

There's a number of edge cases when cancelling a alarm, so
to be sure we accurately do so, introduce try_to_cancel, which
returns proper failure errors if it cannot. Also modify cancel
to spin until the alarm is properly disabled.

CC: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: John Stultz <john.stultz@linaro.org>
---
 kernel/time/alarmtimer.c | 43 +++++++++++++++++++++++++++++++++++++------
 1 file changed, 37 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c
index 5b14cc29b6a6..bdb7342b6896 100644
--- a/kernel/time/alarmtimer.c
+++ b/kernel/time/alarmtimer.c
@@ -336,21 +336,49 @@ void alarm_start(struct alarm *alarm, ktime_t start)
 }
 
 /**
- * alarm_cancel - Tries to cancel an alarm timer
+ * alarm_try_to_cancel - Tries to cancel an alarm timer
  * @alarm: ptr to alarm to be canceled
+ *
+ * Returns 1 if the timer was canceled, 0 if it was not running,
+ * and -1 if the callback was running
  */
-void alarm_cancel(struct alarm *alarm)
+int alarm_try_to_cancel(struct alarm *alarm)
 {
 	struct alarm_base *base = &alarm_bases[alarm->type];
 	unsigned long flags;
-
+	int ret = -1;
 	spin_lock_irqsave(&base->lock, flags);
-	if (alarmtimer_is_queued(alarm))
+
+	if (alarmtimer_callback_running(alarm))
+		goto out;
+
+	if (alarmtimer_is_queued(alarm)) {
 		alarmtimer_remove(base, alarm);
+		ret = 1;
+	} else
+		ret = 0;
+out:
 	spin_unlock_irqrestore(&base->lock, flags);
+	return ret;
 }
 
 
+/**
+ * alarm_cancel - Spins trying to cancel an alarm timer until it is done
+ * @alarm: ptr to alarm to be canceled
+ *
+ * Returns 1 if the timer was canceled, 0 if it was not active.
+ */
+int alarm_cancel(struct alarm *alarm)
+{
+	for (;;) {
+		int ret = alarm_try_to_cancel(alarm);
+		if (ret >= 0)
+			return ret;
+		cpu_relax();
+	}
+}
+
 
 u64 alarm_forward(struct alarm *alarm, ktime_t now, ktime_t interval)
 {
@@ -510,7 +538,9 @@ static int alarm_timer_del(struct k_itimer *timr)
 	if (!rtcdev)
 		return -ENOTSUPP;
 
-	alarm_cancel(&timr->it.alarm.alarmtimer);
+	if (alarm_try_to_cancel(&timr->it.alarm.alarmtimer) < 0)
+		return TIMER_RETRY;
+
 	return 0;
 }
 
@@ -534,7 +564,8 @@ static int alarm_timer_set(struct k_itimer *timr, int flags,
 		alarm_timer_get(timr, old_setting);
 
 	/* If the timer was already set, cancel it */
-	alarm_cancel(&timr->it.alarm.alarmtimer);
+	if (alarm_try_to_cancel(&timr->it.alarm.alarmtimer) < 0)
+		return TIMER_RETRY;
 
 	/* start the timer */
 	timr->it.alarm.interval = timespec_to_ktime(new_setting->it_interval);
-- 
cgit v1.2.2


From 8bc0dafb5cf38a19484dfb16e2c6d29e85820046 Mon Sep 17 00:00:00 2001
From: John Stultz <john.stultz@linaro.org>
Date: Thu, 14 Jul 2011 18:35:13 -0700
Subject: alarmtimers: Rework RTC device selection using class interface

This allows cleaner detection of the RTC device being registered, rather
then probing any time someone calls alarmtimer_get_rtcdev.

CC: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: John Stultz <john.stultz@linaro.org>
---
 kernel/time/alarmtimer.c | 78 +++++++++++++++++++++++++-----------------------
 1 file changed, 40 insertions(+), 38 deletions(-)

(limited to 'kernel')

diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c
index bdb7342b6896..154d5563ab1b 100644
--- a/kernel/time/alarmtimer.c
+++ b/kernel/time/alarmtimer.c
@@ -52,27 +52,6 @@ static struct rtc_timer		rtctimer;
 static struct rtc_device	*rtcdev;
 static DEFINE_SPINLOCK(rtcdev_lock);
 
-/**
- * has_wakealarm - check rtc device has wakealarm ability
- * @dev: current device
- * @name_ptr: name to be returned
- *
- * This helper function checks to see if the rtc device can wake
- * from suspend.
- */
-static int has_wakealarm(struct device *dev, void *name_ptr)
-{
-	struct rtc_device *candidate = to_rtc_device(dev);
-
-	if (!candidate->ops->set_alarm)
-		return 0;
-	if (!device_may_wakeup(candidate->dev.parent))
-		return 0;
-
-	*(const char **)name_ptr = dev_name(dev);
-	return 1;
-}
-
 /**
  * alarmtimer_get_rtcdev - Return selected rtcdevice
  *
@@ -82,37 +61,58 @@ static int has_wakealarm(struct device *dev, void *name_ptr)
  */
 static struct rtc_device *alarmtimer_get_rtcdev(void)
 {
-	struct device *dev;
-	char *str;
 	unsigned long flags;
 	struct rtc_device *ret;
 
 	spin_lock_irqsave(&rtcdev_lock, flags);
-	if (!rtcdev) {
-		/* Find an rtc device and init the rtc_timer */
-		dev = class_find_device(rtc_class, NULL, &str, has_wakealarm);
-		/* If we have a device then str is valid. See has_wakealarm() */
-		if (dev) {
-			rtcdev = rtc_class_open(str);
-			/*
-			 * Drop the reference we got in class_find_device,
-			 * rtc_open takes its own.
-			 */
-			put_device(dev);
-			rtc_timer_init(&rtctimer, NULL, NULL);
-		}
-	}
 	ret = rtcdev;
 	spin_unlock_irqrestore(&rtcdev_lock, flags);
 
 	return ret;
 }
+
+
+static int alarmtimer_rtc_add_device(struct device *dev,
+				struct class_interface *class_intf)
+{
+	unsigned long flags;
+	struct rtc_device *rtc = to_rtc_device(dev);
+
+	if (rtcdev)
+		return -EBUSY;
+
+	if (!rtc->ops->set_alarm)
+		return -1;
+	if (!device_may_wakeup(rtc->dev.parent))
+		return -1;
+
+	spin_lock_irqsave(&rtcdev_lock, flags);
+	if (!rtcdev) {
+		rtcdev = rtc;
+		/* hold a reference so it doesn't go away */
+		get_device(dev);
+	}
+	spin_unlock_irqrestore(&rtcdev_lock, flags);
+	return 0;
+}
+
+static struct class_interface alarmtimer_rtc_interface = {
+	.add_dev = &alarmtimer_rtc_add_device,
+};
+
+static void alarmtimer_rtc_interface_setup(void)
+{
+	alarmtimer_rtc_interface.class = rtc_class;
+	class_interface_register(&alarmtimer_rtc_interface);
+}
 #else
 #define alarmtimer_get_rtcdev() (0)
 #define rtcdev (0)
+#define alarmtimer_rtc_interface_setup()
 #endif
 
 
+
 /**
  * alarmtimer_enqueue - Adds an alarm timer to an alarm_base timerqueue
  * @base: pointer to the base where the timer is being run
@@ -244,7 +244,7 @@ static int alarmtimer_suspend(struct device *dev)
 	freezer_delta = ktime_set(0, 0);
 	spin_unlock_irqrestore(&freezer_delta_lock, flags);
 
-	rtc = rtcdev;
+	rtc = alarmtimer_get_rtcdev();
 	/* If we have no rtcdev, just return */
 	if (!rtc)
 		return 0;
@@ -792,6 +792,8 @@ static int __init alarmtimer_init(void)
 				HRTIMER_MODE_ABS);
 		alarm_bases[i].timer.function = alarmtimer_fired;
 	}
+
+	alarmtimer_rtc_interface_setup();
 	error = platform_driver_register(&alarmtimer_driver);
 	platform_device_register_simple("alarmtimer", -1, NULL, 0);
 
-- 
cgit v1.2.2


From 3a301d7c1c68fd96bbcf82db82d9508918e0dc22 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Mon, 8 Aug 2011 21:39:39 -0400
Subject: tracing: Clean up tb_fmt to not give faulty compile warning

gcc incorrectly states that the variable "fmt" is uninitialized when
CC_OPITMIZE_FOR_SIZE is set.

Instead of just blindly setting fmt to NULL, the code is cleaned up
a little to be a bit easier for humans to follow, as well as gcc
to know the variables are initialized.

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace_printk.c | 19 ++++++++++---------
 1 file changed, 10 insertions(+), 9 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_printk.c b/kernel/trace/trace_printk.c
index 1f06468a10d7..6fd4ffd042f9 100644
--- a/kernel/trace/trace_printk.c
+++ b/kernel/trace/trace_printk.c
@@ -59,18 +59,19 @@ void hold_module_trace_bprintk_format(const char **start, const char **end)
 			continue;
 		}
 
+		fmt = NULL;
 		tb_fmt = kmalloc(sizeof(*tb_fmt), GFP_KERNEL);
-		if (tb_fmt)
+		if (tb_fmt) {
 			fmt = kmalloc(strlen(*iter) + 1, GFP_KERNEL);
-		if (tb_fmt && fmt) {
-			list_add_tail(&tb_fmt->list, &trace_bprintk_fmt_list);
-			strcpy(fmt, *iter);
-			tb_fmt->fmt = fmt;
-			*iter = tb_fmt->fmt;
-		} else {
-			kfree(tb_fmt);
-			*iter = NULL;
+			if (fmt) {
+				list_add_tail(&tb_fmt->list, &trace_bprintk_fmt_list);
+				strcpy(fmt, *iter);
+				tb_fmt->fmt = fmt;
+			} else
+				kfree(tb_fmt);
 		}
+		*iter = fmt;
+
 	}
 	mutex_unlock(&btrace_mutex);
 }
-- 
cgit v1.2.2


From b75ef8b44b1cb95f5a26484b0e2fe37a63b12b44 Mon Sep 17 00:00:00 2001
From: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Date: Wed, 10 Aug 2011 15:18:39 -0400
Subject: Tracepoint: Dissociate from module mutex

Copy the information needed from struct module into a local module list
held within tracepoint.c from within the module coming/going notifier.

This vastly simplifies locking of tracepoint registration /
unregistration, because we don't have to take the module mutex to
register and unregister tracepoints anymore. Steven Rostedt ran into
dependency problems related to modules mutex vs kprobes mutex vs ftrace
mutex vs tracepoint mutex that seems to be hard to fix without removing
this dependency between tracepoint and module mutex. (note: it should be
investigated whether kprobes could benefit of being dissociated from the
modules mutex too.)

This also fixes module handling of tracepoint list iterators, because it
was expecting the list to be sorted by pointer address. Given we have
control on our own list now, it's OK to sort this list which has
tracepoints as its only purpose. The reason why this sorting is required
is to handle the fact that seq files (and any read() operation from
user-space) cannot hold the tracepoint mutex across multiple calls, so
list entries may vanish between calls. With sorting, the tracepoint
iterator becomes usable even if the list don't contain the exact item
pointed to by the iterator anymore.

Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Acked-by: Jason Baron <jbaron@redhat.com>
CC: Ingo Molnar <mingo@elte.hu>
CC: Lai Jiangshan <laijs@cn.fujitsu.com>
CC: Peter Zijlstra <a.p.zijlstra@chello.nl>
CC: Thomas Gleixner <tglx@linutronix.de>
CC: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
Link: http://lkml.kernel.org/r/20110810191839.GC8525@Krystal
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/module.c     |  47 ---------------
 kernel/tracepoint.c | 169 +++++++++++++++++++++++++++++++++++++++++++++-------
 2 files changed, 147 insertions(+), 69 deletions(-)

(limited to 'kernel')

diff --git a/kernel/module.c b/kernel/module.c
index 04379f92f843..93342d992f34 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -3487,50 +3487,3 @@ void module_layout(struct module *mod,
 }
 EXPORT_SYMBOL(module_layout);
 #endif
-
-#ifdef CONFIG_TRACEPOINTS
-void module_update_tracepoints(void)
-{
-	struct module *mod;
-
-	mutex_lock(&module_mutex);
-	list_for_each_entry(mod, &modules, list)
-		if (!mod->taints)
-			tracepoint_update_probe_range(mod->tracepoints_ptrs,
-				mod->tracepoints_ptrs + mod->num_tracepoints);
-	mutex_unlock(&module_mutex);
-}
-
-/*
- * Returns 0 if current not found.
- * Returns 1 if current found.
- */
-int module_get_iter_tracepoints(struct tracepoint_iter *iter)
-{
-	struct module *iter_mod;
-	int found = 0;
-
-	mutex_lock(&module_mutex);
-	list_for_each_entry(iter_mod, &modules, list) {
-		if (!iter_mod->taints) {
-			/*
-			 * Sorted module list
-			 */
-			if (iter_mod < iter->module)
-				continue;
-			else if (iter_mod > iter->module)
-				iter->tracepoint = NULL;
-			found = tracepoint_get_iter_range(&iter->tracepoint,
-				iter_mod->tracepoints_ptrs,
-				iter_mod->tracepoints_ptrs
-					+ iter_mod->num_tracepoints);
-			if (found) {
-				iter->module = iter_mod;
-				break;
-			}
-		}
-	}
-	mutex_unlock(&module_mutex);
-	return found;
-}
-#endif
diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c
index b219f1449c54..db110b8ae030 100644
--- a/kernel/tracepoint.c
+++ b/kernel/tracepoint.c
@@ -34,11 +34,16 @@ extern struct tracepoint * const __stop___tracepoints_ptrs[];
 static const int tracepoint_debug;
 
 /*
- * tracepoints_mutex nests inside module_mutex. Tracepoints mutex protects the
- * builtin and module tracepoints and the hash table.
+ * Tracepoints mutex protects the builtin and module tracepoints and the hash
+ * table, as well as the local module list.
  */
 static DEFINE_MUTEX(tracepoints_mutex);
 
+#ifdef CONFIG_MODULES
+/* Local list of struct module */
+static LIST_HEAD(tracepoint_module_list);
+#endif /* CONFIG_MODULES */
+
 /*
  * Tracepoint hash table, containing the active tracepoints.
  * Protected by tracepoints_mutex.
@@ -292,9 +297,10 @@ static void disable_tracepoint(struct tracepoint *elem)
  * @end: end of the range
  *
  * Updates the probe callback corresponding to a range of tracepoints.
+ * Called with tracepoints_mutex held.
  */
-void tracepoint_update_probe_range(struct tracepoint * const *begin,
-				   struct tracepoint * const *end)
+static void tracepoint_update_probe_range(struct tracepoint * const *begin,
+					  struct tracepoint * const *end)
 {
 	struct tracepoint * const *iter;
 	struct tracepoint_entry *mark_entry;
@@ -302,7 +308,6 @@ void tracepoint_update_probe_range(struct tracepoint * const *begin,
 	if (!begin)
 		return;
 
-	mutex_lock(&tracepoints_mutex);
 	for (iter = begin; iter < end; iter++) {
 		mark_entry = get_tracepoint((*iter)->name);
 		if (mark_entry) {
@@ -312,11 +317,27 @@ void tracepoint_update_probe_range(struct tracepoint * const *begin,
 			disable_tracepoint(*iter);
 		}
 	}
-	mutex_unlock(&tracepoints_mutex);
 }
 
+#ifdef CONFIG_MODULES
+void module_update_tracepoints(void)
+{
+	struct tp_module *tp_mod;
+
+	list_for_each_entry(tp_mod, &tracepoint_module_list, list)
+		tracepoint_update_probe_range(tp_mod->tracepoints_ptrs,
+			tp_mod->tracepoints_ptrs + tp_mod->num_tracepoints);
+}
+#else /* CONFIG_MODULES */
+void module_update_tracepoints(void)
+{
+}
+#endif /* CONFIG_MODULES */
+
+
 /*
  * Update probes, removing the faulty probes.
+ * Called with tracepoints_mutex held.
  */
 static void tracepoint_update_probes(void)
 {
@@ -359,11 +380,12 @@ int tracepoint_probe_register(const char *name, void *probe, void *data)
 
 	mutex_lock(&tracepoints_mutex);
 	old = tracepoint_add_probe(name, probe, data);
-	mutex_unlock(&tracepoints_mutex);
-	if (IS_ERR(old))
+	if (IS_ERR(old)) {
+		mutex_unlock(&tracepoints_mutex);
 		return PTR_ERR(old);
-
+	}
 	tracepoint_update_probes();		/* may update entry */
+	mutex_unlock(&tracepoints_mutex);
 	release_probes(old);
 	return 0;
 }
@@ -402,11 +424,12 @@ int tracepoint_probe_unregister(const char *name, void *probe, void *data)
 
 	mutex_lock(&tracepoints_mutex);
 	old = tracepoint_remove_probe(name, probe, data);
-	mutex_unlock(&tracepoints_mutex);
-	if (IS_ERR(old))
+	if (IS_ERR(old)) {
+		mutex_unlock(&tracepoints_mutex);
 		return PTR_ERR(old);
-
+	}
 	tracepoint_update_probes();		/* may update entry */
+	mutex_unlock(&tracepoints_mutex);
 	release_probes(old);
 	return 0;
 }
@@ -489,9 +512,8 @@ void tracepoint_probe_update_all(void)
 	if (!list_empty(&old_probes))
 		list_replace_init(&old_probes, &release_probes);
 	need_update = 0;
-	mutex_unlock(&tracepoints_mutex);
-
 	tracepoint_update_probes();
+	mutex_unlock(&tracepoints_mutex);
 	list_for_each_entry_safe(pos, next, &release_probes, u.list) {
 		list_del(&pos->u.list);
 		call_rcu_sched(&pos->u.rcu, rcu_free_old_probes);
@@ -509,7 +531,7 @@ EXPORT_SYMBOL_GPL(tracepoint_probe_update_all);
  * Will return the first tracepoint in the range if the input tracepoint is
  * NULL.
  */
-int tracepoint_get_iter_range(struct tracepoint * const **tracepoint,
+static int tracepoint_get_iter_range(struct tracepoint * const **tracepoint,
 	struct tracepoint * const *begin, struct tracepoint * const *end)
 {
 	if (!*tracepoint && begin != end) {
@@ -520,11 +542,12 @@ int tracepoint_get_iter_range(struct tracepoint * const **tracepoint,
 		return 1;
 	return 0;
 }
-EXPORT_SYMBOL_GPL(tracepoint_get_iter_range);
 
+#ifdef CONFIG_MODULES
 static void tracepoint_get_iter(struct tracepoint_iter *iter)
 {
 	int found = 0;
+	struct tp_module *iter_mod;
 
 	/* Core kernel tracepoints */
 	if (!iter->module) {
@@ -534,12 +557,43 @@ static void tracepoint_get_iter(struct tracepoint_iter *iter)
 		if (found)
 			goto end;
 	}
-	/* tracepoints in modules. */
-	found = module_get_iter_tracepoints(iter);
+	/* Tracepoints in modules */
+	mutex_lock(&tracepoints_mutex);
+	list_for_each_entry(iter_mod, &tracepoint_module_list, list) {
+		/*
+		 * Sorted module list
+		 */
+		if (iter_mod < iter->module)
+			continue;
+		else if (iter_mod > iter->module)
+			iter->tracepoint = NULL;
+		found = tracepoint_get_iter_range(&iter->tracepoint,
+			iter_mod->tracepoints_ptrs,
+			iter_mod->tracepoints_ptrs
+				+ iter_mod->num_tracepoints);
+		if (found) {
+			iter->module = iter_mod;
+			break;
+		}
+	}
+	mutex_unlock(&tracepoints_mutex);
 end:
 	if (!found)
 		tracepoint_iter_reset(iter);
 }
+#else /* CONFIG_MODULES */
+static void tracepoint_get_iter(struct tracepoint_iter *iter)
+{
+	int found = 0;
+
+	/* Core kernel tracepoints */
+	found = tracepoint_get_iter_range(&iter->tracepoint,
+			__start___tracepoints_ptrs,
+			__stop___tracepoints_ptrs);
+	if (!found)
+		tracepoint_iter_reset(iter);
+}
+#endif /* CONFIG_MODULES */
 
 void tracepoint_iter_start(struct tracepoint_iter *iter)
 {
@@ -566,26 +620,98 @@ EXPORT_SYMBOL_GPL(tracepoint_iter_stop);
 
 void tracepoint_iter_reset(struct tracepoint_iter *iter)
 {
+#ifdef CONFIG_MODULES
 	iter->module = NULL;
+#endif /* CONFIG_MODULES */
 	iter->tracepoint = NULL;
 }
 EXPORT_SYMBOL_GPL(tracepoint_iter_reset);
 
 #ifdef CONFIG_MODULES
+static int tracepoint_module_coming(struct module *mod)
+{
+	struct tp_module *tp_mod, *iter;
+	int ret = 0;
+
+	/*
+	 * We skip modules that tain the kernel, especially those with different
+	 * module header (for forced load), to make sure we don't cause a crash.
+	 */
+	if (mod->taints)
+		return 0;
+	mutex_lock(&tracepoints_mutex);
+	tp_mod = kmalloc(sizeof(struct tp_module), GFP_KERNEL);
+	if (!tp_mod) {
+		ret = -ENOMEM;
+		goto end;
+	}
+	tp_mod->num_tracepoints = mod->num_tracepoints;
+	tp_mod->tracepoints_ptrs = mod->tracepoints_ptrs;
+
+	/*
+	 * tracepoint_module_list is kept sorted by struct module pointer
+	 * address for iteration on tracepoints from a seq_file that can release
+	 * the mutex between calls.
+	 */
+	list_for_each_entry_reverse(iter, &tracepoint_module_list, list) {
+		BUG_ON(iter == tp_mod);	/* Should never be in the list twice */
+		if (iter < tp_mod) {
+			/* We belong to the location right after iter. */
+			list_add(&tp_mod->list, &iter->list);
+			goto module_added;
+		}
+	}
+	/* We belong to the beginning of the list */
+	list_add(&tp_mod->list, &tracepoint_module_list);
+module_added:
+	tracepoint_update_probe_range(mod->tracepoints_ptrs,
+		mod->tracepoints_ptrs + mod->num_tracepoints);
+end:
+	mutex_unlock(&tracepoints_mutex);
+	return ret;
+}
+
+static int tracepoint_module_going(struct module *mod)
+{
+	struct tp_module *pos;
+
+	mutex_lock(&tracepoints_mutex);
+	tracepoint_update_probe_range(mod->tracepoints_ptrs,
+		mod->tracepoints_ptrs + mod->num_tracepoints);
+	list_for_each_entry(pos, &tracepoint_module_list, list) {
+		if (pos->tracepoints_ptrs == mod->tracepoints_ptrs) {
+			list_del(&pos->list);
+			kfree(pos);
+			break;
+		}
+	}
+	/*
+	 * In the case of modules that were tainted at "coming", we'll simply
+	 * walk through the list without finding it. We cannot use the "tainted"
+	 * flag on "going", in case a module taints the kernel only after being
+	 * loaded.
+	 */
+	mutex_unlock(&tracepoints_mutex);
+	return 0;
+}
 
 int tracepoint_module_notify(struct notifier_block *self,
 			     unsigned long val, void *data)
 {
 	struct module *mod = data;
+	int ret = 0;
 
 	switch (val) {
 	case MODULE_STATE_COMING:
+		ret = tracepoint_module_coming(mod);
+		break;
+	case MODULE_STATE_LIVE:
+		break;
 	case MODULE_STATE_GOING:
-		tracepoint_update_probe_range(mod->tracepoints_ptrs,
-			mod->tracepoints_ptrs + mod->num_tracepoints);
+		ret = tracepoint_module_going(mod);
 		break;
 	}
-	return 0;
+	return ret;
 }
 
 struct notifier_block tracepoint_module_nb = {
@@ -598,7 +724,6 @@ static int init_tracepoints(void)
 	return register_module_notifier(&tracepoint_module_nb);
 }
 __initcall(init_tracepoints);
-
 #endif /* CONFIG_MODULES */
 
 #ifdef CONFIG_HAVE_SYSCALL_TRACEPOINTS
-- 
cgit v1.2.2


From 144060fee07e9c22e179d00819c83c86fbcbf82c Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Mon, 1 Aug 2011 12:49:14 +0200
Subject: perf: Add PM notifiers to fix CPU hotplug races

Francis reports that s2r gets him spurious NMIs, this is because the
suspend code leaves the boot cpu up and running.

Cure this by adding a suspend notifier. The problem is that hotplug
and suspend are completely un-serialized and the PM notifiers run
before the suspend cpu unplug of all but the boot cpu.

This leaves a window where the user can initialize another hotplug
operation (either remove or add a cpu) resulting in either one too
many or one too few hotplug ops. Thus we cannot use the hotplug code
for the suspend case.

There's another reason to not use the hotplug code, which is that the
hotplug code totally destroys the perf state, we can do better for
suspend and simply remove all counters from the PMU so that we can
re-instate them on resume.

Reported-by: Francis Moreau <francis.moro@gmail.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Link: http://lkml.kernel.org/n/tip-1cvevybkgmv4s6v5y37t4847@git.kernel.org
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/events/core.c | 97 ++++++++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 95 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/events/core.c b/kernel/events/core.c
index b8785e26ee1c..d4c85425e3a0 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -29,6 +29,7 @@
 #include <linux/hardirq.h>
 #include <linux/rculist.h>
 #include <linux/uaccess.h>
+#include <linux/suspend.h>
 #include <linux/syscalls.h>
 #include <linux/anon_inodes.h>
 #include <linux/kernel_stat.h>
@@ -6809,7 +6810,7 @@ static void __cpuinit perf_event_init_cpu(int cpu)
 	struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
 
 	mutex_lock(&swhash->hlist_mutex);
-	if (swhash->hlist_refcount > 0) {
+	if (swhash->hlist_refcount > 0 && !swhash->swevent_hlist) {
 		struct swevent_hlist *hlist;
 
 		hlist = kzalloc_node(sizeof(*hlist), GFP_KERNEL, cpu_to_node(cpu));
@@ -6898,7 +6899,14 @@ perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu)
 {
 	unsigned int cpu = (long)hcpu;
 
-	switch (action & ~CPU_TASKS_FROZEN) {
+	/*
+	 * Ignore suspend/resume action, the perf_pm_notifier will
+	 * take care of that.
+	 */
+	if (action & CPU_TASKS_FROZEN)
+		return NOTIFY_OK;
+
+	switch (action) {
 
 	case CPU_UP_PREPARE:
 	case CPU_DOWN_FAILED:
@@ -6917,6 +6925,90 @@ perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu)
 	return NOTIFY_OK;
 }
 
+static void perf_pm_resume_cpu(void *unused)
+{
+	struct perf_cpu_context *cpuctx;
+	struct perf_event_context *ctx;
+	struct pmu *pmu;
+	int idx;
+
+	idx = srcu_read_lock(&pmus_srcu);
+	list_for_each_entry_rcu(pmu, &pmus, entry) {
+		cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
+		ctx = cpuctx->task_ctx;
+
+		perf_ctx_lock(cpuctx, ctx);
+		perf_pmu_disable(cpuctx->ctx.pmu);
+
+		cpu_ctx_sched_out(cpuctx, EVENT_ALL);
+		if (ctx)
+			ctx_sched_out(ctx, cpuctx, EVENT_ALL);
+
+		perf_pmu_enable(cpuctx->ctx.pmu);
+		perf_ctx_unlock(cpuctx, ctx);
+	}
+	srcu_read_unlock(&pmus_srcu, idx);
+}
+
+static void perf_pm_suspend_cpu(void *unused)
+{
+	struct perf_cpu_context *cpuctx;
+	struct perf_event_context *ctx;
+	struct pmu *pmu;
+	int idx;
+
+	idx = srcu_read_lock(&pmus_srcu);
+	list_for_each_entry_rcu(pmu, &pmus, entry) {
+		cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
+		ctx = cpuctx->task_ctx;
+
+		perf_ctx_lock(cpuctx, ctx);
+		perf_pmu_disable(cpuctx->ctx.pmu);
+
+		perf_event_sched_in(cpuctx, ctx, current);
+
+		perf_pmu_enable(cpuctx->ctx.pmu);
+		perf_ctx_unlock(cpuctx, ctx);
+	}
+	srcu_read_unlock(&pmus_srcu, idx);
+}
+
+static int perf_resume(void)
+{
+	get_online_cpus();
+	smp_call_function(perf_pm_resume_cpu, NULL, 1);
+	put_online_cpus();
+
+	return NOTIFY_OK;
+}
+
+static int perf_suspend(void)
+{
+	get_online_cpus();
+	smp_call_function(perf_pm_suspend_cpu, NULL, 1);
+	put_online_cpus();
+
+	return NOTIFY_OK;
+}
+
+static int perf_pm(struct notifier_block *self, unsigned long action, void *ptr)
+{
+	switch (action) {
+	case PM_POST_HIBERNATION:
+	case PM_POST_SUSPEND:
+		return perf_resume();
+	case PM_HIBERNATION_PREPARE:
+	case PM_SUSPEND_PREPARE:
+		return perf_suspend();
+	default:
+		return NOTIFY_DONE;
+	}
+}
+
+static struct notifier_block perf_pm_notifier = {
+	.notifier_call = perf_pm,
+};
+
 void __init perf_event_init(void)
 {
 	int ret;
@@ -6931,6 +7023,7 @@ void __init perf_event_init(void)
 	perf_tp_register();
 	perf_cpu_notifier(perf_cpu_notify);
 	register_reboot_notifier(&perf_reboot_notifier);
+	register_pm_notifier(&perf_pm_notifier);
 
 	ret = init_hw_breakpoint();
 	WARN(ret, "hw_breakpoint initialization failed with: %d", ret);
-- 
cgit v1.2.2


From 7e5b2a01d2ca2eae4ef913b59f84341f9a70e206 Mon Sep 17 00:00:00 2001
From: Mark Rutland <mark.rutland@arm.com>
Date: Thu, 11 Aug 2011 12:31:20 +0100
Subject: perf: provide PMU when initing events

Currently, an event's 'pmu' field is set after pmu::event_init() is
called. This means that pmu::event_init() must figure out which struct
pmu the event was initialised from. This makes it difficult to
consolidate common event initialisation code for similar PMUs, and
very difficult to implement drivers for PMUs which can have multiple
instances (e.g. a USB controller PMU, a GPU PMU, etc).

This patch sets the 'pmu' field before initialising the event, allowing
event init code to identify the struct pmu instance easily. In the
event of failure to initialise an event, the event is destroyed via
kfree() without calling perf_event::destroy(), so this shouldn't
result in bad behaviour even if the destroy field was set before
failure to initialise was noted.

Signed-off-by: Mark Rutland <mark.rutland@arm.com>
Reviewed-by: Will Deacon <will.deacon@arm.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Link: http://lkml.kernel.org/r/1313062280-19123-1-git-send-email-mark.rutland@arm.com
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/events/core.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/events/core.c b/kernel/events/core.c
index d4c85425e3a0..adc3ef37b7e8 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -5716,6 +5716,7 @@ struct pmu *perf_init_event(struct perf_event *event)
 	pmu = idr_find(&pmu_idr, event->attr.type);
 	rcu_read_unlock();
 	if (pmu) {
+		event->pmu = pmu;
 		ret = pmu->event_init(event);
 		if (ret)
 			pmu = ERR_PTR(ret);
@@ -5723,6 +5724,7 @@ struct pmu *perf_init_event(struct perf_event *event)
 	}
 
 	list_for_each_entry_rcu(pmu, &pmus, entry) {
+		event->pmu = pmu;
 		ret = pmu->event_init(event);
 		if (!ret)
 			goto unlock;
@@ -5849,8 +5851,6 @@ done:
 		return ERR_PTR(err);
 	}
 
-	event->pmu = pmu;
-
 	if (!event->parent) {
 		if (event->attach_state & PERF_ATTACH_TASK)
 			jump_label_inc(&perf_sched_events);
-- 
cgit v1.2.2


From 18e5a45db30e0e338cdd663eda05a8288cc14fa5 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Wed, 3 Aug 2011 13:59:04 -0400
Subject: watchdog: Make the kthreads NUMA affine

Watchdog kthreads can use kthread_create_on_node() to NUMA affine their
stack and task_struct.

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: Don Zickus <dzickus@redhat.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Link: http://lkml.kernel.org/r/1312394344-18815-1-git-send-email-dzickus@redhat.com
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/watchdog.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 36491cd5b7d4..e952a1394d26 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -438,7 +438,7 @@ static int watchdog_enable(int cpu)
 
 	/* create the watchdog thread */
 	if (!p) {
-		p = kthread_create(watchdog, (void *)(unsigned long)cpu, "watchdog/%d", cpu);
+		p = kthread_create_on_node(watchdog, NULL, cpu_to_node(cpu), "watchdog/%d", cpu);
 		if (IS_ERR(p)) {
 			printk(KERN_ERR "softlockup watchdog for %i failed\n", cpu);
 			if (!err) {
-- 
cgit v1.2.2


From e2b245f89ee3f5b03fb42d843a79a58cf4773181 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jan=20H=2E=20Sch=C3=B6nherr?= <schnhrr@cs.tu-berlin.de>
Date: Mon, 1 Aug 2011 11:03:28 +0200
Subject: sched: Remove rq->avg_load_per_task
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Since commit a2d47777 ("sched: fix stale value in average load per task")
the variable rq->avg_load_per_task is no longer required. Remove it.

Signed-off-by: Jan H. Schönherr <schnhrr@cs.tu-berlin.de>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Link: http://lkml.kernel.org/r/1312189408-17172-1-git-send-email-schnhrr@cs.tu-berlin.de
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c | 8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index ccacdbdecf45..b1e8f0eca1f9 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -520,8 +520,6 @@ struct rq {
 	int cpu;
 	int online;
 
-	unsigned long avg_load_per_task;
-
 	u64 rt_avg;
 	u64 age_stamp;
 	u64 idle_stamp;
@@ -1569,11 +1567,9 @@ static unsigned long cpu_avg_load_per_task(int cpu)
 	unsigned long nr_running = ACCESS_ONCE(rq->nr_running);
 
 	if (nr_running)
-		rq->avg_load_per_task = rq->load.weight / nr_running;
-	else
-		rq->avg_load_per_task = 0;
+		return rq->load.weight / nr_running;
 
-	return rq->avg_load_per_task;
+	return 0;
 }
 
 #ifdef CONFIG_PREEMPT
-- 
cgit v1.2.2


From 2c2efaed9bc973e3aeab1385c618017b56c8f6d7 Mon Sep 17 00:00:00 2001
From: Yong Zhang <yong.zhang0@gmail.com>
Date: Fri, 29 Jul 2011 16:20:33 +0800
Subject: sched: Kill WAKEUP_PREEMPT

Remove the WAKEUP_PREEMPT feature, disabling it doesn't make any sense
and its outlived its use by a long long while.

Signed-off-by: Yong Zhang <yong.zhang0@gmail.com>
Acked-by: Mike Galbraith <efault@gmx.de>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Link: http://lkml.kernel.org/r/20110729082033.GB12106@zhy
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched_fair.c     | 9 +--------
 kernel/sched_features.h | 5 -----
 2 files changed, 1 insertion(+), 13 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index bc8ee9993814..241fc86bc613 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -1095,9 +1095,6 @@ check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
 	 * narrow margin doesn't have to wait for a full slice.
 	 * This also mitigates buddy induced latencies under load.
 	 */
-	if (!sched_feat(WAKEUP_PREEMPT))
-		return;
-
 	if (delta_exec < sysctl_sched_min_granularity)
 		return;
 
@@ -1233,7 +1230,7 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
 		return;
 #endif
 
-	if (cfs_rq->nr_running > 1 || !sched_feat(WAKEUP_PREEMPT))
+	if (cfs_rq->nr_running > 1)
 		check_preempt_tick(cfs_rq, curr);
 }
 
@@ -1899,10 +1896,6 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
 	if (unlikely(p->policy != SCHED_NORMAL))
 		return;
 
-
-	if (!sched_feat(WAKEUP_PREEMPT))
-		return;
-
 	find_matching_se(&se, &pse);
 	update_curr(cfs_rq_of(se));
 	BUG_ON(!pse);
diff --git a/kernel/sched_features.h b/kernel/sched_features.h
index 2e74677cb040..efa0a7b75dde 100644
--- a/kernel/sched_features.h
+++ b/kernel/sched_features.h
@@ -11,11 +11,6 @@ SCHED_FEAT(GENTLE_FAIR_SLEEPERS, 1)
  */
 SCHED_FEAT(START_DEBIT, 1)
 
-/*
- * Should wakeups try to preempt running tasks.
- */
-SCHED_FEAT(WAKEUP_PREEMPT, 1)
-
 /*
  * Based on load and program behaviour, see if it makes sense to place
  * a newly woken task on the same cpu as the task that woke it --
-- 
cgit v1.2.2


From c350a04efd1c89cd256b2abc8f07a21d0d53ff24 Mon Sep 17 00:00:00 2001
From: Mike Galbraith <efault@gmx.de>
Date: Wed, 27 Jul 2011 17:14:55 +0200
Subject: sched: fix broken SCHED_RESET_ON_FORK handling

Setting child->prio = current->normal_prio _after_ SCHED_RESET_ON_FORK has
been handled for an RT parent gives birth to a deranged mutant child with
non-RT policy, but RT prio and sched_class.

Move PI leakage protection up, always set priorities and weight, and if the
child is leaving RT class, reset rt_priority to the proper value.

Signed-off-by: Mike Galbraith <efault@gmx.de>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Link: http://lkml.kernel.org/r/1311779695.8691.2.camel@marge.simson.net
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c | 25 ++++++++++++-------------
 1 file changed, 12 insertions(+), 13 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index b1e8f0eca1f9..cf427bb2b65e 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -2843,20 +2843,24 @@ void sched_fork(struct task_struct *p)
 	 */
 	p->state = TASK_RUNNING;
 
+	/*
+	 * Make sure we do not leak PI boosting priority to the child.
+	 */
+	p->prio = current->normal_prio;
+
 	/*
 	 * Revert to default priority/policy on fork if requested.
 	 */
 	if (unlikely(p->sched_reset_on_fork)) {
-		if (p->policy == SCHED_FIFO || p->policy == SCHED_RR) {
+		if (task_has_rt_policy(p)) {
 			p->policy = SCHED_NORMAL;
-			p->normal_prio = p->static_prio;
-		}
-
-		if (PRIO_TO_NICE(p->static_prio) < 0) {
 			p->static_prio = NICE_TO_PRIO(0);
-			p->normal_prio = p->static_prio;
-			set_load_weight(p);
-		}
+			p->rt_priority = 0;
+		} else if (PRIO_TO_NICE(p->static_prio) < 0)
+			p->static_prio = NICE_TO_PRIO(0);
+
+		p->prio = p->normal_prio = __normal_prio(p);
+		set_load_weight(p);
 
 		/*
 		 * We don't need the reset flag anymore after the fork. It has
@@ -2865,11 +2869,6 @@ void sched_fork(struct task_struct *p)
 		p->sched_reset_on_fork = 0;
 	}
 
-	/*
-	 * Make sure we do not leak PI boosting priority to the child.
-	 */
-	p->prio = current->normal_prio;
-
 	if (!rt_prio(p->prio))
 		p->sched_class = &fair_sched_class;
 
-- 
cgit v1.2.2


From 67d955383ab2ef8866c494c14156a4f3d29e441c Mon Sep 17 00:00:00 2001
From: Hillf Danton <dhillf@gmail.com>
Date: Thu, 16 Jun 2011 21:55:18 -0400
Subject: sched: Remove noop in next_prio()

When computing the next priority for a given run-queue, the check for
RT priority of the task determined by the pick_next_highest_task_rt()
function could be removed, since only RT tasks are returned by the
function.

Reviewed-by: Yong Zhang <yong.zhang0@gmail.com>
Signed-off-by: Hillf Danton <dhillf@gmail.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Link: http://lkml.kernel.org/r/BANLkTimxmWiof9s5AvS3v_0X+sMiE=0x5g@mail.gmail.com
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched_rt.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index 97540f0c9e47..e2698c0fc697 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -704,7 +704,7 @@ static inline int next_prio(struct rq *rq)
 {
 	struct task_struct *next = pick_next_highest_task_rt(rq, rq->cpu);
 
-	if (next && rt_prio(next->prio))
+	if (next)
 		return next->prio;
 	else
 		return MAX_RT_PRIO;
-- 
cgit v1.2.2


From 0835471697255b415edcefd6b1e25b6c034439f2 Mon Sep 17 00:00:00 2001
From: Hillf Danton <dhillf@gmail.com>
Date: Thu, 16 Jun 2011 21:55:19 -0400
Subject: sched: Remove noop in lowest_flag_domain()

Checking for the validity of sd is removed, since it is already
checked by the for_each_domain macro.

Signed-off-by: Hillf Danton <dhillf@gmail.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Link: http://lkml.kernel.org/r/BANLkTimT+Tut-3TshCDm-NiLLXrOznibNA@mail.gmail.com
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched_fair.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 241fc86bc613..f4b732a3552b 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -3660,7 +3660,7 @@ static inline struct sched_domain *lowest_flag_domain(int cpu, int flag)
 	struct sched_domain *sd;
 
 	for_each_domain(cpu, sd)
-		if (sd && (sd->flags & flag))
+		if (sd->flags & flag)
 			break;
 
 	return sd;
-- 
cgit v1.2.2


From 311e800e16f63d909136a64ed17ca353a160be59 Mon Sep 17 00:00:00 2001
From: Hillf Danton <dhillf@gmail.com>
Date: Thu, 16 Jun 2011 21:55:20 -0400
Subject: sched, rt: Fix rq->rt.pushable_tasks bug in push_rt_task()

Do not call dequeue_pushable_task() when failing to push an eligible
task, as it remains pushable, merely not at this particular moment.

Signed-off-by: Hillf Danton <dhillf@gmail.com>
Signed-off-by: Mike Galbraith <mgalbraith@gmx.de>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Yong Zhang <yong.zhang0@gmail.com>
Link: http://lkml.kernel.org/r/1306895385.4791.26.camel@marge.simson.net
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched_rt.c | 15 ++++++++-------
 1 file changed, 8 insertions(+), 7 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index e2698c0fc697..8e189455ed12 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -1394,6 +1394,7 @@ static int push_rt_task(struct rq *rq)
 {
 	struct task_struct *next_task;
 	struct rq *lowest_rq;
+	int ret = 0;
 
 	if (!rq->rt.overloaded)
 		return 0;
@@ -1426,7 +1427,7 @@ retry:
 	if (!lowest_rq) {
 		struct task_struct *task;
 		/*
-		 * find lock_lowest_rq releases rq->lock
+		 * find_lock_lowest_rq releases rq->lock
 		 * so it is possible that next_task has migrated.
 		 *
 		 * We need to make sure that the task is still on the same
@@ -1436,12 +1437,11 @@ retry:
 		task = pick_next_pushable_task(rq);
 		if (task_cpu(next_task) == rq->cpu && task == next_task) {
 			/*
-			 * If we get here, the task hasn't moved at all, but
-			 * it has failed to push.  We will not try again,
-			 * since the other cpus will pull from us when they
-			 * are ready.
+			 * The task hasn't migrated, and is still the next
+			 * eligible task, but we failed to find a run-queue
+			 * to push it to.  Do not retry in this case, since
+			 * other cpus will pull from us when ready.
 			 */
-			dequeue_pushable_task(rq, next_task);
 			goto out;
 		}
 
@@ -1460,6 +1460,7 @@ retry:
 	deactivate_task(rq, next_task, 0);
 	set_task_cpu(next_task, lowest_rq->cpu);
 	activate_task(lowest_rq, next_task, 0);
+	ret = 1;
 
 	resched_task(lowest_rq->curr);
 
@@ -1468,7 +1469,7 @@ retry:
 out:
 	put_task_struct(next_task);
 
-	return 1;
+	return ret;
 }
 
 static void push_rt_tasks(struct rq *rq)
-- 
cgit v1.2.2


From 1812a643ccbfeb61a00a7f0d7219606e63d8815b Mon Sep 17 00:00:00 2001
From: Hillf Danton <dhillf@gmail.com>
Date: Thu, 16 Jun 2011 21:55:21 -0400
Subject: sched: Remove resetting exec_start in put_prev_task_rt()

There's no reason to clean the exec_start in put_prev_task_rt() as it is reset
when the task gets back to the run queue. This saves us doing a store() in the
fast path.

Signed-off-by: Hillf Danton <dhillf@gmail.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Yong Zhang <yong.zhang0@gmail.com>
Link: http://lkml.kernel.org/r/BANLkTimqWD=q6YnSDi-v9y=LMWecgEzEWg@mail.gmail.com
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched_rt.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index 8e189455ed12..70107a39b1ba 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -1178,7 +1178,6 @@ static struct task_struct *pick_next_task_rt(struct rq *rq)
 static void put_prev_task_rt(struct rq *rq, struct task_struct *p)
 {
 	update_curr_rt(rq);
-	p->se.exec_start = 0;
 
 	/*
 	 * The previous task needs to be made eligible for pushing
-- 
cgit v1.2.2


From c37495fd0f64fc139b5a07d242bcb485174d1206 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Thu, 16 Jun 2011 21:55:22 -0400
Subject: sched: Balance RT tasks when forked as well

When a new task is woken, the code to balance the RT task is currently
skipped in the select_task_rq() call. But it will be pushed if the rq
is currently overloaded with RT tasks anyway. The issue is that we
already queued the task, and if it does get pushed, it will have to
be dequeued and requeued on the new run queue. The advantage with
pushing it first is that we avoid this requeuing as we are pushing it
off before the task is ever queued.

See commit 318e0893ce3f524 ("sched: pre-route RT tasks on wakeup")
for more details.

The return of select_task_rq() when it is not a wake up has also been
changed to return task_cpu() instead of smp_processor_id(). This is more
of a sanity because the current only other user of select_task_rq()
besides wake ups, is an exec, where task_cpu() should also be the same
as smp_processor_id(). But if it is used for other purposes, lets keep
the task on the same CPU. Why would we mant to migrate it to the current
CPU?

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Hillf Danton <dhillf@gmail.com>
Link: http://lkml.kernel.org/r/20110617015919.832743148@goodmis.org
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched_rt.c | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index 70107a39b1ba..2153a87e6157 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -1017,10 +1017,12 @@ select_task_rq_rt(struct task_struct *p, int sd_flag, int flags)
 	struct rq *rq;
 	int cpu;
 
-	if (sd_flag != SD_BALANCE_WAKE)
-		return smp_processor_id();
-
 	cpu = task_cpu(p);
+
+	/* For anything but wake ups, just return the task_cpu */
+	if (sd_flag != SD_BALANCE_WAKE && sd_flag != SD_BALANCE_FORK)
+		goto out;
+
 	rq = cpu_rq(cpu);
 
 	rcu_read_lock();
@@ -1059,6 +1061,7 @@ select_task_rq_rt(struct task_struct *p, int sd_flag, int flags)
 	}
 	rcu_read_unlock();
 
+out:
 	return cpu;
 }
 
-- 
cgit v1.2.2


From 5181f4a46afd99e5e85c639b189e43e0a42b53df Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Thu, 16 Jun 2011 21:55:23 -0400
Subject: sched: Use pushable_tasks to determine next highest prio

Hillf Danton proposed a patch (see link) that cleaned up the
sched_rt code that calculates the priority of the next highest priority
task to be used in finding run queues to pull from.

His patch removed the calculating of the next prio to just use the current
prio when deteriming if we should examine a run queue to pull from. The problem
with his patch was that it caused more false checks. Because we check a run
queue for pushable tasks if the current priority of that run queue is higher
in priority than the task about to run on our run queue. But after grabbing
the locks and doing the real check, we find that there may not be a task
that has a higher prio task to pull. Thus the locks were taken with nothing to
do.

I added some trace_printks() to record when and how many times the run queue
locks were taken to check for pullable tasks, compared to how many times we
pulled a task.

With the current method, it was:

  3806 locks taken vs 2812 pulled tasks

With Hillf's patch:

  6728 locks taken vs 2804 pulled tasks

The number of times locks were taken to pull a task went up almost double with
no more success rate.

But his patch did get me thinking. When we look at the priority of the highest
task to consider taking the locks to do a pull, a failure to pull can be one
of the following: (in order of most likely)

 o RT task was pushed off already between the check and taking the lock
 o Waiting RT task can not be migrated
 o RT task's CPU affinity does not include the target run queue's CPU
 o RT task's priority changed between the check and taking the lock

And with Hillf's patch, the thing that caused most of the failures, is
the RT task to pull was not at the right priority to pull (not greater than
the current RT task priority on the target run queue).

Most of the above cases we can't help. But the current method does not check
if the next highest prio RT task can be migrated or not, and if it can not,
we still grab the locks to do the test (we don't find out about this fact until
after we have the locks). I thought about this case, and realized that the
pushable task plist that is maintained only holds RT tasks that can migrate.
If we move the calculating of the next highest prio task from the inc/dec_rt_task()
functions into the queuing of the pushable tasks, then we only measure the
priorities of those tasks that we push, and we get this basically for free.

Not only does this patch make the code a little more efficient, it cleans it
up and makes it a little simpler.

Thanks to Hillf Danton for inspiring me on this patch.

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Hillf Danton <dhillf@gmail.com>
Cc: Gregory Haskins <ghaskins@novell.com>
Link: http://lkml.kernel.org/r/BANLkTimQ67180HxCx5vgMqumqw1EkFh3qg@mail.gmail.com
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched_rt.c | 61 ++++++++++++++++---------------------------------------
 1 file changed, 18 insertions(+), 43 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index 2153a87e6157..a8c207ff3492 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -124,21 +124,33 @@ static void dec_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
 	update_rt_migration(rt_rq);
 }
 
+static inline int has_pushable_tasks(struct rq *rq)
+{
+	return !plist_head_empty(&rq->rt.pushable_tasks);
+}
+
 static void enqueue_pushable_task(struct rq *rq, struct task_struct *p)
 {
 	plist_del(&p->pushable_tasks, &rq->rt.pushable_tasks);
 	plist_node_init(&p->pushable_tasks, p->prio);
 	plist_add(&p->pushable_tasks, &rq->rt.pushable_tasks);
+
+	/* Update the highest prio pushable task */
+	if (p->prio < rq->rt.highest_prio.next)
+		rq->rt.highest_prio.next = p->prio;
 }
 
 static void dequeue_pushable_task(struct rq *rq, struct task_struct *p)
 {
 	plist_del(&p->pushable_tasks, &rq->rt.pushable_tasks);
-}
 
-static inline int has_pushable_tasks(struct rq *rq)
-{
-	return !plist_head_empty(&rq->rt.pushable_tasks);
+	/* Update the new highest prio pushable task */
+	if (has_pushable_tasks(rq)) {
+		p = plist_first_entry(&rq->rt.pushable_tasks,
+				      struct task_struct, pushable_tasks);
+		rq->rt.highest_prio.next = p->prio;
+	} else
+		rq->rt.highest_prio.next = MAX_RT_PRIO;
 }
 
 #else
@@ -698,47 +710,13 @@ static void update_curr_rt(struct rq *rq)
 
 #if defined CONFIG_SMP
 
-static struct task_struct *pick_next_highest_task_rt(struct rq *rq, int cpu);
-
-static inline int next_prio(struct rq *rq)
-{
-	struct task_struct *next = pick_next_highest_task_rt(rq, rq->cpu);
-
-	if (next)
-		return next->prio;
-	else
-		return MAX_RT_PRIO;
-}
-
 static void
 inc_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio)
 {
 	struct rq *rq = rq_of_rt_rq(rt_rq);
 
-	if (prio < prev_prio) {
-
-		/*
-		 * If the new task is higher in priority than anything on the
-		 * run-queue, we know that the previous high becomes our
-		 * next-highest.
-		 */
-		rt_rq->highest_prio.next = prev_prio;
-
-		if (rq->online)
-			cpupri_set(&rq->rd->cpupri, rq->cpu, prio);
-
-	} else if (prio == rt_rq->highest_prio.curr)
-		/*
-		 * If the next task is equal in priority to the highest on
-		 * the run-queue, then we implicitly know that the next highest
-		 * task cannot be any lower than current
-		 */
-		rt_rq->highest_prio.next = prio;
-	else if (prio < rt_rq->highest_prio.next)
-		/*
-		 * Otherwise, we need to recompute next-highest
-		 */
-		rt_rq->highest_prio.next = next_prio(rq);
+	if (rq->online && prio < prev_prio)
+		cpupri_set(&rq->rd->cpupri, rq->cpu, prio);
 }
 
 static void
@@ -746,9 +724,6 @@ dec_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio)
 {
 	struct rq *rq = rq_of_rt_rq(rt_rq);
 
-	if (rt_rq->rt_nr_running && (prio <= rt_rq->highest_prio.next))
-		rt_rq->highest_prio.next = next_prio(rq);
-
 	if (rq->online && rt_rq->highest_prio.curr != prev_prio)
 		cpupri_set(&rq->rd->cpupri, rq->cpu, rt_rq->highest_prio.curr);
 }
-- 
cgit v1.2.2


From c92211d9b772792a9dea530c042efb4ab5562f50 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <rostedt@goodmis.org>
Date: Tue, 2 Aug 2011 16:36:12 -0400
Subject: sched/cpupri: Remove the vec->lock
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

sched/cpupri: Remove the vec->lock

The cpupri vec->lock has been showing up as a top contention
lately. This is because of the RT push/pull logic takes an
agressive approach for migrating RT tasks. The cpupri logic is
in place to improve the performance of the push/pull when dealing
with large number CPU machines.

The problem though is a vec->lock is required, where a vec is a
global per RT priority structure. That is, if there are lots of
RT tasks at the same priority, every time they are added or removed
from the RT queue, this global vec->lock is taken. Now that more
kernel threads are becoming RT (RCU boost and threaded interrupts)
this is becoming much more of an issue.

There are two variables that are being synced by the vec->lock.
The cpupri bitmask, and the vec->counter. The cpupri bitmask
is one bit per priority. If a RT priority vec has a process queued,
then the vec->count is > 0 and the cpupri bitmask is set for that
RT priority.

If the cpupri bitmask gets out of sync with the vec->counter, we could
end up pushing a low proirity RT task to a high priority queue.
That RT task that could have run immediately could be queued on a
run queue with a higher priority task indefinitely.

The solution is not to use the cpupri bitmask and just look at the
vec->count directly when doing a pull. The cpupri bitmask is just
a fast way to scan the RT priorities when a pull is made. Instead
of using the bitmask, and just examine all RT priorities, and
look at the vec->counts, we could eliminate the vec->lock. The
scan of RT tasks is to find a run queue that we can push an RT task
to, and we do not push to a high priority queue, thus the scan only
needs to go from 1 to RT task->prio, and not all 100 RT priorities.

The push algorithm, which does the scan of RT priorities (and
scan of the bitmask) only happens when we have an overloaded RT run
queue (more than one RT task queued). The grabbing of the vec->lock
happens every time any RT task is queued or dequeued on the run
queue for that priority. The slowing down of the scan by not using
a bitmask is negligible by the speed up of removing the vec->lock
contention, and replacing it with an atomic counter and memory barrier.

To prove this, I wrote a patch that times both the loop and the code
that grabs the vec->locks. I passed the patches to various people
(and companies) to test and show the results. I let everyone choose
their own load to test, giving different loads on the system,
for various different setups.

Here's some of the results: (snipping to a few CPUs to not make
this change log huge, but the results were consistent across
the entire system).

System 1 (24 CPUs)

Before patch:
CPU:    Name    Count   Max     Min     Average Total
----    ----    -----   ---     ---     ------- -----
[...]
cpu 20: loop    3057    1.766   0.061   0.642   1963.170
        vec     6782949 90.469  0.089   0.414   2811760.503
cpu 21: loop    2617    1.723   0.062   0.641   1679.074
        vec     6782810 90.499  0.089   0.291   1978499.900
cpu 22: loop    2212    1.863   0.063   0.699   1547.160
        vec     6767244 85.685  0.089   0.435   2949676.898
cpu 23: loop    2320    2.013   0.062   0.594   1380.265
        vec     6781694 87.923  0.088   0.431   2928538.224

After patch:
cpu 20: loop    2078    1.579   0.061   0.533   1108.006
        vec     6164555 5.704   0.060   0.143   885185.809
cpu 21: loop    2268    1.712   0.065   0.575   1305.248
        vec     6153376 5.558   0.060   0.187   1154960.469
cpu 22: loop    1542    1.639   0.095   0.533   823.249
        vec     6156510 5.720   0.060   0.190   1172727.232
cpu 23: loop    1650    1.733   0.068   0.545   900.781
        vec     6170784 5.533   0.060   0.167   1034287.953

All times are in microseconds. The 'loop' is the amount of time spent
doing the loop across the priorities (before patch uses bitmask).
the 'vec' is the amount of time in the code that requires grabbing
the vec->lock. The second patch just does not have the vec lock, but
encompasses the same code.

Amazingly the loop code even went down on average. The vec code went
from .5 down to .18, that's more than half the time spent!

Note, more than one test was run, but they all had the same results.

System 2 (64 CPUs)

Before patch:
CPU:    Name    Count   Max     Min     Average Total
----    ----    -----   ---     ---     ------- -----
cpu 60: loop    0       0       0       0       0
        vec     5410840 277.954 0.084   0.782   4232895.727
cpu 61: loop    0       0       0       0       0
        vec     4915648 188.399 0.084   0.570   2803220.301
cpu 62: loop    0       0       0       0       0
        vec     5356076 276.417 0.085   0.786   4214544.548
cpu 63: loop    0       0       0       0       0
        vec     4891837 170.531 0.085   0.799   3910948.833

After patch:
cpu 60: loop    0       0       0       0       0
        vec     5365118 5.080   0.021   0.063   340490.267
cpu 61: loop    0       0       0       0       0
        vec     4898590 1.757   0.019   0.071   347903.615
cpu 62: loop    0       0       0       0       0
        vec     5737130 3.067   0.021   0.119   687108.734
cpu 63: loop    0       0       0       0       0
        vec     4903228 1.822   0.021   0.071   348506.477

The test run during the measurement did not have any (very few,
from other CPUs) RT tasks pushing. But this shows that it helped
out tremendously with the contention, as the contention happens
because the vec->lock is taken only on queuing at an RT priority,
and different CPUs that queue tasks at the same priority will
have contention.

I tested on my own 4 CPU machine with the following results:

Before patch:
CPU:    Name    Count   Max     Min     Average Total
----    ----    -----   ---     ---     ------- -----
cpu 0:  loop    2377    1.489   0.158   0.588   1398.395
        vec     4484    770.146 2.301   4.396   19711.755
cpu 1:  loop    2169    1.962   0.160   0.576   1250.110
        vec     4425    152.769 2.297   4.030   17834.228
cpu 2:  loop    2324    1.749   0.155   0.559   1299.799
        vec     4368    779.632 2.325   4.665   20379.268
cpu 3:  loop    2325    1.629   0.157   0.561   1306.113
        vec     4650    408.782 2.394   4.348   20222.577

After patch:
CPU:    Name    Count   Max     Min     Average Total
----    ----    -----   ---     ---     ------- -----
cpu 0:  loop    2121    1.616   0.113   0.636   1349.189
        vec     4303    1.151   0.225   0.421   1811.966
cpu 1:  loop    2130    1.638   0.178   0.644   1372.927
        vec     4627    1.379   0.235   0.428   1983.648
cpu 2:  loop    2056    1.464   0.165   0.637   1310.141
        vec     4471    1.311   0.217   0.433   1937.927
cpu 3:  loop    2154    1.481   0.162   0.601   1295.083
        vec     4236    1.253   0.230   0.425   1803.008

This was running my migrate.c code that can be found at:
http://lwn.net/Articles/425763/

The migrate code does stress the RT tasks a bit. This shows that
the loop did increase a little after the patch, but not by much.
The vec code dropped dramatically. From 4.3us down to .42us.
That's a 10x improvement!

Tested-by: Mike Galbraith <mgalbraith@suse.de>
Tested-by: Luis Claudio R. Gonçalves <lgoncalv@redhat.com>
Tested-by: Matthew Hank Sabins<msabins@linux.vnet.ibm.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
Reviewed-by: Gregory Haskins <gregory.haskins@gmail.com>
Acked-by: Hillf Danton <dhillf@gmail.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Chris Mason <chris.mason@oracle.com>
Link: http://lkml.kernel.org/r/1312317372.18583.101.camel@gandalf.stny.rr.com
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched_cpupri.c | 62 ++++++++++++++++++++++++++++++++-------------------
 kernel/sched_cpupri.h |  5 ++---
 2 files changed, 41 insertions(+), 26 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched_cpupri.c b/kernel/sched_cpupri.c
index 2722dc1b4138..7761a2669fff 100644
--- a/kernel/sched_cpupri.c
+++ b/kernel/sched_cpupri.c
@@ -47,9 +47,6 @@ static int convert_prio(int prio)
 	return cpupri;
 }
 
-#define for_each_cpupri_active(array, idx)                    \
-	for_each_set_bit(idx, array, CPUPRI_NR_PRIORITIES)
-
 /**
  * cpupri_find - find the best (lowest-pri) CPU in the system
  * @cp: The cpupri context
@@ -71,11 +68,33 @@ int cpupri_find(struct cpupri *cp, struct task_struct *p,
 	int                  idx      = 0;
 	int                  task_pri = convert_prio(p->prio);
 
-	for_each_cpupri_active(cp->pri_active, idx) {
+	if (task_pri >= MAX_RT_PRIO)
+		return 0;
+
+	for (idx = 0; idx < task_pri; idx++) {
 		struct cpupri_vec *vec  = &cp->pri_to_cpu[idx];
 
-		if (idx >= task_pri)
-			break;
+		if (!atomic_read(&(vec)->count))
+			continue;
+		/*
+		 * When looking at the vector, we need to read the counter,
+		 * do a memory barrier, then read the mask.
+		 *
+		 * Note: This is still all racey, but we can deal with it.
+		 *  Ideally, we only want to look at masks that are set.
+		 *
+		 *  If a mask is not set, then the only thing wrong is that we
+		 *  did a little more work than necessary.
+		 *
+		 *  If we read a zero count but the mask is set, because of the
+		 *  memory barriers, that can only happen when the highest prio
+		 *  task for a run queue has left the run queue, in which case,
+		 *  it will be followed by a pull. If the task we are processing
+		 *  fails to find a proper place to go, that pull request will
+		 *  pull this task if the run queue is running at a lower
+		 *  priority.
+		 */
+		smp_rmb();
 
 		if (cpumask_any_and(&p->cpus_allowed, vec->mask) >= nr_cpu_ids)
 			continue;
@@ -115,7 +134,6 @@ void cpupri_set(struct cpupri *cp, int cpu, int newpri)
 {
 	int                 *currpri = &cp->cpu_to_pri[cpu];
 	int                  oldpri  = *currpri;
-	unsigned long        flags;
 
 	newpri = convert_prio(newpri);
 
@@ -134,26 +152,25 @@ void cpupri_set(struct cpupri *cp, int cpu, int newpri)
 	if (likely(newpri != CPUPRI_INVALID)) {
 		struct cpupri_vec *vec = &cp->pri_to_cpu[newpri];
 
-		raw_spin_lock_irqsave(&vec->lock, flags);
-
 		cpumask_set_cpu(cpu, vec->mask);
-		vec->count++;
-		if (vec->count == 1)
-			set_bit(newpri, cp->pri_active);
-
-		raw_spin_unlock_irqrestore(&vec->lock, flags);
+		/*
+		 * When adding a new vector, we update the mask first,
+		 * do a write memory barrier, and then update the count, to
+		 * make sure the vector is visible when count is set.
+		 */
+		smp_wmb();
+		atomic_inc(&(vec)->count);
 	}
 	if (likely(oldpri != CPUPRI_INVALID)) {
 		struct cpupri_vec *vec  = &cp->pri_to_cpu[oldpri];
 
-		raw_spin_lock_irqsave(&vec->lock, flags);
-
-		vec->count--;
-		if (!vec->count)
-			clear_bit(oldpri, cp->pri_active);
+		/*
+		 * When removing from the vector, we decrement the counter first
+		 * do a memory barrier and then clear the mask.
+		 */
+		atomic_dec(&(vec)->count);
+		smp_wmb();
 		cpumask_clear_cpu(cpu, vec->mask);
-
-		raw_spin_unlock_irqrestore(&vec->lock, flags);
 	}
 
 	*currpri = newpri;
@@ -175,8 +192,7 @@ int cpupri_init(struct cpupri *cp)
 	for (i = 0; i < CPUPRI_NR_PRIORITIES; i++) {
 		struct cpupri_vec *vec = &cp->pri_to_cpu[i];
 
-		raw_spin_lock_init(&vec->lock);
-		vec->count = 0;
+		atomic_set(&vec->count, 0);
 		if (!zalloc_cpumask_var(&vec->mask, GFP_KERNEL))
 			goto cleanup;
 	}
diff --git a/kernel/sched_cpupri.h b/kernel/sched_cpupri.h
index 9fc7d386fea4..6b4cd17dead6 100644
--- a/kernel/sched_cpupri.h
+++ b/kernel/sched_cpupri.h
@@ -12,9 +12,8 @@
 /* values 2-101 are RT priorities 0-99 */
 
 struct cpupri_vec {
-	raw_spinlock_t lock;
-	int        count;
-	cpumask_var_t mask;
+	atomic_t	count;
+	cpumask_var_t	mask;
 };
 
 struct cpupri {
-- 
cgit v1.2.2


From d473750b4073f16f23f46f30dc1bd3de45c35754 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <rostedt@goodmis.org>
Date: Fri, 5 Aug 2011 08:27:49 -0400
Subject: sched/cpupri: Fix memory barriers for vec updates to always be in
 order

[ This patch actually compiles. Thanks to Mike Galbraith for pointing
that out. I compiled and booted this patch with no issues. ]

Re-examining the cpupri patch, I see there's a possible race because the
update of the two priorities vec->counts are not protected by a memory
barrier.

When a RT runqueue is overloaded and wants to push an RT task to another
runqueue, it scans the RT priority vectors in a loop from lowest
priority to highest.

When we queue or dequeue an RT task that changes a runqueue's highest
priority task, we update the vectors to show that a runqueue is rated at
a different priority. To do this, we first set the new priority mask,
and increment the vec->count, and then set the old priority mask by
decrementing the vec->count.

If we are lowering the runqueue's RT priority rating, it will trigger a
RT pull, and we do not care if we miss pushing to this runqueue or not.

But if we raise the priority, but the priority is still lower than an RT
task that is looking to be pushed, we must make sure that this runqueue
is still seen by the push algorithm (the loop).

Because the loop reads from lowest to highest, and the new priority is
set before the old one is cleared, we will either see the new or old
priority set and the vector will be checked.

But! Since there's no memory barrier between the updates of the two, the
old count may be decremented first before the new count is incremented.
This means the loop may see the old count of zero and skip it, and also
the new count of zero before it was updated. A possible runqueue that
the RT task could move to could be missed.

A conditional memory barrier is placed between the vec->count updates
and is only called when both updates are done.

The smp_wmb() has also been changed to smp_mb__before_atomic_inc/dec(),
as they are not needed by archs that already synchronize
atomic_inc/dec().

The smp_rmb() has been moved to be called at every iteration of the loop
so that the race between seeing the two updates is visible by each
iteration of the loop, as an arch is free to optimize the reading of
memory of the counters in the loop.

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Nick Piggin <npiggin@kernel.dk>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Link: http://lkml.kernel.org/r/1312547269.18583.194.camel@gandalf.stny.rr.com
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched_cpupri.c | 28 +++++++++++++++++++++++++---
 1 file changed, 25 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched_cpupri.c b/kernel/sched_cpupri.c
index 7761a2669fff..90faffdbdf98 100644
--- a/kernel/sched_cpupri.c
+++ b/kernel/sched_cpupri.c
@@ -73,9 +73,10 @@ int cpupri_find(struct cpupri *cp, struct task_struct *p,
 
 	for (idx = 0; idx < task_pri; idx++) {
 		struct cpupri_vec *vec  = &cp->pri_to_cpu[idx];
+		int skip = 0;
 
 		if (!atomic_read(&(vec)->count))
-			continue;
+			skip = 1;
 		/*
 		 * When looking at the vector, we need to read the counter,
 		 * do a memory barrier, then read the mask.
@@ -96,6 +97,10 @@ int cpupri_find(struct cpupri *cp, struct task_struct *p,
 		 */
 		smp_rmb();
 
+		/* Need to do the rmb for every iteration */
+		if (skip)
+			continue;
+
 		if (cpumask_any_and(&p->cpus_allowed, vec->mask) >= nr_cpu_ids)
 			continue;
 
@@ -134,6 +139,7 @@ void cpupri_set(struct cpupri *cp, int cpu, int newpri)
 {
 	int                 *currpri = &cp->cpu_to_pri[cpu];
 	int                  oldpri  = *currpri;
+	int                  do_mb = 0;
 
 	newpri = convert_prio(newpri);
 
@@ -158,18 +164,34 @@ void cpupri_set(struct cpupri *cp, int cpu, int newpri)
 		 * do a write memory barrier, and then update the count, to
 		 * make sure the vector is visible when count is set.
 		 */
-		smp_wmb();
+		smp_mb__before_atomic_inc();
 		atomic_inc(&(vec)->count);
+		do_mb = 1;
 	}
 	if (likely(oldpri != CPUPRI_INVALID)) {
 		struct cpupri_vec *vec  = &cp->pri_to_cpu[oldpri];
 
+		/*
+		 * Because the order of modification of the vec->count
+		 * is important, we must make sure that the update
+		 * of the new prio is seen before we decrement the
+		 * old prio. This makes sure that the loop sees
+		 * one or the other when we raise the priority of
+		 * the run queue. We don't care about when we lower the
+		 * priority, as that will trigger an rt pull anyway.
+		 *
+		 * We only need to do a memory barrier if we updated
+		 * the new priority vec.
+		 */
+		if (do_mb)
+			smp_mb__after_atomic_inc();
+
 		/*
 		 * When removing from the vector, we decrement the counter first
 		 * do a memory barrier and then clear the mask.
 		 */
 		atomic_dec(&(vec)->count);
-		smp_wmb();
+		smp_mb__after_atomic_inc();
 		cpumask_clear_cpu(cpu, vec->mask);
 	}
 
-- 
cgit v1.2.2


From 5710f15b52664ae0bfa60a66d75464769d297b2b Mon Sep 17 00:00:00 2001
From: Yong Zhang <yong.zhang0@gmail.com>
Date: Sat, 6 Aug 2011 08:10:04 +0800
Subject: sched/cpupri: Remove cpupri->pri_active

Since [sched/cpupri: Remove the vec->lock], member pri_active
of struct cpupri is not needed any more, just remove it. Also
clean stuff related to it.

Signed-off-by: Yong Zhang <yong.zhang0@gmail.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Link: http://lkml.kernel.org/r/20110806001004.GA2207@zhy
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched_cpupri.c | 3 +--
 kernel/sched_cpupri.h | 2 --
 2 files changed, 1 insertion(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched_cpupri.c b/kernel/sched_cpupri.c
index 90faffdbdf98..a86cf9d9eb11 100644
--- a/kernel/sched_cpupri.c
+++ b/kernel/sched_cpupri.c
@@ -152,8 +152,7 @@ void cpupri_set(struct cpupri *cp, int cpu, int newpri)
 	 * If the cpu was currently mapped to a different value, we
 	 * need to map it to the new value then remove the old value.
 	 * Note, we must add the new value first, otherwise we risk the
-	 * cpu being cleared from pri_active, and this cpu could be
-	 * missed for a push or pull.
+	 * cpu being missed by the priority loop in cpupri_find.
 	 */
 	if (likely(newpri != CPUPRI_INVALID)) {
 		struct cpupri_vec *vec = &cp->pri_to_cpu[newpri];
diff --git a/kernel/sched_cpupri.h b/kernel/sched_cpupri.h
index 6b4cd17dead6..f6d756173491 100644
--- a/kernel/sched_cpupri.h
+++ b/kernel/sched_cpupri.h
@@ -4,7 +4,6 @@
 #include <linux/sched.h>
 
 #define CPUPRI_NR_PRIORITIES	(MAX_RT_PRIO + 2)
-#define CPUPRI_NR_PRI_WORDS	BITS_TO_LONGS(CPUPRI_NR_PRIORITIES)
 
 #define CPUPRI_INVALID -1
 #define CPUPRI_IDLE     0
@@ -18,7 +17,6 @@ struct cpupri_vec {
 
 struct cpupri {
 	struct cpupri_vec pri_to_cpu[CPUPRI_NR_PRIORITIES];
-	long              pri_active[CPUPRI_NR_PRI_WORDS];
 	int               cpu_to_pri[NR_CPUS];
 };
 
-- 
cgit v1.2.2


From 953bfcd10e6f3697233e8e5128c611d275da39c1 Mon Sep 17 00:00:00 2001
From: Paul Turner <pjt@google.com>
Date: Thu, 21 Jul 2011 09:43:27 -0700
Subject: sched: Implement hierarchical task accounting for SCHED_OTHER

Introduce hierarchical task accounting for the group scheduling case in CFS, as
well as promoting the responsibility for maintaining rq->nr_running to the
scheduling classes.

The primary motivation for this is that with scheduling classes supporting
bandwidth throttling it is possible for entities participating in throttled
sub-trees to not have root visible changes in rq->nr_running across activate
and de-activate operations.  This in turn leads to incorrect idle and
weight-per-task load balance decisions.

This also allows us to make a small fixlet to the fastpath in pick_next_task()
under group scheduling.

Note: this issue also exists with the existing sched_rt throttling mechanism.
This patch does not address that.

Signed-off-by: Paul Turner <pjt@google.com>
Reviewed-by: Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Link: http://lkml.kernel.org/r/20110721184756.878333391@google.com
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c          | 6 ++----
 kernel/sched_fair.c     | 6 ++++++
 kernel/sched_rt.c       | 5 ++++-
 kernel/sched_stoptask.c | 2 ++
 4 files changed, 14 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index cf427bb2b65e..cd1a531ca8ff 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -311,7 +311,7 @@ struct task_group root_task_group;
 /* CFS-related fields in a runqueue */
 struct cfs_rq {
 	struct load_weight load;
-	unsigned long nr_running;
+	unsigned long nr_running, h_nr_running;
 
 	u64 exec_clock;
 	u64 min_vruntime;
@@ -1802,7 +1802,6 @@ static void activate_task(struct rq *rq, struct task_struct *p, int flags)
 		rq->nr_uninterruptible--;
 
 	enqueue_task(rq, p, flags);
-	inc_nr_running(rq);
 }
 
 /*
@@ -1814,7 +1813,6 @@ static void deactivate_task(struct rq *rq, struct task_struct *p, int flags)
 		rq->nr_uninterruptible++;
 
 	dequeue_task(rq, p, flags);
-	dec_nr_running(rq);
 }
 
 #ifdef CONFIG_IRQ_TIME_ACCOUNTING
@@ -4258,7 +4256,7 @@ pick_next_task(struct rq *rq)
 	 * Optimization: we know that if all tasks are in
 	 * the fair class we can call that function directly:
 	 */
-	if (likely(rq->nr_running == rq->cfs.nr_running)) {
+	if (likely(rq->nr_running == rq->cfs.h_nr_running)) {
 		p = fair_sched_class.pick_next_task(rq);
 		if (likely(p))
 			return p;
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index f4b732a3552b..f86b0cb5eb29 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -1310,16 +1310,19 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 			break;
 		cfs_rq = cfs_rq_of(se);
 		enqueue_entity(cfs_rq, se, flags);
+		cfs_rq->h_nr_running++;
 		flags = ENQUEUE_WAKEUP;
 	}
 
 	for_each_sched_entity(se) {
 		cfs_rq = cfs_rq_of(se);
+		cfs_rq->h_nr_running++;
 
 		update_cfs_load(cfs_rq, 0);
 		update_cfs_shares(cfs_rq);
 	}
 
+	inc_nr_running(rq);
 	hrtick_update(rq);
 }
 
@@ -1339,6 +1342,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 	for_each_sched_entity(se) {
 		cfs_rq = cfs_rq_of(se);
 		dequeue_entity(cfs_rq, se, flags);
+		cfs_rq->h_nr_running--;
 
 		/* Don't dequeue parent if it has other entities besides us */
 		if (cfs_rq->load.weight) {
@@ -1358,11 +1362,13 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 
 	for_each_sched_entity(se) {
 		cfs_rq = cfs_rq_of(se);
+		cfs_rq->h_nr_running--;
 
 		update_cfs_load(cfs_rq, 0);
 		update_cfs_shares(cfs_rq);
 	}
 
+	dec_nr_running(rq);
 	hrtick_update(rq);
 }
 
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index a8c207ff3492..a9d3c6bc684a 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -936,6 +936,8 @@ enqueue_task_rt(struct rq *rq, struct task_struct *p, int flags)
 
 	if (!task_current(rq, p) && p->rt.nr_cpus_allowed > 1)
 		enqueue_pushable_task(rq, p);
+
+	inc_nr_running(rq);
 }
 
 static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags)
@@ -946,6 +948,8 @@ static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags)
 	dequeue_rt_entity(rt_se);
 
 	dequeue_pushable_task(rq, p);
+
+	dec_nr_running(rq);
 }
 
 /*
@@ -1841,4 +1845,3 @@ static void print_rt_stats(struct seq_file *m, int cpu)
 	rcu_read_unlock();
 }
 #endif /* CONFIG_SCHED_DEBUG */
-
diff --git a/kernel/sched_stoptask.c b/kernel/sched_stoptask.c
index 6f437632afab..8b44e7fa7fb3 100644
--- a/kernel/sched_stoptask.c
+++ b/kernel/sched_stoptask.c
@@ -34,11 +34,13 @@ static struct task_struct *pick_next_task_stop(struct rq *rq)
 static void
 enqueue_task_stop(struct rq *rq, struct task_struct *p, int flags)
 {
+	inc_nr_running(rq);
 }
 
 static void
 dequeue_task_stop(struct rq *rq, struct task_struct *p, int flags)
 {
+	dec_nr_running(rq);
 }
 
 static void yield_task_stop(struct rq *rq)
-- 
cgit v1.2.2


From ab84d31e15502fb626169ba2663381e34bf965b2 Mon Sep 17 00:00:00 2001
From: Paul Turner <pjt@google.com>
Date: Thu, 21 Jul 2011 09:43:28 -0700
Subject: sched: Introduce primitives to account for CFS bandwidth tracking

In this patch we introduce the notion of CFS bandwidth, partitioned into
globally unassigned bandwidth, and locally claimed bandwidth.

 - The global bandwidth is per task_group, it represents a pool of unclaimed
   bandwidth that cfs_rqs can allocate from.
 - The local bandwidth is tracked per-cfs_rq, this represents allotments from
   the global pool bandwidth assigned to a specific cpu.

Bandwidth is managed via cgroupfs, adding two new interfaces to the cpu subsystem:
 - cpu.cfs_period_us : the bandwidth period in usecs
 - cpu.cfs_quota_us : the cpu bandwidth (in usecs) that this tg will be allowed
   to consume over period above.

Signed-off-by: Paul Turner <pjt@google.com>
Signed-off-by: Nikhil Rao <ncrao@google.com>
Signed-off-by: Bharata B Rao <bharata@linux.vnet.ibm.com>
Reviewed-by: Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Link: http://lkml.kernel.org/r/20110721184756.972636699@google.com
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c      | 196 ++++++++++++++++++++++++++++++++++++++++++++++++++--
 kernel/sched_fair.c |  16 +++++
 2 files changed, 208 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index cd1a531ca8ff..f08cb23be96c 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -247,6 +247,14 @@ struct cfs_rq;
 
 static LIST_HEAD(task_groups);
 
+struct cfs_bandwidth {
+#ifdef CONFIG_CFS_BANDWIDTH
+	raw_spinlock_t lock;
+	ktime_t period;
+	u64 quota;
+#endif
+};
+
 /* task group related information */
 struct task_group {
 	struct cgroup_subsys_state css;
@@ -278,6 +286,8 @@ struct task_group {
 #ifdef CONFIG_SCHED_AUTOGROUP
 	struct autogroup *autogroup;
 #endif
+
+	struct cfs_bandwidth cfs_bandwidth;
 };
 
 /* task_group_lock serializes the addition/removal of task groups */
@@ -377,9 +387,48 @@ struct cfs_rq {
 
 	unsigned long load_contribution;
 #endif
+#ifdef CONFIG_CFS_BANDWIDTH
+	int runtime_enabled;
+	s64 runtime_remaining;
+#endif
 #endif
 };
 
+#ifdef CONFIG_FAIR_GROUP_SCHED
+#ifdef CONFIG_CFS_BANDWIDTH
+static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
+{
+	return &tg->cfs_bandwidth;
+}
+
+static inline u64 default_cfs_period(void);
+
+static void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
+{
+	raw_spin_lock_init(&cfs_b->lock);
+	cfs_b->quota = RUNTIME_INF;
+	cfs_b->period = ns_to_ktime(default_cfs_period());
+}
+
+static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq)
+{
+	cfs_rq->runtime_enabled = 0;
+}
+
+static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
+{}
+#else
+static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
+static void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
+static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
+
+static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
+{
+	return NULL;
+}
+#endif /* CONFIG_CFS_BANDWIDTH */
+#endif /* CONFIG_FAIR_GROUP_SCHED */
+
 /* Real-Time classes' related field in a runqueue: */
 struct rt_rq {
 	struct rt_prio_array active;
@@ -7971,6 +8020,7 @@ static void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
 	/* allow initial update_cfs_load() to truncate */
 	cfs_rq->load_stamp = 1;
 #endif
+	init_cfs_rq_runtime(cfs_rq);
 
 	tg->cfs_rq[cpu] = cfs_rq;
 	tg->se[cpu] = se;
@@ -8110,6 +8160,7 @@ void __init sched_init(void)
 		 * We achieve this by letting root_task_group's tasks sit
 		 * directly in rq->cfs (i.e root_task_group->se[] = NULL).
 		 */
+		init_cfs_bandwidth(&root_task_group.cfs_bandwidth);
 		init_tg_cfs_entry(&root_task_group, &rq->cfs, NULL, i, NULL);
 #endif /* CONFIG_FAIR_GROUP_SCHED */
 
@@ -8351,6 +8402,8 @@ static void free_fair_sched_group(struct task_group *tg)
 {
 	int i;
 
+	destroy_cfs_bandwidth(tg_cfs_bandwidth(tg));
+
 	for_each_possible_cpu(i) {
 		if (tg->cfs_rq)
 			kfree(tg->cfs_rq[i]);
@@ -8378,6 +8431,8 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
 
 	tg->shares = NICE_0_LOAD;
 
+	init_cfs_bandwidth(tg_cfs_bandwidth(tg));
+
 	for_each_possible_cpu(i) {
 		cfs_rq = kzalloc_node(sizeof(struct cfs_rq),
 				      GFP_KERNEL, cpu_to_node(i));
@@ -8753,7 +8808,7 @@ static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime)
 	return walk_tg_tree(tg_schedulable, tg_nop, &data);
 }
 
-static int tg_set_bandwidth(struct task_group *tg,
+static int tg_set_rt_bandwidth(struct task_group *tg,
 		u64 rt_period, u64 rt_runtime)
 {
 	int i, err = 0;
@@ -8792,7 +8847,7 @@ int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us)
 	if (rt_runtime_us < 0)
 		rt_runtime = RUNTIME_INF;
 
-	return tg_set_bandwidth(tg, rt_period, rt_runtime);
+	return tg_set_rt_bandwidth(tg, rt_period, rt_runtime);
 }
 
 long sched_group_rt_runtime(struct task_group *tg)
@@ -8817,7 +8872,7 @@ int sched_group_set_rt_period(struct task_group *tg, long rt_period_us)
 	if (rt_period == 0)
 		return -EINVAL;
 
-	return tg_set_bandwidth(tg, rt_period, rt_runtime);
+	return tg_set_rt_bandwidth(tg, rt_period, rt_runtime);
 }
 
 long sched_group_rt_period(struct task_group *tg)
@@ -9007,6 +9062,128 @@ static u64 cpu_shares_read_u64(struct cgroup *cgrp, struct cftype *cft)
 
 	return (u64) scale_load_down(tg->shares);
 }
+
+#ifdef CONFIG_CFS_BANDWIDTH
+const u64 max_cfs_quota_period = 1 * NSEC_PER_SEC; /* 1s */
+const u64 min_cfs_quota_period = 1 * NSEC_PER_MSEC; /* 1ms */
+
+static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota)
+{
+	int i;
+	struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg);
+	static DEFINE_MUTEX(mutex);
+
+	if (tg == &root_task_group)
+		return -EINVAL;
+
+	/*
+	 * Ensure we have at some amount of bandwidth every period.  This is
+	 * to prevent reaching a state of large arrears when throttled via
+	 * entity_tick() resulting in prolonged exit starvation.
+	 */
+	if (quota < min_cfs_quota_period || period < min_cfs_quota_period)
+		return -EINVAL;
+
+	/*
+	 * Likewise, bound things on the otherside by preventing insane quota
+	 * periods.  This also allows us to normalize in computing quota
+	 * feasibility.
+	 */
+	if (period > max_cfs_quota_period)
+		return -EINVAL;
+
+	mutex_lock(&mutex);
+	raw_spin_lock_irq(&cfs_b->lock);
+	cfs_b->period = ns_to_ktime(period);
+	cfs_b->quota = quota;
+	raw_spin_unlock_irq(&cfs_b->lock);
+
+	for_each_possible_cpu(i) {
+		struct cfs_rq *cfs_rq = tg->cfs_rq[i];
+		struct rq *rq = rq_of(cfs_rq);
+
+		raw_spin_lock_irq(&rq->lock);
+		cfs_rq->runtime_enabled = quota != RUNTIME_INF;
+		cfs_rq->runtime_remaining = 0;
+		raw_spin_unlock_irq(&rq->lock);
+	}
+	mutex_unlock(&mutex);
+
+	return 0;
+}
+
+int tg_set_cfs_quota(struct task_group *tg, long cfs_quota_us)
+{
+	u64 quota, period;
+
+	period = ktime_to_ns(tg_cfs_bandwidth(tg)->period);
+	if (cfs_quota_us < 0)
+		quota = RUNTIME_INF;
+	else
+		quota = (u64)cfs_quota_us * NSEC_PER_USEC;
+
+	return tg_set_cfs_bandwidth(tg, period, quota);
+}
+
+long tg_get_cfs_quota(struct task_group *tg)
+{
+	u64 quota_us;
+
+	if (tg_cfs_bandwidth(tg)->quota == RUNTIME_INF)
+		return -1;
+
+	quota_us = tg_cfs_bandwidth(tg)->quota;
+	do_div(quota_us, NSEC_PER_USEC);
+
+	return quota_us;
+}
+
+int tg_set_cfs_period(struct task_group *tg, long cfs_period_us)
+{
+	u64 quota, period;
+
+	period = (u64)cfs_period_us * NSEC_PER_USEC;
+	quota = tg_cfs_bandwidth(tg)->quota;
+
+	if (period <= 0)
+		return -EINVAL;
+
+	return tg_set_cfs_bandwidth(tg, period, quota);
+}
+
+long tg_get_cfs_period(struct task_group *tg)
+{
+	u64 cfs_period_us;
+
+	cfs_period_us = ktime_to_ns(tg_cfs_bandwidth(tg)->period);
+	do_div(cfs_period_us, NSEC_PER_USEC);
+
+	return cfs_period_us;
+}
+
+static s64 cpu_cfs_quota_read_s64(struct cgroup *cgrp, struct cftype *cft)
+{
+	return tg_get_cfs_quota(cgroup_tg(cgrp));
+}
+
+static int cpu_cfs_quota_write_s64(struct cgroup *cgrp, struct cftype *cftype,
+				s64 cfs_quota_us)
+{
+	return tg_set_cfs_quota(cgroup_tg(cgrp), cfs_quota_us);
+}
+
+static u64 cpu_cfs_period_read_u64(struct cgroup *cgrp, struct cftype *cft)
+{
+	return tg_get_cfs_period(cgroup_tg(cgrp));
+}
+
+static int cpu_cfs_period_write_u64(struct cgroup *cgrp, struct cftype *cftype,
+				u64 cfs_period_us)
+{
+	return tg_set_cfs_period(cgroup_tg(cgrp), cfs_period_us);
+}
+
+#endif /* CONFIG_CFS_BANDWIDTH */
 #endif /* CONFIG_FAIR_GROUP_SCHED */
 
 #ifdef CONFIG_RT_GROUP_SCHED
@@ -9041,6 +9218,18 @@ static struct cftype cpu_files[] = {
 		.write_u64 = cpu_shares_write_u64,
 	},
 #endif
+#ifdef CONFIG_CFS_BANDWIDTH
+	{
+		.name = "cfs_quota_us",
+		.read_s64 = cpu_cfs_quota_read_s64,
+		.write_s64 = cpu_cfs_quota_write_s64,
+	},
+	{
+		.name = "cfs_period_us",
+		.read_u64 = cpu_cfs_period_read_u64,
+		.write_u64 = cpu_cfs_period_write_u64,
+	},
+#endif
 #ifdef CONFIG_RT_GROUP_SCHED
 	{
 		.name = "rt_runtime_us",
@@ -9350,4 +9539,3 @@ struct cgroup_subsys cpuacct_subsys = {
 	.subsys_id = cpuacct_subsys_id,
 };
 #endif	/* CONFIG_CGROUP_CPUACCT */
-
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index f86b0cb5eb29..f24f4171019d 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -1234,6 +1234,22 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
 		check_preempt_tick(cfs_rq, curr);
 }
 
+
+/**************************************************
+ * CFS bandwidth control machinery
+ */
+
+#ifdef CONFIG_CFS_BANDWIDTH
+/*
+ * default period for cfs group bandwidth.
+ * default: 0.1s, units: nanoseconds
+ */
+static inline u64 default_cfs_period(void)
+{
+	return 100000000ULL;
+}
+#endif
+
 /**************************************************
  * CFS operations on tasks:
  */
-- 
cgit v1.2.2


From a790de99599a29ad3f18667530cf4b9f4b7e3234 Mon Sep 17 00:00:00 2001
From: Paul Turner <pjt@google.com>
Date: Thu, 21 Jul 2011 09:43:29 -0700
Subject: sched: Validate CFS quota hierarchies

Add constraints validation for CFS bandwidth hierarchies.

Validate that:
   max(child bandwidth) <= parent_bandwidth

In a quota limited hierarchy, an unconstrained entity
(e.g. bandwidth==RUNTIME_INF) inherits the bandwidth of its parent.

This constraint is chosen over sum(child_bandwidth) as notion of over-commit is
valuable within SCHED_OTHER.  Some basic code from the RT case is re-factored
for reuse.

Signed-off-by: Paul Turner <pjt@google.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Link: http://lkml.kernel.org/r/20110721184757.083774572@google.com
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c | 112 +++++++++++++++++++++++++++++++++++++++++++++++++--------
 1 file changed, 98 insertions(+), 14 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index f08cb23be96c..ea6850d93b2a 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -252,6 +252,7 @@ struct cfs_bandwidth {
 	raw_spinlock_t lock;
 	ktime_t period;
 	u64 quota;
+	s64 hierarchal_quota;
 #endif
 };
 
@@ -1518,7 +1519,8 @@ static inline void dec_cpu_load(struct rq *rq, unsigned long load)
 	update_load_sub(&rq->load, load);
 }
 
-#if (defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED)) || defined(CONFIG_RT_GROUP_SCHED)
+#if defined(CONFIG_RT_GROUP_SCHED) || (defined(CONFIG_FAIR_GROUP_SCHED) && \
+			(defined(CONFIG_SMP) || defined(CONFIG_CFS_BANDWIDTH)))
 typedef int (*tg_visitor)(struct task_group *, void *);
 
 /*
@@ -8708,12 +8710,7 @@ unsigned long sched_group_shares(struct task_group *tg)
 }
 #endif
 
-#ifdef CONFIG_RT_GROUP_SCHED
-/*
- * Ensure that the real time constraints are schedulable.
- */
-static DEFINE_MUTEX(rt_constraints_mutex);
-
+#if defined(CONFIG_RT_GROUP_SCHED) || defined(CONFIG_CFS_BANDWIDTH)
 static unsigned long to_ratio(u64 period, u64 runtime)
 {
 	if (runtime == RUNTIME_INF)
@@ -8721,6 +8718,13 @@ static unsigned long to_ratio(u64 period, u64 runtime)
 
 	return div64_u64(runtime << 20, period);
 }
+#endif
+
+#ifdef CONFIG_RT_GROUP_SCHED
+/*
+ * Ensure that the real time constraints are schedulable.
+ */
+static DEFINE_MUTEX(rt_constraints_mutex);
 
 /* Must be called with tasklist_lock held */
 static inline int tg_has_rt_tasks(struct task_group *tg)
@@ -8741,7 +8745,7 @@ struct rt_schedulable_data {
 	u64 rt_runtime;
 };
 
-static int tg_schedulable(struct task_group *tg, void *data)
+static int tg_rt_schedulable(struct task_group *tg, void *data)
 {
 	struct rt_schedulable_data *d = data;
 	struct task_group *child;
@@ -8805,7 +8809,7 @@ static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime)
 		.rt_runtime = runtime,
 	};
 
-	return walk_tg_tree(tg_schedulable, tg_nop, &data);
+	return walk_tg_tree(tg_rt_schedulable, tg_nop, &data);
 }
 
 static int tg_set_rt_bandwidth(struct task_group *tg,
@@ -9064,14 +9068,17 @@ static u64 cpu_shares_read_u64(struct cgroup *cgrp, struct cftype *cft)
 }
 
 #ifdef CONFIG_CFS_BANDWIDTH
+static DEFINE_MUTEX(cfs_constraints_mutex);
+
 const u64 max_cfs_quota_period = 1 * NSEC_PER_SEC; /* 1s */
 const u64 min_cfs_quota_period = 1 * NSEC_PER_MSEC; /* 1ms */
 
+static int __cfs_schedulable(struct task_group *tg, u64 period, u64 runtime);
+
 static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota)
 {
-	int i;
+	int i, ret = 0;
 	struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg);
-	static DEFINE_MUTEX(mutex);
 
 	if (tg == &root_task_group)
 		return -EINVAL;
@@ -9092,7 +9099,11 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota)
 	if (period > max_cfs_quota_period)
 		return -EINVAL;
 
-	mutex_lock(&mutex);
+	mutex_lock(&cfs_constraints_mutex);
+	ret = __cfs_schedulable(tg, period, quota);
+	if (ret)
+		goto out_unlock;
+
 	raw_spin_lock_irq(&cfs_b->lock);
 	cfs_b->period = ns_to_ktime(period);
 	cfs_b->quota = quota;
@@ -9107,9 +9118,10 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota)
 		cfs_rq->runtime_remaining = 0;
 		raw_spin_unlock_irq(&rq->lock);
 	}
-	mutex_unlock(&mutex);
+out_unlock:
+	mutex_unlock(&cfs_constraints_mutex);
 
-	return 0;
+	return ret;
 }
 
 int tg_set_cfs_quota(struct task_group *tg, long cfs_quota_us)
@@ -9183,6 +9195,78 @@ static int cpu_cfs_period_write_u64(struct cgroup *cgrp, struct cftype *cftype,
 	return tg_set_cfs_period(cgroup_tg(cgrp), cfs_period_us);
 }
 
+struct cfs_schedulable_data {
+	struct task_group *tg;
+	u64 period, quota;
+};
+
+/*
+ * normalize group quota/period to be quota/max_period
+ * note: units are usecs
+ */
+static u64 normalize_cfs_quota(struct task_group *tg,
+			       struct cfs_schedulable_data *d)
+{
+	u64 quota, period;
+
+	if (tg == d->tg) {
+		period = d->period;
+		quota = d->quota;
+	} else {
+		period = tg_get_cfs_period(tg);
+		quota = tg_get_cfs_quota(tg);
+	}
+
+	/* note: these should typically be equivalent */
+	if (quota == RUNTIME_INF || quota == -1)
+		return RUNTIME_INF;
+
+	return to_ratio(period, quota);
+}
+
+static int tg_cfs_schedulable_down(struct task_group *tg, void *data)
+{
+	struct cfs_schedulable_data *d = data;
+	struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg);
+	s64 quota = 0, parent_quota = -1;
+
+	if (!tg->parent) {
+		quota = RUNTIME_INF;
+	} else {
+		struct cfs_bandwidth *parent_b = tg_cfs_bandwidth(tg->parent);
+
+		quota = normalize_cfs_quota(tg, d);
+		parent_quota = parent_b->hierarchal_quota;
+
+		/*
+		 * ensure max(child_quota) <= parent_quota, inherit when no
+		 * limit is set
+		 */
+		if (quota == RUNTIME_INF)
+			quota = parent_quota;
+		else if (parent_quota != RUNTIME_INF && quota > parent_quota)
+			return -EINVAL;
+	}
+	cfs_b->hierarchal_quota = quota;
+
+	return 0;
+}
+
+static int __cfs_schedulable(struct task_group *tg, u64 period, u64 quota)
+{
+	struct cfs_schedulable_data data = {
+		.tg = tg,
+		.period = period,
+		.quota = quota,
+	};
+
+	if (quota != RUNTIME_INF) {
+		do_div(data.period, NSEC_PER_USEC);
+		do_div(data.quota, NSEC_PER_USEC);
+	}
+
+	return walk_tg_tree(tg_cfs_schedulable_down, tg_nop, &data);
+}
 #endif /* CONFIG_CFS_BANDWIDTH */
 #endif /* CONFIG_FAIR_GROUP_SCHED */
 
-- 
cgit v1.2.2


From ec12cb7f31e28854efae7dd6f9544e0a66379040 Mon Sep 17 00:00:00 2001
From: Paul Turner <pjt@google.com>
Date: Thu, 21 Jul 2011 09:43:30 -0700
Subject: sched: Accumulate per-cfs_rq cpu usage and charge against bandwidth

Account bandwidth usage on the cfs_rq level versus the task_groups to which
they belong.  Whether we are tracking bandwidth on a given cfs_rq is maintained
under cfs_rq->runtime_enabled.

cfs_rq's which belong to a bandwidth constrained task_group have their runtime
accounted via the update_curr() path, which withdraws bandwidth from the global
pool as desired.  Updates involving the global pool are currently protected
under cfs_bandwidth->lock, local runtime is protected by rq->lock.

This patch only assigns and tracks quota, no action is taken in the case that
cfs_rq->runtime_used exceeds cfs_rq->runtime_assigned.

Signed-off-by: Paul Turner <pjt@google.com>
Signed-off-by: Nikhil Rao <ncrao@google.com>
Signed-off-by: Bharata B Rao <bharata@linux.vnet.ibm.com>
Reviewed-by: Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Link: http://lkml.kernel.org/r/20110721184757.179386821@google.com
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c      |  4 ++-
 kernel/sched_fair.c | 79 +++++++++++++++++++++++++++++++++++++++++++++++++++--
 kernel/sysctl.c     | 10 +++++++
 3 files changed, 90 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index ea6850d93b2a..35561c63a490 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -251,7 +251,7 @@ struct cfs_bandwidth {
 #ifdef CONFIG_CFS_BANDWIDTH
 	raw_spinlock_t lock;
 	ktime_t period;
-	u64 quota;
+	u64 quota, runtime;
 	s64 hierarchal_quota;
 #endif
 };
@@ -407,6 +407,7 @@ static inline u64 default_cfs_period(void);
 static void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
 {
 	raw_spin_lock_init(&cfs_b->lock);
+	cfs_b->runtime = 0;
 	cfs_b->quota = RUNTIME_INF;
 	cfs_b->period = ns_to_ktime(default_cfs_period());
 }
@@ -9107,6 +9108,7 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota)
 	raw_spin_lock_irq(&cfs_b->lock);
 	cfs_b->period = ns_to_ktime(period);
 	cfs_b->quota = quota;
+	cfs_b->runtime = quota;
 	raw_spin_unlock_irq(&cfs_b->lock);
 
 	for_each_possible_cpu(i) {
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index f24f4171019d..9502aa899f73 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -89,6 +89,20 @@ const_debug unsigned int sysctl_sched_migration_cost = 500000UL;
  */
 unsigned int __read_mostly sysctl_sched_shares_window = 10000000UL;
 
+#ifdef CONFIG_CFS_BANDWIDTH
+/*
+ * Amount of runtime to allocate from global (tg) to local (per-cfs_rq) pool
+ * each time a cfs_rq requests quota.
+ *
+ * Note: in the case that the slice exceeds the runtime remaining (either due
+ * to consumption or the quota being specified to be smaller than the slice)
+ * we will always only issue the remaining available time.
+ *
+ * default: 5 msec, units: microseconds
+  */
+unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL;
+#endif
+
 static const struct sched_class fair_sched_class;
 
 /**************************************************************
@@ -292,6 +306,8 @@ find_matching_se(struct sched_entity **se, struct sched_entity **pse)
 
 #endif	/* CONFIG_FAIR_GROUP_SCHED */
 
+static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
+				   unsigned long delta_exec);
 
 /**************************************************************
  * Scheduling class tree data structure manipulation methods:
@@ -583,6 +599,8 @@ static void update_curr(struct cfs_rq *cfs_rq)
 		cpuacct_charge(curtask, delta_exec);
 		account_group_exec_runtime(curtask, delta_exec);
 	}
+
+	account_cfs_rq_runtime(cfs_rq, delta_exec);
 }
 
 static inline void
@@ -1248,6 +1266,58 @@ static inline u64 default_cfs_period(void)
 {
 	return 100000000ULL;
 }
+
+static inline u64 sched_cfs_bandwidth_slice(void)
+{
+	return (u64)sysctl_sched_cfs_bandwidth_slice * NSEC_PER_USEC;
+}
+
+static void assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
+{
+	struct task_group *tg = cfs_rq->tg;
+	struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg);
+	u64 amount = 0, min_amount;
+
+	/* note: this is a positive sum as runtime_remaining <= 0 */
+	min_amount = sched_cfs_bandwidth_slice() - cfs_rq->runtime_remaining;
+
+	raw_spin_lock(&cfs_b->lock);
+	if (cfs_b->quota == RUNTIME_INF)
+		amount = min_amount;
+	else if (cfs_b->runtime > 0) {
+		amount = min(cfs_b->runtime, min_amount);
+		cfs_b->runtime -= amount;
+	}
+	raw_spin_unlock(&cfs_b->lock);
+
+	cfs_rq->runtime_remaining += amount;
+}
+
+static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
+				     unsigned long delta_exec)
+{
+	if (!cfs_rq->runtime_enabled)
+		return;
+
+	cfs_rq->runtime_remaining -= delta_exec;
+	if (cfs_rq->runtime_remaining > 0)
+		return;
+
+	assign_cfs_rq_runtime(cfs_rq);
+}
+
+static __always_inline void account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
+						   unsigned long delta_exec)
+{
+	if (!cfs_rq->runtime_enabled)
+		return;
+
+	__account_cfs_rq_runtime(cfs_rq, delta_exec);
+}
+
+#else
+static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
+				     unsigned long delta_exec) {}
 #endif
 
 /**************************************************
@@ -4266,8 +4336,13 @@ static void set_curr_task_fair(struct rq *rq)
 {
 	struct sched_entity *se = &rq->curr->se;
 
-	for_each_sched_entity(se)
-		set_next_entity(cfs_rq_of(se), se);
+	for_each_sched_entity(se) {
+		struct cfs_rq *cfs_rq = cfs_rq_of(se);
+
+		set_next_entity(cfs_rq, se);
+		/* ensure bandwidth has been allocated on our new cfs_rq */
+		account_cfs_rq_runtime(cfs_rq, 0);
+	}
 }
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 11d65b531e50..2d2ecdcc8cdb 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -379,6 +379,16 @@ static struct ctl_table kern_table[] = {
 		.extra2		= &one,
 	},
 #endif
+#ifdef CONFIG_CFS_BANDWIDTH
+	{
+		.procname	= "sched_cfs_bandwidth_slice_us",
+		.data		= &sysctl_sched_cfs_bandwidth_slice,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= &one,
+	},
+#endif
 #ifdef CONFIG_PROVE_LOCKING
 	{
 		.procname	= "prove_locking",
-- 
cgit v1.2.2


From 58088ad0152ba4b7997388c93d0ca208ec1ece75 Mon Sep 17 00:00:00 2001
From: Paul Turner <pjt@google.com>
Date: Thu, 21 Jul 2011 09:43:31 -0700
Subject: sched: Add a timer to handle CFS bandwidth refresh

This patch adds a per-task_group timer which handles the refresh of the global
CFS bandwidth pool.

Since the RT pool is using a similar timer there's some small refactoring to
share this support.

Signed-off-by: Paul Turner <pjt@google.com>
Reviewed-by: Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Link: http://lkml.kernel.org/r/20110721184757.277271273@google.com
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c      | 107 +++++++++++++++++++++++++++++++++++++++++-----------
 kernel/sched_fair.c |  40 ++++++++++++++++++--
 2 files changed, 123 insertions(+), 24 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index 35561c63a490..34bf8e6db9af 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -196,10 +196,28 @@ static inline int rt_bandwidth_enabled(void)
 	return sysctl_sched_rt_runtime >= 0;
 }
 
-static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
+static void start_bandwidth_timer(struct hrtimer *period_timer, ktime_t period)
 {
-	ktime_t now;
+	unsigned long delta;
+	ktime_t soft, hard, now;
 
+	for (;;) {
+		if (hrtimer_active(period_timer))
+			break;
+
+		now = hrtimer_cb_get_time(period_timer);
+		hrtimer_forward(period_timer, now, period);
+
+		soft = hrtimer_get_softexpires(period_timer);
+		hard = hrtimer_get_expires(period_timer);
+		delta = ktime_to_ns(ktime_sub(hard, soft));
+		__hrtimer_start_range_ns(period_timer, soft, delta,
+					 HRTIMER_MODE_ABS_PINNED, 0);
+	}
+}
+
+static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
+{
 	if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF)
 		return;
 
@@ -207,22 +225,7 @@ static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
 		return;
 
 	raw_spin_lock(&rt_b->rt_runtime_lock);
-	for (;;) {
-		unsigned long delta;
-		ktime_t soft, hard;
-
-		if (hrtimer_active(&rt_b->rt_period_timer))
-			break;
-
-		now = hrtimer_cb_get_time(&rt_b->rt_period_timer);
-		hrtimer_forward(&rt_b->rt_period_timer, now, rt_b->rt_period);
-
-		soft = hrtimer_get_softexpires(&rt_b->rt_period_timer);
-		hard = hrtimer_get_expires(&rt_b->rt_period_timer);
-		delta = ktime_to_ns(ktime_sub(hard, soft));
-		__hrtimer_start_range_ns(&rt_b->rt_period_timer, soft, delta,
-				HRTIMER_MODE_ABS_PINNED, 0);
-	}
+	start_bandwidth_timer(&rt_b->rt_period_timer, rt_b->rt_period);
 	raw_spin_unlock(&rt_b->rt_runtime_lock);
 }
 
@@ -253,6 +256,9 @@ struct cfs_bandwidth {
 	ktime_t period;
 	u64 quota, runtime;
 	s64 hierarchal_quota;
+
+	int idle, timer_active;
+	struct hrtimer period_timer;
 #endif
 };
 
@@ -403,6 +409,28 @@ static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
 }
 
 static inline u64 default_cfs_period(void);
+static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun);
+
+static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer)
+{
+	struct cfs_bandwidth *cfs_b =
+		container_of(timer, struct cfs_bandwidth, period_timer);
+	ktime_t now;
+	int overrun;
+	int idle = 0;
+
+	for (;;) {
+		now = hrtimer_cb_get_time(timer);
+		overrun = hrtimer_forward(timer, now, cfs_b->period);
+
+		if (!overrun)
+			break;
+
+		idle = do_sched_cfs_period_timer(cfs_b, overrun);
+	}
+
+	return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
+}
 
 static void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
 {
@@ -410,6 +438,9 @@ static void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
 	cfs_b->runtime = 0;
 	cfs_b->quota = RUNTIME_INF;
 	cfs_b->period = ns_to_ktime(default_cfs_period());
+
+	hrtimer_init(&cfs_b->period_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+	cfs_b->period_timer.function = sched_cfs_period_timer;
 }
 
 static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq)
@@ -417,8 +448,34 @@ static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq)
 	cfs_rq->runtime_enabled = 0;
 }
 
+/* requires cfs_b->lock, may release to reprogram timer */
+static void __start_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
+{
+	/*
+	 * The timer may be active because we're trying to set a new bandwidth
+	 * period or because we're racing with the tear-down path
+	 * (timer_active==0 becomes visible before the hrtimer call-back
+	 * terminates).  In either case we ensure that it's re-programmed
+	 */
+	while (unlikely(hrtimer_active(&cfs_b->period_timer))) {
+		raw_spin_unlock(&cfs_b->lock);
+		/* ensure cfs_b->lock is available while we wait */
+		hrtimer_cancel(&cfs_b->period_timer);
+
+		raw_spin_lock(&cfs_b->lock);
+		/* if someone else restarted the timer then we're done */
+		if (cfs_b->timer_active)
+			return;
+	}
+
+	cfs_b->timer_active = 1;
+	start_bandwidth_timer(&cfs_b->period_timer, cfs_b->period);
+}
+
 static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
-{}
+{
+	hrtimer_cancel(&cfs_b->period_timer);
+}
 #else
 static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
 static void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
@@ -9078,7 +9135,7 @@ static int __cfs_schedulable(struct task_group *tg, u64 period, u64 runtime);
 
 static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota)
 {
-	int i, ret = 0;
+	int i, ret = 0, runtime_enabled;
 	struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg);
 
 	if (tg == &root_task_group)
@@ -9105,10 +9162,18 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota)
 	if (ret)
 		goto out_unlock;
 
+	runtime_enabled = quota != RUNTIME_INF;
 	raw_spin_lock_irq(&cfs_b->lock);
 	cfs_b->period = ns_to_ktime(period);
 	cfs_b->quota = quota;
 	cfs_b->runtime = quota;
+
+	/* restart the period timer (if active) to handle new period expiry */
+	if (runtime_enabled && cfs_b->timer_active) {
+		/* force a reprogram */
+		cfs_b->timer_active = 0;
+		__start_cfs_bandwidth(cfs_b);
+	}
 	raw_spin_unlock_irq(&cfs_b->lock);
 
 	for_each_possible_cpu(i) {
@@ -9116,7 +9181,7 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota)
 		struct rq *rq = rq_of(cfs_rq);
 
 		raw_spin_lock_irq(&rq->lock);
-		cfs_rq->runtime_enabled = quota != RUNTIME_INF;
+		cfs_rq->runtime_enabled = runtime_enabled;
 		cfs_rq->runtime_remaining = 0;
 		raw_spin_unlock_irq(&rq->lock);
 	}
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 9502aa899f73..af73a8a85eef 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -1284,9 +1284,16 @@ static void assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
 	raw_spin_lock(&cfs_b->lock);
 	if (cfs_b->quota == RUNTIME_INF)
 		amount = min_amount;
-	else if (cfs_b->runtime > 0) {
-		amount = min(cfs_b->runtime, min_amount);
-		cfs_b->runtime -= amount;
+	else {
+		/* ensure bandwidth timer remains active under consumption */
+		if (!cfs_b->timer_active)
+			__start_cfs_bandwidth(cfs_b);
+
+		if (cfs_b->runtime > 0) {
+			amount = min(cfs_b->runtime, min_amount);
+			cfs_b->runtime -= amount;
+			cfs_b->idle = 0;
+		}
 	}
 	raw_spin_unlock(&cfs_b->lock);
 
@@ -1315,6 +1322,33 @@ static __always_inline void account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
 	__account_cfs_rq_runtime(cfs_rq, delta_exec);
 }
 
+/*
+ * Responsible for refilling a task_group's bandwidth and unthrottling its
+ * cfs_rqs as appropriate. If there has been no activity within the last
+ * period the timer is deactivated until scheduling resumes; cfs_b->idle is
+ * used to track this state.
+ */
+static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun)
+{
+	int idle = 1;
+
+	raw_spin_lock(&cfs_b->lock);
+	/* no need to continue the timer with no bandwidth constraint */
+	if (cfs_b->quota == RUNTIME_INF)
+		goto out_unlock;
+
+	idle = cfs_b->idle;
+	cfs_b->runtime = cfs_b->quota;
+
+	/* mark as potentially idle for the upcoming period */
+	cfs_b->idle = 1;
+out_unlock:
+	if (idle)
+		cfs_b->timer_active = 0;
+	raw_spin_unlock(&cfs_b->lock);
+
+	return idle;
+}
 #else
 static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
 				     unsigned long delta_exec) {}
-- 
cgit v1.2.2


From a9cf55b2861057a213e610da2fec52125439a11d Mon Sep 17 00:00:00 2001
From: Paul Turner <pjt@google.com>
Date: Thu, 21 Jul 2011 09:43:32 -0700
Subject: sched: Expire invalid runtime

Since quota is managed using a global state but consumed on a per-cpu basis
we need to ensure that our per-cpu state is appropriately synchronized.
Most importantly, runtime that is state (from a previous period) should not be
locally consumable.

We take advantage of existing sched_clock synchronization about the jiffy to
efficiently detect whether we have (globally) crossed a quota boundary above.

One catch is that the direction of spread on sched_clock is undefined,
specifically, we don't know whether our local clock is behind or ahead
of the one responsible for the current expiration time.

Fortunately we can differentiate these by considering whether the
global deadline has advanced.  If it has not, then we assume our clock to be
"fast" and advance our local expiration; otherwise, we know the deadline has
truly passed and we expire our local runtime.

Signed-off-by: Paul Turner <pjt@google.com>
Reviewed-by: Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Link: http://lkml.kernel.org/r/20110721184757.379275352@google.com
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c      |  4 ++-
 kernel/sched_fair.c | 90 +++++++++++++++++++++++++++++++++++++++++++++++------
 2 files changed, 84 insertions(+), 10 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index 34bf8e6db9af..a2d55144bd9c 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -256,6 +256,7 @@ struct cfs_bandwidth {
 	ktime_t period;
 	u64 quota, runtime;
 	s64 hierarchal_quota;
+	u64 runtime_expires;
 
 	int idle, timer_active;
 	struct hrtimer period_timer;
@@ -396,6 +397,7 @@ struct cfs_rq {
 #endif
 #ifdef CONFIG_CFS_BANDWIDTH
 	int runtime_enabled;
+	u64 runtime_expires;
 	s64 runtime_remaining;
 #endif
 #endif
@@ -9166,8 +9168,8 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota)
 	raw_spin_lock_irq(&cfs_b->lock);
 	cfs_b->period = ns_to_ktime(period);
 	cfs_b->quota = quota;
-	cfs_b->runtime = quota;
 
+	__refill_cfs_bandwidth_runtime(cfs_b);
 	/* restart the period timer (if active) to handle new period expiry */
 	if (runtime_enabled && cfs_b->timer_active) {
 		/* force a reprogram */
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index af73a8a85eef..9d1adbd0b615 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -1272,11 +1272,30 @@ static inline u64 sched_cfs_bandwidth_slice(void)
 	return (u64)sysctl_sched_cfs_bandwidth_slice * NSEC_PER_USEC;
 }
 
+/*
+ * Replenish runtime according to assigned quota and update expiration time.
+ * We use sched_clock_cpu directly instead of rq->clock to avoid adding
+ * additional synchronization around rq->lock.
+ *
+ * requires cfs_b->lock
+ */
+static void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b)
+{
+	u64 now;
+
+	if (cfs_b->quota == RUNTIME_INF)
+		return;
+
+	now = sched_clock_cpu(smp_processor_id());
+	cfs_b->runtime = cfs_b->quota;
+	cfs_b->runtime_expires = now + ktime_to_ns(cfs_b->period);
+}
+
 static void assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
 {
 	struct task_group *tg = cfs_rq->tg;
 	struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg);
-	u64 amount = 0, min_amount;
+	u64 amount = 0, min_amount, expires;
 
 	/* note: this is a positive sum as runtime_remaining <= 0 */
 	min_amount = sched_cfs_bandwidth_slice() - cfs_rq->runtime_remaining;
@@ -1285,9 +1304,16 @@ static void assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
 	if (cfs_b->quota == RUNTIME_INF)
 		amount = min_amount;
 	else {
-		/* ensure bandwidth timer remains active under consumption */
-		if (!cfs_b->timer_active)
+		/*
+		 * If the bandwidth pool has become inactive, then at least one
+		 * period must have elapsed since the last consumption.
+		 * Refresh the global state and ensure bandwidth timer becomes
+		 * active.
+		 */
+		if (!cfs_b->timer_active) {
+			__refill_cfs_bandwidth_runtime(cfs_b);
 			__start_cfs_bandwidth(cfs_b);
+		}
 
 		if (cfs_b->runtime > 0) {
 			amount = min(cfs_b->runtime, min_amount);
@@ -1295,19 +1321,61 @@ static void assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
 			cfs_b->idle = 0;
 		}
 	}
+	expires = cfs_b->runtime_expires;
 	raw_spin_unlock(&cfs_b->lock);
 
 	cfs_rq->runtime_remaining += amount;
+	/*
+	 * we may have advanced our local expiration to account for allowed
+	 * spread between our sched_clock and the one on which runtime was
+	 * issued.
+	 */
+	if ((s64)(expires - cfs_rq->runtime_expires) > 0)
+		cfs_rq->runtime_expires = expires;
 }
 
-static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
-				     unsigned long delta_exec)
+/*
+ * Note: This depends on the synchronization provided by sched_clock and the
+ * fact that rq->clock snapshots this value.
+ */
+static void expire_cfs_rq_runtime(struct cfs_rq *cfs_rq)
 {
-	if (!cfs_rq->runtime_enabled)
+	struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
+	struct rq *rq = rq_of(cfs_rq);
+
+	/* if the deadline is ahead of our clock, nothing to do */
+	if (likely((s64)(rq->clock - cfs_rq->runtime_expires) < 0))
+		return;
+
+	if (cfs_rq->runtime_remaining < 0)
 		return;
 
+	/*
+	 * If the local deadline has passed we have to consider the
+	 * possibility that our sched_clock is 'fast' and the global deadline
+	 * has not truly expired.
+	 *
+	 * Fortunately we can check determine whether this the case by checking
+	 * whether the global deadline has advanced.
+	 */
+
+	if ((s64)(cfs_rq->runtime_expires - cfs_b->runtime_expires) >= 0) {
+		/* extend local deadline, drift is bounded above by 2 ticks */
+		cfs_rq->runtime_expires += TICK_NSEC;
+	} else {
+		/* global deadline is ahead, expiration has passed */
+		cfs_rq->runtime_remaining = 0;
+	}
+}
+
+static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
+				     unsigned long delta_exec)
+{
+	/* dock delta_exec before expiring quota (as it could span periods) */
 	cfs_rq->runtime_remaining -= delta_exec;
-	if (cfs_rq->runtime_remaining > 0)
+	expire_cfs_rq_runtime(cfs_rq);
+
+	if (likely(cfs_rq->runtime_remaining > 0))
 		return;
 
 	assign_cfs_rq_runtime(cfs_rq);
@@ -1338,7 +1406,12 @@ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun)
 		goto out_unlock;
 
 	idle = cfs_b->idle;
-	cfs_b->runtime = cfs_b->quota;
+	/* if we're going inactive then everything else can be deferred */
+	if (idle)
+		goto out_unlock;
+
+	__refill_cfs_bandwidth_runtime(cfs_b);
+
 
 	/* mark as potentially idle for the upcoming period */
 	cfs_b->idle = 1;
@@ -1557,7 +1630,6 @@ static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
 
 	return wl;
 }
-
 #else
 
 static inline unsigned long effective_load(struct task_group *tg, int cpu,
-- 
cgit v1.2.2


From 85dac906bec3bb41bfaa7ccaa65c4706de5cfdf8 Mon Sep 17 00:00:00 2001
From: Paul Turner <pjt@google.com>
Date: Thu, 21 Jul 2011 09:43:33 -0700
Subject: sched: Add support for throttling group entities

Now that consumption is tracked (via update_curr()) we add support to throttle
group entities (and their corresponding cfs_rqs) in the case where this is no
run-time remaining.

Throttled entities are dequeued to prevent scheduling, additionally we mark
them as throttled (using cfs_rq->throttled) to prevent them from becoming
re-enqueued until they are unthrottled.  A list of a task_group's throttled
entities are maintained on the cfs_bandwidth structure.

Note: While the machinery for throttling is added in this patch the act of
throttling an entity exceeding its bandwidth is deferred until later within
the series.

Signed-off-by: Paul Turner <pjt@google.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Link: http://lkml.kernel.org/r/20110721184757.480608533@google.com
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c      |  7 +++++
 kernel/sched_fair.c | 89 ++++++++++++++++++++++++++++++++++++++++++++++++++---
 2 files changed, 92 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index a2d55144bd9c..044260a9418d 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -260,6 +260,8 @@ struct cfs_bandwidth {
 
 	int idle, timer_active;
 	struct hrtimer period_timer;
+	struct list_head throttled_cfs_rq;
+
 #endif
 };
 
@@ -399,6 +401,9 @@ struct cfs_rq {
 	int runtime_enabled;
 	u64 runtime_expires;
 	s64 runtime_remaining;
+
+	int throttled;
+	struct list_head throttled_list;
 #endif
 #endif
 };
@@ -441,6 +446,7 @@ static void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
 	cfs_b->quota = RUNTIME_INF;
 	cfs_b->period = ns_to_ktime(default_cfs_period());
 
+	INIT_LIST_HEAD(&cfs_b->throttled_cfs_rq);
 	hrtimer_init(&cfs_b->period_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
 	cfs_b->period_timer.function = sched_cfs_period_timer;
 }
@@ -448,6 +454,7 @@ static void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
 static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq)
 {
 	cfs_rq->runtime_enabled = 0;
+	INIT_LIST_HEAD(&cfs_rq->throttled_list);
 }
 
 /* requires cfs_b->lock, may release to reprogram timer */
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 9d1adbd0b615..72c9d4ed5991 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -1291,7 +1291,8 @@ static void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b)
 	cfs_b->runtime_expires = now + ktime_to_ns(cfs_b->period);
 }
 
-static void assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
+/* returns 0 on failure to allocate runtime */
+static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
 {
 	struct task_group *tg = cfs_rq->tg;
 	struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg);
@@ -1332,6 +1333,8 @@ static void assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
 	 */
 	if ((s64)(expires - cfs_rq->runtime_expires) > 0)
 		cfs_rq->runtime_expires = expires;
+
+	return cfs_rq->runtime_remaining > 0;
 }
 
 /*
@@ -1378,7 +1381,12 @@ static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
 	if (likely(cfs_rq->runtime_remaining > 0))
 		return;
 
-	assign_cfs_rq_runtime(cfs_rq);
+	/*
+	 * if we're unable to extend our runtime we resched so that the active
+	 * hierarchy can be throttled
+	 */
+	if (!assign_cfs_rq_runtime(cfs_rq) && likely(cfs_rq->curr))
+		resched_task(rq_of(cfs_rq)->curr);
 }
 
 static __always_inline void account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
@@ -1390,6 +1398,47 @@ static __always_inline void account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
 	__account_cfs_rq_runtime(cfs_rq, delta_exec);
 }
 
+static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
+{
+	return cfs_rq->throttled;
+}
+
+static __used void throttle_cfs_rq(struct cfs_rq *cfs_rq)
+{
+	struct rq *rq = rq_of(cfs_rq);
+	struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
+	struct sched_entity *se;
+	long task_delta, dequeue = 1;
+
+	se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))];
+
+	/* account load preceding throttle */
+	update_cfs_load(cfs_rq, 0);
+
+	task_delta = cfs_rq->h_nr_running;
+	for_each_sched_entity(se) {
+		struct cfs_rq *qcfs_rq = cfs_rq_of(se);
+		/* throttled entity or throttle-on-deactivate */
+		if (!se->on_rq)
+			break;
+
+		if (dequeue)
+			dequeue_entity(qcfs_rq, se, DEQUEUE_SLEEP);
+		qcfs_rq->h_nr_running -= task_delta;
+
+		if (qcfs_rq->load.weight)
+			dequeue = 0;
+	}
+
+	if (!se)
+		rq->nr_running -= task_delta;
+
+	cfs_rq->throttled = 1;
+	raw_spin_lock(&cfs_b->lock);
+	list_add_tail_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);
+	raw_spin_unlock(&cfs_b->lock);
+}
+
 /*
  * Responsible for refilling a task_group's bandwidth and unthrottling its
  * cfs_rqs as appropriate. If there has been no activity within the last
@@ -1425,6 +1474,11 @@ out_unlock:
 #else
 static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
 				     unsigned long delta_exec) {}
+
+static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
+{
+	return 0;
+}
 #endif
 
 /**************************************************
@@ -1503,7 +1557,17 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 			break;
 		cfs_rq = cfs_rq_of(se);
 		enqueue_entity(cfs_rq, se, flags);
+
+		/*
+		 * end evaluation on encountering a throttled cfs_rq
+		 *
+		 * note: in the case of encountering a throttled cfs_rq we will
+		 * post the final h_nr_running increment below.
+		*/
+		if (cfs_rq_throttled(cfs_rq))
+			break;
 		cfs_rq->h_nr_running++;
+
 		flags = ENQUEUE_WAKEUP;
 	}
 
@@ -1511,11 +1575,15 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 		cfs_rq = cfs_rq_of(se);
 		cfs_rq->h_nr_running++;
 
+		if (cfs_rq_throttled(cfs_rq))
+			break;
+
 		update_cfs_load(cfs_rq, 0);
 		update_cfs_shares(cfs_rq);
 	}
 
-	inc_nr_running(rq);
+	if (!se)
+		inc_nr_running(rq);
 	hrtick_update(rq);
 }
 
@@ -1535,6 +1603,15 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 	for_each_sched_entity(se) {
 		cfs_rq = cfs_rq_of(se);
 		dequeue_entity(cfs_rq, se, flags);
+
+		/*
+		 * end evaluation on encountering a throttled cfs_rq
+		 *
+		 * note: in the case of encountering a throttled cfs_rq we will
+		 * post the final h_nr_running decrement below.
+		*/
+		if (cfs_rq_throttled(cfs_rq))
+			break;
 		cfs_rq->h_nr_running--;
 
 		/* Don't dequeue parent if it has other entities besides us */
@@ -1557,11 +1634,15 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 		cfs_rq = cfs_rq_of(se);
 		cfs_rq->h_nr_running--;
 
+		if (cfs_rq_throttled(cfs_rq))
+			break;
+
 		update_cfs_load(cfs_rq, 0);
 		update_cfs_shares(cfs_rq);
 	}
 
-	dec_nr_running(rq);
+	if (!se)
+		dec_nr_running(rq);
 	hrtick_update(rq);
 }
 
-- 
cgit v1.2.2


From 671fd9dabe5239ad218c7eb48b2b9edee50250e6 Mon Sep 17 00:00:00 2001
From: Paul Turner <pjt@google.com>
Date: Thu, 21 Jul 2011 09:43:34 -0700
Subject: sched: Add support for unthrottling group entities

At the start of each period we refresh the global bandwidth pool.  At this time
we must also unthrottle any cfs_rq entities who are now within bandwidth once
more (as quota permits).

Unthrottled entities have their corresponding cfs_rq->throttled flag cleared
and their entities re-enqueued.

Signed-off-by: Paul Turner <pjt@google.com>
Reviewed-by: Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Link: http://lkml.kernel.org/r/20110721184757.574628950@google.com
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c      |   3 ++
 kernel/sched_fair.c | 127 ++++++++++++++++++++++++++++++++++++++++++++++++++--
 2 files changed, 126 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index 044260a9418d..4bbabc2c4a77 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -9192,6 +9192,9 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota)
 		raw_spin_lock_irq(&rq->lock);
 		cfs_rq->runtime_enabled = runtime_enabled;
 		cfs_rq->runtime_remaining = 0;
+
+		if (cfs_rq_throttled(cfs_rq))
+			unthrottle_cfs_rq(cfs_rq);
 		raw_spin_unlock_irq(&rq->lock);
 	}
 out_unlock:
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 72c9d4ed5991..76411950ff3b 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -1439,6 +1439,84 @@ static __used void throttle_cfs_rq(struct cfs_rq *cfs_rq)
 	raw_spin_unlock(&cfs_b->lock);
 }
 
+static void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
+{
+	struct rq *rq = rq_of(cfs_rq);
+	struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
+	struct sched_entity *se;
+	int enqueue = 1;
+	long task_delta;
+
+	se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))];
+
+	cfs_rq->throttled = 0;
+	raw_spin_lock(&cfs_b->lock);
+	list_del_rcu(&cfs_rq->throttled_list);
+	raw_spin_unlock(&cfs_b->lock);
+
+	if (!cfs_rq->load.weight)
+		return;
+
+	task_delta = cfs_rq->h_nr_running;
+	for_each_sched_entity(se) {
+		if (se->on_rq)
+			enqueue = 0;
+
+		cfs_rq = cfs_rq_of(se);
+		if (enqueue)
+			enqueue_entity(cfs_rq, se, ENQUEUE_WAKEUP);
+		cfs_rq->h_nr_running += task_delta;
+
+		if (cfs_rq_throttled(cfs_rq))
+			break;
+	}
+
+	if (!se)
+		rq->nr_running += task_delta;
+
+	/* determine whether we need to wake up potentially idle cpu */
+	if (rq->curr == rq->idle && rq->cfs.nr_running)
+		resched_task(rq->curr);
+}
+
+static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b,
+		u64 remaining, u64 expires)
+{
+	struct cfs_rq *cfs_rq;
+	u64 runtime = remaining;
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(cfs_rq, &cfs_b->throttled_cfs_rq,
+				throttled_list) {
+		struct rq *rq = rq_of(cfs_rq);
+
+		raw_spin_lock(&rq->lock);
+		if (!cfs_rq_throttled(cfs_rq))
+			goto next;
+
+		runtime = -cfs_rq->runtime_remaining + 1;
+		if (runtime > remaining)
+			runtime = remaining;
+		remaining -= runtime;
+
+		cfs_rq->runtime_remaining += runtime;
+		cfs_rq->runtime_expires = expires;
+
+		/* we check whether we're throttled above */
+		if (cfs_rq->runtime_remaining > 0)
+			unthrottle_cfs_rq(cfs_rq);
+
+next:
+		raw_spin_unlock(&rq->lock);
+
+		if (!remaining)
+			break;
+	}
+	rcu_read_unlock();
+
+	return remaining;
+}
+
 /*
  * Responsible for refilling a task_group's bandwidth and unthrottling its
  * cfs_rqs as appropriate. If there has been no activity within the last
@@ -1447,23 +1525,64 @@ static __used void throttle_cfs_rq(struct cfs_rq *cfs_rq)
  */
 static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun)
 {
-	int idle = 1;
+	u64 runtime, runtime_expires;
+	int idle = 1, throttled;
 
 	raw_spin_lock(&cfs_b->lock);
 	/* no need to continue the timer with no bandwidth constraint */
 	if (cfs_b->quota == RUNTIME_INF)
 		goto out_unlock;
 
-	idle = cfs_b->idle;
+	throttled = !list_empty(&cfs_b->throttled_cfs_rq);
+	/* idle depends on !throttled (for the case of a large deficit) */
+	idle = cfs_b->idle && !throttled;
+
 	/* if we're going inactive then everything else can be deferred */
 	if (idle)
 		goto out_unlock;
 
 	__refill_cfs_bandwidth_runtime(cfs_b);
 
+	if (!throttled) {
+		/* mark as potentially idle for the upcoming period */
+		cfs_b->idle = 1;
+		goto out_unlock;
+	}
+
+	/*
+	 * There are throttled entities so we must first use the new bandwidth
+	 * to unthrottle them before making it generally available.  This
+	 * ensures that all existing debts will be paid before a new cfs_rq is
+	 * allowed to run.
+	 */
+	runtime = cfs_b->runtime;
+	runtime_expires = cfs_b->runtime_expires;
+	cfs_b->runtime = 0;
+
+	/*
+	 * This check is repeated as we are holding onto the new bandwidth
+	 * while we unthrottle.  This can potentially race with an unthrottled
+	 * group trying to acquire new bandwidth from the global pool.
+	 */
+	while (throttled && runtime > 0) {
+		raw_spin_unlock(&cfs_b->lock);
+		/* we can't nest cfs_b->lock while distributing bandwidth */
+		runtime = distribute_cfs_runtime(cfs_b, runtime,
+						 runtime_expires);
+		raw_spin_lock(&cfs_b->lock);
+
+		throttled = !list_empty(&cfs_b->throttled_cfs_rq);
+	}
 
-	/* mark as potentially idle for the upcoming period */
-	cfs_b->idle = 1;
+	/* return (any) remaining runtime */
+	cfs_b->runtime = runtime;
+	/*
+	 * While we are ensured activity in the period following an
+	 * unthrottle, this also covers the case in which the new bandwidth is
+	 * insufficient to cover the existing bandwidth deficit.  (Forcing the
+	 * timer to remain active while there are any throttled entities.)
+	 */
+	cfs_b->idle = 0;
 out_unlock:
 	if (idle)
 		cfs_b->timer_active = 0;
-- 
cgit v1.2.2


From 8277434ef1202ce30315f8edb3fc760aa6e74493 Mon Sep 17 00:00:00 2001
From: Paul Turner <pjt@google.com>
Date: Thu, 21 Jul 2011 09:43:35 -0700
Subject: sched: Allow for positional tg_tree walks

Extend walk_tg_tree to accept a positional argument

static int walk_tg_tree_from(struct task_group *from,
			     tg_visitor down, tg_visitor up, void *data)

Existing semantics are preserved, caller must hold rcu_lock() or sufficient
analogue.

Signed-off-by: Paul Turner <pjt@google.com>
Reviewed-by: Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Link: http://lkml.kernel.org/r/20110721184757.677889157@google.com
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c | 50 +++++++++++++++++++++++++++++++++++++-------------
 1 file changed, 37 insertions(+), 13 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index 4bbabc2c4a77..8ec1e7ac2894 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -1591,20 +1591,23 @@ static inline void dec_cpu_load(struct rq *rq, unsigned long load)
 typedef int (*tg_visitor)(struct task_group *, void *);
 
 /*
- * Iterate the full tree, calling @down when first entering a node and @up when
- * leaving it for the final time.
+ * Iterate task_group tree rooted at *from, calling @down when first entering a
+ * node and @up when leaving it for the final time.
+ *
+ * Caller must hold rcu_lock or sufficient equivalent.
  */
-static int walk_tg_tree(tg_visitor down, tg_visitor up, void *data)
+static int walk_tg_tree_from(struct task_group *from,
+			     tg_visitor down, tg_visitor up, void *data)
 {
 	struct task_group *parent, *child;
 	int ret;
 
-	rcu_read_lock();
-	parent = &root_task_group;
+	parent = from;
+
 down:
 	ret = (*down)(parent, data);
 	if (ret)
-		goto out_unlock;
+		goto out;
 	list_for_each_entry_rcu(child, &parent->children, siblings) {
 		parent = child;
 		goto down;
@@ -1613,19 +1616,29 @@ up:
 		continue;
 	}
 	ret = (*up)(parent, data);
-	if (ret)
-		goto out_unlock;
+	if (ret || parent == from)
+		goto out;
 
 	child = parent;
 	parent = parent->parent;
 	if (parent)
 		goto up;
-out_unlock:
-	rcu_read_unlock();
-
+out:
 	return ret;
 }
 
+/*
+ * Iterate the full tree, calling @down when first entering a node and @up when
+ * leaving it for the final time.
+ *
+ * Caller must hold rcu_lock or sufficient equivalent.
+ */
+
+static inline int walk_tg_tree(tg_visitor down, tg_visitor up, void *data)
+{
+	return walk_tg_tree_from(&root_task_group, down, up, data);
+}
+
 static int tg_nop(struct task_group *tg, void *data)
 {
 	return 0;
@@ -8870,13 +8883,19 @@ static int tg_rt_schedulable(struct task_group *tg, void *data)
 
 static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime)
 {
+	int ret;
+
 	struct rt_schedulable_data data = {
 		.tg = tg,
 		.rt_period = period,
 		.rt_runtime = runtime,
 	};
 
-	return walk_tg_tree(tg_rt_schedulable, tg_nop, &data);
+	rcu_read_lock();
+	ret = walk_tg_tree(tg_rt_schedulable, tg_nop, &data);
+	rcu_read_unlock();
+
+	return ret;
 }
 
 static int tg_set_rt_bandwidth(struct task_group *tg,
@@ -9333,6 +9352,7 @@ static int tg_cfs_schedulable_down(struct task_group *tg, void *data)
 
 static int __cfs_schedulable(struct task_group *tg, u64 period, u64 quota)
 {
+	int ret;
 	struct cfs_schedulable_data data = {
 		.tg = tg,
 		.period = period,
@@ -9344,7 +9364,11 @@ static int __cfs_schedulable(struct task_group *tg, u64 period, u64 quota)
 		do_div(data.quota, NSEC_PER_USEC);
 	}
 
-	return walk_tg_tree(tg_cfs_schedulable_down, tg_nop, &data);
+	rcu_read_lock();
+	ret = walk_tg_tree(tg_cfs_schedulable_down, tg_nop, &data);
+	rcu_read_unlock();
+
+	return ret;
 }
 #endif /* CONFIG_CFS_BANDWIDTH */
 #endif /* CONFIG_FAIR_GROUP_SCHED */
-- 
cgit v1.2.2


From 64660c864f46202b932b911a69deb09805bdbaf8 Mon Sep 17 00:00:00 2001
From: Paul Turner <pjt@google.com>
Date: Thu, 21 Jul 2011 09:43:36 -0700
Subject: sched: Prevent interactions with throttled entities

From the perspective of load-balance and shares distribution, throttled
entities should be invisible.

However, both of these operations work on 'active' lists and are not
inherently aware of what group hierarchies may be present.  In some cases this
may be side-stepped (e.g. we could sideload via tg_load_down in load balance)
while in others (e.g. update_shares()) it is more difficult to compute without
incurring some O(n^2) costs.

Instead, track hierarchicaal throttled state at time of transition.  This
allows us to easily identify whether an entity belongs to a throttled hierarchy
and avoid incorrect interactions with it.

Also, when an entity leaves a throttled hierarchy we need to advance its
time averaging for shares averaging so that the elapsed throttled time is not
considered as part of the cfs_rq's operation.

We also use this information to prevent buddy interactions in the wakeup and
yield_to() paths.

Signed-off-by: Paul Turner <pjt@google.com>
Reviewed-by: Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Link: http://lkml.kernel.org/r/20110721184757.777916795@google.com
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c      |  2 +-
 kernel/sched_fair.c | 99 +++++++++++++++++++++++++++++++++++++++++++++++++----
 2 files changed, 94 insertions(+), 7 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index 8ec1e7ac2894..5db05f6fb470 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -402,7 +402,7 @@ struct cfs_rq {
 	u64 runtime_expires;
 	s64 runtime_remaining;
 
-	int throttled;
+	int throttled, throttle_count;
 	struct list_head throttled_list;
 #endif
 #endif
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 76411950ff3b..5a2089492a98 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -706,6 +706,8 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
 }
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
+/* we need this in update_cfs_load and load-balance functions below */
+static inline int throttled_hierarchy(struct cfs_rq *cfs_rq);
 # ifdef CONFIG_SMP
 static void update_cfs_rq_load_contribution(struct cfs_rq *cfs_rq,
 					    int global_update)
@@ -728,7 +730,7 @@ static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update)
 	u64 now, delta;
 	unsigned long load = cfs_rq->load.weight;
 
-	if (cfs_rq->tg == &root_task_group)
+	if (cfs_rq->tg == &root_task_group || throttled_hierarchy(cfs_rq))
 		return;
 
 	now = rq_of(cfs_rq)->clock_task;
@@ -837,7 +839,7 @@ static void update_cfs_shares(struct cfs_rq *cfs_rq)
 
 	tg = cfs_rq->tg;
 	se = tg->se[cpu_of(rq_of(cfs_rq))];
-	if (!se)
+	if (!se || throttled_hierarchy(cfs_rq))
 		return;
 #ifndef CONFIG_SMP
 	if (likely(se->load.weight == tg->shares))
@@ -1403,6 +1405,65 @@ static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
 	return cfs_rq->throttled;
 }
 
+/* check whether cfs_rq, or any parent, is throttled */
+static inline int throttled_hierarchy(struct cfs_rq *cfs_rq)
+{
+	return cfs_rq->throttle_count;
+}
+
+/*
+ * Ensure that neither of the group entities corresponding to src_cpu or
+ * dest_cpu are members of a throttled hierarchy when performing group
+ * load-balance operations.
+ */
+static inline int throttled_lb_pair(struct task_group *tg,
+				    int src_cpu, int dest_cpu)
+{
+	struct cfs_rq *src_cfs_rq, *dest_cfs_rq;
+
+	src_cfs_rq = tg->cfs_rq[src_cpu];
+	dest_cfs_rq = tg->cfs_rq[dest_cpu];
+
+	return throttled_hierarchy(src_cfs_rq) ||
+	       throttled_hierarchy(dest_cfs_rq);
+}
+
+/* updated child weight may affect parent so we have to do this bottom up */
+static int tg_unthrottle_up(struct task_group *tg, void *data)
+{
+	struct rq *rq = data;
+	struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
+
+	cfs_rq->throttle_count--;
+#ifdef CONFIG_SMP
+	if (!cfs_rq->throttle_count) {
+		u64 delta = rq->clock_task - cfs_rq->load_stamp;
+
+		/* leaving throttled state, advance shares averaging windows */
+		cfs_rq->load_stamp += delta;
+		cfs_rq->load_last += delta;
+
+		/* update entity weight now that we are on_rq again */
+		update_cfs_shares(cfs_rq);
+	}
+#endif
+
+	return 0;
+}
+
+static int tg_throttle_down(struct task_group *tg, void *data)
+{
+	struct rq *rq = data;
+	struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
+
+	/* group is entering throttled state, record last load */
+	if (!cfs_rq->throttle_count)
+		update_cfs_load(cfs_rq, 0);
+	cfs_rq->throttle_count++;
+
+	return 0;
+}
+
 static __used void throttle_cfs_rq(struct cfs_rq *cfs_rq)
 {
 	struct rq *rq = rq_of(cfs_rq);
@@ -1413,7 +1474,9 @@ static __used void throttle_cfs_rq(struct cfs_rq *cfs_rq)
 	se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))];
 
 	/* account load preceding throttle */
-	update_cfs_load(cfs_rq, 0);
+	rcu_read_lock();
+	walk_tg_tree_from(cfs_rq->tg, tg_throttle_down, tg_nop, (void *)rq);
+	rcu_read_unlock();
 
 	task_delta = cfs_rq->h_nr_running;
 	for_each_sched_entity(se) {
@@ -1454,6 +1517,10 @@ static void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
 	list_del_rcu(&cfs_rq->throttled_list);
 	raw_spin_unlock(&cfs_b->lock);
 
+	update_rq_clock(rq);
+	/* update hierarchical throttle state */
+	walk_tg_tree_from(cfs_rq->tg, tg_nop, tg_unthrottle_up, (void *)rq);
+
 	if (!cfs_rq->load.weight)
 		return;
 
@@ -1598,6 +1665,17 @@ static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
 {
 	return 0;
 }
+
+static inline int throttled_hierarchy(struct cfs_rq *cfs_rq)
+{
+	return 0;
+}
+
+static inline int throttled_lb_pair(struct task_group *tg,
+				    int src_cpu, int dest_cpu)
+{
+	return 0;
+}
 #endif
 
 /**************************************************
@@ -2493,6 +2571,9 @@ move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,
 
 	for_each_leaf_cfs_rq(busiest, cfs_rq) {
 		list_for_each_entry_safe(p, n, &cfs_rq->tasks, se.group_node) {
+			if (throttled_lb_pair(task_group(p),
+					      busiest->cpu, this_cpu))
+				break;
 
 			if (!can_migrate_task(p, busiest, this_cpu,
 						sd, idle, &pinned))
@@ -2608,8 +2689,13 @@ static void update_shares(int cpu)
 	 * Iterates the task_group tree in a bottom up fashion, see
 	 * list_add_leaf_cfs_rq() for details.
 	 */
-	for_each_leaf_cfs_rq(rq, cfs_rq)
+	for_each_leaf_cfs_rq(rq, cfs_rq) {
+		/* throttled entities do not contribute to load */
+		if (throttled_hierarchy(cfs_rq))
+			continue;
+
 		update_shares_cpu(cfs_rq->tg, cpu);
+	}
 	rcu_read_unlock();
 }
 
@@ -2659,9 +2745,10 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
 		u64 rem_load, moved_load;
 
 		/*
-		 * empty group
+		 * empty group or part of a throttled hierarchy
 		 */
-		if (!busiest_cfs_rq->task_weight)
+		if (!busiest_cfs_rq->task_weight ||
+		    throttled_lb_pair(busiest_cfs_rq->tg, cpu_of(busiest), this_cpu))
 			continue;
 
 		rem_load = (u64)rem_load_move * busiest_weight;
-- 
cgit v1.2.2


From 5238cdd3873e67a98b28c1161d65d2a615c320a3 Mon Sep 17 00:00:00 2001
From: Paul Turner <pjt@google.com>
Date: Thu, 21 Jul 2011 09:43:37 -0700
Subject: sched: Prevent buddy interactions with throttled entities

Buddies allow us to select "on-rq" entities without actually selecting them
from a cfs_rq's rb_tree.  As a result we must ensure that throttled entities
are not falsely nominated as buddies.  The fact that entities are dequeued
within throttle_entity is not sufficient for clearing buddy status as the
nomination may occur after throttling.

Signed-off-by: Paul Turner <pjt@google.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Link: http://lkml.kernel.org/r/20110721184757.886850167@google.com
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched_fair.c | 18 +++++++++++++++++-
 1 file changed, 17 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 5a2089492a98..1d4acbea9e60 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -2348,6 +2348,15 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
 	if (unlikely(se == pse))
 		return;
 
+	/*
+	 * This is possible from callers such as pull_task(), in which we
+	 * unconditionally check_prempt_curr() after an enqueue (which may have
+	 * lead to a throttle).  This both saves work and prevents false
+	 * next-buddy nomination below.
+	 */
+	if (unlikely(throttled_hierarchy(cfs_rq_of(pse))))
+		return;
+
 	if (sched_feat(NEXT_BUDDY) && scale && !(wake_flags & WF_FORK)) {
 		set_next_buddy(pse);
 		next_buddy_marked = 1;
@@ -2356,6 +2365,12 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
 	/*
 	 * We can come here with TIF_NEED_RESCHED already set from new task
 	 * wake up path.
+	 *
+	 * Note: this also catches the edge-case of curr being in a throttled
+	 * group (e.g. via set_curr_task), since update_curr() (in the
+	 * enqueue of curr) will have resulted in resched being set.  This
+	 * prevents us from potentially nominating it as a false LAST_BUDDY
+	 * below.
 	 */
 	if (test_tsk_need_resched(curr))
 		return;
@@ -2474,7 +2489,8 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preemp
 {
 	struct sched_entity *se = &p->se;
 
-	if (!se->on_rq)
+	/* throttled hierarchies are not runnable */
+	if (!se->on_rq || throttled_hierarchy(cfs_rq_of(se)))
 		return false;
 
 	/* Tell the scheduler that we'd really like pse to run next. */
-- 
cgit v1.2.2


From 8cb120d3e41a0464a559d639d519cef563717a4e Mon Sep 17 00:00:00 2001
From: Paul Turner <pjt@google.com>
Date: Thu, 21 Jul 2011 09:43:38 -0700
Subject: sched: Migrate throttled tasks on HOTPLUG

Throttled tasks are invisisble to cpu-offline since they are not eligible for
selection by pick_next_task().  The regular 'escape' path for a thread that is
blocked at offline is via ttwu->select_task_rq, however this will not handle a
throttled group since there are no individual thread wakeups on an unthrottle.

Resolve this by unthrottling offline cpus so that threads can be migrated.

Signed-off-by: Paul Turner <pjt@google.com>
Reviewed-by: Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Link: http://lkml.kernel.org/r/20110721184757.989000590@google.com
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c | 27 +++++++++++++++++++++++++++
 1 file changed, 27 insertions(+)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index 5db05f6fb470..397317248ddd 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -6335,6 +6335,30 @@ static void calc_global_load_remove(struct rq *rq)
 	rq->calc_load_active = 0;
 }
 
+#ifdef CONFIG_CFS_BANDWIDTH
+static void unthrottle_offline_cfs_rqs(struct rq *rq)
+{
+	struct cfs_rq *cfs_rq;
+
+	for_each_leaf_cfs_rq(rq, cfs_rq) {
+		struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
+
+		if (!cfs_rq->runtime_enabled)
+			continue;
+
+		/*
+		 * clock_task is not advancing so we just need to make sure
+		 * there's some valid quota amount
+		 */
+		cfs_rq->runtime_remaining = cfs_b->quota;
+		if (cfs_rq_throttled(cfs_rq))
+			unthrottle_cfs_rq(cfs_rq);
+	}
+}
+#else
+static void unthrottle_offline_cfs_rqs(struct rq *rq) {}
+#endif
+
 /*
  * Migrate all tasks from the rq, sleeping tasks will be migrated by
  * try_to_wake_up()->select_task_rq().
@@ -6360,6 +6384,9 @@ static void migrate_tasks(unsigned int dead_cpu)
 	 */
 	rq->stop = NULL;
 
+	/* Ensure any throttled groups are reachable by pick_next_task */
+	unthrottle_offline_cfs_rqs(rq);
+
 	for ( ; ; ) {
 		/*
 		 * There's this thread running, bail when that's the only
-- 
cgit v1.2.2


From d3d9dc3302368269acf94b7381663b93000fe2fe Mon Sep 17 00:00:00 2001
From: Paul Turner <pjt@google.com>
Date: Thu, 21 Jul 2011 09:43:39 -0700
Subject: sched: Throttle entities exceeding their allowed bandwidth

With the machinery in place to throttle and unthrottle entities, as well as
handle their participation (or lack there of) we can now enable throttling.

There are 2 points that we must check whether it's time to set throttled state:
 put_prev_entity() and enqueue_entity().

- put_prev_entity() is the typical throttle path, we reach it by exceeding our
  allocated run-time within update_curr()->account_cfs_rq_runtime() and going
  through a reschedule.

- enqueue_entity() covers the case of a wake-up into an already throttled
  group.  In this case we know the group cannot be on_rq and can throttle
  immediately.  Checks are added at time of put_prev_entity() and
  enqueue_entity()

Signed-off-by: Paul Turner <pjt@google.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Link: http://lkml.kernel.org/r/20110721184758.091415417@google.com
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched_fair.c | 52 ++++++++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 50 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 1d4acbea9e60..f9f671a7d0af 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -970,6 +970,8 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
 	se->vruntime = vruntime;
 }
 
+static void check_enqueue_throttle(struct cfs_rq *cfs_rq);
+
 static void
 enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 {
@@ -999,8 +1001,10 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 		__enqueue_entity(cfs_rq, se);
 	se->on_rq = 1;
 
-	if (cfs_rq->nr_running == 1)
+	if (cfs_rq->nr_running == 1) {
 		list_add_leaf_cfs_rq(cfs_rq);
+		check_enqueue_throttle(cfs_rq);
+	}
 }
 
 static void __clear_buddies_last(struct sched_entity *se)
@@ -1202,6 +1206,8 @@ static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq)
 	return se;
 }
 
+static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq);
+
 static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
 {
 	/*
@@ -1211,6 +1217,9 @@ static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
 	if (prev->on_rq)
 		update_curr(cfs_rq);
 
+	/* throttle cfs_rqs exceeding runtime */
+	check_cfs_rq_runtime(cfs_rq);
+
 	check_spread(cfs_rq, prev);
 	if (prev->on_rq) {
 		update_stats_wait_start(cfs_rq, prev);
@@ -1464,7 +1473,7 @@ static int tg_throttle_down(struct task_group *tg, void *data)
 	return 0;
 }
 
-static __used void throttle_cfs_rq(struct cfs_rq *cfs_rq)
+static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
 {
 	struct rq *rq = rq_of(cfs_rq);
 	struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
@@ -1657,9 +1666,48 @@ out_unlock:
 
 	return idle;
 }
+
+/*
+ * When a group wakes up we want to make sure that its quota is not already
+ * expired/exceeded, otherwise it may be allowed to steal additional ticks of
+ * runtime as update_curr() throttling can not not trigger until it's on-rq.
+ */
+static void check_enqueue_throttle(struct cfs_rq *cfs_rq)
+{
+	/* an active group must be handled by the update_curr()->put() path */
+	if (!cfs_rq->runtime_enabled || cfs_rq->curr)
+		return;
+
+	/* ensure the group is not already throttled */
+	if (cfs_rq_throttled(cfs_rq))
+		return;
+
+	/* update runtime allocation */
+	account_cfs_rq_runtime(cfs_rq, 0);
+	if (cfs_rq->runtime_remaining <= 0)
+		throttle_cfs_rq(cfs_rq);
+}
+
+/* conditionally throttle active cfs_rq's from put_prev_entity() */
+static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq)
+{
+	if (likely(!cfs_rq->runtime_enabled || cfs_rq->runtime_remaining > 0))
+		return;
+
+	/*
+	 * it's possible for a throttled entity to be forced into a running
+	 * state (e.g. set_curr_task), in this case we're finished.
+	 */
+	if (cfs_rq_throttled(cfs_rq))
+		return;
+
+	throttle_cfs_rq(cfs_rq);
+}
 #else
 static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
 				     unsigned long delta_exec) {}
+static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
+static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {}
 
 static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
 {
-- 
cgit v1.2.2


From e8da1b18b32064c43881bceef0f051c2110c9ab9 Mon Sep 17 00:00:00 2001
From: Nikhil Rao <ncrao@google.com>
Date: Thu, 21 Jul 2011 09:43:40 -0700
Subject: sched: Add exports tracking cfs bandwidth control statistics

This change introduces statistics exports for the cpu sub-system, these are
added through the use of a stat file similar to that exported by other
subsystems.

The following exports are included:

nr_periods:	number of periods in which execution occurred
nr_throttled:	the number of periods above in which execution was throttle
throttled_time:	cumulative wall-time that any cpus have been throttled for
this group

Signed-off-by: Paul Turner <pjt@google.com>
Signed-off-by: Nikhil Rao <ncrao@google.com>
Signed-off-by: Bharata B Rao <bharata@linux.vnet.ibm.com>
Reviewed-by: Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Link: http://lkml.kernel.org/r/20110721184758.198901931@google.com
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c      | 21 +++++++++++++++++++++
 kernel/sched_fair.c |  7 +++++++
 2 files changed, 28 insertions(+)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index 397317248ddd..35c91859f8a6 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -262,6 +262,9 @@ struct cfs_bandwidth {
 	struct hrtimer period_timer;
 	struct list_head throttled_cfs_rq;
 
+	/* statistics */
+	int nr_periods, nr_throttled;
+	u64 throttled_time;
 #endif
 };
 
@@ -402,6 +405,7 @@ struct cfs_rq {
 	u64 runtime_expires;
 	s64 runtime_remaining;
 
+	u64 throttled_timestamp;
 	int throttled, throttle_count;
 	struct list_head throttled_list;
 #endif
@@ -9397,6 +9401,19 @@ static int __cfs_schedulable(struct task_group *tg, u64 period, u64 quota)
 
 	return ret;
 }
+
+static int cpu_stats_show(struct cgroup *cgrp, struct cftype *cft,
+		struct cgroup_map_cb *cb)
+{
+	struct task_group *tg = cgroup_tg(cgrp);
+	struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg);
+
+	cb->fill(cb, "nr_periods", cfs_b->nr_periods);
+	cb->fill(cb, "nr_throttled", cfs_b->nr_throttled);
+	cb->fill(cb, "throttled_time", cfs_b->throttled_time);
+
+	return 0;
+}
 #endif /* CONFIG_CFS_BANDWIDTH */
 #endif /* CONFIG_FAIR_GROUP_SCHED */
 
@@ -9443,6 +9460,10 @@ static struct cftype cpu_files[] = {
 		.read_u64 = cpu_cfs_period_read_u64,
 		.write_u64 = cpu_cfs_period_write_u64,
 	},
+	{
+		.name = "stat",
+		.read_map = cpu_stats_show,
+	},
 #endif
 #ifdef CONFIG_RT_GROUP_SCHED
 	{
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index f9f671a7d0af..d201f28c1de7 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -1506,6 +1506,7 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
 		rq->nr_running -= task_delta;
 
 	cfs_rq->throttled = 1;
+	cfs_rq->throttled_timestamp = rq->clock;
 	raw_spin_lock(&cfs_b->lock);
 	list_add_tail_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);
 	raw_spin_unlock(&cfs_b->lock);
@@ -1523,8 +1524,10 @@ static void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
 
 	cfs_rq->throttled = 0;
 	raw_spin_lock(&cfs_b->lock);
+	cfs_b->throttled_time += rq->clock - cfs_rq->throttled_timestamp;
 	list_del_rcu(&cfs_rq->throttled_list);
 	raw_spin_unlock(&cfs_b->lock);
+	cfs_rq->throttled_timestamp = 0;
 
 	update_rq_clock(rq);
 	/* update hierarchical throttle state */
@@ -1612,6 +1615,7 @@ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun)
 	throttled = !list_empty(&cfs_b->throttled_cfs_rq);
 	/* idle depends on !throttled (for the case of a large deficit) */
 	idle = cfs_b->idle && !throttled;
+	cfs_b->nr_periods += overrun;
 
 	/* if we're going inactive then everything else can be deferred */
 	if (idle)
@@ -1625,6 +1629,9 @@ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun)
 		goto out_unlock;
 	}
 
+	/* account preceding periods in which throttling occurred */
+	cfs_b->nr_throttled += overrun;
+
 	/*
 	 * There are throttled entities so we must first use the new bandwidth
 	 * to unthrottle them before making it generally available.  This
-- 
cgit v1.2.2


From d8b4986d3dbc4fabc2054d63f1d31d6ed2fb1ca8 Mon Sep 17 00:00:00 2001
From: Paul Turner <pjt@google.com>
Date: Thu, 21 Jul 2011 09:43:41 -0700
Subject: sched: Return unused runtime on group dequeue

When a local cfs_rq blocks we return the majority of its remaining quota to the
global bandwidth pool for use by other runqueues.

We do this only when the quota is current and there is more than
min_cfs_rq_quota [1ms by default] of runtime remaining on the rq.

In the case where there are throttled runqueues and we have sufficient
bandwidth to meter out a slice, a second timer is kicked off to handle this
delivery, unthrottling where appropriate.

Using a 'worst case' antagonist which executes on each cpu
for 1ms before moving onto the next on a fairly large machine:

no quota generations:

 197.47 ms       /cgroup/a/cpuacct.usage
 199.46 ms       /cgroup/a/cpuacct.usage
 205.46 ms       /cgroup/a/cpuacct.usage
 198.46 ms       /cgroup/a/cpuacct.usage
 208.39 ms       /cgroup/a/cpuacct.usage

Since we are allowed to use "stale" quota our usage is effectively bounded by
the rate of input into the global pool and performance is relatively stable.

with quota generations [1s increments]:

 119.58 ms       /cgroup/a/cpuacct.usage
 119.65 ms       /cgroup/a/cpuacct.usage
 119.64 ms       /cgroup/a/cpuacct.usage
 119.63 ms       /cgroup/a/cpuacct.usage
 119.60 ms       /cgroup/a/cpuacct.usage

The large deficit here is due to quota generations (/intentionally/) preventing
us from now using previously stranded slack quota.  The cost is that this quota
becomes unavailable.

with quota generations and quota return:

 200.09 ms       /cgroup/a/cpuacct.usage
 200.09 ms       /cgroup/a/cpuacct.usage
 198.09 ms       /cgroup/a/cpuacct.usage
 200.09 ms       /cgroup/a/cpuacct.usage
 200.06 ms       /cgroup/a/cpuacct.usage

By returning unused quota we're able to both stably consume our desired quota
and prevent unintentional overages due to the abuse of slack quota from
previous quota periods (especially on a large machine).

Signed-off-by: Paul Turner <pjt@google.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Link: http://lkml.kernel.org/r/20110721184758.306848658@google.com
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c      |  15 +++++++-
 kernel/sched_fair.c | 108 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 122 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index 35c91859f8a6..6baade0d7649 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -259,7 +259,7 @@ struct cfs_bandwidth {
 	u64 runtime_expires;
 
 	int idle, timer_active;
-	struct hrtimer period_timer;
+	struct hrtimer period_timer, slack_timer;
 	struct list_head throttled_cfs_rq;
 
 	/* statistics */
@@ -421,6 +421,16 @@ static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
 
 static inline u64 default_cfs_period(void);
 static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun);
+static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b);
+
+static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer)
+{
+	struct cfs_bandwidth *cfs_b =
+		container_of(timer, struct cfs_bandwidth, slack_timer);
+	do_sched_cfs_slack_timer(cfs_b);
+
+	return HRTIMER_NORESTART;
+}
 
 static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer)
 {
@@ -453,6 +463,8 @@ static void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
 	INIT_LIST_HEAD(&cfs_b->throttled_cfs_rq);
 	hrtimer_init(&cfs_b->period_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
 	cfs_b->period_timer.function = sched_cfs_period_timer;
+	hrtimer_init(&cfs_b->slack_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+	cfs_b->slack_timer.function = sched_cfs_slack_timer;
 }
 
 static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq)
@@ -488,6 +500,7 @@ static void __start_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
 static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
 {
 	hrtimer_cancel(&cfs_b->period_timer);
+	hrtimer_cancel(&cfs_b->slack_timer);
 }
 #else
 static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index d201f28c1de7..1ca2cd44d64a 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -1052,6 +1052,8 @@ static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se)
 		__clear_buddies_skip(se);
 }
 
+static void return_cfs_rq_runtime(struct cfs_rq *cfs_rq);
+
 static void
 dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 {
@@ -1090,6 +1092,9 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 	if (!(flags & DEQUEUE_SLEEP))
 		se->vruntime -= cfs_rq->min_vruntime;
 
+	/* return excess runtime on last dequeue */
+	return_cfs_rq_runtime(cfs_rq);
+
 	update_min_vruntime(cfs_rq);
 	update_cfs_shares(cfs_rq);
 }
@@ -1674,6 +1679,108 @@ out_unlock:
 	return idle;
 }
 
+/* a cfs_rq won't donate quota below this amount */
+static const u64 min_cfs_rq_runtime = 1 * NSEC_PER_MSEC;
+/* minimum remaining period time to redistribute slack quota */
+static const u64 min_bandwidth_expiration = 2 * NSEC_PER_MSEC;
+/* how long we wait to gather additional slack before distributing */
+static const u64 cfs_bandwidth_slack_period = 5 * NSEC_PER_MSEC;
+
+/* are we near the end of the current quota period? */
+static int runtime_refresh_within(struct cfs_bandwidth *cfs_b, u64 min_expire)
+{
+	struct hrtimer *refresh_timer = &cfs_b->period_timer;
+	u64 remaining;
+
+	/* if the call-back is running a quota refresh is already occurring */
+	if (hrtimer_callback_running(refresh_timer))
+		return 1;
+
+	/* is a quota refresh about to occur? */
+	remaining = ktime_to_ns(hrtimer_expires_remaining(refresh_timer));
+	if (remaining < min_expire)
+		return 1;
+
+	return 0;
+}
+
+static void start_cfs_slack_bandwidth(struct cfs_bandwidth *cfs_b)
+{
+	u64 min_left = cfs_bandwidth_slack_period + min_bandwidth_expiration;
+
+	/* if there's a quota refresh soon don't bother with slack */
+	if (runtime_refresh_within(cfs_b, min_left))
+		return;
+
+	start_bandwidth_timer(&cfs_b->slack_timer,
+				ns_to_ktime(cfs_bandwidth_slack_period));
+}
+
+/* we know any runtime found here is valid as update_curr() precedes return */
+static void __return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
+{
+	struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
+	s64 slack_runtime = cfs_rq->runtime_remaining - min_cfs_rq_runtime;
+
+	if (slack_runtime <= 0)
+		return;
+
+	raw_spin_lock(&cfs_b->lock);
+	if (cfs_b->quota != RUNTIME_INF &&
+	    cfs_rq->runtime_expires == cfs_b->runtime_expires) {
+		cfs_b->runtime += slack_runtime;
+
+		/* we are under rq->lock, defer unthrottling using a timer */
+		if (cfs_b->runtime > sched_cfs_bandwidth_slice() &&
+		    !list_empty(&cfs_b->throttled_cfs_rq))
+			start_cfs_slack_bandwidth(cfs_b);
+	}
+	raw_spin_unlock(&cfs_b->lock);
+
+	/* even if it's not valid for return we don't want to try again */
+	cfs_rq->runtime_remaining -= slack_runtime;
+}
+
+static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
+{
+	if (!cfs_rq->runtime_enabled || !cfs_rq->nr_running)
+		return;
+
+	__return_cfs_rq_runtime(cfs_rq);
+}
+
+/*
+ * This is done with a timer (instead of inline with bandwidth return) since
+ * it's necessary to juggle rq->locks to unthrottle their respective cfs_rqs.
+ */
+static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
+{
+	u64 runtime = 0, slice = sched_cfs_bandwidth_slice();
+	u64 expires;
+
+	/* confirm we're still not at a refresh boundary */
+	if (runtime_refresh_within(cfs_b, min_bandwidth_expiration))
+		return;
+
+	raw_spin_lock(&cfs_b->lock);
+	if (cfs_b->quota != RUNTIME_INF && cfs_b->runtime > slice) {
+		runtime = cfs_b->runtime;
+		cfs_b->runtime = 0;
+	}
+	expires = cfs_b->runtime_expires;
+	raw_spin_unlock(&cfs_b->lock);
+
+	if (!runtime)
+		return;
+
+	runtime = distribute_cfs_runtime(cfs_b, runtime, expires);
+
+	raw_spin_lock(&cfs_b->lock);
+	if (expires == cfs_b->runtime_expires)
+		cfs_b->runtime = runtime;
+	raw_spin_unlock(&cfs_b->lock);
+}
+
 /*
  * When a group wakes up we want to make sure that its quota is not already
  * expired/exceeded, otherwise it may be allowed to steal additional ticks of
@@ -1715,6 +1822,7 @@ static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
 				     unsigned long delta_exec) {}
 static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
 static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {}
+static void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
 
 static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
 {
-- 
cgit v1.2.2


From 81570d9caaad46a056580c9af078c5c55e6c764f Mon Sep 17 00:00:00 2001
From: Jiri Olsa <jolsa@redhat.com>
Date: Thu, 11 Aug 2011 16:25:45 +0200
Subject: tracing/filter: Use static allocation for filter predicates

Don't dynamically allocate filter_pred struct, use static memory.
This way we can get rid of the code managing the dynamic filter_pred
struct object.

The create_pred function integrates create_logical_pred function.
This way the static predicate memory is returned only from
one place.

Signed-off-by: Jiri Olsa <jolsa@redhat.com>
Link: http://lkml.kernel.org/r/1313072754-4620-2-git-send-email-jolsa@redhat.com
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace_events_filter.c | 57 +++++++++++---------------------------
 1 file changed, 16 insertions(+), 41 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index 256764ecccd6..cb295a117ee7 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -630,11 +630,7 @@ find_event_field(struct ftrace_event_call *call, char *name)
 
 static void filter_free_pred(struct filter_pred *pred)
 {
-	if (!pred)
-		return;
-
 	kfree(pred->field_name);
-	kfree(pred);
 }
 
 static void filter_clear_pred(struct filter_pred *pred)
@@ -1302,39 +1298,30 @@ parse_operand:
 	return 0;
 }
 
-static struct filter_pred *create_pred(int op, char *operand1, char *operand2)
+static struct filter_pred *create_pred(struct filter_parse_state *ps,
+				       int op, char *operand1, char *operand2)
 {
-	struct filter_pred *pred;
+	static struct filter_pred pred;
 
-	pred = kzalloc(sizeof(*pred), GFP_KERNEL);
-	if (!pred)
-		return NULL;
+	memset(&pred, 0, sizeof(pred));
+	pred.op = op;
 
-	pred->field_name = kstrdup(operand1, GFP_KERNEL);
-	if (!pred->field_name) {
-		kfree(pred);
+	if (op == OP_AND || op == OP_OR)
+		return &pred;
+
+	if (!operand1 || !operand2) {
+		parse_error(ps, FILT_ERR_MISSING_FIELD, 0);
 		return NULL;
 	}
 
-	strcpy(pred->regex.pattern, operand2);
-	pred->regex.len = strlen(pred->regex.pattern);
-
-	pred->op = op;
-
-	return pred;
-}
-
-static struct filter_pred *create_logical_pred(int op)
-{
-	struct filter_pred *pred;
-
-	pred = kzalloc(sizeof(*pred), GFP_KERNEL);
-	if (!pred)
+	pred.field_name = kstrdup(operand1, GFP_KERNEL);
+	if (!pred.field_name)
 		return NULL;
 
-	pred->op = op;
+	strcpy(pred.regex.pattern, operand2);
+	pred.regex.len = strlen(pred.regex.pattern);
 
-	return pred;
+	return &pred;
 }
 
 static int check_preds(struct filter_parse_state *ps)
@@ -1643,19 +1630,7 @@ static int replace_preds(struct ftrace_event_call *call,
 			goto fail;
 		}
 
-		if (elt->op == OP_AND || elt->op == OP_OR) {
-			pred = create_logical_pred(elt->op);
-			goto add_pred;
-		}
-
-		if (!operand1 || !operand2) {
-			parse_error(ps, FILT_ERR_MISSING_FIELD, 0);
-			err = -EINVAL;
-			goto fail;
-		}
-
-		pred = create_pred(elt->op, operand1, operand2);
-add_pred:
+		pred = create_pred(ps, elt->op, operand1, operand2);
 		if (!pred) {
 			err = -ENOMEM;
 			goto fail;
-- 
cgit v1.2.2


From 9d96cd1743547f07a8a6c51a3f7741cfca0a0bee Mon Sep 17 00:00:00 2001
From: Jiri Olsa <jolsa@redhat.com>
Date: Thu, 11 Aug 2011 16:25:46 +0200
Subject: tracing/filter: Separate predicate init and filter addition

Making the code cleaner by having one function to fully prepare
the predicate (create_pred), and another to add the predicate to
the filter (filter_add_pred).

As a benefit, this way the dry_run flag stays only inside the
replace_preds function and is not passed deeper.

Signed-off-by: Jiri Olsa <jolsa@redhat.com>
Link: http://lkml.kernel.org/r/1313072754-4620-3-git-send-email-jolsa@redhat.com
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace_events_filter.c | 56 ++++++++++++++++----------------------
 1 file changed, 23 insertions(+), 33 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index cb295a117ee7..61c8dec3b2b7 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -685,8 +685,7 @@ __pop_pred_stack(struct pred_stack *stack)
 static int filter_set_pred(struct event_filter *filter,
 			   int idx,
 			   struct pred_stack *stack,
-			   struct filter_pred *src,
-			   filter_pred_fn_t fn)
+			   struct filter_pred *src)
 {
 	struct filter_pred *dest = &filter->preds[idx];
 	struct filter_pred *left;
@@ -698,7 +697,6 @@ static int filter_set_pred(struct event_filter *filter,
 		if (!dest->field_name)
 			return -ENOMEM;
 	}
-	dest->fn = fn;
 	dest->index = idx;
 
 	if (dest->op == OP_OR || dest->op == OP_AND) {
@@ -836,12 +834,10 @@ static void filter_free_subsystem_filters(struct event_subsystem *system)
 	}
 }
 
-static int filter_add_pred_fn(struct filter_parse_state *ps,
-			      struct ftrace_event_call *call,
-			      struct event_filter *filter,
-			      struct filter_pred *pred,
-			      struct pred_stack *stack,
-			      filter_pred_fn_t fn)
+static int filter_add_pred(struct filter_parse_state *ps,
+			   struct event_filter *filter,
+			   struct filter_pred *pred,
+			   struct pred_stack *stack)
 {
 	int idx, err;
 
@@ -852,7 +848,7 @@ static int filter_add_pred_fn(struct filter_parse_state *ps,
 
 	idx = filter->n_preds;
 	filter_clear_pred(&filter->preds[idx]);
-	err = filter_set_pred(filter, idx, stack, pred, fn);
+	err = filter_set_pred(filter, idx, stack, pred);
 	if (err)
 		return err;
 
@@ -933,25 +929,16 @@ static filter_pred_fn_t select_comparison_fn(int op, int field_size,
 	return fn;
 }
 
-static int filter_add_pred(struct filter_parse_state *ps,
-			   struct ftrace_event_call *call,
-			   struct event_filter *filter,
-			   struct filter_pred *pred,
-			   struct pred_stack *stack,
-			   bool dry_run)
+static int init_pred(struct filter_parse_state *ps,
+		     struct ftrace_event_call *call,
+		     struct filter_pred *pred)
+
 {
 	struct ftrace_event_field *field;
-	filter_pred_fn_t fn;
+	filter_pred_fn_t fn = filter_pred_none;
 	unsigned long long val;
 	int ret;
 
-	fn = pred->fn = filter_pred_none;
-
-	if (pred->op == OP_AND)
-		goto add_pred_fn;
-	else if (pred->op == OP_OR)
-		goto add_pred_fn;
-
 	field = find_event_field(call, pred->field_name);
 	if (!field) {
 		parse_error(ps, FILT_ERR_FIELD_NOT_FOUND, 0);
@@ -997,9 +984,7 @@ static int filter_add_pred(struct filter_parse_state *ps,
 	if (pred->op == OP_NE)
 		pred->not = 1;
 
-add_pred_fn:
-	if (!dry_run)
-		return filter_add_pred_fn(ps, call, filter, pred, stack, fn);
+	pred->fn = fn;
 	return 0;
 }
 
@@ -1299,6 +1284,7 @@ parse_operand:
 }
 
 static struct filter_pred *create_pred(struct filter_parse_state *ps,
+				       struct ftrace_event_call *call,
 				       int op, char *operand1, char *operand2)
 {
 	static struct filter_pred pred;
@@ -1321,7 +1307,7 @@ static struct filter_pred *create_pred(struct filter_parse_state *ps,
 	strcpy(pred.regex.pattern, operand2);
 	pred.regex.len = strlen(pred.regex.pattern);
 
-	return &pred;
+	return init_pred(ps, call, &pred) ? NULL : &pred;
 }
 
 static int check_preds(struct filter_parse_state *ps)
@@ -1630,16 +1616,20 @@ static int replace_preds(struct ftrace_event_call *call,
 			goto fail;
 		}
 
-		pred = create_pred(ps, elt->op, operand1, operand2);
+		pred = create_pred(ps, call, elt->op, operand1, operand2);
 		if (!pred) {
 			err = -ENOMEM;
 			goto fail;
 		}
-		err = filter_add_pred(ps, call, filter, pred, &stack, dry_run);
-		filter_free_pred(pred);
-		if (err)
-			goto fail;
+		if (!dry_run) {
+			err = filter_add_pred(ps, filter, pred, &stack);
+			if (err) {
+				filter_free_pred(pred);
+				goto fail;
+			}
+		}
 
+		filter_free_pred(pred);
 		operand1 = operand2 = NULL;
 	}
 
-- 
cgit v1.2.2


From 61aaef55300088e12d7f853adeea65d1aa1562db Mon Sep 17 00:00:00 2001
From: Jiri Olsa <jolsa@redhat.com>
Date: Thu, 11 Aug 2011 16:25:47 +0200
Subject: tracing/filter: Remove field_name from filter_pred struct

The field_name was used just for finding event's fields. This way we
don't need to care about field_name allocation/free.

Signed-off-by: Jiri Olsa <jolsa@redhat.com>
Link: http://lkml.kernel.org/r/1313072754-4620-4-git-send-email-jolsa@redhat.com
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace.h               | 11 +-------
 kernel/trace/trace_events_filter.c | 53 +++++++++-----------------------------
 2 files changed, 13 insertions(+), 51 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 616846bcfee5..2eb3cf6d37bc 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -761,16 +761,7 @@ struct filter_pred {
 	filter_pred_fn_t 	fn;
 	u64 			val;
 	struct regex		regex;
-	/*
-	 * Leaf nodes use field_name, ops is used by AND and OR
-	 * nodes. The field_name is always freed when freeing a pred.
-	 * We can overload field_name for ops and have it freed
-	 * as well.
-	 */
-	union {
-		char		*field_name;
-		unsigned short	*ops;
-	};
+	unsigned short		*ops;
 	int 			offset;
 	int 			not;
 	int 			op;
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index 61c8dec3b2b7..97b93f31884b 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -628,18 +628,6 @@ find_event_field(struct ftrace_event_call *call, char *name)
 	return __find_event_field(head, name);
 }
 
-static void filter_free_pred(struct filter_pred *pred)
-{
-	kfree(pred->field_name);
-}
-
-static void filter_clear_pred(struct filter_pred *pred)
-{
-	kfree(pred->field_name);
-	pred->field_name = NULL;
-	pred->regex.len = 0;
-}
-
 static int __alloc_pred_stack(struct pred_stack *stack, int n_preds)
 {
 	stack->preds = kzalloc(sizeof(*stack->preds)*(n_preds + 1), GFP_KERNEL);
@@ -692,11 +680,6 @@ static int filter_set_pred(struct event_filter *filter,
 	struct filter_pred *right;
 
 	*dest = *src;
-	if (src->field_name) {
-		dest->field_name = kstrdup(src->field_name, GFP_KERNEL);
-		if (!dest->field_name)
-			return -ENOMEM;
-	}
 	dest->index = idx;
 
 	if (dest->op == OP_OR || dest->op == OP_AND) {
@@ -737,11 +720,7 @@ static int filter_set_pred(struct event_filter *filter,
 
 static void __free_preds(struct event_filter *filter)
 {
-	int i;
-
 	if (filter->preds) {
-		for (i = 0; i < filter->a_preds; i++)
-			kfree(filter->preds[i].field_name);
 		kfree(filter->preds);
 		filter->preds = NULL;
 	}
@@ -839,16 +818,14 @@ static int filter_add_pred(struct filter_parse_state *ps,
 			   struct filter_pred *pred,
 			   struct pred_stack *stack)
 {
-	int idx, err;
+	int err;
 
 	if (WARN_ON(filter->n_preds == filter->a_preds)) {
 		parse_error(ps, FILT_ERR_TOO_MANY_PREDS, 0);
 		return -ENOSPC;
 	}
 
-	idx = filter->n_preds;
-	filter_clear_pred(&filter->preds[idx]);
-	err = filter_set_pred(filter, idx, stack, pred);
+	err = filter_set_pred(filter, filter->n_preds, stack, pred);
 	if (err)
 		return err;
 
@@ -930,21 +907,14 @@ static filter_pred_fn_t select_comparison_fn(int op, int field_size,
 }
 
 static int init_pred(struct filter_parse_state *ps,
-		     struct ftrace_event_call *call,
+		     struct ftrace_event_field *field,
 		     struct filter_pred *pred)
 
 {
-	struct ftrace_event_field *field;
 	filter_pred_fn_t fn = filter_pred_none;
 	unsigned long long val;
 	int ret;
 
-	field = find_event_field(call, pred->field_name);
-	if (!field) {
-		parse_error(ps, FILT_ERR_FIELD_NOT_FOUND, 0);
-		return -EINVAL;
-	}
-
 	pred->offset = field->offset;
 
 	if (!is_legal_op(field, pred->op)) {
@@ -1287,6 +1257,7 @@ static struct filter_pred *create_pred(struct filter_parse_state *ps,
 				       struct ftrace_event_call *call,
 				       int op, char *operand1, char *operand2)
 {
+	struct ftrace_event_field *field;
 	static struct filter_pred pred;
 
 	memset(&pred, 0, sizeof(pred));
@@ -1300,14 +1271,16 @@ static struct filter_pred *create_pred(struct filter_parse_state *ps,
 		return NULL;
 	}
 
-	pred.field_name = kstrdup(operand1, GFP_KERNEL);
-	if (!pred.field_name)
+	field = find_event_field(call, operand1);
+	if (!field) {
+		parse_error(ps, FILT_ERR_FIELD_NOT_FOUND, 0);
 		return NULL;
+	}
 
 	strcpy(pred.regex.pattern, operand2);
 	pred.regex.len = strlen(pred.regex.pattern);
 
-	return init_pred(ps, call, &pred) ? NULL : &pred;
+	return init_pred(ps, field, &pred) ? NULL : &pred;
 }
 
 static int check_preds(struct filter_parse_state *ps)
@@ -1618,18 +1591,16 @@ static int replace_preds(struct ftrace_event_call *call,
 
 		pred = create_pred(ps, call, elt->op, operand1, operand2);
 		if (!pred) {
-			err = -ENOMEM;
+			err = -EINVAL;
 			goto fail;
 		}
+
 		if (!dry_run) {
 			err = filter_add_pred(ps, filter, pred, &stack);
-			if (err) {
-				filter_free_pred(pred);
+			if (err)
 				goto fail;
-			}
 		}
 
-		filter_free_pred(pred);
 		operand1 = operand2 = NULL;
 	}
 
-- 
cgit v1.2.2


From 3f78f935e7fec3067b7990f9e4d324e1de70f79c Mon Sep 17 00:00:00 2001
From: Jiri Olsa <jolsa@redhat.com>
Date: Thu, 11 Aug 2011 16:25:48 +0200
Subject: tracing/filter: Simplify tracepoint event lookup

We dont need to perform lookup through the ftrace_events list,
instead we can use the 'tp_event' field.

Each perf_event contains tracepoint event field 'tp_event', which
got initialized during the tracepoint event initialization.

Signed-off-by: Jiri Olsa <jolsa@redhat.com>
Link: http://lkml.kernel.org/r/1313072754-4620-5-git-send-email-jolsa@redhat.com
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace_events_filter.c | 9 +++------
 1 file changed, 3 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index 97b93f31884b..0948905dd39c 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -1894,17 +1894,14 @@ int ftrace_profile_set_filter(struct perf_event *event, int event_id,
 	int err;
 	struct event_filter *filter;
 	struct filter_parse_state *ps;
-	struct ftrace_event_call *call = NULL;
+	struct ftrace_event_call *call;
 
 	mutex_lock(&event_mutex);
 
-	list_for_each_entry(call, &ftrace_events, list) {
-		if (call->event.type == event_id)
-			break;
-	}
+	call = event->tp_event;
 
 	err = -EINVAL;
-	if (&call->list == &ftrace_events)
+	if (!call)
 		goto out_unlock;
 
 	err = -EEXIST;
-- 
cgit v1.2.2


From f03f5979945c573801c25ba3089ef17c4d7edc61 Mon Sep 17 00:00:00 2001
From: Jiri Olsa <jolsa@redhat.com>
Date: Thu, 11 Aug 2011 16:25:49 +0200
Subject: tracing/filter: Unify predicate tree walking, change check_pred_tree
 function to use it

Adding walk_pred_tree function to be used for walking throught
the filter predicates.

For each predicate the callback function is called, allowing
users to add their own functionality or customize their way
through the filter predicates.

Changing check_pred_tree function to use walk_pred_tree.

Signed-off-by: Jiri Olsa <jolsa@redhat.com>
Link: http://lkml.kernel.org/r/1313072754-4620-6-git-send-email-jolsa@redhat.com
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace_events_filter.c | 137 +++++++++++++++++++++++--------------
 1 file changed, 86 insertions(+), 51 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index 0948905dd39c..5b889d43d856 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -381,6 +381,63 @@ get_pred_parent(struct filter_pred *pred, struct filter_pred *preds,
 	return pred;
 }
 
+enum walk_return {
+	WALK_PRED_ABORT,
+	WALK_PRED_PARENT,
+	WALK_PRED_DEFAULT,
+};
+
+typedef int (*filter_pred_walkcb_t) (enum move_type move,
+				     struct filter_pred *pred,
+				     int *err, void *data);
+
+static int walk_pred_tree(struct filter_pred *preds,
+			  struct filter_pred *root,
+			  filter_pred_walkcb_t cb, void *data)
+{
+	struct filter_pred *pred = root;
+	enum move_type move = MOVE_DOWN;
+	int done = 0;
+
+	if  (!preds)
+		return -EINVAL;
+
+	do {
+		int err = 0, ret;
+
+		ret = cb(move, pred, &err, data);
+		if (ret == WALK_PRED_ABORT)
+			return err;
+		if (ret == WALK_PRED_PARENT)
+			goto get_parent;
+
+		switch (move) {
+		case MOVE_DOWN:
+			if (pred->left != FILTER_PRED_INVALID) {
+				pred = &preds[pred->left];
+				continue;
+			}
+			goto get_parent;
+		case MOVE_UP_FROM_LEFT:
+			pred = &preds[pred->right];
+			move = MOVE_DOWN;
+			continue;
+		case MOVE_UP_FROM_RIGHT:
+ get_parent:
+			if (pred == root)
+				break;
+			pred = get_pred_parent(pred, preds,
+					       pred->parent,
+					       &move);
+			continue;
+		}
+		done = 1;
+	} while (!done);
+
+	/* We are fine. */
+	return 0;
+}
+
 /*
  * A series of AND or ORs where found together. Instead of
  * climbing up and down the tree branches, an array of the
@@ -1321,6 +1378,23 @@ static int count_preds(struct filter_parse_state *ps)
 	return n_preds;
 }
 
+struct check_pred_data {
+	int count;
+	int max;
+};
+
+static int check_pred_tree_cb(enum move_type move, struct filter_pred *pred,
+			      int *err, void *data)
+{
+	struct check_pred_data *d = data;
+
+	if (WARN_ON(d->count++ > d->max)) {
+		*err = -EINVAL;
+		return WALK_PRED_ABORT;
+	}
+	return WALK_PRED_DEFAULT;
+}
+
 /*
  * The tree is walked at filtering of an event. If the tree is not correctly
  * built, it may cause an infinite loop. Check here that the tree does
@@ -1329,58 +1403,19 @@ static int count_preds(struct filter_parse_state *ps)
 static int check_pred_tree(struct event_filter *filter,
 			   struct filter_pred *root)
 {
-	struct filter_pred *preds;
-	struct filter_pred *pred;
-	enum move_type move = MOVE_DOWN;
-	int count = 0;
-	int done = 0;
-	int max;
-
-	/*
-	 * The max that we can hit a node is three times.
-	 * Once going down, once coming up from left, and
-	 * once coming up from right. This is more than enough
-	 * since leafs are only hit a single time.
-	 */
-	max = 3 * filter->n_preds;
-
-	preds = filter->preds;
-	if  (!preds)
-		return -EINVAL;
-	pred = root;
-
-	do {
-		if (WARN_ON(count++ > max))
-			return -EINVAL;
-
-		switch (move) {
-		case MOVE_DOWN:
-			if (pred->left != FILTER_PRED_INVALID) {
-				pred = &preds[pred->left];
-				continue;
-			}
-			/* A leaf at the root is just a leaf in the tree */
-			if (pred == root)
-				break;
-			pred = get_pred_parent(pred, preds,
-					       pred->parent, &move);
-			continue;
-		case MOVE_UP_FROM_LEFT:
-			pred = &preds[pred->right];
-			move = MOVE_DOWN;
-			continue;
-		case MOVE_UP_FROM_RIGHT:
-			if (pred == root)
-				break;
-			pred = get_pred_parent(pred, preds,
-					       pred->parent, &move);
-			continue;
-		}
-		done = 1;
-	} while (!done);
+	struct check_pred_data data = {
+		/*
+		 * The max that we can hit a node is three times.
+		 * Once going down, once coming up from left, and
+		 * once coming up from right. This is more than enough
+		 * since leafs are only hit a single time.
+		 */
+		.max   = 3 * filter->n_preds,
+		.count = 0,
+	};
 
-	/* We are fine. */
-	return 0;
+	return walk_pred_tree(filter->preds, root,
+			      check_pred_tree_cb, &data);
 }
 
 static int count_leafs(struct filter_pred *preds, struct filter_pred *root)
-- 
cgit v1.2.2


From c00b060f36e1238816ebcf2c8cccd5e9fa068980 Mon Sep 17 00:00:00 2001
From: Jiri Olsa <jolsa@redhat.com>
Date: Thu, 11 Aug 2011 16:25:50 +0200
Subject: tracing/filter: Change count_leafs function to use walk_pred_tree

Changing count_leafs function to use unified predicates tree
processing.

Signed-off-by: Jiri Olsa <jolsa@redhat.com>
Link: http://lkml.kernel.org/r/1313072754-4620-7-git-send-email-jolsa@redhat.com
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace_events_filter.c | 47 ++++++++++++--------------------------
 1 file changed, 14 insertions(+), 33 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index 5b889d43d856..ebbb2611982e 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -1418,43 +1418,24 @@ static int check_pred_tree(struct event_filter *filter,
 			      check_pred_tree_cb, &data);
 }
 
-static int count_leafs(struct filter_pred *preds, struct filter_pred *root)
+static int count_leafs_cb(enum move_type move, struct filter_pred *pred,
+			  int *err, void *data)
 {
-	struct filter_pred *pred;
-	enum move_type move = MOVE_DOWN;
-	int count = 0;
-	int done = 0;
+	int *count = data;
 
-	pred = root;
+	if ((move == MOVE_DOWN) &&
+	    (pred->left == FILTER_PRED_INVALID))
+		(*count)++;
 
-	do {
-		switch (move) {
-		case MOVE_DOWN:
-			if (pred->left != FILTER_PRED_INVALID) {
-				pred = &preds[pred->left];
-				continue;
-			}
-			/* A leaf at the root is just a leaf in the tree */
-			if (pred == root)
-				return 1;
-			count++;
-			pred = get_pred_parent(pred, preds,
-					       pred->parent, &move);
-			continue;
-		case MOVE_UP_FROM_LEFT:
-			pred = &preds[pred->right];
-			move = MOVE_DOWN;
-			continue;
-		case MOVE_UP_FROM_RIGHT:
-			if (pred == root)
-				break;
-			pred = get_pred_parent(pred, preds,
-					       pred->parent, &move);
-			continue;
-		}
-		done = 1;
-	} while (!done);
+	return WALK_PRED_DEFAULT;
+}
+
+static int count_leafs(struct filter_pred *preds, struct filter_pred *root)
+{
+	int count = 0, ret;
 
+	ret = walk_pred_tree(preds, root, count_leafs_cb, &count);
+	WARN_ON(ret);
 	return count;
 }
 
-- 
cgit v1.2.2


From 1b797fe5aaac11e60fce1592119d0517e95aba95 Mon Sep 17 00:00:00 2001
From: Jiri Olsa <jolsa@redhat.com>
Date: Thu, 11 Aug 2011 16:25:51 +0200
Subject: tracing/filter: Change fold_pred_tree function to use walk_pred_tree

Changing fold_pred_tree function to use unified predicates tree
processing.

Signed-off-by: Jiri Olsa <jolsa@redhat.com>
Link: http://lkml.kernel.org/r/1313072754-4620-8-git-send-email-jolsa@redhat.com
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace_events_filter.c | 65 ++++++++++++--------------------------
 1 file changed, 20 insertions(+), 45 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index ebbb2611982e..d8aa100cb22e 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -1496,6 +1496,24 @@ static int fold_pred(struct filter_pred *preds, struct filter_pred *root)
 	return 0;
 }
 
+static int fold_pred_tree_cb(enum move_type move, struct filter_pred *pred,
+			     int *err, void *data)
+{
+	struct filter_pred *preds = data;
+
+	if (move != MOVE_DOWN)
+		return WALK_PRED_DEFAULT;
+	if (!(pred->index & FILTER_PRED_FOLD))
+		return WALK_PRED_DEFAULT;
+
+	*err = fold_pred(preds, pred);
+	if (*err)
+		return WALK_PRED_ABORT;
+
+	/* eveyrhing below is folded, continue with parent */
+	return WALK_PRED_PARENT;
+}
+
 /*
  * To optimize the processing of the ops, if we have several "ors" or
  * "ands" together, we can put them in an array and process them all
@@ -1504,51 +1522,8 @@ static int fold_pred(struct filter_pred *preds, struct filter_pred *root)
 static int fold_pred_tree(struct event_filter *filter,
 			   struct filter_pred *root)
 {
-	struct filter_pred *preds;
-	struct filter_pred *pred;
-	enum move_type move = MOVE_DOWN;
-	int done = 0;
-	int err;
-
-	preds = filter->preds;
-	if  (!preds)
-		return -EINVAL;
-	pred = root;
-
-	do {
-		switch (move) {
-		case MOVE_DOWN:
-			if (pred->index & FILTER_PRED_FOLD) {
-				err = fold_pred(preds, pred);
-				if (err)
-					return err;
-				/* Folded nodes are like leafs */
-			} else if (pred->left != FILTER_PRED_INVALID) {
-				pred = &preds[pred->left];
-				continue;
-			}
-
-			/* A leaf at the root is just a leaf in the tree */
-			if (pred == root)
-				break;
-			pred = get_pred_parent(pred, preds,
-					       pred->parent, &move);
-			continue;
-		case MOVE_UP_FROM_LEFT:
-			pred = &preds[pred->right];
-			move = MOVE_DOWN;
-			continue;
-		case MOVE_UP_FROM_RIGHT:
-			if (pred == root)
-				break;
-			pred = get_pred_parent(pred, preds,
-					       pred->parent, &move);
-			continue;
-		}
-		done = 1;
-	} while (!done);
-
-	return 0;
+	return walk_pred_tree(filter->preds, root, fold_pred_tree_cb,
+			      filter->preds);
 }
 
 static int replace_preds(struct ftrace_event_call *call,
-- 
cgit v1.2.2


From 96bc293a97e7f1651977976be7dd42031f6d8ea3 Mon Sep 17 00:00:00 2001
From: Jiri Olsa <jolsa@redhat.com>
Date: Thu, 11 Aug 2011 16:25:52 +0200
Subject: tracing/filter: Change fold_pred function to use walk_pred_tree

Changing fold_pred_tree function to use unified predicates tree
processing.

Signed-off-by: Jiri Olsa <jolsa@redhat.com>
Link: http://lkml.kernel.org/r/1313072754-4620-9-git-send-email-jolsa@redhat.com
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace_events_filter.c | 68 ++++++++++++++++++--------------------
 1 file changed, 33 insertions(+), 35 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index d8aa100cb22e..f44e68b89f15 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -1439,13 +1439,40 @@ static int count_leafs(struct filter_pred *preds, struct filter_pred *root)
 	return count;
 }
 
+struct fold_pred_data {
+	struct filter_pred *root;
+	int count;
+	int children;
+};
+
+static int fold_pred_cb(enum move_type move, struct filter_pred *pred,
+			int *err, void *data)
+{
+	struct fold_pred_data *d = data;
+	struct filter_pred *root = d->root;
+
+	if (move != MOVE_DOWN)
+		return WALK_PRED_DEFAULT;
+	if (pred->left != FILTER_PRED_INVALID)
+		return WALK_PRED_DEFAULT;
+
+	if (WARN_ON(d->count == d->children)) {
+		*err = -EINVAL;
+		return WALK_PRED_ABORT;
+	}
+
+	pred->index &= ~FILTER_PRED_FOLD;
+	root->ops[d->count++] = pred->index;
+	return WALK_PRED_DEFAULT;
+}
+
 static int fold_pred(struct filter_pred *preds, struct filter_pred *root)
 {
-	struct filter_pred *pred;
-	enum move_type move = MOVE_DOWN;
-	int count = 0;
+	struct fold_pred_data data = {
+		.root  = root,
+		.count = 0,
+	};
 	int children;
-	int done = 0;
 
 	/* No need to keep the fold flag */
 	root->index &= ~FILTER_PRED_FOLD;
@@ -1463,37 +1490,8 @@ static int fold_pred(struct filter_pred *preds, struct filter_pred *root)
 		return -ENOMEM;
 
 	root->val = children;
-
-	pred = root;
-	do {
-		switch (move) {
-		case MOVE_DOWN:
-			if (pred->left != FILTER_PRED_INVALID) {
-				pred = &preds[pred->left];
-				continue;
-			}
-			if (WARN_ON(count == children))
-				return -EINVAL;
-			pred->index &= ~FILTER_PRED_FOLD;
-			root->ops[count++] = pred->index;
-			pred = get_pred_parent(pred, preds,
-					       pred->parent, &move);
-			continue;
-		case MOVE_UP_FROM_LEFT:
-			pred = &preds[pred->right];
-			move = MOVE_DOWN;
-			continue;
-		case MOVE_UP_FROM_RIGHT:
-			if (pred == root)
-				break;
-			pred = get_pred_parent(pred, preds,
-					       pred->parent, &move);
-			continue;
-		}
-		done = 1;
-	} while (!done);
-
-	return 0;
+	data.children = children;
+	return walk_pred_tree(preds, root, fold_pred_cb, &data);
 }
 
 static int fold_pred_tree_cb(enum move_type move, struct filter_pred *pred,
-- 
cgit v1.2.2


From f30120fce1efaa426f340a354d5ace36dab59f0e Mon Sep 17 00:00:00 2001
From: Jiri Olsa <jolsa@redhat.com>
Date: Thu, 11 Aug 2011 16:25:53 +0200
Subject: tracing/filter: Change filter_match_preds function to use
 walk_pred_tree

Changing filter_match_preds function to use unified predicates tree
processing.

Signed-off-by: Jiri Olsa <jolsa@redhat.com>
Link: http://lkml.kernel.org/r/1313072754-4620-10-git-send-email-jolsa@redhat.com
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace_events_filter.c | 124 +++++++++++++++++--------------------
 1 file changed, 58 insertions(+), 66 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index f44e68b89f15..319c3cac7d95 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -467,99 +467,91 @@ static int process_ops(struct filter_pred *preds,
 
 	for (i = 0; i < op->val; i++) {
 		pred = &preds[op->ops[i]];
-		match = pred->fn(pred, rec);
+		if (!WARN_ON_ONCE(!pred->fn))
+			match = pred->fn(pred, rec);
 		if (!!match == type)
 			return match;
 	}
 	return match;
 }
 
+struct filter_match_preds_data {
+	struct filter_pred *preds;
+	int match;
+	void *rec;
+};
+
+static int filter_match_preds_cb(enum move_type move, struct filter_pred *pred,
+				 int *err, void *data)
+{
+	struct filter_match_preds_data *d = data;
+
+	*err = 0;
+	switch (move) {
+	case MOVE_DOWN:
+		/* only AND and OR have children */
+		if (pred->left != FILTER_PRED_INVALID) {
+			/* If ops is set, then it was folded. */
+			if (!pred->ops)
+				return WALK_PRED_DEFAULT;
+			/* We can treat folded ops as a leaf node */
+			d->match = process_ops(d->preds, pred, d->rec);
+		} else {
+			if (!WARN_ON_ONCE(!pred->fn))
+				d->match = pred->fn(pred, d->rec);
+		}
+
+		return WALK_PRED_PARENT;
+	case MOVE_UP_FROM_LEFT:
+		/*
+		 * Check for short circuits.
+		 *
+		 * Optimization: !!match == (pred->op == OP_OR)
+		 *   is the same as:
+		 * if ((match && pred->op == OP_OR) ||
+		 *     (!match && pred->op == OP_AND))
+		 */
+		if (!!d->match == (pred->op == OP_OR))
+			return WALK_PRED_PARENT;
+		break;
+	case MOVE_UP_FROM_RIGHT:
+		break;
+	}
+
+	return WALK_PRED_DEFAULT;
+}
+
 /* return 1 if event matches, 0 otherwise (discard) */
 int filter_match_preds(struct event_filter *filter, void *rec)
 {
-	int match = -1;
-	enum move_type move = MOVE_DOWN;
 	struct filter_pred *preds;
-	struct filter_pred *pred;
 	struct filter_pred *root;
-	int n_preds;
-	int done = 0;
+	struct filter_match_preds_data data = {
+		/* match is currently meaningless */
+		.match = -1,
+		.rec   = rec,
+	};
+	int n_preds, ret;
 
 	/* no filter is considered a match */
 	if (!filter)
 		return 1;
 
 	n_preds = filter->n_preds;
-
 	if (!n_preds)
 		return 1;
 
 	/*
 	 * n_preds, root and filter->preds are protect with preemption disabled.
 	 */
-	preds = rcu_dereference_sched(filter->preds);
 	root = rcu_dereference_sched(filter->root);
 	if (!root)
 		return 1;
 
-	pred = root;
-
-	/* match is currently meaningless */
-	match = -1;
-
-	do {
-		switch (move) {
-		case MOVE_DOWN:
-			/* only AND and OR have children */
-			if (pred->left != FILTER_PRED_INVALID) {
-				/* If ops is set, then it was folded. */
-				if (!pred->ops) {
-					/* keep going to down the left side */
-					pred = &preds[pred->left];
-					continue;
-				}
-				/* We can treat folded ops as a leaf node */
-				match = process_ops(preds, pred, rec);
-			} else
-				match = pred->fn(pred, rec);
-			/* If this pred is the only pred */
-			if (pred == root)
-				break;
-			pred = get_pred_parent(pred, preds,
-					       pred->parent, &move);
-			continue;
-		case MOVE_UP_FROM_LEFT:
-			/*
-			 * Check for short circuits.
-			 *
-			 * Optimization: !!match == (pred->op == OP_OR)
-			 *   is the same as:
-			 * if ((match && pred->op == OP_OR) ||
-			 *     (!match && pred->op == OP_AND))
-			 */
-			if (!!match == (pred->op == OP_OR)) {
-				if (pred == root)
-					break;
-				pred = get_pred_parent(pred, preds,
-						       pred->parent, &move);
-				continue;
-			}
-			/* now go down the right side of the tree. */
-			pred = &preds[pred->right];
-			move = MOVE_DOWN;
-			continue;
-		case MOVE_UP_FROM_RIGHT:
-			/* We finished this equation. */
-			if (pred == root)
-				break;
-			pred = get_pred_parent(pred, preds,
-					       pred->parent, &move);
-			continue;
-		}
-		done = 1;
-	} while (!done);
-
-	return match;
+	data.preds = preds = rcu_dereference_sched(filter->preds);
+	ret = walk_pred_tree(preds, root, filter_match_preds_cb, &data);
+	WARN_ON(ret);
+	return data.match;
 }
 EXPORT_SYMBOL_GPL(filter_match_preds);
 
-- 
cgit v1.2.2


From 1d0e78e380cd2802aa603a50e08220dfc681141c Mon Sep 17 00:00:00 2001
From: Jiri Olsa <jolsa@redhat.com>
Date: Thu, 11 Aug 2011 16:25:54 +0200
Subject: tracing/filter: Add startup tests for events filter

Adding automated tests running as late_initcall. Tests are
compiled in with CONFIG_FTRACE_STARTUP_TEST option.

Adding test event "ftrace_test_filter" used to simulate
filter processing during event occurance.

String filters are compiled and tested against several
test events with different values.

Also testing that evaluation of explicit predicates is ommited
due to the lazy filter evaluation.

Signed-off-by: Jiri Olsa <jolsa@redhat.com>
Link: http://lkml.kernel.org/r/1313072754-4620-11-git-send-email-jolsa@redhat.com
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/Makefile                   |   2 +
 kernel/trace/trace.h                    |   3 +
 kernel/trace/trace_events_filter.c      | 209 ++++++++++++++++++++++++++++++++
 kernel/trace/trace_events_filter_test.h |  50 ++++++++
 4 files changed, 264 insertions(+)
 create mode 100644 kernel/trace/trace_events_filter_test.h

(limited to 'kernel')

diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index 761c510a06c5..b384ed512bac 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -15,6 +15,8 @@ ifdef CONFIG_TRACING_BRANCHES
 KBUILD_CFLAGS += -DDISABLE_BRANCH_PROFILING
 endif
 
+CFLAGS_trace_events_filter.o := -I$(src)
+
 #
 # Make the trace clocks available generally: it's infrastructure
 # relied on by ptrace for example:
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 2eb3cf6d37bc..4c7540ad5dcb 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -762,6 +762,9 @@ struct filter_pred {
 	u64 			val;
 	struct regex		regex;
 	unsigned short		*ops;
+#ifdef CONFIG_FTRACE_STARTUP_TEST
+	struct ftrace_event_field *field;
+#endif
 	int 			offset;
 	int 			not;
 	int 			op;
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index 319c3cac7d95..6a642e278241 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -1329,6 +1329,9 @@ static struct filter_pred *create_pred(struct filter_parse_state *ps,
 	strcpy(pred.regex.pattern, operand2);
 	pred.regex.len = strlen(pred.regex.pattern);
 
+#ifdef CONFIG_FTRACE_STARTUP_TEST
+	pred.field = field;
+#endif
 	return init_pred(ps, field, &pred) ? NULL : &pred;
 }
 
@@ -1926,3 +1929,209 @@ out_unlock:
 
 #endif /* CONFIG_PERF_EVENTS */
 
+#ifdef CONFIG_FTRACE_STARTUP_TEST
+
+#include <linux/types.h>
+#include <linux/tracepoint.h>
+
+#define CREATE_TRACE_POINTS
+#include "trace_events_filter_test.h"
+
+static int test_get_filter(char *filter_str, struct ftrace_event_call *call,
+			   struct event_filter **pfilter)
+{
+	struct event_filter *filter;
+	struct filter_parse_state *ps;
+	int err = -ENOMEM;
+
+	filter = __alloc_filter();
+	if (!filter)
+		goto out;
+
+	ps = kzalloc(sizeof(*ps), GFP_KERNEL);
+	if (!ps)
+		goto free_filter;
+
+	parse_init(ps, filter_ops, filter_str);
+	err = filter_parse(ps);
+	if (err)
+		goto free_ps;
+
+	err = replace_preds(call, filter, ps, filter_str, false);
+	if (!err)
+		*pfilter = filter;
+
+ free_ps:
+	filter_opstack_clear(ps);
+	postfix_clear(ps);
+	kfree(ps);
+
+ free_filter:
+	if (err)
+		__free_filter(filter);
+
+ out:
+	return err;
+}
+
+#define DATA_REC(m, va, vb, vc, vd, ve, vf, vg, vh, nvisit) \
+{ \
+	.filter = FILTER, \
+	.rec    = { .a = va, .b = vb, .c = vc, .d = vd, \
+		    .e = ve, .f = vf, .g = vg, .h = vh }, \
+	.match  = m, \
+	.not_visited = nvisit, \
+}
+#define YES 1
+#define NO  0
+
+static struct test_filter_data_t {
+	char *filter;
+	struct ftrace_raw_ftrace_test_filter rec;
+	int match;
+	char *not_visited;
+} test_filter_data[] = {
+#define FILTER "a == 1 && b == 1 && c == 1 && d == 1 && " \
+	       "e == 1 && f == 1 && g == 1 && h == 1"
+	DATA_REC(YES, 1, 1, 1, 1, 1, 1, 1, 1, ""),
+	DATA_REC(NO,  0, 1, 1, 1, 1, 1, 1, 1, "bcdefgh"),
+	DATA_REC(NO,  1, 1, 1, 1, 1, 1, 1, 0, ""),
+#undef FILTER
+#define FILTER "a == 1 || b == 1 || c == 1 || d == 1 || " \
+	       "e == 1 || f == 1 || g == 1 || h == 1"
+	DATA_REC(NO,  0, 0, 0, 0, 0, 0, 0, 0, ""),
+	DATA_REC(YES, 0, 0, 0, 0, 0, 0, 0, 1, ""),
+	DATA_REC(YES, 1, 0, 0, 0, 0, 0, 0, 0, "bcdefgh"),
+#undef FILTER
+#define FILTER "(a == 1 || b == 1) && (c == 1 || d == 1) && " \
+	       "(e == 1 || f == 1) && (g == 1 || h == 1)"
+	DATA_REC(NO,  0, 0, 1, 1, 1, 1, 1, 1, "dfh"),
+	DATA_REC(YES, 0, 1, 0, 1, 0, 1, 0, 1, ""),
+	DATA_REC(YES, 1, 0, 1, 0, 0, 1, 0, 1, "bd"),
+	DATA_REC(NO,  1, 0, 1, 0, 0, 1, 0, 0, "bd"),
+#undef FILTER
+#define FILTER "(a == 1 && b == 1) || (c == 1 && d == 1) || " \
+	       "(e == 1 && f == 1) || (g == 1 && h == 1)"
+	DATA_REC(YES, 1, 0, 1, 1, 1, 1, 1, 1, "efgh"),
+	DATA_REC(YES, 0, 0, 0, 0, 0, 0, 1, 1, ""),
+	DATA_REC(NO,  0, 0, 0, 0, 0, 0, 0, 1, ""),
+#undef FILTER
+#define FILTER "(a == 1 && b == 1) && (c == 1 && d == 1) && " \
+	       "(e == 1 && f == 1) || (g == 1 && h == 1)"
+	DATA_REC(YES, 1, 1, 1, 1, 1, 1, 0, 0, "gh"),
+	DATA_REC(NO,  0, 0, 0, 0, 0, 0, 0, 1, ""),
+	DATA_REC(YES, 1, 1, 1, 1, 1, 0, 1, 1, ""),
+#undef FILTER
+#define FILTER "((a == 1 || b == 1) || (c == 1 || d == 1) || " \
+	       "(e == 1 || f == 1)) && (g == 1 || h == 1)"
+	DATA_REC(YES, 1, 1, 1, 1, 1, 1, 0, 1, "bcdef"),
+	DATA_REC(NO,  0, 0, 0, 0, 0, 0, 0, 0, ""),
+	DATA_REC(YES, 1, 1, 1, 1, 1, 0, 1, 1, "h"),
+#undef FILTER
+#define FILTER "((((((((a == 1) && (b == 1)) || (c == 1)) && (d == 1)) || " \
+	       "(e == 1)) && (f == 1)) || (g == 1)) && (h == 1))"
+	DATA_REC(YES, 1, 1, 1, 1, 1, 1, 1, 1, "ceg"),
+	DATA_REC(NO,  0, 1, 0, 1, 0, 1, 0, 1, ""),
+	DATA_REC(NO,  1, 0, 1, 0, 1, 0, 1, 0, ""),
+#undef FILTER
+#define FILTER "((((((((a == 1) || (b == 1)) && (c == 1)) || (d == 1)) && " \
+	       "(e == 1)) || (f == 1)) && (g == 1)) || (h == 1))"
+	DATA_REC(YES, 1, 1, 1, 1, 1, 1, 1, 1, "bdfh"),
+	DATA_REC(YES, 0, 1, 0, 1, 0, 1, 0, 1, ""),
+	DATA_REC(YES, 1, 0, 1, 0, 1, 0, 1, 0, "bdfh"),
+};
+
+#undef DATA_REC
+#undef FILTER
+#undef YES
+#undef NO
+
+#define DATA_CNT (sizeof(test_filter_data)/sizeof(struct test_filter_data_t))
+
+static int test_pred_visited;
+
+static int test_pred_visited_fn(struct filter_pred *pred, void *event)
+{
+	struct ftrace_event_field *field = pred->field;
+
+	test_pred_visited = 1;
+	printk(KERN_INFO "\npred visited %s\n", field->name);
+	return 1;
+}
+
+static int test_walk_pred_cb(enum move_type move, struct filter_pred *pred,
+			     int *err, void *data)
+{
+	char *fields = data;
+
+	if ((move == MOVE_DOWN) &&
+	    (pred->left == FILTER_PRED_INVALID)) {
+		struct ftrace_event_field *field = pred->field;
+
+		if (!field) {
+			WARN(1, "all leafs should have field defined");
+			return WALK_PRED_DEFAULT;
+		}
+		if (!strchr(fields, *field->name))
+			return WALK_PRED_DEFAULT;
+
+		WARN_ON(!pred->fn);
+		pred->fn = test_pred_visited_fn;
+	}
+	return WALK_PRED_DEFAULT;
+}
+
+static __init int ftrace_test_event_filter(void)
+{
+	int i;
+
+	printk(KERN_INFO "Testing ftrace filter: ");
+
+	for (i = 0; i < DATA_CNT; i++) {
+		struct event_filter *filter = NULL;
+		struct test_filter_data_t *d = &test_filter_data[i];
+		int err;
+
+		err = test_get_filter(d->filter, &event_ftrace_test_filter,
+				      &filter);
+		if (err) {
+			printk(KERN_INFO
+			       "Failed to get filter for '%s', err %d\n",
+			       d->filter, err);
+			break;
+		}
+
+		if (*d->not_visited)
+			walk_pred_tree(filter->preds, filter->root,
+				       test_walk_pred_cb,
+				       d->not_visited);
+
+		test_pred_visited = 0;
+		err = filter_match_preds(filter, &d->rec);
+
+		__free_filter(filter);
+
+		if (test_pred_visited) {
+			printk(KERN_INFO
+			       "Failed, unwanted pred visited for filter %s\n",
+			       d->filter);
+			break;
+		}
+
+		if (err != d->match) {
+			printk(KERN_INFO
+			       "Failed to match filter '%s', expected %d\n",
+			       d->filter, d->match);
+			break;
+		}
+	}
+
+	if (i == DATA_CNT)
+		printk(KERN_CONT "OK\n");
+
+	return 0;
+}
+
+late_initcall(ftrace_test_event_filter);
+
+#endif /* CONFIG_FTRACE_STARTUP_TEST */
diff --git a/kernel/trace/trace_events_filter_test.h b/kernel/trace/trace_events_filter_test.h
new file mode 100644
index 000000000000..bfd4dba0d603
--- /dev/null
+++ b/kernel/trace/trace_events_filter_test.h
@@ -0,0 +1,50 @@
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM test
+
+#if !defined(_TRACE_TEST_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_TEST_H
+
+#include <linux/tracepoint.h>
+
+TRACE_EVENT(ftrace_test_filter,
+
+	TP_PROTO(int a, int b, int c, int d, int e, int f, int g, int h),
+
+	TP_ARGS(a, b, c, d, e, f, g, h),
+
+	TP_STRUCT__entry(
+		__field(int, a)
+		__field(int, b)
+		__field(int, c)
+		__field(int, d)
+		__field(int, e)
+		__field(int, f)
+		__field(int, g)
+		__field(int, h)
+	),
+
+	TP_fast_assign(
+		__entry->a = a;
+		__entry->b = b;
+		__entry->c = c;
+		__entry->d = d;
+		__entry->e = e;
+		__entry->f = f;
+		__entry->g = g;
+		__entry->h = h;
+	),
+
+	TP_printk("a %d, b %d, c %d, d %d, e %d, f %d, g %d, h %d",
+		  __entry->a, __entry->b, __entry->c, __entry->d,
+		  __entry->e, __entry->f, __entry->g, __entry->h)
+);
+
+#endif /* _TRACE_TEST_H || TRACE_HEADER_MULTI_READ */
+
+#undef TRACE_INCLUDE_PATH
+#undef TRACE_INCLUDE_FILE
+#define TRACE_INCLUDE_PATH .
+#define TRACE_INCLUDE_FILE trace_events_filter_test
+
+/* This part must be outside protection */
+#include <trace/define_trace.h>
-- 
cgit v1.2.2


From e8db0be1245de16a6cc6365506abc392c3c212d4 Mon Sep 17 00:00:00 2001
From: Jean Pihet <j-pihet@ti.com>
Date: Thu, 25 Aug 2011 15:35:03 +0200
Subject: PM QoS: Move and rename the implementation files

The PM QoS implementation files are better named
kernel/power/qos.c and include/linux/pm_qos.h.

The PM QoS support is compiled under the CONFIG_PM option.

Signed-off-by: Jean Pihet <j-pihet@ti.com>
Acked-by: markgross <markgross@thegnar.org>
Reviewed-by: Kevin Hilman <khilman@ti.com>
Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
---
 kernel/Makefile        |   2 +-
 kernel/pm_qos_params.c | 481 -------------------------------------------------
 kernel/power/Makefile  |   2 +-
 kernel/power/qos.c     | 481 +++++++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 483 insertions(+), 483 deletions(-)
 delete mode 100644 kernel/pm_qos_params.c
 create mode 100644 kernel/power/qos.c

(limited to 'kernel')

diff --git a/kernel/Makefile b/kernel/Makefile
index eca595e2fd52..2da48d3515eb 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -9,7 +9,7 @@ obj-y     = sched.o fork.o exec_domain.o panic.o printk.o \
 	    rcupdate.o extable.o params.o posix-timers.o \
 	    kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \
 	    hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \
-	    notifier.o ksysfs.o pm_qos_params.o sched_clock.o cred.o \
+	    notifier.o ksysfs.o sched_clock.o cred.o \
 	    async.o range.o
 obj-y += groups.o
 
diff --git a/kernel/pm_qos_params.c b/kernel/pm_qos_params.c
deleted file mode 100644
index 37f05d0f0793..000000000000
--- a/kernel/pm_qos_params.c
+++ /dev/null
@@ -1,481 +0,0 @@
-/*
- * This module exposes the interface to kernel space for specifying
- * QoS dependencies.  It provides infrastructure for registration of:
- *
- * Dependents on a QoS value : register requests
- * Watchers of QoS value : get notified when target QoS value changes
- *
- * This QoS design is best effort based.  Dependents register their QoS needs.
- * Watchers register to keep track of the current QoS needs of the system.
- *
- * There are 3 basic classes of QoS parameter: latency, timeout, throughput
- * each have defined units:
- * latency: usec
- * timeout: usec <-- currently not used.
- * throughput: kbs (kilo byte / sec)
- *
- * There are lists of pm_qos_objects each one wrapping requests, notifiers
- *
- * User mode requests on a QOS parameter register themselves to the
- * subsystem by opening the device node /dev/... and writing there request to
- * the node.  As long as the process holds a file handle open to the node the
- * client continues to be accounted for.  Upon file release the usermode
- * request is removed and a new qos target is computed.  This way when the
- * request that the application has is cleaned up when closes the file
- * pointer or exits the pm_qos_object will get an opportunity to clean up.
- *
- * Mark Gross <mgross@linux.intel.com>
- */
-
-/*#define DEBUG*/
-
-#include <linux/pm_qos_params.h>
-#include <linux/sched.h>
-#include <linux/spinlock.h>
-#include <linux/slab.h>
-#include <linux/time.h>
-#include <linux/fs.h>
-#include <linux/device.h>
-#include <linux/miscdevice.h>
-#include <linux/string.h>
-#include <linux/platform_device.h>
-#include <linux/init.h>
-#include <linux/kernel.h>
-
-#include <linux/uaccess.h>
-
-/*
- * locking rule: all changes to requests or notifiers lists
- * or pm_qos_object list and pm_qos_objects need to happen with pm_qos_lock
- * held, taken with _irqsave.  One lock to rule them all
- */
-enum pm_qos_type {
-	PM_QOS_MAX,		/* return the largest value */
-	PM_QOS_MIN		/* return the smallest value */
-};
-
-/*
- * Note: The lockless read path depends on the CPU accessing
- * target_value atomically.  Atomic access is only guaranteed on all CPU
- * types linux supports for 32 bit quantites
- */
-struct pm_qos_object {
-	struct plist_head requests;
-	struct blocking_notifier_head *notifiers;
-	struct miscdevice pm_qos_power_miscdev;
-	char *name;
-	s32 target_value;	/* Do not change to 64 bit */
-	s32 default_value;
-	enum pm_qos_type type;
-};
-
-static DEFINE_SPINLOCK(pm_qos_lock);
-
-static struct pm_qos_object null_pm_qos;
-static BLOCKING_NOTIFIER_HEAD(cpu_dma_lat_notifier);
-static struct pm_qos_object cpu_dma_pm_qos = {
-	.requests = PLIST_HEAD_INIT(cpu_dma_pm_qos.requests),
-	.notifiers = &cpu_dma_lat_notifier,
-	.name = "cpu_dma_latency",
-	.target_value = PM_QOS_CPU_DMA_LAT_DEFAULT_VALUE,
-	.default_value = PM_QOS_CPU_DMA_LAT_DEFAULT_VALUE,
-	.type = PM_QOS_MIN,
-};
-
-static BLOCKING_NOTIFIER_HEAD(network_lat_notifier);
-static struct pm_qos_object network_lat_pm_qos = {
-	.requests = PLIST_HEAD_INIT(network_lat_pm_qos.requests),
-	.notifiers = &network_lat_notifier,
-	.name = "network_latency",
-	.target_value = PM_QOS_NETWORK_LAT_DEFAULT_VALUE,
-	.default_value = PM_QOS_NETWORK_LAT_DEFAULT_VALUE,
-	.type = PM_QOS_MIN
-};
-
-
-static BLOCKING_NOTIFIER_HEAD(network_throughput_notifier);
-static struct pm_qos_object network_throughput_pm_qos = {
-	.requests = PLIST_HEAD_INIT(network_throughput_pm_qos.requests),
-	.notifiers = &network_throughput_notifier,
-	.name = "network_throughput",
-	.target_value = PM_QOS_NETWORK_THROUGHPUT_DEFAULT_VALUE,
-	.default_value = PM_QOS_NETWORK_THROUGHPUT_DEFAULT_VALUE,
-	.type = PM_QOS_MAX,
-};
-
-
-static struct pm_qos_object *pm_qos_array[] = {
-	&null_pm_qos,
-	&cpu_dma_pm_qos,
-	&network_lat_pm_qos,
-	&network_throughput_pm_qos
-};
-
-static ssize_t pm_qos_power_write(struct file *filp, const char __user *buf,
-		size_t count, loff_t *f_pos);
-static ssize_t pm_qos_power_read(struct file *filp, char __user *buf,
-		size_t count, loff_t *f_pos);
-static int pm_qos_power_open(struct inode *inode, struct file *filp);
-static int pm_qos_power_release(struct inode *inode, struct file *filp);
-
-static const struct file_operations pm_qos_power_fops = {
-	.write = pm_qos_power_write,
-	.read = pm_qos_power_read,
-	.open = pm_qos_power_open,
-	.release = pm_qos_power_release,
-	.llseek = noop_llseek,
-};
-
-/* unlocked internal variant */
-static inline int pm_qos_get_value(struct pm_qos_object *o)
-{
-	if (plist_head_empty(&o->requests))
-		return o->default_value;
-
-	switch (o->type) {
-	case PM_QOS_MIN:
-		return plist_first(&o->requests)->prio;
-
-	case PM_QOS_MAX:
-		return plist_last(&o->requests)->prio;
-
-	default:
-		/* runtime check for not using enum */
-		BUG();
-	}
-}
-
-static inline s32 pm_qos_read_value(struct pm_qos_object *o)
-{
-	return o->target_value;
-}
-
-static inline void pm_qos_set_value(struct pm_qos_object *o, s32 value)
-{
-	o->target_value = value;
-}
-
-static void update_target(struct pm_qos_object *o, struct plist_node *node,
-			  int del, int value)
-{
-	unsigned long flags;
-	int prev_value, curr_value;
-
-	spin_lock_irqsave(&pm_qos_lock, flags);
-	prev_value = pm_qos_get_value(o);
-	/* PM_QOS_DEFAULT_VALUE is a signal that the value is unchanged */
-	if (value != PM_QOS_DEFAULT_VALUE) {
-		/*
-		 * to change the list, we atomically remove, reinit
-		 * with new value and add, then see if the extremal
-		 * changed
-		 */
-		plist_del(node, &o->requests);
-		plist_node_init(node, value);
-		plist_add(node, &o->requests);
-	} else if (del) {
-		plist_del(node, &o->requests);
-	} else {
-		plist_add(node, &o->requests);
-	}
-	curr_value = pm_qos_get_value(o);
-	pm_qos_set_value(o, curr_value);
-	spin_unlock_irqrestore(&pm_qos_lock, flags);
-
-	if (prev_value != curr_value)
-		blocking_notifier_call_chain(o->notifiers,
-					     (unsigned long)curr_value,
-					     NULL);
-}
-
-static int register_pm_qos_misc(struct pm_qos_object *qos)
-{
-	qos->pm_qos_power_miscdev.minor = MISC_DYNAMIC_MINOR;
-	qos->pm_qos_power_miscdev.name = qos->name;
-	qos->pm_qos_power_miscdev.fops = &pm_qos_power_fops;
-
-	return misc_register(&qos->pm_qos_power_miscdev);
-}
-
-static int find_pm_qos_object_by_minor(int minor)
-{
-	int pm_qos_class;
-
-	for (pm_qos_class = 0;
-		pm_qos_class < PM_QOS_NUM_CLASSES; pm_qos_class++) {
-		if (minor ==
-			pm_qos_array[pm_qos_class]->pm_qos_power_miscdev.minor)
-			return pm_qos_class;
-	}
-	return -1;
-}
-
-/**
- * pm_qos_request - returns current system wide qos expectation
- * @pm_qos_class: identification of which qos value is requested
- *
- * This function returns the current target value.
- */
-int pm_qos_request(int pm_qos_class)
-{
-	return pm_qos_read_value(pm_qos_array[pm_qos_class]);
-}
-EXPORT_SYMBOL_GPL(pm_qos_request);
-
-int pm_qos_request_active(struct pm_qos_request_list *req)
-{
-	return req->pm_qos_class != 0;
-}
-EXPORT_SYMBOL_GPL(pm_qos_request_active);
-
-/**
- * pm_qos_add_request - inserts new qos request into the list
- * @dep: pointer to a preallocated handle
- * @pm_qos_class: identifies which list of qos request to use
- * @value: defines the qos request
- *
- * This function inserts a new entry in the pm_qos_class list of requested qos
- * performance characteristics.  It recomputes the aggregate QoS expectations
- * for the pm_qos_class of parameters and initializes the pm_qos_request_list
- * handle.  Caller needs to save this handle for later use in updates and
- * removal.
- */
-
-void pm_qos_add_request(struct pm_qos_request_list *dep,
-			int pm_qos_class, s32 value)
-{
-	struct pm_qos_object *o =  pm_qos_array[pm_qos_class];
-	int new_value;
-
-	if (pm_qos_request_active(dep)) {
-		WARN(1, KERN_ERR "pm_qos_add_request() called for already added request\n");
-		return;
-	}
-	if (value == PM_QOS_DEFAULT_VALUE)
-		new_value = o->default_value;
-	else
-		new_value = value;
-	plist_node_init(&dep->list, new_value);
-	dep->pm_qos_class = pm_qos_class;
-	update_target(o, &dep->list, 0, PM_QOS_DEFAULT_VALUE);
-}
-EXPORT_SYMBOL_GPL(pm_qos_add_request);
-
-/**
- * pm_qos_update_request - modifies an existing qos request
- * @pm_qos_req : handle to list element holding a pm_qos request to use
- * @value: defines the qos request
- *
- * Updates an existing qos request for the pm_qos_class of parameters along
- * with updating the target pm_qos_class value.
- *
- * Attempts are made to make this code callable on hot code paths.
- */
-void pm_qos_update_request(struct pm_qos_request_list *pm_qos_req,
-			   s32 new_value)
-{
-	s32 temp;
-	struct pm_qos_object *o;
-
-	if (!pm_qos_req) /*guard against callers passing in null */
-		return;
-
-	if (!pm_qos_request_active(pm_qos_req)) {
-		WARN(1, KERN_ERR "pm_qos_update_request() called for unknown object\n");
-		return;
-	}
-
-	o = pm_qos_array[pm_qos_req->pm_qos_class];
-
-	if (new_value == PM_QOS_DEFAULT_VALUE)
-		temp = o->default_value;
-	else
-		temp = new_value;
-
-	if (temp != pm_qos_req->list.prio)
-		update_target(o, &pm_qos_req->list, 0, temp);
-}
-EXPORT_SYMBOL_GPL(pm_qos_update_request);
-
-/**
- * pm_qos_remove_request - modifies an existing qos request
- * @pm_qos_req: handle to request list element
- *
- * Will remove pm qos request from the list of requests and
- * recompute the current target value for the pm_qos_class.  Call this
- * on slow code paths.
- */
-void pm_qos_remove_request(struct pm_qos_request_list *pm_qos_req)
-{
-	struct pm_qos_object *o;
-
-	if (pm_qos_req == NULL)
-		return;
-		/* silent return to keep pcm code cleaner */
-
-	if (!pm_qos_request_active(pm_qos_req)) {
-		WARN(1, KERN_ERR "pm_qos_remove_request() called for unknown object\n");
-		return;
-	}
-
-	o = pm_qos_array[pm_qos_req->pm_qos_class];
-	update_target(o, &pm_qos_req->list, 1, PM_QOS_DEFAULT_VALUE);
-	memset(pm_qos_req, 0, sizeof(*pm_qos_req));
-}
-EXPORT_SYMBOL_GPL(pm_qos_remove_request);
-
-/**
- * pm_qos_add_notifier - sets notification entry for changes to target value
- * @pm_qos_class: identifies which qos target changes should be notified.
- * @notifier: notifier block managed by caller.
- *
- * will register the notifier into a notification chain that gets called
- * upon changes to the pm_qos_class target value.
- */
-int pm_qos_add_notifier(int pm_qos_class, struct notifier_block *notifier)
-{
-	int retval;
-
-	retval = blocking_notifier_chain_register(
-			pm_qos_array[pm_qos_class]->notifiers, notifier);
-
-	return retval;
-}
-EXPORT_SYMBOL_GPL(pm_qos_add_notifier);
-
-/**
- * pm_qos_remove_notifier - deletes notification entry from chain.
- * @pm_qos_class: identifies which qos target changes are notified.
- * @notifier: notifier block to be removed.
- *
- * will remove the notifier from the notification chain that gets called
- * upon changes to the pm_qos_class target value.
- */
-int pm_qos_remove_notifier(int pm_qos_class, struct notifier_block *notifier)
-{
-	int retval;
-
-	retval = blocking_notifier_chain_unregister(
-			pm_qos_array[pm_qos_class]->notifiers, notifier);
-
-	return retval;
-}
-EXPORT_SYMBOL_GPL(pm_qos_remove_notifier);
-
-static int pm_qos_power_open(struct inode *inode, struct file *filp)
-{
-	long pm_qos_class;
-
-	pm_qos_class = find_pm_qos_object_by_minor(iminor(inode));
-	if (pm_qos_class >= 0) {
-               struct pm_qos_request_list *req = kzalloc(sizeof(*req), GFP_KERNEL);
-		if (!req)
-			return -ENOMEM;
-
-		pm_qos_add_request(req, pm_qos_class, PM_QOS_DEFAULT_VALUE);
-		filp->private_data = req;
-
-		if (filp->private_data)
-			return 0;
-	}
-	return -EPERM;
-}
-
-static int pm_qos_power_release(struct inode *inode, struct file *filp)
-{
-	struct pm_qos_request_list *req;
-
-	req = filp->private_data;
-	pm_qos_remove_request(req);
-	kfree(req);
-
-	return 0;
-}
-
-
-static ssize_t pm_qos_power_read(struct file *filp, char __user *buf,
-		size_t count, loff_t *f_pos)
-{
-	s32 value;
-	unsigned long flags;
-	struct pm_qos_object *o;
-	struct pm_qos_request_list *pm_qos_req = filp->private_data;
-
-	if (!pm_qos_req)
-		return -EINVAL;
-	if (!pm_qos_request_active(pm_qos_req))
-		return -EINVAL;
-
-	o = pm_qos_array[pm_qos_req->pm_qos_class];
-	spin_lock_irqsave(&pm_qos_lock, flags);
-	value = pm_qos_get_value(o);
-	spin_unlock_irqrestore(&pm_qos_lock, flags);
-
-	return simple_read_from_buffer(buf, count, f_pos, &value, sizeof(s32));
-}
-
-static ssize_t pm_qos_power_write(struct file *filp, const char __user *buf,
-		size_t count, loff_t *f_pos)
-{
-	s32 value;
-	struct pm_qos_request_list *pm_qos_req;
-
-	if (count == sizeof(s32)) {
-		if (copy_from_user(&value, buf, sizeof(s32)))
-			return -EFAULT;
-	} else if (count <= 11) { /* ASCII perhaps? */
-		char ascii_value[11];
-		unsigned long int ulval;
-		int ret;
-
-		if (copy_from_user(ascii_value, buf, count))
-			return -EFAULT;
-
-		if (count > 10) {
-			if (ascii_value[10] == '\n')
-				ascii_value[10] = '\0';
-			else
-				return -EINVAL;
-		} else {
-			ascii_value[count] = '\0';
-		}
-		ret = strict_strtoul(ascii_value, 16, &ulval);
-		if (ret) {
-			pr_debug("%s, 0x%lx, 0x%x\n", ascii_value, ulval, ret);
-			return -EINVAL;
-		}
-		value = (s32)lower_32_bits(ulval);
-	} else {
-		return -EINVAL;
-	}
-
-	pm_qos_req = filp->private_data;
-	pm_qos_update_request(pm_qos_req, value);
-
-	return count;
-}
-
-
-static int __init pm_qos_power_init(void)
-{
-	int ret = 0;
-
-	ret = register_pm_qos_misc(&cpu_dma_pm_qos);
-	if (ret < 0) {
-		printk(KERN_ERR "pm_qos_param: cpu_dma_latency setup failed\n");
-		return ret;
-	}
-	ret = register_pm_qos_misc(&network_lat_pm_qos);
-	if (ret < 0) {
-		printk(KERN_ERR "pm_qos_param: network_latency setup failed\n");
-		return ret;
-	}
-	ret = register_pm_qos_misc(&network_throughput_pm_qos);
-	if (ret < 0)
-		printk(KERN_ERR
-			"pm_qos_param: network_throughput setup failed\n");
-
-	return ret;
-}
-
-late_initcall(pm_qos_power_init);
diff --git a/kernel/power/Makefile b/kernel/power/Makefile
index c5ebc6a90643..ad6bdd8b401a 100644
--- a/kernel/power/Makefile
+++ b/kernel/power/Makefile
@@ -1,7 +1,7 @@
 
 ccflags-$(CONFIG_PM_DEBUG)	:= -DDEBUG
 
-obj-$(CONFIG_PM)		+= main.o
+obj-$(CONFIG_PM)		+= main.o qos.o
 obj-$(CONFIG_PM_SLEEP)		+= console.o
 obj-$(CONFIG_FREEZER)		+= process.o
 obj-$(CONFIG_SUSPEND)		+= suspend.o
diff --git a/kernel/power/qos.c b/kernel/power/qos.c
new file mode 100644
index 000000000000..61b47384329e
--- /dev/null
+++ b/kernel/power/qos.c
@@ -0,0 +1,481 @@
+/*
+ * This module exposes the interface to kernel space for specifying
+ * QoS dependencies.  It provides infrastructure for registration of:
+ *
+ * Dependents on a QoS value : register requests
+ * Watchers of QoS value : get notified when target QoS value changes
+ *
+ * This QoS design is best effort based.  Dependents register their QoS needs.
+ * Watchers register to keep track of the current QoS needs of the system.
+ *
+ * There are 3 basic classes of QoS parameter: latency, timeout, throughput
+ * each have defined units:
+ * latency: usec
+ * timeout: usec <-- currently not used.
+ * throughput: kbs (kilo byte / sec)
+ *
+ * There are lists of pm_qos_objects each one wrapping requests, notifiers
+ *
+ * User mode requests on a QOS parameter register themselves to the
+ * subsystem by opening the device node /dev/... and writing there request to
+ * the node.  As long as the process holds a file handle open to the node the
+ * client continues to be accounted for.  Upon file release the usermode
+ * request is removed and a new qos target is computed.  This way when the
+ * request that the application has is cleaned up when closes the file
+ * pointer or exits the pm_qos_object will get an opportunity to clean up.
+ *
+ * Mark Gross <mgross@linux.intel.com>
+ */
+
+/*#define DEBUG*/
+
+#include <linux/pm_qos.h>
+#include <linux/sched.h>
+#include <linux/spinlock.h>
+#include <linux/slab.h>
+#include <linux/time.h>
+#include <linux/fs.h>
+#include <linux/device.h>
+#include <linux/miscdevice.h>
+#include <linux/string.h>
+#include <linux/platform_device.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+
+#include <linux/uaccess.h>
+
+/*
+ * locking rule: all changes to requests or notifiers lists
+ * or pm_qos_object list and pm_qos_objects need to happen with pm_qos_lock
+ * held, taken with _irqsave.  One lock to rule them all
+ */
+enum pm_qos_type {
+	PM_QOS_MAX,		/* return the largest value */
+	PM_QOS_MIN		/* return the smallest value */
+};
+
+/*
+ * Note: The lockless read path depends on the CPU accessing
+ * target_value atomically.  Atomic access is only guaranteed on all CPU
+ * types linux supports for 32 bit quantites
+ */
+struct pm_qos_object {
+	struct plist_head requests;
+	struct blocking_notifier_head *notifiers;
+	struct miscdevice pm_qos_power_miscdev;
+	char *name;
+	s32 target_value;	/* Do not change to 64 bit */
+	s32 default_value;
+	enum pm_qos_type type;
+};
+
+static DEFINE_SPINLOCK(pm_qos_lock);
+
+static struct pm_qos_object null_pm_qos;
+static BLOCKING_NOTIFIER_HEAD(cpu_dma_lat_notifier);
+static struct pm_qos_object cpu_dma_pm_qos = {
+	.requests = PLIST_HEAD_INIT(cpu_dma_pm_qos.requests),
+	.notifiers = &cpu_dma_lat_notifier,
+	.name = "cpu_dma_latency",
+	.target_value = PM_QOS_CPU_DMA_LAT_DEFAULT_VALUE,
+	.default_value = PM_QOS_CPU_DMA_LAT_DEFAULT_VALUE,
+	.type = PM_QOS_MIN,
+};
+
+static BLOCKING_NOTIFIER_HEAD(network_lat_notifier);
+static struct pm_qos_object network_lat_pm_qos = {
+	.requests = PLIST_HEAD_INIT(network_lat_pm_qos.requests),
+	.notifiers = &network_lat_notifier,
+	.name = "network_latency",
+	.target_value = PM_QOS_NETWORK_LAT_DEFAULT_VALUE,
+	.default_value = PM_QOS_NETWORK_LAT_DEFAULT_VALUE,
+	.type = PM_QOS_MIN
+};
+
+
+static BLOCKING_NOTIFIER_HEAD(network_throughput_notifier);
+static struct pm_qos_object network_throughput_pm_qos = {
+	.requests = PLIST_HEAD_INIT(network_throughput_pm_qos.requests),
+	.notifiers = &network_throughput_notifier,
+	.name = "network_throughput",
+	.target_value = PM_QOS_NETWORK_THROUGHPUT_DEFAULT_VALUE,
+	.default_value = PM_QOS_NETWORK_THROUGHPUT_DEFAULT_VALUE,
+	.type = PM_QOS_MAX,
+};
+
+
+static struct pm_qos_object *pm_qos_array[] = {
+	&null_pm_qos,
+	&cpu_dma_pm_qos,
+	&network_lat_pm_qos,
+	&network_throughput_pm_qos
+};
+
+static ssize_t pm_qos_power_write(struct file *filp, const char __user *buf,
+		size_t count, loff_t *f_pos);
+static ssize_t pm_qos_power_read(struct file *filp, char __user *buf,
+		size_t count, loff_t *f_pos);
+static int pm_qos_power_open(struct inode *inode, struct file *filp);
+static int pm_qos_power_release(struct inode *inode, struct file *filp);
+
+static const struct file_operations pm_qos_power_fops = {
+	.write = pm_qos_power_write,
+	.read = pm_qos_power_read,
+	.open = pm_qos_power_open,
+	.release = pm_qos_power_release,
+	.llseek = noop_llseek,
+};
+
+/* unlocked internal variant */
+static inline int pm_qos_get_value(struct pm_qos_object *o)
+{
+	if (plist_head_empty(&o->requests))
+		return o->default_value;
+
+	switch (o->type) {
+	case PM_QOS_MIN:
+		return plist_first(&o->requests)->prio;
+
+	case PM_QOS_MAX:
+		return plist_last(&o->requests)->prio;
+
+	default:
+		/* runtime check for not using enum */
+		BUG();
+	}
+}
+
+static inline s32 pm_qos_read_value(struct pm_qos_object *o)
+{
+	return o->target_value;
+}
+
+static inline void pm_qos_set_value(struct pm_qos_object *o, s32 value)
+{
+	o->target_value = value;
+}
+
+static void update_target(struct pm_qos_object *o, struct plist_node *node,
+			  int del, int value)
+{
+	unsigned long flags;
+	int prev_value, curr_value;
+
+	spin_lock_irqsave(&pm_qos_lock, flags);
+	prev_value = pm_qos_get_value(o);
+	/* PM_QOS_DEFAULT_VALUE is a signal that the value is unchanged */
+	if (value != PM_QOS_DEFAULT_VALUE) {
+		/*
+		 * to change the list, we atomically remove, reinit
+		 * with new value and add, then see if the extremal
+		 * changed
+		 */
+		plist_del(node, &o->requests);
+		plist_node_init(node, value);
+		plist_add(node, &o->requests);
+	} else if (del) {
+		plist_del(node, &o->requests);
+	} else {
+		plist_add(node, &o->requests);
+	}
+	curr_value = pm_qos_get_value(o);
+	pm_qos_set_value(o, curr_value);
+	spin_unlock_irqrestore(&pm_qos_lock, flags);
+
+	if (prev_value != curr_value)
+		blocking_notifier_call_chain(o->notifiers,
+					     (unsigned long)curr_value,
+					     NULL);
+}
+
+static int register_pm_qos_misc(struct pm_qos_object *qos)
+{
+	qos->pm_qos_power_miscdev.minor = MISC_DYNAMIC_MINOR;
+	qos->pm_qos_power_miscdev.name = qos->name;
+	qos->pm_qos_power_miscdev.fops = &pm_qos_power_fops;
+
+	return misc_register(&qos->pm_qos_power_miscdev);
+}
+
+static int find_pm_qos_object_by_minor(int minor)
+{
+	int pm_qos_class;
+
+	for (pm_qos_class = 0;
+		pm_qos_class < PM_QOS_NUM_CLASSES; pm_qos_class++) {
+		if (minor ==
+			pm_qos_array[pm_qos_class]->pm_qos_power_miscdev.minor)
+			return pm_qos_class;
+	}
+	return -1;
+}
+
+/**
+ * pm_qos_request - returns current system wide qos expectation
+ * @pm_qos_class: identification of which qos value is requested
+ *
+ * This function returns the current target value.
+ */
+int pm_qos_request(int pm_qos_class)
+{
+	return pm_qos_read_value(pm_qos_array[pm_qos_class]);
+}
+EXPORT_SYMBOL_GPL(pm_qos_request);
+
+int pm_qos_request_active(struct pm_qos_request_list *req)
+{
+	return req->pm_qos_class != 0;
+}
+EXPORT_SYMBOL_GPL(pm_qos_request_active);
+
+/**
+ * pm_qos_add_request - inserts new qos request into the list
+ * @dep: pointer to a preallocated handle
+ * @pm_qos_class: identifies which list of qos request to use
+ * @value: defines the qos request
+ *
+ * This function inserts a new entry in the pm_qos_class list of requested qos
+ * performance characteristics.  It recomputes the aggregate QoS expectations
+ * for the pm_qos_class of parameters and initializes the pm_qos_request_list
+ * handle.  Caller needs to save this handle for later use in updates and
+ * removal.
+ */
+
+void pm_qos_add_request(struct pm_qos_request_list *dep,
+			int pm_qos_class, s32 value)
+{
+	struct pm_qos_object *o =  pm_qos_array[pm_qos_class];
+	int new_value;
+
+	if (pm_qos_request_active(dep)) {
+		WARN(1, KERN_ERR "pm_qos_add_request() called for already added request\n");
+		return;
+	}
+	if (value == PM_QOS_DEFAULT_VALUE)
+		new_value = o->default_value;
+	else
+		new_value = value;
+	plist_node_init(&dep->list, new_value);
+	dep->pm_qos_class = pm_qos_class;
+	update_target(o, &dep->list, 0, PM_QOS_DEFAULT_VALUE);
+}
+EXPORT_SYMBOL_GPL(pm_qos_add_request);
+
+/**
+ * pm_qos_update_request - modifies an existing qos request
+ * @pm_qos_req : handle to list element holding a pm_qos request to use
+ * @value: defines the qos request
+ *
+ * Updates an existing qos request for the pm_qos_class of parameters along
+ * with updating the target pm_qos_class value.
+ *
+ * Attempts are made to make this code callable on hot code paths.
+ */
+void pm_qos_update_request(struct pm_qos_request_list *pm_qos_req,
+			   s32 new_value)
+{
+	s32 temp;
+	struct pm_qos_object *o;
+
+	if (!pm_qos_req) /*guard against callers passing in null */
+		return;
+
+	if (!pm_qos_request_active(pm_qos_req)) {
+		WARN(1, KERN_ERR "pm_qos_update_request() called for unknown object\n");
+		return;
+	}
+
+	o = pm_qos_array[pm_qos_req->pm_qos_class];
+
+	if (new_value == PM_QOS_DEFAULT_VALUE)
+		temp = o->default_value;
+	else
+		temp = new_value;
+
+	if (temp != pm_qos_req->list.prio)
+		update_target(o, &pm_qos_req->list, 0, temp);
+}
+EXPORT_SYMBOL_GPL(pm_qos_update_request);
+
+/**
+ * pm_qos_remove_request - modifies an existing qos request
+ * @pm_qos_req: handle to request list element
+ *
+ * Will remove pm qos request from the list of requests and
+ * recompute the current target value for the pm_qos_class.  Call this
+ * on slow code paths.
+ */
+void pm_qos_remove_request(struct pm_qos_request_list *pm_qos_req)
+{
+	struct pm_qos_object *o;
+
+	if (pm_qos_req == NULL)
+		return;
+		/* silent return to keep pcm code cleaner */
+
+	if (!pm_qos_request_active(pm_qos_req)) {
+		WARN(1, KERN_ERR "pm_qos_remove_request() called for unknown object\n");
+		return;
+	}
+
+	o = pm_qos_array[pm_qos_req->pm_qos_class];
+	update_target(o, &pm_qos_req->list, 1, PM_QOS_DEFAULT_VALUE);
+	memset(pm_qos_req, 0, sizeof(*pm_qos_req));
+}
+EXPORT_SYMBOL_GPL(pm_qos_remove_request);
+
+/**
+ * pm_qos_add_notifier - sets notification entry for changes to target value
+ * @pm_qos_class: identifies which qos target changes should be notified.
+ * @notifier: notifier block managed by caller.
+ *
+ * will register the notifier into a notification chain that gets called
+ * upon changes to the pm_qos_class target value.
+ */
+int pm_qos_add_notifier(int pm_qos_class, struct notifier_block *notifier)
+{
+	int retval;
+
+	retval = blocking_notifier_chain_register(
+			pm_qos_array[pm_qos_class]->notifiers, notifier);
+
+	return retval;
+}
+EXPORT_SYMBOL_GPL(pm_qos_add_notifier);
+
+/**
+ * pm_qos_remove_notifier - deletes notification entry from chain.
+ * @pm_qos_class: identifies which qos target changes are notified.
+ * @notifier: notifier block to be removed.
+ *
+ * will remove the notifier from the notification chain that gets called
+ * upon changes to the pm_qos_class target value.
+ */
+int pm_qos_remove_notifier(int pm_qos_class, struct notifier_block *notifier)
+{
+	int retval;
+
+	retval = blocking_notifier_chain_unregister(
+			pm_qos_array[pm_qos_class]->notifiers, notifier);
+
+	return retval;
+}
+EXPORT_SYMBOL_GPL(pm_qos_remove_notifier);
+
+static int pm_qos_power_open(struct inode *inode, struct file *filp)
+{
+	long pm_qos_class;
+
+	pm_qos_class = find_pm_qos_object_by_minor(iminor(inode));
+	if (pm_qos_class >= 0) {
+               struct pm_qos_request_list *req = kzalloc(sizeof(*req), GFP_KERNEL);
+		if (!req)
+			return -ENOMEM;
+
+		pm_qos_add_request(req, pm_qos_class, PM_QOS_DEFAULT_VALUE);
+		filp->private_data = req;
+
+		if (filp->private_data)
+			return 0;
+	}
+	return -EPERM;
+}
+
+static int pm_qos_power_release(struct inode *inode, struct file *filp)
+{
+	struct pm_qos_request_list *req;
+
+	req = filp->private_data;
+	pm_qos_remove_request(req);
+	kfree(req);
+
+	return 0;
+}
+
+
+static ssize_t pm_qos_power_read(struct file *filp, char __user *buf,
+		size_t count, loff_t *f_pos)
+{
+	s32 value;
+	unsigned long flags;
+	struct pm_qos_object *o;
+	struct pm_qos_request_list *pm_qos_req = filp->private_data;
+
+	if (!pm_qos_req)
+		return -EINVAL;
+	if (!pm_qos_request_active(pm_qos_req))
+		return -EINVAL;
+
+	o = pm_qos_array[pm_qos_req->pm_qos_class];
+	spin_lock_irqsave(&pm_qos_lock, flags);
+	value = pm_qos_get_value(o);
+	spin_unlock_irqrestore(&pm_qos_lock, flags);
+
+	return simple_read_from_buffer(buf, count, f_pos, &value, sizeof(s32));
+}
+
+static ssize_t pm_qos_power_write(struct file *filp, const char __user *buf,
+		size_t count, loff_t *f_pos)
+{
+	s32 value;
+	struct pm_qos_request_list *pm_qos_req;
+
+	if (count == sizeof(s32)) {
+		if (copy_from_user(&value, buf, sizeof(s32)))
+			return -EFAULT;
+	} else if (count <= 11) { /* ASCII perhaps? */
+		char ascii_value[11];
+		unsigned long int ulval;
+		int ret;
+
+		if (copy_from_user(ascii_value, buf, count))
+			return -EFAULT;
+
+		if (count > 10) {
+			if (ascii_value[10] == '\n')
+				ascii_value[10] = '\0';
+			else
+				return -EINVAL;
+		} else {
+			ascii_value[count] = '\0';
+		}
+		ret = strict_strtoul(ascii_value, 16, &ulval);
+		if (ret) {
+			pr_debug("%s, 0x%lx, 0x%x\n", ascii_value, ulval, ret);
+			return -EINVAL;
+		}
+		value = (s32)lower_32_bits(ulval);
+	} else {
+		return -EINVAL;
+	}
+
+	pm_qos_req = filp->private_data;
+	pm_qos_update_request(pm_qos_req, value);
+
+	return count;
+}
+
+
+static int __init pm_qos_power_init(void)
+{
+	int ret = 0;
+
+	ret = register_pm_qos_misc(&cpu_dma_pm_qos);
+	if (ret < 0) {
+		printk(KERN_ERR "pm_qos_param: cpu_dma_latency setup failed\n");
+		return ret;
+	}
+	ret = register_pm_qos_misc(&network_lat_pm_qos);
+	if (ret < 0) {
+		printk(KERN_ERR "pm_qos_param: network_latency setup failed\n");
+		return ret;
+	}
+	ret = register_pm_qos_misc(&network_throughput_pm_qos);
+	if (ret < 0)
+		printk(KERN_ERR
+			"pm_qos_param: network_throughput setup failed\n");
+
+	return ret;
+}
+
+late_initcall(pm_qos_power_init);
-- 
cgit v1.2.2


From cc74998618a66d34651c784dd02412614c3e81cc Mon Sep 17 00:00:00 2001
From: Jean Pihet <j-pihet@ti.com>
Date: Thu, 25 Aug 2011 15:35:12 +0200
Subject: PM QoS: Minor clean-ups

 - Misc fixes to improve code readability:
  * rename struct pm_qos_request_list to struct pm_qos_request,
  * rename pm_qos_req parameter to req in internal code,
    consistenly use req in the API parameters,
  * update the in-kernel API callers to the new parameters names,
  * rename of fields names (requests, list, node, constraints)

Signed-off-by: Jean Pihet <j-pihet@ti.com>
Acked-by: markgross <markgross@thegnar.org>
Reviewed-by: Kevin Hilman <khilman@ti.com>
Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
---
 kernel/power/qos.c | 88 +++++++++++++++++++++++++++---------------------------
 1 file changed, 44 insertions(+), 44 deletions(-)

(limited to 'kernel')

diff --git a/kernel/power/qos.c b/kernel/power/qos.c
index 61b47384329e..aa52c44e6080 100644
--- a/kernel/power/qos.c
+++ b/kernel/power/qos.c
@@ -45,7 +45,7 @@
 #include <linux/uaccess.h>
 
 /*
- * locking rule: all changes to requests or notifiers lists
+ * locking rule: all changes to constraints or notifiers lists
  * or pm_qos_object list and pm_qos_objects need to happen with pm_qos_lock
  * held, taken with _irqsave.  One lock to rule them all
  */
@@ -60,7 +60,7 @@ enum pm_qos_type {
  * types linux supports for 32 bit quantites
  */
 struct pm_qos_object {
-	struct plist_head requests;
+	struct plist_head constraints;
 	struct blocking_notifier_head *notifiers;
 	struct miscdevice pm_qos_power_miscdev;
 	char *name;
@@ -74,7 +74,7 @@ static DEFINE_SPINLOCK(pm_qos_lock);
 static struct pm_qos_object null_pm_qos;
 static BLOCKING_NOTIFIER_HEAD(cpu_dma_lat_notifier);
 static struct pm_qos_object cpu_dma_pm_qos = {
-	.requests = PLIST_HEAD_INIT(cpu_dma_pm_qos.requests),
+	.constraints = PLIST_HEAD_INIT(cpu_dma_pm_qos.constraints),
 	.notifiers = &cpu_dma_lat_notifier,
 	.name = "cpu_dma_latency",
 	.target_value = PM_QOS_CPU_DMA_LAT_DEFAULT_VALUE,
@@ -84,7 +84,7 @@ static struct pm_qos_object cpu_dma_pm_qos = {
 
 static BLOCKING_NOTIFIER_HEAD(network_lat_notifier);
 static struct pm_qos_object network_lat_pm_qos = {
-	.requests = PLIST_HEAD_INIT(network_lat_pm_qos.requests),
+	.constraints = PLIST_HEAD_INIT(network_lat_pm_qos.constraints),
 	.notifiers = &network_lat_notifier,
 	.name = "network_latency",
 	.target_value = PM_QOS_NETWORK_LAT_DEFAULT_VALUE,
@@ -95,7 +95,7 @@ static struct pm_qos_object network_lat_pm_qos = {
 
 static BLOCKING_NOTIFIER_HEAD(network_throughput_notifier);
 static struct pm_qos_object network_throughput_pm_qos = {
-	.requests = PLIST_HEAD_INIT(network_throughput_pm_qos.requests),
+	.constraints = PLIST_HEAD_INIT(network_throughput_pm_qos.constraints),
 	.notifiers = &network_throughput_notifier,
 	.name = "network_throughput",
 	.target_value = PM_QOS_NETWORK_THROUGHPUT_DEFAULT_VALUE,
@@ -129,15 +129,15 @@ static const struct file_operations pm_qos_power_fops = {
 /* unlocked internal variant */
 static inline int pm_qos_get_value(struct pm_qos_object *o)
 {
-	if (plist_head_empty(&o->requests))
+	if (plist_head_empty(&o->constraints))
 		return o->default_value;
 
 	switch (o->type) {
 	case PM_QOS_MIN:
-		return plist_first(&o->requests)->prio;
+		return plist_first(&o->constraints)->prio;
 
 	case PM_QOS_MAX:
-		return plist_last(&o->requests)->prio;
+		return plist_last(&o->constraints)->prio;
 
 	default:
 		/* runtime check for not using enum */
@@ -170,13 +170,13 @@ static void update_target(struct pm_qos_object *o, struct plist_node *node,
 		 * with new value and add, then see if the extremal
 		 * changed
 		 */
-		plist_del(node, &o->requests);
+		plist_del(node, &o->constraints);
 		plist_node_init(node, value);
-		plist_add(node, &o->requests);
+		plist_add(node, &o->constraints);
 	} else if (del) {
-		plist_del(node, &o->requests);
+		plist_del(node, &o->constraints);
 	} else {
-		plist_add(node, &o->requests);
+		plist_add(node, &o->constraints);
 	}
 	curr_value = pm_qos_get_value(o);
 	pm_qos_set_value(o, curr_value);
@@ -222,7 +222,7 @@ int pm_qos_request(int pm_qos_class)
 }
 EXPORT_SYMBOL_GPL(pm_qos_request);
 
-int pm_qos_request_active(struct pm_qos_request_list *req)
+int pm_qos_request_active(struct pm_qos_request *req)
 {
 	return req->pm_qos_class != 0;
 }
@@ -230,24 +230,24 @@ EXPORT_SYMBOL_GPL(pm_qos_request_active);
 
 /**
  * pm_qos_add_request - inserts new qos request into the list
- * @dep: pointer to a preallocated handle
+ * @req: pointer to a preallocated handle
  * @pm_qos_class: identifies which list of qos request to use
  * @value: defines the qos request
  *
  * This function inserts a new entry in the pm_qos_class list of requested qos
  * performance characteristics.  It recomputes the aggregate QoS expectations
- * for the pm_qos_class of parameters and initializes the pm_qos_request_list
+ * for the pm_qos_class of parameters and initializes the pm_qos_request
  * handle.  Caller needs to save this handle for later use in updates and
  * removal.
  */
 
-void pm_qos_add_request(struct pm_qos_request_list *dep,
+void pm_qos_add_request(struct pm_qos_request *req,
 			int pm_qos_class, s32 value)
 {
 	struct pm_qos_object *o =  pm_qos_array[pm_qos_class];
 	int new_value;
 
-	if (pm_qos_request_active(dep)) {
+	if (pm_qos_request_active(req)) {
 		WARN(1, KERN_ERR "pm_qos_add_request() called for already added request\n");
 		return;
 	}
@@ -255,15 +255,15 @@ void pm_qos_add_request(struct pm_qos_request_list *dep,
 		new_value = o->default_value;
 	else
 		new_value = value;
-	plist_node_init(&dep->list, new_value);
-	dep->pm_qos_class = pm_qos_class;
-	update_target(o, &dep->list, 0, PM_QOS_DEFAULT_VALUE);
+	plist_node_init(&req->node, new_value);
+	req->pm_qos_class = pm_qos_class;
+	update_target(o, &req->node, 0, PM_QOS_DEFAULT_VALUE);
 }
 EXPORT_SYMBOL_GPL(pm_qos_add_request);
 
 /**
  * pm_qos_update_request - modifies an existing qos request
- * @pm_qos_req : handle to list element holding a pm_qos request to use
+ * @req : handle to list element holding a pm_qos request to use
  * @value: defines the qos request
  *
  * Updates an existing qos request for the pm_qos_class of parameters along
@@ -271,56 +271,56 @@ EXPORT_SYMBOL_GPL(pm_qos_add_request);
  *
  * Attempts are made to make this code callable on hot code paths.
  */
-void pm_qos_update_request(struct pm_qos_request_list *pm_qos_req,
+void pm_qos_update_request(struct pm_qos_request *req,
 			   s32 new_value)
 {
 	s32 temp;
 	struct pm_qos_object *o;
 
-	if (!pm_qos_req) /*guard against callers passing in null */
+	if (!req) /*guard against callers passing in null */
 		return;
 
-	if (!pm_qos_request_active(pm_qos_req)) {
+	if (!pm_qos_request_active(req)) {
 		WARN(1, KERN_ERR "pm_qos_update_request() called for unknown object\n");
 		return;
 	}
 
-	o = pm_qos_array[pm_qos_req->pm_qos_class];
+	o = pm_qos_array[req->pm_qos_class];
 
 	if (new_value == PM_QOS_DEFAULT_VALUE)
 		temp = o->default_value;
 	else
 		temp = new_value;
 
-	if (temp != pm_qos_req->list.prio)
-		update_target(o, &pm_qos_req->list, 0, temp);
+	if (temp != req->node.prio)
+		update_target(o, &req->node, 0, temp);
 }
 EXPORT_SYMBOL_GPL(pm_qos_update_request);
 
 /**
  * pm_qos_remove_request - modifies an existing qos request
- * @pm_qos_req: handle to request list element
+ * @req: handle to request list element
  *
- * Will remove pm qos request from the list of requests and
+ * Will remove pm qos request from the list of constraints and
  * recompute the current target value for the pm_qos_class.  Call this
  * on slow code paths.
  */
-void pm_qos_remove_request(struct pm_qos_request_list *pm_qos_req)
+void pm_qos_remove_request(struct pm_qos_request *req)
 {
 	struct pm_qos_object *o;
 
-	if (pm_qos_req == NULL)
+	if (req == NULL)
 		return;
 		/* silent return to keep pcm code cleaner */
 
-	if (!pm_qos_request_active(pm_qos_req)) {
+	if (!pm_qos_request_active(req)) {
 		WARN(1, KERN_ERR "pm_qos_remove_request() called for unknown object\n");
 		return;
 	}
 
-	o = pm_qos_array[pm_qos_req->pm_qos_class];
-	update_target(o, &pm_qos_req->list, 1, PM_QOS_DEFAULT_VALUE);
-	memset(pm_qos_req, 0, sizeof(*pm_qos_req));
+	o = pm_qos_array[req->pm_qos_class];
+	update_target(o, &req->node, 1, PM_QOS_DEFAULT_VALUE);
+	memset(req, 0, sizeof(*req));
 }
 EXPORT_SYMBOL_GPL(pm_qos_remove_request);
 
@@ -368,7 +368,7 @@ static int pm_qos_power_open(struct inode *inode, struct file *filp)
 
 	pm_qos_class = find_pm_qos_object_by_minor(iminor(inode));
 	if (pm_qos_class >= 0) {
-               struct pm_qos_request_list *req = kzalloc(sizeof(*req), GFP_KERNEL);
+		struct pm_qos_request *req = kzalloc(sizeof(*req), GFP_KERNEL);
 		if (!req)
 			return -ENOMEM;
 
@@ -383,7 +383,7 @@ static int pm_qos_power_open(struct inode *inode, struct file *filp)
 
 static int pm_qos_power_release(struct inode *inode, struct file *filp)
 {
-	struct pm_qos_request_list *req;
+	struct pm_qos_request *req;
 
 	req = filp->private_data;
 	pm_qos_remove_request(req);
@@ -399,14 +399,14 @@ static ssize_t pm_qos_power_read(struct file *filp, char __user *buf,
 	s32 value;
 	unsigned long flags;
 	struct pm_qos_object *o;
-	struct pm_qos_request_list *pm_qos_req = filp->private_data;
+	struct pm_qos_request *req = filp->private_data;
 
-	if (!pm_qos_req)
+	if (!req)
 		return -EINVAL;
-	if (!pm_qos_request_active(pm_qos_req))
+	if (!pm_qos_request_active(req))
 		return -EINVAL;
 
-	o = pm_qos_array[pm_qos_req->pm_qos_class];
+	o = pm_qos_array[req->pm_qos_class];
 	spin_lock_irqsave(&pm_qos_lock, flags);
 	value = pm_qos_get_value(o);
 	spin_unlock_irqrestore(&pm_qos_lock, flags);
@@ -418,7 +418,7 @@ static ssize_t pm_qos_power_write(struct file *filp, const char __user *buf,
 		size_t count, loff_t *f_pos)
 {
 	s32 value;
-	struct pm_qos_request_list *pm_qos_req;
+	struct pm_qos_request *req;
 
 	if (count == sizeof(s32)) {
 		if (copy_from_user(&value, buf, sizeof(s32)))
@@ -449,8 +449,8 @@ static ssize_t pm_qos_power_write(struct file *filp, const char __user *buf,
 		return -EINVAL;
 	}
 
-	pm_qos_req = filp->private_data;
-	pm_qos_update_request(pm_qos_req, value);
+	req = filp->private_data;
+	pm_qos_update_request(req, value);
 
 	return count;
 }
-- 
cgit v1.2.2


From 4a31a33425a1eb92f6a0b9846f081842268361c8 Mon Sep 17 00:00:00 2001
From: Jean Pihet <j-pihet@ti.com>
Date: Thu, 25 Aug 2011 15:35:20 +0200
Subject: PM QoS: Code reorganization

Move around the PM QoS misc devices management code
for better readability.

Signed-off-by: Jean Pihet <j-pihet@ti.com>
Acked-by: markgross <markgross@thegnar.org>
Reviewed-by: Kevin Hilman <khilman@ti.com>
Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
---
 kernel/power/qos.c | 45 +++++++++++++++++++++++----------------------
 1 file changed, 23 insertions(+), 22 deletions(-)

(limited to 'kernel')

diff --git a/kernel/power/qos.c b/kernel/power/qos.c
index aa52c44e6080..788c4cfe0cdb 100644
--- a/kernel/power/qos.c
+++ b/kernel/power/qos.c
@@ -188,28 +188,6 @@ static void update_target(struct pm_qos_object *o, struct plist_node *node,
 					     NULL);
 }
 
-static int register_pm_qos_misc(struct pm_qos_object *qos)
-{
-	qos->pm_qos_power_miscdev.minor = MISC_DYNAMIC_MINOR;
-	qos->pm_qos_power_miscdev.name = qos->name;
-	qos->pm_qos_power_miscdev.fops = &pm_qos_power_fops;
-
-	return misc_register(&qos->pm_qos_power_miscdev);
-}
-
-static int find_pm_qos_object_by_minor(int minor)
-{
-	int pm_qos_class;
-
-	for (pm_qos_class = 0;
-		pm_qos_class < PM_QOS_NUM_CLASSES; pm_qos_class++) {
-		if (minor ==
-			pm_qos_array[pm_qos_class]->pm_qos_power_miscdev.minor)
-			return pm_qos_class;
-	}
-	return -1;
-}
-
 /**
  * pm_qos_request - returns current system wide qos expectation
  * @pm_qos_class: identification of which qos value is requested
@@ -362,6 +340,29 @@ int pm_qos_remove_notifier(int pm_qos_class, struct notifier_block *notifier)
 }
 EXPORT_SYMBOL_GPL(pm_qos_remove_notifier);
 
+/* User space interface to PM QoS classes via misc devices */
+static int register_pm_qos_misc(struct pm_qos_object *qos)
+{
+	qos->pm_qos_power_miscdev.minor = MISC_DYNAMIC_MINOR;
+	qos->pm_qos_power_miscdev.name = qos->name;
+	qos->pm_qos_power_miscdev.fops = &pm_qos_power_fops;
+
+	return misc_register(&qos->pm_qos_power_miscdev);
+}
+
+static int find_pm_qos_object_by_minor(int minor)
+{
+	int pm_qos_class;
+
+	for (pm_qos_class = 0;
+		pm_qos_class < PM_QOS_NUM_CLASSES; pm_qos_class++) {
+		if (minor ==
+			pm_qos_array[pm_qos_class]->pm_qos_power_miscdev.minor)
+			return pm_qos_class;
+	}
+	return -1;
+}
+
 static int pm_qos_power_open(struct inode *inode, struct file *filp)
 {
 	long pm_qos_class;
-- 
cgit v1.2.2


From 4e1779baaa542c83b459b0a56585e0c1a04c7782 Mon Sep 17 00:00:00 2001
From: Jean Pihet <j-pihet@ti.com>
Date: Thu, 25 Aug 2011 15:35:27 +0200
Subject: PM QoS: Reorganize data structs

In preparation for the per-device constratins support, re-organize
the data strctures:
 - add a struct pm_qos_constraints which contains the constraints
 related data
 - update struct pm_qos_object contents to the PM QoS internal object
 data. Add a pointer to struct pm_qos_constraints
 - update the internal code to use the new data structs.

Signed-off-by: Jean Pihet <j-pihet@ti.com>
Reviewed-by: Kevin Hilman <khilman@ti.com>
Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
---
 kernel/power/qos.c | 85 ++++++++++++++++++++++++++----------------------------
 1 file changed, 41 insertions(+), 44 deletions(-)

(limited to 'kernel')

diff --git a/kernel/power/qos.c b/kernel/power/qos.c
index 788c4cfe0cdb..4a35fe50b777 100644
--- a/kernel/power/qos.c
+++ b/kernel/power/qos.c
@@ -49,58 +49,53 @@
  * or pm_qos_object list and pm_qos_objects need to happen with pm_qos_lock
  * held, taken with _irqsave.  One lock to rule them all
  */
-enum pm_qos_type {
-	PM_QOS_MAX,		/* return the largest value */
-	PM_QOS_MIN		/* return the smallest value */
-};
-
-/*
- * Note: The lockless read path depends on the CPU accessing
- * target_value atomically.  Atomic access is only guaranteed on all CPU
- * types linux supports for 32 bit quantites
- */
 struct pm_qos_object {
-	struct plist_head constraints;
-	struct blocking_notifier_head *notifiers;
+	struct pm_qos_constraints *constraints;
 	struct miscdevice pm_qos_power_miscdev;
 	char *name;
-	s32 target_value;	/* Do not change to 64 bit */
-	s32 default_value;
-	enum pm_qos_type type;
 };
 
 static DEFINE_SPINLOCK(pm_qos_lock);
 
 static struct pm_qos_object null_pm_qos;
+
 static BLOCKING_NOTIFIER_HEAD(cpu_dma_lat_notifier);
-static struct pm_qos_object cpu_dma_pm_qos = {
-	.constraints = PLIST_HEAD_INIT(cpu_dma_pm_qos.constraints),
-	.notifiers = &cpu_dma_lat_notifier,
-	.name = "cpu_dma_latency",
+static struct pm_qos_constraints cpu_dma_constraints = {
+	.list = PLIST_HEAD_INIT(cpu_dma_constraints.list),
 	.target_value = PM_QOS_CPU_DMA_LAT_DEFAULT_VALUE,
 	.default_value = PM_QOS_CPU_DMA_LAT_DEFAULT_VALUE,
 	.type = PM_QOS_MIN,
+	.notifiers = &cpu_dma_lat_notifier,
+};
+static struct pm_qos_object cpu_dma_pm_qos = {
+	.constraints = &cpu_dma_constraints,
 };
 
 static BLOCKING_NOTIFIER_HEAD(network_lat_notifier);
-static struct pm_qos_object network_lat_pm_qos = {
-	.constraints = PLIST_HEAD_INIT(network_lat_pm_qos.constraints),
-	.notifiers = &network_lat_notifier,
-	.name = "network_latency",
+static struct pm_qos_constraints network_lat_constraints = {
+	.list = PLIST_HEAD_INIT(network_lat_constraints.list),
 	.target_value = PM_QOS_NETWORK_LAT_DEFAULT_VALUE,
 	.default_value = PM_QOS_NETWORK_LAT_DEFAULT_VALUE,
-	.type = PM_QOS_MIN
+	.type = PM_QOS_MIN,
+	.notifiers = &network_lat_notifier,
+};
+static struct pm_qos_object network_lat_pm_qos = {
+	.constraints = &network_lat_constraints,
+	.name = "network_latency",
 };
 
 
 static BLOCKING_NOTIFIER_HEAD(network_throughput_notifier);
-static struct pm_qos_object network_throughput_pm_qos = {
-	.constraints = PLIST_HEAD_INIT(network_throughput_pm_qos.constraints),
-	.notifiers = &network_throughput_notifier,
-	.name = "network_throughput",
+static struct pm_qos_constraints network_tput_constraints = {
+	.list = PLIST_HEAD_INIT(network_tput_constraints.list),
 	.target_value = PM_QOS_NETWORK_THROUGHPUT_DEFAULT_VALUE,
 	.default_value = PM_QOS_NETWORK_THROUGHPUT_DEFAULT_VALUE,
 	.type = PM_QOS_MAX,
+	.notifiers = &network_throughput_notifier,
+};
+static struct pm_qos_object network_throughput_pm_qos = {
+	.constraints = &network_tput_constraints,
+	.name = "network_throughput",
 };
 
 
@@ -129,15 +124,15 @@ static const struct file_operations pm_qos_power_fops = {
 /* unlocked internal variant */
 static inline int pm_qos_get_value(struct pm_qos_object *o)
 {
-	if (plist_head_empty(&o->constraints))
-		return o->default_value;
+	if (plist_head_empty(&o->constraints->list))
+		return o->constraints->default_value;
 
-	switch (o->type) {
+	switch (o->constraints->type) {
 	case PM_QOS_MIN:
-		return plist_first(&o->constraints)->prio;
+		return plist_first(&o->constraints->list)->prio;
 
 	case PM_QOS_MAX:
-		return plist_last(&o->constraints)->prio;
+		return plist_last(&o->constraints->list)->prio;
 
 	default:
 		/* runtime check for not using enum */
@@ -147,12 +142,12 @@ static inline int pm_qos_get_value(struct pm_qos_object *o)
 
 static inline s32 pm_qos_read_value(struct pm_qos_object *o)
 {
-	return o->target_value;
+	return o->constraints->target_value;
 }
 
 static inline void pm_qos_set_value(struct pm_qos_object *o, s32 value)
 {
-	o->target_value = value;
+	o->constraints->target_value = value;
 }
 
 static void update_target(struct pm_qos_object *o, struct plist_node *node,
@@ -170,20 +165,20 @@ static void update_target(struct pm_qos_object *o, struct plist_node *node,
 		 * with new value and add, then see if the extremal
 		 * changed
 		 */
-		plist_del(node, &o->constraints);
+		plist_del(node, &o->constraints->list);
 		plist_node_init(node, value);
-		plist_add(node, &o->constraints);
+		plist_add(node, &o->constraints->list);
 	} else if (del) {
-		plist_del(node, &o->constraints);
+		plist_del(node, &o->constraints->list);
 	} else {
-		plist_add(node, &o->constraints);
+		plist_add(node, &o->constraints->list);
 	}
 	curr_value = pm_qos_get_value(o);
 	pm_qos_set_value(o, curr_value);
 	spin_unlock_irqrestore(&pm_qos_lock, flags);
 
 	if (prev_value != curr_value)
-		blocking_notifier_call_chain(o->notifiers,
+		blocking_notifier_call_chain(o->constraints->notifiers,
 					     (unsigned long)curr_value,
 					     NULL);
 }
@@ -230,7 +225,7 @@ void pm_qos_add_request(struct pm_qos_request *req,
 		return;
 	}
 	if (value == PM_QOS_DEFAULT_VALUE)
-		new_value = o->default_value;
+		new_value = o->constraints->default_value;
 	else
 		new_value = value;
 	plist_node_init(&req->node, new_value);
@@ -266,7 +261,7 @@ void pm_qos_update_request(struct pm_qos_request *req,
 	o = pm_qos_array[req->pm_qos_class];
 
 	if (new_value == PM_QOS_DEFAULT_VALUE)
-		temp = o->default_value;
+		temp = o->constraints->default_value;
 	else
 		temp = new_value;
 
@@ -315,7 +310,8 @@ int pm_qos_add_notifier(int pm_qos_class, struct notifier_block *notifier)
 	int retval;
 
 	retval = blocking_notifier_chain_register(
-			pm_qos_array[pm_qos_class]->notifiers, notifier);
+			pm_qos_array[pm_qos_class]->constraints->notifiers,
+			notifier);
 
 	return retval;
 }
@@ -334,7 +330,8 @@ int pm_qos_remove_notifier(int pm_qos_class, struct notifier_block *notifier)
 	int retval;
 
 	retval = blocking_notifier_chain_unregister(
-			pm_qos_array[pm_qos_class]->notifiers, notifier);
+			pm_qos_array[pm_qos_class]->constraints->notifiers,
+			notifier);
 
 	return retval;
 }
-- 
cgit v1.2.2


From abe98ec2d86279fe821c9051003a0abc43444f15 Mon Sep 17 00:00:00 2001
From: Jean Pihet <j-pihet@ti.com>
Date: Thu, 25 Aug 2011 15:35:34 +0200
Subject: PM QoS: Generalize and export constraints management code

In preparation for the per-device constratins support:
 - rename update_target to pm_qos_update_target
 - generalize and export pm_qos_update_target for usage by the upcoming
   per-device latency constraints framework:
   * operate on struct pm_qos_constraints for constraints management,
   * introduce an 'action' parameter for constraints add/update/remove,
   * the return value indicates if the aggregated constraint value has
     changed,
 - update the internal code to operate on struct pm_qos_constraints
 - add a NULL pointer check in the API functions

Signed-off-by: Jean Pihet <j-pihet@ti.com>
Reviewed-by: Kevin Hilman <khilman@ti.com>
Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
---
 kernel/power/qos.c | 123 +++++++++++++++++++++++++++++------------------------
 1 file changed, 67 insertions(+), 56 deletions(-)

(limited to 'kernel')

diff --git a/kernel/power/qos.c b/kernel/power/qos.c
index 4a35fe50b777..7c7cd181cabe 100644
--- a/kernel/power/qos.c
+++ b/kernel/power/qos.c
@@ -122,17 +122,17 @@ static const struct file_operations pm_qos_power_fops = {
 };
 
 /* unlocked internal variant */
-static inline int pm_qos_get_value(struct pm_qos_object *o)
+static inline int pm_qos_get_value(struct pm_qos_constraints *c)
 {
-	if (plist_head_empty(&o->constraints->list))
-		return o->constraints->default_value;
+	if (plist_head_empty(&c->list))
+		return c->default_value;
 
-	switch (o->constraints->type) {
+	switch (c->type) {
 	case PM_QOS_MIN:
-		return plist_first(&o->constraints->list)->prio;
+		return plist_first(&c->list)->prio;
 
 	case PM_QOS_MAX:
-		return plist_last(&o->constraints->list)->prio;
+		return plist_last(&c->list)->prio;
 
 	default:
 		/* runtime check for not using enum */
@@ -140,47 +140,73 @@ static inline int pm_qos_get_value(struct pm_qos_object *o)
 	}
 }
 
-static inline s32 pm_qos_read_value(struct pm_qos_object *o)
+static inline s32 pm_qos_read_value(struct pm_qos_constraints *c)
 {
-	return o->constraints->target_value;
+	return c->target_value;
 }
 
-static inline void pm_qos_set_value(struct pm_qos_object *o, s32 value)
+static inline void pm_qos_set_value(struct pm_qos_constraints *c, s32 value)
 {
-	o->constraints->target_value = value;
+	c->target_value = value;
 }
 
-static void update_target(struct pm_qos_object *o, struct plist_node *node,
-			  int del, int value)
+/**
+ * pm_qos_update_target - manages the constraints list and calls the notifiers
+ *  if needed
+ * @c: constraints data struct
+ * @node: request to add to the list, to update or to remove
+ * @action: action to take on the constraints list
+ * @value: value of the request to add or update
+ *
+ * This function returns 1 if the aggregated constraint value has changed, 0
+ *  otherwise.
+ */
+int pm_qos_update_target(struct pm_qos_constraints *c, struct plist_node *node,
+			 enum pm_qos_req_action action, int value)
 {
 	unsigned long flags;
-	int prev_value, curr_value;
+	int prev_value, curr_value, new_value;
 
 	spin_lock_irqsave(&pm_qos_lock, flags);
-	prev_value = pm_qos_get_value(o);
-	/* PM_QOS_DEFAULT_VALUE is a signal that the value is unchanged */
-	if (value != PM_QOS_DEFAULT_VALUE) {
+	prev_value = pm_qos_get_value(c);
+	if (value == PM_QOS_DEFAULT_VALUE)
+		new_value = c->default_value;
+	else
+		new_value = value;
+
+	switch (action) {
+	case PM_QOS_REMOVE_REQ:
+		plist_del(node, &c->list);
+		break;
+	case PM_QOS_UPDATE_REQ:
 		/*
 		 * to change the list, we atomically remove, reinit
 		 * with new value and add, then see if the extremal
 		 * changed
 		 */
-		plist_del(node, &o->constraints->list);
-		plist_node_init(node, value);
-		plist_add(node, &o->constraints->list);
-	} else if (del) {
-		plist_del(node, &o->constraints->list);
-	} else {
-		plist_add(node, &o->constraints->list);
+		plist_del(node, &c->list);
+	case PM_QOS_ADD_REQ:
+		plist_node_init(node, new_value);
+		plist_add(node, &c->list);
+		break;
+	default:
+		/* no action */
+		;
 	}
-	curr_value = pm_qos_get_value(o);
-	pm_qos_set_value(o, curr_value);
+
+	curr_value = pm_qos_get_value(c);
+	pm_qos_set_value(c, curr_value);
+
 	spin_unlock_irqrestore(&pm_qos_lock, flags);
 
-	if (prev_value != curr_value)
-		blocking_notifier_call_chain(o->constraints->notifiers,
+	if (prev_value != curr_value) {
+		blocking_notifier_call_chain(c->notifiers,
 					     (unsigned long)curr_value,
 					     NULL);
+		return 1;
+	} else {
+		return 0;
+	}
 }
 
 /**
@@ -191,7 +217,7 @@ static void update_target(struct pm_qos_object *o, struct plist_node *node,
  */
 int pm_qos_request(int pm_qos_class)
 {
-	return pm_qos_read_value(pm_qos_array[pm_qos_class]);
+	return pm_qos_read_value(pm_qos_array[pm_qos_class]->constraints);
 }
 EXPORT_SYMBOL_GPL(pm_qos_request);
 
@@ -217,20 +243,16 @@ EXPORT_SYMBOL_GPL(pm_qos_request_active);
 void pm_qos_add_request(struct pm_qos_request *req,
 			int pm_qos_class, s32 value)
 {
-	struct pm_qos_object *o =  pm_qos_array[pm_qos_class];
-	int new_value;
+	if (!req) /*guard against callers passing in null */
+		return;
 
 	if (pm_qos_request_active(req)) {
 		WARN(1, KERN_ERR "pm_qos_add_request() called for already added request\n");
 		return;
 	}
-	if (value == PM_QOS_DEFAULT_VALUE)
-		new_value = o->constraints->default_value;
-	else
-		new_value = value;
-	plist_node_init(&req->node, new_value);
 	req->pm_qos_class = pm_qos_class;
-	update_target(o, &req->node, 0, PM_QOS_DEFAULT_VALUE);
+	pm_qos_update_target(pm_qos_array[pm_qos_class]->constraints,
+			     &req->node, PM_QOS_ADD_REQ, value);
 }
 EXPORT_SYMBOL_GPL(pm_qos_add_request);
 
@@ -247,9 +269,6 @@ EXPORT_SYMBOL_GPL(pm_qos_add_request);
 void pm_qos_update_request(struct pm_qos_request *req,
 			   s32 new_value)
 {
-	s32 temp;
-	struct pm_qos_object *o;
-
 	if (!req) /*guard against callers passing in null */
 		return;
 
@@ -258,15 +277,10 @@ void pm_qos_update_request(struct pm_qos_request *req,
 		return;
 	}
 
-	o = pm_qos_array[req->pm_qos_class];
-
-	if (new_value == PM_QOS_DEFAULT_VALUE)
-		temp = o->constraints->default_value;
-	else
-		temp = new_value;
-
-	if (temp != req->node.prio)
-		update_target(o, &req->node, 0, temp);
+	if (new_value != req->node.prio)
+		pm_qos_update_target(
+			pm_qos_array[req->pm_qos_class]->constraints,
+			&req->node, PM_QOS_UPDATE_REQ, new_value);
 }
 EXPORT_SYMBOL_GPL(pm_qos_update_request);
 
@@ -280,9 +294,7 @@ EXPORT_SYMBOL_GPL(pm_qos_update_request);
  */
 void pm_qos_remove_request(struct pm_qos_request *req)
 {
-	struct pm_qos_object *o;
-
-	if (req == NULL)
+	if (!req) /*guard against callers passing in null */
 		return;
 		/* silent return to keep pcm code cleaner */
 
@@ -291,8 +303,9 @@ void pm_qos_remove_request(struct pm_qos_request *req)
 		return;
 	}
 
-	o = pm_qos_array[req->pm_qos_class];
-	update_target(o, &req->node, 1, PM_QOS_DEFAULT_VALUE);
+	pm_qos_update_target(pm_qos_array[req->pm_qos_class]->constraints,
+			     &req->node, PM_QOS_REMOVE_REQ,
+			     PM_QOS_DEFAULT_VALUE);
 	memset(req, 0, sizeof(*req));
 }
 EXPORT_SYMBOL_GPL(pm_qos_remove_request);
@@ -396,7 +409,6 @@ static ssize_t pm_qos_power_read(struct file *filp, char __user *buf,
 {
 	s32 value;
 	unsigned long flags;
-	struct pm_qos_object *o;
 	struct pm_qos_request *req = filp->private_data;
 
 	if (!req)
@@ -404,9 +416,8 @@ static ssize_t pm_qos_power_read(struct file *filp, char __user *buf,
 	if (!pm_qos_request_active(req))
 		return -EINVAL;
 
-	o = pm_qos_array[req->pm_qos_class];
 	spin_lock_irqsave(&pm_qos_lock, flags);
-	value = pm_qos_get_value(o);
+	value = pm_qos_get_value(pm_qos_array[req->pm_qos_class]->constraints);
 	spin_unlock_irqrestore(&pm_qos_lock, flags);
 
 	return simple_read_from_buffer(buf, count, f_pos, &value, sizeof(s32));
-- 
cgit v1.2.2


From b66213cdb002b08b29603d488c451dfe25e2ca20 Mon Sep 17 00:00:00 2001
From: Jean Pihet <j-pihet@ti.com>
Date: Thu, 25 Aug 2011 15:35:47 +0200
Subject: PM QoS: Add global notification mechanism for device constraints

Add a global notification chain that gets called upon changes to the
aggregated constraint value for any device.
The notification callbacks are passing the full constraint request data
in order for the callees to have access to it. The current use is for the
platform low-level code to access the target device of the constraint.

Signed-off-by: Jean Pihet <j-pihet@ti.com>
Reviewed-by: Kevin Hilman <khilman@ti.com>
Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
---
 kernel/power/qos.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/power/qos.c b/kernel/power/qos.c
index 7c7cd181cabe..1c1797dd1d1d 100644
--- a/kernel/power/qos.c
+++ b/kernel/power/qos.c
@@ -140,7 +140,7 @@ static inline int pm_qos_get_value(struct pm_qos_constraints *c)
 	}
 }
 
-static inline s32 pm_qos_read_value(struct pm_qos_constraints *c)
+s32 pm_qos_read_value(struct pm_qos_constraints *c)
 {
 	return c->target_value;
 }
-- 
cgit v1.2.2


From 86b6ef21b80ac6565d172cdab4384404de007eea Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Mon, 22 Aug 2011 09:41:46 -0400
Subject: tracing: Add preempt disable for filter self test

The self testing for event filters does not really need preemption
disabled as there are no races at the time of testing, but the functions
it calls uses rcu_dereference_sched() which will complain if preemption
is not disabled.

Cc: Jiri Olsa <jolsa@redhat.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace_events_filter.c | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'kernel')

diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index 6a642e278241..816d3d074979 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -2101,6 +2101,11 @@ static __init int ftrace_test_event_filter(void)
 			break;
 		}
 
+		/*
+		 * The preemption disabling is not really needed for self
+		 * tests, but the rcu dereference will complain without it.
+		 */
+		preempt_disable();
 		if (*d->not_visited)
 			walk_pred_tree(filter->preds, filter->root,
 				       test_walk_pred_cb,
@@ -2108,6 +2113,7 @@ static __init int ftrace_test_event_filter(void)
 
 		test_pred_visited = 0;
 		err = filter_match_preds(filter, &d->rec);
+		preempt_enable();
 
 		__free_filter(filter);
 
-- 
cgit v1.2.2


From f81ab074c30234b07c8309c542cafd07bed721f7 Mon Sep 17 00:00:00 2001
From: Vaibhav Nagarnaik <vnagarnaik@google.com>
Date: Tue, 16 Aug 2011 14:46:15 -0700
Subject: trace: Add a new readonly entry to report total buffer size

The current file "buffer_size_kb" reports the size of per-cpu buffer and
not the overall memory allocated which could be misleading. A new file
"buffer_total_size_kb" adds up all the enabled CPU buffer sizes and
reports it. This is only a readonly entry.

Signed-off-by: Vaibhav Nagarnaik <vnagarnaik@google.com>
Cc: Michael Rubin <mrubin@google.com>
Cc: David Sharp <dhsharp@google.com>
Link: http://lkml.kernel.org/r/1313531179-9323-2-git-send-email-vnagarnaik@google.com
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace.c | 33 +++++++++++++++++++++++++++++++++
 1 file changed, 33 insertions(+)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index e5df02c69b1d..01176788c387 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -3568,6 +3568,30 @@ tracing_entries_write(struct file *filp, const char __user *ubuf,
 	return cnt;
 }
 
+static ssize_t
+tracing_total_entries_read(struct file *filp, char __user *ubuf,
+				size_t cnt, loff_t *ppos)
+{
+	struct trace_array *tr = filp->private_data;
+	char buf[64];
+	int r, cpu;
+	unsigned long size = 0, expanded_size = 0;
+
+	mutex_lock(&trace_types_lock);
+	for_each_tracing_cpu(cpu) {
+		size += tr->entries >> 10;
+		if (!ring_buffer_expanded)
+			expanded_size += trace_buf_size >> 10;
+	}
+	if (ring_buffer_expanded)
+		r = sprintf(buf, "%lu\n", size);
+	else
+		r = sprintf(buf, "%lu (expanded: %lu)\n", size, expanded_size);
+	mutex_unlock(&trace_types_lock);
+
+	return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
+}
+
 static ssize_t
 tracing_free_buffer_write(struct file *filp, const char __user *ubuf,
 			  size_t cnt, loff_t *ppos)
@@ -3739,6 +3763,12 @@ static const struct file_operations tracing_entries_fops = {
 	.llseek		= generic_file_llseek,
 };
 
+static const struct file_operations tracing_total_entries_fops = {
+	.open		= tracing_open_generic,
+	.read		= tracing_total_entries_read,
+	.llseek		= generic_file_llseek,
+};
+
 static const struct file_operations tracing_free_buffer_fops = {
 	.write		= tracing_free_buffer_write,
 	.release	= tracing_free_buffer_release,
@@ -4450,6 +4480,9 @@ static __init int tracer_init_debugfs(void)
 	trace_create_file("buffer_size_kb", 0644, d_tracer,
 			&global_trace, &tracing_entries_fops);
 
+	trace_create_file("buffer_total_size_kb", 0444, d_tracer,
+			&global_trace, &tracing_total_entries_fops);
+
 	trace_create_file("free_buffer", 0644, d_tracer,
 			&global_trace, &tracing_free_buffer_fops);
 
-- 
cgit v1.2.2


From c64e148a3be3cb786534ad38298c25c833116c26 Mon Sep 17 00:00:00 2001
From: Vaibhav Nagarnaik <vnagarnaik@google.com>
Date: Tue, 16 Aug 2011 14:46:16 -0700
Subject: trace: Add ring buffer stats to measure rate of events

The stats file under per_cpu folder provides the number of entries,
overruns and other statistics about the CPU ring buffer. However, the
numbers do not provide any indication of how full the ring buffer is in
bytes compared to the overall size in bytes. Also, it is helpful to know
the rate at which the cpu buffer is filling up.

This patch adds an entry "bytes: " in printed stats for per_cpu ring
buffer which provides the actual bytes consumed in the ring buffer. This
field includes the number of bytes used by recorded events and the
padding bytes added when moving the tail pointer to next page.

It also adds the following time stamps:
"oldest event ts:" - the oldest timestamp in the ring buffer
"now ts:"  - the timestamp at the time of reading

The field "now ts" provides a consistent time snapshot to the userspace
when being read. This is read from the same trace clock used by tracing
event timestamps.

Together, these values provide the rate at which the buffer is filling
up, from the formula:
bytes / (now_ts - oldest_event_ts)

Signed-off-by: Vaibhav Nagarnaik <vnagarnaik@google.com>
Cc: Michael Rubin <mrubin@google.com>
Cc: David Sharp <dhsharp@google.com>
Link: http://lkml.kernel.org/r/1313531179-9323-3-git-send-email-vnagarnaik@google.com
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/ring_buffer.c | 70 +++++++++++++++++++++++++++++++++++++++++++++-
 kernel/trace/trace.c       | 13 +++++++++
 2 files changed, 82 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 731201bf4acc..acf6b68dc4a8 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -488,12 +488,14 @@ struct ring_buffer_per_cpu {
 	struct buffer_page		*reader_page;
 	unsigned long			lost_events;
 	unsigned long			last_overrun;
+	local_t				entries_bytes;
 	local_t				commit_overrun;
 	local_t				overrun;
 	local_t				entries;
 	local_t				committing;
 	local_t				commits;
 	unsigned long			read;
+	unsigned long			read_bytes;
 	u64				write_stamp;
 	u64				read_stamp;
 };
@@ -1708,6 +1710,7 @@ rb_handle_head_page(struct ring_buffer_per_cpu *cpu_buffer,
 		 * the counters.
 		 */
 		local_add(entries, &cpu_buffer->overrun);
+		local_sub(BUF_PAGE_SIZE, &cpu_buffer->entries_bytes);
 
 		/*
 		 * The entries will be zeroed out when we move the
@@ -1863,6 +1866,9 @@ rb_reset_tail(struct ring_buffer_per_cpu *cpu_buffer,
 	event = __rb_page_index(tail_page, tail);
 	kmemcheck_annotate_bitfield(event, bitfield);
 
+	/* account for padding bytes */
+	local_add(BUF_PAGE_SIZE - tail, &cpu_buffer->entries_bytes);
+
 	/*
 	 * Save the original length to the meta data.
 	 * This will be used by the reader to add lost event
@@ -2054,6 +2060,9 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
 	if (!tail)
 		tail_page->page->time_stamp = ts;
 
+	/* account for these added bytes */
+	local_add(length, &cpu_buffer->entries_bytes);
+
 	return event;
 }
 
@@ -2076,6 +2085,7 @@ rb_try_to_discard(struct ring_buffer_per_cpu *cpu_buffer,
 	if (bpage->page == (void *)addr && rb_page_write(bpage) == old_index) {
 		unsigned long write_mask =
 			local_read(&bpage->write) & ~RB_WRITE_MASK;
+		unsigned long event_length = rb_event_length(event);
 		/*
 		 * This is on the tail page. It is possible that
 		 * a write could come in and move the tail page
@@ -2085,8 +2095,11 @@ rb_try_to_discard(struct ring_buffer_per_cpu *cpu_buffer,
 		old_index += write_mask;
 		new_index += write_mask;
 		index = local_cmpxchg(&bpage->write, old_index, new_index);
-		if (index == old_index)
+		if (index == old_index) {
+			/* update counters */
+			local_sub(event_length, &cpu_buffer->entries_bytes);
 			return 1;
+		}
 	}
 
 	/* could not discard */
@@ -2660,6 +2673,58 @@ rb_num_of_entries(struct ring_buffer_per_cpu *cpu_buffer)
 		(local_read(&cpu_buffer->overrun) + cpu_buffer->read);
 }
 
+/**
+ * ring_buffer_oldest_event_ts - get the oldest event timestamp from the buffer
+ * @buffer: The ring buffer
+ * @cpu: The per CPU buffer to read from.
+ */
+unsigned long ring_buffer_oldest_event_ts(struct ring_buffer *buffer, int cpu)
+{
+	unsigned long flags;
+	struct ring_buffer_per_cpu *cpu_buffer;
+	struct buffer_page *bpage;
+	unsigned long ret;
+
+	if (!cpumask_test_cpu(cpu, buffer->cpumask))
+		return 0;
+
+	cpu_buffer = buffer->buffers[cpu];
+	spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
+	/*
+	 * if the tail is on reader_page, oldest time stamp is on the reader
+	 * page
+	 */
+	if (cpu_buffer->tail_page == cpu_buffer->reader_page)
+		bpage = cpu_buffer->reader_page;
+	else
+		bpage = rb_set_head_page(cpu_buffer);
+	ret = bpage->page->time_stamp;
+	spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(ring_buffer_oldest_event_ts);
+
+/**
+ * ring_buffer_bytes_cpu - get the number of bytes consumed in a cpu buffer
+ * @buffer: The ring buffer
+ * @cpu: The per CPU buffer to read from.
+ */
+unsigned long ring_buffer_bytes_cpu(struct ring_buffer *buffer, int cpu)
+{
+	struct ring_buffer_per_cpu *cpu_buffer;
+	unsigned long ret;
+
+	if (!cpumask_test_cpu(cpu, buffer->cpumask))
+		return 0;
+
+	cpu_buffer = buffer->buffers[cpu];
+	ret = local_read(&cpu_buffer->entries_bytes) - cpu_buffer->read_bytes;
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(ring_buffer_bytes_cpu);
+
 /**
  * ring_buffer_entries_cpu - get the number of entries in a cpu buffer
  * @buffer: The ring buffer
@@ -3527,11 +3592,13 @@ rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer)
 	cpu_buffer->reader_page->read = 0;
 
 	local_set(&cpu_buffer->commit_overrun, 0);
+	local_set(&cpu_buffer->entries_bytes, 0);
 	local_set(&cpu_buffer->overrun, 0);
 	local_set(&cpu_buffer->entries, 0);
 	local_set(&cpu_buffer->committing, 0);
 	local_set(&cpu_buffer->commits, 0);
 	cpu_buffer->read = 0;
+	cpu_buffer->read_bytes = 0;
 
 	cpu_buffer->write_stamp = 0;
 	cpu_buffer->read_stamp = 0;
@@ -3918,6 +3985,7 @@ int ring_buffer_read_page(struct ring_buffer *buffer,
 	} else {
 		/* update the entry counter */
 		cpu_buffer->read += rb_page_entries(reader);
+		cpu_buffer->read_bytes += BUF_PAGE_SIZE;
 
 		/* swap the pages */
 		rb_init_page(bpage);
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 01176788c387..b41907000bc6 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -4056,6 +4056,8 @@ tracing_stats_read(struct file *filp, char __user *ubuf,
 	struct trace_array *tr = &global_trace;
 	struct trace_seq *s;
 	unsigned long cnt;
+	unsigned long long t;
+	unsigned long usec_rem;
 
 	s = kmalloc(sizeof(*s), GFP_KERNEL);
 	if (!s)
@@ -4072,6 +4074,17 @@ tracing_stats_read(struct file *filp, char __user *ubuf,
 	cnt = ring_buffer_commit_overrun_cpu(tr->buffer, cpu);
 	trace_seq_printf(s, "commit overrun: %ld\n", cnt);
 
+	cnt = ring_buffer_bytes_cpu(tr->buffer, cpu);
+	trace_seq_printf(s, "bytes: %ld\n", cnt);
+
+	t = ns2usecs(ring_buffer_oldest_event_ts(tr->buffer, cpu));
+	usec_rem = do_div(t, USEC_PER_SEC);
+	trace_seq_printf(s, "oldest event ts: %5llu.%06lu\n", t, usec_rem);
+
+	t = ns2usecs(ring_buffer_time_stamp(tr->buffer, cpu));
+	usec_rem = do_div(t, USEC_PER_SEC);
+	trace_seq_printf(s, "now ts: %5llu.%06lu\n", t, usec_rem);
+
 	count = simple_read_from_buffer(ubuf, count, ppos, s->buffer, s->len);
 
 	kfree(s);
-- 
cgit v1.2.2


From 5f12a761937373d2f9b557d7519e6f1cf738b8f0 Mon Sep 17 00:00:00 2001
From: Mark Rutland <mark.rutland@arm.com>
Date: Thu, 11 Aug 2011 12:31:20 +0100
Subject: perf: provide PMU when initing events

Currently, an event's 'pmu' field is set after pmu::event_init() is
called. This means that pmu::event_init() must figure out which struct
pmu the event was initialised from. This makes it difficult to
consolidate common event initialisation code for similar PMUs, and
very difficult to implement drivers for PMUs which can have multiple
instances (e.g. a USB controller PMU, a GPU PMU, etc).

This patch sets the 'pmu' field before initialising the event, allowing
event init code to identify the struct pmu instance easily. In the
event of failure to initialise an event, the event is destroyed via
kfree() without calling perf_event::destroy(), so this shouldn't
result in bad behaviour even if the destroy field was set before
failure to initialise was noted.

Signed-off-by: Mark Rutland <mark.rutland@arm.com>
Reviewed-by: Will Deacon <will.deacon@arm.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Link: http://lkml.kernel.org/r/1313062280-19123-1-git-send-email-mark.rutland@arm.com
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/events/core.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/events/core.c b/kernel/events/core.c
index b8785e26ee1c..68c8017de969 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -5715,6 +5715,7 @@ struct pmu *perf_init_event(struct perf_event *event)
 	pmu = idr_find(&pmu_idr, event->attr.type);
 	rcu_read_unlock();
 	if (pmu) {
+		event->pmu = pmu;
 		ret = pmu->event_init(event);
 		if (ret)
 			pmu = ERR_PTR(ret);
@@ -5722,6 +5723,7 @@ struct pmu *perf_init_event(struct perf_event *event)
 	}
 
 	list_for_each_entry_rcu(pmu, &pmus, entry) {
+		event->pmu = pmu;
 		ret = pmu->event_init(event);
 		if (!ret)
 			goto unlock;
@@ -5848,8 +5850,6 @@ done:
 		return ERR_PTR(err);
 	}
 
-	event->pmu = pmu;
-
 	if (!event->parent) {
 		if (event->attach_state & PERF_ATTACH_TASK)
 			jump_label_inc(&perf_sched_events);
-- 
cgit v1.2.2


From 6beea0cda8ce71c01354e688e5735c47e331e84f Mon Sep 17 00:00:00 2001
From: Michal Hocko <mhocko@suse.cz>
Date: Wed, 24 Aug 2011 09:37:48 +0200
Subject: nohz: Fix update_ts_time_stat idle accounting

update_ts_time_stat currently updates idle time even if we are in
iowait loop at the moment. The only real users of the idle counter
(via get_cpu_idle_time_us) are CPU governors and they expect to get
cumulative time for both idle and iowait times.
The value (idle_sleeptime) is also printed to userspace by print_cpu
but it prints both idle and iowait times so the idle part is misleading.

Let's clean this up and fix update_ts_time_stat to account both counters
properly and update consumers of idle to consider iowait time as well.
If we do this we might use get_cpu_{idle,iowait}_time_us from other
contexts as well and we will get expected values.

Signed-off-by: Michal Hocko <mhocko@suse.cz>
Cc: Dave Jones <davej@redhat.com>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Alexey Dobriyan <adobriyan@gmail.com>
Link: http://lkml.kernel.org/r/e9c909c221a8da402c4da07e4cd968c3218f8eb1.1314172057.git.mhocko@suse.cz
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/time/tick-sched.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index d5097c44b407..7ab44bca6546 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -159,9 +159,10 @@ update_ts_time_stats(int cpu, struct tick_sched *ts, ktime_t now, u64 *last_upda
 
 	if (ts->idle_active) {
 		delta = ktime_sub(now, ts->idle_entrytime);
-		ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta);
 		if (nr_iowait_cpu(cpu) > 0)
 			ts->iowait_sleeptime = ktime_add(ts->iowait_sleeptime, delta);
+		else
+			ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta);
 		ts->idle_entrytime = now;
 	}
 
@@ -200,8 +201,7 @@ static ktime_t tick_nohz_start_idle(int cpu, struct tick_sched *ts)
  * @last_update_time: variable to store update time in
  *
  * Return the cummulative idle time (since boot) for a given
- * CPU, in microseconds. The idle time returned includes
- * the iowait time (unlike what "top" and co report).
+ * CPU, in microseconds.
  *
  * This time is measured via accounting rather than sampling,
  * and is as accurate as ktime_get() is.
@@ -221,7 +221,7 @@ u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time)
 }
 EXPORT_SYMBOL_GPL(get_cpu_idle_time_us);
 
-/*
+/**
  * get_cpu_iowait_time_us - get the total iowait time of a cpu
  * @cpu: CPU number to query
  * @last_update_time: variable to store update time in
-- 
cgit v1.2.2


From 09a1d34f8535ecf9a347ea76f7597730c2bc0c8d Mon Sep 17 00:00:00 2001
From: Michal Hocko <mhocko@suse.cz>
Date: Wed, 24 Aug 2011 09:39:30 +0200
Subject: nohz: Make idle/iowait counter update conditional

get_cpu_{idle,iowait}_time_us update idle/iowait counters
unconditionally if the given CPU is in the idle loop.

This doesn't work well outside of CPU governors which are singletons
so nobody (except for IRQ) can race with them.

We will need to use both functions from /proc/stat handler to properly
handle nohz idle/iowait times.

Make the update depend on a non NULL last_update_time argument.

Signed-off-by: Michal Hocko <mhocko@suse.cz>
Cc: Dave Jones <davej@redhat.com>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Alexey Dobriyan <adobriyan@gmail.com>
Link: http://lkml.kernel.org/r/11f23179472635ce52e78921d47a20216b872f23.1314172057.git.mhocko@suse.cz
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/time/tick-sched.c | 41 +++++++++++++++++++++++++++++++++++------
 1 file changed, 35 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 7ab44bca6546..664c4a365439 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -198,7 +198,8 @@ static ktime_t tick_nohz_start_idle(int cpu, struct tick_sched *ts)
 /**
  * get_cpu_idle_time_us - get the total idle time of a cpu
  * @cpu: CPU number to query
- * @last_update_time: variable to store update time in
+ * @last_update_time: variable to store update time in. Do not update
+ * counters if NULL.
  *
  * Return the cummulative idle time (since boot) for a given
  * CPU, in microseconds.
@@ -211,20 +212,35 @@ static ktime_t tick_nohz_start_idle(int cpu, struct tick_sched *ts)
 u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time)
 {
 	struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
+	ktime_t now, idle;
 
 	if (!tick_nohz_enabled)
 		return -1;
 
-	update_ts_time_stats(cpu, ts, ktime_get(), last_update_time);
+	now = ktime_get();
+	if (last_update_time) {
+		update_ts_time_stats(cpu, ts, now, last_update_time);
+		idle = ts->idle_sleeptime;
+	} else {
+		if (ts->idle_active && !nr_iowait_cpu(cpu)) {
+			ktime_t delta = ktime_sub(now, ts->idle_entrytime);
+
+			idle = ktime_add(ts->idle_sleeptime, delta);
+		} else {
+			idle = ts->idle_sleeptime;
+		}
+	}
+
+	return ktime_to_us(idle);
 
-	return ktime_to_us(ts->idle_sleeptime);
 }
 EXPORT_SYMBOL_GPL(get_cpu_idle_time_us);
 
 /**
  * get_cpu_iowait_time_us - get the total iowait time of a cpu
  * @cpu: CPU number to query
- * @last_update_time: variable to store update time in
+ * @last_update_time: variable to store update time in. Do not update
+ * counters if NULL.
  *
  * Return the cummulative iowait time (since boot) for a given
  * CPU, in microseconds.
@@ -237,13 +253,26 @@ EXPORT_SYMBOL_GPL(get_cpu_idle_time_us);
 u64 get_cpu_iowait_time_us(int cpu, u64 *last_update_time)
 {
 	struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
+	ktime_t now, iowait;
 
 	if (!tick_nohz_enabled)
 		return -1;
 
-	update_ts_time_stats(cpu, ts, ktime_get(), last_update_time);
+	now = ktime_get();
+	if (last_update_time) {
+		update_ts_time_stats(cpu, ts, now, last_update_time);
+		iowait = ts->iowait_sleeptime;
+	} else {
+		if (ts->idle_active && nr_iowait_cpu(cpu) > 0) {
+			ktime_t delta = ktime_sub(now, ts->idle_entrytime);
+
+			iowait = ktime_add(ts->iowait_sleeptime, delta);
+		} else {
+			iowait = ts->iowait_sleeptime;
+		}
+	}
 
-	return ktime_to_us(ts->iowait_sleeptime);
+	return ktime_to_us(iowait);
 }
 EXPORT_SYMBOL_GPL(get_cpu_iowait_time_us);
 
-- 
cgit v1.2.2


From 29c158e81c733ac7d6a75c5ee929f34fb9f92983 Mon Sep 17 00:00:00 2001
From: Heiko Carstens <heiko.carstens@de.ibm.com>
Date: Tue, 23 Aug 2011 13:20:46 +0200
Subject: nohz: Remove "Switched to NOHz mode" debugging messages

When performing cpu hotplug tests the kernel printk log buffer gets flooded
with pointless "Switched to NOHz mode..." messages. Especially when afterwards
analyzing a dump this might have removed more interesting stuff out of the
buffer.
Assuming that switching to NOHz mode simply works just remove the printk.

Signed-off-by: Heiko Carstens <heiko.carstens@de.ibm.com>
Link: http://lkml.kernel.org/r/20110823112046.GB2540@osiris.boeblingen.de.ibm.com
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/time/tick-sched.c | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 664c4a365439..7e2e0817cbfb 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -669,8 +669,6 @@ static void tick_nohz_switch_to_nohz(void)
 		next = ktime_add(next, tick_period);
 	}
 	local_irq_enable();
-
-	printk(KERN_INFO "Switched to NOHz mode on CPU #%d\n", smp_processor_id());
 }
 
 /*
@@ -822,10 +820,8 @@ void tick_setup_sched_timer(void)
 	}
 
 #ifdef CONFIG_NO_HZ
-	if (tick_nohz_enabled) {
+	if (tick_nohz_enabled)
 		ts->nohz_mode = NOHZ_MODE_HIGHRES;
-		printk(KERN_INFO "Switched to NOHz mode on CPU #%d\n", smp_processor_id());
-	}
 #endif
 }
 #endif /* HIGH_RES_TIMERS */
-- 
cgit v1.2.2


From d1748302f70be7469809809283fe164156a34231 Mon Sep 17 00:00:00 2001
From: Martin Schwidefsky <schwidefsky@de.ibm.com>
Date: Tue, 23 Aug 2011 15:29:42 +0200
Subject: clockevents: Make minimum delay adjustments configurable

The automatic increase of the min_delta_ns of a clockevents device
should be done in the clockevents code as the minimum delay is an
attribute of the clockevents device.

In addition not all architectures want the automatic adjustment, on a
massively virtualized system it can happen that the programming of a
clock event fails several times in a row because the virtual cpu has
been rescheduled quickly enough. In that case the minimum delay will
erroneously be increased with no way back. The new config symbol
GENERIC_CLOCKEVENTS_MIN_ADJUST is used to enable the automatic
adjustment. The config option is selected only for x86.

Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: john stultz <johnstul@us.ibm.com>
Link: http://lkml.kernel.org/r/20110823133142.494157493@de.ibm.com
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/time/Kconfig          |   2 +
 kernel/time/clockevents.c    | 125 ++++++++++++++++++++++++++++++++++++++-----
 kernel/time/tick-broadcast.c |   4 +-
 kernel/time/tick-common.c    |   4 +-
 kernel/time/tick-internal.h  |   2 -
 kernel/time/tick-oneshot.c   |  77 ++------------------------
 6 files changed, 121 insertions(+), 93 deletions(-)

(limited to 'kernel')

diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig
index f06a8a365648..b26c2228fe92 100644
--- a/kernel/time/Kconfig
+++ b/kernel/time/Kconfig
@@ -27,3 +27,5 @@ config GENERIC_CLOCKEVENTS_BUILD
 	default y
 	depends on GENERIC_CLOCKEVENTS || GENERIC_CLOCKEVENTS_MIGR
 
+config GENERIC_CLOCKEVENTS_MIN_ADJUST
+	bool
diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c
index e4c699dfa4e8..713ef94eceef 100644
--- a/kernel/time/clockevents.c
+++ b/kernel/time/clockevents.c
@@ -94,42 +94,139 @@ void clockevents_shutdown(struct clock_event_device *dev)
 	dev->next_event.tv64 = KTIME_MAX;
 }
 
+#ifdef CONFIG_GENERIC_CLOCKEVENTS_MIN_ADJUST
+
+/* Limit min_delta to a jiffie */
+#define MIN_DELTA_LIMIT		(NSEC_PER_SEC / HZ)
+
+/**
+ * clockevents_increase_min_delta - raise minimum delta of a clock event device
+ * @dev:       device to increase the minimum delta
+ *
+ * Returns 0 on success, -ETIME when the minimum delta reached the limit.
+ */
+static int clockevents_increase_min_delta(struct clock_event_device *dev)
+{
+	/* Nothing to do if we already reached the limit */
+	if (dev->min_delta_ns >= MIN_DELTA_LIMIT) {
+		printk(KERN_WARNING "CE: Reprogramming failure. Giving up\n");
+		dev->next_event.tv64 = KTIME_MAX;
+		return -ETIME;
+	}
+
+	if (dev->min_delta_ns < 5000)
+		dev->min_delta_ns = 5000;
+	else
+		dev->min_delta_ns += dev->min_delta_ns >> 1;
+
+	if (dev->min_delta_ns > MIN_DELTA_LIMIT)
+		dev->min_delta_ns = MIN_DELTA_LIMIT;
+
+	printk(KERN_WARNING "CE: %s increased min_delta_ns to %llu nsec\n",
+	       dev->name ? dev->name : "?",
+	       (unsigned long long) dev->min_delta_ns);
+	return 0;
+}
+
+/**
+ * clockevents_program_min_delta - Set clock event device to the minimum delay.
+ * @dev:	device to program
+ *
+ * Returns 0 on success, -ETIME when the retry loop failed.
+ */
+static int clockevents_program_min_delta(struct clock_event_device *dev)
+{
+	unsigned long long clc;
+	int64_t delta;
+	int i;
+
+	for (i = 0;;) {
+		delta = dev->min_delta_ns;
+		dev->next_event = ktime_add_ns(ktime_get(), delta);
+
+		if (dev->mode == CLOCK_EVT_MODE_SHUTDOWN)
+			return 0;
+
+		dev->retries++;
+		clc = ((unsigned long long) delta * dev->mult) >> dev->shift;
+		if (dev->set_next_event((unsigned long) clc, dev) == 0)
+			return 0;
+
+		if (++i > 2) {
+			/*
+			 * We tried 3 times to program the device with the
+			 * given min_delta_ns. Try to increase the minimum
+			 * delta, if that fails as well get out of here.
+			 */
+			if (clockevents_increase_min_delta(dev))
+				return -ETIME;
+			i = 0;
+		}
+	}
+}
+
+#else  /* CONFIG_GENERIC_CLOCKEVENTS_MIN_ADJUST */
+
+/**
+ * clockevents_program_min_delta - Set clock event device to the minimum delay.
+ * @dev:	device to program
+ *
+ * Returns 0 on success, -ETIME when the retry loop failed.
+ */
+static int clockevents_program_min_delta(struct clock_event_device *dev)
+{
+	unsigned long long clc;
+	int64_t delta;
+
+	delta = dev->min_delta_ns;
+	dev->next_event = ktime_add_ns(ktime_get(), delta);
+
+	if (dev->mode == CLOCK_EVT_MODE_SHUTDOWN)
+		return 0;
+
+	dev->retries++;
+	clc = ((unsigned long long) delta * dev->mult) >> dev->shift;
+	return dev->set_next_event((unsigned long) clc, dev);
+}
+
+#endif /* CONFIG_GENERIC_CLOCKEVENTS_MIN_ADJUST */
+
 /**
  * clockevents_program_event - Reprogram the clock event device.
+ * @dev:	device to program
  * @expires:	absolute expiry time (monotonic clock)
+ * @force:	program minimum delay if expires can not be set
  *
  * Returns 0 on success, -ETIME when the event is in the past.
  */
 int clockevents_program_event(struct clock_event_device *dev, ktime_t expires,
-			      ktime_t now)
+			      bool force)
 {
 	unsigned long long clc;
 	int64_t delta;
+	int rc;
 
 	if (unlikely(expires.tv64 < 0)) {
 		WARN_ON_ONCE(1);
 		return -ETIME;
 	}
 
-	delta = ktime_to_ns(ktime_sub(expires, now));
-
-	if (delta <= 0)
-		return -ETIME;
-
 	dev->next_event = expires;
 
 	if (dev->mode == CLOCK_EVT_MODE_SHUTDOWN)
 		return 0;
 
-	if (delta > dev->max_delta_ns)
-		delta = dev->max_delta_ns;
-	if (delta < dev->min_delta_ns)
-		delta = dev->min_delta_ns;
+	delta = ktime_to_ns(ktime_sub(expires, ktime_get()));
+	if (delta <= 0)
+		return force ? clockevents_program_min_delta(dev) : -ETIME;
 
-	clc = delta * dev->mult;
-	clc >>= dev->shift;
+	delta = min(delta, (int64_t) dev->max_delta_ns);
+	delta = max(delta, (int64_t) dev->min_delta_ns);
 
-	return dev->set_next_event((unsigned long) clc, dev);
+	clc = ((unsigned long long) delta * dev->mult) >> dev->shift;
+	rc = dev->set_next_event((unsigned long) clc, dev);
+
+	return (rc && force) ? clockevents_program_min_delta(dev) : rc;
 }
 
 /**
@@ -258,7 +355,7 @@ int clockevents_update_freq(struct clock_event_device *dev, u32 freq)
 	if (dev->mode != CLOCK_EVT_MODE_ONESHOT)
 		return 0;
 
-	return clockevents_program_event(dev, dev->next_event, ktime_get());
+	return clockevents_program_event(dev, dev->next_event, false);
 }
 
 /*
diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c
index c7218d132738..f954282d9a82 100644
--- a/kernel/time/tick-broadcast.c
+++ b/kernel/time/tick-broadcast.c
@@ -194,7 +194,7 @@ static void tick_handle_periodic_broadcast(struct clock_event_device *dev)
 	for (next = dev->next_event; ;) {
 		next = ktime_add(next, tick_period);
 
-		if (!clockevents_program_event(dev, next, ktime_get()))
+		if (!clockevents_program_event(dev, next, false))
 			return;
 		tick_do_periodic_broadcast();
 	}
@@ -373,7 +373,7 @@ static int tick_broadcast_set_event(ktime_t expires, int force)
 {
 	struct clock_event_device *bc = tick_broadcast_device.evtdev;
 
-	return tick_dev_program_event(bc, expires, force);
+	return clockevents_program_event(bc, expires, force);
 }
 
 int tick_resume_broadcast_oneshot(struct clock_event_device *bc)
diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c
index 119528de8235..da6c9ecad4e4 100644
--- a/kernel/time/tick-common.c
+++ b/kernel/time/tick-common.c
@@ -94,7 +94,7 @@ void tick_handle_periodic(struct clock_event_device *dev)
 	 */
 	next = ktime_add(dev->next_event, tick_period);
 	for (;;) {
-		if (!clockevents_program_event(dev, next, ktime_get()))
+		if (!clockevents_program_event(dev, next, false))
 			return;
 		/*
 		 * Have to be careful here. If we're in oneshot mode,
@@ -137,7 +137,7 @@ void tick_setup_periodic(struct clock_event_device *dev, int broadcast)
 		clockevents_set_mode(dev, CLOCK_EVT_MODE_ONESHOT);
 
 		for (;;) {
-			if (!clockevents_program_event(dev, next, ktime_get()))
+			if (!clockevents_program_event(dev, next, false))
 				return;
 			next = ktime_add(next, tick_period);
 		}
diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h
index 1009b06d6f89..4e265b901fed 100644
--- a/kernel/time/tick-internal.h
+++ b/kernel/time/tick-internal.h
@@ -26,8 +26,6 @@ extern void clockevents_shutdown(struct clock_event_device *dev);
 extern void tick_setup_oneshot(struct clock_event_device *newdev,
 			       void (*handler)(struct clock_event_device *),
 			       ktime_t nextevt);
-extern int tick_dev_program_event(struct clock_event_device *dev,
-				  ktime_t expires, int force);
 extern int tick_program_event(ktime_t expires, int force);
 extern void tick_oneshot_notify(void);
 extern int tick_switch_to_oneshot(void (*handler)(struct clock_event_device *));
diff --git a/kernel/time/tick-oneshot.c b/kernel/time/tick-oneshot.c
index 2d04411a5f05..824109060a33 100644
--- a/kernel/time/tick-oneshot.c
+++ b/kernel/time/tick-oneshot.c
@@ -21,74 +21,6 @@
 
 #include "tick-internal.h"
 
-/* Limit min_delta to a jiffie */
-#define MIN_DELTA_LIMIT		(NSEC_PER_SEC / HZ)
-
-static int tick_increase_min_delta(struct clock_event_device *dev)
-{
-	/* Nothing to do if we already reached the limit */
-	if (dev->min_delta_ns >= MIN_DELTA_LIMIT)
-		return -ETIME;
-
-	if (dev->min_delta_ns < 5000)
-		dev->min_delta_ns = 5000;
-	else
-		dev->min_delta_ns += dev->min_delta_ns >> 1;
-
-	if (dev->min_delta_ns > MIN_DELTA_LIMIT)
-		dev->min_delta_ns = MIN_DELTA_LIMIT;
-
-	printk(KERN_WARNING "CE: %s increased min_delta_ns to %llu nsec\n",
-	       dev->name ? dev->name : "?",
-	       (unsigned long long) dev->min_delta_ns);
-	return 0;
-}
-
-/**
- * tick_program_event internal worker function
- */
-int tick_dev_program_event(struct clock_event_device *dev, ktime_t expires,
-			   int force)
-{
-	ktime_t now = ktime_get();
-	int i;
-
-	for (i = 0;;) {
-		int ret = clockevents_program_event(dev, expires, now);
-
-		if (!ret || !force)
-			return ret;
-
-		dev->retries++;
-		/*
-		 * We tried 3 times to program the device with the given
-		 * min_delta_ns. If that's not working then we increase it
-		 * and emit a warning.
-		 */
-		if (++i > 2) {
-			/* Increase the min. delta and try again */
-			if (tick_increase_min_delta(dev)) {
-				/*
-				 * Get out of the loop if min_delta_ns
-				 * hit the limit already. That's
-				 * better than staying here forever.
-				 *
-				 * We clear next_event so we have a
-				 * chance that the box survives.
-				 */
-				printk(KERN_WARNING
-				       "CE: Reprogramming failure. Giving up\n");
-				dev->next_event.tv64 = KTIME_MAX;
-				return -ETIME;
-			}
-			i = 0;
-		}
-
-		now = ktime_get();
-		expires = ktime_add_ns(now, dev->min_delta_ns);
-	}
-}
-
 /**
  * tick_program_event
  */
@@ -96,7 +28,7 @@ int tick_program_event(ktime_t expires, int force)
 {
 	struct clock_event_device *dev = __this_cpu_read(tick_cpu_device.evtdev);
 
-	return tick_dev_program_event(dev, expires, force);
+	return clockevents_program_event(dev, expires, force);
 }
 
 /**
@@ -104,11 +36,10 @@ int tick_program_event(ktime_t expires, int force)
  */
 void tick_resume_oneshot(void)
 {
-	struct tick_device *td = &__get_cpu_var(tick_cpu_device);
-	struct clock_event_device *dev = td->evtdev;
+	struct clock_event_device *dev = __this_cpu_read(tick_cpu_device.evtdev);
 
 	clockevents_set_mode(dev, CLOCK_EVT_MODE_ONESHOT);
-	tick_program_event(ktime_get(), 1);
+	clockevents_program_event(dev, ktime_get(), true);
 }
 
 /**
@@ -120,7 +51,7 @@ void tick_setup_oneshot(struct clock_event_device *newdev,
 {
 	newdev->event_handler = handler;
 	clockevents_set_mode(newdev, CLOCK_EVT_MODE_ONESHOT);
-	tick_dev_program_event(newdev, next_event, 1);
+	clockevents_program_event(newdev, next_event, true);
 }
 
 /**
-- 
cgit v1.2.2


From 65516f8a7c2028381f0dae4c16ddb621c96158cc Mon Sep 17 00:00:00 2001
From: Martin Schwidefsky <schwidefsky@de.ibm.com>
Date: Tue, 23 Aug 2011 15:29:43 +0200
Subject: clockevents: Add direct ktime programming function

There is at least one architecture (s390) with a sane clockevent device
that can be programmed with the equivalent of a ktime. No need to create
a delta against the current time, the ktime can be used directly.

A new clock device function 'set_next_ktime' is introduced that is called
with the unmodified ktime for the timer if the clock event device has the
CLOCK_EVT_FEAT_KTIME bit set.

Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: john stultz <johnstul@us.ibm.com>
Link: http://lkml.kernel.org/r/20110823133142.815350967@de.ibm.com
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/time/clockevents.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'kernel')

diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c
index 713ef94eceef..1ecd6ba36d6c 100644
--- a/kernel/time/clockevents.c
+++ b/kernel/time/clockevents.c
@@ -216,6 +216,10 @@ int clockevents_program_event(struct clock_event_device *dev, ktime_t expires,
 	if (dev->mode == CLOCK_EVT_MODE_SHUTDOWN)
 		return 0;
 
+	/* Shortcut for clockevent devices that can deal with ktime. */
+	if (dev->features & CLOCK_EVT_FEAT_KTIME)
+		return dev->set_next_ktime(expires, dev);
+
 	delta = ktime_to_ns(ktime_sub(expires, ktime_get()));
 	if (delta <= 0)
 		return force ? clockevents_program_min_delta(dev) : -ETIME;
-- 
cgit v1.2.2


From e8abccb719377af63cb0f1fed289db405e3def16 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Thu, 1 Sep 2011 12:42:04 +0200
Subject: posix-cpu-timers: Cure SMP accounting oddities

David reported:

  Attached below is a watered-down version of rt/tst-cpuclock2.c from
  GLIBC.  Just build it with "gcc -o test test.c -lpthread -lrt" or
  similar.

  Run it several times, and you will see cases where the main thread
  will measure a process clock difference before and after the nanosleep
  which is smaller than the cpu-burner thread's individual thread clock
  difference.  This doesn't make any sense since the cpu-burner thread
  is part of the top-level process's thread group.

  I've reproduced this on both x86-64 and sparc64 (using both 32-bit and
  64-bit binaries).

  For example:

  [davem@boricha build-x86_64-linux]$ ./test
  process: before(0.001221967) after(0.498624371) diff(497402404)
  thread:  before(0.000081692) after(0.498316431) diff(498234739)
  self:    before(0.001223521) after(0.001240219) diff(16698)
  [davem@boricha build-x86_64-linux]$

  The diff of 'process' should always be >= the diff of 'thread'.

  I make sure to wrap the 'thread' clock measurements the most tightly
  around the nanosleep() call, and that the 'process' clock measurements
  are the outer-most ones.

  ---
  #include <unistd.h>
  #include <stdio.h>
  #include <stdlib.h>
  #include <time.h>
  #include <fcntl.h>
  #include <string.h>
  #include <errno.h>
  #include <pthread.h>

  static pthread_barrier_t barrier;

  static void *chew_cpu(void *arg)
  {
	  pthread_barrier_wait(&barrier);
	  while (1)
		  __asm__ __volatile__("" : : : "memory");
	  return NULL;
  }

  int main(void)
  {
	  clockid_t process_clock, my_thread_clock, th_clock;
	  struct timespec process_before, process_after;
	  struct timespec me_before, me_after;
	  struct timespec th_before, th_after;
	  struct timespec sleeptime;
	  unsigned long diff;
	  pthread_t th;
	  int err;

	  err = clock_getcpuclockid(0, &process_clock);
	  if (err)
		  return 1;

	  err = pthread_getcpuclockid(pthread_self(), &my_thread_clock);
	  if (err)
		  return 1;

	  pthread_barrier_init(&barrier, NULL, 2);
	  err = pthread_create(&th, NULL, chew_cpu, NULL);
	  if (err)
		  return 1;

	  err = pthread_getcpuclockid(th, &th_clock);
	  if (err)
		  return 1;

	  pthread_barrier_wait(&barrier);

	  err = clock_gettime(process_clock, &process_before);
	  if (err)
		  return 1;

	  err = clock_gettime(my_thread_clock, &me_before);
	  if (err)
		  return 1;

	  err = clock_gettime(th_clock, &th_before);
	  if (err)
		  return 1;

	  sleeptime.tv_sec = 0;
	  sleeptime.tv_nsec = 500000000;
	  nanosleep(&sleeptime, NULL);

	  err = clock_gettime(th_clock, &th_after);
	  if (err)
		  return 1;

	  err = clock_gettime(my_thread_clock, &me_after);
	  if (err)
		  return 1;

	  err = clock_gettime(process_clock, &process_after);
	  if (err)
		  return 1;

	  diff = process_after.tv_nsec - process_before.tv_nsec;
	  printf("process: before(%lu.%.9lu) after(%lu.%.9lu) diff(%lu)\n",
		 process_before.tv_sec, process_before.tv_nsec,
		 process_after.tv_sec, process_after.tv_nsec, diff);
	  diff = th_after.tv_nsec - th_before.tv_nsec;
	  printf("thread:  before(%lu.%.9lu) after(%lu.%.9lu) diff(%lu)\n",
		 th_before.tv_sec, th_before.tv_nsec,
		 th_after.tv_sec, th_after.tv_nsec, diff);
	  diff = me_after.tv_nsec - me_before.tv_nsec;
	  printf("self:    before(%lu.%.9lu) after(%lu.%.9lu) diff(%lu)\n",
		 me_before.tv_sec, me_before.tv_nsec,
		 me_after.tv_sec, me_after.tv_nsec, diff);

	  return 0;
  }

This is due to us using p->se.sum_exec_runtime in
thread_group_cputime() where we iterate the thread group and sum all
data. This does not take time since the last schedule operation (tick
or otherwise) into account. We can cure this by using
task_sched_runtime() at the cost of having to take locks.

This also means we can (and must) do away with
thread_group_sched_runtime() since the modified thread_group_cputime()
is now more accurate and would deadlock when called from
thread_group_sched_runtime().

Reported-by: David Miller <davem@davemloft.net>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Link: http://lkml.kernel.org/r/1314874459.7945.22.camel@twins
Cc: stable@kernel.org
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/posix-cpu-timers.c |  5 +++--
 kernel/sched.c            | 24 ------------------------
 2 files changed, 3 insertions(+), 26 deletions(-)

(limited to 'kernel')

diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c
index 58f405b581e7..c8008dd58ef2 100644
--- a/kernel/posix-cpu-timers.c
+++ b/kernel/posix-cpu-timers.c
@@ -250,7 +250,7 @@ void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times)
 	do {
 		times->utime = cputime_add(times->utime, t->utime);
 		times->stime = cputime_add(times->stime, t->stime);
-		times->sum_exec_runtime += t->se.sum_exec_runtime;
+		times->sum_exec_runtime += task_sched_runtime(t);
 	} while_each_thread(tsk, t);
 out:
 	rcu_read_unlock();
@@ -312,7 +312,8 @@ static int cpu_clock_sample_group(const clockid_t which_clock,
 		cpu->cpu = cputime.utime;
 		break;
 	case CPUCLOCK_SCHED:
-		cpu->sched = thread_group_sched_runtime(p);
+		thread_group_cputime(p, &cputime);
+		cpu->sched = cputime.sum_exec_runtime;
 		break;
 	}
 	return 0;
diff --git a/kernel/sched.c b/kernel/sched.c
index ccacdbdecf45..e1290ecee3c2 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -3724,30 +3724,6 @@ unsigned long long task_sched_runtime(struct task_struct *p)
 	return ns;
 }
 
-/*
- * Return sum_exec_runtime for the thread group.
- * In case the task is currently running, return the sum plus current's
- * pending runtime that have not been accounted yet.
- *
- * Note that the thread group might have other running tasks as well,
- * so the return value not includes other pending runtime that other
- * running tasks might have.
- */
-unsigned long long thread_group_sched_runtime(struct task_struct *p)
-{
-	struct task_cputime totals;
-	unsigned long flags;
-	struct rq *rq;
-	u64 ns;
-
-	rq = task_rq_lock(p, &flags);
-	thread_group_cputime(p, &totals);
-	ns = totals.sum_exec_runtime + do_task_delta_exec(p, rq);
-	task_rq_unlock(rq, p, &flags);
-
-	return ns;
-}
-
 /*
  * Account user cpu time to a process.
  * @p: the process that the cpu time gets accounted to
-- 
cgit v1.2.2


From 60f96b41f71d2a13d1c0a457b8b77958f77142d1 Mon Sep 17 00:00:00 2001
From: Santosh Shilimkar <santosh.shilimkar@ti.com>
Date: Fri, 9 Sep 2011 13:59:35 +0530
Subject: genirq: Add IRQCHIP_SKIP_SET_WAKE flag

Some irq chips need the irq_set_wake() functionality, but do not
require a irq_set_wake() callback. Instead of forcing an empty
callback to be implemented add a flag which notes this fact. Check for
the flag in set_irq_wake_real() and return success when set.

Signed-off-by: Santosh Shilimkar <santosh.shilimkar@ti.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/irq/manage.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'kernel')

diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 9b956fa20308..7e1a3ed1e61a 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -467,6 +467,9 @@ static int set_irq_wake_real(unsigned int irq, unsigned int on)
 	struct irq_desc *desc = irq_to_desc(irq);
 	int ret = -ENXIO;
 
+	if (irq_desc_get_chip(desc)->flags &  IRQCHIP_SKIP_SET_WAKE)
+		return 0;
+
 	if (desc->irq_data.chip->irq_set_wake)
 		ret = desc->irq_data.chip->irq_set_wake(&desc->irq_data, on);
 
-- 
cgit v1.2.2


From 0fa914c632210e09940f0bf5b85d2329b75e04e0 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Wed, 8 Jun 2011 09:58:38 +0200
Subject: rtmutex: Cleanup the debug code

Use the existing lock debugging macros.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/rtmutex-debug.c | 75 +++++++-------------------------------------------
 1 file changed, 10 insertions(+), 65 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rtmutex-debug.c b/kernel/rtmutex-debug.c
index 3c7cbc2c33be..cb1ced6967ad 100644
--- a/kernel/rtmutex-debug.c
+++ b/kernel/rtmutex-debug.c
@@ -29,61 +29,6 @@
 
 #include "rtmutex_common.h"
 
-# define TRACE_WARN_ON(x)			WARN_ON(x)
-# define TRACE_BUG_ON(x)			BUG_ON(x)
-
-# define TRACE_OFF()						\
-do {								\
-	if (rt_trace_on) {					\
-		rt_trace_on = 0;				\
-		console_verbose();				\
-		if (raw_spin_is_locked(&current->pi_lock))	\
-			raw_spin_unlock(&current->pi_lock);	\
-	}							\
-} while (0)
-
-# define TRACE_OFF_NOLOCK()					\
-do {								\
-	if (rt_trace_on) {					\
-		rt_trace_on = 0;				\
-		console_verbose();				\
-	}							\
-} while (0)
-
-# define TRACE_BUG_LOCKED()			\
-do {						\
-	TRACE_OFF();				\
-	BUG();					\
-} while (0)
-
-# define TRACE_WARN_ON_LOCKED(c)		\
-do {						\
-	if (unlikely(c)) {			\
-		TRACE_OFF();			\
-		WARN_ON(1);			\
-	}					\
-} while (0)
-
-# define TRACE_BUG_ON_LOCKED(c)			\
-do {						\
-	if (unlikely(c))			\
-		TRACE_BUG_LOCKED();		\
-} while (0)
-
-#ifdef CONFIG_SMP
-# define SMP_TRACE_BUG_ON_LOCKED(c)	TRACE_BUG_ON_LOCKED(c)
-#else
-# define SMP_TRACE_BUG_ON_LOCKED(c)	do { } while (0)
-#endif
-
-/*
- * deadlock detection flag. We turn it off when we detect
- * the first problem because we dont want to recurse back
- * into the tracing code when doing error printk or
- * executing a BUG():
- */
-static int rt_trace_on = 1;
-
 static void printk_task(struct task_struct *p)
 {
 	if (p)
@@ -111,8 +56,8 @@ static void printk_lock(struct rt_mutex *lock, int print_owner)
 
 void rt_mutex_debug_task_free(struct task_struct *task)
 {
-	WARN_ON(!plist_head_empty(&task->pi_waiters));
-	WARN_ON(task->pi_blocked_on);
+	DEBUG_LOCKS_WARN_ON(!plist_head_empty(&task->pi_waiters));
+	DEBUG_LOCKS_WARN_ON(task->pi_blocked_on);
 }
 
 /*
@@ -125,7 +70,7 @@ void debug_rt_mutex_deadlock(int detect, struct rt_mutex_waiter *act_waiter,
 {
 	struct task_struct *task;
 
-	if (!rt_trace_on || detect || !act_waiter)
+	if (!debug_locks || detect || !act_waiter)
 		return;
 
 	task = rt_mutex_owner(act_waiter->lock);
@@ -139,7 +84,7 @@ void debug_rt_mutex_print_deadlock(struct rt_mutex_waiter *waiter)
 {
 	struct task_struct *task;
 
-	if (!waiter->deadlock_lock || !rt_trace_on)
+	if (!waiter->deadlock_lock || !debug_locks)
 		return;
 
 	rcu_read_lock();
@@ -149,7 +94,8 @@ void debug_rt_mutex_print_deadlock(struct rt_mutex_waiter *waiter)
 		return;
 	}
 
-	TRACE_OFF_NOLOCK();
+	if (!debug_locks_off())
+		return;
 
 	printk("\n============================================\n");
 	printk(  "[ BUG: circular locking deadlock detected! ]\n");
@@ -180,7 +126,6 @@ void debug_rt_mutex_print_deadlock(struct rt_mutex_waiter *waiter)
 
 	printk("[ turning off deadlock detection."
 	       "Please report this trace. ]\n\n");
-	local_irq_disable();
 }
 
 void debug_rt_mutex_lock(struct rt_mutex *lock)
@@ -189,7 +134,7 @@ void debug_rt_mutex_lock(struct rt_mutex *lock)
 
 void debug_rt_mutex_unlock(struct rt_mutex *lock)
 {
-	TRACE_WARN_ON_LOCKED(rt_mutex_owner(lock) != current);
+	DEBUG_LOCKS_WARN_ON(rt_mutex_owner(lock) != current);
 }
 
 void
@@ -199,7 +144,7 @@ debug_rt_mutex_proxy_lock(struct rt_mutex *lock, struct task_struct *powner)
 
 void debug_rt_mutex_proxy_unlock(struct rt_mutex *lock)
 {
-	TRACE_WARN_ON_LOCKED(!rt_mutex_owner(lock));
+	DEBUG_LOCKS_WARN_ON(!rt_mutex_owner(lock));
 }
 
 void debug_rt_mutex_init_waiter(struct rt_mutex_waiter *waiter)
@@ -213,8 +158,8 @@ void debug_rt_mutex_init_waiter(struct rt_mutex_waiter *waiter)
 void debug_rt_mutex_free_waiter(struct rt_mutex_waiter *waiter)
 {
 	put_pid(waiter->deadlock_task_pid);
-	TRACE_WARN_ON(!plist_node_empty(&waiter->list_entry));
-	TRACE_WARN_ON(!plist_node_empty(&waiter->pi_list_entry));
+	DEBUG_LOCKS_WARN_ON(!plist_node_empty(&waiter->list_entry));
+	DEBUG_LOCKS_WARN_ON(!plist_node_empty(&waiter->pi_list_entry));
 	memset(waiter, 0x22, sizeof(*waiter));
 }
 
-- 
cgit v1.2.2


From 9fb60336253edf73dedc527b2aa2bf32eae0d6da Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Mon, 12 Sep 2011 13:32:23 +0200
Subject: clocksource: Make watchdog reset lockless

KGDB needs to trylock watchdog_lock when trying to reset the
clocksource watchdog after the system has been stopped to avoid a
potential deadlock. When the trylock fails TSC usually becomes
unstable.

We can be more clever by using an atomic counter and checking it in
the clocksource_watchdog callback. We restart the watchdog whenever
the counter is > 0 and only decrement the counter when we ran through
a full update cycle.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: John Stultz <johnstul@us.ibm.com>
Acked-by: Jason Wessel <jason.wessel@windriver.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Link: http://lkml.kernel.org/r/alpine.LFD.2.02.1109121326280.2723@ionos
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/time/clocksource.c | 38 ++++++++++++++++++--------------------
 1 file changed, 18 insertions(+), 20 deletions(-)

(limited to 'kernel')

diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index e0980f0d9a0a..cf52fda2e096 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -186,6 +186,7 @@ static struct timer_list watchdog_timer;
 static DECLARE_WORK(watchdog_work, clocksource_watchdog_work);
 static DEFINE_SPINLOCK(watchdog_lock);
 static int watchdog_running;
+static atomic_t watchdog_reset_pending;
 
 static int clocksource_watchdog_kthread(void *data);
 static void __clocksource_change_rating(struct clocksource *cs, int rating);
@@ -247,12 +248,14 @@ static void clocksource_watchdog(unsigned long data)
 	struct clocksource *cs;
 	cycle_t csnow, wdnow;
 	int64_t wd_nsec, cs_nsec;
-	int next_cpu;
+	int next_cpu, reset_pending;
 
 	spin_lock(&watchdog_lock);
 	if (!watchdog_running)
 		goto out;
 
+	reset_pending = atomic_read(&watchdog_reset_pending);
+
 	list_for_each_entry(cs, &watchdog_list, wd_list) {
 
 		/* Clocksource already marked unstable? */
@@ -268,7 +271,8 @@ static void clocksource_watchdog(unsigned long data)
 		local_irq_enable();
 
 		/* Clocksource initialized ? */
-		if (!(cs->flags & CLOCK_SOURCE_WATCHDOG)) {
+		if (!(cs->flags & CLOCK_SOURCE_WATCHDOG) ||
+		    atomic_read(&watchdog_reset_pending)) {
 			cs->flags |= CLOCK_SOURCE_WATCHDOG;
 			cs->wd_last = wdnow;
 			cs->cs_last = csnow;
@@ -283,8 +287,11 @@ static void clocksource_watchdog(unsigned long data)
 		cs->cs_last = csnow;
 		cs->wd_last = wdnow;
 
+		if (atomic_read(&watchdog_reset_pending))
+			continue;
+
 		/* Check the deviation from the watchdog clocksource. */
-		if (abs(cs_nsec - wd_nsec) > WATCHDOG_THRESHOLD) {
+		if ((abs(cs_nsec - wd_nsec) > WATCHDOG_THRESHOLD)) {
 			clocksource_unstable(cs, cs_nsec - wd_nsec);
 			continue;
 		}
@@ -302,6 +309,13 @@ static void clocksource_watchdog(unsigned long data)
 		}
 	}
 
+	/*
+	 * We only clear the watchdog_reset_pending, when we did a
+	 * full cycle through all clocksources.
+	 */
+	if (reset_pending)
+		atomic_dec(&watchdog_reset_pending);
+
 	/*
 	 * Cycle through CPUs to check if the CPUs stay synchronized
 	 * to each other.
@@ -344,23 +358,7 @@ static inline void clocksource_reset_watchdog(void)
 
 static void clocksource_resume_watchdog(void)
 {
-	unsigned long flags;
-
-	/*
-	 * We use trylock here to avoid a potential dead lock when
-	 * kgdb calls this code after the kernel has been stopped with
-	 * watchdog_lock held. When watchdog_lock is held we just
-	 * return and accept, that the watchdog might trigger and mark
-	 * the monitored clock source (usually TSC) unstable.
-	 *
-	 * This does not affect the other caller clocksource_resume()
-	 * because at this point the kernel is UP, interrupts are
-	 * disabled and nothing can hold watchdog_lock.
-	 */
-	if (!spin_trylock_irqsave(&watchdog_lock, flags))
-		return;
-	clocksource_reset_watchdog();
-	spin_unlock_irqrestore(&watchdog_lock, flags);
+	atomic_inc(&watchdog_reset_pending);
 }
 
 static void clocksource_enqueue_watchdog(struct clocksource *cs)
-- 
cgit v1.2.2


From ec484608c5885931c432e99ecfd2772288cd993c Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Sat, 25 Jul 2009 16:09:17 +0200
Subject: locking, kprobes: Annotate the hash locks and kretprobe.lock as raw

The kprobe locks can be taken in atomic context and therefore
cannot be preempted on -rt - annotate it.

In mainline this change documents the low level nature of
the lock - otherwise there's no functional difference. Lockdep
and Sparse checking will work as usual.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/kprobes.c | 34 +++++++++++++++++-----------------
 1 file changed, 17 insertions(+), 17 deletions(-)

(limited to 'kernel')

diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index b30fd54eb985..2f193d0ba7f2 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -78,10 +78,10 @@ static bool kprobes_all_disarmed;
 static DEFINE_MUTEX(kprobe_mutex);
 static DEFINE_PER_CPU(struct kprobe *, kprobe_instance) = NULL;
 static struct {
-	spinlock_t lock ____cacheline_aligned_in_smp;
+	raw_spinlock_t lock ____cacheline_aligned_in_smp;
 } kretprobe_table_locks[KPROBE_TABLE_SIZE];
 
-static spinlock_t *kretprobe_table_lock_ptr(unsigned long hash)
+static raw_spinlock_t *kretprobe_table_lock_ptr(unsigned long hash)
 {
 	return &(kretprobe_table_locks[hash].lock);
 }
@@ -1013,9 +1013,9 @@ void __kprobes recycle_rp_inst(struct kretprobe_instance *ri,
 	hlist_del(&ri->hlist);
 	INIT_HLIST_NODE(&ri->hlist);
 	if (likely(rp)) {
-		spin_lock(&rp->lock);
+		raw_spin_lock(&rp->lock);
 		hlist_add_head(&ri->hlist, &rp->free_instances);
-		spin_unlock(&rp->lock);
+		raw_spin_unlock(&rp->lock);
 	} else
 		/* Unregistering */
 		hlist_add_head(&ri->hlist, head);
@@ -1026,19 +1026,19 @@ void __kprobes kretprobe_hash_lock(struct task_struct *tsk,
 __acquires(hlist_lock)
 {
 	unsigned long hash = hash_ptr(tsk, KPROBE_HASH_BITS);
-	spinlock_t *hlist_lock;
+	raw_spinlock_t *hlist_lock;
 
 	*head = &kretprobe_inst_table[hash];
 	hlist_lock = kretprobe_table_lock_ptr(hash);
-	spin_lock_irqsave(hlist_lock, *flags);
+	raw_spin_lock_irqsave(hlist_lock, *flags);
 }
 
 static void __kprobes kretprobe_table_lock(unsigned long hash,
 	unsigned long *flags)
 __acquires(hlist_lock)
 {
-	spinlock_t *hlist_lock = kretprobe_table_lock_ptr(hash);
-	spin_lock_irqsave(hlist_lock, *flags);
+	raw_spinlock_t *hlist_lock = kretprobe_table_lock_ptr(hash);
+	raw_spin_lock_irqsave(hlist_lock, *flags);
 }
 
 void __kprobes kretprobe_hash_unlock(struct task_struct *tsk,
@@ -1046,18 +1046,18 @@ void __kprobes kretprobe_hash_unlock(struct task_struct *tsk,
 __releases(hlist_lock)
 {
 	unsigned long hash = hash_ptr(tsk, KPROBE_HASH_BITS);
-	spinlock_t *hlist_lock;
+	raw_spinlock_t *hlist_lock;
 
 	hlist_lock = kretprobe_table_lock_ptr(hash);
-	spin_unlock_irqrestore(hlist_lock, *flags);
+	raw_spin_unlock_irqrestore(hlist_lock, *flags);
 }
 
 static void __kprobes kretprobe_table_unlock(unsigned long hash,
        unsigned long *flags)
 __releases(hlist_lock)
 {
-	spinlock_t *hlist_lock = kretprobe_table_lock_ptr(hash);
-	spin_unlock_irqrestore(hlist_lock, *flags);
+	raw_spinlock_t *hlist_lock = kretprobe_table_lock_ptr(hash);
+	raw_spin_unlock_irqrestore(hlist_lock, *flags);
 }
 
 /*
@@ -1663,12 +1663,12 @@ static int __kprobes pre_handler_kretprobe(struct kprobe *p,
 
 	/*TODO: consider to only swap the RA after the last pre_handler fired */
 	hash = hash_ptr(current, KPROBE_HASH_BITS);
-	spin_lock_irqsave(&rp->lock, flags);
+	raw_spin_lock_irqsave(&rp->lock, flags);
 	if (!hlist_empty(&rp->free_instances)) {
 		ri = hlist_entry(rp->free_instances.first,
 				struct kretprobe_instance, hlist);
 		hlist_del(&ri->hlist);
-		spin_unlock_irqrestore(&rp->lock, flags);
+		raw_spin_unlock_irqrestore(&rp->lock, flags);
 
 		ri->rp = rp;
 		ri->task = current;
@@ -1685,7 +1685,7 @@ static int __kprobes pre_handler_kretprobe(struct kprobe *p,
 		kretprobe_table_unlock(hash, &flags);
 	} else {
 		rp->nmissed++;
-		spin_unlock_irqrestore(&rp->lock, flags);
+		raw_spin_unlock_irqrestore(&rp->lock, flags);
 	}
 	return 0;
 }
@@ -1721,7 +1721,7 @@ int __kprobes register_kretprobe(struct kretprobe *rp)
 		rp->maxactive = num_possible_cpus();
 #endif
 	}
-	spin_lock_init(&rp->lock);
+	raw_spin_lock_init(&rp->lock);
 	INIT_HLIST_HEAD(&rp->free_instances);
 	for (i = 0; i < rp->maxactive; i++) {
 		inst = kmalloc(sizeof(struct kretprobe_instance) +
@@ -1959,7 +1959,7 @@ static int __init init_kprobes(void)
 	for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
 		INIT_HLIST_HEAD(&kprobe_table[i]);
 		INIT_HLIST_HEAD(&kretprobe_inst_table[i]);
-		spin_lock_init(&(kretprobe_table_locks[i].lock));
+		raw_spin_lock_init(&(kretprobe_table_locks[i].lock));
 	}
 
 	/*
-- 
cgit v1.2.2


From cdcc136ffd264849a943acb42c36ffe9b458f811 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Sat, 25 Jul 2009 16:47:45 +0200
Subject: locking, sched, cgroups: Annotate release_list_lock as raw

The release_list_lock can be taken in atomic context and therefore
cannot be preempted on -rt - annotate it.

In mainline this change documents the low level nature of
the lock - otherwise there's no functional difference. Lockdep
and Sparse checking will work as usual.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/cgroup.c | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

(limited to 'kernel')

diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 1d2b6ceea95d..453100a4159d 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -265,7 +265,7 @@ list_for_each_entry(_root, &roots, root_list)
 /* the list of cgroups eligible for automatic release. Protected by
  * release_list_lock */
 static LIST_HEAD(release_list);
-static DEFINE_SPINLOCK(release_list_lock);
+static DEFINE_RAW_SPINLOCK(release_list_lock);
 static void cgroup_release_agent(struct work_struct *work);
 static DECLARE_WORK(release_agent_work, cgroup_release_agent);
 static void check_for_release(struct cgroup *cgrp);
@@ -4014,11 +4014,11 @@ again:
 	finish_wait(&cgroup_rmdir_waitq, &wait);
 	clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
 
-	spin_lock(&release_list_lock);
+	raw_spin_lock(&release_list_lock);
 	set_bit(CGRP_REMOVED, &cgrp->flags);
 	if (!list_empty(&cgrp->release_list))
 		list_del_init(&cgrp->release_list);
-	spin_unlock(&release_list_lock);
+	raw_spin_unlock(&release_list_lock);
 
 	cgroup_lock_hierarchy(cgrp->root);
 	/* delete this cgroup from parent->children */
@@ -4671,13 +4671,13 @@ static void check_for_release(struct cgroup *cgrp)
 		 * already queued for a userspace notification, queue
 		 * it now */
 		int need_schedule_work = 0;
-		spin_lock(&release_list_lock);
+		raw_spin_lock(&release_list_lock);
 		if (!cgroup_is_removed(cgrp) &&
 		    list_empty(&cgrp->release_list)) {
 			list_add(&cgrp->release_list, &release_list);
 			need_schedule_work = 1;
 		}
-		spin_unlock(&release_list_lock);
+		raw_spin_unlock(&release_list_lock);
 		if (need_schedule_work)
 			schedule_work(&release_agent_work);
 	}
@@ -4729,7 +4729,7 @@ static void cgroup_release_agent(struct work_struct *work)
 {
 	BUG_ON(work != &release_agent_work);
 	mutex_lock(&cgroup_mutex);
-	spin_lock(&release_list_lock);
+	raw_spin_lock(&release_list_lock);
 	while (!list_empty(&release_list)) {
 		char *argv[3], *envp[3];
 		int i;
@@ -4738,7 +4738,7 @@ static void cgroup_release_agent(struct work_struct *work)
 						    struct cgroup,
 						    release_list);
 		list_del_init(&cgrp->release_list);
-		spin_unlock(&release_list_lock);
+		raw_spin_unlock(&release_list_lock);
 		pathbuf = kmalloc(PAGE_SIZE, GFP_KERNEL);
 		if (!pathbuf)
 			goto continue_free;
@@ -4768,9 +4768,9 @@ static void cgroup_release_agent(struct work_struct *work)
  continue_free:
 		kfree(pathbuf);
 		kfree(agentbuf);
-		spin_lock(&release_list_lock);
+		raw_spin_lock(&release_list_lock);
 	}
-	spin_unlock(&release_list_lock);
+	raw_spin_unlock(&release_list_lock);
 	mutex_unlock(&cgroup_mutex);
 }
 
-- 
cgit v1.2.2


From 5389f6fad27019f2ba78f1b332f719ec05f12a42 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Sat, 25 Jul 2009 17:13:33 +0200
Subject: locking, tracing: Annotate tracing locks as raw

The tracing locks can be taken in atomic context and therefore
cannot be preempted on -rt - annotate it.

In mainline this change documents the low level nature of
the lock - otherwise there's no functional difference. Lockdep
and Sparse checking will work as usual.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/ring_buffer.c   | 52 ++++++++++++++++++++++----------------------
 kernel/trace/trace.c         | 10 ++++-----
 kernel/trace/trace_irqsoff.c |  6 ++---
 3 files changed, 34 insertions(+), 34 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 731201bf4acc..f2f821acc597 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -478,7 +478,7 @@ struct ring_buffer_per_cpu {
 	int				cpu;
 	atomic_t			record_disabled;
 	struct ring_buffer		*buffer;
-	spinlock_t			reader_lock;	/* serialize readers */
+	raw_spinlock_t			reader_lock;	/* serialize readers */
 	arch_spinlock_t			lock;
 	struct lock_class_key		lock_key;
 	struct list_head		*pages;
@@ -1062,7 +1062,7 @@ rb_allocate_cpu_buffer(struct ring_buffer *buffer, int cpu)
 
 	cpu_buffer->cpu = cpu;
 	cpu_buffer->buffer = buffer;
-	spin_lock_init(&cpu_buffer->reader_lock);
+	raw_spin_lock_init(&cpu_buffer->reader_lock);
 	lockdep_set_class(&cpu_buffer->reader_lock, buffer->reader_lock_key);
 	cpu_buffer->lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED;
 
@@ -1259,7 +1259,7 @@ rb_remove_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned nr_pages)
 	struct list_head *p;
 	unsigned i;
 
-	spin_lock_irq(&cpu_buffer->reader_lock);
+	raw_spin_lock_irq(&cpu_buffer->reader_lock);
 	rb_head_page_deactivate(cpu_buffer);
 
 	for (i = 0; i < nr_pages; i++) {
@@ -1277,7 +1277,7 @@ rb_remove_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned nr_pages)
 	rb_check_pages(cpu_buffer);
 
 out:
-	spin_unlock_irq(&cpu_buffer->reader_lock);
+	raw_spin_unlock_irq(&cpu_buffer->reader_lock);
 }
 
 static void
@@ -1288,7 +1288,7 @@ rb_insert_pages(struct ring_buffer_per_cpu *cpu_buffer,
 	struct list_head *p;
 	unsigned i;
 
-	spin_lock_irq(&cpu_buffer->reader_lock);
+	raw_spin_lock_irq(&cpu_buffer->reader_lock);
 	rb_head_page_deactivate(cpu_buffer);
 
 	for (i = 0; i < nr_pages; i++) {
@@ -1303,7 +1303,7 @@ rb_insert_pages(struct ring_buffer_per_cpu *cpu_buffer,
 	rb_check_pages(cpu_buffer);
 
 out:
-	spin_unlock_irq(&cpu_buffer->reader_lock);
+	raw_spin_unlock_irq(&cpu_buffer->reader_lock);
 }
 
 /**
@@ -2804,9 +2804,9 @@ void ring_buffer_iter_reset(struct ring_buffer_iter *iter)
 
 	cpu_buffer = iter->cpu_buffer;
 
-	spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
+	raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
 	rb_iter_reset(iter);
-	spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
+	raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
 }
 EXPORT_SYMBOL_GPL(ring_buffer_iter_reset);
 
@@ -3265,12 +3265,12 @@ ring_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts,
  again:
 	local_irq_save(flags);
 	if (dolock)
-		spin_lock(&cpu_buffer->reader_lock);
+		raw_spin_lock(&cpu_buffer->reader_lock);
 	event = rb_buffer_peek(cpu_buffer, ts, lost_events);
 	if (event && event->type_len == RINGBUF_TYPE_PADDING)
 		rb_advance_reader(cpu_buffer);
 	if (dolock)
-		spin_unlock(&cpu_buffer->reader_lock);
+		raw_spin_unlock(&cpu_buffer->reader_lock);
 	local_irq_restore(flags);
 
 	if (event && event->type_len == RINGBUF_TYPE_PADDING)
@@ -3295,9 +3295,9 @@ ring_buffer_iter_peek(struct ring_buffer_iter *iter, u64 *ts)
 	unsigned long flags;
 
  again:
-	spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
+	raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
 	event = rb_iter_peek(iter, ts);
-	spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
+	raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
 
 	if (event && event->type_len == RINGBUF_TYPE_PADDING)
 		goto again;
@@ -3337,7 +3337,7 @@ ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts,
 	cpu_buffer = buffer->buffers[cpu];
 	local_irq_save(flags);
 	if (dolock)
-		spin_lock(&cpu_buffer->reader_lock);
+		raw_spin_lock(&cpu_buffer->reader_lock);
 
 	event = rb_buffer_peek(cpu_buffer, ts, lost_events);
 	if (event) {
@@ -3346,7 +3346,7 @@ ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts,
 	}
 
 	if (dolock)
-		spin_unlock(&cpu_buffer->reader_lock);
+		raw_spin_unlock(&cpu_buffer->reader_lock);
 	local_irq_restore(flags);
 
  out:
@@ -3438,11 +3438,11 @@ ring_buffer_read_start(struct ring_buffer_iter *iter)
 
 	cpu_buffer = iter->cpu_buffer;
 
-	spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
+	raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
 	arch_spin_lock(&cpu_buffer->lock);
 	rb_iter_reset(iter);
 	arch_spin_unlock(&cpu_buffer->lock);
-	spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
+	raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
 }
 EXPORT_SYMBOL_GPL(ring_buffer_read_start);
 
@@ -3477,7 +3477,7 @@ ring_buffer_read(struct ring_buffer_iter *iter, u64 *ts)
 	struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer;
 	unsigned long flags;
 
-	spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
+	raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
  again:
 	event = rb_iter_peek(iter, ts);
 	if (!event)
@@ -3488,7 +3488,7 @@ ring_buffer_read(struct ring_buffer_iter *iter, u64 *ts)
 
 	rb_advance_iter(iter);
  out:
-	spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
+	raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
 
 	return event;
 }
@@ -3557,7 +3557,7 @@ void ring_buffer_reset_cpu(struct ring_buffer *buffer, int cpu)
 
 	atomic_inc(&cpu_buffer->record_disabled);
 
-	spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
+	raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
 
 	if (RB_WARN_ON(cpu_buffer, local_read(&cpu_buffer->committing)))
 		goto out;
@@ -3569,7 +3569,7 @@ void ring_buffer_reset_cpu(struct ring_buffer *buffer, int cpu)
 	arch_spin_unlock(&cpu_buffer->lock);
 
  out:
-	spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
+	raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
 
 	atomic_dec(&cpu_buffer->record_disabled);
 }
@@ -3607,10 +3607,10 @@ int ring_buffer_empty(struct ring_buffer *buffer)
 		cpu_buffer = buffer->buffers[cpu];
 		local_irq_save(flags);
 		if (dolock)
-			spin_lock(&cpu_buffer->reader_lock);
+			raw_spin_lock(&cpu_buffer->reader_lock);
 		ret = rb_per_cpu_empty(cpu_buffer);
 		if (dolock)
-			spin_unlock(&cpu_buffer->reader_lock);
+			raw_spin_unlock(&cpu_buffer->reader_lock);
 		local_irq_restore(flags);
 
 		if (!ret)
@@ -3641,10 +3641,10 @@ int ring_buffer_empty_cpu(struct ring_buffer *buffer, int cpu)
 	cpu_buffer = buffer->buffers[cpu];
 	local_irq_save(flags);
 	if (dolock)
-		spin_lock(&cpu_buffer->reader_lock);
+		raw_spin_lock(&cpu_buffer->reader_lock);
 	ret = rb_per_cpu_empty(cpu_buffer);
 	if (dolock)
-		spin_unlock(&cpu_buffer->reader_lock);
+		raw_spin_unlock(&cpu_buffer->reader_lock);
 	local_irq_restore(flags);
 
 	return ret;
@@ -3841,7 +3841,7 @@ int ring_buffer_read_page(struct ring_buffer *buffer,
 	if (!bpage)
 		goto out;
 
-	spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
+	raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
 
 	reader = rb_get_reader_page(cpu_buffer);
 	if (!reader)
@@ -3964,7 +3964,7 @@ int ring_buffer_read_page(struct ring_buffer *buffer,
 		memset(&bpage->data[commit], 0, BUF_PAGE_SIZE - commit);
 
  out_unlock:
-	spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
+	raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
 
  out:
 	return ret;
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index e5df02c69b1d..0c8bdeeb358b 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -341,7 +341,7 @@ unsigned long trace_flags = TRACE_ITER_PRINT_PARENT | TRACE_ITER_PRINTK |
 	TRACE_ITER_GRAPH_TIME | TRACE_ITER_RECORD_CMD | TRACE_ITER_OVERWRITE;
 
 static int trace_stop_count;
-static DEFINE_SPINLOCK(tracing_start_lock);
+static DEFINE_RAW_SPINLOCK(tracing_start_lock);
 
 static void wakeup_work_handler(struct work_struct *work)
 {
@@ -960,7 +960,7 @@ void tracing_start(void)
 	if (tracing_disabled)
 		return;
 
-	spin_lock_irqsave(&tracing_start_lock, flags);
+	raw_spin_lock_irqsave(&tracing_start_lock, flags);
 	if (--trace_stop_count) {
 		if (trace_stop_count < 0) {
 			/* Someone screwed up their debugging */
@@ -985,7 +985,7 @@ void tracing_start(void)
 
 	ftrace_start();
  out:
-	spin_unlock_irqrestore(&tracing_start_lock, flags);
+	raw_spin_unlock_irqrestore(&tracing_start_lock, flags);
 }
 
 /**
@@ -1000,7 +1000,7 @@ void tracing_stop(void)
 	unsigned long flags;
 
 	ftrace_stop();
-	spin_lock_irqsave(&tracing_start_lock, flags);
+	raw_spin_lock_irqsave(&tracing_start_lock, flags);
 	if (trace_stop_count++)
 		goto out;
 
@@ -1018,7 +1018,7 @@ void tracing_stop(void)
 	arch_spin_unlock(&ftrace_max_lock);
 
  out:
-	spin_unlock_irqrestore(&tracing_start_lock, flags);
+	raw_spin_unlock_irqrestore(&tracing_start_lock, flags);
 }
 
 void trace_stop_cmdline_recording(void);
diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c
index 667aa8cc0cfc..11186212068c 100644
--- a/kernel/trace/trace_irqsoff.c
+++ b/kernel/trace/trace_irqsoff.c
@@ -23,7 +23,7 @@ static int				tracer_enabled __read_mostly;
 
 static DEFINE_PER_CPU(int, tracing_cpu);
 
-static DEFINE_SPINLOCK(max_trace_lock);
+static DEFINE_RAW_SPINLOCK(max_trace_lock);
 
 enum {
 	TRACER_IRQS_OFF		= (1 << 1),
@@ -321,7 +321,7 @@ check_critical_timing(struct trace_array *tr,
 	if (!report_latency(delta))
 		goto out;
 
-	spin_lock_irqsave(&max_trace_lock, flags);
+	raw_spin_lock_irqsave(&max_trace_lock, flags);
 
 	/* check if we are still the max latency */
 	if (!report_latency(delta))
@@ -344,7 +344,7 @@ check_critical_timing(struct trace_array *tr,
 	max_sequence++;
 
 out_unlock:
-	spin_unlock_irqrestore(&max_trace_lock, flags);
+	raw_spin_unlock_irqrestore(&max_trace_lock, flags);
 
 out:
 	data->critical_sequence = max_sequence;
-- 
cgit v1.2.2


From 07354eb1a74d1e1ece29f8bafe0b46e8c77a95ef Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Sat, 25 Jul 2009 17:50:36 +0200
Subject: locking, printk: Annotate logbuf_lock as raw

The logbuf_lock lock can be taken in atomic context and therefore
cannot be preempted on -rt - annotate it.

In mainline this change documents the low level nature of
the lock - otherwise there's no functional difference. Lockdep
and Sparse checking will work as usual.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
[ merged and fixed it ]
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/printk.c | 46 +++++++++++++++++++++++-----------------------
 1 file changed, 23 insertions(+), 23 deletions(-)

(limited to 'kernel')

diff --git a/kernel/printk.c b/kernel/printk.c
index 28a40d8171b8..b7da18391c38 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -100,7 +100,7 @@ static int console_locked, console_suspended;
  * It is also used in interesting ways to provide interlocking in
  * console_unlock();.
  */
-static DEFINE_SPINLOCK(logbuf_lock);
+static DEFINE_RAW_SPINLOCK(logbuf_lock);
 
 #define LOG_BUF_MASK (log_buf_len-1)
 #define LOG_BUF(idx) (log_buf[(idx) & LOG_BUF_MASK])
@@ -212,7 +212,7 @@ void __init setup_log_buf(int early)
 		return;
 	}
 
-	spin_lock_irqsave(&logbuf_lock, flags);
+	raw_spin_lock_irqsave(&logbuf_lock, flags);
 	log_buf_len = new_log_buf_len;
 	log_buf = new_log_buf;
 	new_log_buf_len = 0;
@@ -230,7 +230,7 @@ void __init setup_log_buf(int early)
 	log_start -= offset;
 	con_start -= offset;
 	log_end -= offset;
-	spin_unlock_irqrestore(&logbuf_lock, flags);
+	raw_spin_unlock_irqrestore(&logbuf_lock, flags);
 
 	pr_info("log_buf_len: %d\n", log_buf_len);
 	pr_info("early log buf free: %d(%d%%)\n",
@@ -365,18 +365,18 @@ int do_syslog(int type, char __user *buf, int len, bool from_file)
 		if (error)
 			goto out;
 		i = 0;
-		spin_lock_irq(&logbuf_lock);
+		raw_spin_lock_irq(&logbuf_lock);
 		while (!error && (log_start != log_end) && i < len) {
 			c = LOG_BUF(log_start);
 			log_start++;
-			spin_unlock_irq(&logbuf_lock);
+			raw_spin_unlock_irq(&logbuf_lock);
 			error = __put_user(c,buf);
 			buf++;
 			i++;
 			cond_resched();
-			spin_lock_irq(&logbuf_lock);
+			raw_spin_lock_irq(&logbuf_lock);
 		}
-		spin_unlock_irq(&logbuf_lock);
+		raw_spin_unlock_irq(&logbuf_lock);
 		if (!error)
 			error = i;
 		break;
@@ -399,7 +399,7 @@ int do_syslog(int type, char __user *buf, int len, bool from_file)
 		count = len;
 		if (count > log_buf_len)
 			count = log_buf_len;
-		spin_lock_irq(&logbuf_lock);
+		raw_spin_lock_irq(&logbuf_lock);
 		if (count > logged_chars)
 			count = logged_chars;
 		if (do_clear)
@@ -416,12 +416,12 @@ int do_syslog(int type, char __user *buf, int len, bool from_file)
 			if (j + log_buf_len < log_end)
 				break;
 			c = LOG_BUF(j);
-			spin_unlock_irq(&logbuf_lock);
+			raw_spin_unlock_irq(&logbuf_lock);
 			error = __put_user(c,&buf[count-1-i]);
 			cond_resched();
-			spin_lock_irq(&logbuf_lock);
+			raw_spin_lock_irq(&logbuf_lock);
 		}
-		spin_unlock_irq(&logbuf_lock);
+		raw_spin_unlock_irq(&logbuf_lock);
 		if (error)
 			break;
 		error = i;
@@ -689,7 +689,7 @@ static void zap_locks(void)
 	oops_timestamp = jiffies;
 
 	/* If a crash is occurring, make sure we can't deadlock */
-	spin_lock_init(&logbuf_lock);
+	raw_spin_lock_init(&logbuf_lock);
 	/* And make sure that we print immediately */
 	sema_init(&console_sem, 1);
 }
@@ -802,9 +802,9 @@ static int console_trylock_for_printk(unsigned int cpu)
 		}
 	}
 	printk_cpu = UINT_MAX;
-	spin_unlock(&logbuf_lock);
 	if (wake)
 		up(&console_sem);
+	raw_spin_unlock(&logbuf_lock);
 	return retval;
 }
 static const char recursion_bug_msg [] =
@@ -864,7 +864,7 @@ asmlinkage int vprintk(const char *fmt, va_list args)
 	}
 
 	lockdep_off();
-	spin_lock(&logbuf_lock);
+	raw_spin_lock(&logbuf_lock);
 	printk_cpu = this_cpu;
 
 	if (recursion_bug) {
@@ -1257,14 +1257,14 @@ void console_unlock(void)
 
 again:
 	for ( ; ; ) {
-		spin_lock_irqsave(&logbuf_lock, flags);
+		raw_spin_lock_irqsave(&logbuf_lock, flags);
 		wake_klogd |= log_start - log_end;
 		if (con_start == log_end)
 			break;			/* Nothing to print */
 		_con_start = con_start;
 		_log_end = log_end;
 		con_start = log_end;		/* Flush */
-		spin_unlock(&logbuf_lock);
+		raw_spin_unlock(&logbuf_lock);
 		stop_critical_timings();	/* don't trace print latency */
 		call_console_drivers(_con_start, _log_end);
 		start_critical_timings();
@@ -1276,7 +1276,7 @@ again:
 	if (unlikely(exclusive_console))
 		exclusive_console = NULL;
 
-	spin_unlock(&logbuf_lock);
+	raw_spin_unlock(&logbuf_lock);
 
 	up(&console_sem);
 
@@ -1286,13 +1286,13 @@ again:
 	 * there's a new owner and the console_unlock() from them will do the
 	 * flush, no worries.
 	 */
-	spin_lock(&logbuf_lock);
+	raw_spin_lock(&logbuf_lock);
 	if (con_start != log_end)
 		retry = 1;
-	spin_unlock_irqrestore(&logbuf_lock, flags);
 	if (retry && console_trylock())
 		goto again;
 
+	raw_spin_unlock_irqrestore(&logbuf_lock, flags);
 	if (wake_klogd)
 		wake_up_klogd();
 }
@@ -1522,9 +1522,9 @@ void register_console(struct console *newcon)
 		 * console_unlock(); will print out the buffered messages
 		 * for us.
 		 */
-		spin_lock_irqsave(&logbuf_lock, flags);
+		raw_spin_lock_irqsave(&logbuf_lock, flags);
 		con_start = log_start;
-		spin_unlock_irqrestore(&logbuf_lock, flags);
+		raw_spin_unlock_irqrestore(&logbuf_lock, flags);
 		/*
 		 * We're about to replay the log buffer.  Only do this to the
 		 * just-registered console to avoid excessive message spam to
@@ -1731,10 +1731,10 @@ void kmsg_dump(enum kmsg_dump_reason reason)
 	/* Theoretically, the log could move on after we do this, but
 	   there's not a lot we can do about that. The new messages
 	   will overwrite the start of what we dump. */
-	spin_lock_irqsave(&logbuf_lock, flags);
+	raw_spin_lock_irqsave(&logbuf_lock, flags);
 	end = log_end & LOG_BUF_MASK;
 	chars = logged_chars;
-	spin_unlock_irqrestore(&logbuf_lock, flags);
+	raw_spin_unlock_irqrestore(&logbuf_lock, flags);
 
 	if (chars > end) {
 		s1 = log_buf + log_buf_len - chars + end;
-- 
cgit v1.2.2


From ee30a7b2fc072f139dac44826860d2c1f422137c Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Sat, 25 Jul 2009 18:56:56 +0200
Subject: locking, sched: Annotate thread_group_cputimer as raw

The thread_group_cputimer lock can be taken in atomic context and therefore
cannot be preempted on -rt - annotate it.

In mainline this change documents the low level nature of
the lock - otherwise there's no functional difference. Lockdep
and Sparse checking will work as usual.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/posix-cpu-timers.c | 12 ++++++------
 kernel/sched_stats.h      | 12 ++++++------
 2 files changed, 12 insertions(+), 12 deletions(-)

(limited to 'kernel')

diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c
index 58f405b581e7..41440cca55a2 100644
--- a/kernel/posix-cpu-timers.c
+++ b/kernel/posix-cpu-timers.c
@@ -274,7 +274,7 @@ void thread_group_cputimer(struct task_struct *tsk, struct task_cputime *times)
 	struct task_cputime sum;
 	unsigned long flags;
 
-	spin_lock_irqsave(&cputimer->lock, flags);
+	raw_spin_lock_irqsave(&cputimer->lock, flags);
 	if (!cputimer->running) {
 		cputimer->running = 1;
 		/*
@@ -287,7 +287,7 @@ void thread_group_cputimer(struct task_struct *tsk, struct task_cputime *times)
 		update_gt_cputime(&cputimer->cputime, &sum);
 	}
 	*times = cputimer->cputime;
-	spin_unlock_irqrestore(&cputimer->lock, flags);
+	raw_spin_unlock_irqrestore(&cputimer->lock, flags);
 }
 
 /*
@@ -997,9 +997,9 @@ static void stop_process_timers(struct signal_struct *sig)
 	struct thread_group_cputimer *cputimer = &sig->cputimer;
 	unsigned long flags;
 
-	spin_lock_irqsave(&cputimer->lock, flags);
+	raw_spin_lock_irqsave(&cputimer->lock, flags);
 	cputimer->running = 0;
-	spin_unlock_irqrestore(&cputimer->lock, flags);
+	raw_spin_unlock_irqrestore(&cputimer->lock, flags);
 }
 
 static u32 onecputick;
@@ -1289,9 +1289,9 @@ static inline int fastpath_timer_check(struct task_struct *tsk)
 	if (sig->cputimer.running) {
 		struct task_cputime group_sample;
 
-		spin_lock(&sig->cputimer.lock);
+		raw_spin_lock(&sig->cputimer.lock);
 		group_sample = sig->cputimer.cputime;
-		spin_unlock(&sig->cputimer.lock);
+		raw_spin_unlock(&sig->cputimer.lock);
 
 		if (task_cputime_expired(&group_sample, &sig->cputime_expires))
 			return 1;
diff --git a/kernel/sched_stats.h b/kernel/sched_stats.h
index 331e01bcd026..87f9e36ea56e 100644
--- a/kernel/sched_stats.h
+++ b/kernel/sched_stats.h
@@ -282,10 +282,10 @@ static inline void account_group_user_time(struct task_struct *tsk,
 	if (!cputimer->running)
 		return;
 
-	spin_lock(&cputimer->lock);
+	raw_spin_lock(&cputimer->lock);
 	cputimer->cputime.utime =
 		cputime_add(cputimer->cputime.utime, cputime);
-	spin_unlock(&cputimer->lock);
+	raw_spin_unlock(&cputimer->lock);
 }
 
 /**
@@ -306,10 +306,10 @@ static inline void account_group_system_time(struct task_struct *tsk,
 	if (!cputimer->running)
 		return;
 
-	spin_lock(&cputimer->lock);
+	raw_spin_lock(&cputimer->lock);
 	cputimer->cputime.stime =
 		cputime_add(cputimer->cputime.stime, cputime);
-	spin_unlock(&cputimer->lock);
+	raw_spin_unlock(&cputimer->lock);
 }
 
 /**
@@ -330,7 +330,7 @@ static inline void account_group_exec_runtime(struct task_struct *tsk,
 	if (!cputimer->running)
 		return;
 
-	spin_lock(&cputimer->lock);
+	raw_spin_lock(&cputimer->lock);
 	cputimer->cputime.sum_exec_runtime += ns;
-	spin_unlock(&cputimer->lock);
+	raw_spin_unlock(&cputimer->lock);
 }
-- 
cgit v1.2.2


From 8292c9e15c3b069459794a04f5e2cf0d5665ddc4 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Wed, 24 Feb 2010 09:50:22 +0100
Subject: locking, semaphores: Annotate inner lock as raw

There is no reason to have the spin_lock protecting the semaphore
preemptible on -rt. Annotate it as a raw_spinlock.

In mainline this change documents the low level nature of
the lock - otherwise there's no functional difference. Lockdep
and Sparse checking will work as usual.

( On rt this also solves lockdep complaining about the
  rt_mutex.wait_lock being not initialized. )

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/semaphore.c | 28 ++++++++++++++--------------
 1 file changed, 14 insertions(+), 14 deletions(-)

(limited to 'kernel')

diff --git a/kernel/semaphore.c b/kernel/semaphore.c
index 94a62c0d4ade..d831841e55a7 100644
--- a/kernel/semaphore.c
+++ b/kernel/semaphore.c
@@ -54,12 +54,12 @@ void down(struct semaphore *sem)
 {
 	unsigned long flags;
 
-	spin_lock_irqsave(&sem->lock, flags);
+	raw_spin_lock_irqsave(&sem->lock, flags);
 	if (likely(sem->count > 0))
 		sem->count--;
 	else
 		__down(sem);
-	spin_unlock_irqrestore(&sem->lock, flags);
+	raw_spin_unlock_irqrestore(&sem->lock, flags);
 }
 EXPORT_SYMBOL(down);
 
@@ -77,12 +77,12 @@ int down_interruptible(struct semaphore *sem)
 	unsigned long flags;
 	int result = 0;
 
-	spin_lock_irqsave(&sem->lock, flags);
+	raw_spin_lock_irqsave(&sem->lock, flags);
 	if (likely(sem->count > 0))
 		sem->count--;
 	else
 		result = __down_interruptible(sem);
-	spin_unlock_irqrestore(&sem->lock, flags);
+	raw_spin_unlock_irqrestore(&sem->lock, flags);
 
 	return result;
 }
@@ -103,12 +103,12 @@ int down_killable(struct semaphore *sem)
 	unsigned long flags;
 	int result = 0;
 
-	spin_lock_irqsave(&sem->lock, flags);
+	raw_spin_lock_irqsave(&sem->lock, flags);
 	if (likely(sem->count > 0))
 		sem->count--;
 	else
 		result = __down_killable(sem);
-	spin_unlock_irqrestore(&sem->lock, flags);
+	raw_spin_unlock_irqrestore(&sem->lock, flags);
 
 	return result;
 }
@@ -132,11 +132,11 @@ int down_trylock(struct semaphore *sem)
 	unsigned long flags;
 	int count;
 
-	spin_lock_irqsave(&sem->lock, flags);
+	raw_spin_lock_irqsave(&sem->lock, flags);
 	count = sem->count - 1;
 	if (likely(count >= 0))
 		sem->count = count;
-	spin_unlock_irqrestore(&sem->lock, flags);
+	raw_spin_unlock_irqrestore(&sem->lock, flags);
 
 	return (count < 0);
 }
@@ -157,12 +157,12 @@ int down_timeout(struct semaphore *sem, long jiffies)
 	unsigned long flags;
 	int result = 0;
 
-	spin_lock_irqsave(&sem->lock, flags);
+	raw_spin_lock_irqsave(&sem->lock, flags);
 	if (likely(sem->count > 0))
 		sem->count--;
 	else
 		result = __down_timeout(sem, jiffies);
-	spin_unlock_irqrestore(&sem->lock, flags);
+	raw_spin_unlock_irqrestore(&sem->lock, flags);
 
 	return result;
 }
@@ -179,12 +179,12 @@ void up(struct semaphore *sem)
 {
 	unsigned long flags;
 
-	spin_lock_irqsave(&sem->lock, flags);
+	raw_spin_lock_irqsave(&sem->lock, flags);
 	if (likely(list_empty(&sem->wait_list)))
 		sem->count++;
 	else
 		__up(sem);
-	spin_unlock_irqrestore(&sem->lock, flags);
+	raw_spin_unlock_irqrestore(&sem->lock, flags);
 }
 EXPORT_SYMBOL(up);
 
@@ -217,9 +217,9 @@ static inline int __sched __down_common(struct semaphore *sem, long state,
 		if (timeout <= 0)
 			goto timed_out;
 		__set_task_state(task, state);
-		spin_unlock_irq(&sem->lock);
+		raw_spin_unlock_irq(&sem->lock);
 		timeout = schedule_timeout(timeout);
-		spin_lock_irq(&sem->lock);
+		raw_spin_lock_irq(&sem->lock);
 		if (waiter.up)
 			return 0;
 	}
-- 
cgit v1.2.2


From 2737c49f29a29f3d3645ba0778aa7a8798f32249 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Wed, 16 Jun 2010 16:58:34 +0200
Subject: locking, timer_stats: Annotate table_lock as raw

The table_lock lock can be taken in atomic context and therefore
cannot be preempted on -rt - annotate it.

In mainline this change documents the low level nature of
the lock - otherwise there's no functional difference. Lockdep
and Sparse checking will work as usual.

Reported-by: Andreas Sundebo <kernel@sundebo.dk>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Tested-by: Andreas Sundebo <kernel@sundebo.dk>

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/time/timer_stats.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/time/timer_stats.c b/kernel/time/timer_stats.c
index a5d0a3a85dd8..0b537f27b559 100644
--- a/kernel/time/timer_stats.c
+++ b/kernel/time/timer_stats.c
@@ -81,7 +81,7 @@ struct entry {
 /*
  * Spinlock protecting the tables - not taken during lookup:
  */
-static DEFINE_SPINLOCK(table_lock);
+static DEFINE_RAW_SPINLOCK(table_lock);
 
 /*
  * Per-CPU lookup locks for fast hash lookup:
@@ -188,7 +188,7 @@ static struct entry *tstat_lookup(struct entry *entry, char *comm)
 	prev = NULL;
 	curr = *head;
 
-	spin_lock(&table_lock);
+	raw_spin_lock(&table_lock);
 	/*
 	 * Make sure we have not raced with another CPU:
 	 */
@@ -215,7 +215,7 @@ static struct entry *tstat_lookup(struct entry *entry, char *comm)
 			*head = curr;
 	}
  out_unlock:
-	spin_unlock(&table_lock);
+	raw_spin_unlock(&table_lock);
 
 	return curr;
 }
-- 
cgit v1.2.2


From 757455d41c390da4ae8e06468ad097fe42eaab41 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Thu, 17 Sep 2009 17:31:09 +0200
Subject: locking, latencytop: Annotate latency_lock as raw

The latency_lock is lock can be taken in the guts of the
scheduler code and therefore cannot be preempted on -rt -
annotate it.

In mainline this change documents the low level nature of
the lock - otherwise there's no functional difference. Lockdep
and Sparse checking will work as usual.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/latencytop.c | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

(limited to 'kernel')

diff --git a/kernel/latencytop.c b/kernel/latencytop.c
index 376066e10413..4ac8ebfcab59 100644
--- a/kernel/latencytop.c
+++ b/kernel/latencytop.c
@@ -58,7 +58,7 @@
 #include <linux/list.h>
 #include <linux/stacktrace.h>
 
-static DEFINE_SPINLOCK(latency_lock);
+static DEFINE_RAW_SPINLOCK(latency_lock);
 
 #define MAXLR 128
 static struct latency_record latency_record[MAXLR];
@@ -72,19 +72,19 @@ void clear_all_latency_tracing(struct task_struct *p)
 	if (!latencytop_enabled)
 		return;
 
-	spin_lock_irqsave(&latency_lock, flags);
+	raw_spin_lock_irqsave(&latency_lock, flags);
 	memset(&p->latency_record, 0, sizeof(p->latency_record));
 	p->latency_record_count = 0;
-	spin_unlock_irqrestore(&latency_lock, flags);
+	raw_spin_unlock_irqrestore(&latency_lock, flags);
 }
 
 static void clear_global_latency_tracing(void)
 {
 	unsigned long flags;
 
-	spin_lock_irqsave(&latency_lock, flags);
+	raw_spin_lock_irqsave(&latency_lock, flags);
 	memset(&latency_record, 0, sizeof(latency_record));
-	spin_unlock_irqrestore(&latency_lock, flags);
+	raw_spin_unlock_irqrestore(&latency_lock, flags);
 }
 
 static void __sched
@@ -190,7 +190,7 @@ __account_scheduler_latency(struct task_struct *tsk, int usecs, int inter)
 	lat.max = usecs;
 	store_stacktrace(tsk, &lat);
 
-	spin_lock_irqsave(&latency_lock, flags);
+	raw_spin_lock_irqsave(&latency_lock, flags);
 
 	account_global_scheduler_latency(tsk, &lat);
 
@@ -231,7 +231,7 @@ __account_scheduler_latency(struct task_struct *tsk, int usecs, int inter)
 	memcpy(&tsk->latency_record[i], &lat, sizeof(struct latency_record));
 
 out_unlock:
-	spin_unlock_irqrestore(&latency_lock, flags);
+	raw_spin_unlock_irqrestore(&latency_lock, flags);
 }
 
 static int lstats_show(struct seq_file *m, void *v)
-- 
cgit v1.2.2


From 4523f6ada86853750565c68e17126af2e3df9b8a Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Wed, 14 Sep 2011 10:54:29 +0200
Subject: alarmtimers: Fix error handling

commit 8bc0daf (alarmtimers: Rework RTC device selection using class
interface) did not implement required error checks. Add them.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/time/alarmtimer.c | 41 +++++++++++++++++++++++++++++++----------
 1 file changed, 31 insertions(+), 10 deletions(-)

(limited to 'kernel')

diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c
index 154d5563ab1b..c436e790b21b 100644
--- a/kernel/time/alarmtimer.c
+++ b/kernel/time/alarmtimer.c
@@ -100,19 +100,25 @@ static struct class_interface alarmtimer_rtc_interface = {
 	.add_dev = &alarmtimer_rtc_add_device,
 };
 
-static void alarmtimer_rtc_interface_setup(void)
+static int alarmtimer_rtc_interface_setup(void)
 {
 	alarmtimer_rtc_interface.class = rtc_class;
-	class_interface_register(&alarmtimer_rtc_interface);
+	return class_interface_register(&alarmtimer_rtc_interface);
+}
+static void alarmtimer_rtc_interface_remove(void)
+{
+	class_interface_unregister(&alarmtimer_rtc_interface);
 }
 #else
-#define alarmtimer_get_rtcdev() (0)
-#define rtcdev (0)
-#define alarmtimer_rtc_interface_setup()
+static inline struct rtc_device *alarmtimer_get_rtcdev(void)
+{
+	return NULL;
+}
+#define rtcdev (NULL)
+static inline int alarmtimer_rtc_interface_setup(void) { return 0; }
+static inline void alarmtimer_rtc_interface_remove(void) { }
 #endif
 
-
-
 /**
  * alarmtimer_enqueue - Adds an alarm timer to an alarm_base timerqueue
  * @base: pointer to the base where the timer is being run
@@ -764,6 +770,7 @@ static struct platform_driver alarmtimer_driver = {
  */
 static int __init alarmtimer_init(void)
 {
+	struct platform_device *pdev;
 	int error = 0;
 	int i;
 	struct k_clock alarm_clock = {
@@ -793,11 +800,25 @@ static int __init alarmtimer_init(void)
 		alarm_bases[i].timer.function = alarmtimer_fired;
 	}
 
-	alarmtimer_rtc_interface_setup();
+	error = alarmtimer_rtc_interface_setup();
+	if (error)
+		return error;
+
 	error = platform_driver_register(&alarmtimer_driver);
-	platform_device_register_simple("alarmtimer", -1, NULL, 0);
+	if (error)
+		goto out_if;
 
+	pdev = platform_device_register_simple("alarmtimer", -1, NULL, 0);
+	if (IS_ERR(pdev)) {
+		error = PTR_ERR(pdev);
+		goto out_drv;
+	}
+	return 0;
+
+out_drv:
+	platform_driver_unregister(&alarmtimer_driver);
+out_if:
+	alarmtimer_rtc_interface_remove();
 	return error;
 }
 device_initcall(alarmtimer_init);
-
-- 
cgit v1.2.2


From 0119fee449f501d55924914a90f152540dd4ef9a Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Fri, 2 Sep 2011 01:30:29 +0200
Subject: lockdep: Comment all warnings

Andrew requested I comment all the lockdep WARN()s to help other people
figure out wth is wrong..

Requested-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Link: http://lkml.kernel.org/r/1315301493.3191.9.camel@twins
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/lockdep.c | 156 +++++++++++++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 147 insertions(+), 9 deletions(-)

(limited to 'kernel')

diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index 91d67ce3a8d5..c081fa967c8f 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -96,8 +96,13 @@ static int graph_lock(void)
 
 static inline int graph_unlock(void)
 {
-	if (debug_locks && !arch_spin_is_locked(&lockdep_lock))
+	if (debug_locks && !arch_spin_is_locked(&lockdep_lock)) {
+		/*
+		 * The lockdep graph lock isn't locked while we expect it to
+		 * be, we're confused now, bye!
+		 */
 		return DEBUG_LOCKS_WARN_ON(1);
+	}
 
 	current->lockdep_recursion--;
 	arch_spin_unlock(&lockdep_lock);
@@ -134,6 +139,9 @@ static struct lock_class lock_classes[MAX_LOCKDEP_KEYS];
 static inline struct lock_class *hlock_class(struct held_lock *hlock)
 {
 	if (!hlock->class_idx) {
+		/*
+		 * Someone passed in garbage, we give up.
+		 */
 		DEBUG_LOCKS_WARN_ON(1);
 		return NULL;
 	}
@@ -687,6 +695,10 @@ look_up_lock_class(struct lockdep_map *lock, unsigned int subclass)
 	 */
 	list_for_each_entry(class, hash_head, hash_entry) {
 		if (class->key == key) {
+			/*
+			 * Huh! same key, different name? Did someone trample
+			 * on some memory? We're most confused.
+			 */
 			WARN_ON_ONCE(class->name != lock->name);
 			return class;
 		}
@@ -800,6 +812,10 @@ out_unlock_set:
 	else if (subclass < NR_LOCKDEP_CACHING_CLASSES)
 		lock->class_cache[subclass] = class;
 
+	/*
+	 * Hash collision, did we smoke some? We found a class with a matching
+	 * hash but the subclass -- which is hashed in -- didn't match.
+	 */
 	if (DEBUG_LOCKS_WARN_ON(class->subclass != subclass))
 		return NULL;
 
@@ -926,7 +942,7 @@ static inline void mark_lock_accessed(struct lock_list *lock,
 	unsigned long nr;
 
 	nr = lock - list_entries;
-	WARN_ON(nr >= nr_list_entries);
+	WARN_ON(nr >= nr_list_entries); /* Out-of-bounds, input fail */
 	lock->parent = parent;
 	lock->class->dep_gen_id = lockdep_dependency_gen_id;
 }
@@ -936,7 +952,7 @@ static inline unsigned long lock_accessed(struct lock_list *lock)
 	unsigned long nr;
 
 	nr = lock - list_entries;
-	WARN_ON(nr >= nr_list_entries);
+	WARN_ON(nr >= nr_list_entries); /* Out-of-bounds, input fail */
 	return lock->class->dep_gen_id == lockdep_dependency_gen_id;
 }
 
@@ -1196,6 +1212,9 @@ static noinline int print_bfs_bug(int ret)
 	if (!debug_locks_off_graph_unlock())
 		return 0;
 
+	/*
+	 * Breadth-first-search failed, graph got corrupted?
+	 */
 	WARN(1, "lockdep bfs error:%d\n", ret);
 
 	return 0;
@@ -1944,6 +1963,11 @@ out_bug:
 	if (!debug_locks_off_graph_unlock())
 		return 0;
 
+	/*
+	 * Clearly we all shouldn't be here, but since we made it we
+	 * can reliable say we messed up our state. See the above two
+	 * gotos for reasons why we could possibly end up here.
+	 */
 	WARN_ON(1);
 
 	return 0;
@@ -1975,6 +1999,11 @@ static inline int lookup_chain_cache(struct task_struct *curr,
 	struct held_lock *hlock_curr, *hlock_next;
 	int i, j;
 
+	/*
+	 * We might need to take the graph lock, ensure we've got IRQs
+	 * disabled to make this an IRQ-safe lock.. for recursion reasons
+	 * lockdep won't complain about its own locking errors.
+	 */
 	if (DEBUG_LOCKS_WARN_ON(!irqs_disabled()))
 		return 0;
 	/*
@@ -2126,6 +2155,10 @@ static void check_chain_key(struct task_struct *curr)
 		hlock = curr->held_locks + i;
 		if (chain_key != hlock->prev_chain_key) {
 			debug_locks_off();
+			/*
+			 * We got mighty confused, our chain keys don't match
+			 * with what we expect, someone trample on our task state?
+			 */
 			WARN(1, "hm#1, depth: %u [%u], %016Lx != %016Lx\n",
 				curr->lockdep_depth, i,
 				(unsigned long long)chain_key,
@@ -2133,6 +2166,9 @@ static void check_chain_key(struct task_struct *curr)
 			return;
 		}
 		id = hlock->class_idx - 1;
+		/*
+		 * Whoops ran out of static storage again?
+		 */
 		if (DEBUG_LOCKS_WARN_ON(id >= MAX_LOCKDEP_KEYS))
 			return;
 
@@ -2144,6 +2180,10 @@ static void check_chain_key(struct task_struct *curr)
 	}
 	if (chain_key != curr->curr_chain_key) {
 		debug_locks_off();
+		/*
+		 * More smoking hash instead of calculating it, damn see these
+		 * numbers float.. I bet that a pink elephant stepped on my memory.
+		 */
 		WARN(1, "hm#2, depth: %u [%u], %016Lx != %016Lx\n",
 			curr->lockdep_depth, i,
 			(unsigned long long)chain_key,
@@ -2525,12 +2565,24 @@ void trace_hardirqs_on_caller(unsigned long ip)
 		return;
 	}
 
+	/*
+	 * We're enabling irqs and according to our state above irqs weren't
+	 * already enabled, yet we find the hardware thinks they are in fact
+	 * enabled.. someone messed up their IRQ state tracing.
+	 */
 	if (DEBUG_LOCKS_WARN_ON(!irqs_disabled()))
 		return;
 
+	/*
+	 * See the fine text that goes along with this variable definition.
+	 */
 	if (DEBUG_LOCKS_WARN_ON(unlikely(early_boot_irqs_disabled)))
 		return;
 
+	/*
+	 * Can't allow enabling interrupts while in an interrupt handler,
+	 * that's general bad form and such. Recursion, limited stack etc..
+	 */
 	if (DEBUG_LOCKS_WARN_ON(current->hardirq_context))
 		return;
 
@@ -2558,6 +2610,10 @@ void trace_hardirqs_off_caller(unsigned long ip)
 	if (unlikely(!debug_locks || current->lockdep_recursion))
 		return;
 
+	/*
+	 * So we're supposed to get called after you mask local IRQs, but for
+	 * some reason the hardware doesn't quite think you did a proper job.
+	 */
 	if (DEBUG_LOCKS_WARN_ON(!irqs_disabled()))
 		return;
 
@@ -2590,6 +2646,10 @@ void trace_softirqs_on(unsigned long ip)
 	if (unlikely(!debug_locks || current->lockdep_recursion))
 		return;
 
+	/*
+	 * We fancy IRQs being disabled here, see softirq.c, avoids
+	 * funny state and nesting things.
+	 */
 	if (DEBUG_LOCKS_WARN_ON(!irqs_disabled()))
 		return;
 
@@ -2626,6 +2686,9 @@ void trace_softirqs_off(unsigned long ip)
 	if (unlikely(!debug_locks || current->lockdep_recursion))
 		return;
 
+	/*
+	 * We fancy IRQs being disabled here, see softirq.c
+	 */
 	if (DEBUG_LOCKS_WARN_ON(!irqs_disabled()))
 		return;
 
@@ -2637,6 +2700,9 @@ void trace_softirqs_off(unsigned long ip)
 		curr->softirq_disable_ip = ip;
 		curr->softirq_disable_event = ++curr->irq_events;
 		debug_atomic_inc(softirqs_off_events);
+		/*
+		 * Whoops, we wanted softirqs off, so why aren't they?
+		 */
 		DEBUG_LOCKS_WARN_ON(!softirq_count());
 	} else
 		debug_atomic_inc(redundant_softirqs_off);
@@ -2661,6 +2727,9 @@ static void __lockdep_trace_alloc(gfp_t gfp_mask, unsigned long flags)
 	if (!(gfp_mask & __GFP_FS))
 		return;
 
+	/*
+	 * Oi! Can't be having __GFP_FS allocations with IRQs disabled.
+	 */
 	if (DEBUG_LOCKS_WARN_ON(irqs_disabled_flags(flags)))
 		return;
 
@@ -2773,13 +2842,13 @@ static int separate_irq_context(struct task_struct *curr,
 	return 0;
 }
 
-#else
+#else /* defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING) */
 
 static inline
 int mark_lock_irq(struct task_struct *curr, struct held_lock *this,
 		enum lock_usage_bit new_bit)
 {
-	WARN_ON(1);
+	WARN_ON(1); /* Impossible innit? when we don't have TRACE_IRQFLAG */
 	return 1;
 }
 
@@ -2799,7 +2868,7 @@ void lockdep_trace_alloc(gfp_t gfp_mask)
 {
 }
 
-#endif
+#endif /* defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING) */
 
 /*
  * Mark a lock with a usage bit, and validate the state transition:
@@ -2880,6 +2949,9 @@ void lockdep_init_map(struct lockdep_map *lock, const char *name,
 	lock->cpu = raw_smp_processor_id();
 #endif
 
+	/*
+	 * Can't be having no nameless bastards around this place!
+	 */
 	if (DEBUG_LOCKS_WARN_ON(!name)) {
 		lock->name = "NULL";
 		return;
@@ -2887,6 +2959,9 @@ void lockdep_init_map(struct lockdep_map *lock, const char *name,
 
 	lock->name = name;
 
+	/*
+	 * No key, no joy, we need to hash something.
+	 */
 	if (DEBUG_LOCKS_WARN_ON(!key))
 		return;
 	/*
@@ -2894,6 +2969,9 @@ void lockdep_init_map(struct lockdep_map *lock, const char *name,
 	 */
 	if (!static_obj(key)) {
 		printk("BUG: key %p not in .data!\n", key);
+		/*
+		 * What it says above ^^^^^, I suggest you read it.
+		 */
 		DEBUG_LOCKS_WARN_ON(1);
 		return;
 	}
@@ -2932,6 +3010,11 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
 	if (unlikely(!debug_locks))
 		return 0;
 
+	/*
+	 * Lockdep should run with IRQs disabled, otherwise we could
+	 * get an interrupt which would want to take locks, which would
+	 * end up in lockdep and have you got a head-ache already?
+	 */
 	if (DEBUG_LOCKS_WARN_ON(!irqs_disabled()))
 		return 0;
 
@@ -2963,6 +3046,9 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
 	 * dependency checks are done)
 	 */
 	depth = curr->lockdep_depth;
+	/*
+	 * Ran out of static storage for our per-task lock stack again have we?
+	 */
 	if (DEBUG_LOCKS_WARN_ON(depth >= MAX_LOCK_DEPTH))
 		return 0;
 
@@ -2981,6 +3067,10 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
 	}
 
 	hlock = curr->held_locks + depth;
+	/*
+	 * Plain impossible, we just registered it and checked it weren't no
+	 * NULL like.. I bet this mushroom I ate was good!
+	 */
 	if (DEBUG_LOCKS_WARN_ON(!class))
 		return 0;
 	hlock->class_idx = class_idx;
@@ -3015,11 +3105,17 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
 	 * the hash, not class->key.
 	 */
 	id = class - lock_classes;
+	/*
+	 * Whoops, we did it again.. ran straight out of our static allocation.
+	 */
 	if (DEBUG_LOCKS_WARN_ON(id >= MAX_LOCKDEP_KEYS))
 		return 0;
 
 	chain_key = curr->curr_chain_key;
 	if (!depth) {
+		/*
+		 * How can we have a chain hash when we ain't got no keys?!
+		 */
 		if (DEBUG_LOCKS_WARN_ON(chain_key != 0))
 			return 0;
 		chain_head = 1;
@@ -3091,6 +3187,9 @@ static int check_unlock(struct task_struct *curr, struct lockdep_map *lock,
 {
 	if (unlikely(!debug_locks))
 		return 0;
+	/*
+	 * Lockdep should run with IRQs disabled, recursion, head-ache, etc..
+	 */
 	if (DEBUG_LOCKS_WARN_ON(!irqs_disabled()))
 		return 0;
 
@@ -3120,6 +3219,11 @@ static int match_held_lock(struct held_lock *hlock, struct lockdep_map *lock)
 		if (!class)
 			return 0;
 
+		/*
+		 * References, but not a lock we're actually ref-counting?
+		 * State got messed up, follow the sites that change ->references
+		 * and try to make sense of it.
+		 */
 		if (DEBUG_LOCKS_WARN_ON(!hlock->nest_lock))
 			return 0;
 
@@ -3142,6 +3246,10 @@ __lock_set_class(struct lockdep_map *lock, const char *name,
 	int i;
 
 	depth = curr->lockdep_depth;
+	/*
+	 * This function is about (re)setting the class of a held lock,
+	 * yet we're not actually holding any locks. Naughty user!
+	 */
 	if (DEBUG_LOCKS_WARN_ON(!depth))
 		return 0;
 
@@ -3177,6 +3285,10 @@ found_it:
 			return 0;
 	}
 
+	/*
+	 * I took it apart and put it back together again, except now I have
+	 * these 'spare' parts.. where shall I put them.
+	 */
 	if (DEBUG_LOCKS_WARN_ON(curr->lockdep_depth != depth))
 		return 0;
 	return 1;
@@ -3201,6 +3313,10 @@ lock_release_non_nested(struct task_struct *curr,
 	 * of held locks:
 	 */
 	depth = curr->lockdep_depth;
+	/*
+	 * So we're all set to release this lock.. wait what lock? We don't
+	 * own any locks, you've been drinking again?
+	 */
 	if (DEBUG_LOCKS_WARN_ON(!depth))
 		return 0;
 
@@ -3253,6 +3369,10 @@ found_it:
 			return 0;
 	}
 
+	/*
+	 * We had N bottles of beer on the wall, we drank one, but now
+	 * there's not N-1 bottles of beer left on the wall...
+	 */
 	if (DEBUG_LOCKS_WARN_ON(curr->lockdep_depth != depth - 1))
 		return 0;
 	return 1;
@@ -3283,6 +3403,9 @@ static int lock_release_nested(struct task_struct *curr,
 		return lock_release_non_nested(curr, lock, ip);
 	curr->lockdep_depth--;
 
+	/*
+	 * No more locks, but somehow we've got hash left over, who left it?
+	 */
 	if (DEBUG_LOCKS_WARN_ON(!depth && (hlock->prev_chain_key != 0)))
 		return 0;
 
@@ -3365,10 +3488,13 @@ static void check_flags(unsigned long flags)
 	 * check if not in hardirq contexts:
 	 */
 	if (!hardirq_count()) {
-		if (softirq_count())
+		if (softirq_count()) {
+			/* like the above, but with softirqs */
 			DEBUG_LOCKS_WARN_ON(current->softirqs_enabled);
-		else
+		} else {
+			/* lick the above, does it taste good? */
 			DEBUG_LOCKS_WARN_ON(!current->softirqs_enabled);
+		}
 	}
 
 	if (!debug_locks)
@@ -3506,6 +3632,10 @@ __lock_contended(struct lockdep_map *lock, unsigned long ip)
 	int i, contention_point, contending_point;
 
 	depth = curr->lockdep_depth;
+	/*
+	 * Whee, we contended on this lock, except it seems we're not
+	 * actually trying to acquire anything much at all..
+	 */
 	if (DEBUG_LOCKS_WARN_ON(!depth))
 		return;
 
@@ -3555,6 +3685,10 @@ __lock_acquired(struct lockdep_map *lock, unsigned long ip)
 	int i, cpu;
 
 	depth = curr->lockdep_depth;
+	/*
+	 * Yay, we acquired ownership of this lock we didn't try to
+	 * acquire, how the heck did that happen?
+	 */
 	if (DEBUG_LOCKS_WARN_ON(!depth))
 		return;
 
@@ -3759,8 +3893,12 @@ void lockdep_reset_lock(struct lockdep_map *lock)
 				match |= class == lock->class_cache[j];
 
 			if (unlikely(match)) {
-				if (debug_locks_off_graph_unlock())
+				if (debug_locks_off_graph_unlock()) {
+					/*
+					 * We all just reset everything, how did it match?
+					 */
 					WARN_ON(1);
+				}
 				goto out_restore;
 			}
 		}
-- 
cgit v1.2.2


From cba9bd22a5f8f857534b9a7f3fb3cafa0ac5fb75 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Mon, 12 Sep 2011 13:40:05 +0200
Subject: watchdog: Drop FIFO policy in exit path

When the watchdog thread exits it runs through the exit path with FIFO
priority. There is no point in doing so. Switch back to SCHED_NORMAL
before exiting.

Cc: Don Zickus <dzickus@redhat.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Link: http://lkml.kernel.org/r/alpine.LFD.2.02.1109121337461.2723@ionos
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/watchdog.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index e952a1394d26..d680381b0e9c 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -321,7 +321,7 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
  */
 static int watchdog(void *unused)
 {
-	static struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 };
+	struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 };
 	struct hrtimer *hrtimer = &__raw_get_cpu_var(watchdog_hrtimer);
 
 	sched_setscheduler(current, SCHED_FIFO, &param);
@@ -350,7 +350,8 @@ static int watchdog(void *unused)
 		set_current_state(TASK_INTERRUPTIBLE);
 	}
 	__set_current_state(TASK_RUNNING);
-
+	param.sched_priority = 0;
+	sched_setscheduler(current, SCHED_NORMAL, &param);
 	return 0;
 }
 
-- 
cgit v1.2.2


From 6249687f76b69cc0b2ad34636f4a18d693ef3262 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Mon, 19 Sep 2011 11:35:58 -0400
Subject: tracing: Add a counter clock for those that do not trust clocks

When debugging tight race conditions, it can be helpful to have a
synchronized tracing method. Although in most cases the global clock
provides this functionality, if timings is not the issue, it is more
comforting to know that the order of events really happened in a precise
order.

Instead of using a clock, add a "counter" that is simply an incrementing
atomic 64bit counter that orders the events as they are perceived to
happen.

The trace_clock_counter() is added from the attempt by Peter Zijlstra
trying to convert the trace_clock_global() to it. I took Peter's counter
code and made trace_clock_counter() instead, and added it to the choice
of clocks. Just echo counter > /debug/tracing/trace_clock to activate
it.

Requested-by: Thomas Gleixner <tglx@linutronix.de>
Requested-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Reviewed-By: Valdis Kletnieks <valdis.kletnieks@vt.edu>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace.c       |  1 +
 kernel/trace/trace_clock.c | 12 ++++++++++++
 2 files changed, 13 insertions(+)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index b41907000bc6..4b8df0dc9358 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -435,6 +435,7 @@ static struct {
 } trace_clocks[] = {
 	{ trace_clock_local,	"local" },
 	{ trace_clock_global,	"global" },
+	{ trace_clock_counter,	"counter" },
 };
 
 int trace_clock_id;
diff --git a/kernel/trace/trace_clock.c b/kernel/trace/trace_clock.c
index 6302747a1398..394783531cbb 100644
--- a/kernel/trace/trace_clock.c
+++ b/kernel/trace/trace_clock.c
@@ -113,3 +113,15 @@ u64 notrace trace_clock_global(void)
 
 	return now;
 }
+
+static atomic64_t trace_counter;
+
+/*
+ * trace_clock_counter(): simply an atomic counter.
+ * Use the trace_counter "counter" for cases where you do not care
+ * about timings, but are interested in strict ordering.
+ */
+u64 notrace trace_clock_counter(void)
+{
+	return atomic64_add_return(1, &trace_counter);
+}
-- 
cgit v1.2.2


From 590e4d857153c5d4cf86052cdfd42cf9b0779841 Mon Sep 17 00:00:00 2001
From: Anton Blanchard <anton@samba.org>
Date: Sun, 24 Jul 2011 16:33:13 +0000
Subject: sched: Allow SD_NODES_PER_DOMAIN to be overridden

We want to override the default value of SD_NODES_PER_DOMAIN on ppc64,
so move it into linux/topology.h.

Signed-off-by: Anton Blanchard <anton@samba.org>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
---
 kernel/sched.c | 2 --
 1 file changed, 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index ec5f472bc5b9..d258b2d9a66d 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -6947,8 +6947,6 @@ static int __init isolated_cpu_setup(char *str)
 
 __setup("isolcpus=", isolated_cpu_setup);
 
-#define SD_NODES_PER_DOMAIN 16
-
 #ifdef CONFIG_NUMA
 
 /**
-- 
cgit v1.2.2


From cbbc719fccdb8cbd87350a05c0d33167c9b79365 Mon Sep 17 00:00:00 2001
From: hank <pyu@redhat.com>
Date: Tue, 20 Sep 2011 13:53:39 -0700
Subject: time: Change jiffies_to_clock_t() argument type to unsigned long

The parameter's origin type is long. On an i386 architecture, it can
easily be larger than 0x80000000, causing this function to convert it
to a sign-extended u64 type.

Change the type to unsigned long so we get the correct result.

Signed-off-by: hank <pyu@redhat.com>
Cc: John Stultz <john.stultz@linaro.org>
Cc: <stable@kernel.org>
[ build fix ]
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/time.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/time.c b/kernel/time.c
index 8e8dc6d705c9..d77606214529 100644
--- a/kernel/time.c
+++ b/kernel/time.c
@@ -575,7 +575,7 @@ EXPORT_SYMBOL(jiffies_to_timeval);
 /*
  * Convert jiffies/jiffies_64 to clock_t and back.
  */
-clock_t jiffies_to_clock_t(long x)
+clock_t jiffies_to_clock_t(unsigned long x)
 {
 #if (TICK_NSEC % (NSEC_PER_SEC / USER_HZ)) == 0
 # if HZ < USER_HZ
-- 
cgit v1.2.2


From e36de1de4a5f95b7cb3e5c37d10e6bbb91833ef0 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Thu, 22 Sep 2011 11:11:51 -0400
Subject: tracing: Fix preemptirqsoff tracer to not stop at preempt off

If irqs are disabled when preemption count reaches zero, the
preemptirqsoff tracer should not flag that as the end.

When interrupts are enabled and preemption count is not zero
the preemptirqsoff correctly continues its tracing.

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace_irqsoff.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c
index 667aa8cc0cfc..a1a3359996a7 100644
--- a/kernel/trace/trace_irqsoff.c
+++ b/kernel/trace/trace_irqsoff.c
@@ -505,13 +505,13 @@ EXPORT_SYMBOL(trace_hardirqs_off_caller);
 #ifdef CONFIG_PREEMPT_TRACER
 void trace_preempt_on(unsigned long a0, unsigned long a1)
 {
-	if (preempt_trace())
+	if (preempt_trace() && !irq_trace())
 		stop_critical_timing(a0, a1);
 }
 
 void trace_preempt_off(unsigned long a0, unsigned long a1)
 {
-	if (preempt_trace())
+	if (preempt_trace() && !irq_trace())
 		start_critical_timing(a0, a1);
 }
 #endif /* CONFIG_PREEMPT_TRACER */
-- 
cgit v1.2.2


From ab10023e0088d5075354afc7cb9e72304757dddd Mon Sep 17 00:00:00 2001
From: Colin Cross <ccross@android.com>
Date: Thu, 10 Feb 2011 02:04:45 -0800
Subject: cpu_pm: Add cpu power management notifiers

During some CPU power modes entered during idle, hotplug and
suspend, peripherals located in the CPU power domain, such as
the GIC, localtimers, and VFP, may be powered down.  Add a
notifier chain that allows drivers for those peripherals to
be notified before and after they may be reset.

Notified drivers can include VFP co-processor, interrupt controller
and it's PM extensions, local CPU timers context save/restore which
shouldn't be interrupted. Hence CPU PM event APIs  must be called
with interrupts disabled.

Signed-off-by: Colin Cross <ccross@android.com>
Signed-off-by: Santosh Shilimkar <santosh.shilimkar@ti.com>
Reviewed-by: Kevin Hilman <khilman@ti.com>
Tested-and-Acked-by: Shawn Guo <shawn.guo@linaro.org>
Tested-by: Kevin Hilman <khilman@ti.com>
Tested-by: Vishwanath BS <vishwanath.bs@ti.com>
---
 kernel/Makefile      |   1 +
 kernel/cpu_pm.c      | 200 +++++++++++++++++++++++++++++++++++++++++++++++++++
 kernel/power/Kconfig |   4 ++
 3 files changed, 205 insertions(+)
 create mode 100644 kernel/cpu_pm.c

(limited to 'kernel')

diff --git a/kernel/Makefile b/kernel/Makefile
index eca595e2fd52..988cb3da7031 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -101,6 +101,7 @@ obj-$(CONFIG_RING_BUFFER) += trace/
 obj-$(CONFIG_TRACEPOINTS) += trace/
 obj-$(CONFIG_SMP) += sched_cpupri.o
 obj-$(CONFIG_IRQ_WORK) += irq_work.o
+obj-$(CONFIG_CPU_PM) += cpu_pm.o
 
 obj-$(CONFIG_PERF_EVENTS) += events/
 
diff --git a/kernel/cpu_pm.c b/kernel/cpu_pm.c
new file mode 100644
index 000000000000..4d1ff4acd04b
--- /dev/null
+++ b/kernel/cpu_pm.c
@@ -0,0 +1,200 @@
+/*
+ * Copyright (C) 2011 Google, Inc.
+ *
+ * Author:
+ *	Colin Cross <ccross@android.com>
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#include <linux/kernel.h>
+#include <linux/cpu_pm.h>
+#include <linux/module.h>
+#include <linux/notifier.h>
+#include <linux/spinlock.h>
+
+static DEFINE_RWLOCK(cpu_pm_notifier_lock);
+static RAW_NOTIFIER_HEAD(cpu_pm_notifier_chain);
+
+static int cpu_pm_notify(enum cpu_pm_event event, int nr_to_call, int *nr_calls)
+{
+	int ret;
+
+	ret = __raw_notifier_call_chain(&cpu_pm_notifier_chain, event, NULL,
+		nr_to_call, nr_calls);
+
+	return notifier_to_errno(ret);
+}
+
+/**
+ * cpu_pm_register_notifier - register a driver with cpu_pm
+ * @nb: notifier block to register
+ *
+ * Add a driver to a list of drivers that are notified about
+ * CPU and CPU cluster low power entry and exit.
+ *
+ * This function may sleep, and has the same return conditions as
+ * raw_notifier_chain_register.
+ */
+int cpu_pm_register_notifier(struct notifier_block *nb)
+{
+	unsigned long flags;
+	int ret;
+
+	write_lock_irqsave(&cpu_pm_notifier_lock, flags);
+	ret = raw_notifier_chain_register(&cpu_pm_notifier_chain, nb);
+	write_unlock_irqrestore(&cpu_pm_notifier_lock, flags);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(cpu_pm_register_notifier);
+
+/**
+ * cpu_pm_unregister_notifier - unregister a driver with cpu_pm
+ * @nb: notifier block to be unregistered
+ *
+ * Remove a driver from the CPU PM notifier list.
+ *
+ * This function may sleep, and has the same return conditions as
+ * raw_notifier_chain_unregister.
+ */
+int cpu_pm_unregister_notifier(struct notifier_block *nb)
+{
+	unsigned long flags;
+	int ret;
+
+	write_lock_irqsave(&cpu_pm_notifier_lock, flags);
+	ret = raw_notifier_chain_unregister(&cpu_pm_notifier_chain, nb);
+	write_unlock_irqrestore(&cpu_pm_notifier_lock, flags);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(cpu_pm_unregister_notifier);
+
+/**
+ * cpm_pm_enter - CPU low power entry notifier
+ *
+ * Notifies listeners that a single CPU is entering a low power state that may
+ * cause some blocks in the same power domain as the cpu to reset.
+ *
+ * Must be called on the affected CPU with interrupts disabled.  Platform is
+ * responsible for ensuring that cpu_pm_enter is not called twice on the same
+ * CPU before cpu_pm_exit is called. Notified drivers can include VFP
+ * co-processor, interrupt controller and it's PM extensions, local CPU
+ * timers context save/restore which shouldn't be interrupted. Hence it
+ * must be called with interrupts disabled.
+ *
+ * Return conditions are same as __raw_notifier_call_chain.
+ */
+int cpu_pm_enter(void)
+{
+	int nr_calls;
+	int ret = 0;
+
+	read_lock(&cpu_pm_notifier_lock);
+	ret = cpu_pm_notify(CPU_PM_ENTER, -1, &nr_calls);
+	if (ret)
+		/*
+		 * Inform listeners (nr_calls - 1) about failure of CPU PM
+		 * PM entry who are notified earlier to prepare for it.
+		 */
+		cpu_pm_notify(CPU_PM_ENTER_FAILED, nr_calls - 1, NULL);
+	read_unlock(&cpu_pm_notifier_lock);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(cpu_pm_enter);
+
+/**
+ * cpm_pm_exit - CPU low power exit notifier
+ *
+ * Notifies listeners that a single CPU is exiting a low power state that may
+ * have caused some blocks in the same power domain as the cpu to reset.
+ *
+ * Notified drivers can include VFP co-processor, interrupt controller
+ * and it's PM extensions, local CPU timers context save/restore which
+ * shouldn't be interrupted. Hence it must be called with interrupts disabled.
+ *
+ * Return conditions are same as __raw_notifier_call_chain.
+ */
+int cpu_pm_exit(void)
+{
+	int ret;
+
+	read_lock(&cpu_pm_notifier_lock);
+	ret = cpu_pm_notify(CPU_PM_EXIT, -1, NULL);
+	read_unlock(&cpu_pm_notifier_lock);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(cpu_pm_exit);
+
+/**
+ * cpm_cluster_pm_enter - CPU cluster low power entry notifier
+ *
+ * Notifies listeners that all cpus in a power domain are entering a low power
+ * state that may cause some blocks in the same power domain to reset.
+ *
+ * Must be called after cpu_pm_enter has been called on all cpus in the power
+ * domain, and before cpu_pm_exit has been called on any cpu in the power
+ * domain. Notified drivers can include VFP co-processor, interrupt controller
+ * and it's PM extensions, local CPU timers context save/restore which
+ * shouldn't be interrupted. Hence it must be called with interrupts disabled.
+ *
+ * Must be called with interrupts disabled.
+ *
+ * Return conditions are same as __raw_notifier_call_chain.
+ */
+int cpu_cluster_pm_enter(void)
+{
+	int nr_calls;
+	int ret = 0;
+
+	read_lock(&cpu_pm_notifier_lock);
+	ret = cpu_pm_notify(CPU_CLUSTER_PM_ENTER, -1, &nr_calls);
+	if (ret)
+		/*
+		 * Inform listeners (nr_calls - 1) about failure of CPU cluster
+		 * PM entry who are notified earlier to prepare for it.
+		 */
+		cpu_pm_notify(CPU_CLUSTER_PM_ENTER_FAILED, nr_calls - 1, NULL);
+	read_unlock(&cpu_pm_notifier_lock);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(cpu_cluster_pm_enter);
+
+/**
+ * cpm_cluster_pm_exit - CPU cluster low power exit notifier
+ *
+ * Notifies listeners that all cpus in a power domain are exiting form a
+ * low power state that may have caused some blocks in the same power domain
+ * to reset.
+ *
+ * Must be called after cpu_pm_exit has been called on all cpus in the power
+ * domain, and before cpu_pm_exit has been called on any cpu in the power
+ * domain. Notified drivers can include VFP co-processor, interrupt controller
+ * and it's PM extensions, local CPU timers context save/restore which
+ * shouldn't be interrupted. Hence it must be called with interrupts disabled.
+ *
+ * Return conditions are same as __raw_notifier_call_chain.
+ */
+int cpu_cluster_pm_exit(void)
+{
+	int ret;
+
+	read_lock(&cpu_pm_notifier_lock);
+	ret = cpu_pm_notify(CPU_CLUSTER_PM_EXIT, -1, NULL);
+	read_unlock(&cpu_pm_notifier_lock);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(cpu_cluster_pm_exit);
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index 3744c594b19b..80a85971cf64 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -235,3 +235,7 @@ config PM_GENERIC_DOMAINS
 config PM_GENERIC_DOMAINS_RUNTIME
 	def_bool y
 	depends on PM_RUNTIME && PM_GENERIC_DOMAINS
+
+config CPU_PM
+	bool
+	depends on SUSPEND || CPU_IDLE
-- 
cgit v1.2.2


From 6f3eaec87b6b17bfa49cb3b5b8d07fa84be18512 Mon Sep 17 00:00:00 2001
From: Colin Cross <ccross@android.com>
Date: Fri, 22 Jul 2011 14:57:09 -0700
Subject: cpu_pm: call notifiers during suspend

Implements syscore_ops in cpu_pm to call the cpu and
cpu cluster notifiers during suspend and resume,
allowing drivers receiving the notifications to
avoid implementing syscore_ops.

Signed-off-by: Colin Cross <ccross@android.com>
Signed-off-by: Santosh Shilimkar <santosh.shilimkar@ti.com>
Reviewed-by: Kevin Hilman <khilman@ti.com>
Tested-and-Acked-by: Shawn Guo <shawn.guo@linaro.org>
Tested-by: Vishwanath BS <vishwanath.bs@ti.com>
---
 kernel/cpu_pm.c | 33 +++++++++++++++++++++++++++++++++
 1 file changed, 33 insertions(+)

(limited to 'kernel')

diff --git a/kernel/cpu_pm.c b/kernel/cpu_pm.c
index 4d1ff4acd04b..249152e15308 100644
--- a/kernel/cpu_pm.c
+++ b/kernel/cpu_pm.c
@@ -20,6 +20,7 @@
 #include <linux/module.h>
 #include <linux/notifier.h>
 #include <linux/spinlock.h>
+#include <linux/syscore_ops.h>
 
 static DEFINE_RWLOCK(cpu_pm_notifier_lock);
 static RAW_NOTIFIER_HEAD(cpu_pm_notifier_chain);
@@ -198,3 +199,35 @@ int cpu_cluster_pm_exit(void)
 	return ret;
 }
 EXPORT_SYMBOL_GPL(cpu_cluster_pm_exit);
+
+#ifdef CONFIG_PM
+static int cpu_pm_suspend(void)
+{
+	int ret;
+
+	ret = cpu_pm_enter();
+	if (ret)
+		return ret;
+
+	ret = cpu_cluster_pm_enter();
+	return ret;
+}
+
+static void cpu_pm_resume(void)
+{
+	cpu_cluster_pm_exit();
+	cpu_pm_exit();
+}
+
+static struct syscore_ops cpu_pm_syscore_ops = {
+	.suspend = cpu_pm_suspend,
+	.resume = cpu_pm_resume,
+};
+
+static int cpu_pm_init(void)
+{
+	register_syscore_ops(&cpu_pm_syscore_ops);
+	return 0;
+}
+core_initcall(cpu_pm_init);
+#endif
-- 
cgit v1.2.2


From f4cfb33ed980085b14a8f376281edb2e270f0244 Mon Sep 17 00:00:00 2001
From: Wang Xingchao <xingchao.wang@intel.com>
Date: Fri, 16 Sep 2011 13:35:52 -0400
Subject: sched: Remove redundant test in check_preempt_tick()

The caller already checks for nr_running > 1, therefore we don't have
to do so again.

Signed-off-by: Wang Xingchao <xingchao.wang@intel.com>
Reviewed-by: Paul Turner <pjt@google.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Link: http://lkml.kernel.org/r/1316194552-12019-1-git-send-email-xingchao.wang@intel.com
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched_fair.c | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 1ca2cd44d64a..fef0bfde7c8c 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -1106,6 +1106,8 @@ static void
 check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
 {
 	unsigned long ideal_runtime, delta_exec;
+	struct sched_entity *se;
+	s64 delta;
 
 	ideal_runtime = sched_slice(cfs_rq, curr);
 	delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime;
@@ -1127,16 +1129,14 @@ check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
 	if (delta_exec < sysctl_sched_min_granularity)
 		return;
 
-	if (cfs_rq->nr_running > 1) {
-		struct sched_entity *se = __pick_first_entity(cfs_rq);
-		s64 delta = curr->vruntime - se->vruntime;
+	se = __pick_first_entity(cfs_rq);
+	delta = curr->vruntime - se->vruntime;
 
-		if (delta < 0)
-			return;
+	if (delta < 0)
+		return;
 
-		if (delta > ideal_runtime)
-			resched_task(rq_of(cfs_rq)->curr);
-	}
+	if (delta > ideal_runtime)
+		resched_task(rq_of(cfs_rq)->curr);
 }
 
 static void
-- 
cgit v1.2.2


From 53b615ccca567ada1931eb04ad0614ac150c14a3 Mon Sep 17 00:00:00 2001
From: Ming Lei <ming.lei@canonical.com>
Date: Tue, 27 Sep 2011 22:53:27 +0200
Subject: PM / Runtime: Introduce trace points for tracing rpm_* functions

This patch introduces 3 trace points to prepare for tracing
rpm_idle/rpm_suspend/rpm_resume functions, so we can use these
trace points to replace the current dev_dbg().

Signed-off-by: Ming Lei <ming.lei@canonical.com>
Acked-by: Steven Rostedt <rostedt@goodmis.org>
Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
---
 kernel/trace/Makefile     |  1 +
 kernel/trace/rpm-traces.c | 20 ++++++++++++++++++++
 2 files changed, 21 insertions(+)
 create mode 100644 kernel/trace/rpm-traces.c

(limited to 'kernel')

diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index 761c510a06c5..56bdab5b3793 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -53,6 +53,7 @@ endif
 obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o
 obj-$(CONFIG_KPROBE_EVENT) += trace_kprobe.o
 obj-$(CONFIG_TRACEPOINTS) += power-traces.o
+obj-$(CONFIG_TRACEPOINTS) += rpm-traces.o
 ifeq ($(CONFIG_TRACING),y)
 obj-$(CONFIG_KGDB_KDB) += trace_kdb.o
 endif
diff --git a/kernel/trace/rpm-traces.c b/kernel/trace/rpm-traces.c
new file mode 100644
index 000000000000..4b3b5eaf94d1
--- /dev/null
+++ b/kernel/trace/rpm-traces.c
@@ -0,0 +1,20 @@
+/*
+ * Power trace points
+ *
+ * Copyright (C) 2009 Ming Lei <ming.lei@canonical.com>
+ */
+
+#include <linux/string.h>
+#include <linux/types.h>
+#include <linux/workqueue.h>
+#include <linux/sched.h>
+#include <linux/module.h>
+#include <linux/usb.h>
+
+#define CREATE_TRACE_POINTS
+#include <trace/events/rpm.h>
+
+EXPORT_TRACEPOINT_SYMBOL_GPL(rpm_return_int);
+EXPORT_TRACEPOINT_SYMBOL_GPL(rpm_idle);
+EXPORT_TRACEPOINT_SYMBOL_GPL(rpm_suspend);
+EXPORT_TRACEPOINT_SYMBOL_GPL(rpm_resume);
-- 
cgit v1.2.2


From f786ecba4158880f8cdc0ebb93e7d78e6c125449 Mon Sep 17 00:00:00 2001
From: Vladimir Zapolskiy <vzapolskiy@gmail.com>
Date: Wed, 21 Sep 2011 09:26:44 +0000
Subject: connector: add comm change event report to proc connector

Add an event to monitor comm value changes of tasks.  Such an event
becomes vital, if someone desires to control threads of a process in
different manner.

A natural characteristic of threads is its comm value, and helpfully
application developers have an opportunity to change it in runtime.
Reporting about such events via proc connector allows to fine-grain
monitoring and control potentials, for instance a process control daemon
listening to proc connector and following comm value policies can place
specific threads to assigned cgroup partitions.

It might be possible to achieve a pale partial one-shot likeness without
this update, if an application changes comm value of a thread generator
task beforehand, then a new thread is cloned, and after that proc
connector listener gets the fork event and reads new thread's comm value
from procfs stat file, but this change visibly simplifies and extends the
matter.

Signed-off-by: Vladimir Zapolskiy <vzapolskiy@gmail.com>
Acked-by: Evgeniy Polyakov <zbr@ioremap.net>
Cc: David Miller <davem@davemloft.net>
Signed-off-by: Andrew Morton <akpm@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 kernel/sys.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'kernel')

diff --git a/kernel/sys.c b/kernel/sys.c
index 18ee1d2f6474..b3dfb76f8073 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -1759,6 +1759,7 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
 					      sizeof(me->comm) - 1) < 0)
 				return -EFAULT;
 			set_task_comm(me, comm);
+			proc_comm_connector(me);
 			return 0;
 		case PR_GET_NAME:
 			get_task_comm(comm, me);
-- 
cgit v1.2.2


From 1f288094807861ec1e48c428d2c49ccf7aaf3767 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Thu, 16 Jun 2011 15:53:18 -0700
Subject: rcu: Use kthread_create_on_node()

Commit a26ac2455ffc (move TREE_RCU from softirq to kthread) added
per-CPU kthreads.  However, kthread creation uses kthread_create(), which
can put the kthread's stack and task struct on the wrong NUMA node.
Therefore, use kthread_create_on_node() instead of kthread_create()
so that the stacks and task structs are placed on the correct NUMA node.

A similar change was carried out in commit 94dcf29a11b3 (kthread:
use kthread_create_on_node()).

Also change rcutorture's priority-boost-test kthread creation.

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
CC: Tejun Heo <tj@kernel.org>
CC: Rusty Russell <rusty@rustcorp.com.au>
CC: Andrew Morton <akpm@linux-foundation.org>
CC: Andi Kleen <ak@linux.intel.com>
CC: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 kernel/rcutorture.c     | 5 +++--
 kernel/rcutree_plugin.h | 5 ++++-
 2 files changed, 7 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c
index 98f51b13bb7e..1d2415046154 100644
--- a/kernel/rcutorture.c
+++ b/kernel/rcutorture.c
@@ -1280,8 +1280,9 @@ static int rcutorture_booster_init(int cpu)
 	/* Don't allow time recalculation while creating a new task. */
 	mutex_lock(&boost_mutex);
 	VERBOSE_PRINTK_STRING("Creating rcu_torture_boost task");
-	boost_tasks[cpu] = kthread_create(rcu_torture_boost, NULL,
-					  "rcu_torture_boost");
+	boost_tasks[cpu] = kthread_create_on_node(rcu_torture_boost, NULL,
+						  cpu_to_node(cpu),
+						  "rcu_torture_boost");
 	if (IS_ERR(boost_tasks[cpu])) {
 		retval = PTR_ERR(boost_tasks[cpu]);
 		VERBOSE_PRINTK_STRING("rcu_torture_boost task create failed");
diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h
index 8aafbb80b8b0..7b850cdc0aee 100644
--- a/kernel/rcutree_plugin.h
+++ b/kernel/rcutree_plugin.h
@@ -1560,7 +1560,10 @@ static int __cpuinit rcu_spawn_one_cpu_kthread(int cpu)
 	if (!rcu_scheduler_fully_active ||
 	    per_cpu(rcu_cpu_kthread_task, cpu) != NULL)
 		return 0;
-	t = kthread_create(rcu_cpu_kthread, (void *)(long)cpu, "rcuc%d", cpu);
+	t = kthread_create_on_node(rcu_cpu_kthread,
+				   (void *)(long)cpu,
+				   cpu_to_node(cpu),
+				   "rcuc%d", cpu);
 	if (IS_ERR(t))
 		return PTR_ERR(t);
 	if (cpu_online(cpu))
-- 
cgit v1.2.2


From 1eb521210a8c9823038abe4ddfe8c69e713ec17d Mon Sep 17 00:00:00 2001
From: Shaohua Li <shaohua.li@intel.com>
Date: Thu, 16 Jun 2011 16:02:54 -0700
Subject: rcu: Avoid unnecessary self-wakeup of per-CPU kthreads

There are a number of cases where the RCU can find additional work
for the per-CPU kthread within the context of that per-CPU kthread.
In such cases, the per-CPU kthread is already running, so attempting
to wake itself up does nothing except waste CPU cycles.  This commit
therefore checks to see if it is in the per-CPU kthread context,
omitting the wakeup in this case.

Signed-off-by: Shaohua Li <shaohua.li@intel.com>
Signed-off-by: Paul E. McKenney <paul.mckenney@linaro.org>
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 kernel/rcutree_plugin.h | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h
index 7b850cdc0aee..970329853dc5 100644
--- a/kernel/rcutree_plugin.h
+++ b/kernel/rcutree_plugin.h
@@ -1291,11 +1291,9 @@ static void invoke_rcu_callbacks_kthread(void)
 
 	local_irq_save(flags);
 	__this_cpu_write(rcu_cpu_has_work, 1);
-	if (__this_cpu_read(rcu_cpu_kthread_task) == NULL) {
-		local_irq_restore(flags);
-		return;
-	}
-	wake_up_process(__this_cpu_read(rcu_cpu_kthread_task));
+	if (__this_cpu_read(rcu_cpu_kthread_task) != NULL &&
+	    current != __this_cpu_read(rcu_cpu_kthread_task))
+		wake_up_process(__this_cpu_read(rcu_cpu_kthread_task));
 	local_irq_restore(flags);
 }
 
-- 
cgit v1.2.2


From b3fbab0571eb09746cc0283648165ec00efc8eb2 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Tue, 24 May 2011 08:31:09 -0700
Subject: rcu: Restore checks for blocking in RCU read-side critical sections

Long ago, using TREE_RCU with PREEMPT would result in "scheduling
while atomic" diagnostics if you blocked in an RCU read-side critical
section.  However, PREEMPT now implies TREE_PREEMPT_RCU, which defeats
this diagnostic.  This commit therefore adds a replacement diagnostic
based on PROVE_RCU.

Because rcu_lockdep_assert() and lockdep_rcu_dereference() are now being
used for things that have nothing to do with rcu_dereference(), rename
lockdep_rcu_dereference() to lockdep_rcu_suspicious() and add a third
argument that is a string indicating what is suspicious.  This third
argument is passed in from a new third argument to rcu_lockdep_assert().
Update all calls to rcu_lockdep_assert() to add an informative third
argument.

Also, add a pair of rcu_lockdep_assert() calls from within
rcu_note_context_switch(), one complaining if a context switch occurs
in an RCU-bh read-side critical section and another complaining if a
context switch occurs in an RCU-sched read-side critical section.
These are present only if the PROVE_RCU kernel parameter is enabled.

Finally, fix some checkpatch whitespace complaints in lockdep.c.

Again, you must enable PROVE_RCU to see these new diagnostics.  But you
are enabling PROVE_RCU to check out new RCU uses in any case, aren't you?

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 kernel/lockdep.c | 84 +++++++++++++++++++++++++++++++-------------------------
 kernel/pid.c     |  4 ++-
 kernel/sched.c   |  2 ++
 3 files changed, 52 insertions(+), 38 deletions(-)

(limited to 'kernel')

diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index 91d67ce3a8d5..1e48f1c3ea70 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -1129,10 +1129,11 @@ print_circular_bug_header(struct lock_list *entry, unsigned int depth,
 	if (debug_locks_silent)
 		return 0;
 
-	printk("\n=======================================================\n");
-	printk(  "[ INFO: possible circular locking dependency detected ]\n");
+	printk("\n");
+	printk("======================================================\n");
+	printk("[ INFO: possible circular locking dependency detected ]\n");
 	print_kernel_version();
-	printk(  "-------------------------------------------------------\n");
+	printk("-------------------------------------------------------\n");
 	printk("%s/%d is trying to acquire lock:\n",
 		curr->comm, task_pid_nr(curr));
 	print_lock(check_src);
@@ -1463,11 +1464,12 @@ print_bad_irq_dependency(struct task_struct *curr,
 	if (!debug_locks_off_graph_unlock() || debug_locks_silent)
 		return 0;
 
-	printk("\n======================================================\n");
-	printk(  "[ INFO: %s-safe -> %s-unsafe lock order detected ]\n",
+	printk("\n");
+	printk("======================================================\n");
+	printk("[ INFO: %s-safe -> %s-unsafe lock order detected ]\n",
 		irqclass, irqclass);
 	print_kernel_version();
-	printk(  "------------------------------------------------------\n");
+	printk("------------------------------------------------------\n");
 	printk("%s/%d [HC%u[%lu]:SC%u[%lu]:HE%u:SE%u] is trying to acquire:\n",
 		curr->comm, task_pid_nr(curr),
 		curr->hardirq_context, hardirq_count() >> HARDIRQ_SHIFT,
@@ -1692,10 +1694,11 @@ print_deadlock_bug(struct task_struct *curr, struct held_lock *prev,
 	if (!debug_locks_off_graph_unlock() || debug_locks_silent)
 		return 0;
 
-	printk("\n=============================================\n");
-	printk(  "[ INFO: possible recursive locking detected ]\n");
+	printk("\n");
+	printk("=============================================\n");
+	printk("[ INFO: possible recursive locking detected ]\n");
 	print_kernel_version();
-	printk(  "---------------------------------------------\n");
+	printk("---------------------------------------------\n");
 	printk("%s/%d is trying to acquire lock:\n",
 		curr->comm, task_pid_nr(curr));
 	print_lock(next);
@@ -2177,10 +2180,11 @@ print_usage_bug(struct task_struct *curr, struct held_lock *this,
 	if (!debug_locks_off_graph_unlock() || debug_locks_silent)
 		return 0;
 
-	printk("\n=================================\n");
-	printk(  "[ INFO: inconsistent lock state ]\n");
+	printk("\n");
+	printk("=================================\n");
+	printk("[ INFO: inconsistent lock state ]\n");
 	print_kernel_version();
-	printk(  "---------------------------------\n");
+	printk("---------------------------------\n");
 
 	printk("inconsistent {%s} -> {%s} usage.\n",
 		usage_str[prev_bit], usage_str[new_bit]);
@@ -2241,10 +2245,11 @@ print_irq_inversion_bug(struct task_struct *curr,
 	if (!debug_locks_off_graph_unlock() || debug_locks_silent)
 		return 0;
 
-	printk("\n=========================================================\n");
-	printk(  "[ INFO: possible irq lock inversion dependency detected ]\n");
+	printk("\n");
+	printk("=========================================================\n");
+	printk("[ INFO: possible irq lock inversion dependency detected ]\n");
 	print_kernel_version();
-	printk(  "---------------------------------------------------------\n");
+	printk("---------------------------------------------------------\n");
 	printk("%s/%d just changed the state of lock:\n",
 		curr->comm, task_pid_nr(curr));
 	print_lock(this);
@@ -3065,9 +3070,10 @@ print_unlock_inbalance_bug(struct task_struct *curr, struct lockdep_map *lock,
 	if (debug_locks_silent)
 		return 0;
 
-	printk("\n=====================================\n");
-	printk(  "[ BUG: bad unlock balance detected! ]\n");
-	printk(  "-------------------------------------\n");
+	printk("\n");
+	printk("=====================================\n");
+	printk("[ BUG: bad unlock balance detected! ]\n");
+	printk("-------------------------------------\n");
 	printk("%s/%d is trying to release lock (",
 		curr->comm, task_pid_nr(curr));
 	print_lockdep_cache(lock);
@@ -3478,9 +3484,10 @@ print_lock_contention_bug(struct task_struct *curr, struct lockdep_map *lock,
 	if (debug_locks_silent)
 		return 0;
 
-	printk("\n=================================\n");
-	printk(  "[ BUG: bad contention detected! ]\n");
-	printk(  "---------------------------------\n");
+	printk("\n");
+	printk("=================================\n");
+	printk("[ BUG: bad contention detected! ]\n");
+	printk("---------------------------------\n");
 	printk("%s/%d is trying to contend lock (",
 		curr->comm, task_pid_nr(curr));
 	print_lockdep_cache(lock);
@@ -3839,9 +3846,10 @@ print_freed_lock_bug(struct task_struct *curr, const void *mem_from,
 	if (debug_locks_silent)
 		return;
 
-	printk("\n=========================\n");
-	printk(  "[ BUG: held lock freed! ]\n");
-	printk(  "-------------------------\n");
+	printk("\n");
+	printk("=========================\n");
+	printk("[ BUG: held lock freed! ]\n");
+	printk("-------------------------\n");
 	printk("%s/%d is freeing memory %p-%p, with a lock still held there!\n",
 		curr->comm, task_pid_nr(curr), mem_from, mem_to-1);
 	print_lock(hlock);
@@ -3895,9 +3903,10 @@ static void print_held_locks_bug(struct task_struct *curr)
 	if (debug_locks_silent)
 		return;
 
-	printk("\n=====================================\n");
-	printk(  "[ BUG: lock held at task exit time! ]\n");
-	printk(  "-------------------------------------\n");
+	printk("\n");
+	printk("=====================================\n");
+	printk("[ BUG: lock held at task exit time! ]\n");
+	printk("-------------------------------------\n");
 	printk("%s/%d is exiting with locks still held!\n",
 		curr->comm, task_pid_nr(curr));
 	lockdep_print_held_locks(curr);
@@ -3991,16 +4000,17 @@ void lockdep_sys_exit(void)
 	if (unlikely(curr->lockdep_depth)) {
 		if (!debug_locks_off())
 			return;
-		printk("\n================================================\n");
-		printk(  "[ BUG: lock held when returning to user space! ]\n");
-		printk(  "------------------------------------------------\n");
+		printk("\n");
+		printk("================================================\n");
+		printk("[ BUG: lock held when returning to user space! ]\n");
+		printk("------------------------------------------------\n");
 		printk("%s/%d is leaving the kernel with locks still held!\n",
 				curr->comm, curr->pid);
 		lockdep_print_held_locks(curr);
 	}
 }
 
-void lockdep_rcu_dereference(const char *file, const int line)
+void lockdep_rcu_suspicious(const char *file, const int line, const char *s)
 {
 	struct task_struct *curr = current;
 
@@ -4009,15 +4019,15 @@ void lockdep_rcu_dereference(const char *file, const int line)
 		return;
 #endif /* #ifdef CONFIG_PROVE_RCU_REPEATEDLY */
 	/* Note: the following can be executed concurrently, so be careful. */
-	printk("\n===================================================\n");
-	printk(  "[ INFO: suspicious rcu_dereference_check() usage. ]\n");
-	printk(  "---------------------------------------------------\n");
-	printk("%s:%d invoked rcu_dereference_check() without protection!\n",
-			file, line);
+	printk("\n");
+	printk("===============================\n");
+	printk("[ INFO: suspicious RCU usage. ]\n");
+	printk("-------------------------------\n");
+	printk("%s:%d %s!\n", file, line, s);
 	printk("\nother info that might help us debug this:\n\n");
 	printk("\nrcu_scheduler_active = %d, debug_locks = %d\n", rcu_scheduler_active, debug_locks);
 	lockdep_print_held_locks(curr);
 	printk("\nstack backtrace:\n");
 	dump_stack();
 }
-EXPORT_SYMBOL_GPL(lockdep_rcu_dereference);
+EXPORT_SYMBOL_GPL(lockdep_rcu_suspicious);
diff --git a/kernel/pid.c b/kernel/pid.c
index e432057f3b21..8cafe7e72ad2 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -418,7 +418,9 @@ EXPORT_SYMBOL(pid_task);
  */
 struct task_struct *find_task_by_pid_ns(pid_t nr, struct pid_namespace *ns)
 {
-	rcu_lockdep_assert(rcu_read_lock_held());
+	rcu_lockdep_assert(rcu_read_lock_held(),
+			   "find_task_by_pid_ns() needs rcu_read_lock()"
+			   " protection");
 	return pid_task(find_pid_ns(nr, ns), PIDTYPE_PID);
 }
 
diff --git a/kernel/sched.c b/kernel/sched.c
index ec5f472bc5b9..e24cebe0e6cb 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -4237,6 +4237,7 @@ static inline void schedule_debug(struct task_struct *prev)
 	 */
 	if (unlikely(in_atomic_preempt_off() && !prev->exit_state))
 		__schedule_bug(prev);
+	rcu_sleep_check();
 
 	profile_hit(SCHED_PROFILING, __builtin_return_address(0));
 
@@ -8230,6 +8231,7 @@ void __might_sleep(const char *file, int line, int preempt_offset)
 {
 	static unsigned long prev_jiffy;	/* ratelimiting */
 
+	rcu_sleep_check(); /* WARN_ON_ONCE() by default, no rate limit reqd. */
 	if ((preempt_count_equals(preempt_offset) && !irqs_disabled()) ||
 	    system_state != SYSTEM_RUNNING || oops_in_progress)
 		return;
-- 
cgit v1.2.2


From f039d1f1884b2fe9c13d28f59d8330f0b0518fc4 Mon Sep 17 00:00:00 2001
From: Andi Kleen <ak@linux.intel.com>
Date: Tue, 7 Jun 2011 15:26:30 -0700
Subject: rcu: Fix mismatched variable in rcutree_trace.c

rcutree.c defines rcu_cpu_kthread_cpu as int, not unsigned int,
so the extern has to follow that.

Signed-off-by: Andi Kleen <ak@linux.intel.com>
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 kernel/rcutree_trace.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c
index 3b0c0986afc0..0a5a8bedc522 100644
--- a/kernel/rcutree_trace.c
+++ b/kernel/rcutree_trace.c
@@ -49,7 +49,7 @@
 #ifdef CONFIG_RCU_BOOST
 
 DECLARE_PER_CPU(unsigned int, rcu_cpu_kthread_status);
-DECLARE_PER_CPU(unsigned int, rcu_cpu_kthread_cpu);
+DECLARE_PER_CPU(int, rcu_cpu_kthread_cpu);
 DECLARE_PER_CPU(unsigned int, rcu_cpu_kthread_loops);
 DECLARE_PER_CPU(char, rcu_cpu_has_work);
 
-- 
cgit v1.2.2


From 2c42818e962e2858334bf45bfc56662b3752df34 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Thu, 26 May 2011 22:14:36 -0700
Subject: rcu: Abstract common code for RCU grace-period-wait primitives

Pull the code that waits for an RCU grace period into a single function,
which is then called by synchronize_rcu() and friends in the case of
TREE_RCU and TREE_PREEMPT_RCU, and from rcu_barrier() and friends in
the case of TINY_RCU and TINY_PREEMPT_RCU.

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 kernel/rcupdate.c       | 21 ++++++++++++++++++++-
 kernel/rcutiny.c        | 28 ----------------------------
 kernel/rcutiny_plugin.h | 14 --------------
 kernel/rcutree.c        | 22 ++--------------------
 kernel/rcutree_plugin.h | 11 +----------
 5 files changed, 23 insertions(+), 73 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c
index ddddb320be61..09b3b1b54e02 100644
--- a/kernel/rcupdate.c
+++ b/kernel/rcupdate.c
@@ -94,11 +94,16 @@ EXPORT_SYMBOL_GPL(rcu_read_lock_bh_held);
 
 #endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
 
+struct rcu_synchronize {
+	struct rcu_head head;
+	struct completion completion;
+};
+
 /*
  * Awaken the corresponding synchronize_rcu() instance now that a
  * grace period has elapsed.
  */
-void wakeme_after_rcu(struct rcu_head  *head)
+static void wakeme_after_rcu(struct rcu_head  *head)
 {
 	struct rcu_synchronize *rcu;
 
@@ -106,6 +111,20 @@ void wakeme_after_rcu(struct rcu_head  *head)
 	complete(&rcu->completion);
 }
 
+void wait_rcu_gp(call_rcu_func_t crf)
+{
+	struct rcu_synchronize rcu;
+
+	init_rcu_head_on_stack(&rcu.head);
+	init_completion(&rcu.completion);
+	/* Will wake me after RCU finished. */
+	crf(&rcu.head, wakeme_after_rcu);
+	/* Wait for it. */
+	wait_for_completion(&rcu.completion);
+	destroy_rcu_head_on_stack(&rcu.head);
+}
+EXPORT_SYMBOL_GPL(wait_rcu_gp);
+
 #ifdef CONFIG_PROVE_RCU
 /*
  * wrapper function to avoid #include problems.
diff --git a/kernel/rcutiny.c b/kernel/rcutiny.c
index 7bbac7d0f5ab..f544e343256a 100644
--- a/kernel/rcutiny.c
+++ b/kernel/rcutiny.c
@@ -281,34 +281,6 @@ void call_rcu_bh(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
 }
 EXPORT_SYMBOL_GPL(call_rcu_bh);
 
-void rcu_barrier_bh(void)
-{
-	struct rcu_synchronize rcu;
-
-	init_rcu_head_on_stack(&rcu.head);
-	init_completion(&rcu.completion);
-	/* Will wake me after RCU finished. */
-	call_rcu_bh(&rcu.head, wakeme_after_rcu);
-	/* Wait for it. */
-	wait_for_completion(&rcu.completion);
-	destroy_rcu_head_on_stack(&rcu.head);
-}
-EXPORT_SYMBOL_GPL(rcu_barrier_bh);
-
-void rcu_barrier_sched(void)
-{
-	struct rcu_synchronize rcu;
-
-	init_rcu_head_on_stack(&rcu.head);
-	init_completion(&rcu.completion);
-	/* Will wake me after RCU finished. */
-	call_rcu_sched(&rcu.head, wakeme_after_rcu);
-	/* Wait for it. */
-	wait_for_completion(&rcu.completion);
-	destroy_rcu_head_on_stack(&rcu.head);
-}
-EXPORT_SYMBOL_GPL(rcu_barrier_sched);
-
 /*
  * Spawn the kthread that invokes RCU callbacks.
  */
diff --git a/kernel/rcutiny_plugin.h b/kernel/rcutiny_plugin.h
index f259c676195f..6b0cedb383e0 100644
--- a/kernel/rcutiny_plugin.h
+++ b/kernel/rcutiny_plugin.h
@@ -697,20 +697,6 @@ void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
 }
 EXPORT_SYMBOL_GPL(call_rcu);
 
-void rcu_barrier(void)
-{
-	struct rcu_synchronize rcu;
-
-	init_rcu_head_on_stack(&rcu.head);
-	init_completion(&rcu.completion);
-	/* Will wake me after RCU finished. */
-	call_rcu(&rcu.head, wakeme_after_rcu);
-	/* Wait for it. */
-	wait_for_completion(&rcu.completion);
-	destroy_rcu_head_on_stack(&rcu.head);
-}
-EXPORT_SYMBOL_GPL(rcu_barrier);
-
 /*
  * synchronize_rcu - wait until a grace period has elapsed.
  *
diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index ba06207b1dd3..a7c6bce1af83 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -1613,18 +1613,9 @@ EXPORT_SYMBOL_GPL(call_rcu_bh);
  */
 void synchronize_sched(void)
 {
-	struct rcu_synchronize rcu;
-
 	if (rcu_blocking_is_gp())
 		return;
-
-	init_rcu_head_on_stack(&rcu.head);
-	init_completion(&rcu.completion);
-	/* Will wake me after RCU finished. */
-	call_rcu_sched(&rcu.head, wakeme_after_rcu);
-	/* Wait for it. */
-	wait_for_completion(&rcu.completion);
-	destroy_rcu_head_on_stack(&rcu.head);
+	wait_rcu_gp(call_rcu_sched);
 }
 EXPORT_SYMBOL_GPL(synchronize_sched);
 
@@ -1639,18 +1630,9 @@ EXPORT_SYMBOL_GPL(synchronize_sched);
  */
 void synchronize_rcu_bh(void)
 {
-	struct rcu_synchronize rcu;
-
 	if (rcu_blocking_is_gp())
 		return;
-
-	init_rcu_head_on_stack(&rcu.head);
-	init_completion(&rcu.completion);
-	/* Will wake me after RCU finished. */
-	call_rcu_bh(&rcu.head, wakeme_after_rcu);
-	/* Wait for it. */
-	wait_for_completion(&rcu.completion);
-	destroy_rcu_head_on_stack(&rcu.head);
+	wait_rcu_gp(call_rcu_bh);
 }
 EXPORT_SYMBOL_GPL(synchronize_rcu_bh);
 
diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h
index 970329853dc5..43daa46bc6f2 100644
--- a/kernel/rcutree_plugin.h
+++ b/kernel/rcutree_plugin.h
@@ -656,18 +656,9 @@ EXPORT_SYMBOL_GPL(call_rcu);
  */
 void synchronize_rcu(void)
 {
-	struct rcu_synchronize rcu;
-
 	if (!rcu_scheduler_active)
 		return;
-
-	init_rcu_head_on_stack(&rcu.head);
-	init_completion(&rcu.completion);
-	/* Will wake me after RCU finished. */
-	call_rcu(&rcu.head, wakeme_after_rcu);
-	/* Wait for it. */
-	wait_for_completion(&rcu.completion);
-	destroy_rcu_head_on_stack(&rcu.head);
+	wait_rcu_gp(call_rcu);
 }
 EXPORT_SYMBOL_GPL(synchronize_rcu);
 
-- 
cgit v1.2.2


From bdf2a4364904d6cf2f59b16b0bd86fdc5a2c6152 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Tue, 7 Jun 2011 16:59:35 -0700
Subject: rcu: Catch rcutorture up to new RCU API additions

Now that the RCU API contains synchronize_rcu_bh(), synchronize_sched(),
call_rcu_sched(), and rcu_bh_expedited()...

Make rcutorture test synchronize_rcu_bh(), getting rid of the old
rcu_bh_torture_synchronize() workaround.  Similarly, make rcutorture test
synchronize_sched(), getting rid of the old sched_torture_synchronize()
workaround.  Make rcutorture test call_rcu_sched() instead of wrappering
synchronize_sched().  Also add testing of rcu_bh_expedited().

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 kernel/rcutorture.c | 55 ++++++++++++++++++++---------------------------------
 1 file changed, 21 insertions(+), 34 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c
index 1d2415046154..75fca518888c 100644
--- a/kernel/rcutorture.c
+++ b/kernel/rcutorture.c
@@ -480,30 +480,6 @@ static void rcu_bh_torture_deferred_free(struct rcu_torture *p)
 	call_rcu_bh(&p->rtort_rcu, rcu_torture_cb);
 }
 
-struct rcu_bh_torture_synchronize {
-	struct rcu_head head;
-	struct completion completion;
-};
-
-static void rcu_bh_torture_wakeme_after_cb(struct rcu_head *head)
-{
-	struct rcu_bh_torture_synchronize *rcu;
-
-	rcu = container_of(head, struct rcu_bh_torture_synchronize, head);
-	complete(&rcu->completion);
-}
-
-static void rcu_bh_torture_synchronize(void)
-{
-	struct rcu_bh_torture_synchronize rcu;
-
-	init_rcu_head_on_stack(&rcu.head);
-	init_completion(&rcu.completion);
-	call_rcu_bh(&rcu.head, rcu_bh_torture_wakeme_after_cb);
-	wait_for_completion(&rcu.completion);
-	destroy_rcu_head_on_stack(&rcu.head);
-}
-
 static struct rcu_torture_ops rcu_bh_ops = {
 	.init		= NULL,
 	.cleanup	= NULL,
@@ -512,7 +488,7 @@ static struct rcu_torture_ops rcu_bh_ops = {
 	.readunlock	= rcu_bh_torture_read_unlock,
 	.completed	= rcu_bh_torture_completed,
 	.deferred_free	= rcu_bh_torture_deferred_free,
-	.sync		= rcu_bh_torture_synchronize,
+	.sync		= synchronize_rcu_bh,
 	.cb_barrier	= rcu_barrier_bh,
 	.fqs		= rcu_bh_force_quiescent_state,
 	.stats		= NULL,
@@ -528,7 +504,7 @@ static struct rcu_torture_ops rcu_bh_sync_ops = {
 	.readunlock	= rcu_bh_torture_read_unlock,
 	.completed	= rcu_bh_torture_completed,
 	.deferred_free	= rcu_sync_torture_deferred_free,
-	.sync		= rcu_bh_torture_synchronize,
+	.sync		= synchronize_rcu_bh,
 	.cb_barrier	= NULL,
 	.fqs		= rcu_bh_force_quiescent_state,
 	.stats		= NULL,
@@ -536,6 +512,22 @@ static struct rcu_torture_ops rcu_bh_sync_ops = {
 	.name		= "rcu_bh_sync"
 };
 
+static struct rcu_torture_ops rcu_bh_expedited_ops = {
+	.init		= rcu_sync_torture_init,
+	.cleanup	= NULL,
+	.readlock	= rcu_bh_torture_read_lock,
+	.read_delay	= rcu_read_delay,  /* just reuse rcu's version. */
+	.readunlock	= rcu_bh_torture_read_unlock,
+	.completed	= rcu_bh_torture_completed,
+	.deferred_free	= rcu_sync_torture_deferred_free,
+	.sync		= synchronize_rcu_bh_expedited,
+	.cb_barrier	= NULL,
+	.fqs		= rcu_bh_force_quiescent_state,
+	.stats		= NULL,
+	.irq_capable	= 1,
+	.name		= "rcu_bh_expedited"
+};
+
 /*
  * Definitions for srcu torture testing.
  */
@@ -659,11 +651,6 @@ static void rcu_sched_torture_deferred_free(struct rcu_torture *p)
 	call_rcu_sched(&p->rtort_rcu, rcu_torture_cb);
 }
 
-static void sched_torture_synchronize(void)
-{
-	synchronize_sched();
-}
-
 static struct rcu_torture_ops sched_ops = {
 	.init		= rcu_sync_torture_init,
 	.cleanup	= NULL,
@@ -672,7 +659,7 @@ static struct rcu_torture_ops sched_ops = {
 	.readunlock	= sched_torture_read_unlock,
 	.completed	= rcu_no_completed,
 	.deferred_free	= rcu_sched_torture_deferred_free,
-	.sync		= sched_torture_synchronize,
+	.sync		= synchronize_sched,
 	.cb_barrier	= rcu_barrier_sched,
 	.fqs		= rcu_sched_force_quiescent_state,
 	.stats		= NULL,
@@ -688,7 +675,7 @@ static struct rcu_torture_ops sched_sync_ops = {
 	.readunlock	= sched_torture_read_unlock,
 	.completed	= rcu_no_completed,
 	.deferred_free	= rcu_sync_torture_deferred_free,
-	.sync		= sched_torture_synchronize,
+	.sync		= synchronize_sched,
 	.cb_barrier	= NULL,
 	.fqs		= rcu_sched_force_quiescent_state,
 	.stats		= NULL,
@@ -1425,7 +1412,7 @@ rcu_torture_init(void)
 	int firsterr = 0;
 	static struct rcu_torture_ops *torture_ops[] =
 		{ &rcu_ops, &rcu_sync_ops, &rcu_expedited_ops,
-		  &rcu_bh_ops, &rcu_bh_sync_ops,
+		  &rcu_bh_ops, &rcu_bh_sync_ops, &rcu_bh_expedited_ops,
 		  &srcu_ops, &srcu_expedited_ops,
 		  &sched_ops, &sched_sync_ops, &sched_expedited_ops, };
 
-- 
cgit v1.2.2


From 9d68197c05201d8edc70d58bd1d5dad05d8455e8 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paul.mckenney@linaro.org>
Date: Tue, 21 Jun 2011 01:48:03 -0700
Subject: rcu: Don't destroy rcu_torture_boost() callback until it is done

The rcu_torture_boost() cleanup code destroyed debug-objects state before
waiting for the last RCU callback to be invoked, resulting in rare but
very real debug-objects warnings.  Move the destruction to after the
waiting to fix this problem.

Signed-off-by: Paul E. McKenney <paul.mckenney@linaro.org>
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 kernel/rcutorture.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c
index 75fca518888c..6a30ea3f1d82 100644
--- a/kernel/rcutorture.c
+++ b/kernel/rcutorture.c
@@ -796,11 +796,11 @@ checkwait:	rcu_stutter_wait("rcu_torture_boost");
 
 	/* Clean up and exit. */
 	VERBOSE_PRINTK_STRING("rcu_torture_boost task stopping");
-	destroy_rcu_head_on_stack(&rbi.rcu);
 	rcutorture_shutdown_absorb("rcu_torture_boost");
 	while (!kthread_should_stop() || rbi.inflight)
 		schedule_timeout_uninterruptible(1);
 	smp_mb(); /* order accesses to ->inflight before stack-frame death. */
+	destroy_rcu_head_on_stack(&rbi.rcu);
 	return 0;
 }
 
-- 
cgit v1.2.2


From 29c00b4a1d9e277786120032aa8364631820d863 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paul.mckenney@linaro.org>
Date: Fri, 17 Jun 2011 15:53:19 -0700
Subject: rcu: Add event-tracing for RCU callback invocation

There was recently some controversy about the overhead of invoking RCU
callbacks.  Add TRACE_EVENT()s to obtain fine-grained timings for the
start and stop of a batch of callbacks and also for each callback invoked.

Signed-off-by: Paul E. McKenney <paul.mckenney@linaro.org>
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 kernel/rcu.h      | 79 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
 kernel/rcupdate.c |  5 ++++
 kernel/rcutiny.c  | 26 +++++++++++++++++-
 kernel/rcutree.c  | 15 ++++++++---
 4 files changed, 121 insertions(+), 4 deletions(-)
 create mode 100644 kernel/rcu.h

(limited to 'kernel')

diff --git a/kernel/rcu.h b/kernel/rcu.h
new file mode 100644
index 000000000000..7bc16436aba0
--- /dev/null
+++ b/kernel/rcu.h
@@ -0,0 +1,79 @@
+/*
+ * Read-Copy Update definitions shared among RCU implementations.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * Copyright IBM Corporation, 2011
+ *
+ * Author: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
+ */
+
+#ifndef __LINUX_RCU_H
+#define __LINUX_RCU_H
+
+/*
+ * debug_rcu_head_queue()/debug_rcu_head_unqueue() are used internally
+ * by call_rcu() and rcu callback execution, and are therefore not part of the
+ * RCU API. Leaving in rcupdate.h because they are used by all RCU flavors.
+ */
+
+#ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD
+# define STATE_RCU_HEAD_READY	0
+# define STATE_RCU_HEAD_QUEUED	1
+
+extern struct debug_obj_descr rcuhead_debug_descr;
+
+static inline void debug_rcu_head_queue(struct rcu_head *head)
+{
+	WARN_ON_ONCE((unsigned long)head & 0x3);
+	debug_object_activate(head, &rcuhead_debug_descr);
+	debug_object_active_state(head, &rcuhead_debug_descr,
+				  STATE_RCU_HEAD_READY,
+				  STATE_RCU_HEAD_QUEUED);
+}
+
+static inline void debug_rcu_head_unqueue(struct rcu_head *head)
+{
+	debug_object_active_state(head, &rcuhead_debug_descr,
+				  STATE_RCU_HEAD_QUEUED,
+				  STATE_RCU_HEAD_READY);
+	debug_object_deactivate(head, &rcuhead_debug_descr);
+}
+#else	/* !CONFIG_DEBUG_OBJECTS_RCU_HEAD */
+static inline void debug_rcu_head_queue(struct rcu_head *head)
+{
+}
+
+static inline void debug_rcu_head_unqueue(struct rcu_head *head)
+{
+}
+#endif	/* #else !CONFIG_DEBUG_OBJECTS_RCU_HEAD */
+
+extern void kfree(const void *);
+
+static inline void __rcu_reclaim(struct rcu_head *head)
+{
+	unsigned long offset = (unsigned long)head->func;
+
+	if (__is_kfree_rcu_offset(offset)) {
+		trace_rcu_invoke_kfree_callback(head, offset);
+		kfree((void *)head - offset);
+	} else {
+		trace_rcu_invoke_callback(head);
+		head->func(head);
+	}
+}
+
+#endif /* __LINUX_RCU_H */
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c
index 09b3b1b54e02..ca0d23b6b3e8 100644
--- a/kernel/rcupdate.c
+++ b/kernel/rcupdate.c
@@ -46,6 +46,11 @@
 #include <linux/module.h>
 #include <linux/hardirq.h>
 
+#define CREATE_TRACE_POINTS
+#include <trace/events/rcu.h>
+
+#include "rcu.h"
+
 #ifdef CONFIG_DEBUG_LOCK_ALLOC
 static struct lock_class_key rcu_lock_key;
 struct lockdep_map rcu_lock_map =
diff --git a/kernel/rcutiny.c b/kernel/rcutiny.c
index f544e343256a..19453ba1392e 100644
--- a/kernel/rcutiny.c
+++ b/kernel/rcutiny.c
@@ -37,6 +37,25 @@
 #include <linux/cpu.h>
 #include <linux/prefetch.h>
 
+#ifdef CONFIG_RCU_TRACE
+
+#include <trace/events/rcu.h>
+
+#else /* #ifdef CONFIG_RCU_TRACE */
+
+/* No by-default tracing in TINY_RCU: Keep TINY_RCU tiny! */
+static void trace_rcu_invoke_kfree_callback(struct rcu_head *rhp,
+					    unsigned long offset)
+{
+}
+static void trace_rcu_invoke_callback(struct rcu_head *head)
+{
+}
+
+#endif /* #else #ifdef CONFIG_RCU_TRACE */
+
+#include "rcu.h"
+
 /* Controls for rcu_kthread() kthread, replacing RCU_SOFTIRQ used previously. */
 static struct task_struct *rcu_kthread_task;
 static DECLARE_WAIT_QUEUE_HEAD(rcu_kthread_wq);
@@ -161,11 +180,15 @@ static void rcu_process_callbacks(struct rcu_ctrlblk *rcp)
 	RCU_TRACE(int cb_count = 0);
 
 	/* If no RCU callbacks ready to invoke, just return. */
-	if (&rcp->rcucblist == rcp->donetail)
+	if (&rcp->rcucblist == rcp->donetail) {
+		RCU_TRACE(trace_rcu_batch_start(0, -1));
+		RCU_TRACE(trace_rcu_batch_end(0));
 		return;
+	}
 
 	/* Move the ready-to-invoke callbacks to a local list. */
 	local_irq_save(flags);
+	RCU_TRACE(trace_rcu_batch_start(0, -1));
 	list = rcp->rcucblist;
 	rcp->rcucblist = *rcp->donetail;
 	*rcp->donetail = NULL;
@@ -187,6 +210,7 @@ static void rcu_process_callbacks(struct rcu_ctrlblk *rcp)
 		RCU_TRACE(cb_count++);
 	}
 	RCU_TRACE(rcu_trace_sub_qlen(rcp, cb_count));
+	RCU_TRACE(trace_rcu_batch_end(cb_count));
 }
 
 /*
diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index a7c6bce1af83..45dcc2036a1e 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -52,6 +52,9 @@
 #include <linux/prefetch.h>
 
 #include "rcutree.h"
+#include <trace/events/rcu.h>
+
+#include "rcu.h"
 
 /* Data structures. */
 
@@ -1190,17 +1193,22 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp)
 {
 	unsigned long flags;
 	struct rcu_head *next, *list, **tail;
-	int count;
+	int bl, count;
 
 	/* If no callbacks are ready, just return.*/
-	if (!cpu_has_callbacks_ready_to_invoke(rdp))
+	if (!cpu_has_callbacks_ready_to_invoke(rdp)) {
+		trace_rcu_batch_start(0, 0);
+		trace_rcu_batch_end(0);
 		return;
+	}
 
 	/*
 	 * Extract the list of ready callbacks, disabling to prevent
 	 * races with call_rcu() from interrupt handlers.
 	 */
 	local_irq_save(flags);
+	bl = rdp->blimit;
+	trace_rcu_batch_start(rdp->qlen, bl);
 	list = rdp->nxtlist;
 	rdp->nxtlist = *rdp->nxttail[RCU_DONE_TAIL];
 	*rdp->nxttail[RCU_DONE_TAIL] = NULL;
@@ -1218,11 +1226,12 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp)
 		debug_rcu_head_unqueue(list);
 		__rcu_reclaim(list);
 		list = next;
-		if (++count >= rdp->blimit)
+		if (++count >= bl)
 			break;
 	}
 
 	local_irq_save(flags);
+	trace_rcu_batch_end(count);
 
 	/* Update count, and requeue any remaining callbacks. */
 	rdp->qlen -= count;
-- 
cgit v1.2.2


From 300df91ca9358f7f09298eec9503c12b32054ef7 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paul.mckenney@linaro.org>
Date: Sat, 18 Jun 2011 22:26:31 -0700
Subject: rcu: Event-trace markers for computing RCU CPU utilization

This commit adds the trace_rcu_utilization() marker that is to be
used to allow postprocessing scripts compute RCU's CPU utilization,
give or take event-trace overhead.  Note that we do not include RCU's
dyntick-idle interface because event tracing requires RCU protection,
which is not available in dyntick-idle mode.

Signed-off-by: Paul E. McKenney <paul.mckenney@linaro.org>
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 kernel/rcutree.c | 16 +++++++++++++++-
 1 file changed, 15 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index 45dcc2036a1e..2a9643bd6ae9 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -184,8 +184,10 @@ void rcu_bh_qs(int cpu)
  */
 void rcu_note_context_switch(int cpu)
 {
+	trace_rcu_utilization("Start context switch");
 	rcu_sched_qs(cpu);
 	rcu_preempt_note_context_switch(cpu);
+	trace_rcu_utilization("End context switch");
 }
 EXPORT_SYMBOL_GPL(rcu_note_context_switch);
 
@@ -1275,6 +1277,7 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp)
  */
 void rcu_check_callbacks(int cpu, int user)
 {
+	trace_rcu_utilization("Start scheduler-tick");
 	if (user ||
 	    (idle_cpu(cpu) && rcu_scheduler_active &&
 	     !in_softirq() && hardirq_count() <= (1 << HARDIRQ_SHIFT))) {
@@ -1308,6 +1311,7 @@ void rcu_check_callbacks(int cpu, int user)
 	rcu_preempt_check_callbacks(cpu);
 	if (rcu_pending(cpu))
 		invoke_rcu_core();
+	trace_rcu_utilization("End scheduler-tick");
 }
 
 #ifdef CONFIG_SMP
@@ -1369,10 +1373,14 @@ static void force_quiescent_state(struct rcu_state *rsp, int relaxed)
 	unsigned long flags;
 	struct rcu_node *rnp = rcu_get_root(rsp);
 
-	if (!rcu_gp_in_progress(rsp))
+	trace_rcu_utilization("Start fqs");
+	if (!rcu_gp_in_progress(rsp)) {
+		trace_rcu_utilization("End fqs");
 		return;  /* No grace period in progress, nothing to force. */
+	}
 	if (!raw_spin_trylock_irqsave(&rsp->fqslock, flags)) {
 		rsp->n_force_qs_lh++; /* Inexact, can lose counts.  Tough! */
+		trace_rcu_utilization("End fqs");
 		return;	/* Someone else is already on the job. */
 	}
 	if (relaxed && ULONG_CMP_GE(rsp->jiffies_force_qs, jiffies))
@@ -1421,11 +1429,13 @@ static void force_quiescent_state(struct rcu_state *rsp, int relaxed)
 		raw_spin_unlock(&rsp->fqslock); /* irqs remain disabled */
 		rsp->fqs_need_gp = 0;
 		rcu_start_gp(rsp, flags); /* releases rnp->lock */
+		trace_rcu_utilization("End fqs");
 		return;
 	}
 	raw_spin_unlock(&rnp->lock);  /* irqs remain disabled */
 unlock_fqs_ret:
 	raw_spin_unlock_irqrestore(&rsp->fqslock, flags);
+	trace_rcu_utilization("End fqs");
 }
 
 #else /* #ifdef CONFIG_SMP */
@@ -1481,6 +1491,7 @@ __rcu_process_callbacks(struct rcu_state *rsp, struct rcu_data *rdp)
  */
 static void rcu_process_callbacks(struct softirq_action *unused)
 {
+	trace_rcu_utilization("Start RCU core");
 	__rcu_process_callbacks(&rcu_sched_state,
 				&__get_cpu_var(rcu_sched_data));
 	__rcu_process_callbacks(&rcu_bh_state, &__get_cpu_var(rcu_bh_data));
@@ -1488,6 +1499,7 @@ static void rcu_process_callbacks(struct softirq_action *unused)
 
 	/* If we are last CPU on way to dyntick-idle mode, accelerate it. */
 	rcu_needs_cpu_flush();
+	trace_rcu_utilization("End RCU core");
 }
 
 /*
@@ -1910,6 +1922,7 @@ static int __cpuinit rcu_cpu_notify(struct notifier_block *self,
 	struct rcu_data *rdp = per_cpu_ptr(rcu_state->rda, cpu);
 	struct rcu_node *rnp = rdp->mynode;
 
+	trace_rcu_utilization("Start CPU hotplug");
 	switch (action) {
 	case CPU_UP_PREPARE:
 	case CPU_UP_PREPARE_FROZEN:
@@ -1945,6 +1958,7 @@ static int __cpuinit rcu_cpu_notify(struct notifier_block *self,
 	default:
 		break;
 	}
+	trace_rcu_utilization("End CPU hotplug");
 	return NOTIFY_OK;
 }
 
-- 
cgit v1.2.2


From e99033c5c160f1f247c665923a66acec693a967c Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paul.mckenney@linaro.org>
Date: Tue, 21 Jun 2011 00:13:44 -0700
Subject: rcu: Put names into TINY_RCU structures under RCU_TRACE

In order to allow event tracing to distinguish between flavors of
RCU, we need those names in the relevant RCU data structures.  TINY_RCU
has avoided them for memory-footprint reasons, so add them only if
CONFIG_RCU_TRACE=y.

Signed-off-by: Paul E. McKenney <paul.mckenney@linaro.org>
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 kernel/rcu.h            | 10 ++++++++--
 kernel/rcutiny.c        | 13 -------------
 kernel/rcutiny_plugin.h | 10 ++++------
 kernel/rcutree.c        | 10 +++++-----
 kernel/rcutree_plugin.h |  2 +-
 5 files changed, 18 insertions(+), 27 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcu.h b/kernel/rcu.h
index 7bc16436aba0..d7f00ec8b47b 100644
--- a/kernel/rcu.h
+++ b/kernel/rcu.h
@@ -23,6 +23,12 @@
 #ifndef __LINUX_RCU_H
 #define __LINUX_RCU_H
 
+#ifdef CONFIG_RCU_TRACE
+#define RCU_TRACE(stmt) stmt
+#else /* #ifdef CONFIG_RCU_TRACE */
+#define RCU_TRACE(stmt)
+#endif /* #else #ifdef CONFIG_RCU_TRACE */
+
 /*
  * debug_rcu_head_queue()/debug_rcu_head_unqueue() are used internally
  * by call_rcu() and rcu callback execution, and are therefore not part of the
@@ -68,10 +74,10 @@ static inline void __rcu_reclaim(struct rcu_head *head)
 	unsigned long offset = (unsigned long)head->func;
 
 	if (__is_kfree_rcu_offset(offset)) {
-		trace_rcu_invoke_kfree_callback(head, offset);
+		RCU_TRACE(trace_rcu_invoke_kfree_callback(head, offset));
 		kfree((void *)head - offset);
 	} else {
-		trace_rcu_invoke_callback(head);
+		RCU_TRACE(trace_rcu_invoke_callback(head));
 		head->func(head);
 	}
 }
diff --git a/kernel/rcutiny.c b/kernel/rcutiny.c
index 19453ba1392e..0d28974b78f4 100644
--- a/kernel/rcutiny.c
+++ b/kernel/rcutiny.c
@@ -38,20 +38,7 @@
 #include <linux/prefetch.h>
 
 #ifdef CONFIG_RCU_TRACE
-
 #include <trace/events/rcu.h>
-
-#else /* #ifdef CONFIG_RCU_TRACE */
-
-/* No by-default tracing in TINY_RCU: Keep TINY_RCU tiny! */
-static void trace_rcu_invoke_kfree_callback(struct rcu_head *rhp,
-					    unsigned long offset)
-{
-}
-static void trace_rcu_invoke_callback(struct rcu_head *head)
-{
-}
-
 #endif /* #else #ifdef CONFIG_RCU_TRACE */
 
 #include "rcu.h"
diff --git a/kernel/rcutiny_plugin.h b/kernel/rcutiny_plugin.h
index 6b0cedb383e0..791ddf7c99ab 100644
--- a/kernel/rcutiny_plugin.h
+++ b/kernel/rcutiny_plugin.h
@@ -26,29 +26,26 @@
 #include <linux/debugfs.h>
 #include <linux/seq_file.h>
 
-#ifdef CONFIG_RCU_TRACE
-#define RCU_TRACE(stmt)	stmt
-#else /* #ifdef CONFIG_RCU_TRACE */
-#define RCU_TRACE(stmt)
-#endif /* #else #ifdef CONFIG_RCU_TRACE */
-
 /* Global control variables for rcupdate callback mechanism. */
 struct rcu_ctrlblk {
 	struct rcu_head *rcucblist;	/* List of pending callbacks (CBs). */
 	struct rcu_head **donetail;	/* ->next pointer of last "done" CB. */
 	struct rcu_head **curtail;	/* ->next pointer of last CB. */
 	RCU_TRACE(long qlen);		/* Number of pending CBs. */
+	RCU_TRACE(char *name);		/* Name of RCU type. */
 };
 
 /* Definition for rcupdate control block. */
 static struct rcu_ctrlblk rcu_sched_ctrlblk = {
 	.donetail	= &rcu_sched_ctrlblk.rcucblist,
 	.curtail	= &rcu_sched_ctrlblk.rcucblist,
+	RCU_TRACE(.name = "rcu_sched")
 };
 
 static struct rcu_ctrlblk rcu_bh_ctrlblk = {
 	.donetail	= &rcu_bh_ctrlblk.rcucblist,
 	.curtail	= &rcu_bh_ctrlblk.rcucblist,
+	RCU_TRACE(.name = "rcu_bh")
 };
 
 #ifdef CONFIG_DEBUG_LOCK_ALLOC
@@ -131,6 +128,7 @@ static struct rcu_preempt_ctrlblk rcu_preempt_ctrlblk = {
 	.rcb.curtail = &rcu_preempt_ctrlblk.rcb.rcucblist,
 	.nexttail = &rcu_preempt_ctrlblk.rcb.rcucblist,
 	.blkd_tasks = LIST_HEAD_INIT(rcu_preempt_ctrlblk.blkd_tasks),
+	RCU_TRACE(.rcb.name = "rcu_preempt")
 };
 
 static int rcu_preempted_readers_exp(void);
diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index 2a9643bd6ae9..b953e2c72e25 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -61,7 +61,7 @@
 static struct lock_class_key rcu_node_class[NUM_RCU_LVLS];
 
 #define RCU_STATE_INITIALIZER(structname) { \
-	.level = { &structname.node[0] }, \
+	.level = { &structname##_state.node[0] }, \
 	.levelcnt = { \
 		NUM_RCU_LVL_0,  /* root of hierarchy. */ \
 		NUM_RCU_LVL_1, \
@@ -72,17 +72,17 @@ static struct lock_class_key rcu_node_class[NUM_RCU_LVLS];
 	.signaled = RCU_GP_IDLE, \
 	.gpnum = -300, \
 	.completed = -300, \
-	.onofflock = __RAW_SPIN_LOCK_UNLOCKED(&structname.onofflock), \
-	.fqslock = __RAW_SPIN_LOCK_UNLOCKED(&structname.fqslock), \
+	.onofflock = __RAW_SPIN_LOCK_UNLOCKED(&structname##_state.onofflock), \
+	.fqslock = __RAW_SPIN_LOCK_UNLOCKED(&structname##_state.fqslock), \
 	.n_force_qs = 0, \
 	.n_force_qs_ngp = 0, \
 	.name = #structname, \
 }
 
-struct rcu_state rcu_sched_state = RCU_STATE_INITIALIZER(rcu_sched_state);
+struct rcu_state rcu_sched_state = RCU_STATE_INITIALIZER(rcu_sched);
 DEFINE_PER_CPU(struct rcu_data, rcu_sched_data);
 
-struct rcu_state rcu_bh_state = RCU_STATE_INITIALIZER(rcu_bh_state);
+struct rcu_state rcu_bh_state = RCU_STATE_INITIALIZER(rcu_bh);
 DEFINE_PER_CPU(struct rcu_data, rcu_bh_data);
 
 static struct rcu_state *rcu_state;
diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h
index 43daa46bc6f2..a90bf3c17492 100644
--- a/kernel/rcutree_plugin.h
+++ b/kernel/rcutree_plugin.h
@@ -64,7 +64,7 @@ static void __init rcu_bootup_announce_oddness(void)
 
 #ifdef CONFIG_TREE_PREEMPT_RCU
 
-struct rcu_state rcu_preempt_state = RCU_STATE_INITIALIZER(rcu_preempt_state);
+struct rcu_state rcu_preempt_state = RCU_STATE_INITIALIZER(rcu_preempt);
 DEFINE_PER_CPU(struct rcu_data, rcu_preempt_data);
 static struct rcu_state *rcu_state = &rcu_preempt_state;
 
-- 
cgit v1.2.2


From 72fe701b70e6ced35d734b676c13efbc8fc769a9 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paul.mckenney@linaro.org>
Date: Tue, 21 Jun 2011 01:14:54 -0700
Subject: rcu: Add RCU type to callback-invocation tracing

Add a string to the rcu_batch_start() and rcu_batch_end() trace
messages that indicates the RCU type ("rcu_sched", "rcu_bh", or
"rcu_preempt").  The trace messages for the actual invocations
themselves are not marked, as it should be clear from the
rcu_batch_start() and rcu_batch_end() events before and after.

Signed-off-by: Paul E. McKenney <paul.mckenney@linaro.org>
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 kernel/rcutiny.c | 8 ++++----
 kernel/rcutree.c | 8 ++++----
 2 files changed, 8 insertions(+), 8 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcutiny.c b/kernel/rcutiny.c
index 0d28974b78f4..1c37bdd464f1 100644
--- a/kernel/rcutiny.c
+++ b/kernel/rcutiny.c
@@ -168,14 +168,14 @@ static void rcu_process_callbacks(struct rcu_ctrlblk *rcp)
 
 	/* If no RCU callbacks ready to invoke, just return. */
 	if (&rcp->rcucblist == rcp->donetail) {
-		RCU_TRACE(trace_rcu_batch_start(0, -1));
-		RCU_TRACE(trace_rcu_batch_end(0));
+		RCU_TRACE(trace_rcu_batch_start(rcp->name, 0, -1));
+		RCU_TRACE(trace_rcu_batch_end(rcp->name, 0));
 		return;
 	}
 
 	/* Move the ready-to-invoke callbacks to a local list. */
 	local_irq_save(flags);
-	RCU_TRACE(trace_rcu_batch_start(0, -1));
+	RCU_TRACE(trace_rcu_batch_start(rcp->name, 0, -1));
 	list = rcp->rcucblist;
 	rcp->rcucblist = *rcp->donetail;
 	*rcp->donetail = NULL;
@@ -197,7 +197,7 @@ static void rcu_process_callbacks(struct rcu_ctrlblk *rcp)
 		RCU_TRACE(cb_count++);
 	}
 	RCU_TRACE(rcu_trace_sub_qlen(rcp, cb_count));
-	RCU_TRACE(trace_rcu_batch_end(cb_count));
+	RCU_TRACE(trace_rcu_batch_end(rcp->name, cb_count));
 }
 
 /*
diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index b953e2c72e25..eb6e731088a0 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -1199,8 +1199,8 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp)
 
 	/* If no callbacks are ready, just return.*/
 	if (!cpu_has_callbacks_ready_to_invoke(rdp)) {
-		trace_rcu_batch_start(0, 0);
-		trace_rcu_batch_end(0);
+		trace_rcu_batch_start(rsp->name, 0, 0);
+		trace_rcu_batch_end(rsp->name, 0);
 		return;
 	}
 
@@ -1210,7 +1210,7 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp)
 	 */
 	local_irq_save(flags);
 	bl = rdp->blimit;
-	trace_rcu_batch_start(rdp->qlen, bl);
+	trace_rcu_batch_start(rsp->name, rdp->qlen, bl);
 	list = rdp->nxtlist;
 	rdp->nxtlist = *rdp->nxttail[RCU_DONE_TAIL];
 	*rdp->nxttail[RCU_DONE_TAIL] = NULL;
@@ -1233,7 +1233,7 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp)
 	}
 
 	local_irq_save(flags);
-	trace_rcu_batch_end(count);
+	trace_rcu_batch_end(rsp->name, count);
 
 	/* Update count, and requeue any remaining callbacks. */
 	rdp->qlen -= count;
-- 
cgit v1.2.2


From e0f23060adfa3f27beaa7918eff70258b88471b6 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paul.mckenney@linaro.org>
Date: Tue, 21 Jun 2011 01:29:39 -0700
Subject: rcu: Update comments to reflect softirqs vs. kthreads

We now have kthreads only for flavors of RCU that support boosting,
so update the now-misleading comments accordingly.

Signed-off-by: Paul E. McKenney <paul.mckenney@linaro.org>
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 kernel/rcutree.c        | 23 ++++++++++++-----------
 kernel/rcutree_plugin.h |  3 ++-
 2 files changed, 14 insertions(+), 12 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index eb6e731088a0..4e24399cabcf 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -198,7 +198,7 @@ DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = {
 };
 #endif /* #ifdef CONFIG_NO_HZ */
 
-static int blimit = 10;		/* Maximum callbacks per softirq. */
+static int blimit = 10;		/* Maximum callbacks per rcu_do_batch. */
 static int qhimark = 10000;	/* If this many pending, ignore blimit. */
 static int qlowmark = 100;	/* Once only this many pending, use blimit. */
 
@@ -1261,7 +1261,7 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp)
 
 	local_irq_restore(flags);
 
-	/* Re-raise the RCU softirq if there are callbacks remaining. */
+	/* Re-invoke RCU core processing if there are callbacks remaining. */
 	if (cpu_has_callbacks_ready_to_invoke(rdp))
 		invoke_rcu_core();
 }
@@ -1269,7 +1269,7 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp)
 /*
  * Check to see if this CPU is in a non-context-switch quiescent state
  * (user mode or idle loop for rcu, non-softirq execution for rcu_bh).
- * Also schedule the RCU softirq handler.
+ * Also schedule RCU core processing.
  *
  * This function must be called with hardirqs disabled.  It is normally
  * invoked from the scheduling-clock interrupt.  If rcu_pending returns
@@ -1448,9 +1448,9 @@ static void force_quiescent_state(struct rcu_state *rsp, int relaxed)
 #endif /* #else #ifdef CONFIG_SMP */
 
 /*
- * This does the RCU processing work from softirq context for the
- * specified rcu_state and rcu_data structures.  This may be called
- * only from the CPU to whom the rdp belongs.
+ * This does the RCU core processing work for the specified rcu_state
+ * and rcu_data structures.  This may be called only from the CPU to
+ * whom the rdp belongs.
  */
 static void
 __rcu_process_callbacks(struct rcu_state *rsp, struct rcu_data *rdp)
@@ -1487,7 +1487,7 @@ __rcu_process_callbacks(struct rcu_state *rsp, struct rcu_data *rdp)
 }
 
 /*
- * Do softirq processing for the current CPU.
+ * Do RCU core processing for the current CPU.
  */
 static void rcu_process_callbacks(struct softirq_action *unused)
 {
@@ -1503,10 +1503,11 @@ static void rcu_process_callbacks(struct softirq_action *unused)
 }
 
 /*
- * Wake up the current CPU's kthread.  This replaces raise_softirq()
- * in earlier versions of RCU.  Note that because we are running on
- * the current CPU with interrupts disabled, the rcu_cpu_kthread_task
- * cannot disappear out from under us.
+ * Schedule RCU callback invocation.  If the specified type of RCU
+ * does not support RCU priority boosting, just do a direct call,
+ * otherwise wake up the per-CPU kernel kthread.  Note that because we
+ * are running on the current CPU with interrupts disabled, the
+ * rcu_cpu_kthread_task cannot disappear out from under us.
  */
 static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp)
 {
diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h
index a90bf3c17492..ecd48a2e3eeb 100644
--- a/kernel/rcutree_plugin.h
+++ b/kernel/rcutree_plugin.h
@@ -1478,7 +1478,8 @@ static int rcu_cpu_kthread_should_stop(int cpu)
 
 /*
  * Per-CPU kernel thread that invokes RCU callbacks.  This replaces the
- * earlier RCU softirq.
+ * RCU softirq used in flavors and configurations of RCU that do not
+ * support RCU priority boosting.
  */
 static int rcu_cpu_kthread(void *arg)
 {
-- 
cgit v1.2.2


From eab0993c7ba5c7d9b3613d6037e0f31f0ccbe181 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paul.mckenney@linaro.org>
Date: Tue, 21 Jun 2011 01:59:33 -0700
Subject: rcu: Move RCU_BOOST declarations to allow compiler checking

Andi Kleen noticed that one of the RCU_BOOST data declarations was
out of sync with the definition.  Move the declarations so that the
compiler can do the checking in the future.

Signed-off-by: Paul E. McKenney <paul.mckenney@linaro.org>
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 kernel/rcutree.h       | 7 +++++++
 kernel/rcutree_trace.c | 5 -----
 2 files changed, 7 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcutree.h b/kernel/rcutree.h
index 01b2ccda26fb..eee6c9406b46 100644
--- a/kernel/rcutree.h
+++ b/kernel/rcutree.h
@@ -417,6 +417,13 @@ extern struct rcu_state rcu_preempt_state;
 DECLARE_PER_CPU(struct rcu_data, rcu_preempt_data);
 #endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */
 
+#ifdef CONFIG_RCU_BOOST
+DECLARE_PER_CPU(unsigned int, rcu_cpu_kthread_status);
+DECLARE_PER_CPU(int, rcu_cpu_kthread_cpu);
+DECLARE_PER_CPU(unsigned int, rcu_cpu_kthread_loops);
+DECLARE_PER_CPU(char, rcu_cpu_has_work);
+#endif /* #ifdef CONFIG_RCU_BOOST */
+
 #ifndef RCU_TREE_NONCORE
 
 /* Forward declarations for rcutree_plugin.h */
diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c
index 0a5a8bedc522..f328ed1c6e46 100644
--- a/kernel/rcutree_trace.c
+++ b/kernel/rcutree_trace.c
@@ -48,11 +48,6 @@
 
 #ifdef CONFIG_RCU_BOOST
 
-DECLARE_PER_CPU(unsigned int, rcu_cpu_kthread_status);
-DECLARE_PER_CPU(int, rcu_cpu_kthread_cpu);
-DECLARE_PER_CPU(unsigned int, rcu_cpu_kthread_loops);
-DECLARE_PER_CPU(char, rcu_cpu_has_work);
-
 static char convert_kthread_status(unsigned int kthread_status)
 {
 	if (kthread_status > RCU_KTHREAD_MAX)
-- 
cgit v1.2.2


From 385680a9487d2f85382ad6d74e2a15837e47bfd9 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paul.mckenney@linaro.org>
Date: Tue, 21 Jun 2011 22:43:26 -0700
Subject: rcu: Add event-trace markers to TREE_RCU kthreads

Add event-trace markers to TREE_RCU kthreads to allow including these
kthread's CPU time in the utilization calculations.

Signed-off-by: Paul E. McKenney <paul.mckenney@linaro.org>
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 kernel/rcutree_plugin.h | 12 ++++++++++++
 1 file changed, 12 insertions(+)

(limited to 'kernel')

diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h
index ecd48a2e3eeb..94d9ca1e4061 100644
--- a/kernel/rcutree_plugin.h
+++ b/kernel/rcutree_plugin.h
@@ -1219,9 +1219,12 @@ static int rcu_boost_kthread(void *arg)
 	int spincnt = 0;
 	int more2boost;
 
+	trace_rcu_utilization("Start boost kthread@init");
 	for (;;) {
 		rnp->boost_kthread_status = RCU_KTHREAD_WAITING;
+		trace_rcu_utilization("End boost kthread@rcu_wait");
 		rcu_wait(rnp->boost_tasks || rnp->exp_tasks);
+		trace_rcu_utilization("Start boost kthread@rcu_wait");
 		rnp->boost_kthread_status = RCU_KTHREAD_RUNNING;
 		more2boost = rcu_boost(rnp);
 		if (more2boost)
@@ -1229,11 +1232,14 @@ static int rcu_boost_kthread(void *arg)
 		else
 			spincnt = 0;
 		if (spincnt > 10) {
+			trace_rcu_utilization("End boost kthread@rcu_yield");
 			rcu_yield(rcu_boost_kthread_timer, (unsigned long)rnp);
+			trace_rcu_utilization("Start boost kthread@rcu_yield");
 			spincnt = 0;
 		}
 	}
 	/* NOTREACHED */
+	trace_rcu_utilization("End boost kthread@notreached");
 	return 0;
 }
 
@@ -1490,9 +1496,12 @@ static int rcu_cpu_kthread(void *arg)
 	char work;
 	char *workp = &per_cpu(rcu_cpu_has_work, cpu);
 
+	trace_rcu_utilization("Start CPU kthread@init");
 	for (;;) {
 		*statusp = RCU_KTHREAD_WAITING;
+		trace_rcu_utilization("End CPU kthread@rcu_wait");
 		rcu_wait(*workp != 0 || kthread_should_stop());
+		trace_rcu_utilization("Start CPU kthread@rcu_wait");
 		local_bh_disable();
 		if (rcu_cpu_kthread_should_stop(cpu)) {
 			local_bh_enable();
@@ -1513,11 +1522,14 @@ static int rcu_cpu_kthread(void *arg)
 			spincnt = 0;
 		if (spincnt > 10) {
 			*statusp = RCU_KTHREAD_YIELDING;
+			trace_rcu_utilization("End CPU kthread@rcu_yield");
 			rcu_yield(rcu_cpu_kthread_timer, (unsigned long)cpu);
+			trace_rcu_utilization("Start CPU kthread@rcu_yield");
 			spincnt = 0;
 		}
 	}
 	*statusp = RCU_KTHREAD_STOPPED;
+	trace_rcu_utilization("End CPU kthread@term");
 	return 0;
 }
 
-- 
cgit v1.2.2


From 965a002b4f1a458c5dcb334ec29f48a0046faa25 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paul.mckenney@linaro.org>
Date: Sat, 18 Jun 2011 09:55:39 -0700
Subject: rcu: Make TINY_RCU also use softirq for RCU_BOOST=n

This patch #ifdefs TINY_RCU kthreads out of the kernel unless RCU_BOOST=y,
thus eliminating context-switch overhead if RCU priority boosting has
not been configured.

Signed-off-by: Paul E. McKenney <paul.mckenney@linaro.org>
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 kernel/rcutiny.c        |  74 +++++---------------------------
 kernel/rcutiny_plugin.h | 110 ++++++++++++++++++++++++++++++++++++------------
 2 files changed, 93 insertions(+), 91 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcutiny.c b/kernel/rcutiny.c
index 1c37bdd464f1..c9321d86999b 100644
--- a/kernel/rcutiny.c
+++ b/kernel/rcutiny.c
@@ -43,16 +43,11 @@
 
 #include "rcu.h"
 
-/* Controls for rcu_kthread() kthread, replacing RCU_SOFTIRQ used previously. */
-static struct task_struct *rcu_kthread_task;
-static DECLARE_WAIT_QUEUE_HEAD(rcu_kthread_wq);
-static unsigned long have_rcu_kthread_work;
-
 /* Forward declarations for rcutiny_plugin.h. */
 struct rcu_ctrlblk;
-static void invoke_rcu_kthread(void);
-static void rcu_process_callbacks(struct rcu_ctrlblk *rcp);
-static int rcu_kthread(void *arg);
+static void invoke_rcu_callbacks(void);
+static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp);
+static void rcu_process_callbacks(struct softirq_action *unused);
 static void __call_rcu(struct rcu_head *head,
 		       void (*func)(struct rcu_head *rcu),
 		       struct rcu_ctrlblk *rcp);
@@ -101,16 +96,6 @@ static int rcu_qsctr_help(struct rcu_ctrlblk *rcp)
 	return 0;
 }
 
-/*
- * Wake up rcu_kthread() to process callbacks now eligible for invocation
- * or to boost readers.
- */
-static void invoke_rcu_kthread(void)
-{
-	have_rcu_kthread_work = 1;
-	wake_up(&rcu_kthread_wq);
-}
-
 /*
  * Record an rcu quiescent state.  And an rcu_bh quiescent state while we
  * are at it, given that any rcu quiescent state is also an rcu_bh
@@ -123,7 +108,7 @@ void rcu_sched_qs(int cpu)
 	local_irq_save(flags);
 	if (rcu_qsctr_help(&rcu_sched_ctrlblk) +
 	    rcu_qsctr_help(&rcu_bh_ctrlblk))
-		invoke_rcu_kthread();
+		invoke_rcu_callbacks();
 	local_irq_restore(flags);
 }
 
@@ -136,7 +121,7 @@ void rcu_bh_qs(int cpu)
 
 	local_irq_save(flags);
 	if (rcu_qsctr_help(&rcu_bh_ctrlblk))
-		invoke_rcu_kthread();
+		invoke_rcu_callbacks();
 	local_irq_restore(flags);
 }
 
@@ -160,7 +145,7 @@ void rcu_check_callbacks(int cpu, int user)
  * Invoke the RCU callbacks on the specified rcu_ctrlkblk structure
  * whose grace period has elapsed.
  */
-static void rcu_process_callbacks(struct rcu_ctrlblk *rcp)
+static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp)
 {
 	struct rcu_head *next, *list;
 	unsigned long flags;
@@ -200,36 +185,11 @@ static void rcu_process_callbacks(struct rcu_ctrlblk *rcp)
 	RCU_TRACE(trace_rcu_batch_end(rcp->name, cb_count));
 }
 
-/*
- * This kthread invokes RCU callbacks whose grace periods have
- * elapsed.  It is awakened as needed, and takes the place of the
- * RCU_SOFTIRQ that was used previously for this purpose.
- * This is a kthread, but it is never stopped, at least not until
- * the system goes down.
- */
-static int rcu_kthread(void *arg)
+static void rcu_process_callbacks(struct softirq_action *unused)
 {
-	unsigned long work;
-	unsigned long morework;
-	unsigned long flags;
-
-	for (;;) {
-		wait_event_interruptible(rcu_kthread_wq,
-					 have_rcu_kthread_work != 0);
-		morework = rcu_boost();
-		local_irq_save(flags);
-		work = have_rcu_kthread_work;
-		have_rcu_kthread_work = morework;
-		local_irq_restore(flags);
-		if (work) {
-			rcu_process_callbacks(&rcu_sched_ctrlblk);
-			rcu_process_callbacks(&rcu_bh_ctrlblk);
-			rcu_preempt_process_callbacks();
-		}
-		schedule_timeout_interruptible(1); /* Leave CPU for others. */
-	}
-
-	return 0;  /* Not reached, but needed to shut gcc up. */
+	__rcu_process_callbacks(&rcu_sched_ctrlblk);
+	__rcu_process_callbacks(&rcu_bh_ctrlblk);
+	rcu_preempt_process_callbacks();
 }
 
 /*
@@ -291,17 +251,3 @@ void call_rcu_bh(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
 	__call_rcu(head, func, &rcu_bh_ctrlblk);
 }
 EXPORT_SYMBOL_GPL(call_rcu_bh);
-
-/*
- * Spawn the kthread that invokes RCU callbacks.
- */
-static int __init rcu_spawn_kthreads(void)
-{
-	struct sched_param sp;
-
-	rcu_kthread_task = kthread_run(rcu_kthread, NULL, "rcu_kthread");
-	sp.sched_priority = RCU_BOOST_PRIO;
-	sched_setscheduler_nocheck(rcu_kthread_task, SCHED_FIFO, &sp);
-	return 0;
-}
-early_initcall(rcu_spawn_kthreads);
diff --git a/kernel/rcutiny_plugin.h b/kernel/rcutiny_plugin.h
index 791ddf7c99ab..02aa7139861c 100644
--- a/kernel/rcutiny_plugin.h
+++ b/kernel/rcutiny_plugin.h
@@ -245,6 +245,13 @@ static void show_tiny_preempt_stats(struct seq_file *m)
 
 #include "rtmutex_common.h"
 
+#define RCU_BOOST_PRIO CONFIG_RCU_BOOST_PRIO
+
+/* Controls for rcu_kthread() kthread. */
+static struct task_struct *rcu_kthread_task;
+static DECLARE_WAIT_QUEUE_HEAD(rcu_kthread_wq);
+static unsigned long have_rcu_kthread_work;
+
 /*
  * Carry out RCU priority boosting on the task indicated by ->boost_tasks,
  * and advance ->boost_tasks to the next task in the ->blkd_tasks list.
@@ -332,7 +339,7 @@ static int rcu_initiate_boost(void)
 		if (rcu_preempt_ctrlblk.exp_tasks == NULL)
 			rcu_preempt_ctrlblk.boost_tasks =
 				rcu_preempt_ctrlblk.gp_tasks;
-		invoke_rcu_kthread();
+		invoke_rcu_callbacks();
 	} else
 		RCU_TRACE(rcu_initiate_boost_trace());
 	return 1;
@@ -350,14 +357,6 @@ static void rcu_preempt_boost_start_gp(void)
 
 #else /* #ifdef CONFIG_RCU_BOOST */
 
-/*
- * If there is no RCU priority boosting, we don't boost.
- */
-static int rcu_boost(void)
-{
-	return 0;
-}
-
 /*
  * If there is no RCU priority boosting, we don't initiate boosting,
  * but we do indicate whether there are blocked readers blocking the
@@ -425,7 +424,7 @@ static void rcu_preempt_cpu_qs(void)
 
 	/* If there are done callbacks, cause them to be invoked. */
 	if (*rcu_preempt_ctrlblk.rcb.donetail != NULL)
-		invoke_rcu_kthread();
+		invoke_rcu_callbacks();
 }
 
 /*
@@ -646,7 +645,7 @@ static void rcu_preempt_check_callbacks(void)
 		rcu_preempt_cpu_qs();
 	if (&rcu_preempt_ctrlblk.rcb.rcucblist !=
 	    rcu_preempt_ctrlblk.rcb.donetail)
-		invoke_rcu_kthread();
+		invoke_rcu_callbacks();
 	if (rcu_preempt_gp_in_progress() &&
 	    rcu_cpu_blocking_cur_gp() &&
 	    rcu_preempt_running_reader())
@@ -672,7 +671,7 @@ static void rcu_preempt_remove_callbacks(struct rcu_ctrlblk *rcp)
  */
 static void rcu_preempt_process_callbacks(void)
 {
-	rcu_process_callbacks(&rcu_preempt_ctrlblk.rcb);
+	__rcu_process_callbacks(&rcu_preempt_ctrlblk.rcb);
 }
 
 /*
@@ -847,15 +846,6 @@ static void show_tiny_preempt_stats(struct seq_file *m)
 
 #endif /* #ifdef CONFIG_RCU_TRACE */
 
-/*
- * Because preemptible RCU does not exist, it is never necessary to
- * boost preempted RCU readers.
- */
-static int rcu_boost(void)
-{
-	return 0;
-}
-
 /*
  * Because preemptible RCU does not exist, it never has any callbacks
  * to check.
@@ -882,6 +872,78 @@ static void rcu_preempt_process_callbacks(void)
 
 #endif /* #else #ifdef CONFIG_TINY_PREEMPT_RCU */
 
+#ifdef CONFIG_RCU_BOOST
+
+/*
+ * Wake up rcu_kthread() to process callbacks now eligible for invocation
+ * or to boost readers.
+ */
+static void invoke_rcu_callbacks(void)
+{
+	have_rcu_kthread_work = 1;
+	wake_up(&rcu_kthread_wq);
+}
+
+/*
+ * This kthread invokes RCU callbacks whose grace periods have
+ * elapsed.  It is awakened as needed, and takes the place of the
+ * RCU_SOFTIRQ that is used for this purpose when boosting is disabled.
+ * This is a kthread, but it is never stopped, at least not until
+ * the system goes down.
+ */
+static int rcu_kthread(void *arg)
+{
+	unsigned long work;
+	unsigned long morework;
+	unsigned long flags;
+
+	for (;;) {
+		wait_event_interruptible(rcu_kthread_wq,
+					 have_rcu_kthread_work != 0);
+		morework = rcu_boost();
+		local_irq_save(flags);
+		work = have_rcu_kthread_work;
+		have_rcu_kthread_work = morework;
+		local_irq_restore(flags);
+		if (work)
+			rcu_process_callbacks(NULL);
+		schedule_timeout_interruptible(1); /* Leave CPU for others. */
+	}
+
+	return 0;  /* Not reached, but needed to shut gcc up. */
+}
+
+/*
+ * Spawn the kthread that invokes RCU callbacks.
+ */
+static int __init rcu_spawn_kthreads(void)
+{
+	struct sched_param sp;
+
+	rcu_kthread_task = kthread_run(rcu_kthread, NULL, "rcu_kthread");
+	sp.sched_priority = RCU_BOOST_PRIO;
+	sched_setscheduler_nocheck(rcu_kthread_task, SCHED_FIFO, &sp);
+	return 0;
+}
+early_initcall(rcu_spawn_kthreads);
+
+#else /* #ifdef CONFIG_RCU_BOOST */
+
+/*
+ * Start up softirq processing of callbacks.
+ */
+void invoke_rcu_callbacks(void)
+{
+	raise_softirq(RCU_SOFTIRQ);
+}
+
+void rcu_init(void)
+{
+	open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);
+}
+
+#endif /* #else #ifdef CONFIG_RCU_BOOST */
+
 #ifdef CONFIG_DEBUG_LOCK_ALLOC
 #include <linux/kernel_stat.h>
 
@@ -897,12 +959,6 @@ void __init rcu_scheduler_starting(void)
 
 #endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
 
-#ifdef CONFIG_RCU_BOOST
-#define RCU_BOOST_PRIO CONFIG_RCU_BOOST_PRIO
-#else /* #ifdef CONFIG_RCU_BOOST */
-#define RCU_BOOST_PRIO 1
-#endif /* #else #ifdef CONFIG_RCU_BOOST */
-
 #ifdef CONFIG_RCU_TRACE
 
 #ifdef CONFIG_RCU_BOOST
-- 
cgit v1.2.2


From d4c08f2ac311a360230eef7e5395b0ec8d8f0670 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paul.mckenney@linaro.org>
Date: Sat, 25 Jun 2011 06:36:56 -0700
Subject: rcu: Add grace-period, quiescent-state, and call_rcu trace events

Add trace events to record grace-period start and end, quiescent states,
CPUs noticing grace-period start and end, grace-period initialization,
call_rcu() invocation, tasks blocking in RCU read-side critical sections,
tasks exiting those same critical sections, force_quiescent_state()
detection of dyntick-idle and offline CPUs, CPUs entering and leaving
dyntick-idle mode (except from NMIs), CPUs coming online and going
offline, and CPUs being kicked for staying in dyntick-idle mode for too
long (as in many weeks, even on 32-bit systems).

Signed-off-by: Paul E. McKenney <paul.mckenney@linaro.org>
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>

rcu: Add the rcu flavor to callback trace events

The earlier trace events for registering RCU callbacks and for invoking
them did not include the RCU flavor (rcu_bh, rcu_preempt, or rcu_sched).
This commit adds the RCU flavor to those trace events.

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 kernel/rcu.h            |  6 +++---
 kernel/rcutiny.c        |  4 +++-
 kernel/rcutree.c        | 45 ++++++++++++++++++++++++++++++++++++++++++---
 kernel/rcutree.h        |  1 +
 kernel/rcutree_plugin.h | 22 +++++++++++++++++++---
 5 files changed, 68 insertions(+), 10 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcu.h b/kernel/rcu.h
index d7f00ec8b47b..f600868d550d 100644
--- a/kernel/rcu.h
+++ b/kernel/rcu.h
@@ -69,15 +69,15 @@ static inline void debug_rcu_head_unqueue(struct rcu_head *head)
 
 extern void kfree(const void *);
 
-static inline void __rcu_reclaim(struct rcu_head *head)
+static inline void __rcu_reclaim(char *rn, struct rcu_head *head)
 {
 	unsigned long offset = (unsigned long)head->func;
 
 	if (__is_kfree_rcu_offset(offset)) {
-		RCU_TRACE(trace_rcu_invoke_kfree_callback(head, offset));
+		RCU_TRACE(trace_rcu_invoke_kfree_callback(rn, head, offset));
 		kfree((void *)head - offset);
 	} else {
-		RCU_TRACE(trace_rcu_invoke_callback(head));
+		RCU_TRACE(trace_rcu_invoke_callback(rn, head));
 		head->func(head);
 	}
 }
diff --git a/kernel/rcutiny.c b/kernel/rcutiny.c
index c9321d86999b..da775c87f27f 100644
--- a/kernel/rcutiny.c
+++ b/kernel/rcutiny.c
@@ -147,6 +147,7 @@ void rcu_check_callbacks(int cpu, int user)
  */
 static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp)
 {
+	char *rn = NULL;
 	struct rcu_head *next, *list;
 	unsigned long flags;
 	RCU_TRACE(int cb_count = 0);
@@ -171,12 +172,13 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp)
 	local_irq_restore(flags);
 
 	/* Invoke the callbacks on the local list. */
+	RCU_TRACE(rn = rcp->name);
 	while (list) {
 		next = list->next;
 		prefetch(next);
 		debug_rcu_head_unqueue(list);
 		local_bh_disable();
-		__rcu_reclaim(list);
+		__rcu_reclaim(rn, list);
 		local_bh_enable();
 		list = next;
 		RCU_TRACE(cb_count++);
diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index 4e24399cabcf..7e0282949f8a 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -166,6 +166,8 @@ void rcu_sched_qs(int cpu)
 
 	rdp->passed_quiesc_completed = rdp->gpnum - 1;
 	barrier();
+	if (rdp->passed_quiesc == 0)
+		trace_rcu_grace_period("rcu_sched", rdp->gpnum, "cpuqs");
 	rdp->passed_quiesc = 1;
 }
 
@@ -175,6 +177,8 @@ void rcu_bh_qs(int cpu)
 
 	rdp->passed_quiesc_completed = rdp->gpnum - 1;
 	barrier();
+	if (rdp->passed_quiesc == 0)
+		trace_rcu_grace_period("rcu_bh", rdp->gpnum, "cpuqs");
 	rdp->passed_quiesc = 1;
 }
 
@@ -319,6 +323,7 @@ static int rcu_implicit_offline_qs(struct rcu_data *rdp)
 	 * trust its state not to change because interrupts are disabled.
 	 */
 	if (cpu_is_offline(rdp->cpu)) {
+		trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, "ofl");
 		rdp->offline_fqs++;
 		return 1;
 	}
@@ -359,6 +364,7 @@ void rcu_enter_nohz(void)
 		local_irq_restore(flags);
 		return;
 	}
+	trace_rcu_dyntick("Start");
 	/* CPUs seeing atomic_inc() must see prior RCU read-side crit sects */
 	smp_mb__before_atomic_inc();  /* See above. */
 	atomic_inc(&rdtp->dynticks);
@@ -396,6 +402,7 @@ void rcu_exit_nohz(void)
 	/* CPUs seeing atomic_inc() must see later RCU read-side crit sects */
 	smp_mb__after_atomic_inc();  /* See above. */
 	WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks) & 0x1));
+	trace_rcu_dyntick("End");
 	local_irq_restore(flags);
 }
 
@@ -501,6 +508,7 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
 	 * of the current RCU grace period.
 	 */
 	if ((curr & 0x1) == 0 || ULONG_CMP_GE(curr, snap + 2)) {
+		trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, "dti");
 		rdp->dynticks_fqs++;
 		return 1;
 	}
@@ -683,6 +691,7 @@ static void __note_new_gpnum(struct rcu_state *rsp, struct rcu_node *rnp, struct
 		 * go looking for one.
 		 */
 		rdp->gpnum = rnp->gpnum;
+		trace_rcu_grace_period(rsp->name, rdp->gpnum, "cpustart");
 		if (rnp->qsmask & rdp->grpmask) {
 			rdp->qs_pending = 1;
 			rdp->passed_quiesc = 0;
@@ -746,6 +755,7 @@ __rcu_process_gp_end(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_dat
 
 		/* Remember that we saw this grace-period completion. */
 		rdp->completed = rnp->completed;
+		trace_rcu_grace_period(rsp->name, rdp->gpnum, "cpuend");
 
 		/*
 		 * If we were in an extended quiescent state, we may have
@@ -856,6 +866,7 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags)
 
 	/* Advance to a new grace period and initialize state. */
 	rsp->gpnum++;
+	trace_rcu_grace_period(rsp->name, rsp->gpnum, "start");
 	WARN_ON_ONCE(rsp->signaled == RCU_GP_INIT);
 	rsp->signaled = RCU_GP_INIT; /* Hold off force_quiescent_state. */
 	rsp->jiffies_force_qs = jiffies + RCU_JIFFIES_TILL_FORCE_QS;
@@ -870,6 +881,9 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags)
 		rsp->signaled = RCU_SIGNAL_INIT; /* force_quiescent_state OK. */
 		rcu_start_gp_per_cpu(rsp, rnp, rdp);
 		rcu_preempt_boost_start_gp(rnp);
+		trace_rcu_grace_period_init(rsp->name, rnp->gpnum,
+					    rnp->level, rnp->grplo,
+					    rnp->grphi, rnp->qsmask);
 		raw_spin_unlock_irqrestore(&rnp->lock, flags);
 		return;
 	}
@@ -906,6 +920,9 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags)
 		if (rnp == rdp->mynode)
 			rcu_start_gp_per_cpu(rsp, rnp, rdp);
 		rcu_preempt_boost_start_gp(rnp);
+		trace_rcu_grace_period_init(rsp->name, rnp->gpnum,
+					    rnp->level, rnp->grplo,
+					    rnp->grphi, rnp->qsmask);
 		raw_spin_unlock(&rnp->lock);	/* irqs remain disabled. */
 	}
 
@@ -939,6 +956,7 @@ static void rcu_report_qs_rsp(struct rcu_state *rsp, unsigned long flags)
 	if (gp_duration > rsp->gp_max)
 		rsp->gp_max = gp_duration;
 	rsp->completed = rsp->gpnum;
+	trace_rcu_grace_period(rsp->name, rsp->completed, "end");
 	rsp->signaled = RCU_GP_IDLE;
 	rcu_start_gp(rsp, flags);  /* releases root node's rnp->lock. */
 }
@@ -967,6 +985,10 @@ rcu_report_qs_rnp(unsigned long mask, struct rcu_state *rsp,
 			return;
 		}
 		rnp->qsmask &= ~mask;
+		trace_rcu_quiescent_state_report(rsp->name, rnp->gpnum,
+						 mask, rnp->qsmask, rnp->level,
+						 rnp->grplo, rnp->grphi,
+						 !!rnp->gp_tasks);
 		if (rnp->qsmask != 0 || rcu_preempt_blocked_readers_cgp(rnp)) {
 
 			/* Other bits still set at this level, so done. */
@@ -1135,11 +1157,20 @@ static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp)
 		if (rnp->qsmaskinit != 0) {
 			if (rnp != rdp->mynode)
 				raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
+			else
+				trace_rcu_grace_period(rsp->name,
+						       rnp->gpnum + 1 -
+						       !!(rnp->qsmask & mask),
+						       "cpuofl");
 			break;
 		}
-		if (rnp == rdp->mynode)
+		if (rnp == rdp->mynode) {
+			trace_rcu_grace_period(rsp->name,
+					       rnp->gpnum + 1 -
+					       !!(rnp->qsmask & mask),
+					       "cpuofl");
 			need_report = rcu_preempt_offline_tasks(rsp, rnp, rdp);
-		else
+		} else
 			raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
 		mask = rnp->grpmask;
 		rnp = rnp->parent;
@@ -1226,7 +1257,7 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp)
 		next = list->next;
 		prefetch(next);
 		debug_rcu_head_unqueue(list);
-		__rcu_reclaim(list);
+		__rcu_reclaim(rsp->name, list);
 		list = next;
 		if (++count >= bl)
 			break;
@@ -1552,6 +1583,12 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu),
 	rdp->nxttail[RCU_NEXT_TAIL] = &head->next;
 	rdp->qlen++;
 
+	if (__is_kfree_rcu_offset((unsigned long)func))
+		trace_rcu_kfree_callback(rsp->name, head, (unsigned long)func,
+					 rdp->qlen);
+	else
+		trace_rcu_callback(rsp->name, head, rdp->qlen);
+
 	/* If interrupts were disabled, don't dive into RCU core. */
 	if (irqs_disabled_flags(flags)) {
 		local_irq_restore(flags);
@@ -1850,6 +1887,7 @@ rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp)
 	rdp->dynticks = &per_cpu(rcu_dynticks, cpu);
 #endif /* #ifdef CONFIG_NO_HZ */
 	rdp->cpu = cpu;
+	rdp->rsp = rsp;
 	raw_spin_unlock_irqrestore(&rnp->lock, flags);
 }
 
@@ -1898,6 +1936,7 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptible)
 			rdp->gpnum = rnp->completed; /* if GP in progress... */
 			rdp->completed = rnp->completed;
 			rdp->passed_quiesc_completed = rnp->completed - 1;
+			trace_rcu_grace_period(rsp->name, rdp->gpnum, "cpuonl");
 		}
 		raw_spin_unlock(&rnp->lock); /* irqs already disabled. */
 		rnp = rnp->parent;
diff --git a/kernel/rcutree.h b/kernel/rcutree.h
index eee6c9406b46..d11a0065321c 100644
--- a/kernel/rcutree.h
+++ b/kernel/rcutree.h
@@ -299,6 +299,7 @@ struct rcu_data {
 	unsigned long n_rp_need_nothing;
 
 	int cpu;
+	struct rcu_state *rsp;
 };
 
 /* Values for signaled field in struct rcu_state. */
diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h
index 94d9ca1e4061..bdb2e82f78d3 100644
--- a/kernel/rcutree_plugin.h
+++ b/kernel/rcutree_plugin.h
@@ -124,6 +124,8 @@ static void rcu_preempt_qs(int cpu)
 
 	rdp->passed_quiesc_completed = rdp->gpnum - 1;
 	barrier();
+	if (rdp->passed_quiesc == 0)
+		trace_rcu_grace_period("rcu_preempt", rdp->gpnum, "cpuqs");
 	rdp->passed_quiesc = 1;
 	current->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_NEED_QS;
 }
@@ -190,6 +192,11 @@ static void rcu_preempt_note_context_switch(int cpu)
 			if (rnp->qsmask & rdp->grpmask)
 				rnp->gp_tasks = &t->rcu_node_entry;
 		}
+		trace_rcu_preempt_task(rdp->rsp->name,
+				       t->pid,
+				       (rnp->qsmask & rdp->grpmask)
+				       ? rnp->gpnum
+				       : rnp->gpnum + 1);
 		raw_spin_unlock_irqrestore(&rnp->lock, flags);
 	} else if (t->rcu_read_lock_nesting < 0 &&
 		   t->rcu_read_unlock_special) {
@@ -344,6 +351,8 @@ static noinline void rcu_read_unlock_special(struct task_struct *t)
 		smp_mb(); /* ensure expedited fastpath sees end of RCU c-s. */
 		np = rcu_next_node_entry(t, rnp);
 		list_del_init(&t->rcu_node_entry);
+		trace_rcu_unlock_preempted_task("rcu_preempt",
+						rnp->gpnum, t->pid);
 		if (&t->rcu_node_entry == rnp->gp_tasks)
 			rnp->gp_tasks = np;
 		if (&t->rcu_node_entry == rnp->exp_tasks)
@@ -364,10 +373,17 @@ static noinline void rcu_read_unlock_special(struct task_struct *t)
 		 * we aren't waiting on any CPUs, report the quiescent state.
 		 * Note that rcu_report_unblock_qs_rnp() releases rnp->lock.
 		 */
-		if (empty)
-			raw_spin_unlock_irqrestore(&rnp->lock, flags);
-		else
+		if (!empty && !rcu_preempt_blocked_readers_cgp(rnp)) {
+			trace_rcu_quiescent_state_report("preempt_rcu",
+							 rnp->gpnum,
+							 0, rnp->qsmask,
+							 rnp->level,
+							 rnp->grplo,
+							 rnp->grphi,
+							 !!rnp->gp_tasks);
 			rcu_report_unblock_qs_rnp(rnp, flags);
+		} else
+			raw_spin_unlock_irqrestore(&rnp->lock, flags);
 
 #ifdef CONFIG_RCU_BOOST
 		/* Unboost if we were boosted. */
-- 
cgit v1.2.2


From e4cc1f22b2f4e9b0207a8cdb63e56dcf99e82d35 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paul.mckenney@linaro.org>
Date: Mon, 27 Jun 2011 00:17:43 -0700
Subject: rcu: Simplify quiescent-state accounting

There is often a delay between the time that a CPU passes through a
quiescent state and the time that this quiescent state is reported to the
RCU core.  It is quite possible that the grace period ended before the
quiescent state could be reported, for example, some other CPU might have
deduced that this CPU passed through dyntick-idle mode.  It is critically
important that quiescent state be counted only against the grace period
that was in effect at the time that the quiescent state was detected.

Previously, this was handled by recording the number of the last grace
period to complete when passing through a quiescent state.  The RCU
core then checks this number against the current value, and rejects
the quiescent state if there is a mismatch.  However, one additional
possibility must be accounted for, namely that the quiescent state was
recorded after the prior grace period completed but before the current
grace period started.  In this case, the RCU core must reject the
quiescent state, but the recorded number will match.  This is handled
when the CPU becomes aware of a new grace period -- at that point,
it invalidates any prior quiescent state.

This works, but is a bit indirect.  The new approach records the current
grace period, and the RCU core checks to see (1) that this is still the
current grace period and (2) that this grace period has not yet ended.
This approach simplifies reasoning about correctness, and this commit
changes over to this new approach.

Signed-off-by: Paul E. McKenney <paul.mckenney@linaro.org>
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 kernel/rcutree.c        | 44 ++++++++++++++++++++++----------------------
 kernel/rcutree.h        |  6 +++---
 kernel/rcutree_plugin.h |  6 +++---
 kernel/rcutree_trace.c  |  8 ++++----
 4 files changed, 32 insertions(+), 32 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index 7e0282949f8a..7e2f297aeec8 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -159,32 +159,34 @@ static int rcu_gp_in_progress(struct rcu_state *rsp)
  * Note a quiescent state.  Because we do not need to know
  * how many quiescent states passed, just if there was at least
  * one since the start of the grace period, this just sets a flag.
+ * The caller must have disabled preemption.
  */
 void rcu_sched_qs(int cpu)
 {
 	struct rcu_data *rdp = &per_cpu(rcu_sched_data, cpu);
 
-	rdp->passed_quiesc_completed = rdp->gpnum - 1;
+	rdp->passed_quiesce_gpnum = rdp->gpnum;
 	barrier();
-	if (rdp->passed_quiesc == 0)
+	if (rdp->passed_quiesce == 0)
 		trace_rcu_grace_period("rcu_sched", rdp->gpnum, "cpuqs");
-	rdp->passed_quiesc = 1;
+	rdp->passed_quiesce = 1;
 }
 
 void rcu_bh_qs(int cpu)
 {
 	struct rcu_data *rdp = &per_cpu(rcu_bh_data, cpu);
 
-	rdp->passed_quiesc_completed = rdp->gpnum - 1;
+	rdp->passed_quiesce_gpnum = rdp->gpnum;
 	barrier();
-	if (rdp->passed_quiesc == 0)
+	if (rdp->passed_quiesce == 0)
 		trace_rcu_grace_period("rcu_bh", rdp->gpnum, "cpuqs");
-	rdp->passed_quiesc = 1;
+	rdp->passed_quiesce = 1;
 }
 
 /*
  * Note a context switch.  This is a quiescent state for RCU-sched,
  * and requires special handling for preemptible RCU.
+ * The caller must have disabled preemption.
  */
 void rcu_note_context_switch(int cpu)
 {
@@ -694,7 +696,7 @@ static void __note_new_gpnum(struct rcu_state *rsp, struct rcu_node *rnp, struct
 		trace_rcu_grace_period(rsp->name, rdp->gpnum, "cpustart");
 		if (rnp->qsmask & rdp->grpmask) {
 			rdp->qs_pending = 1;
-			rdp->passed_quiesc = 0;
+			rdp->passed_quiesce = 0;
 		} else
 			rdp->qs_pending = 0;
 	}
@@ -1027,7 +1029,7 @@ rcu_report_qs_rnp(unsigned long mask, struct rcu_state *rsp,
  * based on quiescent states detected in an earlier grace period!
  */
 static void
-rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp, long lastcomp)
+rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp, long lastgp)
 {
 	unsigned long flags;
 	unsigned long mask;
@@ -1035,17 +1037,15 @@ rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp, long las
 
 	rnp = rdp->mynode;
 	raw_spin_lock_irqsave(&rnp->lock, flags);
-	if (lastcomp != rnp->completed) {
+	if (lastgp != rnp->gpnum || rnp->completed == rnp->gpnum) {
 
 		/*
-		 * Someone beat us to it for this grace period, so leave.
-		 * The race with GP start is resolved by the fact that we
-		 * hold the leaf rcu_node lock, so that the per-CPU bits
-		 * cannot yet be initialized -- so we would simply find our
-		 * CPU's bit already cleared in rcu_report_qs_rnp() if this
-		 * race occurred.
+		 * The grace period in which this quiescent state was
+		 * recorded has ended, so don't report it upwards.
+		 * We will instead need a new quiescent state that lies
+		 * within the current grace period.
 		 */
-		rdp->passed_quiesc = 0;	/* try again later! */
+		rdp->passed_quiesce = 0;	/* need qs for new gp. */
 		raw_spin_unlock_irqrestore(&rnp->lock, flags);
 		return;
 	}
@@ -1089,14 +1089,14 @@ rcu_check_quiescent_state(struct rcu_state *rsp, struct rcu_data *rdp)
 	 * Was there a quiescent state since the beginning of the grace
 	 * period? If no, then exit and wait for the next call.
 	 */
-	if (!rdp->passed_quiesc)
+	if (!rdp->passed_quiesce)
 		return;
 
 	/*
 	 * Tell RCU we are done (but rcu_report_qs_rdp() will be the
 	 * judge of that).
 	 */
-	rcu_report_qs_rdp(rdp->cpu, rsp, rdp, rdp->passed_quiesc_completed);
+	rcu_report_qs_rdp(rdp->cpu, rsp, rdp, rdp->passed_quiesce_gpnum);
 }
 
 #ifdef CONFIG_HOTPLUG_CPU
@@ -1712,7 +1712,7 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp)
 	check_cpu_stall(rsp, rdp);
 
 	/* Is the RCU core waiting for a quiescent state from this CPU? */
-	if (rdp->qs_pending && !rdp->passed_quiesc) {
+	if (rdp->qs_pending && !rdp->passed_quiesce) {
 
 		/*
 		 * If force_quiescent_state() coming soon and this CPU
@@ -1724,7 +1724,7 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp)
 		    ULONG_CMP_LT(ACCESS_ONCE(rsp->jiffies_force_qs) - 1,
 				 jiffies))
 			set_need_resched();
-	} else if (rdp->qs_pending && rdp->passed_quiesc) {
+	} else if (rdp->qs_pending && rdp->passed_quiesce) {
 		rdp->n_rp_report_qs++;
 		return 1;
 	}
@@ -1907,7 +1907,7 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptible)
 
 	/* Set up local state, ensuring consistent view of global state. */
 	raw_spin_lock_irqsave(&rnp->lock, flags);
-	rdp->passed_quiesc = 0;  /* We could be racing with new GP, */
+	rdp->passed_quiesce = 0;  /* We could be racing with new GP, */
 	rdp->qs_pending = 1;	 /*  so set up to respond to current GP. */
 	rdp->beenonline = 1;	 /* We have now been online. */
 	rdp->preemptible = preemptible;
@@ -1935,7 +1935,7 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptible)
 		if (rnp == rdp->mynode) {
 			rdp->gpnum = rnp->completed; /* if GP in progress... */
 			rdp->completed = rnp->completed;
-			rdp->passed_quiesc_completed = rnp->completed - 1;
+			rdp->passed_quiesce_gpnum = rnp->gpnum - 1;
 			trace_rcu_grace_period(rsp->name, rdp->gpnum, "cpuonl");
 		}
 		raw_spin_unlock(&rnp->lock); /* irqs already disabled. */
diff --git a/kernel/rcutree.h b/kernel/rcutree.h
index d11a0065321c..51638b68b2dc 100644
--- a/kernel/rcutree.h
+++ b/kernel/rcutree.h
@@ -230,9 +230,9 @@ struct rcu_data {
 					/*  in order to detect GP end. */
 	unsigned long	gpnum;		/* Highest gp number that this CPU */
 					/*  is aware of having started. */
-	unsigned long	passed_quiesc_completed;
-					/* Value of completed at time of qs. */
-	bool		passed_quiesc;	/* User-mode/idle loop etc. */
+	unsigned long	passed_quiesce_gpnum;
+					/* gpnum at time of quiescent state. */
+	bool		passed_quiesce;	/* User-mode/idle loop etc. */
 	bool		qs_pending;	/* Core waits for quiesc state. */
 	bool		beenonline;	/* CPU online at least once. */
 	bool		preemptible;	/* Preemptible RCU? */
diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h
index bdb2e82f78d3..4bac5a29fb69 100644
--- a/kernel/rcutree_plugin.h
+++ b/kernel/rcutree_plugin.h
@@ -122,11 +122,11 @@ static void rcu_preempt_qs(int cpu)
 {
 	struct rcu_data *rdp = &per_cpu(rcu_preempt_data, cpu);
 
-	rdp->passed_quiesc_completed = rdp->gpnum - 1;
+	rdp->passed_quiesce_gpnum = rdp->gpnum;
 	barrier();
-	if (rdp->passed_quiesc == 0)
+	if (rdp->passed_quiesce == 0)
 		trace_rcu_grace_period("rcu_preempt", rdp->gpnum, "cpuqs");
-	rdp->passed_quiesc = 1;
+	rdp->passed_quiesce = 1;
 	current->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_NEED_QS;
 }
 
diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c
index f328ed1c6e46..9feffa4c0695 100644
--- a/kernel/rcutree_trace.c
+++ b/kernel/rcutree_trace.c
@@ -61,11 +61,11 @@ static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp)
 {
 	if (!rdp->beenonline)
 		return;
-	seq_printf(m, "%3d%cc=%lu g=%lu pq=%d pqc=%lu qp=%d",
+	seq_printf(m, "%3d%cc=%lu g=%lu pq=%d pgp=%lu qp=%d",
 		   rdp->cpu,
 		   cpu_is_offline(rdp->cpu) ? '!' : ' ',
 		   rdp->completed, rdp->gpnum,
-		   rdp->passed_quiesc, rdp->passed_quiesc_completed,
+		   rdp->passed_quiesce, rdp->passed_quiesce_gpnum,
 		   rdp->qs_pending);
 #ifdef CONFIG_NO_HZ
 	seq_printf(m, " dt=%d/%d/%d df=%lu",
@@ -139,7 +139,7 @@ static void print_one_rcu_data_csv(struct seq_file *m, struct rcu_data *rdp)
 		   rdp->cpu,
 		   cpu_is_offline(rdp->cpu) ? "\"N\"" : "\"Y\"",
 		   rdp->completed, rdp->gpnum,
-		   rdp->passed_quiesc, rdp->passed_quiesc_completed,
+		   rdp->passed_quiesce, rdp->passed_quiesce_gpnum,
 		   rdp->qs_pending);
 #ifdef CONFIG_NO_HZ
 	seq_printf(m, ",%d,%d,%d,%lu",
@@ -170,7 +170,7 @@ static void print_one_rcu_data_csv(struct seq_file *m, struct rcu_data *rdp)
 
 static int show_rcudata_csv(struct seq_file *m, void *unused)
 {
-	seq_puts(m, "\"CPU\",\"Online?\",\"c\",\"g\",\"pq\",\"pqc\",\"pq\",");
+	seq_puts(m, "\"CPU\",\"Online?\",\"c\",\"g\",\"pq\",\"pgp\",\"pq\",");
 #ifdef CONFIG_NO_HZ
 	seq_puts(m, "\"dt\",\"dt nesting\",\"dt NMI nesting\",\"df\",");
 #endif /* #ifdef CONFIG_NO_HZ */
-- 
cgit v1.2.2


From 3721bc1d3ed9940b17791805b7ee3a4743295d12 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Thu, 21 Jul 2011 16:00:17 -0700
Subject: rcu: Allow rcutorture's stat_interval parameter to be changed at
 runtime

When rcutorture is compiled directly into the kernel
(instead of separately as a module), it is necessary to specify
rcutorture.stat_interval as a kernel command-line parameter, otherwise,
the rcu_torture_stats kthread is never started.  However, when working
with the system after it has booted, it is convenient to be able to
change the time between statistic printing, particularly when logged
into the console.

This commit therefore allows the stat_interval parameter to be changed
at runtime.

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 kernel/rcutorture.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c
index 6a30ea3f1d82..ade7771b208a 100644
--- a/kernel/rcutorture.c
+++ b/kernel/rcutorture.c
@@ -73,7 +73,7 @@ module_param(nreaders, int, 0444);
 MODULE_PARM_DESC(nreaders, "Number of RCU reader threads");
 module_param(nfakewriters, int, 0444);
 MODULE_PARM_DESC(nfakewriters, "Number of RCU fake writer threads");
-module_param(stat_interval, int, 0444);
+module_param(stat_interval, int, 0644);
 MODULE_PARM_DESC(stat_interval, "Number of seconds between stats printk()s");
 module_param(verbose, bool, 0444);
 MODULE_PARM_DESC(verbose, "Enable verbose debugging printk()s");
-- 
cgit v1.2.2


From fc0763f53e3ff6a6bfa66934662a3446b9ca6f16 Mon Sep 17 00:00:00 2001
From: "Shi, Alex" <alex.shi@intel.com>
Date: Thu, 28 Jul 2011 14:56:12 +0800
Subject: nohz: Remove nohz_cpu_mask

RCU no longer uses this global variable, nor does anyone else.  This
commit therefore removes this variable.  This reduces memory footprint
and also removes some atomic instructions and memory barriers from
the dyntick-idle path.

Signed-off-by: Alex Shi <alex.shi@intel.com>
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 kernel/sched.c           | 11 -----------
 kernel/time/tick-sched.c |  6 ------
 2 files changed, 17 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index e24cebe0e6cb..3e5525630459 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -5979,15 +5979,6 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu)
 	ftrace_graph_init_idle_task(idle, cpu);
 }
 
-/*
- * In a system that switches off the HZ timer nohz_cpu_mask
- * indicates which cpus entered this state. This is used
- * in the rcu update to wait only for active cpus. For system
- * which do not switch off the HZ timer nohz_cpu_mask should
- * always be CPU_BITS_NONE.
- */
-cpumask_var_t nohz_cpu_mask;
-
 /*
  * Increase the granularity value when there are more CPUs,
  * because with more CPUs the 'effective latency' as visible
@@ -8200,8 +8191,6 @@ void __init sched_init(void)
 	 */
 	current->sched_class = &fair_sched_class;
 
-	/* Allocate the nohz_cpu_mask if CONFIG_CPUMASK_OFFSTACK */
-	zalloc_cpumask_var(&nohz_cpu_mask, GFP_NOWAIT);
 #ifdef CONFIG_SMP
 	zalloc_cpumask_var(&sched_domains_tmpmask, GFP_NOWAIT);
 #ifdef CONFIG_NO_HZ
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index d5097c44b407..eb98e55196b9 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -139,7 +139,6 @@ static void tick_nohz_update_jiffies(ktime_t now)
 	struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
 	unsigned long flags;
 
-	cpumask_clear_cpu(cpu, nohz_cpu_mask);
 	ts->idle_waketime = now;
 
 	local_irq_save(flags);
@@ -389,9 +388,6 @@ void tick_nohz_stop_sched_tick(int inidle)
 		else
 			expires.tv64 = KTIME_MAX;
 
-		if (delta_jiffies > 1)
-			cpumask_set_cpu(cpu, nohz_cpu_mask);
-
 		/* Skip reprogram of event if its not changed */
 		if (ts->tick_stopped && ktime_equal(expires, dev->next_event))
 			goto out;
@@ -441,7 +437,6 @@ void tick_nohz_stop_sched_tick(int inidle)
 		 * softirq.
 		 */
 		tick_do_update_jiffies64(ktime_get());
-		cpumask_clear_cpu(cpu, nohz_cpu_mask);
 	}
 	raise_softirq_irqoff(TIMER_SOFTIRQ);
 out:
@@ -524,7 +519,6 @@ void tick_nohz_restart_sched_tick(void)
 	/* Update jiffies first */
 	select_nohz_load_balancer(0);
 	tick_do_update_jiffies64(now);
-	cpumask_clear_cpu(cpu, nohz_cpu_mask);
 
 #ifndef CONFIG_VIRT_CPU_ACCOUNTING
 	/*
-- 
cgit v1.2.2


From 717f98f0f87c16c1e5768ac8f67c27f3d3535a25 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Fri, 5 Aug 2011 22:39:02 -0700
Subject: rcu: Eliminate in_irq() checks in rcu_enter_nohz()

The in_irq() check in rcu_enter_nohz() is redundant because if we really
are in an interrupt, the attempt to re-enter dyntick-idle mode will invoke
rcu_needs_cpu() in any case, which will force the check for RCU callbacks.
So this commit removes the check along with the set_need_resched().

Suggested-by: Frederic Weisbecker <fweisbec@gmail.com>
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 kernel/rcutree.c | 7 -------
 1 file changed, 7 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index 7e2f297aeec8..0c6c30dc6a7c 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -373,13 +373,6 @@ void rcu_enter_nohz(void)
 	smp_mb__after_atomic_inc();  /* Force ordering with next sojourn. */
 	WARN_ON_ONCE(atomic_read(&rdtp->dynticks) & 0x1);
 	local_irq_restore(flags);
-
-	/* If the interrupt queued a callback, get out of dyntick mode. */
-	if (in_irq() &&
-	    (__get_cpu_var(rcu_sched_data).nxtlist ||
-	     __get_cpu_var(rcu_bh_data).nxtlist ||
-	     rcu_preempt_needs_cpu(smp_processor_id())))
-		set_need_resched();
 }
 
 /*
-- 
cgit v1.2.2


From 7eb4f4553ceaa6c64da83c8a71d5a991c0188655 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Sat, 30 Jul 2011 07:32:48 -0700
Subject: rcu: Make rcu_implicit_dynticks_qs() locals be correct size

When the ->dynticks field in the rcu_dynticks structure changed to an
atomic_t, its size on 64-bit systems changed from 64 bits to 32 bits.
The local variables in rcu_implicit_dynticks_qs() need to change as
well, hence this commit.

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 kernel/rcutree.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index 0c6c30dc6a7c..ebd18e56947b 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -488,11 +488,11 @@ static int dyntick_save_progress_counter(struct rcu_data *rdp)
  */
 static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
 {
-	unsigned long curr;
-	unsigned long snap;
+	unsigned int curr;
+	unsigned int snap;
 
-	curr = (unsigned long)atomic_add_return(0, &rdp->dynticks->dynticks);
-	snap = (unsigned long)rdp->dynticks_snap;
+	curr = (unsigned int)atomic_add_return(0, &rdp->dynticks->dynticks);
+	snap = (unsigned int)rdp->dynticks_snap;
 
 	/*
 	 * If the CPU passed through or entered a dynticks idle phase with
@@ -502,7 +502,7 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
 	 * read-side critical section that started before the beginning
 	 * of the current RCU grace period.
 	 */
-	if ((curr & 0x1) == 0 || ULONG_CMP_GE(curr, snap + 2)) {
+	if ((curr & 0x1) == 0 || UINT_CMP_GE(curr, snap + 2)) {
 		trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, "dti");
 		rdp->dynticks_fqs++;
 		return 1;
-- 
cgit v1.2.2


From 6206ab9bab620fc0fbbed30ce20d145b0b3d1840 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paul.mckenney@linaro.org>
Date: Mon, 1 Aug 2011 06:22:11 -0700
Subject: rcu: Move __rcu_read_unlock()'s barrier() within if-statement

We only need to constrain the compiler if we are actually exiting
the top-level RCU read-side critical section.  This commit therefore
moves the first barrier() cal in __rcu_read_unlock() to inside the
"if" statement, thus avoiding needless register flushes for inner
rcu_read_unlock() calls.

Signed-off-by: Paul E. McKenney <paul.mckenney@linaro.org>
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 kernel/rcutree_plugin.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h
index 4bac5a29fb69..ed70f6bf4c31 100644
--- a/kernel/rcutree_plugin.h
+++ b/kernel/rcutree_plugin.h
@@ -415,10 +415,10 @@ void __rcu_read_unlock(void)
 {
 	struct task_struct *t = current;
 
-	barrier();  /* needed if we ever invoke rcu_read_unlock in rcutree.c */
 	if (t->rcu_read_lock_nesting != 1)
 		--t->rcu_read_lock_nesting;
 	else {
+		barrier();  /* critical section before exit code. */
 		t->rcu_read_lock_nesting = INT_MIN;
 		barrier();  /* assign before ->rcu_read_unlock_special load */
 		if (unlikely(ACCESS_ONCE(t->rcu_read_unlock_special)))
-- 
cgit v1.2.2


From 4627e240dfee4a0a46a58010b1b721b4ded1918f Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Wed, 3 Aug 2011 03:34:24 -0700
Subject: rcu: Dump local stack if cannot dump all CPUs' stacks

The trigger_all_cpu_backtrace() function is a no-op in architectures that
do not define arch_trigger_all_cpu_backtrace.  On such architectures, RCU
CPU stall warning messages contain no stack trace information, which makes
debugging quite difficult.  This commit therefore substitutes dump_stack()
for architectures that do not define arch_trigger_all_cpu_backtrace,
so that at least the local CPU's stack is dumped as part of the RCU CPU
stall warning message.

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 kernel/rcutree.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index ebd18e56947b..a07bf553e02a 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -583,7 +583,8 @@ static void print_other_cpu_stall(struct rcu_state *rsp)
 	}
 	printk("} (detected by %d, t=%ld jiffies)\n",
 	       smp_processor_id(), (long)(jiffies - rsp->gp_start));
-	trigger_all_cpu_backtrace();
+	if (!trigger_all_cpu_backtrace())
+		dump_stack();
 
 	/* If so configured, complain about tasks blocking the grace period. */
 
@@ -604,7 +605,8 @@ static void print_cpu_stall(struct rcu_state *rsp)
 	 */
 	printk(KERN_ERR "INFO: %s detected stall on CPU %d (t=%lu jiffies)\n",
 	       rsp->name, smp_processor_id(), jiffies - rsp->gp_start);
-	trigger_all_cpu_backtrace();
+	if (!trigger_all_cpu_backtrace())
+		dump_stack();
 
 	raw_spin_lock_irqsave(&rnp->lock, flags);
 	if (ULONG_CMP_GE(jiffies, rsp->jiffies_stall))
-- 
cgit v1.2.2


From 5c51dd7349d4bb26f845f17f85daa168f5fa03f2 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paul.mckenney@linaro.org>
Date: Thu, 4 Aug 2011 06:59:03 -0700
Subject: rcu: Prevent early boot set_need_resched() from __rcu_pending()

There isn't a whole lot of point in poking the scheduler before there
are other tasks to switch to.  This commit therefore adds a check
for rcu_scheduler_fully_active in __rcu_pending() to suppress any
pre-scheduler calls to set_need_resched().  The downside of this approach
is additional runtime overhead in a reasonably hot code path.

Signed-off-by: Paul E. McKenney <paul.mckenney@linaro.org>
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 kernel/rcutree.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index a07bf553e02a..0051dbf6958e 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -1707,7 +1707,8 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp)
 	check_cpu_stall(rsp, rdp);
 
 	/* Is the RCU core waiting for a quiescent state from this CPU? */
-	if (rdp->qs_pending && !rdp->passed_quiesce) {
+	if (rcu_scheduler_fully_active &&
+	    rdp->qs_pending && !rdp->passed_quiesce) {
 
 		/*
 		 * If force_quiescent_state() coming soon and this CPU
-- 
cgit v1.2.2


From 82e78d80fc392ac7e98326bc8beeb8a679913ffd Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paul.mckenney@linaro.org>
Date: Thu, 4 Aug 2011 07:55:34 -0700
Subject: rcu: Simplify unboosting checks

Commit 7765be (Fix RCU_BOOST race handling current->rcu_read_unlock_special)
introduced a new ->rcu_boosted field in the task structure.  This is
redundant because the existing ->rcu_boost_mutex will be non-NULL at
any time that ->rcu_boosted is nonzero.  Therefore, this commit removes
->rcu_boosted and tests ->rcu_boost_mutex instead.

Signed-off-by: Paul E. McKenney <paul.mckenney@linaro.org>
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 kernel/rcutree_plugin.h | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h
index ed70f6bf4c31..eeb38ee8ebba 100644
--- a/kernel/rcutree_plugin.h
+++ b/kernel/rcutree_plugin.h
@@ -306,6 +306,9 @@ static noinline void rcu_read_unlock_special(struct task_struct *t)
 	int empty_exp;
 	unsigned long flags;
 	struct list_head *np;
+#ifdef CONFIG_RCU_BOOST
+	struct rt_mutex *rbmp = NULL;
+#endif /* #ifdef CONFIG_RCU_BOOST */
 	struct rcu_node *rnp;
 	int special;
 
@@ -351,6 +354,7 @@ static noinline void rcu_read_unlock_special(struct task_struct *t)
 		smp_mb(); /* ensure expedited fastpath sees end of RCU c-s. */
 		np = rcu_next_node_entry(t, rnp);
 		list_del_init(&t->rcu_node_entry);
+		t->rcu_blocked_node = NULL;
 		trace_rcu_unlock_preempted_task("rcu_preempt",
 						rnp->gpnum, t->pid);
 		if (&t->rcu_node_entry == rnp->gp_tasks)
@@ -360,13 +364,12 @@ static noinline void rcu_read_unlock_special(struct task_struct *t)
 #ifdef CONFIG_RCU_BOOST
 		if (&t->rcu_node_entry == rnp->boost_tasks)
 			rnp->boost_tasks = np;
-		/* Snapshot and clear ->rcu_boosted with rcu_node lock held. */
-		if (t->rcu_boosted) {
-			special |= RCU_READ_UNLOCK_BOOSTED;
-			t->rcu_boosted = 0;
+		/* Snapshot/clear ->rcu_boost_mutex with rcu_node lock held. */
+		if (t->rcu_boost_mutex) {
+			rbmp = t->rcu_boost_mutex;
+			t->rcu_boost_mutex = NULL;
 		}
 #endif /* #ifdef CONFIG_RCU_BOOST */
-		t->rcu_blocked_node = NULL;
 
 		/*
 		 * If this was the last task on the current list, and if
@@ -387,10 +390,8 @@ static noinline void rcu_read_unlock_special(struct task_struct *t)
 
 #ifdef CONFIG_RCU_BOOST
 		/* Unboost if we were boosted. */
-		if (special & RCU_READ_UNLOCK_BOOSTED) {
-			rt_mutex_unlock(t->rcu_boost_mutex);
-			t->rcu_boost_mutex = NULL;
-		}
+		if (rbmp)
+			rt_mutex_unlock(rbmp);
 #endif /* #ifdef CONFIG_RCU_BOOST */
 
 		/*
@@ -1206,7 +1207,6 @@ static int rcu_boost(struct rcu_node *rnp)
 	t = container_of(tb, struct task_struct, rcu_node_entry);
 	rt_mutex_init_proxy_locked(&mtx, t);
 	t->rcu_boost_mutex = &mtx;
-	t->rcu_boosted = 1;
 	raw_spin_unlock_irqrestore(&rnp->lock, flags);
 	rt_mutex_lock(&mtx);  /* Side effect: boosts task t's priority. */
 	rt_mutex_unlock(&mtx);  /* Keep lockdep happy. */
-- 
cgit v1.2.2


From 037067a1b6f9a70f862f3ed9d59fe28b7cd55ac4 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paul.mckenney@linaro.org>
Date: Sun, 7 Aug 2011 20:26:31 -0700
Subject: rcu: Prohibit grace periods during early boot

Greater use of RCU during early boot (before the scheduler is operating)
is causing RCU to attempt to start grace periods during that time, which
in turn is resulting in both RCU and the callback functions attempting
to use the scheduler before it is ready.

This commit prevents these problems by prohibiting RCU grace periods
until after the scheduler has spawned the first non-idle task.

Signed-off-by: Paul E. McKenney <paul.mckenney@linaro.org>
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 kernel/rcutree.c | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index 0051dbf6958e..9970116163ba 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -838,8 +838,11 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags)
 	struct rcu_data *rdp = this_cpu_ptr(rsp->rda);
 	struct rcu_node *rnp = rcu_get_root(rsp);
 
-	if (!cpu_needs_another_gp(rsp, rdp) || rsp->fqs_active) {
-		if (cpu_needs_another_gp(rsp, rdp))
+	if (!rcu_scheduler_fully_active ||
+	    !cpu_needs_another_gp(rsp, rdp) ||
+	    rsp->fqs_active) {
+		if (rcu_scheduler_fully_active &&
+		    cpu_needs_another_gp(rsp, rdp))
 			rsp->fqs_need_gp = 1;
 		if (rnp->completed == rsp->completed) {
 			raw_spin_unlock_irqrestore(&rnp->lock, flags);
-- 
cgit v1.2.2


From 9bc8b5586f94be6391458074ecbba8827ba8ba9d Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Sat, 13 Aug 2011 13:31:47 -0700
Subject: rcu: Suppress NMI backtraces when stall ends before dump

It is possible for an RCU CPU stall to end just as it is detected, in
which case the current code will uselessly dump all CPU's stacks.
This commit therefore checks for this condition and refrains from
sending needless NMIs.

And yes, the stall might also end just after we checked all CPUs and
tasks, but in that case we would at least have given some clue as
to which CPU/task was at fault.

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 kernel/rcutree.c        | 13 +++++++++----
 kernel/rcutree.h        |  2 +-
 kernel/rcutree_plugin.h | 13 +++++++++----
 3 files changed, 19 insertions(+), 9 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index 9970116163ba..ade788320dd6 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -545,6 +545,7 @@ static void print_other_cpu_stall(struct rcu_state *rsp)
 	int cpu;
 	long delta;
 	unsigned long flags;
+	int ndetected;
 	struct rcu_node *rnp = rcu_get_root(rsp);
 
 	/* Only let one CPU complain about others per time interval. */
@@ -561,7 +562,7 @@ static void print_other_cpu_stall(struct rcu_state *rsp)
 	 * Now rat on any tasks that got kicked up to the root rcu_node
 	 * due to CPU offlining.
 	 */
-	rcu_print_task_stall(rnp);
+	ndetected = rcu_print_task_stall(rnp);
 	raw_spin_unlock_irqrestore(&rnp->lock, flags);
 
 	/*
@@ -573,17 +574,21 @@ static void print_other_cpu_stall(struct rcu_state *rsp)
 	       rsp->name);
 	rcu_for_each_leaf_node(rsp, rnp) {
 		raw_spin_lock_irqsave(&rnp->lock, flags);
-		rcu_print_task_stall(rnp);
+		ndetected += rcu_print_task_stall(rnp);
 		raw_spin_unlock_irqrestore(&rnp->lock, flags);
 		if (rnp->qsmask == 0)
 			continue;
 		for (cpu = 0; cpu <= rnp->grphi - rnp->grplo; cpu++)
-			if (rnp->qsmask & (1UL << cpu))
+			if (rnp->qsmask & (1UL << cpu)) {
 				printk(" %d", rnp->grplo + cpu);
+				ndetected++;
+			}
 	}
 	printk("} (detected by %d, t=%ld jiffies)\n",
 	       smp_processor_id(), (long)(jiffies - rsp->gp_start));
-	if (!trigger_all_cpu_backtrace())
+	if (ndetected == 0)
+		printk(KERN_ERR "INFO: Stall ended before state dump start\n");
+	else if (!trigger_all_cpu_backtrace())
 		dump_stack();
 
 	/* If so configured, complain about tasks blocking the grace period. */
diff --git a/kernel/rcutree.h b/kernel/rcutree.h
index 51638b68b2dc..f509f728f9fb 100644
--- a/kernel/rcutree.h
+++ b/kernel/rcutree.h
@@ -438,7 +438,7 @@ static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp,
 static void rcu_stop_cpu_kthread(int cpu);
 #endif /* #ifdef CONFIG_HOTPLUG_CPU */
 static void rcu_print_detail_task_stall(struct rcu_state *rsp);
-static void rcu_print_task_stall(struct rcu_node *rnp);
+static int rcu_print_task_stall(struct rcu_node *rnp);
 static void rcu_preempt_stall_reset(void);
 static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp);
 #ifdef CONFIG_HOTPLUG_CPU
diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h
index eeb38ee8ebba..d3127e8764cb 100644
--- a/kernel/rcutree_plugin.h
+++ b/kernel/rcutree_plugin.h
@@ -483,16 +483,20 @@ static void rcu_print_detail_task_stall(struct rcu_state *rsp)
  * Scan the current list of tasks blocked within RCU read-side critical
  * sections, printing out the tid of each.
  */
-static void rcu_print_task_stall(struct rcu_node *rnp)
+static int rcu_print_task_stall(struct rcu_node *rnp)
 {
 	struct task_struct *t;
+	int ndetected = 0;
 
 	if (!rcu_preempt_blocked_readers_cgp(rnp))
-		return;
+		return 0;
 	t = list_entry(rnp->gp_tasks,
 		       struct task_struct, rcu_node_entry);
-	list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry)
+	list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry) {
 		printk(" P%d", t->pid);
+		ndetected++;
+	}
+	return ndetected;
 }
 
 /*
@@ -976,8 +980,9 @@ static void rcu_print_detail_task_stall(struct rcu_state *rsp)
  * Because preemptible RCU does not exist, we never have to check for
  * tasks blocked within RCU read-side critical sections.
  */
-static void rcu_print_task_stall(struct rcu_node *rnp)
+static int rcu_print_task_stall(struct rcu_node *rnp)
 {
+	return 0;
 }
 
 /*
-- 
cgit v1.2.2


From 06ae115a1d551cd952d80df06eaf8b5153351875 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Sun, 14 Aug 2011 15:56:54 -0700
Subject: rcu: Avoid having just-onlined CPU resched itself when RCU is idle

CPUs set rdp->qs_pending when coming online to resolve races with
grace-period start.  However, this means that if RCU is idle, the
just-onlined CPU might needlessly send itself resched IPIs.  Adjust
the online-CPU initialization to avoid this, and also to correctly
cause the CPU to respond to the current grace period if needed.

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Tested-by: Josh Boyer <jwboyer@redhat.com>
Tested-by: Christian Hoffmann <email@christianhoffmann.info>
---
 kernel/rcutree.c | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index ade788320dd6..c95fa89ffef9 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -1911,8 +1911,6 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptible)
 
 	/* Set up local state, ensuring consistent view of global state. */
 	raw_spin_lock_irqsave(&rnp->lock, flags);
-	rdp->passed_quiesce = 0;  /* We could be racing with new GP, */
-	rdp->qs_pending = 1;	 /*  so set up to respond to current GP. */
 	rdp->beenonline = 1;	 /* We have now been online. */
 	rdp->preemptible = preemptible;
 	rdp->qlen_last_fqs_check = 0;
@@ -1937,8 +1935,15 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptible)
 		rnp->qsmaskinit |= mask;
 		mask = rnp->grpmask;
 		if (rnp == rdp->mynode) {
-			rdp->gpnum = rnp->completed; /* if GP in progress... */
+			/*
+			 * If there is a grace period in progress, we will
+			 * set up to wait for it next time we run the
+			 * RCU core code.
+			 */
+			rdp->gpnum = rnp->completed;
 			rdp->completed = rnp->completed;
+			rdp->passed_quiesce = 0;
+			rdp->qs_pending = 0;
 			rdp->passed_quiesce_gpnum = rnp->gpnum - 1;
 			trace_rcu_grace_period(rsp->name, rdp->gpnum, "cpuonl");
 		}
-- 
cgit v1.2.2


From 5342e269b2b58ee0b0b4168a94087faaa60d0567 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paul.mckenney@linaro.org>
Date: Tue, 16 Aug 2011 17:46:46 -0700
Subject: rcu: Permit rt_mutex_unlock() with irqs disabled

Create a separate lockdep class for the rt_mutex used for RCU priority
boosting and enable use of rt_mutex_lock() with irqs disabled.  This
prevents RCU priority boosting from falling prey to deadlocks when
someone begins an RCU read-side critical section in preemptible state,
but releases it with an irq-disabled lock held.

Unfortunately, the scheduler's runqueue and priority-inheritance locks
still must either completely enclose or be completely enclosed by any
overlapping RCU read-side critical section.

This version removes a redundant local_irq_restore() noted by
Yong Zhang.

Signed-off-by: Paul E. McKenney <paul.mckenney@linaro.org>
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 kernel/rcutree_plugin.h | 5 +++++
 kernel/rtmutex.c        | 8 ++++++++
 2 files changed, 13 insertions(+)

(limited to 'kernel')

diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h
index d3127e8764cb..28422767d854 100644
--- a/kernel/rcutree_plugin.h
+++ b/kernel/rcutree_plugin.h
@@ -1149,6 +1149,8 @@ static void rcu_initiate_boost_trace(struct rcu_node *rnp)
 
 #endif /* #else #ifdef CONFIG_RCU_TRACE */
 
+static struct lock_class_key rcu_boost_class;
+
 /*
  * Carry out RCU priority boosting on the task indicated by ->exp_tasks
  * or ->boost_tasks, advancing the pointer to the next task in the
@@ -1211,6 +1213,9 @@ static int rcu_boost(struct rcu_node *rnp)
 	 */
 	t = container_of(tb, struct task_struct, rcu_node_entry);
 	rt_mutex_init_proxy_locked(&mtx, t);
+	/* Avoid lockdep false positives.  This rt_mutex is its own thing. */
+	lockdep_set_class_and_name(&mtx.wait_lock, &rcu_boost_class,
+				   "rcu_boost_mutex");
 	t->rcu_boost_mutex = &mtx;
 	raw_spin_unlock_irqrestore(&rnp->lock, flags);
 	rt_mutex_lock(&mtx);  /* Side effect: boosts task t's priority. */
diff --git a/kernel/rtmutex.c b/kernel/rtmutex.c
index 255e1662acdb..5e8d9cce7470 100644
--- a/kernel/rtmutex.c
+++ b/kernel/rtmutex.c
@@ -579,6 +579,7 @@ __rt_mutex_slowlock(struct rt_mutex *lock, int state,
 		    struct rt_mutex_waiter *waiter)
 {
 	int ret = 0;
+	int was_disabled;
 
 	for (;;) {
 		/* Try to acquire the lock: */
@@ -601,10 +602,17 @@ __rt_mutex_slowlock(struct rt_mutex *lock, int state,
 
 		raw_spin_unlock(&lock->wait_lock);
 
+		was_disabled = irqs_disabled();
+		if (was_disabled)
+			local_irq_enable();
+
 		debug_rt_mutex_print_deadlock(waiter);
 
 		schedule_rt_mutex(lock);
 
+		if (was_disabled)
+			local_irq_disable();
+
 		raw_spin_lock(&lock->wait_lock);
 		set_current_state(state);
 	}
-- 
cgit v1.2.2


From 93898fb1a395d2a5a53db238c68036da2f8c64d1 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Wed, 17 Aug 2011 12:39:34 -0700
Subject: rcu: Make rcu_torture_fqs() exit loops at end of test

The rcu_torture_fqs() function can prevent the rcutorture tests from
completing, resulting in a hang.  This commit therefore ensures that
rcu_torture_fqs() will exit its inner loops at the end of the test,
and also applies the newish ULONG_CMP_LT() macro to time comparisons.

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 kernel/rcutorture.c | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c
index ade7771b208a..2431d576e9ca 100644
--- a/kernel/rcutorture.c
+++ b/kernel/rcutorture.c
@@ -741,7 +741,7 @@ static int rcu_torture_boost(void *arg)
 	do {
 		/* Wait for the next test interval. */
 		oldstarttime = boost_starttime;
-		while (jiffies - oldstarttime > ULONG_MAX / 2) {
+		while (ULONG_CMP_LT(jiffies, oldstarttime)) {
 			schedule_timeout_uninterruptible(1);
 			rcu_stutter_wait("rcu_torture_boost");
 			if (kthread_should_stop() ||
@@ -752,7 +752,7 @@ static int rcu_torture_boost(void *arg)
 		/* Do one boost-test interval. */
 		endtime = oldstarttime + test_boost_duration * HZ;
 		call_rcu_time = jiffies;
-		while (jiffies - endtime > ULONG_MAX / 2) {
+		while (ULONG_CMP_LT(jiffies, endtime)) {
 			/* If we don't have a callback in flight, post one. */
 			if (!rbi.inflight) {
 				smp_mb(); /* RCU core before ->inflight = 1. */
@@ -818,11 +818,13 @@ rcu_torture_fqs(void *arg)
 	VERBOSE_PRINTK_STRING("rcu_torture_fqs task started");
 	do {
 		fqs_resume_time = jiffies + fqs_stutter * HZ;
-		while (jiffies - fqs_resume_time > LONG_MAX) {
+		while (ULONG_CMP_LT(jiffies, fqs_resume_time) &&
+		       !kthread_should_stop()) {
 			schedule_timeout_interruptible(1);
 		}
 		fqs_burst_remaining = fqs_duration;
-		while (fqs_burst_remaining > 0) {
+		while (fqs_burst_remaining > 0 &&
+		       !kthread_should_stop()) {
 			cur_ops->fqs();
 			udelay(fqs_holdoff);
 			fqs_burst_remaining -= fqs_holdoff;
-- 
cgit v1.2.2


From ab8f11e5f6655861ad4758a7da76b2fc0e0dcc98 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paul.mckenney@linaro.org>
Date: Thu, 18 Aug 2011 09:30:32 -0700
Subject: rcu: Make rcu_torture_boost() exit loops at end of test

One of the loops in rcu_torture_boost() fails to check kthread_should_stop(),
and thus might be slowing or even stopping completion of rcutorture tests
at rmmod time.  This commit adds the kthread_should_stop() check to the
offending loop.

Signed-off-by: Paul E. McKenney <paul.mckenney@linaro.org>
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 kernel/rcutorture.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c
index 2431d576e9ca..764825c2685c 100644
--- a/kernel/rcutorture.c
+++ b/kernel/rcutorture.c
@@ -779,7 +779,8 @@ static int rcu_torture_boost(void *arg)
 		 * interval.  Besides, we are running at RT priority,
 		 * so delays should be relatively rare.
 		 */
-		while (oldstarttime == boost_starttime) {
+		while (oldstarttime == boost_starttime &&
+		       !kthread_should_stop()) {
 			if (mutex_trylock(&boost_mutex)) {
 				boost_starttime = jiffies +
 						  test_boost_interval * HZ;
-- 
cgit v1.2.2


From 5b61b0baa9e80289c53413e573befc5790a04ac7 Mon Sep 17 00:00:00 2001
From: Mike Galbraith <efault@gmx.de>
Date: Fri, 19 Aug 2011 11:39:11 -0700
Subject: rcu: Wire up RCU_BOOST_PRIO for rcutree

RCU boost threads start life at RCU_BOOST_PRIO, while others remain
at RCU_KTHREAD_PRIO.  While here, change thread names to match other
kthreads, and adjust rcu_yield() to not override the priority set by
the user.  This last change sets the stage for runtime changes to
priority in the -rt tree.

Signed-off-by: Mike Galbraith <efault@gmx.de>
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 kernel/rcutree.c        |  2 --
 kernel/rcutree_plugin.h | 20 +++++++++++++++-----
 2 files changed, 15 insertions(+), 7 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index c95fa89ffef9..8455043c9250 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -131,8 +131,6 @@ static void rcu_node_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu);
 static void invoke_rcu_core(void);
 static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp);
 
-#define RCU_KTHREAD_PRIO 1	/* RT priority for per-CPU kthreads. */
-
 /*
  * Track the rcutorture test sequence number and the update version
  * number within a given test.  The rcutorture_testseq is incremented
diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h
index 28422767d854..b4cbe5bf2326 100644
--- a/kernel/rcutree_plugin.h
+++ b/kernel/rcutree_plugin.h
@@ -27,6 +27,14 @@
 #include <linux/delay.h>
 #include <linux/stop_machine.h>
 
+#define RCU_KTHREAD_PRIO 1
+
+#ifdef CONFIG_RCU_BOOST
+#define RCU_BOOST_PRIO CONFIG_RCU_BOOST_PRIO
+#else
+#define RCU_BOOST_PRIO RCU_KTHREAD_PRIO
+#endif
+
 /*
  * Check the RCU kernel configuration parameters and print informative
  * messages about anything out of the ordinary.  If you like #ifdef, you
@@ -1364,13 +1372,13 @@ static int __cpuinit rcu_spawn_one_boost_kthread(struct rcu_state *rsp,
 	if (rnp->boost_kthread_task != NULL)
 		return 0;
 	t = kthread_create(rcu_boost_kthread, (void *)rnp,
-			   "rcub%d", rnp_index);
+			   "rcub/%d", rnp_index);
 	if (IS_ERR(t))
 		return PTR_ERR(t);
 	raw_spin_lock_irqsave(&rnp->lock, flags);
 	rnp->boost_kthread_task = t;
 	raw_spin_unlock_irqrestore(&rnp->lock, flags);
-	sp.sched_priority = RCU_KTHREAD_PRIO;
+	sp.sched_priority = RCU_BOOST_PRIO;
 	sched_setscheduler_nocheck(t, SCHED_FIFO, &sp);
 	wake_up_process(t); /* get to TASK_INTERRUPTIBLE quickly. */
 	return 0;
@@ -1465,6 +1473,7 @@ static void rcu_yield(void (*f)(unsigned long), unsigned long arg)
 {
 	struct sched_param sp;
 	struct timer_list yield_timer;
+	int prio = current->rt_priority;
 
 	setup_timer_on_stack(&yield_timer, f, arg);
 	mod_timer(&yield_timer, jiffies + 2);
@@ -1472,7 +1481,8 @@ static void rcu_yield(void (*f)(unsigned long), unsigned long arg)
 	sched_setscheduler_nocheck(current, SCHED_NORMAL, &sp);
 	set_user_nice(current, 19);
 	schedule();
-	sp.sched_priority = RCU_KTHREAD_PRIO;
+	set_user_nice(current, 0);
+	sp.sched_priority = prio;
 	sched_setscheduler_nocheck(current, SCHED_FIFO, &sp);
 	del_timer(&yield_timer);
 }
@@ -1591,7 +1601,7 @@ static int __cpuinit rcu_spawn_one_cpu_kthread(int cpu)
 	t = kthread_create_on_node(rcu_cpu_kthread,
 				   (void *)(long)cpu,
 				   cpu_to_node(cpu),
-				   "rcuc%d", cpu);
+				   "rcuc/%d", cpu);
 	if (IS_ERR(t))
 		return PTR_ERR(t);
 	if (cpu_online(cpu))
@@ -1700,7 +1710,7 @@ static int __cpuinit rcu_spawn_one_node_kthread(struct rcu_state *rsp,
 		return 0;
 	if (rnp->node_kthread_task == NULL) {
 		t = kthread_create(rcu_node_kthread, (void *)rnp,
-				   "rcun%d", rnp_index);
+				   "rcun/%d", rnp_index);
 		if (IS_ERR(t))
 			return PTR_ERR(t);
 		raw_spin_lock_irqsave(&rnp->lock, flags);
-- 
cgit v1.2.2


From e90c53d3e238dd0b7b02964370e8fece1778df96 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Sat, 20 Aug 2011 18:29:32 -0700
Subject: rcu: Remove rcu_needs_cpu_flush() to avoid false quiescent states

The purpose of rcu_needs_cpu_flush() was to iterate on pushing the
current grace period in order to help the current CPU enter dyntick-idle
mode.  However, this can result in failures if the CPU starts entering
dyntick-idle mode, but then backs out.  In this case, the call to
rcu_pending() from rcu_needs_cpu_flush() might end up announcing a
non-existing quiescent state.

This commit therefore removes rcu_needs_cpu_flush() in favor of letting
the dyntick-idle machinery at the end of the softirq handler push the
loop along via its call to rcu_pending().

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 kernel/rcutree.c        |  3 ---
 kernel/rcutree.h        |  1 -
 kernel/rcutree_plugin.h | 25 -------------------------
 3 files changed, 29 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index 8455043c9250..e75df0c93abd 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -1528,9 +1528,6 @@ static void rcu_process_callbacks(struct softirq_action *unused)
 				&__get_cpu_var(rcu_sched_data));
 	__rcu_process_callbacks(&rcu_bh_state, &__get_cpu_var(rcu_bh_data));
 	rcu_preempt_process_callbacks();
-
-	/* If we are last CPU on way to dyntick-idle mode, accelerate it. */
-	rcu_needs_cpu_flush();
 	trace_rcu_utilization("End RCU core");
 }
 
diff --git a/kernel/rcutree.h b/kernel/rcutree.h
index f509f728f9fb..849ce9ec51fe 100644
--- a/kernel/rcutree.h
+++ b/kernel/rcutree.h
@@ -458,7 +458,6 @@ static int rcu_preempt_needs_cpu(int cpu);
 static void __cpuinit rcu_preempt_init_percpu_data(int cpu);
 static void rcu_preempt_send_cbs_to_online(void);
 static void __init __rcu_init_preempt(void);
-static void rcu_needs_cpu_flush(void);
 static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags);
 static void rcu_preempt_boost_start_gp(struct rcu_node *rnp);
 static void invoke_rcu_callbacks_kthread(void);
diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h
index b4cbe5bf2326..4b9b9f8a4184 100644
--- a/kernel/rcutree_plugin.h
+++ b/kernel/rcutree_plugin.h
@@ -1948,15 +1948,6 @@ int rcu_needs_cpu(int cpu)
 	return rcu_needs_cpu_quick_check(cpu);
 }
 
-/*
- * Check to see if we need to continue a callback-flush operations to
- * allow the last CPU to enter dyntick-idle mode.  But fast dyntick-idle
- * entry is not configured, so we never do need to.
- */
-static void rcu_needs_cpu_flush(void)
-{
-}
-
 #else /* #if !defined(CONFIG_RCU_FAST_NO_HZ) */
 
 #define RCU_NEEDS_CPU_FLUSHES 5
@@ -2032,20 +2023,4 @@ int rcu_needs_cpu(int cpu)
 	return c;
 }
 
-/*
- * Check to see if we need to continue a callback-flush operations to
- * allow the last CPU to enter dyntick-idle mode.
- */
-static void rcu_needs_cpu_flush(void)
-{
-	int cpu = smp_processor_id();
-	unsigned long flags;
-
-	if (per_cpu(rcu_dyntick_drain, cpu) <= 0)
-		return;
-	local_irq_save(flags);
-	(void)rcu_needs_cpu(cpu);
-	local_irq_restore(flags);
-}
-
 #endif /* #else #if !defined(CONFIG_RCU_FAST_NO_HZ) */
-- 
cgit v1.2.2


From afe24b122eb6edb5f1cb942570ac8d766105c7fc Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Wed, 24 Aug 2011 16:52:09 -0700
Subject: rcu: Move propagation of ->completed from rcu_start_gp() to
 rcu_report_qs_rsp()

It is possible for the CPU that noted the end of the prior grace period
to not need a new one, and therefore to decide to propagate ->completed
throughout the rcu_node tree without starting another grace period.
However, in so doing, it releases the root rcu_node structure's lock,
which can allow some other CPU to start another grace period.  The first
CPU will be propagating ->completed in parallel with the second CPU
initializing the rcu_node tree for the new grace period.  In theory
this is harmless, but in practice we need to keep things simple.

This commit therefore moves the propagation of ->completed to
rcu_report_qs_rsp(), and refrains from marking the old grace period
as having been completed until it has finished doing this.  This
prevents anyone from starting a new grace period concurrently with
marking the old grace period as having been completed.

Of course, the optimization where a CPU needing a new grace period
doesn't bother marking the old one completed is still in effect:
In that case, the marking happens implicitly as part of initializing
the new grace period.

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 kernel/rcutree.c | 71 ++++++++++++++++++++++++++++++++++++++++----------------
 1 file changed, 51 insertions(+), 20 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index e75df0c93abd..e234eb92a177 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -842,28 +842,24 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags)
 	struct rcu_node *rnp = rcu_get_root(rsp);
 
 	if (!rcu_scheduler_fully_active ||
-	    !cpu_needs_another_gp(rsp, rdp) ||
-	    rsp->fqs_active) {
-		if (rcu_scheduler_fully_active &&
-		    cpu_needs_another_gp(rsp, rdp))
-			rsp->fqs_need_gp = 1;
-		if (rnp->completed == rsp->completed) {
-			raw_spin_unlock_irqrestore(&rnp->lock, flags);
-			return;
-		}
-		raw_spin_unlock(&rnp->lock);	 /* irqs remain disabled. */
+	    !cpu_needs_another_gp(rsp, rdp)) {
+		/*
+		 * Either the scheduler hasn't yet spawned the first
+		 * non-idle task or this CPU does not need another
+		 * grace period.  Either way, don't start a new grace
+		 * period.
+		 */
+		raw_spin_unlock_irqrestore(&rnp->lock, flags);
+		return;
+	}
 
+	if (rsp->fqs_active) {
 		/*
-		 * Propagate new ->completed value to rcu_node structures
-		 * so that other CPUs don't have to wait until the start
-		 * of the next grace period to process their callbacks.
+		 * This CPU needs a grace period, but force_quiescent_state()
+		 * is running.  Tell it to start one on this CPU's behalf.
 		 */
-		rcu_for_each_node_breadth_first(rsp, rnp) {
-			raw_spin_lock(&rnp->lock); /* irqs already disabled. */
-			rnp->completed = rsp->completed;
-			raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
-		}
-		local_irq_restore(flags);
+		rsp->fqs_need_gp = 1;
+		raw_spin_unlock_irqrestore(&rnp->lock, flags);
 		return;
 	}
 
@@ -947,6 +943,8 @@ static void rcu_report_qs_rsp(struct rcu_state *rsp, unsigned long flags)
 	__releases(rcu_get_root(rsp)->lock)
 {
 	unsigned long gp_duration;
+	struct rcu_node *rnp = rcu_get_root(rsp);
+	struct rcu_data *rdp = this_cpu_ptr(rsp->rda);
 
 	WARN_ON_ONCE(!rcu_gp_in_progress(rsp));
 
@@ -958,7 +956,40 @@ static void rcu_report_qs_rsp(struct rcu_state *rsp, unsigned long flags)
 	gp_duration = jiffies - rsp->gp_start;
 	if (gp_duration > rsp->gp_max)
 		rsp->gp_max = gp_duration;
-	rsp->completed = rsp->gpnum;
+
+	/*
+	 * We know the grace period is complete, but to everyone else
+	 * it appears to still be ongoing.  But it is also the case
+	 * that to everyone else it looks like there is nothing that
+	 * they can do to advance the grace period.  It is therefore
+	 * safe for us to drop the lock in order to mark the grace
+	 * period as completed in all of the rcu_node structures.
+	 *
+	 * But if this CPU needs another grace period, it will take
+	 * care of this while initializing the next grace period.
+	 * We use RCU_WAIT_TAIL instead of the usual RCU_DONE_TAIL
+	 * because the callbacks have not yet been advanced: Those
+	 * callbacks are waiting on the grace period that just now
+	 * completed.
+	 */
+	if (*rdp->nxttail[RCU_WAIT_TAIL] == NULL) {
+		raw_spin_unlock(&rnp->lock);	 /* irqs remain disabled. */
+
+		/*
+		 * Propagate new ->completed value to rcu_node structures
+		 * so that other CPUs don't have to wait until the start
+		 * of the next grace period to process their callbacks.
+		 */
+		rcu_for_each_node_breadth_first(rsp, rnp) {
+			raw_spin_lock(&rnp->lock); /* irqs already disabled. */
+			rnp->completed = rsp->gpnum;
+			raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
+		}
+		rnp = rcu_get_root(rsp);
+		raw_spin_lock(&rnp->lock); /* irqs already disabled. */
+	}
+
+	rsp->completed = rsp->gpnum;  /* Declare the grace period complete. */
 	trace_rcu_grace_period(rsp->name, rsp->completed, "end");
 	rsp->signaled = RCU_GP_IDLE;
 	rcu_start_gp(rsp, flags);  /* releases root node's rnp->lock. */
-- 
cgit v1.2.2


From 2a5306cc5f383b0e7414c75e458111afd4a563a4 Mon Sep 17 00:00:00 2001
From: Ming Lei <ming.lei@canonical.com>
Date: Thu, 29 Sep 2011 22:07:23 +0200
Subject: PM / Tracing: build rpm-traces.c only if CONFIG_PM_RUNTIME is set

Do not build kernel/trace/rpm-traces.c if CONFIG_PM_RUNTIME is not
set, which avoids a build failure.

[rjw: Added the changelog and modified the subject slightly.]

Signed-off-by: Ming Lei <ming.lei@canonical.com>
Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
---
 kernel/trace/Makefile | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'kernel')

diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index 56bdab5b3793..f49405f842f4 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -53,7 +53,9 @@ endif
 obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o
 obj-$(CONFIG_KPROBE_EVENT) += trace_kprobe.o
 obj-$(CONFIG_TRACEPOINTS) += power-traces.o
+ifeq ($(CONFIG_PM_RUNTIME),y)
 obj-$(CONFIG_TRACEPOINTS) += rpm-traces.o
+endif
 ifeq ($(CONFIG_TRACING),y)
 obj-$(CONFIG_KGDB_KDB) += trace_kdb.o
 endif
-- 
cgit v1.2.2


From d178bc3a708f39cbfefc3fab37032d3f2511b4ec Mon Sep 17 00:00:00 2001
From: Serge Hallyn <serge.hallyn@canonical.com>
Date: Mon, 26 Sep 2011 10:45:18 -0500
Subject: user namespace: usb: make usb urbs user namespace aware (v2)

Add to the dev_state and alloc_async structures the user namespace
corresponding to the uid and euid.  Pass these to kill_pid_info_as_uid(),
which can then implement a proper, user-namespace-aware uid check.

Changelog:
Sep 20: Per Oleg's suggestion: Instead of caching and passing user namespace,
	uid, and euid each separately, pass a struct cred.
Sep 26: Address Alan Stern's comments: don't define a struct cred at
	usbdev_open(), and take and put a cred at async_completed() to
	ensure it lasts for the duration of kill_pid_info_as_cred().

Signed-off-by: Serge Hallyn <serge.hallyn@canonical.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: Tejun Heo <tj@kernel.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 kernel/signal.c | 24 ++++++++++++++++--------
 1 file changed, 16 insertions(+), 8 deletions(-)

(limited to 'kernel')

diff --git a/kernel/signal.c b/kernel/signal.c
index 291c9700be75..d252be2d3de5 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -1344,13 +1344,24 @@ int kill_proc_info(int sig, struct siginfo *info, pid_t pid)
 	return error;
 }
 
+static int kill_as_cred_perm(const struct cred *cred,
+			     struct task_struct *target)
+{
+	const struct cred *pcred = __task_cred(target);
+	if (cred->user_ns != pcred->user_ns)
+		return 0;
+	if (cred->euid != pcred->suid && cred->euid != pcred->uid &&
+	    cred->uid  != pcred->suid && cred->uid  != pcred->uid)
+		return 0;
+	return 1;
+}
+
 /* like kill_pid_info(), but doesn't use uid/euid of "current" */
-int kill_pid_info_as_uid(int sig, struct siginfo *info, struct pid *pid,
-		      uid_t uid, uid_t euid, u32 secid)
+int kill_pid_info_as_cred(int sig, struct siginfo *info, struct pid *pid,
+			 const struct cred *cred, u32 secid)
 {
 	int ret = -EINVAL;
 	struct task_struct *p;
-	const struct cred *pcred;
 	unsigned long flags;
 
 	if (!valid_signal(sig))
@@ -1362,10 +1373,7 @@ int kill_pid_info_as_uid(int sig, struct siginfo *info, struct pid *pid,
 		ret = -ESRCH;
 		goto out_unlock;
 	}
-	pcred = __task_cred(p);
-	if (si_fromuser(info) &&
-	    euid != pcred->suid && euid != pcred->uid &&
-	    uid  != pcred->suid && uid  != pcred->uid) {
+	if (si_fromuser(info) && !kill_as_cred_perm(cred, p)) {
 		ret = -EPERM;
 		goto out_unlock;
 	}
@@ -1384,7 +1392,7 @@ out_unlock:
 	rcu_read_unlock();
 	return ret;
 }
-EXPORT_SYMBOL_GPL(kill_pid_info_as_uid);
+EXPORT_SYMBOL_GPL(kill_pid_info_as_cred);
 
 /*
  * kill_something_info() interprets pid in interesting ways just like kill(2).
-- 
cgit v1.2.2


From 9d823e8f6b1b7b39f952d7d1795f29162143a433 Mon Sep 17 00:00:00 2001
From: Wu Fengguang <fengguang.wu@intel.com>
Date: Sat, 11 Jun 2011 18:10:12 -0600
Subject: writeback: per task dirty rate limit

Add two fields to task_struct.

1) account dirtied pages in the individual tasks, for accuracy
2) per-task balance_dirty_pages() call intervals, for flexibility

The balance_dirty_pages() call interval (ie. nr_dirtied_pause) will
scale near-sqrt to the safety gap between dirty pages and threshold.

The main problem of per-task nr_dirtied is, if 1k+ tasks start dirtying
pages at exactly the same time, each task will be assigned a large
initial nr_dirtied_pause, so that the dirty threshold will be exceeded
long before each task reached its nr_dirtied_pause and hence call
balance_dirty_pages().

The solution is to watch for the number of pages dirtied on each CPU in
between the calls into balance_dirty_pages(). If it exceeds ratelimit_pages
(3% dirty threshold), force call balance_dirty_pages() for a chance to
set bdi->dirty_exceeded. In normal situations, this safeguarding
condition is not expected to trigger at all.

On the sqrt in dirty_poll_interval():

It will serve as an initial guess when dirty pages are still in the
freerun area.

When dirty pages are floating inside the dirty control scope [freerun,
limit], a followup patch will use some refined dirty poll interval to
get the desired pause time.

   thresh-dirty (MB)    sqrt
		   1      16
		   2      22
		   4      32
		   8      45
		  16      64
		  32      90
		  64     128
		 128     181
		 256     256
		 512     362
		1024     512

The above table means, given 1MB (or 1GB) gap and the dd tasks polling
balance_dirty_pages() on every 16 (or 512) pages, the dirty limit won't
be exceeded as long as there are less than 16 (or 512) concurrent dd's.

So sqrt naturally leads to less overheads and more safe concurrent tasks
for large memory servers, which have large (thresh-freerun) gaps.

peter: keep the per-CPU ratelimit for safeguarding the 1k+ tasks case

CC: Peter Zijlstra <a.p.zijlstra@chello.nl>
Reviewed-by: Andrea Righi <andrea@betterlinux.com>
Signed-off-by: Wu Fengguang <fengguang.wu@intel.com>
---
 kernel/fork.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'kernel')

diff --git a/kernel/fork.c b/kernel/fork.c
index 8e6b6f4fb272..cc0815df99f2 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1302,6 +1302,9 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 	p->pdeath_signal = 0;
 	p->exit_state = 0;
 
+	p->nr_dirtied = 0;
+	p->nr_dirtied_pause = 128 >> (PAGE_SHIFT - 10);
+
 	/*
 	 * Ok, make it visible to the rest of the system.
 	 * We dont wake it up yet.
-- 
cgit v1.2.2


From 31d9d9b6d83030f748d013e61502fa5477e2ac0e Mon Sep 17 00:00:00 2001
From: Marc Zyngier <marc.zyngier@arm.com>
Date: Fri, 23 Sep 2011 17:03:06 +0100
Subject: genirq: Add support for per-cpu dev_id interrupts

The ARM GIC interrupt controller offers per CPU interrupts (PPIs),
which are usually used to connect local timers to each core. Each CPU
has its own private interface to the GIC, and only sees the PPIs that
are directly connect to it.

While these timers are separate devices and have a separate interrupt
line to a core, they all use the same IRQ number.

For these devices, request_irq() is not the right API as it assumes
that an IRQ number is visible by a number of CPUs (through the
affinity setting), but makes it very awkward to express that an IRQ
number can be handled by all CPUs, and yet be a different interrupt
line on each CPU, requiring a different dev_id cookie to be passed
back to the handler.

The *_percpu_irq() functions is designed to overcome these
limitations, by providing a per-cpu dev_id vector:

int request_percpu_irq(unsigned int irq, irq_handler_t handler,
		   const char *devname, void __percpu *percpu_dev_id);
void free_percpu_irq(unsigned int, void __percpu *);
int setup_percpu_irq(unsigned int irq, struct irqaction *new);
void remove_percpu_irq(unsigned int irq, struct irqaction *act);
void enable_percpu_irq(unsigned int irq);
void disable_percpu_irq(unsigned int irq);

The API has a number of limitations:
- no interrupt sharing
- no threading
- common handler across all the CPUs

Once the interrupt is requested using setup_percpu_irq() or
request_percpu_irq(), it must be enabled by each core that wishes its
local interrupt to be delivered.

Based on an initial patch by Thomas Gleixner.

Signed-off-by: Marc Zyngier <marc.zyngier@arm.com>
Cc: linux-arm-kernel@lists.infradead.org
Link: http://lkml.kernel.org/r/1316793788-14500-2-git-send-email-marc.zyngier@arm.com
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/irq/chip.c      |  64 ++++++++++++++--
 kernel/irq/internals.h |  19 +++--
 kernel/irq/irqdesc.c   |  32 +++++++-
 kernel/irq/manage.c    | 202 ++++++++++++++++++++++++++++++++++++++++++++++---
 kernel/irq/settings.h  |   7 ++
 5 files changed, 302 insertions(+), 22 deletions(-)

(limited to 'kernel')

diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index dc5114b4c16c..f7c543a801d9 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -26,7 +26,7 @@
 int irq_set_chip(unsigned int irq, struct irq_chip *chip)
 {
 	unsigned long flags;
-	struct irq_desc *desc = irq_get_desc_lock(irq, &flags);
+	struct irq_desc *desc = irq_get_desc_lock(irq, &flags, 0);
 
 	if (!desc)
 		return -EINVAL;
@@ -54,7 +54,7 @@ EXPORT_SYMBOL(irq_set_chip);
 int irq_set_irq_type(unsigned int irq, unsigned int type)
 {
 	unsigned long flags;
-	struct irq_desc *desc = irq_get_desc_buslock(irq, &flags);
+	struct irq_desc *desc = irq_get_desc_buslock(irq, &flags, IRQ_GET_DESC_CHECK_GLOBAL);
 	int ret = 0;
 
 	if (!desc)
@@ -78,7 +78,7 @@ EXPORT_SYMBOL(irq_set_irq_type);
 int irq_set_handler_data(unsigned int irq, void *data)
 {
 	unsigned long flags;
-	struct irq_desc *desc = irq_get_desc_lock(irq, &flags);
+	struct irq_desc *desc = irq_get_desc_lock(irq, &flags, 0);
 
 	if (!desc)
 		return -EINVAL;
@@ -98,7 +98,7 @@ EXPORT_SYMBOL(irq_set_handler_data);
 int irq_set_msi_desc(unsigned int irq, struct msi_desc *entry)
 {
 	unsigned long flags;
-	struct irq_desc *desc = irq_get_desc_lock(irq, &flags);
+	struct irq_desc *desc = irq_get_desc_lock(irq, &flags, IRQ_GET_DESC_CHECK_GLOBAL);
 
 	if (!desc)
 		return -EINVAL;
@@ -119,7 +119,7 @@ int irq_set_msi_desc(unsigned int irq, struct msi_desc *entry)
 int irq_set_chip_data(unsigned int irq, void *data)
 {
 	unsigned long flags;
-	struct irq_desc *desc = irq_get_desc_lock(irq, &flags);
+	struct irq_desc *desc = irq_get_desc_lock(irq, &flags, 0);
 
 	if (!desc)
 		return -EINVAL;
@@ -204,6 +204,24 @@ void irq_disable(struct irq_desc *desc)
 	}
 }
 
+void irq_percpu_enable(struct irq_desc *desc, unsigned int cpu)
+{
+	if (desc->irq_data.chip->irq_enable)
+		desc->irq_data.chip->irq_enable(&desc->irq_data);
+	else
+		desc->irq_data.chip->irq_unmask(&desc->irq_data);
+	cpumask_set_cpu(cpu, desc->percpu_enabled);
+}
+
+void irq_percpu_disable(struct irq_desc *desc, unsigned int cpu)
+{
+	if (desc->irq_data.chip->irq_disable)
+		desc->irq_data.chip->irq_disable(&desc->irq_data);
+	else
+		desc->irq_data.chip->irq_mask(&desc->irq_data);
+	cpumask_clear_cpu(cpu, desc->percpu_enabled);
+}
+
 static inline void mask_ack_irq(struct irq_desc *desc)
 {
 	if (desc->irq_data.chip->irq_mask_ack)
@@ -544,12 +562,44 @@ handle_percpu_irq(unsigned int irq, struct irq_desc *desc)
 		chip->irq_eoi(&desc->irq_data);
 }
 
+/**
+ * handle_percpu_devid_irq - Per CPU local irq handler with per cpu dev ids
+ * @irq:	the interrupt number
+ * @desc:	the interrupt description structure for this irq
+ *
+ * Per CPU interrupts on SMP machines without locking requirements. Same as
+ * handle_percpu_irq() above but with the following extras:
+ *
+ * action->percpu_dev_id is a pointer to percpu variables which
+ * contain the real device id for the cpu on which this handler is
+ * called
+ */
+void handle_percpu_devid_irq(unsigned int irq, struct irq_desc *desc)
+{
+	struct irq_chip *chip = irq_desc_get_chip(desc);
+	struct irqaction *action = desc->action;
+	void *dev_id = __this_cpu_ptr(action->percpu_dev_id);
+	irqreturn_t res;
+
+	kstat_incr_irqs_this_cpu(irq, desc);
+
+	if (chip->irq_ack)
+		chip->irq_ack(&desc->irq_data);
+
+	trace_irq_handler_entry(irq, action);
+	res = action->handler(irq, dev_id);
+	trace_irq_handler_exit(irq, action, res);
+
+	if (chip->irq_eoi)
+		chip->irq_eoi(&desc->irq_data);
+}
+
 void
 __irq_set_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained,
 		  const char *name)
 {
 	unsigned long flags;
-	struct irq_desc *desc = irq_get_desc_buslock(irq, &flags);
+	struct irq_desc *desc = irq_get_desc_buslock(irq, &flags, 0);
 
 	if (!desc)
 		return;
@@ -593,7 +643,7 @@ irq_set_chip_and_handler_name(unsigned int irq, struct irq_chip *chip,
 void irq_modify_status(unsigned int irq, unsigned long clr, unsigned long set)
 {
 	unsigned long flags;
-	struct irq_desc *desc = irq_get_desc_lock(irq, &flags);
+	struct irq_desc *desc = irq_get_desc_lock(irq, &flags, 0);
 
 	if (!desc)
 		return;
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h
index 6546431447d7..a73dd6c7372d 100644
--- a/kernel/irq/internals.h
+++ b/kernel/irq/internals.h
@@ -71,6 +71,8 @@ extern int irq_startup(struct irq_desc *desc);
 extern void irq_shutdown(struct irq_desc *desc);
 extern void irq_enable(struct irq_desc *desc);
 extern void irq_disable(struct irq_desc *desc);
+extern void irq_percpu_enable(struct irq_desc *desc, unsigned int cpu);
+extern void irq_percpu_disable(struct irq_desc *desc, unsigned int cpu);
 extern void mask_irq(struct irq_desc *desc);
 extern void unmask_irq(struct irq_desc *desc);
 
@@ -114,14 +116,21 @@ static inline void chip_bus_sync_unlock(struct irq_desc *desc)
 		desc->irq_data.chip->irq_bus_sync_unlock(&desc->irq_data);
 }
 
+#define _IRQ_DESC_CHECK		(1 << 0)
+#define _IRQ_DESC_PERCPU	(1 << 1)
+
+#define IRQ_GET_DESC_CHECK_GLOBAL	(_IRQ_DESC_CHECK)
+#define IRQ_GET_DESC_CHECK_PERCPU	(_IRQ_DESC_CHECK | _IRQ_DESC_PERCPU)
+
 struct irq_desc *
-__irq_get_desc_lock(unsigned int irq, unsigned long *flags, bool bus);
+__irq_get_desc_lock(unsigned int irq, unsigned long *flags, bool bus,
+		    unsigned int check);
 void __irq_put_desc_unlock(struct irq_desc *desc, unsigned long flags, bool bus);
 
 static inline struct irq_desc *
-irq_get_desc_buslock(unsigned int irq, unsigned long *flags)
+irq_get_desc_buslock(unsigned int irq, unsigned long *flags, unsigned int check)
 {
-	return __irq_get_desc_lock(irq, flags, true);
+	return __irq_get_desc_lock(irq, flags, true, check);
 }
 
 static inline void
@@ -131,9 +140,9 @@ irq_put_desc_busunlock(struct irq_desc *desc, unsigned long flags)
 }
 
 static inline struct irq_desc *
-irq_get_desc_lock(unsigned int irq, unsigned long *flags)
+irq_get_desc_lock(unsigned int irq, unsigned long *flags, unsigned int check)
 {
-	return __irq_get_desc_lock(irq, flags, false);
+	return __irq_get_desc_lock(irq, flags, false, check);
 }
 
 static inline void
diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c
index 039b889ea053..1550e8447a16 100644
--- a/kernel/irq/irqdesc.c
+++ b/kernel/irq/irqdesc.c
@@ -424,11 +424,22 @@ unsigned int irq_get_next_irq(unsigned int offset)
 }
 
 struct irq_desc *
-__irq_get_desc_lock(unsigned int irq, unsigned long *flags, bool bus)
+__irq_get_desc_lock(unsigned int irq, unsigned long *flags, bool bus,
+		    unsigned int check)
 {
 	struct irq_desc *desc = irq_to_desc(irq);
 
 	if (desc) {
+		if (check & _IRQ_DESC_CHECK) {
+			if ((check & _IRQ_DESC_PERCPU) &&
+			    !irq_settings_is_per_cpu_devid(desc))
+				return NULL;
+
+			if (!(check & _IRQ_DESC_PERCPU) &&
+			    irq_settings_is_per_cpu_devid(desc))
+				return NULL;
+		}
+
 		if (bus)
 			chip_bus_lock(desc);
 		raw_spin_lock_irqsave(&desc->lock, *flags);
@@ -443,6 +454,25 @@ void __irq_put_desc_unlock(struct irq_desc *desc, unsigned long flags, bool bus)
 		chip_bus_sync_unlock(desc);
 }
 
+int irq_set_percpu_devid(unsigned int irq)
+{
+	struct irq_desc *desc = irq_to_desc(irq);
+
+	if (!desc)
+		return -EINVAL;
+
+	if (desc->percpu_enabled)
+		return -EINVAL;
+
+	desc->percpu_enabled = kzalloc(sizeof(*desc->percpu_enabled), GFP_KERNEL);
+
+	if (!desc->percpu_enabled)
+		return -ENOMEM;
+
+	irq_set_percpu_devid_flags(irq);
+	return 0;
+}
+
 /**
  * dynamic_irq_cleanup - cleanup a dynamically allocated irq
  * @irq:	irq number to initialize
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 7e1a3ed1e61a..7b4b156d065c 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -195,7 +195,7 @@ int irq_set_affinity(unsigned int irq, const struct cpumask *mask)
 int irq_set_affinity_hint(unsigned int irq, const struct cpumask *m)
 {
 	unsigned long flags;
-	struct irq_desc *desc = irq_get_desc_lock(irq, &flags);
+	struct irq_desc *desc = irq_get_desc_lock(irq, &flags, IRQ_GET_DESC_CHECK_GLOBAL);
 
 	if (!desc)
 		return -EINVAL;
@@ -356,7 +356,7 @@ void __disable_irq(struct irq_desc *desc, unsigned int irq, bool suspend)
 static int __disable_irq_nosync(unsigned int irq)
 {
 	unsigned long flags;
-	struct irq_desc *desc = irq_get_desc_buslock(irq, &flags);
+	struct irq_desc *desc = irq_get_desc_buslock(irq, &flags, IRQ_GET_DESC_CHECK_GLOBAL);
 
 	if (!desc)
 		return -EINVAL;
@@ -448,7 +448,7 @@ void __enable_irq(struct irq_desc *desc, unsigned int irq, bool resume)
 void enable_irq(unsigned int irq)
 {
 	unsigned long flags;
-	struct irq_desc *desc = irq_get_desc_buslock(irq, &flags);
+	struct irq_desc *desc = irq_get_desc_buslock(irq, &flags, IRQ_GET_DESC_CHECK_GLOBAL);
 
 	if (!desc)
 		return;
@@ -491,7 +491,7 @@ static int set_irq_wake_real(unsigned int irq, unsigned int on)
 int irq_set_irq_wake(unsigned int irq, unsigned int on)
 {
 	unsigned long flags;
-	struct irq_desc *desc = irq_get_desc_buslock(irq, &flags);
+	struct irq_desc *desc = irq_get_desc_buslock(irq, &flags, IRQ_GET_DESC_CHECK_GLOBAL);
 	int ret = 0;
 
 	if (!desc)
@@ -532,7 +532,7 @@ EXPORT_SYMBOL(irq_set_irq_wake);
 int can_request_irq(unsigned int irq, unsigned long irqflags)
 {
 	unsigned long flags;
-	struct irq_desc *desc = irq_get_desc_lock(irq, &flags);
+	struct irq_desc *desc = irq_get_desc_lock(irq, &flags, 0);
 	int canrequest = 0;
 
 	if (!desc)
@@ -1121,6 +1121,8 @@ int setup_irq(unsigned int irq, struct irqaction *act)
 	int retval;
 	struct irq_desc *desc = irq_to_desc(irq);
 
+	if (WARN_ON(irq_settings_is_per_cpu_devid(desc)))
+		return -EINVAL;
 	chip_bus_lock(desc);
 	retval = __setup_irq(irq, desc, act);
 	chip_bus_sync_unlock(desc);
@@ -1129,7 +1131,7 @@ int setup_irq(unsigned int irq, struct irqaction *act)
 }
 EXPORT_SYMBOL_GPL(setup_irq);
 
- /*
+/*
  * Internal function to unregister an irqaction - used to free
  * regular and special interrupts that are part of the architecture.
  */
@@ -1227,7 +1229,10 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id)
  */
 void remove_irq(unsigned int irq, struct irqaction *act)
 {
-	__free_irq(irq, act->dev_id);
+	struct irq_desc *desc = irq_to_desc(irq);
+
+	if (desc && !WARN_ON(irq_settings_is_per_cpu_devid(desc)))
+	    __free_irq(irq, act->dev_id);
 }
 EXPORT_SYMBOL_GPL(remove_irq);
 
@@ -1249,7 +1254,7 @@ void free_irq(unsigned int irq, void *dev_id)
 {
 	struct irq_desc *desc = irq_to_desc(irq);
 
-	if (!desc)
+	if (!desc || WARN_ON(irq_settings_is_per_cpu_devid(desc)))
 		return;
 
 #ifdef CONFIG_SMP
@@ -1327,7 +1332,8 @@ int request_threaded_irq(unsigned int irq, irq_handler_t handler,
 	if (!desc)
 		return -EINVAL;
 
-	if (!irq_settings_can_request(desc))
+	if (!irq_settings_can_request(desc) ||
+	    WARN_ON(irq_settings_is_per_cpu_devid(desc)))
 		return -EINVAL;
 
 	if (!handler) {
@@ -1412,3 +1418,181 @@ int request_any_context_irq(unsigned int irq, irq_handler_t handler,
 	return !ret ? IRQC_IS_HARDIRQ : ret;
 }
 EXPORT_SYMBOL_GPL(request_any_context_irq);
+
+void enable_percpu_irq(unsigned int irq)
+{
+	unsigned int cpu = smp_processor_id();
+	unsigned long flags;
+	struct irq_desc *desc = irq_get_desc_lock(irq, &flags, IRQ_GET_DESC_CHECK_PERCPU);
+
+	if (!desc)
+		return;
+
+	irq_percpu_enable(desc, cpu);
+	irq_put_desc_unlock(desc, flags);
+}
+
+void disable_percpu_irq(unsigned int irq)
+{
+	unsigned int cpu = smp_processor_id();
+	unsigned long flags;
+	struct irq_desc *desc = irq_get_desc_lock(irq, &flags, IRQ_GET_DESC_CHECK_PERCPU);
+
+	if (!desc)
+		return;
+
+	irq_percpu_disable(desc, cpu);
+	irq_put_desc_unlock(desc, flags);
+}
+
+/*
+ * Internal function to unregister a percpu irqaction.
+ */
+static struct irqaction *__free_percpu_irq(unsigned int irq, void __percpu *dev_id)
+{
+	struct irq_desc *desc = irq_to_desc(irq);
+	struct irqaction *action;
+	unsigned long flags;
+
+	WARN(in_interrupt(), "Trying to free IRQ %d from IRQ context!\n", irq);
+
+	if (!desc)
+		return NULL;
+
+	raw_spin_lock_irqsave(&desc->lock, flags);
+
+	action = desc->action;
+	if (!action || action->percpu_dev_id != dev_id) {
+		WARN(1, "Trying to free already-free IRQ %d\n", irq);
+		goto bad;
+	}
+
+	if (!cpumask_empty(desc->percpu_enabled)) {
+		WARN(1, "percpu IRQ %d still enabled on CPU%d!\n",
+		     irq, cpumask_first(desc->percpu_enabled));
+		goto bad;
+	}
+
+	/* Found it - now remove it from the list of entries: */
+	desc->action = NULL;
+
+	raw_spin_unlock_irqrestore(&desc->lock, flags);
+
+	unregister_handler_proc(irq, action);
+
+	module_put(desc->owner);
+	return action;
+
+bad:
+	raw_spin_unlock_irqrestore(&desc->lock, flags);
+	return NULL;
+}
+
+/**
+ *	remove_percpu_irq - free a per-cpu interrupt
+ *	@irq: Interrupt line to free
+ *	@act: irqaction for the interrupt
+ *
+ * Used to remove interrupts statically setup by the early boot process.
+ */
+void remove_percpu_irq(unsigned int irq, struct irqaction *act)
+{
+	struct irq_desc *desc = irq_to_desc(irq);
+
+	if (desc && irq_settings_is_per_cpu_devid(desc))
+	    __free_percpu_irq(irq, act->percpu_dev_id);
+}
+
+/**
+ *	free_percpu_irq - free an interrupt allocated with request_percpu_irq
+ *	@irq: Interrupt line to free
+ *	@dev_id: Device identity to free
+ *
+ *	Remove a percpu interrupt handler. The handler is removed, but
+ *	the interrupt line is not disabled. This must be done on each
+ *	CPU before calling this function. The function does not return
+ *	until any executing interrupts for this IRQ have completed.
+ *
+ *	This function must not be called from interrupt context.
+ */
+void free_percpu_irq(unsigned int irq, void __percpu *dev_id)
+{
+	struct irq_desc *desc = irq_to_desc(irq);
+
+	if (!desc || !irq_settings_is_per_cpu_devid(desc))
+		return;
+
+	chip_bus_lock(desc);
+	kfree(__free_percpu_irq(irq, dev_id));
+	chip_bus_sync_unlock(desc);
+}
+
+/**
+ *	setup_percpu_irq - setup a per-cpu interrupt
+ *	@irq: Interrupt line to setup
+ *	@act: irqaction for the interrupt
+ *
+ * Used to statically setup per-cpu interrupts in the early boot process.
+ */
+int setup_percpu_irq(unsigned int irq, struct irqaction *act)
+{
+	struct irq_desc *desc = irq_to_desc(irq);
+	int retval;
+
+	if (!desc || !irq_settings_is_per_cpu_devid(desc))
+		return -EINVAL;
+	chip_bus_lock(desc);
+	retval = __setup_irq(irq, desc, act);
+	chip_bus_sync_unlock(desc);
+
+	return retval;
+}
+
+/**
+ *	request_percpu_irq - allocate a percpu interrupt line
+ *	@irq: Interrupt line to allocate
+ *	@handler: Function to be called when the IRQ occurs.
+ *	@devname: An ascii name for the claiming device
+ *	@dev_id: A percpu cookie passed back to the handler function
+ *
+ *	This call allocates interrupt resources, but doesn't
+ *	automatically enable the interrupt. It has to be done on each
+ *	CPU using enable_percpu_irq().
+ *
+ *	Dev_id must be globally unique. It is a per-cpu variable, and
+ *	the handler gets called with the interrupted CPU's instance of
+ *	that variable.
+ */
+int request_percpu_irq(unsigned int irq, irq_handler_t handler,
+		       const char *devname, void __percpu *dev_id)
+{
+	struct irqaction *action;
+	struct irq_desc *desc;
+	int retval;
+
+	if (!dev_id)
+		return -EINVAL;
+
+	desc = irq_to_desc(irq);
+	if (!desc || !irq_settings_can_request(desc) ||
+	    !irq_settings_is_per_cpu_devid(desc))
+		return -EINVAL;
+
+	action = kzalloc(sizeof(struct irqaction), GFP_KERNEL);
+	if (!action)
+		return -ENOMEM;
+
+	action->handler = handler;
+	action->flags = IRQF_PERCPU;
+	action->name = devname;
+	action->percpu_dev_id = dev_id;
+
+	chip_bus_lock(desc);
+	retval = __setup_irq(irq, desc, action);
+	chip_bus_sync_unlock(desc);
+
+	if (retval)
+		kfree(action);
+
+	return retval;
+}
diff --git a/kernel/irq/settings.h b/kernel/irq/settings.h
index f1667833d444..1162f1030f18 100644
--- a/kernel/irq/settings.h
+++ b/kernel/irq/settings.h
@@ -13,6 +13,7 @@ enum {
 	_IRQ_MOVE_PCNTXT	= IRQ_MOVE_PCNTXT,
 	_IRQ_NO_BALANCING	= IRQ_NO_BALANCING,
 	_IRQ_NESTED_THREAD	= IRQ_NESTED_THREAD,
+	_IRQ_PER_CPU_DEVID	= IRQ_PER_CPU_DEVID,
 	_IRQF_MODIFY_MASK	= IRQF_MODIFY_MASK,
 };
 
@@ -24,6 +25,7 @@ enum {
 #define IRQ_NOTHREAD		GOT_YOU_MORON
 #define IRQ_NOAUTOEN		GOT_YOU_MORON
 #define IRQ_NESTED_THREAD	GOT_YOU_MORON
+#define IRQ_PER_CPU_DEVID	GOT_YOU_MORON
 #undef IRQF_MODIFY_MASK
 #define IRQF_MODIFY_MASK	GOT_YOU_MORON
 
@@ -39,6 +41,11 @@ static inline bool irq_settings_is_per_cpu(struct irq_desc *desc)
 	return desc->status_use_accessors & _IRQ_PER_CPU;
 }
 
+static inline bool irq_settings_is_per_cpu_devid(struct irq_desc *desc)
+{
+	return desc->status_use_accessors & _IRQ_PER_CPU_DEVID;
+}
+
 static inline void irq_settings_set_per_cpu(struct irq_desc *desc)
 {
 	desc->status_use_accessors |= _IRQ_PER_CPU;
-- 
cgit v1.2.2


From 1e7c5fd29487ee88cb3abac945bafa60ae026146 Mon Sep 17 00:00:00 2001
From: Marc Zyngier <marc.zyngier@arm.com>
Date: Fri, 30 Sep 2011 10:48:47 +0100
Subject: genirq: percpu: allow interrupt type to be set at enable time

As request_percpu_irq() doesn't allow for a percpu interrupt to have
its type configured (it is generally impossible to configure it on all
CPUs at once), add a 'type' argument to enable_percpu_irq().

This allows some low-level, board specific init code to be switched to
a generic API.

[ tglx: Added WARN_ON argument ]

Signed-off-by: Marc Zyngier <marc.zyngier@arm.com>
Cc: Abhijeet Dharmapurikar <adharmap@codeaurora.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/irq/manage.c | 15 ++++++++++++++-
 1 file changed, 14 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 7b4b156d065c..2bc86869859e 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -1419,7 +1419,7 @@ int request_any_context_irq(unsigned int irq, irq_handler_t handler,
 }
 EXPORT_SYMBOL_GPL(request_any_context_irq);
 
-void enable_percpu_irq(unsigned int irq)
+void enable_percpu_irq(unsigned int irq, unsigned int type)
 {
 	unsigned int cpu = smp_processor_id();
 	unsigned long flags;
@@ -1428,7 +1428,20 @@ void enable_percpu_irq(unsigned int irq)
 	if (!desc)
 		return;
 
+	type &= IRQ_TYPE_SENSE_MASK;
+	if (type != IRQ_TYPE_NONE) {
+		int ret;
+
+		ret = __irq_set_trigger(desc, irq, type);
+
+		if (ret) {
+			WARN(1, "failed to set type for IRQ%d\n, irq");
+			goto out;
+		}
+	}
+
 	irq_percpu_enable(desc, cpu);
+out:
 	irq_put_desc_unlock(desc, flags);
 }
 
-- 
cgit v1.2.2


From 349d2895cc8b7db1f5be677cd685209a3805d2ed Mon Sep 17 00:00:00 2001
From: Vasily Averin <vvs@sw.ru>
Date: Fri, 30 Sep 2011 01:11:10 +0000
Subject: ipv4: NET_IPV4_ROUTE_GC_INTERVAL removal

removing obsoleted sysctl,
ip_rt_gc_interval variable no longer used since 2.6.38

Signed-off-by: Vasily Averin <vvs@sw.ru>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 kernel/sysctl_binary.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c
index e8bffbe2ba4b..6318b511afa1 100644
--- a/kernel/sysctl_binary.c
+++ b/kernel/sysctl_binary.c
@@ -214,7 +214,7 @@ static const struct bin_table bin_net_ipv4_route_table[] = {
 	{ CTL_INT,	NET_IPV4_ROUTE_GC_MIN_INTERVAL,		"gc_min_interval" },
 	{ CTL_INT,	NET_IPV4_ROUTE_GC_MIN_INTERVAL_MS,	"gc_min_interval_ms" },
 	{ CTL_INT,	NET_IPV4_ROUTE_GC_TIMEOUT,		"gc_timeout" },
-	{ CTL_INT,	NET_IPV4_ROUTE_GC_INTERVAL,		"gc_interval" },
+	/* NET_IPV4_ROUTE_GC_INTERVAL "gc_interval" no longer used */
 	{ CTL_INT,	NET_IPV4_ROUTE_REDIRECT_LOAD,		"redirect_load" },
 	{ CTL_INT,	NET_IPV4_ROUTE_REDIRECT_NUMBER,		"redirect_number" },
 	{ CTL_INT,	NET_IPV4_ROUTE_REDIRECT_SILENCE,	"redirect_silence" },
-- 
cgit v1.2.2


From 38aaf8090d34b623b7919d8c933f6e938c9bf44b Mon Sep 17 00:00:00 2001
From: Huang Ying <ying.huang@intel.com>
Date: Thu, 8 Sep 2011 14:00:46 +0800
Subject: irq_work: Use llist in the struct irq_work logic

Use llist in irq_work instead of the lock-less linked list
implementation in irq_work to avoid the code duplication.

Signed-off-by: Huang Ying <ying.huang@intel.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Link: http://lkml.kernel.org/r/1315461646-1379-6-git-send-email-ying.huang@intel.com
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/irq_work.c | 91 ++++++++++++++++++++-----------------------------------
 1 file changed, 33 insertions(+), 58 deletions(-)

(limited to 'kernel')

diff --git a/kernel/irq_work.c b/kernel/irq_work.c
index c58fa7da8aef..6f0a4310defd 100644
--- a/kernel/irq_work.c
+++ b/kernel/irq_work.c
@@ -17,54 +17,34 @@
  * claimed   NULL, 3 -> {pending}       : claimed to be enqueued
  * pending   next, 3 -> {busy}          : queued, pending callback
  * busy      NULL, 2 -> {free, claimed} : callback in progress, can be claimed
- *
- * We use the lower two bits of the next pointer to keep PENDING and BUSY
- * flags.
  */
 
 #define IRQ_WORK_PENDING	1UL
 #define IRQ_WORK_BUSY		2UL
 #define IRQ_WORK_FLAGS		3UL
 
-static inline bool irq_work_is_set(struct irq_work *entry, int flags)
-{
-	return (unsigned long)entry->next & flags;
-}
-
-static inline struct irq_work *irq_work_next(struct irq_work *entry)
-{
-	unsigned long next = (unsigned long)entry->next;
-	next &= ~IRQ_WORK_FLAGS;
-	return (struct irq_work *)next;
-}
-
-static inline struct irq_work *next_flags(struct irq_work *entry, int flags)
-{
-	unsigned long next = (unsigned long)entry;
-	next |= flags;
-	return (struct irq_work *)next;
-}
-
-static DEFINE_PER_CPU(struct irq_work *, irq_work_list);
+static DEFINE_PER_CPU(struct llist_head, irq_work_list);
 
 /*
  * Claim the entry so that no one else will poke at it.
  */
-static bool irq_work_claim(struct irq_work *entry)
+static bool irq_work_claim(struct irq_work *work)
 {
-	struct irq_work *next, *nflags;
+	unsigned long flags, nflags;
 
-	do {
-		next = entry->next;
-		if ((unsigned long)next & IRQ_WORK_PENDING)
+	for (;;) {
+		flags = work->flags;
+		if (flags & IRQ_WORK_PENDING)
 			return false;
-		nflags = next_flags(next, IRQ_WORK_FLAGS);
-	} while (cmpxchg(&entry->next, next, nflags) != next);
+		nflags = flags | IRQ_WORK_FLAGS;
+		if (cmpxchg(&work->flags, flags, nflags) == flags)
+			break;
+		cpu_relax();
+	}
 
 	return true;
 }
 
-
 void __weak arch_irq_work_raise(void)
 {
 	/*
@@ -75,20 +55,15 @@ void __weak arch_irq_work_raise(void)
 /*
  * Queue the entry and raise the IPI if needed.
  */
-static void __irq_work_queue(struct irq_work *entry)
+static void __irq_work_queue(struct irq_work *work)
 {
-	struct irq_work *next;
+	bool empty;
 
 	preempt_disable();
 
-	do {
-		next = __this_cpu_read(irq_work_list);
-		/* Can assign non-atomic because we keep the flags set. */
-		entry->next = next_flags(next, IRQ_WORK_FLAGS);
-	} while (this_cpu_cmpxchg(irq_work_list, next, entry) != next);
-
+	empty = llist_add(&work->llnode, &__get_cpu_var(irq_work_list));
 	/* The list was empty, raise self-interrupt to start processing. */
-	if (!irq_work_next(entry))
+	if (empty)
 		arch_irq_work_raise();
 
 	preempt_enable();
@@ -100,16 +75,16 @@ static void __irq_work_queue(struct irq_work *entry)
  *
  * Can be re-enqueued while the callback is still in progress.
  */
-bool irq_work_queue(struct irq_work *entry)
+bool irq_work_queue(struct irq_work *work)
 {
-	if (!irq_work_claim(entry)) {
+	if (!irq_work_claim(work)) {
 		/*
 		 * Already enqueued, can't do!
 		 */
 		return false;
 	}
 
-	__irq_work_queue(entry);
+	__irq_work_queue(work);
 	return true;
 }
 EXPORT_SYMBOL_GPL(irq_work_queue);
@@ -120,34 +95,34 @@ EXPORT_SYMBOL_GPL(irq_work_queue);
  */
 void irq_work_run(void)
 {
-	struct irq_work *list;
+	struct irq_work *work;
+	struct llist_head *this_list;
+	struct llist_node *llnode;
 
-	if (this_cpu_read(irq_work_list) == NULL)
+	this_list = &__get_cpu_var(irq_work_list);
+	if (llist_empty(this_list))
 		return;
 
 	BUG_ON(!in_irq());
 	BUG_ON(!irqs_disabled());
 
-	list = this_cpu_xchg(irq_work_list, NULL);
-
-	while (list != NULL) {
-		struct irq_work *entry = list;
+	llnode = llist_del_all(this_list);
+	while (llnode != NULL) {
+		work = llist_entry(llnode, struct irq_work, llnode);
 
-		list = irq_work_next(list);
+		llnode = llnode->next;
 
 		/*
-		 * Clear the PENDING bit, after this point the @entry
+		 * Clear the PENDING bit, after this point the @work
 		 * can be re-used.
 		 */
-		entry->next = next_flags(NULL, IRQ_WORK_BUSY);
-		entry->func(entry);
+		work->flags = IRQ_WORK_BUSY;
+		work->func(work);
 		/*
 		 * Clear the BUSY bit and return to the free state if
 		 * no-one else claimed it meanwhile.
 		 */
-		(void)cmpxchg(&entry->next,
-			      next_flags(NULL, IRQ_WORK_BUSY),
-			      NULL);
+		(void)cmpxchg(&work->flags, IRQ_WORK_BUSY, 0);
 	}
 }
 EXPORT_SYMBOL_GPL(irq_work_run);
@@ -156,11 +131,11 @@ EXPORT_SYMBOL_GPL(irq_work_run);
  * Synchronize against the irq_work @entry, ensures the entry is not
  * currently in use.
  */
-void irq_work_sync(struct irq_work *entry)
+void irq_work_sync(struct irq_work *work)
 {
 	WARN_ON_ONCE(irqs_disabled());
 
-	while (irq_work_is_set(entry, IRQ_WORK_BUSY))
+	while (work->flags & IRQ_WORK_BUSY)
 		cpu_relax();
 }
 EXPORT_SYMBOL_GPL(irq_work_sync);
-- 
cgit v1.2.2


From 924f8f5af31423529cc3940cb2ae9fee736b7517 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Mon, 12 Sep 2011 13:12:28 +0200
Subject: llist: Add llist_next()

So we don't have to expose the struct list_node member.

Cc: Huang Ying <ying.huang@intel.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Link: http://lkml.kernel.org/r/1315836348.26517.41.camel@twins
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/irq_work.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/irq_work.c b/kernel/irq_work.c
index 6f0a4310defd..0e2cde4f380b 100644
--- a/kernel/irq_work.c
+++ b/kernel/irq_work.c
@@ -110,7 +110,7 @@ void irq_work_run(void)
 	while (llnode != NULL) {
 		work = llist_entry(llnode, struct irq_work, llnode);
 
-		llnode = llnode->next;
+		llnode = llist_next(llnode);
 
 		/*
 		 * Clear the PENDING bit, after this point the @work
-- 
cgit v1.2.2


From fa14ff4accfb24e59d2473f3d864d6648d80563b Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Mon, 12 Sep 2011 13:06:17 +0200
Subject: sched: Convert to struct llist

Use the generic llist primitives.

We had a private lockless list implementation in the scheduler in the wake-list
code, now that we have a generic llist implementation that provides all required
operations, switch to it.

This patch is not expected to change any behavior.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Huang Ying <ying.huang@intel.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Link: http://lkml.kernel.org/r/1315836353.26517.42.camel@twins
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c | 48 ++++++++++--------------------------------------
 1 file changed, 10 insertions(+), 38 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index c5cf15e1eb57..1874c7418319 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -702,7 +702,7 @@ struct rq {
 #endif
 
 #ifdef CONFIG_SMP
-	struct task_struct *wake_list;
+	struct llist_head wake_list;
 #endif
 };
 
@@ -2698,42 +2698,26 @@ static int ttwu_remote(struct task_struct *p, int wake_flags)
 }
 
 #ifdef CONFIG_SMP
-static void sched_ttwu_do_pending(struct task_struct *list)
+static void sched_ttwu_pending(void)
 {
 	struct rq *rq = this_rq();
+	struct llist_node *llist = llist_del_all(&rq->wake_list);
+	struct task_struct *p;
 
 	raw_spin_lock(&rq->lock);
 
-	while (list) {
-		struct task_struct *p = list;
-		list = list->wake_entry;
+	while (llist) {
+		p = llist_entry(llist, struct task_struct, wake_entry);
+		llist = llist_next(llist);
 		ttwu_do_activate(rq, p, 0);
 	}
 
 	raw_spin_unlock(&rq->lock);
 }
 
-#ifdef CONFIG_HOTPLUG_CPU
-
-static void sched_ttwu_pending(void)
-{
-	struct rq *rq = this_rq();
-	struct task_struct *list = xchg(&rq->wake_list, NULL);
-
-	if (!list)
-		return;
-
-	sched_ttwu_do_pending(list);
-}
-
-#endif /* CONFIG_HOTPLUG_CPU */
-
 void scheduler_ipi(void)
 {
-	struct rq *rq = this_rq();
-	struct task_struct *list = xchg(&rq->wake_list, NULL);
-
-	if (!list)
+	if (llist_empty(&this_rq()->wake_list))
 		return;
 
 	/*
@@ -2750,25 +2734,13 @@ void scheduler_ipi(void)
 	 * somewhat pessimize the simple resched case.
 	 */
 	irq_enter();
-	sched_ttwu_do_pending(list);
+	sched_ttwu_pending();
 	irq_exit();
 }
 
 static void ttwu_queue_remote(struct task_struct *p, int cpu)
 {
-	struct rq *rq = cpu_rq(cpu);
-	struct task_struct *next = rq->wake_list;
-
-	for (;;) {
-		struct task_struct *old = next;
-
-		p->wake_entry = next;
-		next = cmpxchg(&rq->wake_list, old, p);
-		if (next == old)
-			break;
-	}
-
-	if (!next)
+	if (llist_add(&p->wake_entry, &cpu_rq(cpu)->wake_list))
 		smp_send_reschedule(cpu);
 }
 
-- 
cgit v1.2.2


From 908a3283728d92df36e0c7cd63304fd35e93a8a9 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Thu, 15 Sep 2011 15:32:06 +0200
Subject: sched: Fix idle_cpu()

On -rt we observed hackbench waking all 400 tasks to a single cpu.
This is because of select_idle_sibling()'s interaction with the new
ipi based wakeup scheme.

The existing idle_cpu() test only checks to see if the current task on
that cpu is the idle task, it does not take already queued tasks into
account, nor does it take queued to be woken tasks into account.

If the remote wakeup IPIs come hard enough, there won't be time to
schedule away from the idle task, and would thus keep thinking the cpu
was in fact idle, regardless of the fact that there were already
several hundred tasks runnable.

We couldn't reproduce on mainline, but there's no reason it couldn't
happen.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Link: http://lkml.kernel.org/n/tip-3o30p18b2paswpc9ohy2gltp@git.kernel.org
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c | 15 ++++++++++++++-
 1 file changed, 14 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index 1874c7418319..4cdc91cf48f6 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -5138,7 +5138,20 @@ EXPORT_SYMBOL(task_nice);
  */
 int idle_cpu(int cpu)
 {
-	return cpu_curr(cpu) == cpu_rq(cpu)->idle;
+	struct rq *rq = cpu_rq(cpu);
+
+	if (rq->curr != rq->idle)
+		return 0;
+
+	if (rq->nr_running)
+		return 0;
+
+#ifdef CONFIG_SMP
+	if (!llist_empty(&rq->wake_list))
+		return 0;
+#endif
+
+	return 1;
 }
 
 /**
-- 
cgit v1.2.2


From 32cffdde4a3ee6c2d9e0f0a94edecf1a9ce7586b Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Tue, 4 Oct 2011 18:43:57 +0200
Subject: genirq: Fix fatfinered fixup really

Putting the argument inside the quote does not really help.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/irq/manage.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 2bc86869859e..67ce837ae52c 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -1435,7 +1435,7 @@ void enable_percpu_irq(unsigned int irq, unsigned int type)
 		ret = __irq_set_trigger(desc, irq, type);
 
 		if (ret) {
-			WARN(1, "failed to set type for IRQ%d\n, irq");
+			WARN(1, "failed to set type for IRQ%d\n", irq);
 			goto out;
 		}
 	}
-- 
cgit v1.2.2


From 68cc3990a545dc0da221b4844dd8b9c06623a6c5 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Wed, 5 Oct 2011 13:20:24 +0200
Subject: rtmutex: Add missing rcu_read_unlock() in
 debug_rt_mutex_print_deadlock()

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/rtmutex-debug.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/rtmutex-debug.c b/kernel/rtmutex-debug.c
index cb1ced6967ad..a2e7e7210f3e 100644
--- a/kernel/rtmutex-debug.c
+++ b/kernel/rtmutex-debug.c
@@ -94,8 +94,10 @@ void debug_rt_mutex_print_deadlock(struct rt_mutex_waiter *waiter)
 		return;
 	}
 
-	if (!debug_locks_off())
+	if (!debug_locks_off()) {
+		rcu_read_unlock();
 		return;
+	}
 
 	printk("\n============================================\n");
 	printk(  "[ BUG: circular locking deadlock detected! ]\n");
-- 
cgit v1.2.2


From ca38062e57e97791c2f62e3dbd06caf3ebb5721c Mon Sep 17 00:00:00 2001
From: Suresh Siddha <suresh.b.siddha@intel.com>
Date: Mon, 3 Oct 2011 15:09:00 -0700
Subject: sched: Use resched IPI to kick off the nohz idle balance

Current use of smp call function to kick the nohz idle balance can deadlock
in this scenario.

1. cpu-A did a generic_exec_single() to cpu-B and after queuing its call single
data (csd) to the call single queue, cpu-A took a timer interrupt.  Actual IPI
to cpu-B to process the call single queue is not yet sent.

2. As part of the timer interrupt handler, cpu-A decided to kick cpu-B
for the idle load balancing (sets cpu-B's rq->nohz_balance_kick to 1)
and __smp_call_function_single() with nowait will queue the csd to the
cpu-B's queue. But the generic_exec_single() won't send an IPI to cpu-B
as the call single queue was not empty.

3. cpu-A is busy with lot of interrupts

4. Meanwhile cpu-B is entering and exiting idle and noticed that it has
it's rq->nohz_balance_kick set to '1'. So it will go ahead and do the
idle load balancer and clear its rq->nohz_balance_kick.

5. At this point, csd queued as part of the step-2 above is still locked
and waiting to be serviced on cpu-B.

6. cpu-A is still busy with interrupt load and now it got another timer
interrupt and as part of it decided to kick cpu-B for another idle load
balancing (as it finds cpu-B's rq->nohz_balance_kick cleared in step-4
above) and does __smp_call_function_single() with the same csd that is
still locked.

7. And we get a deadlock waiting for the csd_lock() in the
__smp_call_function_single().

Main issue here is that cpu-B can service the idle load balancer kick
request from cpu-A even with out receiving the IPI and this lead to
doing multiple __smp_call_function_single() on the same csd leading to
deadlock.

To kick a cpu, scheduler already has the reschedule vector reserved. Use
that mechanism (kick_process()) instead of using the generic smp call function
mechanism to kick off the nohz idle load balancing and avoid the deadlock.

   [ This issue is present from 2.6.35+ kernels, but marking it -stable
     only from v3.0+ as the proposed fix depends on the scheduler_ipi()
     that is introduced recently. ]

Reported-by: Prarit Bhargava <prarit@redhat.com>
Signed-off-by: Suresh Siddha <suresh.b.siddha@intel.com>
Cc: stable@kernel.org # v3.0+
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Link: http://lkml.kernel.org/r/20111003220934.834943260@sbsiddha-desk.sc.intel.com
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c      | 21 +++++++++++++++++++--
 kernel/sched_fair.c | 29 +++++++++--------------------
 2 files changed, 28 insertions(+), 22 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index 4cdc91cf48f6..9e49af00ae3e 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -1404,6 +1404,18 @@ void wake_up_idle_cpu(int cpu)
 		smp_send_reschedule(cpu);
 }
 
+static inline bool got_nohz_idle_kick(void)
+{
+	return idle_cpu(smp_processor_id()) && this_rq()->nohz_balance_kick;
+}
+
+#else /* CONFIG_NO_HZ */
+
+static inline bool got_nohz_idle_kick(void)
+{
+	return false;
+}
+
 #endif /* CONFIG_NO_HZ */
 
 static u64 sched_avg_period(void)
@@ -2717,7 +2729,7 @@ static void sched_ttwu_pending(void)
 
 void scheduler_ipi(void)
 {
-	if (llist_empty(&this_rq()->wake_list))
+	if (llist_empty(&this_rq()->wake_list) && !got_nohz_idle_kick())
 		return;
 
 	/*
@@ -2735,6 +2747,12 @@ void scheduler_ipi(void)
 	 */
 	irq_enter();
 	sched_ttwu_pending();
+
+	/*
+	 * Check if someone kicked us for doing the nohz idle load balance.
+	 */
+	if (unlikely(got_nohz_idle_kick() && !need_resched()))
+		raise_softirq_irqoff(SCHED_SOFTIRQ);
 	irq_exit();
 }
 
@@ -8288,7 +8306,6 @@ void __init sched_init(void)
 		rq_attach_root(rq, &def_root_domain);
 #ifdef CONFIG_NO_HZ
 		rq->nohz_balance_kick = 0;
-		init_sched_softirq_csd(&per_cpu(remote_sched_softirq_cb, i));
 #endif
 #endif
 		init_rq_hrtick(rq);
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index fef0bfde7c8c..6c5fa1099229 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -4269,22 +4269,6 @@ out_unlock:
 }
 
 #ifdef CONFIG_NO_HZ
-
-static DEFINE_PER_CPU(struct call_single_data, remote_sched_softirq_cb);
-
-static void trigger_sched_softirq(void *data)
-{
-	raise_softirq_irqoff(SCHED_SOFTIRQ);
-}
-
-static inline void init_sched_softirq_csd(struct call_single_data *csd)
-{
-	csd->func = trigger_sched_softirq;
-	csd->info = NULL;
-	csd->flags = 0;
-	csd->priv = 0;
-}
-
 /*
  * idle load balancing details
  * - One of the idle CPUs nominates itself as idle load_balancer, while
@@ -4450,11 +4434,16 @@ static void nohz_balancer_kick(int cpu)
 	}
 
 	if (!cpu_rq(ilb_cpu)->nohz_balance_kick) {
-		struct call_single_data *cp;
-
 		cpu_rq(ilb_cpu)->nohz_balance_kick = 1;
-		cp = &per_cpu(remote_sched_softirq_cb, cpu);
-		__smp_call_function_single(ilb_cpu, cp, 0);
+
+		smp_mb();
+		/*
+		 * Use smp_send_reschedule() instead of resched_cpu().
+		 * This way we generate a sched IPI on the target cpu which
+		 * is idle. And the softirq performing nohz idle load balance
+		 * will be run before returning from the IPI.
+		 */
+		smp_send_reschedule(ilb_cpu);
 	}
 	return;
 }
-- 
cgit v1.2.2


From 6eb57e0d65ebd99a71d435dc96d83e725752eef8 Mon Sep 17 00:00:00 2001
From: Suresh Siddha <suresh.b.siddha@intel.com>
Date: Mon, 3 Oct 2011 15:09:01 -0700
Subject: sched: Request for idle balance during nohz idle load balance

rq's idle_at_tick is set to idle/busy during the timer tick
depending on the cpu was idle or not. This will be used later in the load
balance that will be done in the softirq context (which is a process
context in -RT kernels).

For nohz kernels, for the cpu doing nohz idle load balance on behalf of
all the idle cpu's, its rq->idle_at_tick might have a stale value (which is
recorded when it got the timer tick presumably when it is busy).

As the nohz idle load balancing is also being done at the same place
as the regular load balancing, nohz idle load balancing was bailing out
when it sees rq's idle_at_tick not set.

Thus leading to poor system utilization.

Rename rq's idle_at_tick to idle_balance and set it when someone requests
for nohz idle balance on an idle cpu.

Reported-by: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
Signed-off-by: Suresh Siddha <suresh.b.siddha@intel.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Link: http://lkml.kernel.org/r/20111003220934.892350549@sbsiddha-desk.sc.intel.com
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c      | 8 +++++---
 kernel/sched_fair.c | 4 ++--
 2 files changed, 7 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index 9e49af00ae3e..7bc9b0e84eb3 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -644,7 +644,7 @@ struct rq {
 
 	unsigned long cpu_power;
 
-	unsigned char idle_at_tick;
+	unsigned char idle_balance;
 	/* For active balancing */
 	int post_schedule;
 	int active_balance;
@@ -2751,8 +2751,10 @@ void scheduler_ipi(void)
 	/*
 	 * Check if someone kicked us for doing the nohz idle load balance.
 	 */
-	if (unlikely(got_nohz_idle_kick() && !need_resched()))
+	if (unlikely(got_nohz_idle_kick() && !need_resched())) {
+		this_rq()->idle_balance = 1;
 		raise_softirq_irqoff(SCHED_SOFTIRQ);
+	}
 	irq_exit();
 }
 
@@ -4247,7 +4249,7 @@ void scheduler_tick(void)
 	perf_event_task_tick();
 
 #ifdef CONFIG_SMP
-	rq->idle_at_tick = idle_cpu(cpu);
+	rq->idle_balance = idle_cpu(cpu);
 	trigger_load_balance(rq, cpu);
 #endif
 }
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 6c5fa1099229..506db0966eb8 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -4676,7 +4676,7 @@ static inline int nohz_kick_needed(struct rq *rq, int cpu)
 	if (time_before(now, nohz.next_balance))
 		return 0;
 
-	if (rq->idle_at_tick)
+	if (idle_cpu(cpu))
 		return 0;
 
 	first_pick_cpu = atomic_read(&nohz.first_pick_cpu);
@@ -4712,7 +4712,7 @@ static void run_rebalance_domains(struct softirq_action *h)
 {
 	int this_cpu = smp_processor_id();
 	struct rq *this_rq = cpu_rq(this_cpu);
-	enum cpu_idle_type idle = this_rq->idle_at_tick ?
+	enum cpu_idle_type idle = this_rq->idle_balance ?
 						CPU_IDLE : CPU_NOT_IDLE;
 
 	rebalance_domains(this_cpu, idle);
-- 
cgit v1.2.2


From fa17b507f142d37aeac322a95f6f7c6375f25601 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Thu, 16 Jun 2011 12:23:22 +0200
Subject: sched: Wrap scheduler p->cpus_allowed access

This task is preparatory for the migrate_disable() implementation, but
stands on its own and provides a cleanup.

It currently only converts those sites required for task-placement.
Kosaki-san once mentioned replacing cpus_allowed with a proper
cpumask_t instead of the NR_CPUS sized array it currently is, that
would also require something like this.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Acked-by: Thomas Gleixner <tglx@linutronix.de>
Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Link: http://lkml.kernel.org/n/tip-e42skvaddos99psip0vce41o@git.kernel.org
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c      |  8 ++++----
 kernel/sched_fair.c | 12 ++++++------
 kernel/sched_rt.c   |  4 ++--
 3 files changed, 12 insertions(+), 12 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index 7bc9b0e84eb3..45174ca5c8ea 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -2544,11 +2544,11 @@ static int select_fallback_rq(int cpu, struct task_struct *p)
 
 	/* Look for allowed, online CPU in same node. */
 	for_each_cpu_and(dest_cpu, nodemask, cpu_active_mask)
-		if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed))
+		if (cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p)))
 			return dest_cpu;
 
 	/* Any allowed, online CPU? */
-	dest_cpu = cpumask_any_and(&p->cpus_allowed, cpu_active_mask);
+	dest_cpu = cpumask_any_and(tsk_cpus_allowed(p), cpu_active_mask);
 	if (dest_cpu < nr_cpu_ids)
 		return dest_cpu;
 
@@ -2585,7 +2585,7 @@ int select_task_rq(struct task_struct *p, int sd_flags, int wake_flags)
 	 * [ this allows ->select_task() to simply return task_cpu(p) and
 	 *   not worry about this generic constraint ]
 	 */
-	if (unlikely(!cpumask_test_cpu(cpu, &p->cpus_allowed) ||
+	if (unlikely(!cpumask_test_cpu(cpu, tsk_cpus_allowed(p)) ||
 		     !cpu_online(cpu)))
 		cpu = select_fallback_rq(task_cpu(p), p);
 
@@ -6262,7 +6262,7 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
 	if (task_cpu(p) != src_cpu)
 		goto done;
 	/* Affinity changed (again). */
-	if (!cpumask_test_cpu(dest_cpu, &p->cpus_allowed))
+	if (!cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p)))
 		goto fail;
 
 	/*
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 506db0966eb8..5c9e67923b7c 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -2183,7 +2183,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p,
 
 		/* Skip over this group if it has no CPUs allowed */
 		if (!cpumask_intersects(sched_group_cpus(group),
-					&p->cpus_allowed))
+					tsk_cpus_allowed(p)))
 			continue;
 
 		local_group = cpumask_test_cpu(this_cpu,
@@ -2229,7 +2229,7 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
 	int i;
 
 	/* Traverse only the allowed CPUs */
-	for_each_cpu_and(i, sched_group_cpus(group), &p->cpus_allowed) {
+	for_each_cpu_and(i, sched_group_cpus(group), tsk_cpus_allowed(p)) {
 		load = weighted_cpuload(i);
 
 		if (load < min_load || (load == min_load && i == this_cpu)) {
@@ -2273,7 +2273,7 @@ static int select_idle_sibling(struct task_struct *p, int target)
 		if (!(sd->flags & SD_SHARE_PKG_RESOURCES))
 			break;
 
-		for_each_cpu_and(i, sched_domain_span(sd), &p->cpus_allowed) {
+		for_each_cpu_and(i, sched_domain_span(sd), tsk_cpus_allowed(p)) {
 			if (idle_cpu(i)) {
 				target = i;
 				break;
@@ -2316,7 +2316,7 @@ select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags)
 	int sync = wake_flags & WF_SYNC;
 
 	if (sd_flag & SD_BALANCE_WAKE) {
-		if (cpumask_test_cpu(cpu, &p->cpus_allowed))
+		if (cpumask_test_cpu(cpu, tsk_cpus_allowed(p)))
 			want_affine = 1;
 		new_cpu = prev_cpu;
 	}
@@ -2697,7 +2697,7 @@ int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,
 	 * 2) cannot be migrated to this CPU due to cpus_allowed, or
 	 * 3) are cache-hot on their current CPU.
 	 */
-	if (!cpumask_test_cpu(this_cpu, &p->cpus_allowed)) {
+	if (!cpumask_test_cpu(this_cpu, tsk_cpus_allowed(p))) {
 		schedstat_inc(p, se.statistics.nr_failed_migrations_affine);
 		return 0;
 	}
@@ -4087,7 +4087,7 @@ redo:
 			 * moved to this_cpu
 			 */
 			if (!cpumask_test_cpu(this_cpu,
-					      &busiest->curr->cpus_allowed)) {
+					tsk_cpus_allowed(busiest->curr))) {
 				raw_spin_unlock_irqrestore(&busiest->lock,
 							    flags);
 				all_pinned = 1;
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index 0cc188cf7664..57a10842afa1 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -1179,7 +1179,7 @@ static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep);
 static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu)
 {
 	if (!task_running(rq, p) &&
-	    (cpu < 0 || cpumask_test_cpu(cpu, &p->cpus_allowed)) &&
+	    (cpu < 0 || cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) &&
 	    (p->rt.nr_cpus_allowed > 1))
 		return 1;
 	return 0;
@@ -1324,7 +1324,7 @@ static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq)
 			 */
 			if (unlikely(task_rq(task) != rq ||
 				     !cpumask_test_cpu(lowest_rq->cpu,
-						       &task->cpus_allowed) ||
+						       tsk_cpus_allowed(task)) ||
 				     task_running(rq, task) ||
 				     !task->on_rq)) {
 
-- 
cgit v1.2.2


From 4939602a2441306008c6dca38216b741d4e09a42 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Sat, 25 Jun 2011 15:45:46 +0200
Subject: sched: Unify the ->cpus_allowed mask copy

Currently every sched_class::set_cpus_allowed() implementation has to
copy the cpumask into task_struct::cpus_allowed, this is pointless,
put this copy in the generic code.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Acked-by: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/n/tip-jhl5s9fckd9ptw1fzbqqlrd3@git.kernel.org
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c    | 7 +++----
 kernel/sched_rt.c | 3 ---
 2 files changed, 3 insertions(+), 7 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index 45174ca5c8ea..ce9a9e7db116 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -6161,10 +6161,9 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
 {
 	if (p->sched_class && p->sched_class->set_cpus_allowed)
 		p->sched_class->set_cpus_allowed(p, new_mask);
-	else {
-		cpumask_copy(&p->cpus_allowed, new_mask);
-		p->rt.nr_cpus_allowed = cpumask_weight(new_mask);
-	}
+
+	cpumask_copy(&p->cpus_allowed, new_mask);
+	p->rt.nr_cpus_allowed = cpumask_weight(new_mask);
 }
 
 /*
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index 57a10842afa1..3023fd1ef364 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -1608,9 +1608,6 @@ static void set_cpus_allowed_rt(struct task_struct *p,
 
 		update_rt_migration(&rq->rt);
 	}
-
-	cpumask_copy(&p->cpus_allowed, new_mask);
-	p->rt.nr_cpus_allowed = weight;
 }
 
 /* Assumes rq->lock is held */
-- 
cgit v1.2.2


From 1c83437e80186832a9a48dbb6b8868d28e40e562 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Wed, 5 Oct 2011 13:32:34 +0200
Subject: sched: Warn on rt throttling

The default rt-throttling is a source of never ending questions. Warn
once when we go into throttling so folks have that info in dmesg.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Link: http://lkml.kernel.org/r/alpine.LFD.2.02.1110051331480.18778@ionos
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched_rt.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'kernel')

diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index 3023fd1ef364..056cbd2e2a27 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -655,6 +655,7 @@ static int sched_rt_runtime_exceeded(struct rt_rq *rt_rq)
 
 	if (rt_rq->rt_time > runtime) {
 		rt_rq->rt_throttled = 1;
+		printk_once(KERN_WARNING "sched: RT throttling activated\n");
 		if (rt_rq_throttled(rt_rq)) {
 			sched_rt_rq_dequeue(rt_rq);
 			return 1;
-- 
cgit v1.2.2


From 510f5acc4f4fb07f3f075900dc468d6e380beff6 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Sun, 17 Jul 2011 20:47:54 +0200
Subject: sched: Don't use tasklist_lock for debug prints

Avoid taking locks from debug prints, this avoids latencies on -rt,
and improves reliability of the debug code.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index ce9a9e7db116..24637c782002 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -6021,7 +6021,7 @@ void show_state_filter(unsigned long state_filter)
 	printk(KERN_INFO
 		"  task                        PC stack   pid father\n");
 #endif
-	read_lock(&tasklist_lock);
+	rcu_read_lock();
 	do_each_thread(g, p) {
 		/*
 		 * reset the NMI-timeout, listing all files on a slow
@@ -6037,7 +6037,7 @@ void show_state_filter(unsigned long state_filter)
 #ifdef CONFIG_SCHED_DEBUG
 	sysrq_sched_debug_show();
 #endif
-	read_unlock(&tasklist_lock);
+	rcu_read_unlock();
 	/*
 	 * Only show locks if all tasks are dumped:
 	 */
-- 
cgit v1.2.2


From 02ca1521ad404cf566e0075848f80d064c0a0503 Mon Sep 17 00:00:00 2001
From: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
Date: Tue, 4 Oct 2011 19:44:38 +0900
Subject: ftrace/kprobes: Fix not to delete probes if in use

Fix kprobe-tracer not to delete a probe if the probe is in use.
In that case, delete operation will return -EBUSY.

This bug can cause a kernel panic if enabled probes are deleted
during perf record.

(Add some probes on functions)
sh-4.2# perf probe --del probe:\*
sh-4.2# exit
(kernel panic)

This is originally reported on the fedora bugzilla:

 https://bugzilla.redhat.com/show_bug.cgi?id=742383

I've also checked that this problem doesn't happen on
tracepoints when module removing because perf event
locks target module.

$ sudo ./perf record -e xfs:\* -aR sh
sh-4.2# rmmod xfs
ERROR: Module xfs is in use
sh-4.2# exit
[ perf record: Woken up 1 times to write data ]
[ perf record: Captured and wrote 0.203 MB perf.data (~8862 samples) ]

Signed-off-by: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Frank Ch. Eigler <fche@redhat.com>
Cc: stable@kernel.org
Link: http://lkml.kernel.org/r/20111004104438.14591.6553.stgit@fedora15
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace_kprobe.c | 58 ++++++++++++++++++++++++++++++++++++++-------
 1 file changed, 49 insertions(+), 9 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 5fb3697bf0e5..00d527c945a4 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -836,11 +836,17 @@ static void __unregister_trace_probe(struct trace_probe *tp)
 }
 
 /* Unregister a trace_probe and probe_event: call with locking probe_lock */
-static void unregister_trace_probe(struct trace_probe *tp)
+static int unregister_trace_probe(struct trace_probe *tp)
 {
+	/* Enabled event can not be unregistered */
+	if (trace_probe_is_enabled(tp))
+		return -EBUSY;
+
 	__unregister_trace_probe(tp);
 	list_del(&tp->list);
 	unregister_probe_event(tp);
+
+	return 0;
 }
 
 /* Register a trace_probe and probe_event */
@@ -854,7 +860,9 @@ static int register_trace_probe(struct trace_probe *tp)
 	/* Delete old (same name) event if exist */
 	old_tp = find_trace_probe(tp->call.name, tp->call.class->system);
 	if (old_tp) {
-		unregister_trace_probe(old_tp);
+		ret = unregister_trace_probe(old_tp);
+		if (ret < 0)
+			goto end;
 		free_trace_probe(old_tp);
 	}
 
@@ -892,6 +900,7 @@ static int trace_probe_module_callback(struct notifier_block *nb,
 	mutex_lock(&probe_lock);
 	list_for_each_entry(tp, &probe_list, list) {
 		if (trace_probe_within_module(tp, mod)) {
+			/* Don't need to check busy - this should have gone. */
 			__unregister_trace_probe(tp);
 			ret = __register_trace_probe(tp);
 			if (ret)
@@ -1205,10 +1214,11 @@ static int create_trace_probe(int argc, char **argv)
 			return -ENOENT;
 		}
 		/* delete an event */
-		unregister_trace_probe(tp);
-		free_trace_probe(tp);
+		ret = unregister_trace_probe(tp);
+		if (ret == 0)
+			free_trace_probe(tp);
 		mutex_unlock(&probe_lock);
-		return 0;
+		return ret;
 	}
 
 	if (argc < 2) {
@@ -1317,18 +1327,29 @@ error:
 	return ret;
 }
 
-static void release_all_trace_probes(void)
+static int release_all_trace_probes(void)
 {
 	struct trace_probe *tp;
+	int ret = 0;
 
 	mutex_lock(&probe_lock);
+	/* Ensure no probe is in use. */
+	list_for_each_entry(tp, &probe_list, list)
+		if (trace_probe_is_enabled(tp)) {
+			ret = -EBUSY;
+			goto end;
+		}
 	/* TODO: Use batch unregistration */
 	while (!list_empty(&probe_list)) {
 		tp = list_entry(probe_list.next, struct trace_probe, list);
 		unregister_trace_probe(tp);
 		free_trace_probe(tp);
 	}
+
+end:
 	mutex_unlock(&probe_lock);
+
+	return ret;
 }
 
 /* Probes listing interfaces */
@@ -1380,9 +1401,13 @@ static const struct seq_operations probes_seq_op = {
 
 static int probes_open(struct inode *inode, struct file *file)
 {
-	if ((file->f_mode & FMODE_WRITE) &&
-	    (file->f_flags & O_TRUNC))
-		release_all_trace_probes();
+	int ret;
+
+	if ((file->f_mode & FMODE_WRITE) && (file->f_flags & O_TRUNC)) {
+		ret = release_all_trace_probes();
+		if (ret < 0)
+			return ret;
+	}
 
 	return seq_open(file, &probes_seq_op);
 }
@@ -2055,6 +2080,21 @@ static __init int kprobe_trace_self_tests_init(void)
 
 	ret = target(1, 2, 3, 4, 5, 6);
 
+	/* Disable trace points before removing it */
+	tp = find_trace_probe("testprobe", KPROBE_EVENT_SYSTEM);
+	if (WARN_ON_ONCE(tp == NULL)) {
+		pr_warning("error on getting test probe.\n");
+		warn++;
+	} else
+		disable_trace_probe(tp, TP_FLAG_TRACE);
+
+	tp = find_trace_probe("testprobe2", KPROBE_EVENT_SYSTEM);
+	if (WARN_ON_ONCE(tp == NULL)) {
+		pr_warning("error on getting 2nd test probe.\n");
+		warn++;
+	} else
+		disable_trace_probe(tp, TP_FLAG_TRACE);
+
 	ret = command_trace_probe("-:testprobe");
 	if (WARN_ON_ONCE(ret)) {
 		pr_warning("error on deleting a probe.\n");
-- 
cgit v1.2.2


From e0a413f619ef8bc366dafc6f8221674993b8d85f Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Thu, 29 Sep 2011 21:26:16 -0400
Subject: tracing: Warn on output if the function tracer was found corrupted

As the function tracer is very intrusive, lots of self checks are
performed on the tracer and if something is found to be strange
it will shut itself down keeping it from corrupting the rest of the
kernel. This shutdown may still allow functions to be traced, as the
tracing only stops new modifications from happening. Trying to stop
the function tracer itself can cause more harm as it requires code
modification.

Although a WARN_ON() is executed, a user may not notice it. To help
the user see that something isn't right with the tracing of the system
a big warning is added to the output of the tracer that lets the user
know that their data may be incomplete.

Reported-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/ftrace.c |  8 ++++++++
 kernel/trace/trace.c  | 15 +++++++++++++++
 kernel/trace/trace.h  |  2 ++
 3 files changed, 25 insertions(+)

(limited to 'kernel')

diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index c3e4575e7829..077d85387908 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -3862,6 +3862,14 @@ void ftrace_kill(void)
 	clear_ftrace_function();
 }
 
+/**
+ * Test if ftrace is dead or not.
+ */
+int ftrace_is_dead(void)
+{
+	return ftrace_disabled;
+}
+
 /**
  * register_ftrace_function - register a function for profiling
  * @ops - ops structure that holds the function for profiling.
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 4b8df0dc9358..13f2b8472fed 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -2160,6 +2160,14 @@ void trace_default_header(struct seq_file *m)
 	}
 }
 
+static void test_ftrace_alive(struct seq_file *m)
+{
+	if (!ftrace_is_dead())
+		return;
+	seq_printf(m, "# WARNING: FUNCTION TRACING IS CORRUPTED\n");
+	seq_printf(m, "#          MAY BE MISSING FUNCTION EVENTS\n");
+}
+
 static int s_show(struct seq_file *m, void *v)
 {
 	struct trace_iterator *iter = v;
@@ -2169,6 +2177,7 @@ static int s_show(struct seq_file *m, void *v)
 		if (iter->tr) {
 			seq_printf(m, "# tracer: %s\n", iter->trace->name);
 			seq_puts(m, "#\n");
+			test_ftrace_alive(m);
 		}
 		if (iter->trace && iter->trace->print_header)
 			iter->trace->print_header(m);
@@ -4613,6 +4622,12 @@ __ftrace_dump(bool disable_tracing, enum ftrace_dump_mode oops_dump_mode)
 
 	tracing_off();
 
+	/* Did function tracer already get disabled? */
+	if (ftrace_is_dead()) {
+		printk("# WARNING: FUNCTION TRACING IS CORRUPTED\n");
+		printk("#          MAY BE MISSING FUNCTION EVENTS\n");
+	}
+
 	if (disable_tracing)
 		ftrace_kill();
 
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 4c7540ad5dcb..092e1f8d18dc 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -579,11 +579,13 @@ static inline int ftrace_trace_task(struct task_struct *task)
 
 	return test_tsk_trace_trace(task);
 }
+extern int ftrace_is_dead(void);
 #else
 static inline int ftrace_trace_task(struct task_struct *task)
 {
 	return 1;
 }
+static inline int ftrace_is_dead(void) { return 0; }
 #endif
 
 /*
-- 
cgit v1.2.2


From d696b58ca2c3ca76e784ef89a7e0453d9b7ab187 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Thu, 22 Sep 2011 11:50:27 -0400
Subject: tracing: Do not allocate buffer for trace_marker

When doing intense tracing, the kmalloc inside trace_marker can
introduce side effects to what is being traced.

As trace_marker() is used by userspace to inject data into the
kernel ring buffer, it needs to do so with the least amount
of intrusion to the operations of the kernel or the user space
application.

As the ring buffer is designed to write directly into the buffer
without the need to make a temporary buffer, and userspace already
went through the hassle of knowing how big the write will be,
we can simply pin the userspace pages and write the data directly
into the buffer. This improves the impact of tracing via trace_marker
tremendously!

Thanks to Peter Zijlstra and Thomas Gleixner for pointing out the
use of get_user_pages_fast() and kmap_atomic().

Suggested-by: Thomas Gleixner <tglx@linutronix.de>
Suggested-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace.c | 111 ++++++++++++++++++++++++++++++++++++++-------------
 1 file changed, 83 insertions(+), 28 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 13f2b8472fed..f86efe90ca45 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -3628,22 +3628,24 @@ tracing_free_buffer_release(struct inode *inode, struct file *filp)
 	return 0;
 }
 
-static int mark_printk(const char *fmt, ...)
-{
-	int ret;
-	va_list args;
-	va_start(args, fmt);
-	ret = trace_vprintk(0, fmt, args);
-	va_end(args);
-	return ret;
-}
-
 static ssize_t
 tracing_mark_write(struct file *filp, const char __user *ubuf,
 					size_t cnt, loff_t *fpos)
 {
-	char *buf;
-	size_t written;
+	unsigned long addr = (unsigned long)ubuf;
+	struct ring_buffer_event *event;
+	struct ring_buffer *buffer;
+	struct print_entry *entry;
+	unsigned long irq_flags;
+	struct page *pages[2];
+	int nr_pages = 1;
+	ssize_t written;
+	void *page1;
+	void *page2;
+	int offset;
+	int size;
+	int len;
+	int ret;
 
 	if (tracing_disabled)
 		return -EINVAL;
@@ -3651,28 +3653,81 @@ tracing_mark_write(struct file *filp, const char __user *ubuf,
 	if (cnt > TRACE_BUF_SIZE)
 		cnt = TRACE_BUF_SIZE;
 
-	buf = kmalloc(cnt + 2, GFP_KERNEL);
-	if (buf == NULL)
-		return -ENOMEM;
+	/*
+	 * Userspace is injecting traces into the kernel trace buffer.
+	 * We want to be as non intrusive as possible.
+	 * To do so, we do not want to allocate any special buffers
+	 * or take any locks, but instead write the userspace data
+	 * straight into the ring buffer.
+	 *
+	 * First we need to pin the userspace buffer into memory,
+	 * which, most likely it is, because it just referenced it.
+	 * But there's no guarantee that it is. By using get_user_pages_fast()
+	 * and kmap_atomic/kunmap_atomic() we can get access to the
+	 * pages directly. We then write the data directly into the
+	 * ring buffer.
+	 */
+	BUILD_BUG_ON(TRACE_BUF_SIZE >= PAGE_SIZE);
 
-	if (copy_from_user(buf, ubuf, cnt)) {
-		kfree(buf);
-		return -EFAULT;
+	/* check if we cross pages */
+	if ((addr & PAGE_MASK) != ((addr + cnt) & PAGE_MASK))
+		nr_pages = 2;
+
+	offset = addr & (PAGE_SIZE - 1);
+	addr &= PAGE_MASK;
+
+	ret = get_user_pages_fast(addr, nr_pages, 0, pages);
+	if (ret < nr_pages) {
+		while (--ret >= 0)
+			put_page(pages[ret]);
+		written = -EFAULT;
+		goto out;
 	}
-	if (buf[cnt-1] != '\n') {
-		buf[cnt] = '\n';
-		buf[cnt+1] = '\0';
+
+	page1 = kmap_atomic(pages[0]);
+	if (nr_pages == 2)
+		page2 = kmap_atomic(pages[1]);
+
+	local_save_flags(irq_flags);
+	size = sizeof(*entry) + cnt + 2; /* possible \n added */
+	buffer = global_trace.buffer;
+	event = trace_buffer_lock_reserve(buffer, TRACE_PRINT, size,
+					  irq_flags, preempt_count());
+	if (!event) {
+		/* Ring buffer disabled, return as if not open for write */
+		written = -EBADF;
+		goto out_unlock;
+	}
+
+	entry = ring_buffer_event_data(event);
+	entry->ip = _THIS_IP_;
+
+	if (nr_pages == 2) {
+		len = PAGE_SIZE - offset;
+		memcpy(&entry->buf, page1 + offset, len);
+		memcpy(&entry->buf[len], page2, cnt - len);
 	} else
-		buf[cnt] = '\0';
+		memcpy(&entry->buf, page1 + offset, cnt);
 
-	written = mark_printk("%s", buf);
-	kfree(buf);
-	*fpos += written;
+	if (entry->buf[cnt - 1] != '\n') {
+		entry->buf[cnt] = '\n';
+		entry->buf[cnt + 1] = '\0';
+	} else
+		entry->buf[cnt] = '\0';
+
+	ring_buffer_unlock_commit(buffer, event);
 
-	/* don't tell userspace we wrote more - it might confuse them */
-	if (written > cnt)
-		written = cnt;
+	written = cnt;
 
+	*fpos += written;
+
+ out_unlock:
+	if (nr_pages == 2)
+		kunmap_atomic(page2);
+	kunmap_atomic(page1);
+	while (nr_pages > 0)
+		put_page(pages[--nr_pages]);
+ out:
 	return written;
 }
 
-- 
cgit v1.2.2


From 9b5f8b31af57a8ce9e9f77864d9143b5e3304815 Mon Sep 17 00:00:00 2001
From: Geunsik Lim <geunsik.lim@samsung.com>
Date: Fri, 12 Aug 2011 14:30:22 +0900
Subject: ftrace: Fix README to state tracing_on to start/stop tracing

tracing_enabled option is deprecated.
To start/stop tracing, write to /sys/kernel/debug/tracing/tracing_on
without tracing_enabled. This patch is based on Linux 3.1.0-rc1

Signed-off-by: Geunsik Lim <geunsik.lim@samsung.com>
Link: http://lkml.kernel.org/r/1313127022-23830-1-git-send-email-leemgs1@gmail.com
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index f86efe90ca45..cea16053f553 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -2720,9 +2720,9 @@ static const char readme_msg[] =
 	"# cat /sys/kernel/debug/tracing/trace_options\n"
 	"noprint-parent nosym-offset nosym-addr noverbose\n"
 	"# echo print-parent > /sys/kernel/debug/tracing/trace_options\n"
-	"# echo 1 > /sys/kernel/debug/tracing/tracing_enabled\n"
+	"# echo 1 > /sys/kernel/debug/tracing/tracing_on\n"
 	"# cat /sys/kernel/debug/tracing/trace > /tmp/trace.txt\n"
-	"# echo 0 > /sys/kernel/debug/tracing/tracing_enabled\n"
+	"# echo 0 > /sys/kernel/debug/tracing/tracing_on\n"
 ;
 
 static ssize_t
-- 
cgit v1.2.2


From 436fc280261dcfce5af38f08b89287750dc91cd2 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Fri, 14 Oct 2011 10:44:25 -0400
Subject: tracing: Fix returning of duplicate data after EOF in trace_pipe_raw

The trace_pipe_raw handler holds a cached page from the time the file
is opened to the time it is closed. The cached page is used to handle
the case of the user space buffer being smaller than what was read from
the ring buffer. The left over buffer is held in the cache so that the
next read will continue where the data left off.

After EOF is returned (no more data in the buffer), the index of
the cached page is set to zero. If a user app reads the page again
after EOF, the check in the buffer will see that the cached page
is less than page size and will return the cached page again. This
will cause reading the trace_pipe_raw again after EOF to return
duplicate data, making the output look like the time went backwards
but instead data is just repeated.

The fix is to not reset the index right after all data is read
from the cache, but to reset it after all data is read and more
data exists in the ring buffer.

Cc: stable <stable@kernel.org>
Reported-by: Jeremy Eder <jeder@redhat.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index cea16053f553..b24a72d35008 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -3903,8 +3903,6 @@ tracing_buffers_read(struct file *filp, char __user *ubuf,
 	if (info->read < PAGE_SIZE)
 		goto read;
 
-	info->read = 0;
-
 	trace_access_lock(info->cpu);
 	ret = ring_buffer_read_page(info->tr->buffer,
 				    &info->spare,
@@ -3914,6 +3912,8 @@ tracing_buffers_read(struct file *filp, char __user *ubuf,
 	if (ret < 0)
 		return 0;
 
+	info->read = 0;
+
 read:
 	size = PAGE_SIZE - info->read;
 	if (size > count)
-- 
cgit v1.2.2


From 2a77c46de1e3dace73745015635ebbc648eca69c Mon Sep 17 00:00:00 2001
From: ShuoX Liu <shuox.liu@intel.com>
Date: Wed, 10 Aug 2011 23:01:26 +0200
Subject: PM / Suspend: Add statistics debugfs file for suspend to RAM

Record S3 failure time about each reason and the latest two failed
devices' names in S3 progress.
We can check it through 'suspend_stats' entry in debugfs.

The motivation of the patch:

We are enabling power features on Medfield. Comparing with PC/notebook,
a mobile enters/exits suspend-2-ram (we call it s3 on Medfield) far
more frequently. If it can't enter suspend-2-ram in time, the power
might be used up soon.

We often find sometimes, a device suspend fails. Then, system retries
s3 over and over again. As display is off, testers and developers
don't know what happens.

Some testers and developers complain they don't know if system
tries suspend-2-ram, and what device fails to suspend. They need
such info for a quick check. The patch adds suspend_stats under
debugfs for users to check suspend to RAM statistics quickly.

If not using this patch, we have other methods to get info about
what device fails. One is to turn on  CONFIG_PM_DEBUG, but users
would get too much info and testers need recompile the system.

In addition, dynamic debug is another good tool to dump debug info.
But it still doesn't match our utilization scenario closely.
1) user need write a user space parser to process the syslog output;
2) Our testing scenario is we leave the mobile for at least hours.
   Then, check its status. No serial console available during the
   testing. One is because console would be suspended, and the other
   is serial console connecting with spi or HSU devices would consume
   power. These devices are powered off at suspend-2-ram.

Signed-off-by: ShuoX Liu <shuox.liu@intel.com>
Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
---
 kernel/power/main.c    | 102 +++++++++++++++++++++++++++++++++++++++++++++++++
 kernel/power/suspend.c |  17 +++++++--
 2 files changed, 116 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/power/main.c b/kernel/power/main.c
index 6c601f871964..2757acba8e8a 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -12,6 +12,8 @@
 #include <linux/string.h>
 #include <linux/resume-trace.h>
 #include <linux/workqueue.h>
+#include <linux/debugfs.h>
+#include <linux/seq_file.h>
 
 #include "power.h"
 
@@ -133,6 +135,101 @@ power_attr(pm_test);
 
 #endif /* CONFIG_PM_SLEEP */
 
+#ifdef CONFIG_DEBUG_FS
+static char *suspend_step_name(enum suspend_stat_step step)
+{
+	switch (step) {
+	case SUSPEND_FREEZE:
+		return "freeze";
+	case SUSPEND_PREPARE:
+		return "prepare";
+	case SUSPEND_SUSPEND:
+		return "suspend";
+	case SUSPEND_SUSPEND_NOIRQ:
+		return "suspend_noirq";
+	case SUSPEND_RESUME_NOIRQ:
+		return "resume_noirq";
+	case SUSPEND_RESUME:
+		return "resume";
+	default:
+		return "";
+	}
+}
+
+static int suspend_stats_show(struct seq_file *s, void *unused)
+{
+	int i, index, last_dev, last_errno, last_step;
+
+	last_dev = suspend_stats.last_failed_dev + REC_FAILED_NUM - 1;
+	last_dev %= REC_FAILED_NUM;
+	last_errno = suspend_stats.last_failed_errno + REC_FAILED_NUM - 1;
+	last_errno %= REC_FAILED_NUM;
+	last_step = suspend_stats.last_failed_step + REC_FAILED_NUM - 1;
+	last_step %= REC_FAILED_NUM;
+	seq_printf(s, "%s: %d\n%s: %d\n%s: %d\n%s: %d\n"
+			"%s: %d\n%s: %d\n%s: %d\n%s: %d\n",
+			"success", suspend_stats.success,
+			"fail", suspend_stats.fail,
+			"failed_freeze", suspend_stats.failed_freeze,
+			"failed_prepare", suspend_stats.failed_prepare,
+			"failed_suspend", suspend_stats.failed_suspend,
+			"failed_suspend_noirq",
+				suspend_stats.failed_suspend_noirq,
+			"failed_resume", suspend_stats.failed_resume,
+			"failed_resume_noirq",
+				suspend_stats.failed_resume_noirq);
+	seq_printf(s,	"failures:\n  last_failed_dev:\t%-s\n",
+			suspend_stats.failed_devs[last_dev]);
+	for (i = 1; i < REC_FAILED_NUM; i++) {
+		index = last_dev + REC_FAILED_NUM - i;
+		index %= REC_FAILED_NUM;
+		seq_printf(s, "\t\t\t%-s\n",
+			suspend_stats.failed_devs[index]);
+	}
+	seq_printf(s,	"  last_failed_errno:\t%-d\n",
+			suspend_stats.errno[last_errno]);
+	for (i = 1; i < REC_FAILED_NUM; i++) {
+		index = last_errno + REC_FAILED_NUM - i;
+		index %= REC_FAILED_NUM;
+		seq_printf(s, "\t\t\t%-d\n",
+			suspend_stats.errno[index]);
+	}
+	seq_printf(s,	"  last_failed_step:\t%-s\n",
+			suspend_step_name(
+				suspend_stats.failed_steps[last_step]));
+	for (i = 1; i < REC_FAILED_NUM; i++) {
+		index = last_step + REC_FAILED_NUM - i;
+		index %= REC_FAILED_NUM;
+		seq_printf(s, "\t\t\t%-s\n",
+			suspend_step_name(
+				suspend_stats.failed_steps[index]));
+	}
+
+	return 0;
+}
+
+static int suspend_stats_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, suspend_stats_show, NULL);
+}
+
+static const struct file_operations suspend_stats_operations = {
+	.open           = suspend_stats_open,
+	.read           = seq_read,
+	.llseek         = seq_lseek,
+	.release        = single_release,
+};
+
+static int __init pm_debugfs_init(void)
+{
+	debugfs_create_file("suspend_stats", S_IFREG | S_IRUGO,
+			NULL, NULL, &suspend_stats_operations);
+	return 0;
+}
+
+late_initcall(pm_debugfs_init);
+#endif /* CONFIG_DEBUG_FS */
+
 struct kobject *power_kobj;
 
 /**
@@ -194,6 +291,11 @@ static ssize_t state_store(struct kobject *kobj, struct kobj_attribute *attr,
 	}
 	if (state < PM_SUSPEND_MAX && *s)
 		error = enter_state(state);
+		if (error) {
+			suspend_stats.fail++;
+			dpm_save_failed_errno(error);
+		} else
+			suspend_stats.success++;
 #endif
 
  Exit:
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
index b6b71ad2208f..595a3dd56a8a 100644
--- a/kernel/power/suspend.c
+++ b/kernel/power/suspend.c
@@ -104,7 +104,10 @@ static int suspend_prepare(void)
 		goto Finish;
 
 	error = suspend_freeze_processes();
-	if (!error)
+	if (error) {
+		suspend_stats.failed_freeze++;
+		dpm_save_failed_step(SUSPEND_FREEZE);
+	} else
 		return 0;
 
 	suspend_thaw_processes();
@@ -315,8 +318,16 @@ int enter_state(suspend_state_t state)
  */
 int pm_suspend(suspend_state_t state)
 {
-	if (state > PM_SUSPEND_ON && state <= PM_SUSPEND_MAX)
-		return enter_state(state);
+	int ret;
+	if (state > PM_SUSPEND_ON && state <= PM_SUSPEND_MAX) {
+		ret = enter_state(state);
+		if (ret) {
+			suspend_stats.fail++;
+			dpm_save_failed_errno(ret);
+		} else
+			suspend_stats.success++;
+		return ret;
+	}
 	return -EINVAL;
 }
 EXPORT_SYMBOL(pm_suspend);
-- 
cgit v1.2.2


From ca123102f69fb260221502ade9bbc069290fae84 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rjw@sisk.pl>
Date: Thu, 11 Aug 2011 22:38:12 +0200
Subject: PM: Fix build issue in main.c for CONFIG_PM_SLEEP unset

Suspend statistics should depend on CONFIG_PM_SLEEP, so make that
happen.

Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
---
 kernel/power/main.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/power/main.c b/kernel/power/main.c
index 2757acba8e8a..a52e88425a31 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -133,8 +133,6 @@ static ssize_t pm_test_store(struct kobject *kobj, struct kobj_attribute *attr,
 power_attr(pm_test);
 #endif /* CONFIG_PM_DEBUG */
 
-#endif /* CONFIG_PM_SLEEP */
-
 #ifdef CONFIG_DEBUG_FS
 static char *suspend_step_name(enum suspend_stat_step step)
 {
@@ -230,6 +228,8 @@ static int __init pm_debugfs_init(void)
 late_initcall(pm_debugfs_init);
 #endif /* CONFIG_DEBUG_FS */
 
+#endif /* CONFIG_PM_SLEEP */
+
 struct kobject *power_kobj;
 
 /**
-- 
cgit v1.2.2


From 85055dd805f0822f13f736bee2a521e222c38293 Mon Sep 17 00:00:00 2001
From: Martin Schwidefsky <schwidefsky@de.ibm.com>
Date: Wed, 17 Aug 2011 20:42:24 +0200
Subject: PM / Hibernate: Include storage keys in hibernation image on s390

For s390 there is one additional byte associated with each page,
the storage key. This byte contains the referenced and changed
bits and needs to be included into the hibernation image.
If the storage keys are not restored to their previous state all
original pages would appear to be dirty. This can cause
inconsistencies e.g. with read-only filesystems.

Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
---
 kernel/power/Kconfig    |  3 +++
 kernel/power/snapshot.c | 18 ++++++++++++++++++
 2 files changed, 21 insertions(+)

(limited to 'kernel')

diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index 3744c594b19b..e01e6899592c 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -65,6 +65,9 @@ config HIBERNATION
 
 	  For more information take a look at <file:Documentation/power/swsusp.txt>.
 
+config ARCH_SAVE_PAGE_KEYS
+	bool
+
 config PM_STD_PARTITION
 	string "Default resume partition"
 	depends on HIBERNATION
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 06efa54f93d6..cbe2c1441392 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -1339,6 +1339,9 @@ int hibernate_preallocate_memory(void)
 	count += highmem;
 	count -= totalreserve_pages;
 
+	/* Add number of pages required for page keys (s390 only). */
+	size += page_key_additional_pages(saveable);
+
 	/* Compute the maximum number of saveable pages to leave in memory. */
 	max_size = (count - (size + PAGES_FOR_IO)) / 2
 			- 2 * DIV_ROUND_UP(reserved_size, PAGE_SIZE);
@@ -1662,6 +1665,8 @@ pack_pfns(unsigned long *buf, struct memory_bitmap *bm)
 		buf[j] = memory_bm_next_pfn(bm);
 		if (unlikely(buf[j] == BM_END_OF_MAP))
 			break;
+		/* Save page key for data page (s390 only). */
+		page_key_read(buf + j);
 	}
 }
 
@@ -1821,6 +1826,9 @@ static int unpack_orig_pfns(unsigned long *buf, struct memory_bitmap *bm)
 		if (unlikely(buf[j] == BM_END_OF_MAP))
 			break;
 
+		/* Extract and buffer page key for data page (s390 only). */
+		page_key_memorize(buf + j);
+
 		if (memory_bm_pfn_present(bm, buf[j]))
 			memory_bm_set_bit(bm, buf[j]);
 		else
@@ -2223,6 +2231,11 @@ int snapshot_write_next(struct snapshot_handle *handle)
 		if (error)
 			return error;
 
+		/* Allocate buffer for page keys. */
+		error = page_key_alloc(nr_copy_pages);
+		if (error)
+			return error;
+
 	} else if (handle->cur <= nr_meta_pages + 1) {
 		error = unpack_orig_pfns(buffer, &copy_bm);
 		if (error)
@@ -2243,6 +2256,8 @@ int snapshot_write_next(struct snapshot_handle *handle)
 		}
 	} else {
 		copy_last_highmem_page();
+		/* Restore page key for data page (s390 only). */
+		page_key_write(handle->buffer);
 		handle->buffer = get_buffer(&orig_bm, &ca);
 		if (IS_ERR(handle->buffer))
 			return PTR_ERR(handle->buffer);
@@ -2264,6 +2279,9 @@ int snapshot_write_next(struct snapshot_handle *handle)
 void snapshot_write_finalize(struct snapshot_handle *handle)
 {
 	copy_last_highmem_page();
+	/* Restore page key for data page (s390 only). */
+	page_key_write(handle->buffer);
+	page_key_free();
 	/* Free only if we have loaded the image entirely */
 	if (handle->cur > 1 && handle->cur > nr_meta_pages + nr_copy_pages) {
 		memory_bm_free(&orig_bm, PG_UNSAFE_CLEAR);
-- 
cgit v1.2.2


From 528f7ce6e439edeac38f6b3f8561f1be129b5e91 Mon Sep 17 00:00:00 2001
From: Dan Carpenter <dan.carpenter@oracle.com>
Date: Wed, 21 Sep 2011 20:55:04 +0200
Subject: PM / Suspend: Off by one in pm_suspend()

In enter_state() we use "state" as an offset for the pm_states[]
array.  The pm_states[] array only has PM_SUSPEND_MAX elements so
this test is off by one.

Signed-off-by: Dan Carpenter <dan.carpenter@oracle.com>
Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
Cc: stable@kernel.org
---
 kernel/power/suspend.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
index 595a3dd56a8a..fdd4263b995d 100644
--- a/kernel/power/suspend.c
+++ b/kernel/power/suspend.c
@@ -319,7 +319,7 @@ int enter_state(suspend_state_t state)
 int pm_suspend(suspend_state_t state)
 {
 	int ret;
-	if (state > PM_SUSPEND_ON && state <= PM_SUSPEND_MAX) {
+	if (state > PM_SUSPEND_ON && state < PM_SUSPEND_MAX) {
 		ret = enter_state(state);
 		if (ret) {
 			suspend_stats.fail++;
-- 
cgit v1.2.2


From 37cce26b32142f09a8967f6d238178af654b20de Mon Sep 17 00:00:00 2001
From: H Hartley Sweeten <hartleys@visionengravers.com>
Date: Wed, 21 Sep 2011 22:47:55 +0200
Subject: PM / VT: Cleanup #if defined uglyness and fix compile error

Introduce the config option CONFIG_VT_CONSOLE_SLEEP in order to cleanup
the #if defined ugliness for the vt suspend support functions. Note that
CONFIG_VT_CONSOLE is already dependant on CONFIG_VT.

The function pm_set_vt_switch is actually dependant on CONFIG_VT and not
CONFIG_PM_SLEEP. This fixes a compile error when CONFIG_PM_SLEEP is
not set:

drivers/tty/vt/vt_ioctl.c:1794: error: redefinition of 'pm_set_vt_switch'
include/linux/suspend.h:17: error: previous definition of 'pm_set_vt_switch' was here

Also, remove the incorrect path from the comment in console.c.

[rjw: Replaced #if defined() with #ifdef in suspend.h.]

Signed-off-by: H Hartley Sweeten <hsweeten@visionengravers.com>
Acked-by: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
---
 kernel/power/Makefile  | 2 +-
 kernel/power/console.c | 4 +---
 2 files changed, 2 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/power/Makefile b/kernel/power/Makefile
index ad6bdd8b401a..07e0e28ffba7 100644
--- a/kernel/power/Makefile
+++ b/kernel/power/Makefile
@@ -2,7 +2,7 @@
 ccflags-$(CONFIG_PM_DEBUG)	:= -DDEBUG
 
 obj-$(CONFIG_PM)		+= main.o qos.o
-obj-$(CONFIG_PM_SLEEP)		+= console.o
+obj-$(CONFIG_VT_CONSOLE_SLEEP)	+= console.o
 obj-$(CONFIG_FREEZER)		+= process.o
 obj-$(CONFIG_SUSPEND)		+= suspend.o
 obj-$(CONFIG_PM_TEST_SUSPEND)	+= suspend_test.o
diff --git a/kernel/power/console.c b/kernel/power/console.c
index 218e5af90156..b1dc456474b5 100644
--- a/kernel/power/console.c
+++ b/kernel/power/console.c
@@ -1,5 +1,5 @@
 /*
- * drivers/power/process.c - Functions for saving/restoring console.
+ * Functions for saving/restoring console.
  *
  * Originally from swsusp.
  */
@@ -10,7 +10,6 @@
 #include <linux/module.h>
 #include "power.h"
 
-#if defined(CONFIG_VT) && defined(CONFIG_VT_CONSOLE)
 #define SUSPEND_CONSOLE	(MAX_NR_CONSOLES-1)
 
 static int orig_fgconsole, orig_kmsg;
@@ -32,4 +31,3 @@ void pm_restore_console(void)
 		vt_kmsg_redirect(orig_kmsg);
 	}
 }
-#endif
-- 
cgit v1.2.2


From 2aede851ddf08666f68ffc17be446420e9d2a056 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rjw@sisk.pl>
Date: Mon, 26 Sep 2011 20:32:27 +0200
Subject: PM / Hibernate: Freeze kernel threads after preallocating memory

There is a problem with the current ordering of hibernate code which
leads to deadlocks in some filesystems' memory shrinkers.  Namely,
some filesystems use freezable kernel threads that are inactive when
the hibernate memory preallocation is carried out.  Those same
filesystems use memory shrinkers that may be triggered by the
hibernate memory preallocation.  If those memory shrinkers wait for
the frozen kernel threads, the hibernate process deadlocks (this
happens with XFS, for one example).

Apparently, it is not technically viable to redesign the filesystems
in question to avoid the situation described above, so the only
possible solution of this issue is to defer the freezing of kernel
threads until the hibernate memory preallocation is done, which is
implemented by this change.

Unfortunately, this requires the memory preallocation to be done
before the "prepare" stage of device freeze, so after this change the
only way drivers can allocate additional memory for their freeze
routines in a clean way is to use PM notifiers.

Reported-by: Christoph <cr2005@u-club.de>
Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
---
 kernel/power/hibernate.c | 12 ++++++++----
 kernel/power/power.h     |  3 ++-
 kernel/power/process.c   | 30 ++++++++++++++++++++----------
 3 files changed, 30 insertions(+), 15 deletions(-)

(limited to 'kernel')

diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index 8f7b1db1ece1..3a20466015f8 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -334,12 +334,16 @@ int hibernation_snapshot(int platform_mode)
 	if (error)
 		goto Close;
 
-	error = dpm_prepare(PMSG_FREEZE);
-	if (error)
-		goto Complete_devices;
-
 	/* Preallocate image memory before shutting down devices. */
 	error = hibernate_preallocate_memory();
+	if (error)
+		goto Close;
+
+	error = freeze_kernel_threads();
+	if (error)
+		goto Close;
+
+	error = dpm_prepare(PMSG_FREEZE);
 	if (error)
 		goto Complete_devices;
 
diff --git a/kernel/power/power.h b/kernel/power/power.h
index 9a00a0a26280..e6206397ce67 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -228,7 +228,8 @@ extern int pm_test_level;
 #ifdef CONFIG_SUSPEND_FREEZER
 static inline int suspend_freeze_processes(void)
 {
-	return freeze_processes();
+	int error = freeze_processes();
+	return error ? : freeze_kernel_threads();
 }
 
 static inline void suspend_thaw_processes(void)
diff --git a/kernel/power/process.c b/kernel/power/process.c
index 0cf3a27a6c9d..addbbe5531bc 100644
--- a/kernel/power/process.c
+++ b/kernel/power/process.c
@@ -135,7 +135,7 @@ static int try_to_freeze_tasks(bool sig_only)
 }
 
 /**
- *	freeze_processes - tell processes to enter the refrigerator
+ * freeze_processes - Signal user space processes to enter the refrigerator.
  */
 int freeze_processes(void)
 {
@@ -143,20 +143,30 @@ int freeze_processes(void)
 
 	printk("Freezing user space processes ... ");
 	error = try_to_freeze_tasks(true);
-	if (error)
-		goto Exit;
-	printk("done.\n");
+	if (!error) {
+		printk("done.");
+		oom_killer_disable();
+	}
+	printk("\n");
+	BUG_ON(in_atomic());
+
+	return error;
+}
+
+/**
+ * freeze_kernel_threads - Make freezable kernel threads go to the refrigerator.
+ */
+int freeze_kernel_threads(void)
+{
+	int error;
 
 	printk("Freezing remaining freezable tasks ... ");
 	error = try_to_freeze_tasks(false);
-	if (error)
-		goto Exit;
-	printk("done.");
+	if (!error)
+		printk("done.");
 
-	oom_killer_disable();
- Exit:
-	BUG_ON(in_atomic());
 	printk("\n");
+	BUG_ON(in_atomic());
 
 	return error;
 }
-- 
cgit v1.2.2


From 21e82808fc465b66fedaac0f4e885cafb304e843 Mon Sep 17 00:00:00 2001
From: Barry Song <Baohua.Song@csr.com>
Date: Tue, 27 Sep 2011 22:05:44 +0200
Subject: PM / Hibernate: Fix typo in a kerneldoc comment

Fix a typo in a function name in the kerneldoc comment next to
resume_target_kernel().

[rjw: Changed the subject slightly, added the changelog.]

Signed-off-by: Barry Song <Baohua.Song@csr.com>
Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
---
 kernel/power/hibernate.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index 3a20466015f8..7f44e5c26971 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -467,7 +467,7 @@ static int resume_target_kernel(bool platform_mode)
  * @platform_mode: If set, use platform driver to prepare for the transition.
  *
  * This routine must be called with pm_mutex held.  If it is successful, control
- * reappears in the restored target kernel in hibernation_snaphot().
+ * reappears in the restored target kernel in hibernation_snapshot().
  */
 int hibernation_restore(int platform_mode)
 {
-- 
cgit v1.2.2


From 6f8d7022a842809aeb24db1d15669198ef02c131 Mon Sep 17 00:00:00 2001
From: Barry Song <baohua.song@csr.com>
Date: Thu, 6 Oct 2011 20:34:46 +0200
Subject: PM / Hibernate: Add resumewait param to support MMC-like devices as
 resume file

Some devices like MMC are async detected very slow. For example,
drivers/mmc/host/sdhci.c launches a 200ms delayed work to detect
MMC partitions then add disk.

We have wait_for_device_probe() and scsi_complete_async_scans()
before calling swsusp_check(), but it is not enough to wait for MMC.

This patch adds resumewait kernel param just like rootwait so
that we have enough time to wait until MMC is ready. The difference is
that we wait for resume partition whereas rootwait waits for rootfs
partition (which may be on a different device).

This patch will make hibernation support many embedded products
without SCSI devices, but with devices like MMC.

[rjw: Modified the changelog slightly.]

Signed-off-by: Barry Song <Baohua.Song@csr.com>
Reviewed-by: Valdis Kletnieks <valdis.kletnieks@vt.edu>
Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
---
 kernel/power/hibernate.c | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

(limited to 'kernel')

diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index 7f44e5c26971..0f8785080cde 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -14,6 +14,7 @@
 #include <linux/reboot.h>
 #include <linux/string.h>
 #include <linux/device.h>
+#include <linux/async.h>
 #include <linux/kmod.h>
 #include <linux/delay.h>
 #include <linux/fs.h>
@@ -31,6 +32,7 @@
 
 static int nocompress = 0;
 static int noresume = 0;
+static int resume_wait = 0;
 static char resume_file[256] = CONFIG_PM_STD_PARTITION;
 dev_t swsusp_resume_device;
 sector_t swsusp_resume_block;
@@ -736,6 +738,13 @@ static int software_resume(void)
 		 * to wait for this to finish.
 		 */
 		wait_for_device_probe();
+
+		if (resume_wait) {
+			while ((swsusp_resume_device = name_to_dev_t(resume_file)) == 0)
+				msleep(10);
+			async_synchronize_full();
+		}
+
 		/*
 		 * We can't depend on SCSI devices being available after loading
 		 * one of their modules until scsi_complete_async_scans() is
@@ -1064,7 +1073,14 @@ static int __init noresume_setup(char *str)
 	return 1;
 }
 
+static int __init resumewait_setup(char *str)
+{
+	resume_wait = 1;
+	return 1;
+}
+
 __setup("noresume", noresume_setup);
 __setup("resume_offset=", resume_offset_setup);
 __setup("resume=", resume_setup);
 __setup("hibernate=", hibernate_setup);
+__setup("resumewait", resumewait_setup);
-- 
cgit v1.2.2


From f126f7334f72e2fd1b7a62bba20c488b86e6e7c4 Mon Sep 17 00:00:00 2001
From: Barry Song <baohua.song@csr.com>
Date: Mon, 10 Oct 2011 23:38:41 +0200
Subject: PM / Hibernate: Add resumedelay kernel param in addition to
 resumewait

Patch "PM / Hibernate: Add resumewait param to support MMC-like
devices as resume file" added the resumewait kernel command line
option.  The present patch adds resumedelay so that
resumewait/delay were analogous to rootwait/delay.

[rjw: Modified the subject and changelog slightly.]

Signed-off-by: Barry Song <baohua.song@csr.com>
Acked-by: Pavel Machek <pavel@ucw.cz>
Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
---
 kernel/power/hibernate.c | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

(limited to 'kernel')

diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index 0f8785080cde..50f537953e70 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -33,6 +33,7 @@
 static int nocompress = 0;
 static int noresume = 0;
 static int resume_wait = 0;
+static int resume_delay = 0;
 static char resume_file[256] = CONFIG_PM_STD_PARTITION;
 dev_t swsusp_resume_device;
 sector_t swsusp_resume_block;
@@ -730,6 +731,12 @@ static int software_resume(void)
 
 	pr_debug("PM: Checking hibernation image partition %s\n", resume_file);
 
+	if (resume_delay) {
+		printk(KERN_INFO "Waiting %dsec before reading resume device...\n",
+			resume_delay);
+		ssleep(resume_delay);
+	}
+
 	/* Check if the device is there */
 	swsusp_resume_device = name_to_dev_t(resume_file);
 	if (!swsusp_resume_device) {
@@ -1079,8 +1086,15 @@ static int __init resumewait_setup(char *str)
 	return 1;
 }
 
+static int __init resumedelay_setup(char *str)
+{
+	resume_delay = simple_strtoul(str, NULL, 0);
+	return 1;
+}
+
 __setup("noresume", noresume_setup);
 __setup("resume_offset=", resume_offset_setup);
 __setup("resume=", resume_setup);
 __setup("hibernate=", hibernate_setup);
 __setup("resumewait", resumewait_setup);
+__setup("resumedelay=", resumedelay_setup);
-- 
cgit v1.2.2


From 27920651fe0d16a25632dc238e81b54f3a504869 Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@redhat.com>
Date: Tue, 11 Oct 2011 21:20:55 +0200
Subject: PM / Freezer: Make fake_signal_wake_up() wake TASK_KILLABLE tasks too

TASK_KILLABLE is often used to put tasks to sleep for quite some time.
One of the most common uses is to put tasks to sleep while waiting for
replies from a server on a networked filesystem (such as CIFS or NFS).

Unfortunately, fake_signal_wake_up does not currently wake up tasks
that are sleeping in TASK_KILLABLE state. This means that even if the
code were in place to allow them to freeze while in this sleep, it
wouldn't work anyway.

This patch changes this function to wake tasks in this state as well.
This should be harmless -- if the code doing the sleeping doesn't have
handling to deal with freezer events, it should just go back to sleep.
If it does, then this will allow that code to do the right thing.

Signed-off-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
---
 kernel/freezer.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/freezer.c b/kernel/freezer.c
index 7b01de98bb6a..66a594e8ad2f 100644
--- a/kernel/freezer.c
+++ b/kernel/freezer.c
@@ -67,7 +67,7 @@ static void fake_signal_wake_up(struct task_struct *p)
 	unsigned long flags;
 
 	spin_lock_irqsave(&p->sighand->siglock, flags);
-	signal_wake_up(p, 0);
+	signal_wake_up(p, 1);
 	spin_unlock_irqrestore(&p->sighand->siglock, flags);
 }
 
-- 
cgit v1.2.2


From d231ff1af70a2df43d809173cf8c94e9c3beb853 Mon Sep 17 00:00:00 2001
From: Barry Song <Baohua.Song@csr.com>
Date: Tue, 11 Oct 2011 23:29:18 -0700
Subject: PM / Hibernate: Do not initialize static and extern variables to 0

Static and extern variables in kernel/power/hibernate.c need not be
initialized to 0 explicitly, so remove those initializations.

[rjw: Modified subject, added changelog.]

Signed-off-by: Barry Song <Baohua.Song@csr.com>
Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
---
 kernel/power/hibernate.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index 50f537953e70..ea12c8f1bdfd 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -30,14 +30,14 @@
 #include "power.h"
 
 
-static int nocompress = 0;
-static int noresume = 0;
-static int resume_wait = 0;
-static int resume_delay = 0;
+static int nocompress;
+static int noresume;
+static int resume_wait;
+static int resume_delay;
 static char resume_file[256] = CONFIG_PM_STD_PARTITION;
 dev_t swsusp_resume_device;
 sector_t swsusp_resume_block;
-int in_suspend __nosavedata = 0;
+int in_suspend __nosavedata;
 
 enum {
 	HIBERNATION_INVALID,
-- 
cgit v1.2.2


From 081a9d043c983f161b78fdc4671324d1342b86bc Mon Sep 17 00:00:00 2001
From: Bojan Smojver <bojan@rexursive.com>
Date: Thu, 13 Oct 2011 23:58:07 +0200
Subject: PM / Hibernate: Improve performance of LZO/plain hibernation,
 checksum image

Use threads for LZO compression/decompression on hibernate/thaw.
Improve buffering on hibernate/thaw.
Calculate/verify CRC32 of the image pages on hibernate/thaw.

In my testing, this improved write/read speed by a factor of about two.

Signed-off-by: Bojan Smojver <bojan@rexursive.com>
Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
---
 kernel/power/Kconfig     |   1 +
 kernel/power/hibernate.c |   3 +
 kernel/power/power.h     |   1 +
 kernel/power/swap.c      | 818 ++++++++++++++++++++++++++++++++++++-----------
 4 files changed, 645 insertions(+), 178 deletions(-)

(limited to 'kernel')

diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index e01e6899592c..cedd9982306a 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -27,6 +27,7 @@ config HIBERNATION
 	select HIBERNATE_CALLBACKS
 	select LZO_COMPRESS
 	select LZO_DECOMPRESS
+	select CRC32
 	---help---
 	  Enable the suspend to disk (STD) functionality, which is usually
 	  called "hibernation" in user interfaces.  STD checkpoints the
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index ea12c8f1bdfd..1c53f7fad5f7 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -657,6 +657,9 @@ int hibernate(void)
 			flags |= SF_PLATFORM_MODE;
 		if (nocompress)
 			flags |= SF_NOCOMPRESS_MODE;
+		else
+		        flags |= SF_CRC32_MODE;
+
 		pr_debug("PM: writing image.\n");
 		error = swsusp_write(flags);
 		swsusp_free();
diff --git a/kernel/power/power.h b/kernel/power/power.h
index e6206397ce67..23a2db1ec442 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -146,6 +146,7 @@ extern int swsusp_swap_in_use(void);
  */
 #define SF_PLATFORM_MODE	1
 #define SF_NOCOMPRESS_MODE	2
+#define SF_CRC32_MODE	        4
 
 /* kernel/power/hibernate.c */
 extern int swsusp_check(void);
diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index 7c97c3a0eee3..11a594c4ba25 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -27,6 +27,10 @@
 #include <linux/slab.h>
 #include <linux/lzo.h>
 #include <linux/vmalloc.h>
+#include <linux/cpumask.h>
+#include <linux/atomic.h>
+#include <linux/kthread.h>
+#include <linux/crc32.h>
 
 #include "power.h"
 
@@ -43,8 +47,7 @@
  *	allocated and populated one at a time, so we only need one memory
  *	page to set up the entire structure.
  *
- *	During resume we also only need to use one swap_map_page structure
- *	at a time.
+ *	During resume we pick up all swap_map_page structures into a list.
  */
 
 #define MAP_PAGE_ENTRIES	(PAGE_SIZE / sizeof(sector_t) - 1)
@@ -54,6 +57,11 @@ struct swap_map_page {
 	sector_t next_swap;
 };
 
+struct swap_map_page_list {
+	struct swap_map_page *map;
+	struct swap_map_page_list *next;
+};
+
 /**
  *	The swap_map_handle structure is used for handling swap in
  *	a file-alike way
@@ -61,13 +69,18 @@ struct swap_map_page {
 
 struct swap_map_handle {
 	struct swap_map_page *cur;
+	struct swap_map_page_list *maps;
 	sector_t cur_swap;
 	sector_t first_sector;
 	unsigned int k;
+	unsigned long nr_free_pages, written;
+	u32 crc32;
 };
 
 struct swsusp_header {
-	char reserved[PAGE_SIZE - 20 - sizeof(sector_t) - sizeof(int)];
+	char reserved[PAGE_SIZE - 20 - sizeof(sector_t) - sizeof(int) -
+	              sizeof(u32)];
+	u32	crc32;
 	sector_t image;
 	unsigned int flags;	/* Flags to pass to the "boot" kernel */
 	char	orig_sig[10];
@@ -199,6 +212,8 @@ static int mark_swapfiles(struct swap_map_handle *handle, unsigned int flags)
 		memcpy(swsusp_header->sig, HIBERNATE_SIG, 10);
 		swsusp_header->image = handle->first_sector;
 		swsusp_header->flags = flags;
+		if (flags & SF_CRC32_MODE)
+			swsusp_header->crc32 = handle->crc32;
 		error = hib_bio_write_page(swsusp_resume_block,
 					swsusp_header, NULL);
 	} else {
@@ -245,6 +260,7 @@ static int swsusp_swap_check(void)
 static int write_page(void *buf, sector_t offset, struct bio **bio_chain)
 {
 	void *src;
+	int ret;
 
 	if (!offset)
 		return -ENOSPC;
@@ -254,9 +270,17 @@ static int write_page(void *buf, sector_t offset, struct bio **bio_chain)
 		if (src) {
 			copy_page(src, buf);
 		} else {
-			WARN_ON_ONCE(1);
-			bio_chain = NULL;	/* Go synchronous */
-			src = buf;
+			ret = hib_wait_on_bio_chain(bio_chain); /* Free pages */
+			if (ret)
+				return ret;
+			src = (void *)__get_free_page(__GFP_WAIT | __GFP_HIGH);
+			if (src) {
+				copy_page(src, buf);
+			} else {
+				WARN_ON_ONCE(1);
+				bio_chain = NULL;	/* Go synchronous */
+				src = buf;
+			}
 		}
 	} else {
 		src = buf;
@@ -293,6 +317,8 @@ static int get_swap_writer(struct swap_map_handle *handle)
 		goto err_rel;
 	}
 	handle->k = 0;
+	handle->nr_free_pages = nr_free_pages() >> 1;
+	handle->written = 0;
 	handle->first_sector = handle->cur_swap;
 	return 0;
 err_rel:
@@ -316,20 +342,23 @@ static int swap_write_page(struct swap_map_handle *handle, void *buf,
 		return error;
 	handle->cur->entries[handle->k++] = offset;
 	if (handle->k >= MAP_PAGE_ENTRIES) {
-		error = hib_wait_on_bio_chain(bio_chain);
-		if (error)
-			goto out;
 		offset = alloc_swapdev_block(root_swap);
 		if (!offset)
 			return -ENOSPC;
 		handle->cur->next_swap = offset;
-		error = write_page(handle->cur, handle->cur_swap, NULL);
+		error = write_page(handle->cur, handle->cur_swap, bio_chain);
 		if (error)
 			goto out;
 		clear_page(handle->cur);
 		handle->cur_swap = offset;
 		handle->k = 0;
 	}
+	if (bio_chain && ++handle->written > handle->nr_free_pages) {
+		error = hib_wait_on_bio_chain(bio_chain);
+		if (error)
+			goto out;
+		handle->written = 0;
+	}
  out:
 	return error;
 }
@@ -372,6 +401,13 @@ static int swap_writer_finish(struct swap_map_handle *handle,
 			             LZO_HEADER, PAGE_SIZE)
 #define LZO_CMP_SIZE	(LZO_CMP_PAGES * PAGE_SIZE)
 
+/* Maximum number of threads for compression/decompression. */
+#define LZO_THREADS	3
+
+/* Maximum number of pages for read buffering. */
+#define LZO_READ_PAGES	(MAP_PAGE_ENTRIES * 8)
+
+
 /**
  *	save_image - save the suspend image data
  */
@@ -419,6 +455,92 @@ static int save_image(struct swap_map_handle *handle,
 	return ret;
 }
 
+/**
+ * Structure used for CRC32.
+ */
+struct crc_data {
+	struct task_struct *thr;                  /* thread */
+	atomic_t ready;                           /* ready to start flag */
+	atomic_t stop;                            /* ready to stop flag */
+	unsigned run_threads;                     /* nr current threads */
+	wait_queue_head_t go;                     /* start crc update */
+	wait_queue_head_t done;                   /* crc update done */
+	u32 *crc32;                               /* points to handle's crc32 */
+	size_t *unc_len[LZO_THREADS];             /* uncompressed lengths */
+	unsigned char *unc[LZO_THREADS];          /* uncompressed data */
+};
+
+/**
+ * CRC32 update function that runs in its own thread.
+ */
+static int crc32_threadfn(void *data)
+{
+	struct crc_data *d = data;
+	unsigned i;
+
+	while (1) {
+		wait_event(d->go, atomic_read(&d->ready) ||
+		                  kthread_should_stop());
+		if (kthread_should_stop()) {
+			d->thr = NULL;
+			atomic_set(&d->stop, 1);
+			wake_up(&d->done);
+			break;
+		}
+		atomic_set(&d->ready, 0);
+
+		for (i = 0; i < d->run_threads; i++)
+			*d->crc32 = crc32_le(*d->crc32,
+			                     d->unc[i], *d->unc_len[i]);
+		atomic_set(&d->stop, 1);
+		wake_up(&d->done);
+	}
+	return 0;
+}
+/**
+ * Structure used for LZO data compression.
+ */
+struct cmp_data {
+	struct task_struct *thr;                  /* thread */
+	atomic_t ready;                           /* ready to start flag */
+	atomic_t stop;                            /* ready to stop flag */
+	int ret;                                  /* return code */
+	wait_queue_head_t go;                     /* start compression */
+	wait_queue_head_t done;                   /* compression done */
+	size_t unc_len;                           /* uncompressed length */
+	size_t cmp_len;                           /* compressed length */
+	unsigned char unc[LZO_UNC_SIZE];          /* uncompressed buffer */
+	unsigned char cmp[LZO_CMP_SIZE];          /* compressed buffer */
+	unsigned char wrk[LZO1X_1_MEM_COMPRESS];  /* compression workspace */
+};
+
+/**
+ * Compression function that runs in its own thread.
+ */
+static int lzo_compress_threadfn(void *data)
+{
+	struct cmp_data *d = data;
+
+	while (1) {
+		wait_event(d->go, atomic_read(&d->ready) ||
+		                  kthread_should_stop());
+		if (kthread_should_stop()) {
+			d->thr = NULL;
+			d->ret = -1;
+			atomic_set(&d->stop, 1);
+			wake_up(&d->done);
+			break;
+		}
+		atomic_set(&d->ready, 0);
+
+		d->ret = lzo1x_1_compress(d->unc, d->unc_len,
+		                          d->cmp + LZO_HEADER, &d->cmp_len,
+		                          d->wrk);
+		atomic_set(&d->stop, 1);
+		wake_up(&d->done);
+	}
+	return 0;
+}
 
 /**
  * save_image_lzo - Save the suspend image data compressed with LZO.
@@ -437,42 +559,93 @@ static int save_image_lzo(struct swap_map_handle *handle,
 	struct bio *bio;
 	struct timeval start;
 	struct timeval stop;
-	size_t off, unc_len, cmp_len;
-	unsigned char *unc, *cmp, *wrk, *page;
+	size_t off;
+	unsigned thr, run_threads, nr_threads;
+	unsigned char *page = NULL;
+	struct cmp_data *data = NULL;
+	struct crc_data *crc = NULL;
+
+	/*
+	 * We'll limit the number of threads for compression to limit memory
+	 * footprint.
+	 */
+	nr_threads = num_online_cpus() - 1;
+	nr_threads = clamp_val(nr_threads, 1, LZO_THREADS);
 
 	page = (void *)__get_free_page(__GFP_WAIT | __GFP_HIGH);
 	if (!page) {
 		printk(KERN_ERR "PM: Failed to allocate LZO page\n");
-		return -ENOMEM;
+		ret = -ENOMEM;
+		goto out_clean;
 	}
 
-	wrk = vmalloc(LZO1X_1_MEM_COMPRESS);
-	if (!wrk) {
-		printk(KERN_ERR "PM: Failed to allocate LZO workspace\n");
-		free_page((unsigned long)page);
-		return -ENOMEM;
+	data = vmalloc(sizeof(*data) * nr_threads);
+	if (!data) {
+		printk(KERN_ERR "PM: Failed to allocate LZO data\n");
+		ret = -ENOMEM;
+		goto out_clean;
 	}
+	for (thr = 0; thr < nr_threads; thr++)
+		memset(&data[thr], 0, offsetof(struct cmp_data, go));
 
-	unc = vmalloc(LZO_UNC_SIZE);
-	if (!unc) {
-		printk(KERN_ERR "PM: Failed to allocate LZO uncompressed\n");
-		vfree(wrk);
-		free_page((unsigned long)page);
-		return -ENOMEM;
+	crc = kmalloc(sizeof(*crc), GFP_KERNEL);
+	if (!crc) {
+		printk(KERN_ERR "PM: Failed to allocate crc\n");
+		ret = -ENOMEM;
+		goto out_clean;
+	}
+	memset(crc, 0, offsetof(struct crc_data, go));
+
+	/*
+	 * Start the compression threads.
+	 */
+	for (thr = 0; thr < nr_threads; thr++) {
+		init_waitqueue_head(&data[thr].go);
+		init_waitqueue_head(&data[thr].done);
+
+		data[thr].thr = kthread_run(lzo_compress_threadfn,
+		                            &data[thr],
+		                            "image_compress/%u", thr);
+		if (IS_ERR(data[thr].thr)) {
+			data[thr].thr = NULL;
+			printk(KERN_ERR
+			       "PM: Cannot start compression threads\n");
+			ret = -ENOMEM;
+			goto out_clean;
+		}
 	}
 
-	cmp = vmalloc(LZO_CMP_SIZE);
-	if (!cmp) {
-		printk(KERN_ERR "PM: Failed to allocate LZO compressed\n");
-		vfree(unc);
-		vfree(wrk);
-		free_page((unsigned long)page);
-		return -ENOMEM;
+	/*
+	 * Adjust number of free pages after all allocations have been done.
+	 * We don't want to run out of pages when writing.
+	 */
+	handle->nr_free_pages = nr_free_pages() >> 1;
+
+	/*
+	 * Start the CRC32 thread.
+	 */
+	init_waitqueue_head(&crc->go);
+	init_waitqueue_head(&crc->done);
+
+	handle->crc32 = 0;
+	crc->crc32 = &handle->crc32;
+	for (thr = 0; thr < nr_threads; thr++) {
+		crc->unc[thr] = data[thr].unc;
+		crc->unc_len[thr] = &data[thr].unc_len;
+	}
+
+	crc->thr = kthread_run(crc32_threadfn, crc, "image_crc32");
+	if (IS_ERR(crc->thr)) {
+		crc->thr = NULL;
+		printk(KERN_ERR "PM: Cannot start CRC32 thread\n");
+		ret = -ENOMEM;
+		goto out_clean;
 	}
 
 	printk(KERN_INFO
+		"PM: Using %u thread(s) for compression.\n"
 		"PM: Compressing and saving image data (%u pages) ...     ",
-		nr_to_write);
+		nr_threads, nr_to_write);
 	m = nr_to_write / 100;
 	if (!m)
 		m = 1;
@@ -480,55 +653,83 @@ static int save_image_lzo(struct swap_map_handle *handle,
 	bio = NULL;
 	do_gettimeofday(&start);
 	for (;;) {
-		for (off = 0; off < LZO_UNC_SIZE; off += PAGE_SIZE) {
-			ret = snapshot_read_next(snapshot);
-			if (ret < 0)
-				goto out_finish;
-
-			if (!ret)
+		for (thr = 0; thr < nr_threads; thr++) {
+			for (off = 0; off < LZO_UNC_SIZE; off += PAGE_SIZE) {
+				ret = snapshot_read_next(snapshot);
+				if (ret < 0)
+					goto out_finish;
+
+				if (!ret)
+					break;
+
+				memcpy(data[thr].unc + off,
+				       data_of(*snapshot), PAGE_SIZE);
+
+				if (!(nr_pages % m))
+					printk(KERN_CONT "\b\b\b\b%3d%%",
+				               nr_pages / m);
+				nr_pages++;
+			}
+			if (!off)
 				break;
 
-			memcpy(unc + off, data_of(*snapshot), PAGE_SIZE);
+			data[thr].unc_len = off;
 
-			if (!(nr_pages % m))
-				printk(KERN_CONT "\b\b\b\b%3d%%", nr_pages / m);
-			nr_pages++;
+			atomic_set(&data[thr].ready, 1);
+			wake_up(&data[thr].go);
 		}
 
-		if (!off)
+		if (!thr)
 			break;
 
-		unc_len = off;
-		ret = lzo1x_1_compress(unc, unc_len,
-		                       cmp + LZO_HEADER, &cmp_len, wrk);
-		if (ret < 0) {
-			printk(KERN_ERR "PM: LZO compression failed\n");
-			break;
-		}
+		crc->run_threads = thr;
+		atomic_set(&crc->ready, 1);
+		wake_up(&crc->go);
 
-		if (unlikely(!cmp_len ||
-		             cmp_len > lzo1x_worst_compress(unc_len))) {
-			printk(KERN_ERR "PM: Invalid LZO compressed length\n");
-			ret = -1;
-			break;
-		}
+		for (run_threads = thr, thr = 0; thr < run_threads; thr++) {
+			wait_event(data[thr].done,
+			           atomic_read(&data[thr].stop));
+			atomic_set(&data[thr].stop, 0);
 
-		*(size_t *)cmp = cmp_len;
+			ret = data[thr].ret;
 
-		/*
-		 * Given we are writing one page at a time to disk, we copy
-		 * that much from the buffer, although the last bit will likely
-		 * be smaller than full page. This is OK - we saved the length
-		 * of the compressed data, so any garbage at the end will be
-		 * discarded when we read it.
-		 */
-		for (off = 0; off < LZO_HEADER + cmp_len; off += PAGE_SIZE) {
-			memcpy(page, cmp + off, PAGE_SIZE);
+			if (ret < 0) {
+				printk(KERN_ERR "PM: LZO compression failed\n");
+				goto out_finish;
+			}
 
-			ret = swap_write_page(handle, page, &bio);
-			if (ret)
+			if (unlikely(!data[thr].cmp_len ||
+			             data[thr].cmp_len >
+			             lzo1x_worst_compress(data[thr].unc_len))) {
+				printk(KERN_ERR
+				       "PM: Invalid LZO compressed length\n");
+				ret = -1;
 				goto out_finish;
+			}
+
+			*(size_t *)data[thr].cmp = data[thr].cmp_len;
+
+			/*
+			 * Given we are writing one page at a time to disk, we
+			 * copy that much from the buffer, although the last
+			 * bit will likely be smaller than full page. This is
+			 * OK - we saved the length of the compressed data, so
+			 * any garbage at the end will be discarded when we
+			 * read it.
+			 */
+			for (off = 0;
+			     off < LZO_HEADER + data[thr].cmp_len;
+			     off += PAGE_SIZE) {
+				memcpy(page, data[thr].cmp + off, PAGE_SIZE);
+
+				ret = swap_write_page(handle, page, &bio);
+				if (ret)
+					goto out_finish;
+			}
 		}
+
+		wait_event(crc->done, atomic_read(&crc->stop));
+		atomic_set(&crc->stop, 0);
 	}
 
 out_finish:
@@ -536,16 +737,25 @@ out_finish:
 	do_gettimeofday(&stop);
 	if (!ret)
 		ret = err2;
-	if (!ret)
+	if (!ret) {
 		printk(KERN_CONT "\b\b\b\bdone\n");
-	else
+	} else {
 		printk(KERN_CONT "\n");
+	}
 	swsusp_show_speed(&start, &stop, nr_to_write, "Wrote");
-
-	vfree(cmp);
-	vfree(unc);
-	vfree(wrk);
-	free_page((unsigned long)page);
+out_clean:
+	if (crc) {
+		if (crc->thr)
+			kthread_stop(crc->thr);
+		kfree(crc);
+	}
+	if (data) {
+		for (thr = 0; thr < nr_threads; thr++)
+			if (data[thr].thr)
+				kthread_stop(data[thr].thr);
+		vfree(data);
+	}
+	if (page) free_page((unsigned long)page);
 
 	return ret;
 }
@@ -625,8 +835,15 @@ out_finish:
 
 static void release_swap_reader(struct swap_map_handle *handle)
 {
-	if (handle->cur)
-		free_page((unsigned long)handle->cur);
+	struct swap_map_page_list *tmp;
+
+	while (handle->maps) {
+		if (handle->maps->map)
+			free_page((unsigned long)handle->maps->map);
+		tmp = handle->maps;
+		handle->maps = handle->maps->next;
+		kfree(tmp);
+	}
 	handle->cur = NULL;
 }
 
@@ -634,22 +851,46 @@ static int get_swap_reader(struct swap_map_handle *handle,
 		unsigned int *flags_p)
 {
 	int error;
+	struct swap_map_page_list *tmp, *last;
+	sector_t offset;
 
 	*flags_p = swsusp_header->flags;
 
 	if (!swsusp_header->image) /* how can this happen? */
 		return -EINVAL;
 
-	handle->cur = (struct swap_map_page *)get_zeroed_page(__GFP_WAIT | __GFP_HIGH);
-	if (!handle->cur)
-		return -ENOMEM;
+	handle->cur = NULL;
+	last = handle->maps = NULL;
+	offset = swsusp_header->image;
+	while (offset) {
+		tmp = kmalloc(sizeof(*handle->maps), GFP_KERNEL);
+		if (!tmp) {
+			release_swap_reader(handle);
+			return -ENOMEM;
+		}
+		memset(tmp, 0, sizeof(*tmp));
+		if (!handle->maps)
+			handle->maps = tmp;
+		if (last)
+			last->next = tmp;
+		last = tmp;
+
+		tmp->map = (struct swap_map_page *)
+		           __get_free_page(__GFP_WAIT | __GFP_HIGH);
+		if (!tmp->map) {
+			release_swap_reader(handle);
+			return -ENOMEM;
+		}
 
-	error = hib_bio_read_page(swsusp_header->image, handle->cur, NULL);
-	if (error) {
-		release_swap_reader(handle);
-		return error;
+		error = hib_bio_read_page(offset, tmp->map, NULL);
+		if (error) {
+			release_swap_reader(handle);
+			return error;
+		}
+		offset = tmp->map->next_swap;
 	}
 	handle->k = 0;
+	handle->cur = handle->maps->map;
 	return 0;
 }
 
@@ -658,6 +899,7 @@ static int swap_read_page(struct swap_map_handle *handle, void *buf,
 {
 	sector_t offset;
 	int error;
+	struct swap_map_page_list *tmp;
 
 	if (!handle->cur)
 		return -EINVAL;
@@ -668,13 +910,15 @@ static int swap_read_page(struct swap_map_handle *handle, void *buf,
 	if (error)
 		return error;
 	if (++handle->k >= MAP_PAGE_ENTRIES) {
-		error = hib_wait_on_bio_chain(bio_chain);
 		handle->k = 0;
-		offset = handle->cur->next_swap;
-		if (!offset)
+		free_page((unsigned long)handle->maps->map);
+		tmp = handle->maps;
+		handle->maps = handle->maps->next;
+		kfree(tmp);
+		if (!handle->maps)
 			release_swap_reader(handle);
-		else if (!error)
-			error = hib_bio_read_page(offset, handle->cur, NULL);
+		else
+			handle->cur = handle->maps->map;
 	}
 	return error;
 }
@@ -697,7 +941,7 @@ static int load_image(struct swap_map_handle *handle,
                       unsigned int nr_to_read)
 {
 	unsigned int m;
-	int error = 0;
+	int ret = 0;
 	struct timeval start;
 	struct timeval stop;
 	struct bio *bio;
@@ -713,15 +957,15 @@ static int load_image(struct swap_map_handle *handle,
 	bio = NULL;
 	do_gettimeofday(&start);
 	for ( ; ; ) {
-		error = snapshot_write_next(snapshot);
-		if (error <= 0)
+		ret = snapshot_write_next(snapshot);
+		if (ret <= 0)
 			break;
-		error = swap_read_page(handle, data_of(*snapshot), &bio);
-		if (error)
+		ret = swap_read_page(handle, data_of(*snapshot), &bio);
+		if (ret)
 			break;
 		if (snapshot->sync_read)
-			error = hib_wait_on_bio_chain(&bio);
-		if (error)
+			ret = hib_wait_on_bio_chain(&bio);
+		if (ret)
 			break;
 		if (!(nr_pages % m))
 			printk("\b\b\b\b%3d%%", nr_pages / m);
@@ -729,17 +973,61 @@ static int load_image(struct swap_map_handle *handle,
 	}
 	err2 = hib_wait_on_bio_chain(&bio);
 	do_gettimeofday(&stop);
-	if (!error)
-		error = err2;
-	if (!error) {
+	if (!ret)
+		ret = err2;
+	if (!ret) {
 		printk("\b\b\b\bdone\n");
 		snapshot_write_finalize(snapshot);
 		if (!snapshot_image_loaded(snapshot))
-			error = -ENODATA;
+			ret = -ENODATA;
 	} else
 		printk("\n");
 	swsusp_show_speed(&start, &stop, nr_to_read, "Read");
-	return error;
+	return ret;
+}
+
+/**
+ * Structure used for LZO data decompression.
+ */
+struct dec_data {
+	struct task_struct *thr;                  /* thread */
+	atomic_t ready;                           /* ready to start flag */
+	atomic_t stop;                            /* ready to stop flag */
+	int ret;                                  /* return code */
+	wait_queue_head_t go;                     /* start decompression */
+	wait_queue_head_t done;                   /* decompression done */
+	size_t unc_len;                           /* uncompressed length */
+	size_t cmp_len;                           /* compressed length */
+	unsigned char unc[LZO_UNC_SIZE];          /* uncompressed buffer */
+	unsigned char cmp[LZO_CMP_SIZE];          /* compressed buffer */
+};
+
+/**
+ * Deompression function that runs in its own thread.
+ */
+static int lzo_decompress_threadfn(void *data)
+{
+	struct dec_data *d = data;
+
+	while (1) {
+		wait_event(d->go, atomic_read(&d->ready) ||
+		                  kthread_should_stop());
+		if (kthread_should_stop()) {
+			d->thr = NULL;
+			d->ret = -1;
+			atomic_set(&d->stop, 1);
+			wake_up(&d->done);
+			break;
+		}
+		atomic_set(&d->ready, 0);
+
+		d->unc_len = LZO_UNC_SIZE;
+		d->ret = lzo1x_decompress_safe(d->cmp + LZO_HEADER, d->cmp_len,
+		                               d->unc, &d->unc_len);
+		atomic_set(&d->stop, 1);
+		wake_up(&d->done);
+	}
+	return 0;
 }
 
 /**
@@ -753,50 +1041,120 @@ static int load_image_lzo(struct swap_map_handle *handle,
                           unsigned int nr_to_read)
 {
 	unsigned int m;
-	int error = 0;
+	int ret = 0;
+	int eof = 0;
 	struct bio *bio;
 	struct timeval start;
 	struct timeval stop;
 	unsigned nr_pages;
-	size_t i, off, unc_len, cmp_len;
-	unsigned char *unc, *cmp, *page[LZO_CMP_PAGES];
-
-	for (i = 0; i < LZO_CMP_PAGES; i++) {
-		page[i] = (void *)__get_free_page(__GFP_WAIT | __GFP_HIGH);
-		if (!page[i]) {
-			printk(KERN_ERR "PM: Failed to allocate LZO page\n");
+	size_t off;
+	unsigned i, thr, run_threads, nr_threads;
+	unsigned ring = 0, pg = 0, ring_size = 0,
+	         have = 0, want, need, asked = 0;
+	unsigned long read_pages;
+	unsigned char **page = NULL;
+	struct dec_data *data = NULL;
+	struct crc_data *crc = NULL;
+
+	/*
+	 * We'll limit the number of threads for decompression to limit memory
+	 * footprint.
+	 */
+	nr_threads = num_online_cpus() - 1;
+	nr_threads = clamp_val(nr_threads, 1, LZO_THREADS);
+
+	page = vmalloc(sizeof(*page) * LZO_READ_PAGES);
+	if (!page) {
+		printk(KERN_ERR "PM: Failed to allocate LZO page\n");
+		ret = -ENOMEM;
+		goto out_clean;
+	}
 
-			while (i)
-				free_page((unsigned long)page[--i]);
+	data = vmalloc(sizeof(*data) * nr_threads);
+	if (!data) {
+		printk(KERN_ERR "PM: Failed to allocate LZO data\n");
+		ret = -ENOMEM;
+		goto out_clean;
+	}
+	for (thr = 0; thr < nr_threads; thr++)
+		memset(&data[thr], 0, offsetof(struct dec_data, go));
 
-			return -ENOMEM;
+	crc = kmalloc(sizeof(*crc), GFP_KERNEL);
+	if (!crc) {
+		printk(KERN_ERR "PM: Failed to allocate crc\n");
+		ret = -ENOMEM;
+		goto out_clean;
+	}
+	memset(crc, 0, offsetof(struct crc_data, go));
+
+	/*
+	 * Start the decompression threads.
+	 */
+	for (thr = 0; thr < nr_threads; thr++) {
+		init_waitqueue_head(&data[thr].go);
+		init_waitqueue_head(&data[thr].done);
+
+		data[thr].thr = kthread_run(lzo_decompress_threadfn,
+		                            &data[thr],
+		                            "image_decompress/%u", thr);
+		if (IS_ERR(data[thr].thr)) {
+			data[thr].thr = NULL;
+			printk(KERN_ERR
+			       "PM: Cannot start decompression threads\n");
+			ret = -ENOMEM;
+			goto out_clean;
 		}
 	}
 
-	unc = vmalloc(LZO_UNC_SIZE);
-	if (!unc) {
-		printk(KERN_ERR "PM: Failed to allocate LZO uncompressed\n");
-
-		for (i = 0; i < LZO_CMP_PAGES; i++)
-			free_page((unsigned long)page[i]);
-
-		return -ENOMEM;
+	/*
+	 * Start the CRC32 thread.
+	 */
+	init_waitqueue_head(&crc->go);
+	init_waitqueue_head(&crc->done);
+
+	handle->crc32 = 0;
+	crc->crc32 = &handle->crc32;
+	for (thr = 0; thr < nr_threads; thr++) {
+		crc->unc[thr] = data[thr].unc;
+		crc->unc_len[thr] = &data[thr].unc_len;
 	}
 
-	cmp = vmalloc(LZO_CMP_SIZE);
-	if (!cmp) {
-		printk(KERN_ERR "PM: Failed to allocate LZO compressed\n");
+	crc->thr = kthread_run(crc32_threadfn, crc, "image_crc32");
+	if (IS_ERR(crc->thr)) {
+		crc->thr = NULL;
+		printk(KERN_ERR "PM: Cannot start CRC32 thread\n");
+		ret = -ENOMEM;
+		goto out_clean;
+	}
 
-		vfree(unc);
-		for (i = 0; i < LZO_CMP_PAGES; i++)
-			free_page((unsigned long)page[i]);
+	/*
+	 * Adjust number of pages for read buffering, in case we are short.
+	 */
+	read_pages = (nr_free_pages() - snapshot_get_image_size()) >> 1;
+	read_pages = clamp_val(read_pages, LZO_CMP_PAGES, LZO_READ_PAGES);
 
-		return -ENOMEM;
+	for (i = 0; i < read_pages; i++) {
+		page[i] = (void *)__get_free_page(i < LZO_CMP_PAGES ?
+		                                  __GFP_WAIT | __GFP_HIGH :
+		                                  __GFP_WAIT);
+		if (!page[i]) {
+			if (i < LZO_CMP_PAGES) {
+				ring_size = i;
+				printk(KERN_ERR
+				       "PM: Failed to allocate LZO pages\n");
+				ret = -ENOMEM;
+				goto out_clean;
+			} else {
+				break;
+			}
+		}
 	}
+	want = ring_size = i;
 
 	printk(KERN_INFO
+		"PM: Using %u thread(s) for decompression.\n"
 		"PM: Loading and decompressing image data (%u pages) ...     ",
-		nr_to_read);
+		nr_threads, nr_to_read);
 	m = nr_to_read / 100;
 	if (!m)
 		m = 1;
@@ -804,85 +1162,189 @@ static int load_image_lzo(struct swap_map_handle *handle,
 	bio = NULL;
 	do_gettimeofday(&start);
 
-	error = snapshot_write_next(snapshot);
-	if (error <= 0)
+	ret = snapshot_write_next(snapshot);
+	if (ret <= 0)
 		goto out_finish;
 
-	for (;;) {
-		error = swap_read_page(handle, page[0], NULL); /* sync */
-		if (error)
-			break;
-
-		cmp_len = *(size_t *)page[0];
-		if (unlikely(!cmp_len ||
-		             cmp_len > lzo1x_worst_compress(LZO_UNC_SIZE))) {
-			printk(KERN_ERR "PM: Invalid LZO compressed length\n");
-			error = -1;
-			break;
+	for(;;) {
+		for (i = 0; !eof && i < want; i++) {
+			ret = swap_read_page(handle, page[ring], &bio);
+			if (ret) {
+				/*
+				 * On real read error, finish. On end of data,
+				 * set EOF flag and just exit the read loop.
+				 */
+				if (handle->cur &&
+				    handle->cur->entries[handle->k]) {
+					goto out_finish;
+				} else {
+					eof = 1;
+					break;
+				}
+			}
+			if (++ring >= ring_size)
+				ring = 0;
 		}
+		asked += i;
+		want -= i;
 
-		for (off = PAGE_SIZE, i = 1;
-		     off < LZO_HEADER + cmp_len; off += PAGE_SIZE, i++) {
-			error = swap_read_page(handle, page[i], &bio);
-			if (error)
+		/*
+		 * We are out of data, wait for some more.
+		 */
+		if (!have) {
+			if (!asked)
+				break;
+
+			ret = hib_wait_on_bio_chain(&bio);
+			if (ret)
 				goto out_finish;
+			have += asked;
+			asked = 0;
+			if (eof)
+				eof = 2;
 		}
 
-		error = hib_wait_on_bio_chain(&bio); /* need all data now */
-		if (error)
-			goto out_finish;
-
-		for (off = 0, i = 0;
-		     off < LZO_HEADER + cmp_len; off += PAGE_SIZE, i++) {
-			memcpy(cmp + off, page[i], PAGE_SIZE);
+		if (crc->run_threads) {
+			wait_event(crc->done, atomic_read(&crc->stop));
+			atomic_set(&crc->stop, 0);
+			crc->run_threads = 0;
 		}
 
-		unc_len = LZO_UNC_SIZE;
-		error = lzo1x_decompress_safe(cmp + LZO_HEADER, cmp_len,
-		                              unc, &unc_len);
-		if (error < 0) {
-			printk(KERN_ERR "PM: LZO decompression failed\n");
-			break;
+		for (thr = 0; have && thr < nr_threads; thr++) {
+			data[thr].cmp_len = *(size_t *)page[pg];
+			if (unlikely(!data[thr].cmp_len ||
+			             data[thr].cmp_len >
+			             lzo1x_worst_compress(LZO_UNC_SIZE))) {
+				printk(KERN_ERR
+				       "PM: Invalid LZO compressed length\n");
+				ret = -1;
+				goto out_finish;
+			}
+
+			need = DIV_ROUND_UP(data[thr].cmp_len + LZO_HEADER,
+			                    PAGE_SIZE);
+			if (need > have) {
+				if (eof > 1) {
+					ret = -1;
+					goto out_finish;
+				}
+				break;
+			}
+
+			for (off = 0;
+			     off < LZO_HEADER + data[thr].cmp_len;
+			     off += PAGE_SIZE) {
+				memcpy(data[thr].cmp + off,
+				       page[pg], PAGE_SIZE);
+				have--;
+				want++;
+				if (++pg >= ring_size)
+					pg = 0;
+			}
+
+			atomic_set(&data[thr].ready, 1);
+			wake_up(&data[thr].go);
 		}
 
-		if (unlikely(!unc_len ||
-		             unc_len > LZO_UNC_SIZE ||
-		             unc_len & (PAGE_SIZE - 1))) {
-			printk(KERN_ERR "PM: Invalid LZO uncompressed length\n");
-			error = -1;
-			break;
+		/*
+		 * Wait for more data while we are decompressing.
+		 */
+		if (have < LZO_CMP_PAGES && asked) {
+			ret = hib_wait_on_bio_chain(&bio);
+			if (ret)
+				goto out_finish;
+			have += asked;
+			asked = 0;
+			if (eof)
+				eof = 2;
 		}
 
-		for (off = 0; off < unc_len; off += PAGE_SIZE) {
-			memcpy(data_of(*snapshot), unc + off, PAGE_SIZE);
+		for (run_threads = thr, thr = 0; thr < run_threads; thr++) {
+			wait_event(data[thr].done,
+			           atomic_read(&data[thr].stop));
+			atomic_set(&data[thr].stop, 0);
+
+			ret = data[thr].ret;
 
-			if (!(nr_pages % m))
-				printk("\b\b\b\b%3d%%", nr_pages / m);
-			nr_pages++;
+			if (ret < 0) {
+				printk(KERN_ERR
+				       "PM: LZO decompression failed\n");
+				goto out_finish;
+			}
 
-			error = snapshot_write_next(snapshot);
-			if (error <= 0)
+			if (unlikely(!data[thr].unc_len ||
+			             data[thr].unc_len > LZO_UNC_SIZE ||
+			             data[thr].unc_len & (PAGE_SIZE - 1))) {
+				printk(KERN_ERR
+				       "PM: Invalid LZO uncompressed length\n");
+				ret = -1;
 				goto out_finish;
+			}
+
+			for (off = 0;
+			     off < data[thr].unc_len; off += PAGE_SIZE) {
+				memcpy(data_of(*snapshot),
+				       data[thr].unc + off, PAGE_SIZE);
+
+				if (!(nr_pages % m))
+					printk("\b\b\b\b%3d%%", nr_pages / m);
+				nr_pages++;
+
+				ret = snapshot_write_next(snapshot);
+				if (ret <= 0) {
+					crc->run_threads = thr + 1;
+					atomic_set(&crc->ready, 1);
+					wake_up(&crc->go);
+					goto out_finish;
+				}
+			}
 		}
+
+		crc->run_threads = thr;
+		atomic_set(&crc->ready, 1);
+		wake_up(&crc->go);
 	}
 
 out_finish:
+	if (crc->run_threads) {
+		wait_event(crc->done, atomic_read(&crc->stop));
+		atomic_set(&crc->stop, 0);
+	}
 	do_gettimeofday(&stop);
-	if (!error) {
+	if (!ret) {
 		printk("\b\b\b\bdone\n");
 		snapshot_write_finalize(snapshot);
 		if (!snapshot_image_loaded(snapshot))
-			error = -ENODATA;
+			ret = -ENODATA;
+		if (!ret) {
+			if (swsusp_header->flags & SF_CRC32_MODE) {
+				if(handle->crc32 != swsusp_header->crc32) {
+					printk(KERN_ERR
+					       "PM: Invalid image CRC32!\n");
+					ret = -ENODATA;
+				}
+			}
+		}
 	} else
 		printk("\n");
 	swsusp_show_speed(&start, &stop, nr_to_read, "Read");
-
-	vfree(cmp);
-	vfree(unc);
-	for (i = 0; i < LZO_CMP_PAGES; i++)
+out_clean:
+	for (i = 0; i < ring_size; i++)
 		free_page((unsigned long)page[i]);
+	if (crc) {
+		if (crc->thr)
+			kthread_stop(crc->thr);
+		kfree(crc);
+	}
+	if (data) {
+		for (thr = 0; thr < nr_threads; thr++)
+			if (data[thr].thr)
+				kthread_stop(data[thr].thr);
+		vfree(data);
+	}
+	if (page) vfree(page);
 
-	return error;
+	return ret;
 }
 
 /**
-- 
cgit v1.2.2


From 9bab0b7fbaceec47d32db51cd9e59c82fb071f5a Mon Sep 17 00:00:00 2001
From: Ian Campbell <ian.campbell@citrix.com>
Date: Mon, 3 Oct 2011 15:37:00 +0100
Subject: genirq: Add IRQF_RESUME_EARLY and resume such IRQs earlier

This adds a mechanism to resume selected IRQs during syscore_resume
instead of dpm_resume_noirq.

Under Xen we need to resume IRQs associated with IPIs early enough
that the resched IPI is unmasked and we can therefore schedule
ourselves out of the stop_machine where the suspend/resume takes
place.

This issue was introduced by 676dc3cf5bc3 "xen: Use IRQF_FORCE_RESUME".

Signed-off-by: Ian Campbell <ian.campbell@citrix.com>
Cc: Rafael J. Wysocki <rjw@sisk.pl>
Cc: Jeremy Fitzhardinge <Jeremy.Fitzhardinge@citrix.com>
Cc: xen-devel <xen-devel@lists.xensource.com>
Cc: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Link: http://lkml.kernel.org/r/1318713254.11016.52.camel@dagon.hellion.org.uk
Cc: stable@kernel.org (at least to 2.6.32.y)
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/irq/pm.c | 48 +++++++++++++++++++++++++++++++++++++++++-------
 1 file changed, 41 insertions(+), 7 deletions(-)

(limited to 'kernel')

diff --git a/kernel/irq/pm.c b/kernel/irq/pm.c
index f76fc00c9877..15e53b1766a6 100644
--- a/kernel/irq/pm.c
+++ b/kernel/irq/pm.c
@@ -9,6 +9,7 @@
 #include <linux/irq.h>
 #include <linux/module.h>
 #include <linux/interrupt.h>
+#include <linux/syscore_ops.h>
 
 #include "internals.h"
 
@@ -39,25 +40,58 @@ void suspend_device_irqs(void)
 }
 EXPORT_SYMBOL_GPL(suspend_device_irqs);
 
-/**
- * resume_device_irqs - enable interrupt lines disabled by suspend_device_irqs()
- *
- * Enable all interrupt lines previously disabled by suspend_device_irqs() that
- * have the IRQS_SUSPENDED flag set.
- */
-void resume_device_irqs(void)
+static void resume_irqs(bool want_early)
 {
 	struct irq_desc *desc;
 	int irq;
 
 	for_each_irq_desc(irq, desc) {
 		unsigned long flags;
+		bool is_early = desc->action &&
+			desc->action->flags & IRQF_EARLY_RESUME;
+
+		if (is_early != want_early)
+			continue;
 
 		raw_spin_lock_irqsave(&desc->lock, flags);
 		__enable_irq(desc, irq, true);
 		raw_spin_unlock_irqrestore(&desc->lock, flags);
 	}
 }
+
+/**
+ * irq_pm_syscore_ops - enable interrupt lines early
+ *
+ * Enable all interrupt lines with %IRQF_EARLY_RESUME set.
+ */
+static void irq_pm_syscore_resume(void)
+{
+	resume_irqs(true);
+}
+
+static struct syscore_ops irq_pm_syscore_ops = {
+	.resume		= irq_pm_syscore_resume,
+};
+
+static int __init irq_pm_init_ops(void)
+{
+	register_syscore_ops(&irq_pm_syscore_ops);
+	return 0;
+}
+
+device_initcall(irq_pm_init_ops);
+
+/**
+ * resume_device_irqs - enable interrupt lines disabled by suspend_device_irqs()
+ *
+ * Enable all non-%IRQF_EARLY_RESUME interrupt lines previously
+ * disabled by suspend_device_irqs() that have the IRQS_SUSPENDED flag
+ * set as well as those with %IRQF_FORCE_RESUME.
+ */
+void resume_device_irqs(void)
+{
+	resume_irqs(false);
+}
 EXPORT_SYMBOL_GPL(resume_device_irqs);
 
 /**
-- 
cgit v1.2.2


From 825de2e9007439977aed63771db570fc2235e8cd Mon Sep 17 00:00:00 2001
From: Nobuhiro Iwamatsu <nobuhiro.iwamatsu.yj@renesas.com>
Date: Mon, 17 Oct 2011 11:08:46 +0900
Subject: irq: Add EXPORT_SYMBOL_GPL to function of irq generic-chip

Some functions of irq generic-chip is undefined, because
EXPORT_SYMBOL_GPL is not set to these.

ERROR: "irq_setup_generic_chip" [drivers/gpio/gpio-pch.ko] undefined!
ERROR: "irq_alloc_generic_chip" [drivers/gpio/gpio-pch.ko] undefined!
ERROR: "irq_setup_generic_chip" [drivers/gpio/gpio-ml-ioh.ko] undefined!
ERROR: "irq_alloc_generic_chip" [drivers/gpio/gpio-ml-ioh.ko] undefined!

This is revised that EXPORT_SYMBOL_GPL can be added and referred
to in functions.

Signed-off-by: Nobuhiro Iwamatsu <nobuhiro.iwamatsu.yj@renesas.com>
Acked-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Grant Likely <grant.likely@secretlab.ca>
---
 kernel/irq/generic-chip.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'kernel')

diff --git a/kernel/irq/generic-chip.c b/kernel/irq/generic-chip.c
index e38544dddb18..6cb7613e4bf4 100644
--- a/kernel/irq/generic-chip.c
+++ b/kernel/irq/generic-chip.c
@@ -211,6 +211,7 @@ irq_alloc_generic_chip(const char *name, int num_ct, unsigned int irq_base,
 	}
 	return gc;
 }
+EXPORT_SYMBOL_GPL(irq_alloc_generic_chip);
 
 /*
  * Separate lockdep class for interrupt chip which can nest irq_desc
@@ -258,6 +259,7 @@ void irq_setup_generic_chip(struct irq_chip_generic *gc, u32 msk,
 	}
 	gc->irq_cnt = i - gc->irq_base;
 }
+EXPORT_SYMBOL_GPL(irq_setup_generic_chip);
 
 /**
  * irq_setup_alt_chip - Switch to alternative chip
@@ -281,6 +283,7 @@ int irq_setup_alt_chip(struct irq_data *d, unsigned int type)
 	}
 	return -EINVAL;
 }
+EXPORT_SYMBOL_GPL(irq_setup_alt_chip);
 
 /**
  * irq_remove_generic_chip - Remove a chip
@@ -311,6 +314,7 @@ void irq_remove_generic_chip(struct irq_chip_generic *gc, u32 msk,
 		irq_modify_status(i, clr, set);
 	}
 }
+EXPORT_SYMBOL_GPL(irq_remove_generic_chip);
 
 #ifdef CONFIG_PM
 static int irq_gc_suspend(void)
-- 
cgit v1.2.2


From 189c3fd68c7016e37c1ffd7a00009e2c944a9d06 Mon Sep 17 00:00:00 2001
From: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
Date: Fri, 30 Sep 2011 09:10:54 -0700
Subject: stop_machine: make stop_machine safe and efficient to call early

Make stop_machine() safe to call early in boot, before stop_machine()
has been set up, by simply calling the callback function directly if
there's only one CPU online.

[ Fixes from AKPM:
   - add comment
   - local_irq_flags, not save_flags
   - also call hard_irq_disable() for systems which need it

  Tejun suggested using an explicit flag rather than just looking at
  the online cpu count. ]

Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
Acked-by: Tejun Heo <tj@kernel.org>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Rusty Russell <rusty@rustcorp.com.au>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: H. Peter Anvin <hpa@linux.intel.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/stop_machine.c | 22 ++++++++++++++++++++++
 1 file changed, 22 insertions(+)

(limited to 'kernel')

diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index ba5070ce5765..d3f960a6ca56 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -41,6 +41,7 @@ struct cpu_stopper {
 };
 
 static DEFINE_PER_CPU(struct cpu_stopper, cpu_stopper);
+static bool stop_machine_initialized = false;
 
 static void cpu_stop_init_done(struct cpu_stop_done *done, unsigned int nr_todo)
 {
@@ -386,6 +387,8 @@ static int __init cpu_stop_init(void)
 	cpu_stop_cpu_callback(&cpu_stop_cpu_notifier, CPU_ONLINE, bcpu);
 	register_cpu_notifier(&cpu_stop_cpu_notifier);
 
+	stop_machine_initialized = true;
+
 	return 0;
 }
 early_initcall(cpu_stop_init);
@@ -485,6 +488,25 @@ int __stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus)
 					    .num_threads = num_online_cpus(),
 					    .active_cpus = cpus };
 
+	if (!stop_machine_initialized) {
+		/*
+		 * Handle the case where stop_machine() is called
+		 * early in boot before stop_machine() has been
+		 * initialized.
+		 */
+ 		unsigned long flags;
+		int ret;
+
+		WARN_ON_ONCE(smdata.num_threads != 1);
+
+		local_irq_save(flags);
+		hard_irq_disable();
+		ret = (*fn)(data);
+		local_irq_restore(flags);
+
+		return ret;
+	}
+
 	/* Set the initial state and stop all online cpus. */
 	set_state(&smdata, STOPMACHINE_PREPARE);
 	return stop_cpus(cpu_online_mask, stop_machine_cpu_stop, &smdata);
-- 
cgit v1.2.2


From 37348804e0289087d21ae8bff4c0732030a3c6ac Mon Sep 17 00:00:00 2001
From: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
Date: Thu, 29 Sep 2011 11:10:05 -0700
Subject: jump_label: if a key has already been initialized, don't nop it out

If a key has been enabled before jump_label_init() is called, don't
nop it out.

This removes arch_jump_label_text_poke_early() (which can only nop
out a site) and uses arch_jump_label_transform() instead.

Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
Acked-by: Jason Baron <jbaron@redhat.com>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
---
 kernel/jump_label.c | 20 ++++++++------------
 1 file changed, 8 insertions(+), 12 deletions(-)

(limited to 'kernel')

diff --git a/kernel/jump_label.c b/kernel/jump_label.c
index a8ce45097f3d..059202d5b77a 100644
--- a/kernel/jump_label.c
+++ b/kernel/jump_label.c
@@ -121,13 +121,6 @@ static void __jump_label_update(struct jump_label_key *key,
 	}
 }
 
-/*
- * Not all archs need this.
- */
-void __weak arch_jump_label_text_poke_early(jump_label_t addr)
-{
-}
-
 static __init int jump_label_init(void)
 {
 	struct jump_entry *iter_start = __start___jump_table;
@@ -139,12 +132,15 @@ static __init int jump_label_init(void)
 	jump_label_sort_entries(iter_start, iter_stop);
 
 	for (iter = iter_start; iter < iter_stop; iter++) {
-		arch_jump_label_text_poke_early(iter->code);
-		if (iter->key == (jump_label_t)(unsigned long)key)
+		struct jump_label_key *iterk;
+
+		iterk = (struct jump_label_key *)(unsigned long)iter->key;
+		arch_jump_label_transform(iter, jump_label_enabled(iterk) ?
+					  JUMP_LABEL_ENABLE : JUMP_LABEL_DISABLE);
+		if (iterk == key)
 			continue;
 
-		key = (struct jump_label_key *)(unsigned long)iter->key;
-		atomic_set(&key->enabled, 0);
+		key = iterk;
 		key->entries = iter;
 #ifdef CONFIG_MODULES
 		key->next = NULL;
@@ -212,7 +208,7 @@ void jump_label_apply_nops(struct module *mod)
 		return;
 
 	for (iter = iter_start; iter < iter_stop; iter++)
-		arch_jump_label_text_poke_early(iter->code);
+		arch_jump_label_transform(iter, JUMP_LABEL_DISABLE);
 }
 
 static int jump_label_add_module(struct module *mod)
-- 
cgit v1.2.2


From 20284aa77c0f6227da4783a920b72dc61d4bcc09 Mon Sep 17 00:00:00 2001
From: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
Date: Mon, 3 Oct 2011 11:01:46 -0700
Subject: jump_label: add arch_jump_label_transform_static() to optimise
 non-live code updates

When updating a newly loaded module, the code is definitely not yet
executing on any processor, so it can be updated with no need for any
heavyweight synchronization.

This patch adds arch_jump_label_static() which is implemented as
arch_jump_label_transform() by default, but architectures can override
it if it avoids, say, a call to stop_machine().

Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
Acked-by: Jason Baron <jbaron@redhat.com>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
---
 kernel/jump_label.c | 18 +++++++++++++++---
 1 file changed, 15 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/jump_label.c b/kernel/jump_label.c
index 059202d5b77a..ff2028f35aa8 100644
--- a/kernel/jump_label.c
+++ b/kernel/jump_label.c
@@ -104,6 +104,18 @@ static int __jump_label_text_reserved(struct jump_entry *iter_start,
 	return 0;
 }
 
+/* 
+ * Update code which is definitely not currently executing.
+ * Architectures which need heavyweight synchronization to modify
+ * running code can override this to make the non-live update case
+ * cheaper.
+ */
+void __weak arch_jump_label_transform_static(struct jump_entry *entry,
+					    enum jump_label_type type)
+{
+	arch_jump_label_transform(entry, type);	
+}
+
 static void __jump_label_update(struct jump_label_key *key,
 				struct jump_entry *entry,
 				struct jump_entry *stop, int enable)
@@ -135,8 +147,8 @@ static __init int jump_label_init(void)
 		struct jump_label_key *iterk;
 
 		iterk = (struct jump_label_key *)(unsigned long)iter->key;
-		arch_jump_label_transform(iter, jump_label_enabled(iterk) ?
-					  JUMP_LABEL_ENABLE : JUMP_LABEL_DISABLE);
+		arch_jump_label_transform_static(iter, jump_label_enabled(iterk) ?
+						 JUMP_LABEL_ENABLE : JUMP_LABEL_DISABLE);
 		if (iterk == key)
 			continue;
 
@@ -208,7 +220,7 @@ void jump_label_apply_nops(struct module *mod)
 		return;
 
 	for (iter = iter_start; iter < iter_stop; iter++)
-		arch_jump_label_transform(iter, JUMP_LABEL_DISABLE);
+		arch_jump_label_transform_static(iter, JUMP_LABEL_DISABLE);
 }
 
 static int jump_label_add_module(struct module *mod)
-- 
cgit v1.2.2


From 97ce2c88f9ad42e3c60a9beb9fca87abf3639faa Mon Sep 17 00:00:00 2001
From: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
Date: Wed, 12 Oct 2011 16:17:54 -0700
Subject: jump-label: initialize jump-label subsystem much earlier

Initialize jump_labels much, much earlier, so they're available for use
during system setup.

Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
---
 kernel/jump_label.c | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/jump_label.c b/kernel/jump_label.c
index ff2028f35aa8..bbdfe2a462a0 100644
--- a/kernel/jump_label.c
+++ b/kernel/jump_label.c
@@ -133,7 +133,7 @@ static void __jump_label_update(struct jump_label_key *key,
 	}
 }
 
-static __init int jump_label_init(void)
+void __init jump_label_init(void)
 {
 	struct jump_entry *iter_start = __start___jump_table;
 	struct jump_entry *iter_stop = __stop___jump_table;
@@ -159,10 +159,7 @@ static __init int jump_label_init(void)
 #endif
 	}
 	jump_label_unlock();
-
-	return 0;
 }
-early_initcall(jump_label_init);
 
 #ifdef CONFIG_MODULES
 
-- 
cgit v1.2.2


From 37252db6aa576c34fd794a5a54fb32d7a8b3a07a Mon Sep 17 00:00:00 2001
From: Jiri Kosina <jkosina@suse.cz>
Date: Wed, 26 Oct 2011 13:10:39 +1030
Subject: kmod: prevent kmod_loop_msg overflow in __request_module()

Due to post-increment in condition of kmod_loop_msg in __request_module(),
the system log can be spammed by much more than 5 instances of the 'runaway
loop' message if the number of events triggering it makes the kmod_loop_msg
to overflow.

Fix that by making sure we never increment it past the threshold.

Signed-off-by: Jiri Kosina <jkosina@suse.cz>
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
CC: stable@kernel.org
---
 kernel/kmod.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/kmod.c b/kernel/kmod.c
index ddc7644c1305..a4bea97c75b6 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -114,10 +114,12 @@ int __request_module(bool wait, const char *fmt, ...)
 	atomic_inc(&kmod_concurrent);
 	if (atomic_read(&kmod_concurrent) > max_modprobes) {
 		/* We may be blaming an innocent here, but unlikely */
-		if (kmod_loop_msg++ < 5)
+		if (kmod_loop_msg < 5) {
 			printk(KERN_ERR
 			       "request_module: runaway loop modprobe %s\n",
 			       module_name);
+			kmod_loop_msg++;
+		}
 		atomic_dec(&kmod_concurrent);
 		return -ENOMEM;
 	}
-- 
cgit v1.2.2


From b1e4d20cbf2ef8e27515da032b95fdcbb5b06bf1 Mon Sep 17 00:00:00 2001
From: Michal Schmidt <mschmidt@redhat.com>
Date: Mon, 10 Oct 2011 00:03:37 +0200
Subject: params: make dashes and underscores in parameter names truly equal

The user may use "foo-bar" for a kernel parameter defined as "foo_bar".
Make sure it works the other way around too.

Apply the equality of dashes and underscores on early_params and __setup
params as well.

The example given in Documentation/kernel-parameters.txt indicates that
this is the intended behaviour.

With the patch the kernel accepts "log-buf-len=1M" as expected.
https://bugzilla.redhat.com/show_bug.cgi?id=744545

Signed-off-by: Michal Schmidt <mschmidt@redhat.com>
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au> (neatened implementations)
---
 kernel/params.c | 21 ++++++++++++++-------
 1 file changed, 14 insertions(+), 7 deletions(-)

(limited to 'kernel')

diff --git a/kernel/params.c b/kernel/params.c
index 22df3e0d142a..821788947e40 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -67,20 +67,27 @@ static void maybe_kfree_parameter(void *param)
 	}
 }
 
-static inline char dash2underscore(char c)
+static char dash2underscore(char c)
 {
 	if (c == '-')
 		return '_';
 	return c;
 }
 
-static inline int parameq(const char *input, const char *paramname)
+bool parameqn(const char *a, const char *b, size_t n)
 {
-	unsigned int i;
-	for (i = 0; dash2underscore(input[i]) == paramname[i]; i++)
-		if (input[i] == '\0')
-			return 1;
-	return 0;
+	size_t i;
+
+	for (i = 0; i < n; i++) {
+		if (dash2underscore(a[i]) != dash2underscore(b[i]))
+			return false;
+	}
+	return true;
+}
+
+bool parameq(const char *a, const char *b)
+{
+	return parameqn(a, b, strlen(a)+1);
 }
 
 static int parse_one(char *param,
-- 
cgit v1.2.2


From 3d214faea6e4f9b6018bf8589f4b245126349c0a Mon Sep 17 00:00:00 2001
From: Michael Holzheu <holzheu@linux.vnet.ibm.com>
Date: Sun, 30 Oct 2011 15:16:36 +0100
Subject: [S390] kdump: Add KEXEC_CRASH_CONTROL_MEMORY_LIMIT

On s390 there is a different KEXEC_CONTROL_MEMORY_LIMIT for the normal and
the kdump kexec case. Therefore this patch introduces a new macro
KEXEC_CRASH_CONTROL_MEMORY_LIMIT. This is set to
KEXEC_CONTROL_MEMORY_LIMIT for all architectures that do not define
KEXEC_CRASH_CONTROL_MEMORY_LIMIT.

Acked-by: Vivek Goyal <vgoyal@redhat.com>
Acked-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Michael Holzheu <holzheu@linux.vnet.ibm.com>
Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
---
 kernel/kexec.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/kexec.c b/kernel/kexec.c
index 296fbc84d659..7204fb982ed5 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -498,7 +498,7 @@ static struct page *kimage_alloc_crash_control_pages(struct kimage *image,
 	while (hole_end <= crashk_res.end) {
 		unsigned long i;
 
-		if (hole_end > KEXEC_CONTROL_MEMORY_LIMIT)
+		if (hole_end > KEXEC_CRASH_CONTROL_MEMORY_LIMIT)
 			break;
 		if (hole_end > crashk_res.end)
 			break;
-- 
cgit v1.2.2


From d3bf37955d46718ee1a7f1fc69f953d2328ba7c2 Mon Sep 17 00:00:00 2001
From: Michael Holzheu <holzheu@linux.vnet.ibm.com>
Date: Sun, 30 Oct 2011 15:16:37 +0100
Subject: [S390] kdump: Add size to elfcorehdr kernel parameter

Currently only the address of the pre-allocated ELF header is passed with
the elfcorehdr= kernel parameter. In order to reserve memory for the header
in the 2nd kernel also the size is required. Current kdump architecture
backends use different methods to do that, e.g. x86 uses the memmap= kernel
parameter. On s390 there is no easy way to transfer this information.
Therefore the elfcorehdr kernel parameter is extended to also pass the size.
This now can also be used as standard mechanism by all future kdump
architecture backends.

The syntax of the kernel parameter is extended as follows:

elfcorehdr=[size[KMG]@]offset[KMG]

This change is backward compatible because elfcorehdr=size is still allowed.

Acked-by: Vivek Goyal <vgoyal@redhat.com>
Acked-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Michael Holzheu <holzheu@linux.vnet.ibm.com>
Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
---
 kernel/crash_dump.c | 11 +++++++++++
 1 file changed, 11 insertions(+)

(limited to 'kernel')

diff --git a/kernel/crash_dump.c b/kernel/crash_dump.c
index 5f85690285d4..69ebf3380bac 100644
--- a/kernel/crash_dump.c
+++ b/kernel/crash_dump.c
@@ -19,9 +19,16 @@ unsigned long saved_max_pfn;
  */
 unsigned long long elfcorehdr_addr = ELFCORE_ADDR_MAX;
 
+/*
+ * stores the size of elf header of crash image
+ */
+unsigned long long elfcorehdr_size;
+
 /*
  * elfcorehdr= specifies the location of elf core header stored by the crashed
  * kernel. This option will be passed by kexec loader to the capture kernel.
+ *
+ * Syntax: elfcorehdr=[size[KMG]@]offset[KMG]
  */
 static int __init setup_elfcorehdr(char *arg)
 {
@@ -29,6 +36,10 @@ static int __init setup_elfcorehdr(char *arg)
 	if (!arg)
 		return -EINVAL;
 	elfcorehdr_addr = memparse(arg, &end);
+	if (*end == '@') {
+		elfcorehdr_size = elfcorehdr_addr;
+		elfcorehdr_addr = memparse(end + 1, &end);
+	}
 	return end > arg ? 0 : -EINVAL;
 }
 early_param("elfcorehdr", setup_elfcorehdr);
-- 
cgit v1.2.2


From fa8ff292bb4844cc0b66b7b24d611eb7177f7ded Mon Sep 17 00:00:00 2001
From: Michael Holzheu <holzheu@linux.vnet.ibm.com>
Date: Sun, 30 Oct 2011 15:16:41 +0100
Subject: [S390] kdump: Initialize vmcoreinfo note at startup

Currently the vmcoreinfo note is only initialized in case of kdump. On s390
it is possible to create kernel dumps with other dump mechanisms than kdump
(e.g. via hypervisor dump or stand-alone dump tools). For those dumps it
would also be desirable to include the vmcoreinfo data. To accomplish this,
with this patch the vmcoreinfo ELF note is always initialized, not only in
case of a (kdump) crash. On s390 we will add an ABI defined pointer at
a well known address to vmcoreinfo so that dump analysis tools are able to
find this information.

In particular on s390 we have a tool named zgetdump. With this tool it is
possible to convert dump formats on the fly using fuse. E.g. you can mount a
s390 stand-alone dump as ELF dump. When this is done, the tool finds the
vmcoreinfo in the stand-alone dump via the well known ABI defined address and
it creates the respective VMCOREINFO ELF note in the output ELF dump. This then
can be used e.g. by makedumpfile for dump filtering.  No more need for a
vmlinux file with debug information.

So this will look like the following:
$ zgetdump --mount standalone.dump -f elf /mnt
$ ls /mnt
  dump.elf
$ readelf -n /mnt/dump.elf
$ ...
  VMCOREINFO            0x00000474      Unknown note type: (0x00000000)
$ makedumpfile -c -d 31 /mnt/dump.elf dump.kdump

Cc: Andrew Morton <akpm@linux-foundation.org>
Acked-by: Vivek Goyal <vgoyal@redhat.com>
Signed-off-by: Michael Holzheu <holzheu@linux.vnet.ibm.com>
Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
---
 kernel/kexec.c | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

(limited to 'kernel')

diff --git a/kernel/kexec.c b/kernel/kexec.c
index 7204fb982ed5..d3b8a4ceb90b 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -1380,24 +1380,23 @@ int __init parse_crashkernel(char 		 *cmdline,
 }
 
 
-
-void crash_save_vmcoreinfo(void)
+static void update_vmcoreinfo_note(void)
 {
-	u32 *buf;
+	u32 *buf = vmcoreinfo_note;
 
 	if (!vmcoreinfo_size)
 		return;
-
-	vmcoreinfo_append_str("CRASHTIME=%ld", get_seconds());
-
-	buf = (u32 *)vmcoreinfo_note;
-
 	buf = append_elf_note(buf, VMCOREINFO_NOTE_NAME, 0, vmcoreinfo_data,
 			      vmcoreinfo_size);
-
 	final_note(buf);
 }
 
+void crash_save_vmcoreinfo(void)
+{
+	vmcoreinfo_append_str("CRASHTIME=%ld", get_seconds());
+	update_vmcoreinfo_note();
+}
+
 void vmcoreinfo_append_str(const char *fmt, ...)
 {
 	va_list args;
@@ -1483,6 +1482,7 @@ static int __init crash_save_vmcoreinfo_init(void)
 	VMCOREINFO_NUMBER(PG_swapcache);
 
 	arch_crash_save_vmcoreinfo();
+	update_vmcoreinfo_note();
 
 	return 0;
 }
-- 
cgit v1.2.2


From 558df7209e7997275f6b8ad37737494cf2da1512 Mon Sep 17 00:00:00 2001
From: Michael Holzheu <holzheu@linux.vnet.ibm.com>
Date: Sun, 30 Oct 2011 15:16:43 +0100
Subject: [S390] kdump: Add infrastructure for unmapping crashkernel memory

This patch introduces a mechanism that allows architecture backends to
remove page tables for the crashkernel memory. This can protect the loaded
kdump kernel from being overwritten by broken kernel code.  Two new
functions crash_map_reserved_pages() and crash_unmap_reserved_pages() are
added that can be implemented by architecture code.  The
crash_map_reserved_pages() function is called before and
crash_unmap_reserved_pages() after the crashkernel segments are loaded.  The
functions are also called in crash_shrink_memory() to create/remove page
tables when the crashkernel memory size is reduced.

To support architectures that have large pages this patch also introduces
a new define KEXEC_CRASH_MEM_ALIGN. The crashkernel start and size must
always be aligned with KEXEC_CRASH_MEM_ALIGN.

Cc: Andrew Morton <akpm@linux-foundation.org>
Acked-by: Vivek Goyal <vgoyal@redhat.com>
Signed-off-by: Michael Holzheu <holzheu@linux.vnet.ibm.com>
Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
---
 kernel/kexec.c | 21 +++++++++++++++++++--
 1 file changed, 19 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/kexec.c b/kernel/kexec.c
index d3b8a4ceb90b..dc7bc0829286 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -999,6 +999,7 @@ SYSCALL_DEFINE4(kexec_load, unsigned long, entry, unsigned long, nr_segments,
 			kimage_free(xchg(&kexec_crash_image, NULL));
 			result = kimage_crash_alloc(&image, entry,
 						     nr_segments, segments);
+			crash_map_reserved_pages();
 		}
 		if (result)
 			goto out;
@@ -1015,6 +1016,8 @@ SYSCALL_DEFINE4(kexec_load, unsigned long, entry, unsigned long, nr_segments,
 				goto out;
 		}
 		kimage_terminate(image);
+		if (flags & KEXEC_ON_CRASH)
+			crash_unmap_reserved_pages();
 	}
 	/* Install the new kernel, and  Uninstall the old */
 	image = xchg(dest_image, image);
@@ -1026,6 +1029,18 @@ out:
 	return result;
 }
 
+/*
+ * Add and remove page tables for crashkernel memory
+ *
+ * Provide an empty default implementation here -- architecture
+ * code may override this
+ */
+void __weak crash_map_reserved_pages(void)
+{}
+
+void __weak crash_unmap_reserved_pages(void)
+{}
+
 #ifdef CONFIG_COMPAT
 asmlinkage long compat_sys_kexec_load(unsigned long entry,
 				unsigned long nr_segments,
@@ -1134,14 +1149,16 @@ int crash_shrink_memory(unsigned long new_size)
 		goto unlock;
 	}
 
-	start = roundup(start, PAGE_SIZE);
-	end = roundup(start + new_size, PAGE_SIZE);
+	start = roundup(start, KEXEC_CRASH_MEM_ALIGN);
+	end = roundup(start + new_size, KEXEC_CRASH_MEM_ALIGN);
 
+	crash_map_reserved_pages();
 	crash_free_reserved_phys_range(end, crashk_res.end);
 
 	if ((start == end) && (crashk_res.parent != NULL))
 		release_resource(&crashk_res);
 	crashk_res.end = end - 1;
+	crash_unmap_reserved_pages();
 
 unlock:
 	mutex_unlock(&kexec_mutex);
-- 
cgit v1.2.2


From 638ad34a8811119b32247b7722288ef8b90907d1 Mon Sep 17 00:00:00 2001
From: Martin Schwidefsky <schwidefsky@de.ibm.com>
Date: Sun, 30 Oct 2011 15:17:13 +0100
Subject: [S390] sparse: fix sparse warnings about missing prototypes

Add prototypes and includes for functions used in different modules.

Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
---
 kernel/sysctl.c | 8 --------
 1 file changed, 8 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 2d2ecdcc8cdb..2fe2bc2a57ea 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -151,14 +151,6 @@ extern int pwrsw_enabled;
 extern int unaligned_enabled;
 #endif
 
-#ifdef CONFIG_S390
-#ifdef CONFIG_MATHEMU
-extern int sysctl_ieee_emulation_warnings;
-#endif
-extern int sysctl_userprocess_debug;
-extern int spin_retry;
-#endif
-
 #ifdef CONFIG_IA64
 extern int no_unaligned_warning;
 extern int unaligned_dump_stack;
-- 
cgit v1.2.2


From 6d274309d0e64bdbdb6c50945ca2964596e8fa5a Mon Sep 17 00:00:00 2001
From: Rob Herring <rob.herring@calxeda.com>
Date: Fri, 30 Sep 2011 10:48:38 -0500
Subject: irq: support domains with non-zero hwirq base

Interrupt controllers can have non-zero starting value for h/w irq numbers.
Adding support in irq_domain allows the domain hwirq numbering to match
the interrupt controllers' numbering.

As this makes looping over irqs for a domain more complicated, add loop
iterators to iterate over all hwirqs and irqs for a domain.

Signed-off-by: Rob Herring <rob.herring@calxeda.com>
Reviewed-by: Jamie Iles <jamie@jamieiles.com>
Tested-by: Thomas Abraham <thomas.abraham@linaro.org>
Acked-by: Grant Likely <grant.likely@secretlab.ca>
Acked-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/irq/irqdomain.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c
index b57a3776de44..200ce832c585 100644
--- a/kernel/irq/irqdomain.c
+++ b/kernel/irq/irqdomain.c
@@ -20,15 +20,15 @@ static DEFINE_MUTEX(irq_domain_mutex);
 void irq_domain_add(struct irq_domain *domain)
 {
 	struct irq_data *d;
-	int hwirq;
+	int hwirq, irq;
 
 	/*
 	 * This assumes that the irq_domain owner has already allocated
 	 * the irq_descs.  This block will be removed when support for dynamic
 	 * allocation of irq_descs is added to irq_domain.
 	 */
-	for (hwirq = 0; hwirq < domain->nr_irq; hwirq++) {
-		d = irq_get_irq_data(irq_domain_to_irq(domain, hwirq));
+	irq_domain_for_each_irq(domain, hwirq, irq) {
+		d = irq_get_irq_data(irq);
 		if (!d) {
 			WARN(1, "error: assigning domain to non existant irq_desc");
 			return;
@@ -54,15 +54,15 @@ void irq_domain_add(struct irq_domain *domain)
 void irq_domain_del(struct irq_domain *domain)
 {
 	struct irq_data *d;
-	int hwirq;
+	int hwirq, irq;
 
 	mutex_lock(&irq_domain_mutex);
 	list_del(&domain->list);
 	mutex_unlock(&irq_domain_mutex);
 
 	/* Clear the irq_domain assignments */
-	for (hwirq = 0; hwirq < domain->nr_irq; hwirq++) {
-		d = irq_get_irq_data(irq_domain_to_irq(domain, hwirq));
+	irq_domain_for_each_irq(domain, hwirq, irq) {
+		d = irq_get_irq_data(irq);
 		d->domain = NULL;
 	}
 }
-- 
cgit v1.2.2


From 9a418455134f5dc23f124d2818b2e8e1cea997a1 Mon Sep 17 00:00:00 2001
From: Paul Gortmaker <paul.gortmaker@windriver.com>
Date: Wed, 25 May 2011 09:52:21 -0400
Subject: range: fix bogus misuse of module.h to get printk()

This file isn't doing anything with modules and so it should
not be including <linux/module.h> just to get basic stuff
like printk() and min/max.  Revector it to <linux/kernel.h>.

Signed-off-by: Paul Gortmaker <paul.gortmaker@windriver.com>
---
 kernel/range.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/range.c b/kernel/range.c
index 37fa9b99ad58..9b8ae2d6ed68 100644
--- a/kernel/range.c
+++ b/kernel/range.c
@@ -1,7 +1,7 @@
 /*
  * Range add and subtract
  */
-#include <linux/module.h>
+#include <linux/kernel.h>
 #include <linux/init.h>
 #include <linux/sort.h>
 
-- 
cgit v1.2.2


From 9984de1a5a8a96275fcab818f7419af5a3c86e71 Mon Sep 17 00:00:00 2001
From: Paul Gortmaker <paul.gortmaker@windriver.com>
Date: Mon, 23 May 2011 14:51:41 -0400
Subject: kernel: Map most files to use export.h instead of module.h

The changed files were only including linux/module.h for the
EXPORT_SYMBOL infrastructure, and nothing else.  Revector them
onto the isolated export header for faster compile times.

Nothing to see here but a whole lot of instances of:

  -#include <linux/module.h>
  +#include <linux/export.h>

This commit is only changing the kernel dir; next targets
will probably be mm, fs, the arch dirs, etc.

Signed-off-by: Paul Gortmaker <paul.gortmaker@windriver.com>
---
 kernel/async.c                | 2 +-
 kernel/audit.c                | 2 +-
 kernel/auditsc.c              | 2 +-
 kernel/capability.c           | 2 +-
 kernel/cgroup_freezer.c       | 2 +-
 kernel/cpu.c                  | 2 +-
 kernel/cpuset.c               | 2 +-
 kernel/crash_dump.c           | 2 +-
 kernel/cred.c                 | 2 +-
 kernel/dma.c                  | 2 +-
 kernel/freezer.c              | 2 +-
 kernel/futex.c                | 2 +-
 kernel/groups.c               | 2 +-
 kernel/hrtimer.c              | 2 +-
 kernel/hung_task.c            | 2 +-
 kernel/irq_work.c             | 2 +-
 kernel/kfifo.c                | 2 +-
 kernel/kprobes.c              | 2 +-
 kernel/ksysfs.c               | 2 +-
 kernel/kthread.c              | 2 +-
 kernel/latencytop.c           | 2 +-
 kernel/lockdep_proc.c         | 2 +-
 kernel/module.c               | 2 +-
 kernel/mutex-debug.c          | 2 +-
 kernel/mutex.c                | 2 +-
 kernel/notifier.c             | 2 +-
 kernel/nsproxy.c              | 2 +-
 kernel/padata.c               | 2 +-
 kernel/pid.c                  | 2 +-
 kernel/posix-timers.c         | 2 +-
 kernel/profile.c              | 2 +-
 kernel/ptrace.c               | 2 +-
 kernel/rcupdate.c             | 2 +-
 kernel/rcutiny.c              | 2 +-
 kernel/rcutree.c              | 2 +-
 kernel/relay.c                | 2 +-
 kernel/resource.c             | 2 +-
 kernel/rtmutex-debug.c        | 2 +-
 kernel/rtmutex-tester.c       | 2 +-
 kernel/rtmutex.c              | 2 +-
 kernel/rwsem.c                | 2 +-
 kernel/sched_clock.c          | 2 +-
 kernel/semaphore.c            | 2 +-
 kernel/signal.c               | 2 +-
 kernel/smp.c                  | 2 +-
 kernel/softirq.c              | 2 +-
 kernel/spinlock.c             | 2 +-
 kernel/srcu.c                 | 2 +-
 kernel/stacktrace.c           | 2 +-
 kernel/stop_machine.c         | 2 +-
 kernel/sys.c                  | 2 +-
 kernel/time.c                 | 2 +-
 kernel/timer.c                | 2 +-
 kernel/up.c                   | 2 +-
 kernel/user-return-notifier.c | 2 +-
 kernel/user.c                 | 2 +-
 kernel/user_namespace.c       | 2 +-
 kernel/utsname.c              | 2 +-
 kernel/utsname_sysctl.c       | 2 +-
 kernel/wait.c                 | 2 +-
 kernel/workqueue.c            | 2 +-
 61 files changed, 61 insertions(+), 61 deletions(-)

(limited to 'kernel')

diff --git a/kernel/async.c b/kernel/async.c
index 4c2843c0043e..80b74b88fefe 100644
--- a/kernel/async.c
+++ b/kernel/async.c
@@ -51,7 +51,7 @@ asynchronous and synchronous parts of the kernel.
 #include <linux/async.h>
 #include <linux/atomic.h>
 #include <linux/ktime.h>
-#include <linux/module.h>
+#include <linux/export.h>
 #include <linux/wait.h>
 #include <linux/sched.h>
 #include <linux/slab.h>
diff --git a/kernel/audit.c b/kernel/audit.c
index 0a1355ca3d79..09fae2677a45 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -45,7 +45,7 @@
 #include <asm/types.h>
 #include <linux/atomic.h>
 #include <linux/mm.h>
-#include <linux/module.h>
+#include <linux/export.h>
 #include <linux/slab.h>
 #include <linux/err.h>
 #include <linux/kthread.h>
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index ce4b054acee5..47b7fc1ea893 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -48,7 +48,7 @@
 #include <linux/fs.h>
 #include <linux/namei.h>
 #include <linux/mm.h>
-#include <linux/module.h>
+#include <linux/export.h>
 #include <linux/slab.h>
 #include <linux/mount.h>
 #include <linux/socket.h>
diff --git a/kernel/capability.c b/kernel/capability.c
index 283c529f8b1c..b463871a4e69 100644
--- a/kernel/capability.c
+++ b/kernel/capability.c
@@ -10,7 +10,7 @@
 #include <linux/audit.h>
 #include <linux/capability.h>
 #include <linux/mm.h>
-#include <linux/module.h>
+#include <linux/export.h>
 #include <linux/security.h>
 #include <linux/syscalls.h>
 #include <linux/pid_namespace.h>
diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c
index e691818d7e45..5e828a2ca8e6 100644
--- a/kernel/cgroup_freezer.c
+++ b/kernel/cgroup_freezer.c
@@ -14,7 +14,7 @@
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
  */
 
-#include <linux/module.h>
+#include <linux/export.h>
 #include <linux/slab.h>
 #include <linux/cgroup.h>
 #include <linux/fs.h>
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 12b7458f23b1..6a81ca906a06 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -10,7 +10,7 @@
 #include <linux/sched.h>
 #include <linux/unistd.h>
 #include <linux/cpu.h>
-#include <linux/module.h>
+#include <linux/export.h>
 #include <linux/kthread.h>
 #include <linux/stop_machine.h>
 #include <linux/mutex.h>
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 10131fdaff70..d970fb508e34 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -37,7 +37,7 @@
 #include <linux/mempolicy.h>
 #include <linux/mm.h>
 #include <linux/memory.h>
-#include <linux/module.h>
+#include <linux/export.h>
 #include <linux/mount.h>
 #include <linux/namei.h>
 #include <linux/pagemap.h>
diff --git a/kernel/crash_dump.c b/kernel/crash_dump.c
index 5f85690285d4..20e699265cef 100644
--- a/kernel/crash_dump.c
+++ b/kernel/crash_dump.c
@@ -2,7 +2,7 @@
 #include <linux/crash_dump.h>
 #include <linux/init.h>
 #include <linux/errno.h>
-#include <linux/module.h>
+#include <linux/export.h>
 
 /*
  * If we have booted due to a crash, max_pfn will be a very low value. We need
diff --git a/kernel/cred.c b/kernel/cred.c
index bb55d052d858..5791612a4045 100644
--- a/kernel/cred.c
+++ b/kernel/cred.c
@@ -8,7 +8,7 @@
  * as published by the Free Software Foundation; either version
  * 2 of the Licence, or (at your option) any later version.
  */
-#include <linux/module.h>
+#include <linux/export.h>
 #include <linux/cred.h>
 #include <linux/slab.h>
 #include <linux/sched.h>
diff --git a/kernel/dma.c b/kernel/dma.c
index f903189c5304..68a2306522c8 100644
--- a/kernel/dma.c
+++ b/kernel/dma.c
@@ -9,7 +9,7 @@
  *   [It also happened to remove the sizeof(char *) == sizeof(int)
  *   assumption introduced because of those /proc/dma patches. -- Hennus]
  */
-#include <linux/module.h>
+#include <linux/export.h>
 #include <linux/kernel.h>
 #include <linux/errno.h>
 #include <linux/spinlock.h>
diff --git a/kernel/freezer.c b/kernel/freezer.c
index 66a594e8ad2f..f24aa0005530 100644
--- a/kernel/freezer.c
+++ b/kernel/freezer.c
@@ -6,7 +6,7 @@
 
 #include <linux/interrupt.h>
 #include <linux/suspend.h>
-#include <linux/module.h>
+#include <linux/export.h>
 #include <linux/syscalls.h>
 #include <linux/freezer.h>
 
diff --git a/kernel/futex.c b/kernel/futex.c
index 1511dff0cfd6..ea87f4d2f455 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -55,7 +55,7 @@
 #include <linux/pagemap.h>
 #include <linux/syscalls.h>
 #include <linux/signal.h>
-#include <linux/module.h>
+#include <linux/export.h>
 #include <linux/magic.h>
 #include <linux/pid.h>
 #include <linux/nsproxy.h>
diff --git a/kernel/groups.c b/kernel/groups.c
index 1cc476d52dd3..99b53d1eb7ea 100644
--- a/kernel/groups.c
+++ b/kernel/groups.c
@@ -2,7 +2,7 @@
  * Supplementary group IDs
  */
 #include <linux/cred.h>
-#include <linux/module.h>
+#include <linux/export.h>
 #include <linux/slab.h>
 #include <linux/security.h>
 #include <linux/syscalls.h>
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index a9205e32a059..422e567eecf6 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -32,7 +32,7 @@
  */
 
 #include <linux/cpu.h>
-#include <linux/module.h>
+#include <linux/export.h>
 #include <linux/percpu.h>
 #include <linux/hrtimer.h>
 #include <linux/notifier.h>
diff --git a/kernel/hung_task.c b/kernel/hung_task.c
index ea640120ab86..8b1748d0172c 100644
--- a/kernel/hung_task.c
+++ b/kernel/hung_task.c
@@ -13,7 +13,7 @@
 #include <linux/freezer.h>
 #include <linux/kthread.h>
 #include <linux/lockdep.h>
-#include <linux/module.h>
+#include <linux/export.h>
 #include <linux/sysctl.h>
 
 /*
diff --git a/kernel/irq_work.c b/kernel/irq_work.c
index 0e2cde4f380b..3e460ea44955 100644
--- a/kernel/irq_work.c
+++ b/kernel/irq_work.c
@@ -6,7 +6,7 @@
  */
 
 #include <linux/kernel.h>
-#include <linux/module.h>
+#include <linux/export.h>
 #include <linux/irq_work.h>
 #include <linux/hardirq.h>
 
diff --git a/kernel/kfifo.c b/kernel/kfifo.c
index 01a0700e873f..c744b88c44e2 100644
--- a/kernel/kfifo.c
+++ b/kernel/kfifo.c
@@ -20,7 +20,7 @@
  */
 
 #include <linux/kernel.h>
-#include <linux/module.h>
+#include <linux/export.h>
 #include <linux/slab.h>
 #include <linux/err.h>
 #include <linux/log2.h>
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 2f193d0ba7f2..e5d84644823b 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -36,7 +36,7 @@
 #include <linux/init.h>
 #include <linux/slab.h>
 #include <linux/stddef.h>
-#include <linux/module.h>
+#include <linux/export.h>
 #include <linux/moduleloader.h>
 #include <linux/kallsyms.h>
 #include <linux/freezer.h>
diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c
index 3b053c04dd86..6771de3a655d 100644
--- a/kernel/ksysfs.c
+++ b/kernel/ksysfs.c
@@ -11,7 +11,7 @@
 #include <linux/kobject.h>
 #include <linux/string.h>
 #include <linux/sysfs.h>
-#include <linux/module.h>
+#include <linux/export.h>
 #include <linux/init.h>
 #include <linux/kexec.h>
 #include <linux/profile.h>
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 4ba7cccb4994..b6d216a92639 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -12,7 +12,7 @@
 #include <linux/cpuset.h>
 #include <linux/unistd.h>
 #include <linux/file.h>
-#include <linux/module.h>
+#include <linux/export.h>
 #include <linux/mutex.h>
 #include <linux/slab.h>
 #include <linux/freezer.h>
diff --git a/kernel/latencytop.c b/kernel/latencytop.c
index 4ac8ebfcab59..a462b317f9a0 100644
--- a/kernel/latencytop.c
+++ b/kernel/latencytop.c
@@ -53,7 +53,7 @@
 #include <linux/notifier.h>
 #include <linux/spinlock.h>
 #include <linux/proc_fs.h>
-#include <linux/module.h>
+#include <linux/export.h>
 #include <linux/sched.h>
 #include <linux/list.h>
 #include <linux/stacktrace.h>
diff --git a/kernel/lockdep_proc.c b/kernel/lockdep_proc.c
index 71edd2f60c02..91c32a0b612c 100644
--- a/kernel/lockdep_proc.c
+++ b/kernel/lockdep_proc.c
@@ -11,7 +11,7 @@
  * Code for /proc/lockdep and /proc/lockdep_stats:
  *
  */
-#include <linux/module.h>
+#include <linux/export.h>
 #include <linux/proc_fs.h>
 #include <linux/seq_file.h>
 #include <linux/kallsyms.h>
diff --git a/kernel/module.c b/kernel/module.c
index 93342d992f34..84205ae1607a 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -16,7 +16,7 @@
     along with this program; if not, write to the Free Software
     Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 */
-#include <linux/module.h>
+#include <linux/export.h>
 #include <linux/moduleloader.h>
 #include <linux/ftrace_event.h>
 #include <linux/init.h>
diff --git a/kernel/mutex-debug.c b/kernel/mutex-debug.c
index 73da83aff418..7e3443fe1f48 100644
--- a/kernel/mutex-debug.c
+++ b/kernel/mutex-debug.c
@@ -14,7 +14,7 @@
  */
 #include <linux/mutex.h>
 #include <linux/delay.h>
-#include <linux/module.h>
+#include <linux/export.h>
 #include <linux/poison.h>
 #include <linux/sched.h>
 #include <linux/spinlock.h>
diff --git a/kernel/mutex.c b/kernel/mutex.c
index d607ed5dd441..89096dd8786f 100644
--- a/kernel/mutex.c
+++ b/kernel/mutex.c
@@ -19,7 +19,7 @@
  */
 #include <linux/mutex.h>
 #include <linux/sched.h>
-#include <linux/module.h>
+#include <linux/export.h>
 #include <linux/spinlock.h>
 #include <linux/interrupt.h>
 #include <linux/debug_locks.h>
diff --git a/kernel/notifier.c b/kernel/notifier.c
index 8d7b435806c9..2d5cc4ccff7f 100644
--- a/kernel/notifier.c
+++ b/kernel/notifier.c
@@ -1,6 +1,6 @@
 #include <linux/kdebug.h>
 #include <linux/kprobes.h>
-#include <linux/module.h>
+#include <linux/export.h>
 #include <linux/notifier.h>
 #include <linux/rcupdate.h>
 #include <linux/vmalloc.h>
diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
index 9aeab4b98c64..b576f7f14bc6 100644
--- a/kernel/nsproxy.c
+++ b/kernel/nsproxy.c
@@ -14,7 +14,7 @@
  */
 
 #include <linux/slab.h>
-#include <linux/module.h>
+#include <linux/export.h>
 #include <linux/nsproxy.h>
 #include <linux/init_task.h>
 #include <linux/mnt_namespace.h>
diff --git a/kernel/padata.c b/kernel/padata.c
index b91941df5e63..b45259931512 100644
--- a/kernel/padata.c
+++ b/kernel/padata.c
@@ -18,7 +18,7 @@
  * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
  */
 
-#include <linux/module.h>
+#include <linux/export.h>
 #include <linux/cpumask.h>
 #include <linux/err.h>
 #include <linux/cpu.h>
diff --git a/kernel/pid.c b/kernel/pid.c
index 8cafe7e72ad2..fa5f72227e5f 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -27,7 +27,7 @@
  */
 
 #include <linux/mm.h>
-#include <linux/module.h>
+#include <linux/export.h>
 #include <linux/slab.h>
 #include <linux/init.h>
 #include <linux/rculist.h>
diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c
index 4556182527f3..69185ae6b701 100644
--- a/kernel/posix-timers.c
+++ b/kernel/posix-timers.c
@@ -46,7 +46,7 @@
 #include <linux/syscalls.h>
 #include <linux/wait.h>
 #include <linux/workqueue.h>
-#include <linux/module.h>
+#include <linux/export.h>
 
 /*
  * Management arrays for POSIX timers.	 Timers are kept in slab memory
diff --git a/kernel/profile.c b/kernel/profile.c
index 961b389fe52f..76b8e77773ee 100644
--- a/kernel/profile.c
+++ b/kernel/profile.c
@@ -13,7 +13,7 @@
  *	to resolve timer interrupt livelocks, William Irwin, Oracle, 2004
  */
 
-#include <linux/module.h>
+#include <linux/export.h>
 #include <linux/profile.h>
 #include <linux/bootmem.h>
 #include <linux/notifier.h>
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index a70d2a5d8c7b..24d04477b257 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -8,7 +8,7 @@
  */
 
 #include <linux/capability.h>
-#include <linux/module.h>
+#include <linux/export.h>
 #include <linux/sched.h>
 #include <linux/errno.h>
 #include <linux/mm.h>
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c
index ca0d23b6b3e8..c5b98e565aee 100644
--- a/kernel/rcupdate.c
+++ b/kernel/rcupdate.c
@@ -43,7 +43,7 @@
 #include <linux/notifier.h>
 #include <linux/cpu.h>
 #include <linux/mutex.h>
-#include <linux/module.h>
+#include <linux/export.h>
 #include <linux/hardirq.h>
 
 #define CREATE_TRACE_POINTS
diff --git a/kernel/rcutiny.c b/kernel/rcutiny.c
index da775c87f27f..b5e525d67fe3 100644
--- a/kernel/rcutiny.c
+++ b/kernel/rcutiny.c
@@ -28,7 +28,7 @@
 #include <linux/notifier.h>
 #include <linux/rcupdate.h>
 #include <linux/kernel.h>
-#include <linux/module.h>
+#include <linux/export.h>
 #include <linux/mutex.h>
 #include <linux/sched.h>
 #include <linux/types.h>
diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index e234eb92a177..6b76d812740c 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -38,7 +38,7 @@
 #include <linux/nmi.h>
 #include <linux/atomic.h>
 #include <linux/bitops.h>
-#include <linux/module.h>
+#include <linux/export.h>
 #include <linux/completion.h>
 #include <linux/moduleparam.h>
 #include <linux/percpu.h>
diff --git a/kernel/relay.c b/kernel/relay.c
index 859ea5a9605f..226fade4d727 100644
--- a/kernel/relay.c
+++ b/kernel/relay.c
@@ -15,7 +15,7 @@
 #include <linux/errno.h>
 #include <linux/stddef.h>
 #include <linux/slab.h>
-#include <linux/module.h>
+#include <linux/export.h>
 #include <linux/string.h>
 #include <linux/relay.h>
 #include <linux/vmalloc.h>
diff --git a/kernel/resource.c b/kernel/resource.c
index c8dc249da5ce..7640b3a947d0 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -7,7 +7,7 @@
  * Arbitrary resource management.
  */
 
-#include <linux/module.h>
+#include <linux/export.h>
 #include <linux/errno.h>
 #include <linux/ioport.h>
 #include <linux/init.h>
diff --git a/kernel/rtmutex-debug.c b/kernel/rtmutex-debug.c
index a2e7e7210f3e..8eafd1bd273e 100644
--- a/kernel/rtmutex-debug.c
+++ b/kernel/rtmutex-debug.c
@@ -18,7 +18,7 @@
  */
 #include <linux/sched.h>
 #include <linux/delay.h>
-#include <linux/module.h>
+#include <linux/export.h>
 #include <linux/spinlock.h>
 #include <linux/kallsyms.h>
 #include <linux/syscalls.h>
diff --git a/kernel/rtmutex-tester.c b/kernel/rtmutex-tester.c
index 5c9ccd380966..3d9f31cd79e7 100644
--- a/kernel/rtmutex-tester.c
+++ b/kernel/rtmutex-tester.c
@@ -7,7 +7,7 @@
  *
  */
 #include <linux/kthread.h>
-#include <linux/module.h>
+#include <linux/export.h>
 #include <linux/sched.h>
 #include <linux/spinlock.h>
 #include <linux/sysdev.h>
diff --git a/kernel/rtmutex.c b/kernel/rtmutex.c
index 5e8d9cce7470..f9d8482dd487 100644
--- a/kernel/rtmutex.c
+++ b/kernel/rtmutex.c
@@ -11,7 +11,7 @@
  *  See Documentation/rt-mutex-design.txt for details.
  */
 #include <linux/spinlock.h>
-#include <linux/module.h>
+#include <linux/export.h>
 #include <linux/sched.h>
 #include <linux/timer.h>
 
diff --git a/kernel/rwsem.c b/kernel/rwsem.c
index 9f48f3d82e9b..b152f74f02de 100644
--- a/kernel/rwsem.c
+++ b/kernel/rwsem.c
@@ -7,7 +7,7 @@
 #include <linux/types.h>
 #include <linux/kernel.h>
 #include <linux/sched.h>
-#include <linux/module.h>
+#include <linux/export.h>
 #include <linux/rwsem.h>
 
 #include <asm/system.h>
diff --git a/kernel/sched_clock.c b/kernel/sched_clock.c
index 9d8af0b3fb64..c685e31492df 100644
--- a/kernel/sched_clock.c
+++ b/kernel/sched_clock.c
@@ -62,7 +62,7 @@
  */
 #include <linux/spinlock.h>
 #include <linux/hardirq.h>
-#include <linux/module.h>
+#include <linux/export.h>
 #include <linux/percpu.h>
 #include <linux/ktime.h>
 #include <linux/sched.h>
diff --git a/kernel/semaphore.c b/kernel/semaphore.c
index d831841e55a7..60636a4e25c3 100644
--- a/kernel/semaphore.c
+++ b/kernel/semaphore.c
@@ -27,7 +27,7 @@
 
 #include <linux/compiler.h>
 #include <linux/kernel.h>
-#include <linux/module.h>
+#include <linux/export.h>
 #include <linux/sched.h>
 #include <linux/semaphore.h>
 #include <linux/spinlock.h>
diff --git a/kernel/signal.c b/kernel/signal.c
index d252be2d3de5..b3f78d09a105 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -11,7 +11,7 @@
  */
 
 #include <linux/slab.h>
-#include <linux/module.h>
+#include <linux/export.h>
 #include <linux/init.h>
 #include <linux/sched.h>
 #include <linux/fs.h>
diff --git a/kernel/smp.c b/kernel/smp.c
index fb67dfa8394e..db197d60489b 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -6,7 +6,7 @@
 #include <linux/rcupdate.h>
 #include <linux/rculist.h>
 #include <linux/kernel.h>
-#include <linux/module.h>
+#include <linux/export.h>
 #include <linux/percpu.h>
 #include <linux/init.h>
 #include <linux/gfp.h>
diff --git a/kernel/softirq.c b/kernel/softirq.c
index fca82c32042b..2c71d91efff0 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -10,7 +10,7 @@
  *	Remote softirq infrastructure is by Jens Axboe.
  */
 
-#include <linux/module.h>
+#include <linux/export.h>
 #include <linux/kernel_stat.h>
 #include <linux/interrupt.h>
 #include <linux/init.h>
diff --git a/kernel/spinlock.c b/kernel/spinlock.c
index be6517fb9c14..84c7d96918bf 100644
--- a/kernel/spinlock.c
+++ b/kernel/spinlock.c
@@ -19,7 +19,7 @@
 #include <linux/spinlock.h>
 #include <linux/interrupt.h>
 #include <linux/debug_locks.h>
-#include <linux/module.h>
+#include <linux/export.h>
 
 /*
  * If lockdep is enabled then we use the non-preemption spin-ops
diff --git a/kernel/srcu.c b/kernel/srcu.c
index 73ce23feaea9..0febf61e1aa3 100644
--- a/kernel/srcu.c
+++ b/kernel/srcu.c
@@ -24,7 +24,7 @@
  *
  */
 
-#include <linux/module.h>
+#include <linux/export.h>
 #include <linux/mutex.h>
 #include <linux/percpu.h>
 #include <linux/preempt.h>
diff --git a/kernel/stacktrace.c b/kernel/stacktrace.c
index d20c6983aad9..00fe55cc5a82 100644
--- a/kernel/stacktrace.c
+++ b/kernel/stacktrace.c
@@ -7,7 +7,7 @@
  */
 #include <linux/sched.h>
 #include <linux/kernel.h>
-#include <linux/module.h>
+#include <linux/export.h>
 #include <linux/kallsyms.h>
 #include <linux/stacktrace.h>
 
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index ba5070ce5765..e78db365fa83 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -12,7 +12,7 @@
 #include <linux/cpu.h>
 #include <linux/init.h>
 #include <linux/kthread.h>
-#include <linux/module.h>
+#include <linux/export.h>
 #include <linux/percpu.h>
 #include <linux/sched.h>
 #include <linux/stop_machine.h>
diff --git a/kernel/sys.c b/kernel/sys.c
index 58459509b14c..4a0286241829 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -4,7 +4,7 @@
  *  Copyright (C) 1991, 1992  Linus Torvalds
  */
 
-#include <linux/module.h>
+#include <linux/export.h>
 #include <linux/mm.h>
 #include <linux/utsname.h>
 #include <linux/mman.h>
diff --git a/kernel/time.c b/kernel/time.c
index d77606214529..73e416db0a1e 100644
--- a/kernel/time.c
+++ b/kernel/time.c
@@ -27,7 +27,7 @@
  *	with nanosecond accuracy
  */
 
-#include <linux/module.h>
+#include <linux/export.h>
 #include <linux/timex.h>
 #include <linux/capability.h>
 #include <linux/clocksource.h>
diff --git a/kernel/timer.c b/kernel/timer.c
index 8cff36119e4d..dbaa62422b13 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -20,7 +20,7 @@
  */
 
 #include <linux/kernel_stat.h>
-#include <linux/module.h>
+#include <linux/export.h>
 #include <linux/interrupt.h>
 #include <linux/percpu.h>
 #include <linux/init.h>
diff --git a/kernel/up.c b/kernel/up.c
index 1ff27a28bb7d..c54c75e9faf7 100644
--- a/kernel/up.c
+++ b/kernel/up.c
@@ -4,7 +4,7 @@
 
 #include <linux/interrupt.h>
 #include <linux/kernel.h>
-#include <linux/module.h>
+#include <linux/export.h>
 #include <linux/smp.h>
 
 int smp_call_function_single(int cpu, void (*func) (void *info), void *info,
diff --git a/kernel/user-return-notifier.c b/kernel/user-return-notifier.c
index 92cb706c7fc8..1744bb80f1fb 100644
--- a/kernel/user-return-notifier.c
+++ b/kernel/user-return-notifier.c
@@ -2,7 +2,7 @@
 #include <linux/user-return-notifier.h>
 #include <linux/percpu.h>
 #include <linux/sched.h>
-#include <linux/module.h>
+#include <linux/export.h>
 
 static DEFINE_PER_CPU(struct hlist_head, return_notifier_list);
 
diff --git a/kernel/user.c b/kernel/user.c
index 9e03e9c1df8d..71dd2363ab0f 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -14,7 +14,7 @@
 #include <linux/bitops.h>
 #include <linux/key.h>
 #include <linux/interrupt.h>
-#include <linux/module.h>
+#include <linux/export.h>
 #include <linux/user_namespace.h>
 
 /*
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index 9da289c34f22..3b906e98b1db 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -5,7 +5,7 @@
  *  License.
  */
 
-#include <linux/module.h>
+#include <linux/export.h>
 #include <linux/nsproxy.h>
 #include <linux/slab.h>
 #include <linux/user_namespace.h>
diff --git a/kernel/utsname.c b/kernel/utsname.c
index bff131b9510a..405caf91aad5 100644
--- a/kernel/utsname.c
+++ b/kernel/utsname.c
@@ -9,7 +9,7 @@
  *  License.
  */
 
-#include <linux/module.h>
+#include <linux/export.h>
 #include <linux/uts.h>
 #include <linux/utsname.h>
 #include <linux/err.h>
diff --git a/kernel/utsname_sysctl.c b/kernel/utsname_sysctl.c
index a2cd77e70d4d..5a709452ec19 100644
--- a/kernel/utsname_sysctl.c
+++ b/kernel/utsname_sysctl.c
@@ -9,7 +9,7 @@
  *  License.
  */
 
-#include <linux/module.h>
+#include <linux/export.h>
 #include <linux/uts.h>
 #include <linux/utsname.h>
 #include <linux/sysctl.h>
diff --git a/kernel/wait.c b/kernel/wait.c
index f45ea8d2a1ce..26fa7797f90f 100644
--- a/kernel/wait.c
+++ b/kernel/wait.c
@@ -4,7 +4,7 @@
  * (C) 2004 William Irwin, Oracle
  */
 #include <linux/init.h>
-#include <linux/module.h>
+#include <linux/export.h>
 #include <linux/sched.h>
 #include <linux/mm.h>
 #include <linux/wait.h>
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 1783aabc6128..42fa9ad0a810 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -23,7 +23,7 @@
  * Please read Documentation/workqueue.txt for details.
  */
 
-#include <linux/module.h>
+#include <linux/export.h>
 #include <linux/kernel.h>
 #include <linux/sched.h>
 #include <linux/init.h>
-- 
cgit v1.2.2


From 56d82e000cdfb51aa92241d4682302f78c35cd92 Mon Sep 17 00:00:00 2001
From: Paul Gortmaker <paul.gortmaker@windriver.com>
Date: Thu, 26 May 2011 17:53:52 -0400
Subject: kernel: Add <linux/module.h> to files using it implicitly

These files are doing things like module_put and try_module_get
so they need to call out the module.h for explicit inclusion,
rather than getting it via <linux/device.h> which we ideally want
to remove the module.h inclusion from.

Signed-off-by: Paul Gortmaker <paul.gortmaker@windriver.com>
---
 kernel/trace/ftrace.c         | 1 +
 kernel/trace/trace_syscalls.c | 1 +
 2 files changed, 2 insertions(+)

(limited to 'kernel')

diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 077d85387908..900b409543db 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -22,6 +22,7 @@
 #include <linux/hardirq.h>
 #include <linux/kthread.h>
 #include <linux/uaccess.h>
+#include <linux/module.h>
 #include <linux/ftrace.h>
 #include <linux/sysctl.h>
 #include <linux/slab.h>
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index ee7b5a0bb9f8..cb654542c1a1 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -2,6 +2,7 @@
 #include <trace/events/syscalls.h>
 #include <linux/slab.h>
 #include <linux/kernel.h>
+#include <linux/module.h>	/* for MODULE_NAME_LEN via KSYM_SYMBOL_LEN */
 #include <linux/ftrace.h>
 #include <linux/perf_event.h>
 #include <asm/syscall.h>
-- 
cgit v1.2.2


From 74da1ff71350f3638c51613085f89c0865d7fe08 Mon Sep 17 00:00:00 2001
From: Paul Gortmaker <paul.gortmaker@windriver.com>
Date: Thu, 26 May 2011 12:48:41 -0400
Subject: kernel: fix several implicit usasges of kmod.h
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

These files were implicitly relying on <linux/kmod.h> coming in via
module.h, as without it we get things like:

kernel/power/suspend.c:100: error: implicit declaration of function ‘usermodehelper_disable’
kernel/power/suspend.c:109: error: implicit declaration of function ‘usermodehelper_enable’
kernel/power/user.c:254: error: implicit declaration of function ‘usermodehelper_disable’
kernel/power/user.c:261: error: implicit declaration of function ‘usermodehelper_enable’

kernel/sys.c:317: error: implicit declaration of function ‘usermodehelper_disable’
kernel/sys.c:1816: error: implicit declaration of function ‘call_usermodehelper_setup’
kernel/sys.c:1822: error: implicit declaration of function ‘call_usermodehelper_setfns’
kernel/sys.c:1824: error: implicit declaration of function ‘call_usermodehelper_exec’

Signed-off-by: Paul Gortmaker <paul.gortmaker@windriver.com>
---
 kernel/power/suspend.c | 1 +
 kernel/power/user.c    | 1 +
 kernel/sys.c           | 1 +
 3 files changed, 3 insertions(+)

(limited to 'kernel')

diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
index fdd4263b995d..31aae3219f89 100644
--- a/kernel/power/suspend.c
+++ b/kernel/power/suspend.c
@@ -12,6 +12,7 @@
 #include <linux/delay.h>
 #include <linux/errno.h>
 #include <linux/init.h>
+#include <linux/kmod.h>
 #include <linux/console.h>
 #include <linux/cpu.h>
 #include <linux/syscalls.h>
diff --git a/kernel/power/user.c b/kernel/power/user.c
index 42ddbc6f0de6..6d8f535c2b88 100644
--- a/kernel/power/user.c
+++ b/kernel/power/user.c
@@ -12,6 +12,7 @@
 #include <linux/suspend.h>
 #include <linux/syscalls.h>
 #include <linux/reboot.h>
+#include <linux/kmod.h>
 #include <linux/string.h>
 #include <linux/device.h>
 #include <linux/miscdevice.h>
diff --git a/kernel/sys.c b/kernel/sys.c
index 4a0286241829..d2cb1cda823a 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -12,6 +12,7 @@
 #include <linux/prctl.h>
 #include <linux/highuid.h>
 #include <linux/fs.h>
+#include <linux/kmod.h>
 #include <linux/perf_event.h>
 #include <linux/resource.h>
 #include <linux/kernel.h>
-- 
cgit v1.2.2


From 967d1f90625ed9c1ab205d3f738fedf9d852e1fd Mon Sep 17 00:00:00 2001
From: Paul Gortmaker <paul.gortmaker@windriver.com>
Date: Mon, 18 Jul 2011 13:03:04 -0400
Subject: kernel: fix two implicit header assumptions in irq_work.c

Up until now, this file was getting percpu.h because nearly every
file was implicitly getting module.h (and all its sub-includes).
But we want to clean that up, so call out percpu.h explicitly.
Otherwise we'll get things like this on an ARM build:

kernel/irq_work.c:48: error: expected declaration specifiers or '...' before 'irq_work_list'
kernel/irq_work.c:48: warning: type defaults to 'int' in declaration of 'DEFINE_PER_CPU'

The same thing was happening for builds on ARM for asm/processor.h

kernel/irq_work.c: In function 'irq_work_sync':
kernel/irq_work.c:166: error: implicit declaration of function 'cpu_relax'

Signed-off-by: Paul Gortmaker <paul.gortmaker@windriver.com>
---
 kernel/irq_work.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'kernel')

diff --git a/kernel/irq_work.c b/kernel/irq_work.c
index 3e460ea44955..c3c46c72046e 100644
--- a/kernel/irq_work.c
+++ b/kernel/irq_work.c
@@ -8,7 +8,9 @@
 #include <linux/kernel.h>
 #include <linux/export.h>
 #include <linux/irq_work.h>
+#include <linux/percpu.h>
 #include <linux/hardirq.h>
+#include <asm/processor.h>
 
 /*
  * An entry can be in one of four states:
-- 
cgit v1.2.2


From 1596425fd7545abccb7b5059376e4d56c3d6be4d Mon Sep 17 00:00:00 2001
From: Paul Gortmaker <paul.gortmaker@windriver.com>
Date: Thu, 28 Jul 2011 14:22:29 -0400
Subject: kernel: ksysfs.c is implicitly using stat.h
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

With the module.h usage cleanup, we'll get this:

kernel/ksysfs.c:161: error: ‘S_IRUGO’ undeclared here (not in a function)
make[2]: *** [kernel/ksysfs.o] Error 1

Signed-off-by: Paul Gortmaker <paul.gortmaker@windriver.com>
---
 kernel/ksysfs.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'kernel')

diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c
index 6771de3a655d..4e316e1acf58 100644
--- a/kernel/ksysfs.c
+++ b/kernel/ksysfs.c
@@ -15,6 +15,7 @@
 #include <linux/init.h>
 #include <linux/kexec.h>
 #include <linux/profile.h>
+#include <linux/stat.h>
 #include <linux/sched.h>
 #include <linux/capability.h>
 
-- 
cgit v1.2.2


From 72a59aaada499d9bbf19f2fb68daa37502e4a9bb Mon Sep 17 00:00:00 2001
From: Paul Gortmaker <paul.gortmaker@windriver.com>
Date: Thu, 26 May 2011 21:07:10 -0400
Subject: kernel: params.c needs module.h not moduleparam.h
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Through various other implicit include paths, some files were
getting the full module.h file, and hence living the illusion
that they really only needed moduleparam.h -- but the reality
is that once you remove the module.h presence, these show up:

kernel/params.c:583: warning: ‘struct module_kobject’ declared inside parameter list

Such files really require module.h so simply make it so.  As the
file module.h grabs moduleparam.h on the fly, all will be well.

Signed-off-by: Paul Gortmaker <paul.gortmaker@windriver.com>
---
 kernel/params.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/params.c b/kernel/params.c
index 821788947e40..65aae11eb93f 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -15,7 +15,7 @@
     along with this program; if not, write to the Free Software
     Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 */
-#include <linux/moduleparam.h>
+#include <linux/module.h>
 #include <linux/kernel.h>
 #include <linux/string.h>
 #include <linux/errno.h>
-- 
cgit v1.2.2


From bdfa97bf7263657b83bc5b68567a3a60dde84c5b Mon Sep 17 00:00:00 2001
From: Paul Gortmaker <paul.gortmaker@windriver.com>
Date: Tue, 25 Oct 2011 13:13:57 -0400
Subject: kernel: fix up module header handling in rcutiny files

The file rcutiny.c does not need moduleparam.h header, as
there are no modparams in this file.

However rcutiny_plugin.h does define a module_init() and
a module_exit() and it uses the various MODULE_ macros, so
it really does need module.h included.

Signed-off-by: Paul Gortmaker <paul.gortmaker@windriver.com>
---
 kernel/rcutiny.c        | 1 -
 kernel/rcutiny_plugin.h | 1 +
 2 files changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/rcutiny.c b/kernel/rcutiny.c
index b5e525d67fe3..636af6d9c6e5 100644
--- a/kernel/rcutiny.c
+++ b/kernel/rcutiny.c
@@ -22,7 +22,6 @@
  * For detailed explanation of Read-Copy Update mechanism see -
  *		Documentation/RCU
  */
-#include <linux/moduleparam.h>
 #include <linux/completion.h>
 #include <linux/interrupt.h>
 #include <linux/notifier.h>
diff --git a/kernel/rcutiny_plugin.h b/kernel/rcutiny_plugin.h
index 02aa7139861c..2b0484a5dc28 100644
--- a/kernel/rcutiny_plugin.h
+++ b/kernel/rcutiny_plugin.h
@@ -23,6 +23,7 @@
  */
 
 #include <linux/kthread.h>
+#include <linux/module.h>
 #include <linux/debugfs.h>
 #include <linux/seq_file.h>
 
-- 
cgit v1.2.2


From 6e5fdeedca610df600aabc393c4b1f44b128fe49 Mon Sep 17 00:00:00 2001
From: Paul Gortmaker <paul.gortmaker@windriver.com>
Date: Thu, 26 May 2011 16:00:52 -0400
Subject: kernel: Fix files explicitly needing EXPORT_SYMBOL infrastructure

These files were getting <linux/module.h> via an implicit non-obvious
path, but we want to crush those out of existence since they cost
time during compiles of processing thousands of lines of headers
for no reason.  Give them the lightweight header that just contains
the EXPORT_SYMBOL infrastructure.

Signed-off-by: Paul Gortmaker <paul.gortmaker@windriver.com>
---
 kernel/compat.c                 | 1 +
 kernel/debug/kdb/kdb_debugger.c | 1 +
 kernel/events/core.c            | 1 +
 kernel/irq/generic-chip.c       | 1 +
 kernel/power/hibernate.c        | 1 +
 kernel/power/main.c             | 1 +
 kernel/power/qos.c              | 1 +
 kernel/power/suspend.c          | 1 +
 kernel/time/posix-clock.c       | 1 +
 kernel/trace/blktrace.c         | 1 +
 10 files changed, 10 insertions(+)

(limited to 'kernel')

diff --git a/kernel/compat.c b/kernel/compat.c
index e2435ee9993a..f346cedfe24d 100644
--- a/kernel/compat.c
+++ b/kernel/compat.c
@@ -21,6 +21,7 @@
 #include <linux/unistd.h>
 #include <linux/security.h>
 #include <linux/timex.h>
+#include <linux/export.h>
 #include <linux/migrate.h>
 #include <linux/posix-timers.h>
 #include <linux/times.h>
diff --git a/kernel/debug/kdb/kdb_debugger.c b/kernel/debug/kdb/kdb_debugger.c
index d9ca9aa481ec..8b68ce78ff17 100644
--- a/kernel/debug/kdb/kdb_debugger.c
+++ b/kernel/debug/kdb/kdb_debugger.c
@@ -11,6 +11,7 @@
 #include <linux/kgdb.h>
 #include <linux/kdb.h>
 #include <linux/kdebug.h>
+#include <linux/export.h>
 #include "kdb_private.h"
 #include "../debug_core.h"
 
diff --git a/kernel/events/core.c b/kernel/events/core.c
index d1a1bee35228..92b8811f2234 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -25,6 +25,7 @@
 #include <linux/reboot.h>
 #include <linux/vmstat.h>
 #include <linux/device.h>
+#include <linux/export.h>
 #include <linux/vmalloc.h>
 #include <linux/hardirq.h>
 #include <linux/rculist.h>
diff --git a/kernel/irq/generic-chip.c b/kernel/irq/generic-chip.c
index 6cb7613e4bf4..c89295a8f668 100644
--- a/kernel/irq/generic-chip.c
+++ b/kernel/irq/generic-chip.c
@@ -6,6 +6,7 @@
 #include <linux/io.h>
 #include <linux/irq.h>
 #include <linux/slab.h>
+#include <linux/export.h>
 #include <linux/interrupt.h>
 #include <linux/kernel_stat.h>
 #include <linux/syscore_ops.h>
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index 1c53f7fad5f7..b4511b6d3ef9 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -9,6 +9,7 @@
  * This file is released under the GPLv2.
  */
 
+#include <linux/export.h>
 #include <linux/suspend.h>
 #include <linux/syscalls.h>
 #include <linux/reboot.h>
diff --git a/kernel/power/main.c b/kernel/power/main.c
index a52e88425a31..71f49fe4377e 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -8,6 +8,7 @@
  *
  */
 
+#include <linux/export.h>
 #include <linux/kobject.h>
 #include <linux/string.h>
 #include <linux/resume-trace.h>
diff --git a/kernel/power/qos.c b/kernel/power/qos.c
index 1c1797dd1d1d..2c0a65e27626 100644
--- a/kernel/power/qos.c
+++ b/kernel/power/qos.c
@@ -43,6 +43,7 @@
 #include <linux/kernel.h>
 
 #include <linux/uaccess.h>
+#include <linux/export.h>
 
 /*
  * locking rule: all changes to constraints or notifiers lists
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
index 31aae3219f89..4953dc054c53 100644
--- a/kernel/power/suspend.c
+++ b/kernel/power/suspend.c
@@ -22,6 +22,7 @@
 #include <linux/list.h>
 #include <linux/mm.h>
 #include <linux/slab.h>
+#include <linux/export.h>
 #include <linux/suspend.h>
 #include <linux/syscore_ops.h>
 #include <trace/events/power.h>
diff --git a/kernel/time/posix-clock.c b/kernel/time/posix-clock.c
index c340ca658f37..ce033c7aa2e8 100644
--- a/kernel/time/posix-clock.c
+++ b/kernel/time/posix-clock.c
@@ -18,6 +18,7 @@
  *  Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  */
 #include <linux/device.h>
+#include <linux/export.h>
 #include <linux/file.h>
 #include <linux/posix-clock.h>
 #include <linux/slab.h>
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index 7c910a5593a6..16fc34a0806f 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -23,6 +23,7 @@
 #include <linux/mutex.h>
 #include <linux/slab.h>
 #include <linux/debugfs.h>
+#include <linux/export.h>
 #include <linux/time.h>
 #include <linux/uaccess.h>
 
-- 
cgit v1.2.2


From ec53cf23c0ddb0c29950b9a4ac46964c4c6c6c2f Mon Sep 17 00:00:00 2001
From: Paul Gortmaker <paul.gortmaker@windriver.com>
Date: Mon, 19 Sep 2011 20:33:19 -0400
Subject: irq: don't put module.h into irq.h for tracking irqgen modules.

Recent commit "irq: Track the  owner of irq descriptor" in
commit ID b6873807a7143b7 placed module.h into linux/irq.h
but we are trying to limit module.h inclusion to just C files
that really need it, due to its size and number of children
includes.  This targets just reversing that include.

Add in the basic "struct module" since that is all we really need
to ensure things compile.  In theory, b687380 should have added the
module.h include to the irqdesc.h header as well, but the implicit
module.h everywhere presence masked this from showing up.  So give
it the "struct module" as well.

As for the C files, irqdesc.c is only using THIS_MODULE, so it
does not need module.h - give it export.h instead.  The C file
irq/manage.c is now (as of b687380) using try_module_get and
module_put and so it needs module.h (which it already has).

Also convert the irq_alloc_descs variants to macros, since all
they really do is is call the __irq_alloc_descs primitive.
This avoids including export.h and no debug info is lost.

Signed-off-by: Paul Gortmaker <paul.gortmaker@windriver.com>
---
 kernel/irq/irqdesc.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c
index 1550e8447a16..d86e254b95eb 100644
--- a/kernel/irq/irqdesc.c
+++ b/kernel/irq/irqdesc.c
@@ -9,7 +9,7 @@
  */
 #include <linux/irq.h>
 #include <linux/slab.h>
-#include <linux/module.h>
+#include <linux/export.h>
 #include <linux/interrupt.h>
 #include <linux/kernel_stat.h>
 #include <linux/radix-tree.h>
-- 
cgit v1.2.2


From fcf634098c00dd9cd247447368495f0b79be12d1 Mon Sep 17 00:00:00 2001
From: Christopher Yeoh <cyeoh@au1.ibm.com>
Date: Mon, 31 Oct 2011 17:06:39 -0700
Subject: Cross Memory Attach

The basic idea behind cross memory attach is to allow MPI programs doing
intra-node communication to do a single copy of the message rather than a
double copy of the message via shared memory.

The following patch attempts to achieve this by allowing a destination
process, given an address and size from a source process, to copy memory
directly from the source process into its own address space via a system
call.  There is also a symmetrical ability to copy from the current
process's address space into a destination process's address space.

- Use of /proc/pid/mem has been considered, but there are issues with
  using it:
  - Does not allow for specifying iovecs for both src and dest, assuming
    preadv or pwritev was implemented either the area read from or
  written to would need to be contiguous.
  - Currently mem_read allows only processes who are currently
  ptrace'ing the target and are still able to ptrace the target to read
  from the target. This check could possibly be moved to the open call,
  but its not clear exactly what race this restriction is stopping
  (reason  appears to have been lost)
  - Having to send the fd of /proc/self/mem via SCM_RIGHTS on unix
  domain socket is a bit ugly from a userspace point of view,
  especially when you may have hundreds if not (eventually) thousands
  of processes  that all need to do this with each other
  - Doesn't allow for some future use of the interface we would like to
  consider adding in the future (see below)
  - Interestingly reading from /proc/pid/mem currently actually
  involves two copies! (But this could be fixed pretty easily)

As mentioned previously use of vmsplice instead was considered, but has
problems.  Since you need the reader and writer working co-operatively if
the pipe is not drained then you block.  Which requires some wrapping to
do non blocking on the send side or polling on the receive.  In all to all
communication it requires ordering otherwise you can deadlock.  And in the
example of many MPI tasks writing to one MPI task vmsplice serialises the
copying.

There are some cases of MPI collectives where even a single copy interface
does not get us the performance gain we could.  For example in an
MPI_Reduce rather than copy the data from the source we would like to
instead use it directly in a mathops (say the reduce is doing a sum) as
this would save us doing a copy.  We don't need to keep a copy of the data
from the source.  I haven't implemented this, but I think this interface
could in the future do all this through the use of the flags - eg could
specify the math operation and type and the kernel rather than just
copying the data would apply the specified operation between the source
and destination and store it in the destination.

Although we don't have a "second user" of the interface (though I've had
some nibbles from people who may be interested in using it for intra
process messaging which is not MPI).  This interface is something which
hardware vendors are already doing for their custom drivers to implement
fast local communication.  And so in addition to this being useful for
OpenMPI it would mean the driver maintainers don't have to fix things up
when the mm changes.

There was some discussion about how much faster a true zero copy would
go. Here's a link back to the email with some testing I did on that:

http://marc.info/?l=linux-mm&m=130105930902915&w=2

There is a basic man page for the proposed interface here:

http://ozlabs.org/~cyeoh/cma/process_vm_readv.txt

This has been implemented for x86 and powerpc, other architecture should
mainly (I think) just need to add syscall numbers for the process_vm_readv
and process_vm_writev. There are 32 bit compatibility versions for
64-bit kernels.

For arch maintainers there are some simple tests to be able to quickly
verify that the syscalls are working correctly here:

http://ozlabs.org/~cyeoh/cma/cma-test-20110718.tgz

Signed-off-by: Chris Yeoh <yeohc@au1.ibm.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: David Howells <dhowells@redhat.com>
Cc: James Morris <jmorris@namei.org>
Cc: <linux-man@vger.kernel.org>
Cc: <linux-arch@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/sys_ni.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'kernel')

diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index a9a5de07c4f1..47bfa16430d7 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -145,6 +145,10 @@ cond_syscall(sys_io_submit);
 cond_syscall(sys_io_cancel);
 cond_syscall(sys_io_getevents);
 cond_syscall(sys_syslog);
+cond_syscall(sys_process_vm_readv);
+cond_syscall(sys_process_vm_writev);
+cond_syscall(compat_sys_process_vm_readv);
+cond_syscall(compat_sys_process_vm_writev);
 
 /* arch-specific weak syscall entries */
 cond_syscall(sys_pciconfig_read);
-- 
cgit v1.2.2


From c9f01245b6a7d77d17deaa71af10f6aca14fa24e Mon Sep 17 00:00:00 2001
From: David Rientjes <rientjes@google.com>
Date: Mon, 31 Oct 2011 17:07:15 -0700
Subject: oom: remove oom_disable_count

This removes mm->oom_disable_count entirely since it's unnecessary and
currently buggy.  The counter was intended to be per-process but it's
currently decremented in the exit path for each thread that exits, causing
it to underflow.

The count was originally intended to prevent oom killing threads that
share memory with threads that cannot be killed since it doesn't lead to
future memory freeing.  The counter could be fixed to represent all
threads sharing the same mm, but it's better to remove the count since:

 - it is possible that the OOM_DISABLE thread sharing memory with the
   victim is waiting on that thread to exit and will actually cause
   future memory freeing, and

 - there is no guarantee that a thread is disabled from oom killing just
   because another thread sharing its mm is oom disabled.

Signed-off-by: David Rientjes <rientjes@google.com>
Reported-by: Oleg Nesterov <oleg@redhat.com>
Reviewed-by: Oleg Nesterov <oleg@redhat.com>
Cc: Ying Han <yinghan@google.com>
Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/exit.c |  2 --
 kernel/fork.c | 10 +---------
 2 files changed, 1 insertion(+), 11 deletions(-)

(limited to 'kernel')

diff --git a/kernel/exit.c b/kernel/exit.c
index 2913b3509d42..d0b7d988f873 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -681,8 +681,6 @@ static void exit_mm(struct task_struct * tsk)
 	enter_lazy_tlb(mm, current);
 	/* We don't want this task to be frozen prematurely */
 	clear_freeze_flag(tsk);
-	if (tsk->signal->oom_score_adj == OOM_SCORE_ADJ_MIN)
-		atomic_dec(&mm->oom_disable_count);
 	task_unlock(tsk);
 	mm_update_next_owner(mm);
 	mmput(mm);
diff --git a/kernel/fork.c b/kernel/fork.c
index 8e6b6f4fb272..70d76191afb9 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -501,7 +501,6 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p)
 	mm->cached_hole_size = ~0UL;
 	mm_init_aio(mm);
 	mm_init_owner(mm, p);
-	atomic_set(&mm->oom_disable_count, 0);
 
 	if (likely(!mm_alloc_pgd(mm))) {
 		mm->def_flags = 0;
@@ -816,8 +815,6 @@ good_mm:
 	/* Initializing for Swap token stuff */
 	mm->token_priority = 0;
 	mm->last_interval = 0;
-	if (tsk->signal->oom_score_adj == OOM_SCORE_ADJ_MIN)
-		atomic_inc(&mm->oom_disable_count);
 
 	tsk->mm = mm;
 	tsk->active_mm = mm;
@@ -1391,13 +1388,8 @@ bad_fork_cleanup_io:
 bad_fork_cleanup_namespaces:
 	exit_task_namespaces(p);
 bad_fork_cleanup_mm:
-	if (p->mm) {
-		task_lock(p);
-		if (p->signal->oom_score_adj == OOM_SCORE_ADJ_MIN)
-			atomic_dec(&p->mm->oom_disable_count);
-		task_unlock(p);
+	if (p->mm)
 		mmput(p->mm);
-	}
 bad_fork_cleanup_signal:
 	if (!(clone_flags & CLONE_THREAD))
 		free_signal_struct(p->signal);
-- 
cgit v1.2.2


From bc3e53f682d93df677dbd5006a404722b3adfe18 Mon Sep 17 00:00:00 2001
From: Christoph Lameter <cl@linux.com>
Date: Mon, 31 Oct 2011 17:07:30 -0700
Subject: mm: distinguish between mlocked and pinned pages

Some kernel components pin user space memory (infiniband and perf) (by
increasing the page count) and account that memory as "mlocked".

The difference between mlocking and pinning is:

A. mlocked pages are marked with PG_mlocked and are exempt from
   swapping. Page migration may move them around though.
   They are kept on a special LRU list.

B. Pinned pages cannot be moved because something needs to
   directly access physical memory. They may not be on any
   LRU list.

I recently saw an mlockalled process where mm->locked_vm became
bigger than the virtual size of the process (!) because some
memory was accounted for twice:

Once when the page was mlocked and once when the Infiniband
layer increased the refcount because it needt to pin the RDMA
memory.

This patch introduces a separate counter for pinned pages and
accounts them seperately.

Signed-off-by: Christoph Lameter <cl@linux.com>
Cc: Mike Marciniszyn <infinipath@qlogic.com>
Cc: Roland Dreier <roland@kernel.org>
Cc: Sean Hefty <sean.hefty@intel.com>
Cc: Hugh Dickins <hughd@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/events/core.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/events/core.c b/kernel/events/core.c
index d1a1bee35228..12a0287e0358 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -3544,7 +3544,7 @@ static void perf_mmap_close(struct vm_area_struct *vma)
 		struct ring_buffer *rb = event->rb;
 
 		atomic_long_sub((size >> PAGE_SHIFT) + 1, &user->locked_vm);
-		vma->vm_mm->locked_vm -= event->mmap_locked;
+		vma->vm_mm->pinned_vm -= event->mmap_locked;
 		rcu_assign_pointer(event->rb, NULL);
 		mutex_unlock(&event->mmap_mutex);
 
@@ -3625,7 +3625,7 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
 
 	lock_limit = rlimit(RLIMIT_MEMLOCK);
 	lock_limit >>= PAGE_SHIFT;
-	locked = vma->vm_mm->locked_vm + extra;
+	locked = vma->vm_mm->pinned_vm + extra;
 
 	if ((locked > lock_limit) && perf_paranoid_tracepoint_raw() &&
 		!capable(CAP_IPC_LOCK)) {
@@ -3651,7 +3651,7 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
 	atomic_long_add(user_extra, &user->locked_vm);
 	event->mmap_locked = extra;
 	event->mmap_user = get_current_user();
-	vma->vm_mm->locked_vm += event->mmap_locked;
+	vma->vm_mm->pinned_vm += event->mmap_locked;
 
 unlock:
 	if (!ret)
-- 
cgit v1.2.2


From f445027e4e692bd885118460b292d08027fd5501 Mon Sep 17 00:00:00 2001
From: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
Date: Mon, 31 Oct 2011 17:11:15 -0700
Subject: stop_machine: make stop_machine safe and efficient to call early

Make stop_machine() safe to call early in boot, before SMP has been set
up, by simply calling the callback function directly if there's only one
CPU online.

[ Fixes from AKPM:
   - add comment
   - local_irq_flags, not save_flags
   - also call hard_irq_disable() for systems which need it

  Tejun suggested using an explicit flag rather than just looking at
  the online cpu count. ]

Cc: Tejun Heo <tj@kernel.org>
Acked-by: Rusty Russell <rusty@rustcorp.com.au>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: H. Peter Anvin <hpa@linux.intel.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Steven Rostedt <rostedt@goodmis.org>
Acked-by: Tejun Heo <htejun@gmail.com>
Cc: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/stop_machine.c | 22 ++++++++++++++++++++++
 1 file changed, 22 insertions(+)

(limited to 'kernel')

diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index ba5070ce5765..5b0951aa0496 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -41,6 +41,7 @@ struct cpu_stopper {
 };
 
 static DEFINE_PER_CPU(struct cpu_stopper, cpu_stopper);
+static bool stop_machine_initialized = false;
 
 static void cpu_stop_init_done(struct cpu_stop_done *done, unsigned int nr_todo)
 {
@@ -386,6 +387,8 @@ static int __init cpu_stop_init(void)
 	cpu_stop_cpu_callback(&cpu_stop_cpu_notifier, CPU_ONLINE, bcpu);
 	register_cpu_notifier(&cpu_stop_cpu_notifier);
 
+	stop_machine_initialized = true;
+
 	return 0;
 }
 early_initcall(cpu_stop_init);
@@ -485,6 +488,25 @@ int __stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus)
 					    .num_threads = num_online_cpus(),
 					    .active_cpus = cpus };
 
+	if (!stop_machine_initialized) {
+		/*
+		 * Handle the case where stop_machine() is called
+		 * early in boot before stop_machine() has been
+		 * initialized.
+		 */
+		unsigned long flags;
+		int ret;
+
+		WARN_ON_ONCE(smdata.num_threads != 1);
+
+		local_irq_save(flags);
+		hard_irq_disable();
+		ret = (*fn)(data);
+		local_irq_restore(flags);
+
+		return ret;
+	}
+
 	/* Set the initial state and stop all online cpus. */
 	set_state(&smdata, STOPMACHINE_PREPARE);
 	return stop_cpus(cpu_online_mask, stop_machine_cpu_stop, &smdata);
-- 
cgit v1.2.2


From 4ff819515b203f937cc6c8a0215a37a68d1ee71f Mon Sep 17 00:00:00 2001
From: Vasily Averin <vvs@sw.ru>
Date: Mon, 31 Oct 2011 17:11:18 -0700
Subject: watchdog: move watchdog_*_all_cpus under CONFIG_SYSCTL

Fix compilation warnings for CONFIG_SYSCTL=n:

fixed compilation warnings in case of disabled CONFIG_SYSCTL
kernel/watchdog.c:483:13: warning: `watchdog_enable_all_cpus' defined but not used
kernel/watchdog.c:500:13: warning: `watchdog_disable_all_cpus' defined but not used

these functions are static and are used only in sysctl handler, so move
them inside #ifdef CONFIG_SYSCTL too

Signed-off-by: Vasily Averin <vvs@sw.ru>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/watchdog.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index d680381b0e9c..1d7bca7f4f52 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -481,6 +481,8 @@ static void watchdog_disable(int cpu)
 	}
 }
 
+/* sysctl functions */
+#ifdef CONFIG_SYSCTL
 static void watchdog_enable_all_cpus(void)
 {
 	int cpu;
@@ -510,8 +512,6 @@ static void watchdog_disable_all_cpus(void)
 }
 
 
-/* sysctl functions */
-#ifdef CONFIG_SYSCTL
 /*
  * proc handler for /proc/sys/kernel/nmi_watchdog,watchdog_thresh
  */
-- 
cgit v1.2.2


From 73efc0394e148d0e15583e13712637831f926720 Mon Sep 17 00:00:00 2001
From: Dan Ballard <dan@mindstab.net>
Date: Mon, 31 Oct 2011 17:11:20 -0700
Subject: kernel/sysctl.c: add cap_last_cap to /proc/sys/kernel

Userspace needs to know the highest valid capability of the running
kernel, which right now cannot reliably be retrieved from the header files
only.  The fact that this value cannot be determined properly right now
creates various problems for libraries compiled on newer header files
which are run on older kernels.  They assume capabilities are available
which actually aren't.  libcap-ng is one example.  And we ran into the
same problem with systemd too.

Now the capability is exported in /proc/sys/kernel/cap_last_cap.

[akpm@linux-foundation.org: make cap_last_cap const, per Ulrich]
Signed-off-by: Dan Ballard <dan@mindstab.net>
Cc: Randy Dunlap <rdunlap@xenotime.net>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Lennart Poettering <lennart@poettering.net>
Cc: Kay Sievers <kay.sievers@vrfy.org>
Cc: Ulrich Drepper <drepper@akkadia.org>
Cc: James Morris <jmorris@namei.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/sysctl.c | 9 +++++++++
 1 file changed, 9 insertions(+)

(limited to 'kernel')

diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 2d2ecdcc8cdb..c49d66658ec0 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -57,6 +57,7 @@
 #include <linux/pipe_fs_i.h>
 #include <linux/oom.h>
 #include <linux/kmod.h>
+#include <linux/capability.h>
 
 #include <asm/uaccess.h>
 #include <asm/processor.h>
@@ -134,6 +135,7 @@ static int minolduid;
 static int min_percpu_pagelist_fract = 8;
 
 static int ngroups_max = NGROUPS_MAX;
+static const int cap_last_cap = CAP_LAST_CAP;
 
 #ifdef CONFIG_INOTIFY_USER
 #include <linux/inotify.h>
@@ -740,6 +742,13 @@ static struct ctl_table kern_table[] = {
 		.mode		= 0444,
 		.proc_handler	= proc_dointvec,
 	},
+	{
+		.procname	= "cap_last_cap",
+		.data		= (void *)&cap_last_cap,
+		.maxlen		= sizeof(int),
+		.mode		= 0444,
+		.proc_handler	= proc_dointvec,
+	},
 #if defined(CONFIG_LOCKUP_DETECTOR)
 	{
 		.procname       = "watchdog",
-- 
cgit v1.2.2


From 0eca6b7c78fd997e02bd9850e608102382b7822e Mon Sep 17 00:00:00 2001
From: Yanmin Zhang <yanmin_zhang@linux.intel.com>
Date: Mon, 31 Oct 2011 17:11:25 -0700
Subject: printk: add module parameter ignore_loglevel to control
 ignore_loglevel

We are enabling some power features on medfield.  To test suspend-2-RAM
conveniently, we need turn on/off ignore_loglevel frequently without
rebooting.

Add a module parameter, so users can change it by:
/sys/module/printk/parameters/ignore_loglevel

Signed-off-by: Yanmin Zhang <yanmin.zhang@intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/printk.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'kernel')

diff --git a/kernel/printk.c b/kernel/printk.c
index b7da18391c38..e62f949ec140 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -532,6 +532,9 @@ static int __init ignore_loglevel_setup(char *str)
 }
 
 early_param("ignore_loglevel", ignore_loglevel_setup);
+module_param_named(ignore_loglevel, ignore_loglevel, bool, S_IRUGO | S_IWUSR);
+MODULE_PARM_DESC(ignore_loglevel, "ignore loglevel setting, to"
+	"print all kernel messages to the console.");
 
 /*
  * Write out chars from start to end - 1 inclusive
-- 
cgit v1.2.2


From 134620f7a865b3bc9e3d56d460603592b70ede21 Mon Sep 17 00:00:00 2001
From: Yanmin Zhang <yanmin_zhang@linux.intel.com>
Date: Mon, 31 Oct 2011 17:11:27 -0700
Subject: printk: add console_suspend module parameter

We are enabling some power features on medfield.  To test suspend-2-RAM
conveniently, we need turn on/off console_suspend_enabled frequently.

Add a module parameter, so users could change it by:
/sys/module/printk/parameters/console_suspend

Signed-off-by: Yanmin Zhang <yanmin_zhang@linux.intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/printk.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'kernel')

diff --git a/kernel/printk.c b/kernel/printk.c
index e62f949ec140..6d9dedd11450 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -1111,6 +1111,10 @@ static int __init console_suspend_disable(char *str)
 	return 1;
 }
 __setup("no_console_suspend", console_suspend_disable);
+module_param_named(console_suspend, console_suspend_enabled,
+		bool, S_IRUGO | S_IWUSR);
+MODULE_PARM_DESC(console_suspend, "suspend console during suspend"
+	" and hibernate operations");
 
 /**
  * suspend_console - suspend the console subsystem
-- 
cgit v1.2.2


From 48e41899e4a3592746e5263c14681bf5c1393563 Mon Sep 17 00:00:00 2001
From: William Douglas <william.r.douglas@gmail.com>
Date: Mon, 31 Oct 2011 17:11:29 -0700
Subject: printk: fix bounds checking for log_prefix

Currently log_prefix is testing that the first character of the log level
and facility is less than '0' and greater than '9' (which is always
false).  It should be testing to see if the character less than '0' or
greater than '9' instead.  This patch makes that change.

The code being changed worked because strtoul bombs out (endp isn't
updated) and 0 is returned anyway.

Signed-off-by: William Douglas <william.douglas@intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/printk.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/printk.c b/kernel/printk.c
index 6d9dedd11450..286d2c7be52c 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -595,7 +595,7 @@ static size_t log_prefix(const char *p, unsigned int *level, char *special)
 		/* multi digit including the level and facility number */
 		char *endp = NULL;
 
-		if (p[1] < '0' && p[1] > '9')
+		if (p[1] < '0' || p[1] > '9')
 			return 0;
 
 		lev = (simple_strtoul(&p[1], &endp, 10) & 7);
-- 
cgit v1.2.2


From ae29bc92da01a2e9d278a9a58c3b307d41cc0254 Mon Sep 17 00:00:00 2001
From: William Douglas <william.r.douglas@gmail.com>
Date: Mon, 31 Oct 2011 17:11:31 -0700
Subject: printk: remove bounds checking for log_prefix

Currently log_prefix is testing that the first character of the log level
and facility is less than '0' and greater than '9' (which is always
false).

Since the code being updated works because strtoul bombs out (endp isn't
updated) and 0 is returned anyway just remove the check and don't change
the behavior of the function.

Signed-off-by: William Douglas <william.douglas@intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/printk.c | 3 ---
 1 file changed, 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/printk.c b/kernel/printk.c
index 286d2c7be52c..1455a0d4eedd 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -595,9 +595,6 @@ static size_t log_prefix(const char *p, unsigned int *level, char *special)
 		/* multi digit including the level and facility number */
 		char *endp = NULL;
 
-		if (p[1] < '0' || p[1] > '9')
-			return 0;
-
 		lev = (simple_strtoul(&p[1], &endp, 10) & 7);
 		if (endp == NULL || endp[0] != '>')
 			return 0;
-- 
cgit v1.2.2


From 50e1499f468fd74c6db95deb2e1e6bfee578ae70 Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Mon, 31 Oct 2011 17:12:51 -0700
Subject: kgdb: follow rename pack_hex_byte() to hex_byte_pack()

There is no functional change.

Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Acked-by: Jesper Nilsson <jesper.nilsson@axis.com>
Cc: David Howells <dhowells@redhat.com>
Cc: Koichi Yasutake <yasutake.koichi@jp.panasonic.com>
Cc: Jason Wessel <jason.wessel@windriver.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/debug/gdbstub.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/debug/gdbstub.c b/kernel/debug/gdbstub.c
index 34872482315e..c22d8c28ad84 100644
--- a/kernel/debug/gdbstub.c
+++ b/kernel/debug/gdbstub.c
@@ -217,7 +217,7 @@ void gdbstub_msg_write(const char *s, int len)
 
 		/* Pack in hex chars */
 		for (i = 0; i < wcount; i++)
-			bufptr = pack_hex_byte(bufptr, s[i]);
+			bufptr = hex_byte_pack(bufptr, s[i]);
 		*bufptr = '\0';
 
 		/* Move up */
@@ -249,7 +249,7 @@ char *kgdb_mem2hex(char *mem, char *buf, int count)
 	if (err)
 		return NULL;
 	while (count > 0) {
-		buf = pack_hex_byte(buf, *tmp);
+		buf = hex_byte_pack(buf, *tmp);
 		tmp++;
 		count--;
 	}
@@ -411,14 +411,14 @@ static char *pack_threadid(char *pkt, unsigned char *id)
 	limit = id + (BUF_THREAD_ID_SIZE / 2);
 	while (id < limit) {
 		if (!lzero || *id != 0) {
-			pkt = pack_hex_byte(pkt, *id);
+			pkt = hex_byte_pack(pkt, *id);
 			lzero = 0;
 		}
 		id++;
 	}
 
 	if (lzero)
-		pkt = pack_hex_byte(pkt, 0);
+		pkt = hex_byte_pack(pkt, 0);
 
 	return pkt;
 }
@@ -486,7 +486,7 @@ static void gdb_cmd_status(struct kgdb_state *ks)
 	dbg_remove_all_break();
 
 	remcom_out_buffer[0] = 'S';
-	pack_hex_byte(&remcom_out_buffer[1], ks->signo);
+	hex_byte_pack(&remcom_out_buffer[1], ks->signo);
 }
 
 static void gdb_get_regs_helper(struct kgdb_state *ks)
@@ -954,7 +954,7 @@ int gdb_serial_stub(struct kgdb_state *ks)
 		/* Reply to host that an exception has occurred */
 		ptr = remcom_out_buffer;
 		*ptr++ = 'T';
-		ptr = pack_hex_byte(ptr, ks->signo);
+		ptr = hex_byte_pack(ptr, ks->signo);
 		ptr += strlen(strcpy(ptr, "thread:"));
 		int_to_threadref(thref, shadow_pid(current->pid));
 		ptr = pack_threadid(ptr, thref);
-- 
cgit v1.2.2


From 33ef6b6984403a688189317ef46bb3caab3b70e0 Mon Sep 17 00:00:00 2001
From: Ben Blum <bblum@andrew.cmu.edu>
Date: Wed, 2 Nov 2011 13:38:05 -0700
Subject: cgroups: more safe tasklist locking in cgroup_attach_proc

Fix unstable tasklist locking in cgroup_attach_proc.

According to this thread - https://lkml.org/lkml/2011/7/27/243 - RCU is
not sufficient to guarantee the tasklist is stable w.r.t.  de_thread and
exit.  Taking tasklist_lock for reading, instead of rcu_read_lock, ensures
proper exclusion.

Signed-off-by: Ben Blum <bblum@andrew.cmu.edu>
Acked-by: Paul Menage <paul@paulmenage.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Cc: Neil Brown <neilb@suse.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/cgroup.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 453100a4159d..64b0e73402df 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -2027,7 +2027,7 @@ int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader)
 		goto out_free_group_list;
 
 	/* prevent changes to the threadgroup list while we take a snapshot. */
-	rcu_read_lock();
+	read_lock(&tasklist_lock);
 	if (!thread_group_leader(leader)) {
 		/*
 		 * a race with de_thread from another thread's exec() may strip
@@ -2036,7 +2036,7 @@ int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader)
 		 * throw this task away and try again (from cgroup_procs_write);
 		 * this is "double-double-toil-and-trouble-check locking".
 		 */
-		rcu_read_unlock();
+		read_unlock(&tasklist_lock);
 		retval = -EAGAIN;
 		goto out_free_group_list;
 	}
@@ -2057,7 +2057,7 @@ int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader)
 	} while_each_thread(leader, tsk);
 	/* remember the number of threads in the array for later. */
 	group_size = i;
-	rcu_read_unlock();
+	read_unlock(&tasklist_lock);
 
 	/*
 	 * step 1: check that we can legitimately attach to the cgroup.
-- 
cgit v1.2.2


From 77ceab8ea590d7dc6c8f055ce43dfebd74428107 Mon Sep 17 00:00:00 2001
From: Ben Blum <bblum@andrew.cmu.edu>
Date: Wed, 2 Nov 2011 13:38:07 -0700
Subject: cgroups: don't attach task to subsystem if migration failed

If a task has exited to the point it has called cgroup_exit() already,
then we can't migrate it to another cgroup anymore.

This can happen when we are attaching a task to a new cgroup between the
call to ->can_attach_task() on subsystems and the migration that is
eventually tried in cgroup_task_migrate().

In this case cgroup_task_migrate() returns -ESRCH and we don't want to
attach the task to the subsystems because the attachment to the new cgroup
itself failed.

Fix this by only calling ->attach_task() on the subsystems if the cgroup
migration succeeded.

Reported-by: Oleg Nesterov <oleg@redhat.com>
Signed-off-by: Ben Blum <bblum@andrew.cmu.edu>
Acked-by: Paul Menage <paul@paulmenage.org>
Cc: Li Zefan <lizf@cn.fujitsu.com>
Cc: Tejun Heo <tj@kernel.org>
Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/cgroup.c | 15 +++++++++------
 1 file changed, 9 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 64b0e73402df..8386b21224ef 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -2135,14 +2135,17 @@ int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader)
 		oldcgrp = task_cgroup_from_root(tsk, root);
 		if (cgrp == oldcgrp)
 			continue;
-		/* attach each task to each subsystem */
-		for_each_subsys(root, ss) {
-			if (ss->attach_task)
-				ss->attach_task(cgrp, tsk);
-		}
 		/* if the thread is PF_EXITING, it can just get skipped. */
 		retval = cgroup_task_migrate(cgrp, oldcgrp, tsk, true);
-		BUG_ON(retval != 0 && retval != -ESRCH);
+		if (retval == 0) {
+			/* attach each task to each subsystem */
+			for_each_subsys(root, ss) {
+				if (ss->attach_task)
+					ss->attach_task(cgrp, tsk);
+			}
+		} else {
+			BUG_ON(retval != -ESRCH);
+		}
 	}
 	/* nothing is sensitive to fork() after this point. */
 
-- 
cgit v1.2.2


From 89e8a244b97e48f1f30e898b6f32acca477f2a13 Mon Sep 17 00:00:00 2001
From: David Rientjes <rientjes@google.com>
Date: Wed, 2 Nov 2011 13:38:39 -0700
Subject: cpusets: avoid looping when storing to mems_allowed if one node
 remains set

{get,put}_mems_allowed() exist so that general kernel code may locklessly
access a task's set of allowable nodes without having the chance that a
concurrent write will cause the nodemask to be empty on configurations
where MAX_NUMNODES > BITS_PER_LONG.

This could incur a significant delay, however, especially in low memory
conditions because the page allocator is blocking and reclaim requires
get_mems_allowed() itself.  It is not atypical to see writes to
cpuset.mems take over 2 seconds to complete, for example.  In low memory
conditions, this is problematic because it's one of the most imporant
times to change cpuset.mems in the first place!

The only way a task's set of allowable nodes may change is through cpusets
by writing to cpuset.mems and when attaching a task to a generic code is
not reading the nodemask with get_mems_allowed() at the same time, and
then clearing all the old nodes.  This prevents the possibility that a
reader will see an empty nodemask at the same time the writer is storing a
new nodemask.

If at least one node remains unchanged, though, it's possible to simply
set all new nodes and then clear all the old nodes.  Changing a task's
nodemask is protected by cgroup_mutex so it's guaranteed that two threads
are not changing the same task's nodemask at the same time, so the
nodemask is guaranteed to be stored before another thread changes it and
determines whether a node remains set or not.

Signed-off-by: David Rientjes <rientjes@google.com>
Cc: Miao Xie <miaox@cn.fujitsu.com>
Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Cc: Nick Piggin <npiggin@kernel.dk>
Cc: Paul Menage <paul@paulmenage.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/cpuset.c | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 10131fdaff70..ed0ff443f036 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -949,6 +949,8 @@ static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from,
 static void cpuset_change_task_nodemask(struct task_struct *tsk,
 					nodemask_t *newmems)
 {
+	bool masks_disjoint = !nodes_intersects(*newmems, tsk->mems_allowed);
+
 repeat:
 	/*
 	 * Allow tasks that have access to memory reserves because they have
@@ -963,7 +965,6 @@ repeat:
 	nodes_or(tsk->mems_allowed, tsk->mems_allowed, *newmems);
 	mpol_rebind_task(tsk, newmems, MPOL_REBIND_STEP1);
 
-
 	/*
 	 * ensure checking ->mems_allowed_change_disable after setting all new
 	 * allowed nodes.
@@ -980,9 +981,11 @@ repeat:
 
 	/*
 	 * Allocation of memory is very fast, we needn't sleep when waiting
-	 * for the read-side.
+	 * for the read-side.  No wait is necessary, however, if at least one
+	 * node remains unchanged.
 	 */
-	while (ACCESS_ONCE(tsk->mems_allowed_change_disable)) {
+	while (masks_disjoint &&
+			ACCESS_ONCE(tsk->mems_allowed_change_disable)) {
 		task_unlock(tsk);
 		if (!task_curr(tsk))
 			yield();
-- 
cgit v1.2.2


From f1ecf06854a66ee663f4d4cf029c78cd62a15e04 Mon Sep 17 00:00:00 2001
From: Lucas De Marchi <lucas.demarchi@profusion.mobi>
Date: Wed, 2 Nov 2011 13:39:22 -0700
Subject: sysctl: add support for poll()

Adding support for poll() in sysctl fs allows userspace to receive
notifications of changes in sysctl entries.  This adds a infrastructure to
allow files in sysctl fs to be pollable and implements it for hostname and
domainname.

[akpm@linux-foundation.org: s/declare/define/ for definitions]
Signed-off-by: Lucas De Marchi <lucas.demarchi@profusion.mobi>
Cc: Greg KH <gregkh@suse.de>
Cc: Kay Sievers <kay.sievers@vrfy.org>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: Alexey Dobriyan <adobriyan@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/sys.c            |  2 ++
 kernel/utsname_sysctl.c | 23 +++++++++++++++++++++++
 2 files changed, 25 insertions(+)

(limited to 'kernel')

diff --git a/kernel/sys.c b/kernel/sys.c
index 58459509b14c..d06c091e0345 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -1286,6 +1286,7 @@ SYSCALL_DEFINE2(sethostname, char __user *, name, int, len)
 		memset(u->nodename + len, 0, sizeof(u->nodename) - len);
 		errno = 0;
 	}
+	uts_proc_notify(UTS_PROC_HOSTNAME);
 	up_write(&uts_sem);
 	return errno;
 }
@@ -1336,6 +1337,7 @@ SYSCALL_DEFINE2(setdomainname, char __user *, name, int, len)
 		memset(u->domainname + len, 0, sizeof(u->domainname) - len);
 		errno = 0;
 	}
+	uts_proc_notify(UTS_PROC_DOMAINNAME);
 	up_write(&uts_sem);
 	return errno;
 }
diff --git a/kernel/utsname_sysctl.c b/kernel/utsname_sysctl.c
index a2cd77e70d4d..3b0d48ebf81d 100644
--- a/kernel/utsname_sysctl.c
+++ b/kernel/utsname_sysctl.c
@@ -13,6 +13,7 @@
 #include <linux/uts.h>
 #include <linux/utsname.h>
 #include <linux/sysctl.h>
+#include <linux/wait.h>
 
 static void *get_uts(ctl_table *table, int write)
 {
@@ -51,12 +52,19 @@ static int proc_do_uts_string(ctl_table *table, int write,
 	uts_table.data = get_uts(table, write);
 	r = proc_dostring(&uts_table,write,buffer,lenp, ppos);
 	put_uts(table, write, uts_table.data);
+
+	if (write)
+		proc_sys_poll_notify(table->poll);
+
 	return r;
 }
 #else
 #define proc_do_uts_string NULL
 #endif
 
+static DEFINE_CTL_TABLE_POLL(hostname_poll);
+static DEFINE_CTL_TABLE_POLL(domainname_poll);
+
 static struct ctl_table uts_kern_table[] = {
 	{
 		.procname	= "ostype",
@@ -85,6 +93,7 @@ static struct ctl_table uts_kern_table[] = {
 		.maxlen		= sizeof(init_uts_ns.name.nodename),
 		.mode		= 0644,
 		.proc_handler	= proc_do_uts_string,
+		.poll		= &hostname_poll,
 	},
 	{
 		.procname	= "domainname",
@@ -92,6 +101,7 @@ static struct ctl_table uts_kern_table[] = {
 		.maxlen		= sizeof(init_uts_ns.name.domainname),
 		.mode		= 0644,
 		.proc_handler	= proc_do_uts_string,
+		.poll		= &domainname_poll,
 	},
 	{}
 };
@@ -105,6 +115,19 @@ static struct ctl_table uts_root_table[] = {
 	{}
 };
 
+#ifdef CONFIG_PROC_SYSCTL
+/*
+ * Notify userspace about a change in a certain entry of uts_kern_table,
+ * identified by the parameter proc.
+ */
+void uts_proc_notify(enum uts_proc proc)
+{
+	struct ctl_table *table = &uts_kern_table[proc];
+
+	proc_sys_poll_notify(table->poll);
+}
+#endif
+
 static int __init utsname_sysctl_init(void)
 {
 	register_sysctl_table(uts_root_table);
-- 
cgit v1.2.2


From c1e2ee2dc436574880758b3836fc96935b774c32 Mon Sep 17 00:00:00 2001
From: Andrew Bresticker <abrestic@google.com>
Date: Wed, 2 Nov 2011 13:40:29 -0700
Subject: memcg: replace ss->id_lock with a rwlock

While back-porting Johannes Weiner's patch "mm: memcg-aware global
reclaim" for an internal effort, we noticed a significant performance
regression during page-reclaim heavy workloads due to high contention of
the ss->id_lock.  This lock protects idr map, and serializes calls to
idr_get_next() in css_get_next() (which is used during the memcg hierarchy
walk).

Since idr_get_next() is just doing a look up, we need only serialize it
with respect to idr_remove()/idr_get_new().  By making the ss->id_lock a
rwlock, contention is greatly reduced and performance improves.

Tested: cat a 256m file from a ramdisk in a 128m container 50 times on
each core (one file + container per core) in parallel on a NUMA machine.
Result is the time for the test to complete in 1 of the containers.
Both kernels included Johannes' memcg-aware global reclaim patches.

Before rwlock patch: 1710.778s
After rwlock patch: 152.227s

Signed-off-by: Andrew Bresticker <abrestic@google.com>
Cc: Paul Menage <menage@gmail.com>
Cc: Li Zefan <lizf@cn.fujitsu.com>
Acked-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Ying Han <yinghan@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/cgroup.c | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

(limited to 'kernel')

diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 8386b21224ef..d9d5648f3cdc 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -4883,9 +4883,9 @@ void free_css_id(struct cgroup_subsys *ss, struct cgroup_subsys_state *css)
 
 	rcu_assign_pointer(id->css, NULL);
 	rcu_assign_pointer(css->id, NULL);
-	spin_lock(&ss->id_lock);
+	write_lock(&ss->id_lock);
 	idr_remove(&ss->idr, id->id);
-	spin_unlock(&ss->id_lock);
+	write_unlock(&ss->id_lock);
 	kfree_rcu(id, rcu_head);
 }
 EXPORT_SYMBOL_GPL(free_css_id);
@@ -4911,10 +4911,10 @@ static struct css_id *get_new_cssid(struct cgroup_subsys *ss, int depth)
 		error = -ENOMEM;
 		goto err_out;
 	}
-	spin_lock(&ss->id_lock);
+	write_lock(&ss->id_lock);
 	/* Don't use 0. allocates an ID of 1-65535 */
 	error = idr_get_new_above(&ss->idr, newid, 1, &myid);
-	spin_unlock(&ss->id_lock);
+	write_unlock(&ss->id_lock);
 
 	/* Returns error when there are no free spaces for new ID.*/
 	if (error) {
@@ -4929,9 +4929,9 @@ static struct css_id *get_new_cssid(struct cgroup_subsys *ss, int depth)
 	return newid;
 remove_idr:
 	error = -ENOSPC;
-	spin_lock(&ss->id_lock);
+	write_lock(&ss->id_lock);
 	idr_remove(&ss->idr, myid);
-	spin_unlock(&ss->id_lock);
+	write_unlock(&ss->id_lock);
 err_out:
 	kfree(newid);
 	return ERR_PTR(error);
@@ -4943,7 +4943,7 @@ static int __init_or_module cgroup_init_idr(struct cgroup_subsys *ss,
 {
 	struct css_id *newid;
 
-	spin_lock_init(&ss->id_lock);
+	rwlock_init(&ss->id_lock);
 	idr_init(&ss->idr);
 
 	newid = get_new_cssid(ss, 0);
@@ -5038,9 +5038,9 @@ css_get_next(struct cgroup_subsys *ss, int id,
 		 * scan next entry from bitmap(tree), tmpid is updated after
 		 * idr_get_next().
 		 */
-		spin_lock(&ss->id_lock);
+		read_lock(&ss->id_lock);
 		tmp = idr_get_next(&ss->idr, &tmpid);
-		spin_unlock(&ss->id_lock);
+		read_unlock(&ss->id_lock);
 
 		if (!tmp)
 			break;
-- 
cgit v1.2.2


From 4536e4d1d21c8172402a2217b0fa1880665ace36 Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Thu, 3 Nov 2011 07:44:04 -0700
Subject: Revert "perf: Add PM notifiers to fix CPU hotplug races"

This reverts commit 144060fee07e9c22e179d00819c83c86fbcbf82c.

It causes a resume regression for Andi on his Acer Aspire 1830T post
3.1.  The screen just stays black after wakeup.

Also, it really looks like the wrong way to suspend and resume perf
events: I think they should be done as part of the CPU suspend and
resume, rather than as a notifier that does smp_call_function().

Reported-by: Andi Kleen <andi@firstfloor.org>
Acked-by: Ingo Molnar <mingo@elte.hu>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Rafael J. Wysocki <rjw@sisk.pl>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/events/core.c | 97 ++--------------------------------------------------
 1 file changed, 2 insertions(+), 95 deletions(-)

(limited to 'kernel')

diff --git a/kernel/events/core.c b/kernel/events/core.c
index 12a0287e0358..e1253faa34dd 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -29,7 +29,6 @@
 #include <linux/hardirq.h>
 #include <linux/rculist.h>
 #include <linux/uaccess.h>
-#include <linux/suspend.h>
 #include <linux/syscalls.h>
 #include <linux/anon_inodes.h>
 #include <linux/kernel_stat.h>
@@ -6853,7 +6852,7 @@ static void __cpuinit perf_event_init_cpu(int cpu)
 	struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
 
 	mutex_lock(&swhash->hlist_mutex);
-	if (swhash->hlist_refcount > 0 && !swhash->swevent_hlist) {
+	if (swhash->hlist_refcount > 0) {
 		struct swevent_hlist *hlist;
 
 		hlist = kzalloc_node(sizeof(*hlist), GFP_KERNEL, cpu_to_node(cpu));
@@ -6942,14 +6941,7 @@ perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu)
 {
 	unsigned int cpu = (long)hcpu;
 
-	/*
-	 * Ignore suspend/resume action, the perf_pm_notifier will
-	 * take care of that.
-	 */
-	if (action & CPU_TASKS_FROZEN)
-		return NOTIFY_OK;
-
-	switch (action) {
+	switch (action & ~CPU_TASKS_FROZEN) {
 
 	case CPU_UP_PREPARE:
 	case CPU_DOWN_FAILED:
@@ -6968,90 +6960,6 @@ perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu)
 	return NOTIFY_OK;
 }
 
-static void perf_pm_resume_cpu(void *unused)
-{
-	struct perf_cpu_context *cpuctx;
-	struct perf_event_context *ctx;
-	struct pmu *pmu;
-	int idx;
-
-	idx = srcu_read_lock(&pmus_srcu);
-	list_for_each_entry_rcu(pmu, &pmus, entry) {
-		cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
-		ctx = cpuctx->task_ctx;
-
-		perf_ctx_lock(cpuctx, ctx);
-		perf_pmu_disable(cpuctx->ctx.pmu);
-
-		cpu_ctx_sched_out(cpuctx, EVENT_ALL);
-		if (ctx)
-			ctx_sched_out(ctx, cpuctx, EVENT_ALL);
-
-		perf_pmu_enable(cpuctx->ctx.pmu);
-		perf_ctx_unlock(cpuctx, ctx);
-	}
-	srcu_read_unlock(&pmus_srcu, idx);
-}
-
-static void perf_pm_suspend_cpu(void *unused)
-{
-	struct perf_cpu_context *cpuctx;
-	struct perf_event_context *ctx;
-	struct pmu *pmu;
-	int idx;
-
-	idx = srcu_read_lock(&pmus_srcu);
-	list_for_each_entry_rcu(pmu, &pmus, entry) {
-		cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
-		ctx = cpuctx->task_ctx;
-
-		perf_ctx_lock(cpuctx, ctx);
-		perf_pmu_disable(cpuctx->ctx.pmu);
-
-		perf_event_sched_in(cpuctx, ctx, current);
-
-		perf_pmu_enable(cpuctx->ctx.pmu);
-		perf_ctx_unlock(cpuctx, ctx);
-	}
-	srcu_read_unlock(&pmus_srcu, idx);
-}
-
-static int perf_resume(void)
-{
-	get_online_cpus();
-	smp_call_function(perf_pm_resume_cpu, NULL, 1);
-	put_online_cpus();
-
-	return NOTIFY_OK;
-}
-
-static int perf_suspend(void)
-{
-	get_online_cpus();
-	smp_call_function(perf_pm_suspend_cpu, NULL, 1);
-	put_online_cpus();
-
-	return NOTIFY_OK;
-}
-
-static int perf_pm(struct notifier_block *self, unsigned long action, void *ptr)
-{
-	switch (action) {
-	case PM_POST_HIBERNATION:
-	case PM_POST_SUSPEND:
-		return perf_resume();
-	case PM_HIBERNATION_PREPARE:
-	case PM_SUSPEND_PREPARE:
-		return perf_suspend();
-	default:
-		return NOTIFY_DONE;
-	}
-}
-
-static struct notifier_block perf_pm_notifier = {
-	.notifier_call = perf_pm,
-};
-
 void __init perf_event_init(void)
 {
 	int ret;
@@ -7066,7 +6974,6 @@ void __init perf_event_init(void)
 	perf_tp_register();
 	perf_cpu_notifier(perf_cpu_notify);
 	register_reboot_notifier(&perf_reboot_notifier);
-	register_pm_notifier(&perf_pm_notifier);
 
 	ret = init_hw_breakpoint();
 	WARN(ret, "hw_breakpoint initialization failed with: %d", ret);
-- 
cgit v1.2.2


From 79cfbdfa87e84992d509e6c1648a18e1d7e68c20 Mon Sep 17 00:00:00 2001
From: "Srivatsa S. Bhat" <srivatsa.bhat@linux.vnet.ibm.com>
Date: Thu, 3 Nov 2011 00:59:25 +0100
Subject: PM / Sleep: Fix race between CPU hotplug and freezer

The CPU hotplug notifications sent out by the _cpu_up() and _cpu_down()
functions depend on the value of the 'tasks_frozen' argument passed to them
(which indicates whether tasks have been frozen or not).
(Examples for such CPU hotplug notifications: CPU_ONLINE, CPU_ONLINE_FROZEN,
CPU_DEAD, CPU_DEAD_FROZEN).

Thus, it is essential that while the callbacks for those notifications are
running, the state of the system with respect to the tasks being frozen or
not remains unchanged, *throughout that duration*. Hence there is a need for
synchronizing the CPU hotplug code with the freezer subsystem.

Since the freezer is involved only in the Suspend/Hibernate call paths, this
patch hooks the CPU hotplug code to the suspend/hibernate notifiers
PM_[SUSPEND|HIBERNATE]_PREPARE and PM_POST_[SUSPEND|HIBERNATE] to prevent
the race between CPU hotplug and freezer, thus ensuring that CPU hotplug
notifications will always be run with the state of the system really being
what the notifications indicate, _throughout_ their execution time.

Signed-off-by: Srivatsa S. Bhat <srivatsa.bhat@linux.vnet.ibm.com>
Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
---
 kernel/cpu.c | 74 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 74 insertions(+)

(limited to 'kernel')

diff --git a/kernel/cpu.c b/kernel/cpu.c
index 12b7458f23b1..aa39dd7a3846 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -15,6 +15,7 @@
 #include <linux/stop_machine.h>
 #include <linux/mutex.h>
 #include <linux/gfp.h>
+#include <linux/suspend.h>
 
 #ifdef CONFIG_SMP
 /* Serializes the updates to cpu_online_mask, cpu_present_mask */
@@ -476,6 +477,79 @@ static int alloc_frozen_cpus(void)
 	return 0;
 }
 core_initcall(alloc_frozen_cpus);
+
+/*
+ * Prevent regular CPU hotplug from racing with the freezer, by disabling CPU
+ * hotplug when tasks are about to be frozen. Also, don't allow the freezer
+ * to continue until any currently running CPU hotplug operation gets
+ * completed.
+ * To modify the 'cpu_hotplug_disabled' flag, we need to acquire the
+ * 'cpu_add_remove_lock'. And this same lock is also taken by the regular
+ * CPU hotplug path and released only after it is complete. Thus, we
+ * (and hence the freezer) will block here until any currently running CPU
+ * hotplug operation gets completed.
+ */
+void cpu_hotplug_disable_before_freeze(void)
+{
+	cpu_maps_update_begin();
+	cpu_hotplug_disabled = 1;
+	cpu_maps_update_done();
+}
+
+
+/*
+ * When tasks have been thawed, re-enable regular CPU hotplug (which had been
+ * disabled while beginning to freeze tasks).
+ */
+void cpu_hotplug_enable_after_thaw(void)
+{
+	cpu_maps_update_begin();
+	cpu_hotplug_disabled = 0;
+	cpu_maps_update_done();
+}
+
+/*
+ * When callbacks for CPU hotplug notifications are being executed, we must
+ * ensure that the state of the system with respect to the tasks being frozen
+ * or not, as reported by the notification, remains unchanged *throughout the
+ * duration* of the execution of the callbacks.
+ * Hence we need to prevent the freezer from racing with regular CPU hotplug.
+ *
+ * This synchronization is implemented by mutually excluding regular CPU
+ * hotplug and Suspend/Hibernate call paths by hooking onto the Suspend/
+ * Hibernate notifications.
+ */
+static int
+cpu_hotplug_pm_callback(struct notifier_block *nb,
+			unsigned long action, void *ptr)
+{
+	switch (action) {
+
+	case PM_SUSPEND_PREPARE:
+	case PM_HIBERNATION_PREPARE:
+		cpu_hotplug_disable_before_freeze();
+		break;
+
+	case PM_POST_SUSPEND:
+	case PM_POST_HIBERNATION:
+		cpu_hotplug_enable_after_thaw();
+		break;
+
+	default:
+		return NOTIFY_DONE;
+	}
+
+	return NOTIFY_OK;
+}
+
+
+int cpu_hotplug_pm_sync_init(void)
+{
+	pm_notifier(cpu_hotplug_pm_callback, 0);
+	return 0;
+}
+core_initcall(cpu_hotplug_pm_sync_init);
+
 #endif /* CONFIG_PM_SLEEP_SMP */
 
 /**
-- 
cgit v1.2.2


From 6513fd6972f725291ee8ce62c7a39fb8a6c7391e Mon Sep 17 00:00:00 2001
From: Guennadi Liakhovetski <g.liakhovetski@gmx.de>
Date: Thu, 3 Nov 2011 10:12:36 +0100
Subject: PM / QoS: Remove redundant check

Remove an "if" check, that repeats an equivalent one 6 lines above.

Signed-off-by: Guennadi Liakhovetski <g.liakhovetski@gmx.de>
Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
---
 kernel/power/qos.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/power/qos.c b/kernel/power/qos.c
index 1c1797dd1d1d..5167d996cd02 100644
--- a/kernel/power/qos.c
+++ b/kernel/power/qos.c
@@ -386,8 +386,7 @@ static int pm_qos_power_open(struct inode *inode, struct file *filp)
 		pm_qos_add_request(req, pm_qos_class, PM_QOS_DEFAULT_VALUE);
 		filp->private_data = req;
 
-		if (filp->private_data)
-			return 0;
+		return 0;
 	}
 	return -EPERM;
 }
-- 
cgit v1.2.2


From d6cc76856d353a3a9c43bead33210b9216dce332 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 4 Nov 2011 01:04:52 +0100
Subject: PM / Freezer: Revert 27920651fe "PM / Freezer: Make
 fake_signal_wake_up() wake TASK_KILLABLE tasks too"

Commit 27920651fe "PM / Freezer: Make fake_signal_wake_up() wake
TASK_KILLABLE tasks too" updated fake_signal_wake_up() used by freezer
to wake up KILLABLE tasks.  Sending unsolicited wakeups to tasks in
killable sleep is dangerous as there are code paths which depend on
tasks not waking up spuriously from KILLABLE sleep.

For example. sys_read() or page can sleep in TASK_KILLABLE assuming
that wait/down/whatever _killable can only fail if we can not return
to the usermode.  TASK_TRACED is another obvious example.

The previous patch updated wait_event_freezekillable() such that it
doesn't depend on the spurious wakeup.  This patch reverts the
offending commit.

Note that the spurious KILLABLE wakeup had other implicit effects in
KILLABLE sleeps in nfs and cifs and those will need further updates to
regain freezekillable behavior.

Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
---
 kernel/freezer.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/freezer.c b/kernel/freezer.c
index 66a594e8ad2f..7b01de98bb6a 100644
--- a/kernel/freezer.c
+++ b/kernel/freezer.c
@@ -67,7 +67,7 @@ static void fake_signal_wake_up(struct task_struct *p)
 	unsigned long flags;
 
 	spin_lock_irqsave(&p->sighand->siglock, flags);
-	signal_wake_up(p, 1);
+	signal_wake_up(p, 0);
 	spin_unlock_irqrestore(&p->sighand->siglock, flags);
 }
 
-- 
cgit v1.2.2


From 1cd0d6c3021c8d76641b37203f504634b87fbabc Mon Sep 17 00:00:00 2001
From: Ben Hutchings <ben@decadent.org.uk>
Date: Tue, 1 Nov 2011 03:59:33 +0000
Subject: module: Enable dynamic debugging regardless of taint

Dynamic debugging is currently disabled for tainted modules, except
for TAINT_CRAP.  This prevents use of dynamic debugging for
out-of-tree modules once the next patch is applied.

This condition was apparently intended to avoid a crash if a force-
loaded module has an incompatible definition of dynamic debug
structures.  However, a administrator that forces us to load a module
is claiming that it *is* compatible even though it fails our version
checks.  If they are mistaken, there are any number of ways the module
could crash the system.

As a side-effect, proprietary and other tainted modules can now use
dynamic_debug.

Signed-off-by: Ben Hutchings <ben@decadent.org.uk>
Acked-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 kernel/module.c | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/module.c b/kernel/module.c
index 93342d992f34..3c5509642847 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -2878,8 +2878,7 @@ static struct module *load_module(void __user *umod,
 	}
 
 	/* This has to be done once we're sure module name is unique. */
-	if (!mod->taints || mod->taints == (1U<<TAINT_CRAP))
-		dynamic_debug_setup(info.debug, info.num_debug);
+	dynamic_debug_setup(info.debug, info.num_debug);
 
 	/* Find duplicate symbols */
 	err = verify_export_symbols(mod);
@@ -2915,8 +2914,7 @@ static struct module *load_module(void __user *umod,
 	module_bug_cleanup(mod);
 
  ddebug:
-	if (!mod->taints || mod->taints == (1U<<TAINT_CRAP))
-		dynamic_debug_remove(info.debug);
+	dynamic_debug_remove(info.debug);
  unlock:
 	mutex_unlock(&module_mutex);
 	synchronize_sched();
-- 
cgit v1.2.2


From 2449b8ba0745327c5fa49a8d9acffe03b2eded69 Mon Sep 17 00:00:00 2001
From: Ben Hutchings <ben@decadent.org.uk>
Date: Mon, 24 Oct 2011 15:12:28 +0200
Subject: module,bug: Add TAINT_OOT_MODULE flag for modules not built in-tree

Use of the GPL or a compatible licence doesn't necessarily make the code
any good.  We already consider staging modules to be suspect, and this
should also be true for out-of-tree modules which may receive very
little review.

Signed-off-by: Ben Hutchings <ben@decadent.org.uk>
Reviewed-by: Dave Jones <davej@redhat.com>
Acked-by: Greg Kroah-Hartman <gregkh@suse.de>
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au> (patched oops-tracing.txt)
---
 kernel/module.c | 5 +++++
 kernel/panic.c  | 2 ++
 2 files changed, 7 insertions(+)

(limited to 'kernel')

diff --git a/kernel/module.c b/kernel/module.c
index 3c5509642847..ef8cb70c6996 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -2487,6 +2487,9 @@ static int check_modinfo(struct module *mod, struct load_info *info)
 		return -ENOEXEC;
 	}
 
+	if (!get_modinfo(info, "intree"))
+		add_taint_module(mod, TAINT_OOT_MODULE);
+
 	if (get_modinfo(info, "staging")) {
 		add_taint_module(mod, TAINT_CRAP);
 		printk(KERN_WARNING "%s: module is from the staging directory,"
@@ -3255,6 +3258,8 @@ static char *module_flags(struct module *mod, char *buf)
 		buf[bx++] = '(';
 		if (mod->taints & (1 << TAINT_PROPRIETARY_MODULE))
 			buf[bx++] = 'P';
+		else if (mod->taints & (1 << TAINT_OOT_MODULE))
+			buf[bx++] = 'O';
 		if (mod->taints & (1 << TAINT_FORCED_MODULE))
 			buf[bx++] = 'F';
 		if (mod->taints & (1 << TAINT_CRAP))
diff --git a/kernel/panic.c b/kernel/panic.c
index d7bb6974efb5..b26593604214 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -177,6 +177,7 @@ static const struct tnt tnts[] = {
 	{ TAINT_WARN,			'W', ' ' },
 	{ TAINT_CRAP,			'C', ' ' },
 	{ TAINT_FIRMWARE_WORKAROUND,	'I', ' ' },
+	{ TAINT_OOT_MODULE,		'O', ' ' },
 };
 
 /**
@@ -194,6 +195,7 @@ static const struct tnt tnts[] = {
  *  'W' - Taint on warning.
  *  'C' - modules from drivers/staging are loaded.
  *  'I' - Working around severe firmware bug.
+ *  'O' - Out-of-tree module has been loaded.
  *
  *	The string is overwritten by the next call to print_tainted().
  */
-- 
cgit v1.2.2


From a6f05b97d1ba87326bd96f3da9fef994830d6994 Mon Sep 17 00:00:00 2001
From: Dominik Brodowski <linux@dominikbrodowski.net>
Date: Sun, 6 Nov 2011 21:54:12 +0100
Subject: PM / QoS: Set cpu_dma_pm_qos->name

Since commit 4a31a334, the name of this misc device is not initialized,
which leads to a funny device named /dev/(null) being created and
/proc/misc containing an entry with just a number but no name. The latter
leads to complaints by cryptsetup, which caused me to investigate this
matter.

Signed-off-by: Dominik Brodowski <linux@dominikbrodowski.net>
Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
---
 kernel/power/qos.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'kernel')

diff --git a/kernel/power/qos.c b/kernel/power/qos.c
index 56db75147186..995e3bd3417b 100644
--- a/kernel/power/qos.c
+++ b/kernel/power/qos.c
@@ -70,6 +70,7 @@ static struct pm_qos_constraints cpu_dma_constraints = {
 };
 static struct pm_qos_object cpu_dma_pm_qos = {
 	.constraints = &cpu_dma_constraints,
+	.name = "cpu_dma_latency",
 };
 
 static BLOCKING_NOTIFIER_HEAD(network_lat_notifier);
-- 
cgit v1.2.2