From 47f61f397cc08b5a9a815bd03cb10c48dab66034 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Tue, 26 Jul 2005 11:21:38 -0600
Subject: [PATCH] Add missing device_suspend(PMSG_FREEZE) calls.

In the recent addition of device_suspend calls into
sys_reboot two code paths were missed.

Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/sys.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'kernel')

diff --git a/kernel/sys.c b/kernel/sys.c
index 9a24374c23bc..5fc10d3e3891 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -391,6 +391,7 @@ asmlinkage long sys_reboot(int magic1, int magic2, unsigned int cmd, void __user
 	case LINUX_REBOOT_CMD_RESTART:
 		notifier_call_chain(&reboot_notifier_list, SYS_RESTART, NULL);
 		system_state = SYSTEM_RESTART;
+		device_suspend(PMSG_FREEZE);
 		device_shutdown();
 		printk(KERN_EMERG "Restarting system.\n");
 		machine_restart(NULL);
@@ -452,6 +453,7 @@ asmlinkage long sys_reboot(int magic1, int magic2, unsigned int cmd, void __user
 		}
 		notifier_call_chain(&reboot_notifier_list, SYS_RESTART, NULL);
 		system_state = SYSTEM_RESTART;
+		device_suspend(PMSG_FREEZE);
 		device_shutdown();
 		printk(KERN_EMERG "Starting new kernel\n");
 		machine_shutdown();
-- 
cgit v1.2.2


From 4a00ea1e18228e5ef99d4780671fda97226bda30 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Tue, 26 Jul 2005 11:24:14 -0600
Subject: [PATCH] Refactor sys_reboot into reusable parts

Because the factors of sys_reboot don't exist as separate functions,
people calling into the reboot path duplicate the code badly, leading
to inconsistent expectations of code in the reboot path.

This patch should be just code motion.

Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/sys.c | 106 ++++++++++++++++++++++++++++++++++++-----------------------
 1 file changed, 64 insertions(+), 42 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sys.c b/kernel/sys.c
index 5fc10d3e3891..7e033809ef5f 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -361,6 +361,62 @@ out_unlock:
 	return retval;
 }
 
+void kernel_restart(char *cmd)
+{
+	notifier_call_chain(&reboot_notifier_list, SYS_RESTART, cmd);
+	system_state = SYSTEM_RESTART;
+	device_suspend(PMSG_FREEZE);
+	device_shutdown();
+	if (!cmd) {
+		printk(KERN_EMERG "Restarting system.\n");
+	} else {
+		printk(KERN_EMERG "Restarting system with command '%s'.\n", cmd);
+	}
+	printk(".\n");
+	machine_restart(cmd);
+}
+EXPORT_SYMBOL_GPL(kernel_restart);
+
+void kernel_kexec(void)
+{
+#ifdef CONFIG_KEXEC
+	struct kimage *image;
+	image = xchg(&kexec_image, 0);
+	if (!image) {
+		return;
+	}
+	notifier_call_chain(&reboot_notifier_list, SYS_RESTART, NULL);
+	system_state = SYSTEM_RESTART;
+	device_suspend(PMSG_FREEZE);
+	device_shutdown();
+	printk(KERN_EMERG "Starting new kernel\n");
+	machine_shutdown();
+	machine_kexec(image);
+#endif
+}
+EXPORT_SYMBOL_GPL(kernel_kexec);
+
+void kernel_halt(void)
+{
+	notifier_call_chain(&reboot_notifier_list, SYS_HALT, NULL);
+	system_state = SYSTEM_HALT;
+	device_suspend(PMSG_SUSPEND);
+	device_shutdown();
+	printk(KERN_EMERG "System halted.\n");
+	machine_halt();
+}
+EXPORT_SYMBOL_GPL(kernel_halt);
+
+void kernel_power_off(void)
+{
+	notifier_call_chain(&reboot_notifier_list, SYS_POWER_OFF, NULL);
+	system_state = SYSTEM_POWER_OFF;
+	device_suspend(PMSG_SUSPEND);
+	device_shutdown();
+	printk(KERN_EMERG "Power down.\n");
+	machine_power_off();
+}
+EXPORT_SYMBOL_GPL(kernel_power_off);
 
 /*
  * Reboot system call: for obvious reasons only root may call it,
@@ -389,12 +445,7 @@ asmlinkage long sys_reboot(int magic1, int magic2, unsigned int cmd, void __user
 	lock_kernel();
 	switch (cmd) {
 	case LINUX_REBOOT_CMD_RESTART:
-		notifier_call_chain(&reboot_notifier_list, SYS_RESTART, NULL);
-		system_state = SYSTEM_RESTART;
-		device_suspend(PMSG_FREEZE);
-		device_shutdown();
-		printk(KERN_EMERG "Restarting system.\n");
-		machine_restart(NULL);
+		kernel_restart(NULL);
 		break;
 
 	case LINUX_REBOOT_CMD_CAD_ON:
@@ -406,23 +457,13 @@ asmlinkage long sys_reboot(int magic1, int magic2, unsigned int cmd, void __user
 		break;
 
 	case LINUX_REBOOT_CMD_HALT:
-		notifier_call_chain(&reboot_notifier_list, SYS_HALT, NULL);
-		system_state = SYSTEM_HALT;
-		device_suspend(PMSG_SUSPEND);
-		device_shutdown();
-		printk(KERN_EMERG "System halted.\n");
-		machine_halt();
+		kernel_halt();
 		unlock_kernel();
 		do_exit(0);
 		break;
 
 	case LINUX_REBOOT_CMD_POWER_OFF:
-		notifier_call_chain(&reboot_notifier_list, SYS_POWER_OFF, NULL);
-		system_state = SYSTEM_POWER_OFF;
-		device_suspend(PMSG_SUSPEND);
-		device_shutdown();
-		printk(KERN_EMERG "Power down.\n");
-		machine_power_off();
+		kernel_power_off();
 		unlock_kernel();
 		do_exit(0);
 		break;
@@ -434,33 +475,14 @@ asmlinkage long sys_reboot(int magic1, int magic2, unsigned int cmd, void __user
 		}
 		buffer[sizeof(buffer) - 1] = '\0';
 
-		notifier_call_chain(&reboot_notifier_list, SYS_RESTART, buffer);
-		system_state = SYSTEM_RESTART;
-		device_suspend(PMSG_FREEZE);
-		device_shutdown();
-		printk(KERN_EMERG "Restarting system with command '%s'.\n", buffer);
-		machine_restart(buffer);
+		kernel_restart(buffer);
 		break;
 
-#ifdef CONFIG_KEXEC
 	case LINUX_REBOOT_CMD_KEXEC:
-	{
-		struct kimage *image;
-		image = xchg(&kexec_image, 0);
-		if (!image) {
-			unlock_kernel();
-			return -EINVAL;
-		}
-		notifier_call_chain(&reboot_notifier_list, SYS_RESTART, NULL);
-		system_state = SYSTEM_RESTART;
-		device_suspend(PMSG_FREEZE);
-		device_shutdown();
-		printk(KERN_EMERG "Starting new kernel\n");
-		machine_shutdown();
-		machine_kexec(image);
-		break;
-	}
-#endif
+		kernel_kexec();
+		unlock_kernel();
+		return -EINVAL;
+
 #ifdef CONFIG_SOFTWARE_SUSPEND
 	case LINUX_REBOOT_CMD_SW_SUSPEND:
 		{
-- 
cgit v1.2.2


From abcd9e51f5b832439b119d530db1353c12fd4073 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Tue, 26 Jul 2005 11:27:34 -0600
Subject: [PATCH] Make ctrl_alt_del call kernel_restart to get a proper reboot.

It is obvious we wanted to call kernel_restart here,
but since we didn't have it the code was expanded inline and hasn't
been correct since sometime in 2.4.

Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/sys.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sys.c b/kernel/sys.c
index 7e033809ef5f..31ac41a73329 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -502,8 +502,7 @@ asmlinkage long sys_reboot(int magic1, int magic2, unsigned int cmd, void __user
 
 static void deferred_cad(void *dummy)
 {
-	notifier_call_chain(&reboot_notifier_list, SYS_RESTART, NULL);
-	machine_restart(NULL);
+	kernel_restart(NULL);
 }
 
 /*
-- 
cgit v1.2.2


From 7c9034735eccbf82608a4602c59aaf6053ea9416 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Tue, 26 Jul 2005 11:29:55 -0600
Subject: [PATCH] Add emergency_restart()

When the kernel is working well and we want to restart cleanly
kernel_restart is the function to use.   But in many instances
the kernel wants to reboot when things are expected to be working
very badly such as from panic or a software watchdog handler.

This patch adds the function emergency_restart() so that
callers can be clear what semantics they expect when calling
restart.  emergency_restart() is expected to be callable
from interrupt context and possibly reliable in even more
trying circumstances.

This is an initial generic implementation for all architectures.

Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/sys.c | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'kernel')

diff --git a/kernel/sys.c b/kernel/sys.c
index 31ac41a73329..a74039036fb4 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -361,6 +361,12 @@ out_unlock:
 	return retval;
 }
 
+void emergency_restart(void)
+{
+	machine_emergency_restart();
+}
+EXPORT_SYMBOL_GPL(emergency_restart);
+
 void kernel_restart(char *cmd)
 {
 	notifier_call_chain(&reboot_notifier_list, SYS_RESTART, cmd);
-- 
cgit v1.2.2


From ff31977782a05504f2586ec9e3e5ab4b09a4c893 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Tue, 26 Jul 2005 11:47:32 -0600
Subject: [PATCH] Use kernel_power_off in sysrq-o

We already do all of the gymnastics to run from process context
to call the power off code so call into the power off code cleanly.

This especially helps acpi as part of its shutdown logic should
run acpi_shutdown called from device_shutdown which was not
being called from here.

Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/power/poweroff.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/power/poweroff.c b/kernel/power/poweroff.c
index 715081b2d829..7a4144ba3afd 100644
--- a/kernel/power/poweroff.c
+++ b/kernel/power/poweroff.c
@@ -9,6 +9,7 @@
 #include <linux/init.h>
 #include <linux/pm.h>
 #include <linux/workqueue.h>
+#include <linux/reboot.h>
 
 /*
  * When the user hits Sys-Rq o to power down the machine this is the
@@ -17,8 +18,7 @@
 
 static void do_poweroff(void *dummy)
 {
-	if (pm_power_off)
-		pm_power_off();
+	kernel_power_off();
 }
 
 static DECLARE_WORK(poweroff_work, do_poweroff, NULL);
-- 
cgit v1.2.2


From 2f048ea81df94f72dee0d42b3d9b941c03b8c9c5 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Tue, 26 Jul 2005 11:49:23 -0600
Subject: [PATCH] Call emergency_restart from panic

We know the system is in trouble so there is no question if this
is an emergency :)

Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/panic.c | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/panic.c b/kernel/panic.c
index 74ba5f3e46c7..aabc5f86fa3f 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -111,12 +111,11 @@ NORET_TYPE void panic(const char * fmt, ...)
 			mdelay(1);
 			i++;
 		}
-		/*
-		 *	Should we run the reboot notifier. For the moment Im
-		 *	choosing not too. It might crash, be corrupt or do
-		 *	more harm than good for other reasons.
+		/*	This will not be a clean reboot, with everything
+		 *	shutting down.  But if there is a chance of
+		 *	rebooting the system it will be rebooted.
 		 */
-		machine_restart(NULL);
+		emergency_restart();
 	}
 #ifdef __sparc__
 	{
-- 
cgit v1.2.2


From fdde86ac50357b6a811e3574e47d189e81a21444 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Tue, 26 Jul 2005 12:01:17 -0600
Subject: [PATCH] swpsuspend: Have suspend to disk use factors of sys_reboot

The suspend to disk code was a poor copy of the code in
sys_reboot.  Now that we have kernel_power_off, kernel_restart
and kernel_halt, use them instead of poorly duplicating them inline.

Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/power/disk.c | 9 +++------
 1 file changed, 3 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/power/disk.c b/kernel/power/disk.c
index 3ec789c6b537..664eb0469b6e 100644
--- a/kernel/power/disk.c
+++ b/kernel/power/disk.c
@@ -59,16 +59,13 @@ static void power_down(suspend_disk_method_t mode)
 		error = pm_ops->enter(PM_SUSPEND_DISK);
 		break;
 	case PM_DISK_SHUTDOWN:
-		printk("Powering off system\n");
-		device_shutdown();
-		machine_power_off();
+		kernel_power_off();
 		break;
 	case PM_DISK_REBOOT:
-		device_shutdown();
-		machine_restart(NULL);
+		kernel_restart(NULL);
 		break;
 	}
-	machine_halt();
+	kernel_halt();
 	/* Valid image is on the disk, if we continue we risk serious data corruption
 	   after resume. */
 	printk(KERN_CRIT "Please power me down manually\n");
-- 
cgit v1.2.2


From 18586e721636527cb5177467fb17e2350615978a Mon Sep 17 00:00:00 2001
From: Andreas Steinmetz <ast@domdv.de>
Date: Sat, 23 Jul 2005 13:42:04 +0200
Subject: [PATCH] Fix RLIMIT_RTPRIO breakage

RLIMIT_RTPRIO is supposed to grant non privileged users the right to use
SCHED_FIFO/SCHED_RR scheduling policies with priorities bounded by the
RLIMIT_RTPRIO value via sched_setscheduler(). This is usually used by
audio users.

Unfortunately this is broken in 2.6.13rc3 as you can see in the excerpt
from sched_setscheduler below:

        /*
         * Allow unprivileged RT tasks to decrease priority:
         */
        if (!capable(CAP_SYS_NICE)) {
                /* can't change policy */
                if (policy != p->policy)
                        return -EPERM;

After the above unconditional test which causes sched_setscheduler to
fail with no regard to the RLIMIT_RTPRIO value the following check is made:

               /* can't increase priority */
                if (policy != SCHED_NORMAL &&
                    param->sched_priority > p->rt_priority &&
                    param->sched_priority >
                                p->signal->rlim[RLIMIT_RTPRIO].rlim_cur)
                        return -EPERM;

Thus I do believe that the RLIMIT_RTPRIO value must be taken into
account for the policy check, especially as the RLIMIT_RTPRIO limit is
of no use without this change.

The attached patch fixes this problem.

Signed-off-by: Andreas Steinmetz <ast@domdv.de>
Acked-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/sched.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index 4107db0dc091..a5fb654ea590 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -3528,7 +3528,8 @@ recheck:
 	 */
 	if (!capable(CAP_SYS_NICE)) {
 		/* can't change policy */
-		if (policy != p->policy)
+		if (policy != p->policy &&
+			!p->signal->rlim[RLIMIT_RTPRIO].rlim_cur)
 			return -EPERM;
 		/* can't increase priority */
 		if (policy != SCHED_NORMAL &&
-- 
cgit v1.2.2


From d46523ea32a79fbc8cd1237f9441f45cc3f02456 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <rostedt@goodmis.org>
Date: Mon, 25 Jul 2005 16:28:39 -0400
Subject: [PATCH] fix MAX_USER_RT_PRIO and MAX_RT_PRIO

Here's the patch again to fix the code to handle the case where the values
of MAX_USER_RT_PRIO and MAX_RT_PRIO are different.

Without this patch, an SMP system will crash if the values are
different.

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
Cc: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Dean Nelson <dcn@sgi.com>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/sched.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index a5fb654ea590..a646e4f36c41 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -3486,7 +3486,7 @@ static void __setscheduler(struct task_struct *p, int policy, int prio)
 	p->policy = policy;
 	p->rt_priority = prio;
 	if (policy != SCHED_NORMAL)
-		p->prio = MAX_USER_RT_PRIO-1 - p->rt_priority;
+		p->prio = MAX_RT_PRIO-1 - p->rt_priority;
 	else
 		p->prio = p->static_prio;
 }
@@ -3518,7 +3518,8 @@ recheck:
 	 * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL is 0.
 	 */
 	if (param->sched_priority < 0 ||
-	    param->sched_priority > MAX_USER_RT_PRIO-1)
+	    (p->mm &&  param->sched_priority > MAX_USER_RT_PRIO-1) ||
+	    (!p->mm && param->sched_priority > MAX_RT_PRIO-1))
 		return -EINVAL;
 	if ((policy == SCHED_NORMAL) != (param->sched_priority == 0))
 		return -EINVAL;
-- 
cgit v1.2.2


From bba0e4670a4e1841a96b561dcc60ebe335049891 Mon Sep 17 00:00:00 2001
From: Nigel Cunningham <ncunningham@cyclades.com>
Date: Wed, 27 Jul 2005 11:43:41 -0700
Subject: [PATCH] Address BUG: using smp_processor_id() in preemptible
 [00000001] code

This patch fixes a warning in the disable_nonboot_cpus call in
kernel/power/smp.c.

Signed-off-by: Nigel Cunningham <nigel@suspend2.net>

Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/power/smp.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/power/smp.c b/kernel/power/smp.c
index bbe23079c62c..911fc62b8225 100644
--- a/kernel/power/smp.c
+++ b/kernel/power/smp.c
@@ -38,7 +38,7 @@ void disable_nonboot_cpus(void)
 		}
 		printk("Error taking cpu %d down: %d\n", cpu, error);
 	}
-	BUG_ON(smp_processor_id() != 0);
+	BUG_ON(raw_smp_processor_id() != 0);
 	if (error)
 		panic("cpus not sleeping");
 }
-- 
cgit v1.2.2


From d912d1ff218195c248c770eb677726695e07aa40 Mon Sep 17 00:00:00 2001
From: George Anzinger <george@mvista.com>
Date: Wed, 27 Jul 2005 11:43:44 -0700
Subject: [PATCH] itimer fixes

Fix the recent off-by-one fix in the itimer code:

1. The repeating timer is figured using the requested time
	(not +1 as we know where we are in the jiffie).

2. The tests for interval too large are left to the timeval_to_jiffies() code.

Signed-off-by: George Anzinger <george@mvista.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/itimer.c | 37 ++++++++++++++++---------------------
 1 file changed, 16 insertions(+), 21 deletions(-)

(limited to 'kernel')

diff --git a/kernel/itimer.c b/kernel/itimer.c
index a72cb0e5aa4b..7c1b25e25e47 100644
--- a/kernel/itimer.c
+++ b/kernel/itimer.c
@@ -112,28 +112,11 @@ asmlinkage long sys_getitimer(int which, struct itimerval __user *value)
 	return error;
 }
 
-/*
- * Called with P->sighand->siglock held and P->signal->real_timer inactive.
- * If interval is nonzero, arm the timer for interval ticks from now.
- */
-static inline void it_real_arm(struct task_struct *p, unsigned long interval)
-{
-	p->signal->it_real_value = interval; /* XXX unnecessary field?? */
-	if (interval == 0)
-		return;
-	if (interval > (unsigned long) LONG_MAX)
-		interval = LONG_MAX;
-	/* the "+ 1" below makes sure that the timer doesn't go off before
-	 * the interval requested. This could happen if
-	 * time requested % (usecs per jiffy) is more than the usecs left
-	 * in the current jiffy */
-	p->signal->real_timer.expires = jiffies + interval + 1;
-	add_timer(&p->signal->real_timer);
-}
 
 void it_real_fn(unsigned long __data)
 {
 	struct task_struct * p = (struct task_struct *) __data;
+	unsigned long inc = p->signal->it_real_incr;
 
 	send_group_sig_info(SIGALRM, SEND_SIG_PRIV, p);
 
@@ -141,14 +124,23 @@ void it_real_fn(unsigned long __data)
 	 * Now restart the timer if necessary.  We don't need any locking
 	 * here because do_setitimer makes sure we have finished running
 	 * before it touches anything.
+	 * Note, we KNOW we are (or should be) at a jiffie edge here so
+	 * we don't need the +1 stuff.  Also, we want to use the prior
+	 * expire value so as to not "slip" a jiffie if we are late.
+	 * Deal with requesting a time prior to "now" here rather than
+	 * in add_timer.
 	 */
-	it_real_arm(p, p->signal->it_real_incr);
+	if (!inc)
+		return;
+	while (time_before_eq(p->signal->real_timer.expires, jiffies))
+		p->signal->real_timer.expires += inc;
+	add_timer(&p->signal->real_timer);
 }
 
 int do_setitimer(int which, struct itimerval *value, struct itimerval *ovalue)
 {
 	struct task_struct *tsk = current;
- 	unsigned long val, interval;
+ 	unsigned long val, interval, expires;
 	cputime_t cval, cinterval, nval, ninterval;
 
 	switch (which) {
@@ -164,7 +156,10 @@ again:
 		}
 		tsk->signal->it_real_incr =
 			timeval_to_jiffies(&value->it_interval);
-		it_real_arm(tsk, timeval_to_jiffies(&value->it_value));
+		expires = timeval_to_jiffies(&value->it_value);
+		if (expires)
+			mod_timer(&tsk->signal->real_timer,
+				  jiffies + 1 + expires);
 		spin_unlock_irq(&tsk->sighand->siglock);
 		if (ovalue) {
 			jiffies_to_timeval(val, &ovalue->it_value);
-- 
cgit v1.2.2


From 951f22d5b1f0eaae35dafc669e3774a0c2084d10 Mon Sep 17 00:00:00 2001
From: Martin Schwidefsky <schwidefsky@de.ibm.com>
Date: Wed, 27 Jul 2005 11:44:57 -0700
Subject: [PATCH] s390: spin lock retry

Split spin lock and r/w lock implementation into a single try which is done
inline and an out of line function that repeatedly tries to get the lock
before doing the cpu_relax().  Add a system control to set the number of
retries before a cpu is yielded.

The reason for the spin lock retry is that the diagnose 0x44 that is used to
give up the virtual cpu is quite expensive.  For spin locks that are held only
for a short period of time the cost of the diagnose outweighs the savings
for spin locks that are held for a longer time.  The default retry count is
1000.

Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/sysctl.c | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index e60b9c36f1f0..3e0bbee549ea 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -114,6 +114,7 @@ extern int unaligned_enabled;
 extern int sysctl_ieee_emulation_warnings;
 #endif
 extern int sysctl_userprocess_debug;
+extern int spin_retry;
 #endif
 
 extern int sysctl_hz_timer;
@@ -647,7 +648,16 @@ static ctl_table kern_table[] = {
 		.mode		= 0644,
 		.proc_handler	= &proc_dointvec,
 	},
-
+#if defined(CONFIG_ARCH_S390)
+	{
+		.ctl_name	= KERN_SPIN_RETRY,
+		.procname	= "spin_retry",
+		.data		= &spin_retry,
+		.maxlen		= sizeof (int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+	},
+#endif
 	{ .ctl_name = 0 }
 };
 
-- 
cgit v1.2.2


From 207a7ba8dc000e1b13acac97f3736810dd86e8e2 Mon Sep 17 00:00:00 2001
From: Randy Dunlap <rdunlap@xenotime.net>
Date: Wed, 27 Jul 2005 11:45:10 -0700
Subject: [PATCH] kernel/capability.c: add kerneldoc

Add kerneldoc to kernel/capability.c

Signed-off-by: Randy Dunlap <rdunlap@xenotime.net>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/capability.c | 20 +++++++++++++++++---
 1 file changed, 17 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/capability.c b/kernel/capability.c
index 64db1ee820c2..8986a37a67ea 100644
--- a/kernel/capability.c
+++ b/kernel/capability.c
@@ -31,8 +31,14 @@ static DEFINE_SPINLOCK(task_capability_lock);
  * uninteresting and/or not to be changed.
  */
 
-/*
+/**
  * sys_capget - get the capabilities of a given process.
+ * @header: pointer to struct that contains capability version and
+ *	target pid data
+ * @dataptr: pointer to struct that contains the effective, permitted,
+ *	and inheritable capabilities that are returned
+ *
+ * Returns 0 on success and < 0 on error.
  */
 asmlinkage long sys_capget(cap_user_header_t header, cap_user_data_t dataptr)
 {
@@ -141,8 +147,14 @@ static inline int cap_set_all(kernel_cap_t *effective,
      return ret;
 }
 
-/*
- * sys_capset - set capabilities for a given process, all processes, or all
+/**
+ * sys_capset - set capabilities for a process or a group of processes
+ * @header: pointer to struct that contains capability version and
+ *	target pid data
+ * @data: pointer to struct that contains the effective, permitted,
+ *	and inheritable capabilities
+ *
+ * Set capabilities for a given process, all processes, or all
  * processes in a given process group.
  *
  * The restrictions on setting capabilities are specified as:
@@ -152,6 +164,8 @@ static inline int cap_set_all(kernel_cap_t *effective,
  * I: any raised capabilities must be a subset of the (old current) permitted
  * P: any raised capabilities must be a subset of the (old current) permitted
  * E: must be set to a subset of (new target) permitted
+ *
+ * Returns 0 on success and < 0 on error.
  */
 asmlinkage long sys_capset(cap_user_header_t header, const cap_user_data_t data)
 {
-- 
cgit v1.2.2


From d9fd8a6d443b509147280f058d4e59f0b796a323 Mon Sep 17 00:00:00 2001
From: Randy Dunlap <rdunlap@xenotime.net>
Date: Wed, 27 Jul 2005 11:45:11 -0700
Subject: [PATCH] kernel/cpuset.c: add kerneldoc, fix typos

Add kerneldoc to kernel/cpuset.c

Fix cpuset typos in init/Kconfig

Signed-off-by: Randy Dunlap <rdunlap@xenotime.net>
Acked-by: Paul Jackson <pj@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/cpuset.c | 26 +++++++++++++++++++-------
 1 file changed, 19 insertions(+), 7 deletions(-)

(limited to 'kernel')

diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 984c0bf3807f..805fb9097318 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -1440,10 +1440,10 @@ void __init cpuset_init_smp(void)
 
 /**
  * cpuset_fork - attach newly forked task to its parents cpuset.
- * @p: pointer to task_struct of forking parent process.
+ * @tsk: pointer to task_struct of forking parent process.
  *
  * Description: By default, on fork, a task inherits its
- * parents cpuset.  The pointer to the shared cpuset is
+ * parent's cpuset.  The pointer to the shared cpuset is
  * automatically copied in fork.c by dup_task_struct().
  * This cpuset_fork() routine need only increment the usage
  * counter in that cpuset.
@@ -1471,7 +1471,6 @@ void cpuset_fork(struct task_struct *tsk)
  * by the cpuset_sem semaphore.  If you don't hold cpuset_sem,
  * then a zero cpuset use count is a license to any other task to
  * nuke the cpuset immediately.
- *
  **/
 
 void cpuset_exit(struct task_struct *tsk)
@@ -1521,7 +1520,9 @@ void cpuset_init_current_mems_allowed(void)
 	current->mems_allowed = NODE_MASK_ALL;
 }
 
-/*
+/**
+ * cpuset_update_current_mems_allowed - update mems parameters to new values
+ *
  * If the current tasks cpusets mems_allowed changed behind our backs,
  * update current->mems_allowed and mems_generation to the new value.
  * Do not call this routine if in_interrupt().
@@ -1540,13 +1541,20 @@ void cpuset_update_current_mems_allowed(void)
 	}
 }
 
+/**
+ * cpuset_restrict_to_mems_allowed - limit nodes to current mems_allowed
+ * @nodes: pointer to a node bitmap that is and-ed with mems_allowed
+ */
 void cpuset_restrict_to_mems_allowed(unsigned long *nodes)
 {
 	bitmap_and(nodes, nodes, nodes_addr(current->mems_allowed),
 							MAX_NUMNODES);
 }
 
-/*
+/**
+ * cpuset_zonelist_valid_mems_allowed - check zonelist vs. current mems_allowed
+ * @zl: the zonelist to be checked
+ *
  * Are any of the nodes on zonelist zl allowed in current->mems_allowed?
  */
 int cpuset_zonelist_valid_mems_allowed(struct zonelist *zl)
@@ -1562,8 +1570,12 @@ int cpuset_zonelist_valid_mems_allowed(struct zonelist *zl)
 	return 0;
 }
 
-/*
- * Is 'current' valid, and is zone z allowed in current->mems_allowed?
+/**
+ * cpuset_zone_allowed - is zone z allowed in current->mems_allowed
+ * @z: zone in question
+ *
+ * Is zone z allowed in current->mems_allowed, or is
+ * the CPU in interrupt context? (zone is always allowed in this case)
  */
 int cpuset_zone_allowed(struct zone *z)
 {
-- 
cgit v1.2.2


From e77e17161ccb8bd877bf83b3611cd318e451c605 Mon Sep 17 00:00:00 2001
From: Randy Dunlap <rdunlap@xenotime.net>
Date: Wed, 27 Jul 2005 11:45:11 -0700
Subject: [PATCH] kernel/crash_dump.c: add kerneldoc

Add kerneldoc to kernel/crash_dump.c

Signed-off-by: Randy Dunlap <rdunlap@xenotime.net>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/crash_dump.c | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/crash_dump.c b/kernel/crash_dump.c
index 459ba49e376a..334c37f5218a 100644
--- a/kernel/crash_dump.c
+++ b/kernel/crash_dump.c
@@ -18,7 +18,16 @@
 /* Stores the physical address of elf header of crash image. */
 unsigned long long elfcorehdr_addr = ELFCORE_ADDR_MAX;
 
-/*
+/**
+ * copy_oldmem_page - copy one page from "oldmem"
+ * @pfn: page frame number to be copied
+ * @buf: target memory address for the copy; this can be in kernel address
+ *	space or user address space (see @userbuf)
+ * @csize: number of bytes to copy
+ * @offset: offset in bytes into the page (based on pfn) to begin the copy
+ * @userbuf: if set, @buf is in user address space, use copy_to_user(),
+ *	otherwise @buf is in kernel address space, use memcpy().
+ *
  * Copy a page from "oldmem". For this page, there is no pte mapped
  * in the current kernel. We stitch up a pte, similar to kmap_atomic.
  */
-- 
cgit v1.2.2


From 77933d7276ee8fa0e2947641941a6f7a100a327b Mon Sep 17 00:00:00 2001
From: Jesper Juhl <juhl@dif.dk>
Date: Wed, 27 Jul 2005 11:46:09 -0700
Subject: [PATCH] clean up inline static vs static inline

`gcc -W' likes to complain if the static keyword is not at the beginning of
the declaration.  This patch fixes all remaining occurrences of "inline
static" up with "static inline" in the entire kernel tree (140 occurrences in
47 files).

While making this change I came across a few lines with trailing whitespace
that I also fixed up, I have also added or removed a blank line or two here
and there, but there are no functional changes in the patch.

Signed-off-by: Jesper Juhl <juhl-lkml@dif.dk>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/time.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/time.c b/kernel/time.c
index d4335c1c884c..dd5ae1162a8f 100644
--- a/kernel/time.c
+++ b/kernel/time.c
@@ -128,7 +128,7 @@ asmlinkage long sys_gettimeofday(struct timeval __user *tv, struct timezone __us
  * as real UNIX machines always do it. This avoids all headaches about
  * daylight saving times and warping kernel clocks.
  */
-inline static void warp_clock(void)
+static inline void warp_clock(void)
 {
 	write_seqlock_irq(&xtime_lock);
 	wall_to_monotonic.tv_sec -= sys_tz.tz_minuteswest * 60;
-- 
cgit v1.2.2


From e4ff4d7f9d85a2bc714307eb9113617182e62845 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Wed, 27 Jul 2005 10:41:23 -0700
Subject: [PATCH] Avoid device suspend on reboot

My fairly ordinary x86 test box gets stuck during reboot on the
wait_for_completion() in ide_do_drive_cmd():

Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/sys.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/sys.c b/kernel/sys.c
index a74039036fb4..8f255259ef9e 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -371,7 +371,6 @@ void kernel_restart(char *cmd)
 {
 	notifier_call_chain(&reboot_notifier_list, SYS_RESTART, cmd);
 	system_state = SYSTEM_RESTART;
-	device_suspend(PMSG_FREEZE);
 	device_shutdown();
 	if (!cmd) {
 		printk(KERN_EMERG "Restarting system.\n");
-- 
cgit v1.2.2


From ed6b676ca8b50e0b538e61c283d52fd04f007abf Mon Sep 17 00:00:00 2001
From: Andi Kleen <ak@suse.de>
Date: Thu, 28 Jul 2005 21:15:49 -0700
Subject: [PATCH] x86_64: Switch to the interrupt stack when running a softirq
 in local_bh_enable()

This avoids some potential stack overflows with very deep softirq callchains.
i386 does this too.

TOADD CFI annotation

Signed-off-by: Andi Kleen <ak@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/softirq.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/softirq.c b/kernel/softirq.c
index b4ab6af1dea8..31007d6542cc 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -86,7 +86,7 @@ restart:
 	/* Reset the pending bitmask before enabling irqs */
 	local_softirq_pending() = 0;
 
-	local_irq_enable();
+	//local_irq_enable();
 
 	h = softirq_vec;
 
@@ -99,7 +99,7 @@ restart:
 		pending >>= 1;
 	} while (pending);
 
-	local_irq_disable();
+	//local_irq_disable();
 
 	pending = local_softirq_pending();
 	if (pending && --max_restart)
-- 
cgit v1.2.2


From 78fa74a23b16bdb0d944272b696915c4e0bb3ee1 Mon Sep 17 00:00:00 2001
From: George Anzinger <george@mvista.com>
Date: Thu, 28 Jul 2005 21:16:16 -0700
Subject: [PATCH] posix timers: fix normalization problem

(We found this after a customer complained, and it is in the kernel.org
kernel.  It seems that for CLOCK_MONOTONIC absolute timers and
clock_nanosleep calls, both the request time and wall_to_monotonic are
subtracted prior to the normalization, resulting in an overflow in the
existing normalize test.  This causes the result to be shifted ~4 seconds
ahead instead of ~2 seconds back in time.)

The normalize code in posix-timers.c fails when the tv_nsec member is ~1.2
seconds negative.  This can happen on absolute timers (and
clock_nanosleeps) requested on CLOCK_MONOTONIC (both the request time and
wall_to_monotonic are subtracted resulting in the possibility of a number
close to -2 seconds.)

This fix uses the set_normalized_timespec() (which does not have an
overflow problem) to fix the problem and as a side effect makes the code
cleaner.

Signed-off-by: George Anzinger <george@mvista.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/posix-timers.c | 17 +++--------------
 1 file changed, 3 insertions(+), 14 deletions(-)

(limited to 'kernel')

diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c
index 5b7b4736d82b..10b2ad749d14 100644
--- a/kernel/posix-timers.c
+++ b/kernel/posix-timers.c
@@ -896,21 +896,10 @@ static int adjust_abs_time(struct k_clock *clock, struct timespec *tp,
 			jiffies_64_f = get_jiffies_64();
 		}
 		/*
-		 * Take away now to get delta
+		 * Take away now to get delta and normalize
 		 */
-		oc.tv_sec -= now.tv_sec;
-		oc.tv_nsec -= now.tv_nsec;
-		/*
-		 * Normalize...
-		 */
-		while ((oc.tv_nsec - NSEC_PER_SEC) >= 0) {
-			oc.tv_nsec -= NSEC_PER_SEC;
-			oc.tv_sec++;
-		}
-		while ((oc.tv_nsec) < 0) {
-			oc.tv_nsec += NSEC_PER_SEC;
-			oc.tv_sec--;
-		}
+		set_normalized_timespec(&oc, oc.tv_sec - now.tv_sec,
+					oc.tv_nsec - now.tv_nsec);
 	}else{
 		jiffies_64_f = get_jiffies_64();
 	}
-- 
cgit v1.2.2


From 1108bae41e2ac596f46bc4cd8876b93063203d2b Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Fri, 29 Jul 2005 12:50:57 -0600
Subject: [PATCH] reboot: remove device_suspend(PMSG_FREEZE) from kernel_kexec

If device_suspend(PMSG_FREEZE) is not ready to be called in
kernel_restart it is definitely not ready to be called in the even more
fickle kernel_kexec.

Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/sys.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/sys.c b/kernel/sys.c
index 8f255259ef9e..000e81ad2c1d 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -392,7 +392,6 @@ void kernel_kexec(void)
 	}
 	notifier_call_chain(&reboot_notifier_list, SYS_RESTART, NULL);
 	system_state = SYSTEM_RESTART;
-	device_suspend(PMSG_FREEZE);
 	device_shutdown();
 	printk(KERN_EMERG "Starting new kernel\n");
 	machine_shutdown();
-- 
cgit v1.2.2


From c70f5d6610c601ea2ae4ae4e49f66c80801e895f Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Sat, 30 Jul 2005 10:22:49 -0700
Subject: [PATCH] revert bogus softirq changes

This snuck in with an x86_64 change.  Thanks to Richard Purdie
<rpurdie@rpsys.net> for spotting it.

Cc: Andi Kleen <ak@muc.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/softirq.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/softirq.c b/kernel/softirq.c
index 31007d6542cc..b4ab6af1dea8 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -86,7 +86,7 @@ restart:
 	/* Reset the pending bitmask before enabling irqs */
 	local_softirq_pending() = 0;
 
-	//local_irq_enable();
+	local_irq_enable();
 
 	h = softirq_vec;
 
@@ -99,7 +99,7 @@ restart:
 		pending >>= 1;
 	} while (pending);
 
-	//local_irq_disable();
+	local_irq_disable();
 
 	pending = local_softirq_pending();
 	if (pending && --max_restart)
-- 
cgit v1.2.2


From 6cb54819d7b1867053e2dfd8c0ca3a8dc65a7eff Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Mon, 1 Aug 2005 13:39:13 +0200
Subject: [PATCH] remove sys_set_zone_reclaim()

This removes sys_set_zone_reclaim() for now.  While I'm sure Martin is
trying to solve a real problem, we must not hard-code an incomplete and
insufficient approach into a syscall, because syscalls are pretty much
for eternity.  I am quite strongly convinced that this syscall must not
hit v2.6.13 in its current form.

Firstly, the syscall lacks basic syscall design: e.g. it allows the
global setting of VM policy for unprivileged users. (!) [ Imagine an
Oracle installation and a SAP installation on the same NUMA box fighting
over the 'optimal' setting for this flag. What will they do? Will they
try to set the flag to their own preferred value every second or so? ]

Secondly, it was added based on a single datapoint from Martin:

 http://marc.theaimsgroup.com/?l=linux-mm&m=111763597218177&w=2

where Martin characterizes the numbers the following way:

 ' Run-to-run variability for "make -j" is huge, so these numbers aren't
   terribly useful except to see that with reclaim the benchmark still
   finishes in a reasonable amount of time. '

in other words: the fundamental problem has likely not been solved, only
a tendential move into the right direction has been observed, and a
handful of numbers were picked out of a set of hugely variable results,
without showing the variability data. How much variance is there
run-to-run?

I'd really suggest to first walk the walk and see what's needed to get
stable & predictable kernel compilation numbers on that NUMA box, before
adding random syscalls to tune a particular aspect of the VM ... which
approach might not even matter once the whole picture has been analyzed
and understood!

The third, most important point is that the syscall exposes VM tuning
internals in a completely unstructured way. What sense does it make to
have a _GLOBAL_ per-node setting for 'should we go to another node for
reclaim'? If anything, it might make sense to do this per-app, via
numalib or so.

The change is minimalistic in that it doesn't remove the syscall and the
underlying infrastructure changes, only the user-visible changes.  We
could perhaps add a CAP_SYS_ADMIN-only sysctl for this hack, a la
/proc/sys/vm/swappiness, but even that looks quite counterproductive
when the generic approach is that we are trying to reduce the number of
external factors in the VM balance picture.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/sys_ni.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index 42b40ae5eada..1ab2370e2efa 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -79,7 +79,6 @@ cond_syscall(sys_request_key);
 cond_syscall(sys_keyctl);
 cond_syscall(compat_sys_keyctl);
 cond_syscall(compat_sys_socketcall);
-cond_syscall(sys_set_zone_reclaim);
 cond_syscall(sys_inotify_init);
 cond_syscall(sys_inotify_add_watch);
 cond_syscall(sys_inotify_rm_watch);
-- 
cgit v1.2.2


From 842bbaaa7394820c8f1fe0629cd15478653caf86 Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Mon, 1 Aug 2005 21:11:47 -0700
Subject: [PATCH] Module per-cpu alignment cannot always be met

The module code assumes no one will ever ask for a per-cpu area more than
SMP_CACHE_BYTES aligned.  However, as these cases show, gcc sometimes
asks for 32-byte alignment for the per-cpu section on a module, and if
CONFIG_X86_L1_CACHE_SHIFT is 4, we hit that BUG_ON().  This is obviously an
unusual combination, as there have been few reports, but better to warn
than die.

See:
	http://www.ussg.iu.edu/hypermail/linux/kernel/0409.0/0768.html

And more recently:
	http://bugs.gentoo.org/show_bug.cgi?id=97006

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/module.c | 15 +++++++++++----
 1 file changed, 11 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/module.c b/kernel/module.c
index 068e271ab3a5..c32995fbd8fd 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -250,13 +250,18 @@ static inline unsigned int block_size(int val)
 /* Created by linker magic */
 extern char __per_cpu_start[], __per_cpu_end[];
 
-static void *percpu_modalloc(unsigned long size, unsigned long align)
+static void *percpu_modalloc(unsigned long size, unsigned long align,
+			     const char *name)
 {
 	unsigned long extra;
 	unsigned int i;
 	void *ptr;
 
-	BUG_ON(align > SMP_CACHE_BYTES);
+	if (align > SMP_CACHE_BYTES) {
+		printk(KERN_WARNING "%s: per-cpu alignment %li > %i\n",
+		       name, align, SMP_CACHE_BYTES);
+		align = SMP_CACHE_BYTES;
+	}
 
 	ptr = __per_cpu_start;
 	for (i = 0; i < pcpu_num_used; ptr += block_size(pcpu_size[i]), i++) {
@@ -348,7 +353,8 @@ static int percpu_modinit(void)
 }	
 __initcall(percpu_modinit);
 #else /* ... !CONFIG_SMP */
-static inline void *percpu_modalloc(unsigned long size, unsigned long align)
+static inline void *percpu_modalloc(unsigned long size, unsigned long align,
+				    const char *name)
 {
 	return NULL;
 }
@@ -1644,7 +1650,8 @@ static struct module *load_module(void __user *umod,
 	if (pcpuindex) {
 		/* We have a special allocation for this section. */
 		percpu = percpu_modalloc(sechdrs[pcpuindex].sh_size,
-					 sechdrs[pcpuindex].sh_addralign);
+					 sechdrs[pcpuindex].sh_addralign,
+					 mod->name);
 		if (!percpu) {
 			err = -ENOMEM;
 			goto free_mod;
-- 
cgit v1.2.2


From c36f19e02a96488f550fdb678c92500afca3109b Mon Sep 17 00:00:00 2001
From: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Date: Thu, 4 Aug 2005 11:36:26 +0200
Subject: [PATCH] Remove suspend() calls from shutdown path

This removes the calls to device_suspend() from the shutdown path that
were added sometime during 2.6.13-rc*.  They aren't working properly on
a number of configs (I got reports from both ppc powerbook users and x86
users) causing the system to not shutdown anymore.

I think it isn't the right approach at the moment anyway.  We already
have a shutdown() callback for the drivers that actually care about
shutdown and the suspend() code isn't yet in a good enough shape to be
generalized this much.  Also, the semantics of suspend and shutdown are
slightly different on a number of setups and the way this was patched in
provides little way for drivers to cleanly differentiate.  It should
have been at least a different message.

For 2.6.13, I think we should revert to 2.6.12 behaviour and have a
working suspend back.

Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/sys.c | 2 --
 1 file changed, 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sys.c b/kernel/sys.c
index 000e81ad2c1d..0bcaed6560ac 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -404,7 +404,6 @@ void kernel_halt(void)
 {
 	notifier_call_chain(&reboot_notifier_list, SYS_HALT, NULL);
 	system_state = SYSTEM_HALT;
-	device_suspend(PMSG_SUSPEND);
 	device_shutdown();
 	printk(KERN_EMERG "System halted.\n");
 	machine_halt();
@@ -415,7 +414,6 @@ void kernel_power_off(void)
 {
 	notifier_call_chain(&reboot_notifier_list, SYS_POWER_OFF, NULL);
 	system_state = SYSTEM_POWER_OFF;
-	device_suspend(PMSG_SUSPEND);
 	device_shutdown();
 	printk(KERN_EMERG "Power down.\n");
 	machine_power_off();
-- 
cgit v1.2.2


From c306895167c8384b88bc02945a0d226a04218fa5 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Thu, 4 Aug 2005 16:49:32 -0700
Subject: [PATCH] revert "timer exit cleanup"

Revert this June 17 patch: it broke persistence of timers across execve().

Cc: Roland McGrath <roland@redhat.com>
Cc: george anzinger <george@mvista.com>
Cc: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/exit.c         | 4 +++-
 kernel/posix-timers.c | 1 -
 2 files changed, 3 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/exit.c b/kernel/exit.c
index 9d1b10ed0135..5b0fb9f09f21 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -829,8 +829,10 @@ fastcall NORET_TYPE void do_exit(long code)
 	acct_update_integrals(tsk);
 	update_mem_hiwater(tsk);
 	group_dead = atomic_dec_and_test(&tsk->signal->live);
-	if (group_dead)
+	if (group_dead) {
+ 		del_timer_sync(&tsk->signal->real_timer);
 		acct_process(code);
+	}
 	exit_mm(tsk);
 
 	exit_sem(tsk);
diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c
index 10b2ad749d14..38798a2ff994 100644
--- a/kernel/posix-timers.c
+++ b/kernel/posix-timers.c
@@ -1166,7 +1166,6 @@ void exit_itimers(struct signal_struct *sig)
 		tmr = list_entry(sig->posix_timers.next, struct k_itimer, list);
 		itimer_delete(tmr);
 	}
-	del_timer_sync(&sig->real_timer);
 }
 
 /*
-- 
cgit v1.2.2


From 3077a260e9f316b611436b1506eec9cc5c4f8aa6 Mon Sep 17 00:00:00 2001
From: Paul Jackson <pj@sgi.com>
Date: Tue, 9 Aug 2005 10:07:59 -0700
Subject: [PATCH] cpuset release ABBA deadlock fix

Fix possible cpuset_sem ABBA deadlock if 'notify_on_release' set.

For a particular usage pattern, creating and destroying cpusets fairly
frequently using notify_on_release, on a very large system, this deadlock
can be seen every few days.  If you are not using the cpuset
notify_on_release feature, you will never see this deadlock.

The existing code, on task exit (or cpuset deletion) did:

  get cpuset_sem
  if cpuset marked notify_on_release and is ready to release:
    compute cpuset path relative to /dev/cpuset mount point
    call_usermodehelper() forks /sbin/cpuset_release_agent with path
  drop cpuset_sem

Unfortunately, the fork in call_usermodehelper can allocate memory, and
allocating memory can require cpuset_sem, if the mems_generation values
changed in the interim.  This results in an ABBA deadlock, trying to obtain
cpuset_sem when it is already held by the current task.

To fix this, I put the cpuset path (which must be computed while holding
cpuset_sem) in a temporary buffer, to be used in the call_usermodehelper
call of /sbin/cpuset_release_agent only _after_ dropping cpuset_sem.

So the new logic is:

  get cpuset_sem
  if cpuset marked notify_on_release and is ready to release:
    compute cpuset path relative to /dev/cpuset mount point
    stash path in kmalloc'd buffer
  drop cpuset_sem
  call_usermodehelper() forks /sbin/cpuset_release_agent with path
  free path

The sharp-eyed reader might notice that this patch does not contain any
calls to kmalloc.  The existing code in the check_for_release() routine was
already kmalloc'ing a buffer to hold the cpuset path.  In the old code, it
just held the buffer for a few lines, over the cpuset_release_agent() call
that in turn invoked call_usermodehelper().  In the new code, with the
application of this patch, it returns that buffer via the new char
**ppathbuf parameter, for later use and freeing in cpuset_release_agent(),
which is called after cpuset_sem is dropped.  Whereas the old code has just
one call to cpuset_release_agent(), right in the check_for_release()
routine, the new code has three calls to cpuset_release_agent(), from the
various places that a cpuset can be released.

This patch has been built and booted on SN2, and passed a stress test that
previously hit the deadlock within a few seconds.

Signed-off-by: Paul Jackson <pj@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/cpuset.c | 68 ++++++++++++++++++++++++++++++++++++++++-----------------
 1 file changed, 48 insertions(+), 20 deletions(-)

(limited to 'kernel')

diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 805fb9097318..21a4e3b2cbda 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -398,21 +398,31 @@ static int cpuset_path(const struct cpuset *cs, char *buf, int buflen)
  * to continue to serve a useful existence.  Next time it's released,
  * we will get notified again, if it still has 'notify_on_release' set.
  *
- * Note final arg to call_usermodehelper() is 0 - that means
- * don't wait.  Since we are holding the global cpuset_sem here,
- * and we are asking another thread (started from keventd) to rmdir a
- * cpuset, we can't wait - or we'd deadlock with the removing thread
- * on cpuset_sem.
+ * The final arg to call_usermodehelper() is 0, which means don't
+ * wait.  The separate /sbin/cpuset_release_agent task is forked by
+ * call_usermodehelper(), then control in this thread returns here,
+ * without waiting for the release agent task.  We don't bother to
+ * wait because the caller of this routine has no use for the exit
+ * status of the /sbin/cpuset_release_agent task, so no sense holding
+ * our caller up for that.
+ *
+ * The simple act of forking that task might require more memory,
+ * which might need cpuset_sem.  So this routine must be called while
+ * cpuset_sem is not held, to avoid a possible deadlock.  See also
+ * comments for check_for_release(), below.
  */
 
-static int cpuset_release_agent(char *cpuset_str)
+static void cpuset_release_agent(const char *pathbuf)
 {
 	char *argv[3], *envp[3];
 	int i;
 
+	if (!pathbuf)
+		return;
+
 	i = 0;
 	argv[i++] = "/sbin/cpuset_release_agent";
-	argv[i++] = cpuset_str;
+	argv[i++] = (char *)pathbuf;
 	argv[i] = NULL;
 
 	i = 0;
@@ -421,17 +431,29 @@ static int cpuset_release_agent(char *cpuset_str)
 	envp[i++] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin";
 	envp[i] = NULL;
 
-	return call_usermodehelper(argv[0], argv, envp, 0);
+	call_usermodehelper(argv[0], argv, envp, 0);
+	kfree(pathbuf);
 }
 
 /*
  * Either cs->count of using tasks transitioned to zero, or the
  * cs->children list of child cpusets just became empty.  If this
  * cs is notify_on_release() and now both the user count is zero and
- * the list of children is empty, send notice to user land.
+ * the list of children is empty, prepare cpuset path in a kmalloc'd
+ * buffer, to be returned via ppathbuf, so that the caller can invoke
+ * cpuset_release_agent() with it later on, once cpuset_sem is dropped.
+ * Call here with cpuset_sem held.
+ *
+ * This check_for_release() routine is responsible for kmalloc'ing
+ * pathbuf.  The above cpuset_release_agent() is responsible for
+ * kfree'ing pathbuf.  The caller of these routines is responsible
+ * for providing a pathbuf pointer, initialized to NULL, then
+ * calling check_for_release() with cpuset_sem held and the address
+ * of the pathbuf pointer, then dropping cpuset_sem, then calling
+ * cpuset_release_agent() with pathbuf, as set by check_for_release().
  */
 
-static void check_for_release(struct cpuset *cs)
+static void check_for_release(struct cpuset *cs, char **ppathbuf)
 {
 	if (notify_on_release(cs) && atomic_read(&cs->count) == 0 &&
 	    list_empty(&cs->children)) {
@@ -441,10 +463,9 @@ static void check_for_release(struct cpuset *cs)
 		if (!buf)
 			return;
 		if (cpuset_path(cs, buf, PAGE_SIZE) < 0)
-			goto out;
-		cpuset_release_agent(buf);
-out:
-		kfree(buf);
+			kfree(buf);
+		else
+			*ppathbuf = buf;
 	}
 }
 
@@ -727,14 +748,14 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, char *buf)
 	return 0;
 }
 
-static int attach_task(struct cpuset *cs, char *buf)
+static int attach_task(struct cpuset *cs, char *pidbuf, char **ppathbuf)
 {
 	pid_t pid;
 	struct task_struct *tsk;
 	struct cpuset *oldcs;
 	cpumask_t cpus;
 
-	if (sscanf(buf, "%d", &pid) != 1)
+	if (sscanf(pidbuf, "%d", &pid) != 1)
 		return -EIO;
 	if (cpus_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed))
 		return -ENOSPC;
@@ -777,7 +798,7 @@ static int attach_task(struct cpuset *cs, char *buf)
 
 	put_task_struct(tsk);
 	if (atomic_dec_and_test(&oldcs->count))
-		check_for_release(oldcs);
+		check_for_release(oldcs, ppathbuf);
 	return 0;
 }
 
@@ -801,6 +822,7 @@ static ssize_t cpuset_common_file_write(struct file *file, const char __user *us
 	struct cftype *cft = __d_cft(file->f_dentry);
 	cpuset_filetype_t type = cft->private;
 	char *buffer;
+	char *pathbuf = NULL;
 	int retval = 0;
 
 	/* Crude upper limit on largest legitimate cpulist user might write. */
@@ -841,7 +863,7 @@ static ssize_t cpuset_common_file_write(struct file *file, const char __user *us
 		retval = update_flag(CS_NOTIFY_ON_RELEASE, cs, buffer);
 		break;
 	case FILE_TASKLIST:
-		retval = attach_task(cs, buffer);
+		retval = attach_task(cs, buffer, &pathbuf);
 		break;
 	default:
 		retval = -EINVAL;
@@ -852,6 +874,7 @@ static ssize_t cpuset_common_file_write(struct file *file, const char __user *us
 		retval = nbytes;
 out2:
 	up(&cpuset_sem);
+	cpuset_release_agent(pathbuf);
 out1:
 	kfree(buffer);
 	return retval;
@@ -1357,6 +1380,7 @@ static int cpuset_rmdir(struct inode *unused_dir, struct dentry *dentry)
 	struct cpuset *cs = dentry->d_fsdata;
 	struct dentry *d;
 	struct cpuset *parent;
+	char *pathbuf = NULL;
 
 	/* the vfs holds both inode->i_sem already */
 
@@ -1376,7 +1400,7 @@ static int cpuset_rmdir(struct inode *unused_dir, struct dentry *dentry)
 		update_cpu_domains(cs);
 	list_del(&cs->sibling);	/* delete my sibling from parent->children */
 	if (list_empty(&parent->children))
-		check_for_release(parent);
+		check_for_release(parent, &pathbuf);
 	spin_lock(&cs->dentry->d_lock);
 	d = dget(cs->dentry);
 	cs->dentry = NULL;
@@ -1384,6 +1408,7 @@ static int cpuset_rmdir(struct inode *unused_dir, struct dentry *dentry)
 	cpuset_d_remove_dir(d);
 	dput(d);
 	up(&cpuset_sem);
+	cpuset_release_agent(pathbuf);
 	return 0;
 }
 
@@ -1483,10 +1508,13 @@ void cpuset_exit(struct task_struct *tsk)
 	task_unlock(tsk);
 
 	if (notify_on_release(cs)) {
+		char *pathbuf = NULL;
+
 		down(&cpuset_sem);
 		if (atomic_dec_and_test(&cs->count))
-			check_for_release(cs);
+			check_for_release(cs, &pathbuf);
 		up(&cpuset_sem);
+		cpuset_release_agent(pathbuf);
 	} else {
 		atomic_dec(&cs->count);
 	}
-- 
cgit v1.2.2


From 606867443764edac5a2c542f2fa0a12ef7a7c7fd Mon Sep 17 00:00:00 2001
From: James Bottomley <James.Bottomley@SteelEye.com>
Date: Wed, 10 Aug 2005 11:29:15 -0700
Subject: [PATCH] remove name length check in a workqueue

We have a check in there to make sure that the name won't overflow
task_struct.comm[], but it's triggering for scsi with lots of HBAs, only
scsi is using single-threaded workqueues which don't append the "/%d"
anyway.

All too hard.  Just kill the BUG_ON.

Cc: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@osdl.org>

[ kthread_create() uses vsnprintf() and limits the length, so no
  overflow can actually happen regardless ]

Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/workqueue.c | 2 --
 1 file changed, 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 259cf55da3c9..c7e36d4a70ca 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -308,8 +308,6 @@ struct workqueue_struct *__create_workqueue(const char *name,
 	struct workqueue_struct *wq;
 	struct task_struct *p;
 
-	BUG_ON(strlen(name) > 10);
-
 	wq = kmalloc(sizeof(*wq), GFP_KERNEL);
 	if (!wq)
 		return NULL;
-- 
cgit v1.2.2