From de9e007d9105bf8fa613a89810feff32a43add03 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sat, 31 Dec 2005 17:00:29 -0800 Subject: sysctl: make sure to terminate strings with a NUL This is a slightly more complete fix for the previous minimal sysctl string fix. It always terminates the returned string with a NUL, even if the full result wouldn't fit in the user-supplied buffer. The returned length is the full untruncated length, so that you can tell when truncation has occurred. Signed-off-by: Linus Torvalds --- kernel/sysctl.c | 25 +++++++++++++++---------- 1 file changed, 15 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index e5102ea6e104..b53115b882e1 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -2192,27 +2192,32 @@ int sysctl_string(ctl_table *table, int __user *name, int nlen, void __user *oldval, size_t __user *oldlenp, void __user *newval, size_t newlen, void **context) { - size_t l, len; - if (!table->data || !table->maxlen) return -ENOTDIR; if (oldval && oldlenp) { - if (get_user(len, oldlenp)) + size_t bufsize; + if (get_user(bufsize, oldlenp)) return -EFAULT; - if (len) { - l = strlen(table->data)+1; - if (len > l) len = l; - if (len >= table->maxlen) + if (bufsize) { + size_t len = strlen(table->data), copied; + + /* This shouldn't trigger for a well-formed sysctl */ + if (len > table->maxlen) len = table->maxlen; - if(copy_to_user(oldval, table->data, len)) + + /* Copy up to a max of bufsize-1 bytes of the string */ + copied = (len >= bufsize) ? bufsize - 1 : len; + + if (copy_to_user(oldval, table->data, copied) || + put_user(0, (char __user *)(oldval + copied))) return -EFAULT; - if(put_user(len, oldlenp)) + if (put_user(len, oldlenp)) return -EFAULT; } } if (newval && newlen) { - len = newlen; + size_t len = newlen; if (len > table->maxlen) len = table->maxlen; if(copy_from_user(table->data, newval, len)) -- cgit v1.2.2 From febf7ea4bedcd36fba0843db726bba28d22bf89a Mon Sep 17 00:00:00 2001 From: Date: Tue, 3 Jan 2006 11:35:26 +0100 Subject: gitignore: ignore more generated files Signed-off-by: Sam Ravnborg --- kernel/.gitignore | 5 +++++ 1 file changed, 5 insertions(+) create mode 100644 kernel/.gitignore (limited to 'kernel') diff --git a/kernel/.gitignore b/kernel/.gitignore new file mode 100644 index 000000000000..f2ab70073bd4 --- /dev/null +++ b/kernel/.gitignore @@ -0,0 +1,5 @@ +# +# Generated files +# +config_data.h +config_data.gz -- cgit v1.2.2 From f4b09ebc8baa51ec8394c4173e3de9d62b2cc97a Mon Sep 17 00:00:00 2001 From: Adrian Bunk Date: Tue, 3 Jan 2006 13:37:51 +0100 Subject: update the email address of Randy Dunlap This patch removes all references to the bouncing address rddunlap@osdl.org and one dead web page from the kernel. 
Signed-off-by: Adrian Bunk Acked-by: Randy Dunlap --- kernel/configs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/configs.c b/kernel/configs.c index 986f7af31e0a..009e1ebdcb88 100644 --- a/kernel/configs.c +++ b/kernel/configs.c @@ -3,7 +3,7 @@ * Echo the kernel .config file used to build the kernel * * Copyright (C) 2002 Khalid Aziz - * Copyright (C) 2002 Randy Dunlap + * Copyright (C) 2002 Randy Dunlap * Copyright (C) 2002 Al Stone * Copyright (C) 2002 Hewlett-Packard Company * -- cgit v1.2.2 From 0296b2281352e4794e174b393c37f131502e09f0 Mon Sep 17 00:00:00 2001 From: Kay Sievers Date: Fri, 11 Nov 2005 05:33:52 +0100 Subject: [PATCH] remove CONFIG_KOBJECT_UEVENT option It makes zero sense to have hotplug, but not the netlink events enabled today. Remove this option and merge the kobject_uevent.h header into the kobject.h header file. Signed-off-by: Kay Sievers Signed-off-by: Greg Kroah-Hartman --- kernel/sysctl.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index b53115b882e1..6a51e25d4466 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -31,6 +31,7 @@ #include #include #include +#include #include #include #include @@ -83,9 +84,6 @@ static int ngroups_max = NGROUPS_MAX; #ifdef CONFIG_KMOD extern char modprobe_path[]; #endif -#ifdef CONFIG_HOTPLUG -extern char hotplug_path[]; -#endif #ifdef CONFIG_CHR_DEV_SG extern int sg_big_buff; #endif -- cgit v1.2.2 From 0f76e5acf9dc788e664056dda1e461f0bec93948 Mon Sep 17 00:00:00 2001 From: Kay Sievers Date: Fri, 11 Nov 2005 04:58:04 +0100 Subject: [PATCH] add uevent_helper control in /sys/kernel/ This deprecates the /proc/sys/kernel/hotplug file, as all this stuff should be in /sys some day, right? :) In /sys/kernel/ we now have uevent_seqnum and uevent_helper. The seqnum is no longer used by udev, as the version for this kernel depends on netlink, whose events will never get out-of-order. Recent udev versions disable the /sbin/hotplug helper with an init script, cause it leads to OOM on big boxes by running hundreds of shells in parallel. It should be done now by: echo "" > /sys/kernel/uevent_helper (Note that "-n" does not work, cause neither proc nor sysfs support truncate().)
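A rough usage sketch (a hypothetical shell session; only the /sys/kernel/ files introduced by this patch are assumed):

  # cat /sys/kernel/uevent_seqnum                     # read the current event sequence number
  # echo "" > /sys/kernel/uevent_helper               # disable the helper
  # echo /sbin/hotplug > /sys/kernel/uevent_helper    # restore the old behaviour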
Signed-off-by: Kay Sievers Signed-off-by: Greg Kroah-Hartman --- kernel/ksysfs.c | 25 ++++++++++++++++++++++--- 1 file changed, 22 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c index 015fb69ad94d..e975a76a9d5b 100644 --- a/kernel/ksysfs.c +++ b/kernel/ksysfs.c @@ -23,11 +23,29 @@ static struct subsys_attribute _name##_attr = \ __ATTR(_name, 0644, _name##_show, _name##_store) #ifdef CONFIG_HOTPLUG -static ssize_t hotplug_seqnum_show(struct subsystem *subsys, char *page) +/* current uevent sequence number */ +static ssize_t uevent_seqnum_show(struct subsystem *subsys, char *page) { return sprintf(page, "%llu\n", (unsigned long long)hotplug_seqnum); } -KERNEL_ATTR_RO(hotplug_seqnum); +KERNEL_ATTR_RO(uevent_seqnum); + +/* uevent helper program, used during early boot */ +static ssize_t uevent_helper_show(struct subsystem *subsys, char *page) +{ + return sprintf(page, "%s\n", hotplug_path); +} +static ssize_t uevent_helper_store(struct subsystem *subsys, const char *page, size_t count) +{ + if (count+1 > HOTPLUG_PATH_LEN) + return -ENOENT; + memcpy(hotplug_path, page, count); + hotplug_path[count] = '\0'; + if (count && hotplug_path[count-1] == '\n') + hotplug_path[count-1] = '\0'; + return count; +} +KERNEL_ATTR_RW(uevent_helper); #endif #ifdef CONFIG_KEXEC @@ -45,7 +63,8 @@ EXPORT_SYMBOL_GPL(kernel_subsys); static struct attribute * kernel_attrs[] = { #ifdef CONFIG_HOTPLUG - &hotplug_seqnum_attr.attr, + &uevent_seqnum_attr.attr, + &uevent_helper_attr.attr, #endif #ifdef CONFIG_KEXEC &crash_notes_attr.attr, -- cgit v1.2.2 From 312c004d36ce6c739512bac83b452f4c20ab1f62 Mon Sep 17 00:00:00 2001 From: Kay Sievers Date: Wed, 16 Nov 2005 09:00:00 +0100 Subject: [PATCH] driver core: replace "hotplug" by "uevent" Leave the overloaded "hotplug" word to subsystems which are handling real devices. The driver core does not "plug" anything, it just exports the state to userspace and generates events.
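The userspace-visible names are kept; only kernel-internal symbols are renamed, so the old and the new interface keep operating on the same buffer. A sketch of what that means in practice (hypothetical session; the sysctl hunk below keeps .procname = "hotplug" but points .data at uevent_helper):

  # echo /sbin/hotplug > /proc/sys/kernel/hotplug
  # cat /sys/kernel/uevent_helper
  /sbin/hotplug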
Signed-off-by: Kay Sievers Signed-off-by: Greg Kroah-Hartman --- kernel/ksysfs.c | 14 +++++++------- kernel/sysctl.c | 4 ++-- 2 files changed, 9 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c index e975a76a9d5b..bfb4a7a54e22 100644 --- a/kernel/ksysfs.c +++ b/kernel/ksysfs.c @@ -26,23 +26,23 @@ static struct subsys_attribute _name##_attr = \ /* current uevent sequence number */ static ssize_t uevent_seqnum_show(struct subsystem *subsys, char *page) { - return sprintf(page, "%llu\n", (unsigned long long)hotplug_seqnum); + return sprintf(page, "%llu\n", (unsigned long long)uevent_seqnum); } KERNEL_ATTR_RO(uevent_seqnum); /* uevent helper program, used during early boot */ static ssize_t uevent_helper_show(struct subsystem *subsys, char *page) { - return sprintf(page, "%s\n", hotplug_path); + return sprintf(page, "%s\n", uevent_helper); } static ssize_t uevent_helper_store(struct subsystem *subsys, const char *page, size_t count) { - if (count+1 > HOTPLUG_PATH_LEN) + if (count+1 > UEVENT_HELPER_PATH_LEN) return -ENOENT; - memcpy(hotplug_path, page, count); - hotplug_path[count] = '\0'; - if (count && hotplug_path[count-1] == '\n') - hotplug_path[count-1] = '\0'; + memcpy(uevent_helper, page, count); + uevent_helper[count] = '\0'; + if (count && uevent_helper[count-1] == '\n') + uevent_helper[count-1] = '\0'; return count; } KERNEL_ATTR_RW(uevent_helper); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 6a51e25d4466..345f4a1d533f 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -395,8 +395,8 @@ static ctl_table kern_table[] = { { .ctl_name = KERN_HOTPLUG, .procname = "hotplug", - .data = &hotplug_path, - .maxlen = HOTPLUG_PATH_LEN, + .data = &uevent_helper, + .maxlen = UEVENT_HELPER_PATH_LEN, .mode = 0644, .proc_handler = &proc_dostring, .strategy = &sysctl_string, -- cgit v1.2.2 From f743ca5e10f4145e0b3e6d11b9b46171e16af7ce Mon Sep 17 00:00:00 2001 From: "akpm@osdl.org" Date: Tue, 22 Nov 2005 23:36:13 -0800 Subject: [PATCH] kobject_uevent CONFIG_NET=n fix lib/lib.a(kobject_uevent.o)(.text+0x25f): In function `kobject_uevent': : undefined reference to `__alloc_skb' lib/lib.a(kobject_uevent.o)(.text+0x2a1): In function `kobject_uevent': : undefined reference to `skb_over_panic' lib/lib.a(kobject_uevent.o)(.text+0x31d): In function `kobject_uevent': : undefined reference to `skb_over_panic' lib/lib.a(kobject_uevent.o)(.text+0x356): In function `kobject_uevent': : undefined reference to `netlink_broadcast' lib/lib.a(kobject_uevent.o)(.init.text+0x9): In function `kobject_uevent_init': : undefined reference to `netlink_kernel_create' make: *** [.tmp_vmlinux1] Error 1 Netlink is unconditionally enabled if CONFIG_NET, so that's OK. kobject_uevent.o is compiled even if !CONFIG_HOTPLUG, which is lazy. Let's compound the sin. Signed-off-by: Andrew Morton Signed-off-by: Greg Kroah-Hartman --- kernel/ksysfs.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c index bfb4a7a54e22..99af8b05eeaa 100644 --- a/kernel/ksysfs.c +++ b/kernel/ksysfs.c @@ -15,6 +15,9 @@ #include #include +u64 uevent_seqnum; +char uevent_helper[UEVENT_HELPER_PATH_LEN] = "/sbin/hotplug"; + #define KERNEL_ATTR_RO(_name) \ static struct subsys_attribute _name##_attr = __ATTR_RO(_name) -- cgit v1.2.2 From a576219aca70e6700705a9836e098dbecd25fb56 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Fri, 6 Jan 2006 00:09:50 -0800 Subject: [PATCH] swsusp: resume_store() retval fix - This function returns -EINVAL all the time.
Fix. - Decruftify it a bit too. - Writing to it doesn't seem to do what it's supposed to do. Cc: Pavel Machek Cc: "Rafael J. Wysocki" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/power/disk.c | 34 ++++++++++++++++------------------ 1 file changed, 16 insertions(+), 18 deletions(-) (limited to 'kernel') diff --git a/kernel/power/disk.c b/kernel/power/disk.c index 027322a564f4..4d944b281b28 100644 --- a/kernel/power/disk.c +++ b/kernel/power/disk.c @@ -363,30 +363,28 @@ static ssize_t resume_show(struct subsystem * subsys, char *buf) MINOR(swsusp_resume_device)); } -static ssize_t resume_store(struct subsystem * subsys, const char * buf, size_t n) +static ssize_t resume_store(struct subsystem *subsys, const char *buf, size_t n) { - int len; - char *p; unsigned int maj, min; - int error = -EINVAL; dev_t res; + int ret = -EINVAL; - p = memchr(buf, '\n', n); - len = p ? p - buf : n; + if (sscanf(buf, "%u:%u", &maj, &min) != 2) + goto out; - if (sscanf(buf, "%u:%u", &maj, &min) == 2) { - res = MKDEV(maj,min); - if (maj == MAJOR(res) && min == MINOR(res)) { - down(&pm_sem); - swsusp_resume_device = res; - up(&pm_sem); - printk("Attempting manual resume\n"); - noresume = 0; - software_resume(); - } - } + res = MKDEV(maj,min); + if (maj != MAJOR(res) || min != MINOR(res)) + goto out; - return error >= 0 ? n : error; + down(&pm_sem); + swsusp_resume_device = res; + up(&pm_sem); + printk("Attempting manual resume\n"); + noresume = 0; + software_resume(); + ret = n; +out: + return ret; } power_attr(resume); -- cgit v1.2.2 From 7ee1dd3fee22f15728f545d266403fc977e1eb99 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 6 Jan 2006 00:11:44 -0800 Subject: [PATCH] FRV: Make futex code compilable on nommu [try #2] Make the futex code compilable and usable on NOMMU by making the attempt to handle page faults conditional on CONFIG_MMU. If this is not enabled, then we can assume that EFAULT returned from futex_atomic_op_inuser() is not recoverable, and that the address lies outside of valid memory. handle_mm_fault() is made to BUG if called on NOMMU without attempting to invoke the actual handler (__handle_mm_fault). Signed-off-by: David Howells Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/futex.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index 5e71a6bf6f6b..5efa2f978032 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -356,6 +356,13 @@ retry: if (bh1 != bh2) spin_unlock(&bh2->lock); +#ifndef CONFIG_MMU + /* we don't get EFAULT from MMU faults if we don't have an MMU, + * but we might get them from range checking */ + ret = op_ret; + goto out; +#endif + if (unlikely(op_ret != -EFAULT)) { ret = op_ret; goto out; -- cgit v1.2.2 From eee45269b0f5979c70bc151c6c2f4e5f4f5ababe Mon Sep 17 00:00:00 2001 From: Ivan Kokshaysky Date: Fri, 6 Jan 2006 00:12:21 -0800 Subject: [PATCH] Alpha: convert to generic irq framework (generic part) Thanks to Christoph for doing most of the work. This allows automatic SMP IRQ affinity assignment other than the default "all interrupts on all CPUs", which is rather expensive. This might be useful if the hardware can be programmed to distribute interrupts among different CPUs, like Alpha does.
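For architectures without such hardware support, a no-op hook is enough. A minimal sketch of what such a default could look like (illustrative only; the real Alpha implementation lives under arch/ and is not part of this kernel/ diff, and the return convention is inferred from the irq_affinity_write_proc() hunk below, where a nonzero result maps to -EINVAL):

static inline int select_smp_affinity(unsigned int irq)
{
	/* Leave the affinity mask as it is and report success, so that
	 * writing an empty cpumask to /proc/irq/N/smp_affinity is not
	 * rejected with -EINVAL on such architectures. */
	return 0;
}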
Signed-off-by: Ivan Kokshaysky Cc: Christoph Hellwig Cc: Richard Henderson Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/irq/manage.c | 2 ++ kernel/irq/proc.c | 4 +++- 2 files changed, 5 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 81c49a4d679e..97d5559997d2 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -366,6 +366,8 @@ int request_irq(unsigned int irq, action->next = NULL; action->dev_id = dev_id; + select_smp_affinity(irq); + retval = setup_irq(irq, action); if (retval) kfree(action); diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c index f26e534c6585..8a64a4844cde 100644 --- a/kernel/irq/proc.c +++ b/kernel/irq/proc.c @@ -68,7 +68,9 @@ static int irq_affinity_write_proc(struct file *file, const char __user *buffer, */ cpus_and(tmp, new_value, cpu_online_map); if (cpus_empty(tmp)) - return -EINVAL; + /* Special case for empty set - allow the architecture + code to set default SMP affinity. */ + return select_smp_affinity(irq) ? -EINVAL : full_count; proc_set_irq_affinity(irq, new_value); -- cgit v1.2.2 From f2d97f02961e8b1f8a24befb88ab0e5c886586ff Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Fri, 6 Jan 2006 00:12:24 -0800 Subject: [PATCH] swsusp: remove encryption This patch removes the image encryption from swsusp. The encryption is only used instead of zeroing the image after resume, in order to prevent someone from reading confidential data from the image in the future; it does not protect the image from being read by an unauthorized person before resume. The functionality it provides should really belong to the user space and will possibly be reimplemented after the swap-handling functionality of swsusp is moved to the user space. Signed-off-by: Rafael J. Wysocki Acked-by: Pavel Machek Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/power/swsusp.c | 163 ++------------------------------------------------ 1 file changed, 4 insertions(+), 159 deletions(-) (limited to 'kernel') diff --git a/kernel/power/swsusp.c b/kernel/power/swsusp.c index c05f46e7348f..bd3097c583bf 100644 --- a/kernel/power/swsusp.c +++ b/kernel/power/swsusp.c @@ -30,9 +30,6 @@ * Alex Badea : * Fixed runaway init * - * Andreas Steinmetz : * - Added encrypted suspend option * - * More state savers are welcome. Especially for the scsi layer...
* * For TODOs,FIXMEs also look in Documentation/power/swsusp.txt @@ -67,10 +64,6 @@ #include #include -#include -#include -#include - #include "power.h" #ifdef CONFIG_HIGHMEM @@ -81,10 +74,6 @@ static int save_highmem(void) { return 0; } static int restore_highmem(void) { return 0; } #endif -#define CIPHER "aes" -#define MAXKEY 32 -#define MAXIV 32 - extern char resume_file[]; /* Local variables that should not be affected by save */ @@ -102,8 +91,7 @@ suspend_pagedir_t *pagedir_nosave __nosavedata = NULL; #define SWSUSP_SIG "S1SUSPEND" static struct swsusp_header { - char reserved[PAGE_SIZE - 20 - MAXKEY - MAXIV - sizeof(swp_entry_t)]; - u8 key_iv[MAXKEY+MAXIV]; + char reserved[PAGE_SIZE - 20 - sizeof(swp_entry_t)]; swp_entry_t swsusp_info; char orig_sig[10]; char sig[10]; @@ -123,131 +111,6 @@ static struct swsusp_info swsusp_info; static unsigned short swapfile_used[MAX_SWAPFILES]; static unsigned short root_swap; -static int write_page(unsigned long addr, swp_entry_t *loc); -static int bio_read_page(pgoff_t page_off, void *page); - -static u8 key_iv[MAXKEY+MAXIV]; - -#ifdef CONFIG_SWSUSP_ENCRYPT - -static int crypto_init(int mode, void **mem) -{ - int error = 0; - int len; - char *modemsg; - struct crypto_tfm *tfm; - - modemsg = mode ? "suspend not possible" : "resume not possible"; - - tfm = crypto_alloc_tfm(CIPHER, CRYPTO_TFM_MODE_CBC); - if(!tfm) { - printk(KERN_ERR "swsusp: no tfm, %s\n", modemsg); - error = -EINVAL; - goto out; - } - - if(MAXKEY < crypto_tfm_alg_min_keysize(tfm)) { - printk(KERN_ERR "swsusp: key buffer too small, %s\n", modemsg); - error = -ENOKEY; - goto fail; - } - - if (mode) - get_random_bytes(key_iv, MAXKEY+MAXIV); - - len = crypto_tfm_alg_max_keysize(tfm); - if (len > MAXKEY) - len = MAXKEY; - - if (crypto_cipher_setkey(tfm, key_iv, len)) { - printk(KERN_ERR "swsusp: key setup failure, %s\n", modemsg); - error = -EKEYREJECTED; - goto fail; - } - - len = crypto_tfm_alg_ivsize(tfm); - - if (MAXIV < len) { - printk(KERN_ERR "swsusp: iv buffer too small, %s\n", modemsg); - error = -EOVERFLOW; - goto fail; - } - - crypto_cipher_set_iv(tfm, key_iv+MAXKEY, len); - - *mem=(void *)tfm; - - goto out; - -fail: crypto_free_tfm(tfm); -out: return error; -} - -static __inline__ void crypto_exit(void *mem) -{ - crypto_free_tfm((struct crypto_tfm *)mem); -} - -static __inline__ int crypto_write(struct pbe *p, void *mem) -{ - int error = 0; - struct scatterlist src, dst; - - src.page = virt_to_page(p->address); - src.offset = 0; - src.length = PAGE_SIZE; - dst.page = virt_to_page((void *)&swsusp_header); - dst.offset = 0; - dst.length = PAGE_SIZE; - - error = crypto_cipher_encrypt((struct crypto_tfm *)mem, &dst, &src, - PAGE_SIZE); - - if (!error) - error = write_page((unsigned long)&swsusp_header, - &(p->swap_address)); - return error; -} - -static __inline__ int crypto_read(struct pbe *p, void *mem) -{ - int error = 0; - struct scatterlist src, dst; - - error = bio_read_page(swp_offset(p->swap_address), (void *)p->address); - if (!error) { - src.offset = 0; - src.length = PAGE_SIZE; - dst.offset = 0; - dst.length = PAGE_SIZE; - src.page = dst.page = virt_to_page((void *)p->address); - - error = crypto_cipher_decrypt((struct crypto_tfm *)mem, &dst, - &src, PAGE_SIZE); - } - return error; -} -#else -static __inline__ int crypto_init(int mode, void *mem) -{ - return 0; -} - -static __inline__ void crypto_exit(void *mem) -{ -} - -static __inline__ int crypto_write(struct pbe *p, void *mem) -{ - return write_page(p->address, &(p->swap_address)); -} - -static __inline__ int 
crypto_read(struct pbe *p, void *mem) -{ - return bio_read_page(swp_offset(p->swap_address), (void *)p->address); -} -#endif - static int mark_swapfiles(swp_entry_t prev) { int error; @@ -259,7 +122,6 @@ static int mark_swapfiles(swp_entry_t prev) !memcmp("SWAPSPACE2",swsusp_header.sig, 10)) { memcpy(swsusp_header.orig_sig,swsusp_header.sig, 10); memcpy(swsusp_header.sig,SWSUSP_SIG, 10); - memcpy(swsusp_header.key_iv, key_iv, MAXKEY+MAXIV); swsusp_header.swsusp_info = prev; error = rw_swap_page_sync(WRITE, swp_entry(root_swap, 0), @@ -405,10 +267,6 @@ static int data_write(void) int error = 0, i = 0; unsigned int mod = nr_copy_pages / 100; struct pbe *p; - void *tfm; - - if ((error = crypto_init(1, &tfm))) - return error; if (!mod) mod = 1; @@ -417,14 +275,11 @@ static int data_write(void) for_each_pbe (p, pagedir_nosave) { if (!(i%mod)) printk( "\b\b\b\b%3d%%", i / mod ); - if ((error = crypto_write(p, tfm))) { - crypto_exit(tfm); + if ((error = write_page(p->address, &p->swap_address))) return error; - } i++; } printk("\b\b\b\bdone\n"); - crypto_exit(tfm); return error; } @@ -550,7 +405,6 @@ static int write_suspend_image(void) if ((error = close_swap())) goto FreePagedir; Done: - memset(key_iv, 0, MAXKEY+MAXIV); return error; FreePagedir: free_pagedir_entries(); @@ -812,8 +666,6 @@ static int check_sig(void) return error; if (!memcmp(SWSUSP_SIG, swsusp_header.sig, 10)) { memcpy(swsusp_header.sig, swsusp_header.orig_sig, 10); - memcpy(key_iv, swsusp_header.key_iv, MAXKEY+MAXIV); - memset(swsusp_header.key_iv, 0, MAXKEY+MAXIV); /* * Reset swap signature now. @@ -840,10 +692,6 @@ static int data_read(struct pbe *pblist) int error = 0; int i = 0; int mod = swsusp_info.image_pages / 100; - void *tfm; - - if ((error = crypto_init(0, &tfm))) - return error; if (!mod) mod = 1; @@ -855,15 +703,13 @@ static int data_read(struct pbe *pblist) if (!(i % mod)) printk("\b\b\b\b%3d%%", i / mod); - if ((error = crypto_read(p, tfm))) { - crypto_exit(tfm); + if ((error = bio_read_page(swp_offset(p->swap_address), + (void *)p->address))) return error; - } i++; } printk("\b\b\b\bdone\n"); - crypto_exit(tfm); return error; } @@ -986,7 +832,6 @@ int swsusp_read(void) error = read_suspend_image(); blkdev_put(resume_bdev); - memset(key_iv, 0, MAXKEY+MAXIV); if (!error) pr_debug("swsusp: Reading resume file was successful\n"); -- cgit v1.2.2 From 7088a5c00103ef48782d6c359cd12b13a10666e6 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Fri, 6 Jan 2006 00:13:05 -0800 Subject: [PATCH] swsusp: introduce the swap map structure MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This patch introduces the swap map structure that can be used by swsusp for keeping tracks of data pages written to the swap.  The structure itself is described in a comment within the patch. The overall idea is to reduce the amount of metadata written to the swap and to write and read the image pages sequentially, in a file-alike way. This makes the swap-handling part of swsusp fairly independent of its snapshot-handling part and will hopefully allow us to completely separate these two parts in the future. This patch is needed to remove the suspend image size limit imposed by the limited size of the swsusp_info structure, which is essential for x86-64 systems with more than 512 MB of RAM. Signed-off-by: Rafael J. 
Wysocki Acked-by: Pavel Machek Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/power/disk.c | 8 +- kernel/power/power.h | 13 +- kernel/power/snapshot.c | 14 +- kernel/power/swsusp.c | 558 ++++++++++++++++++++++++++++++++++-------------- 4 files changed, 417 insertions(+), 176 deletions(-) (limited to 'kernel') diff --git a/kernel/power/disk.c b/kernel/power/disk.c index 4d944b281b28..76a5131b0e80 100644 --- a/kernel/power/disk.c +++ b/kernel/power/disk.c @@ -25,9 +25,9 @@ extern suspend_disk_method_t pm_disk_mode; extern int swsusp_suspend(void); -extern int swsusp_write(void); +extern int swsusp_write(struct pbe *pblist, unsigned int nr_pages); extern int swsusp_check(void); -extern int swsusp_read(void); +extern int swsusp_read(struct pbe **pblist_ptr); extern void swsusp_close(void); extern int swsusp_resume(void); @@ -176,7 +176,7 @@ int pm_suspend_disk(void) if (in_suspend) { device_resume(); pr_debug("PM: writing image.\n"); - error = swsusp_write(); + error = swsusp_write(pagedir_nosave, nr_copy_pages); if (!error) power_down(pm_disk_mode); else { @@ -247,7 +247,7 @@ static int software_resume(void) pr_debug("PM: Reading swsusp image.\n"); - if ((error = swsusp_read())) { + if ((error = swsusp_read(&pagedir_nosave))) { swsusp_free(); goto Thaw; } diff --git a/kernel/power/power.h b/kernel/power/power.h index 6c042b5ee14b..977877c6dcfc 100644 --- a/kernel/power/power.h +++ b/kernel/power/power.h @@ -9,19 +9,14 @@ #define SUSPEND_CONSOLE (MAX_NR_CONSOLES-1) #endif -#define MAX_PBES ((PAGE_SIZE - sizeof(struct new_utsname) \ - - 4 - 3*sizeof(unsigned long) - sizeof(int) \ - - sizeof(void *)) / sizeof(swp_entry_t)) - struct swsusp_info { struct new_utsname uts; u32 version_code; unsigned long num_physpages; int cpus; unsigned long image_pages; - unsigned long pagedir_pages; - suspend_pagedir_t * suspend_pagedir; - swp_entry_t pagedir[MAX_PBES]; + unsigned long pages; + swp_entry_t start; } __attribute__((aligned(PAGE_SIZE))); @@ -67,6 +62,8 @@ extern asmlinkage int swsusp_arch_resume(void); extern void free_pagedir(struct pbe *pblist); extern struct pbe *alloc_pagedir(unsigned nr_pages, gfp_t gfp_mask, int safe_needed); -extern void create_pbe_list(struct pbe *pblist, unsigned nr_pages); extern void swsusp_free(void); extern int alloc_data_pages(struct pbe *pblist, gfp_t gfp_mask, int safe_needed); +extern unsigned int snapshot_nr_pages(void); +extern struct pbe *snapshot_pblist(void); +extern void snapshot_pblist_set(struct pbe *pblist); diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index 4a6dbcefd378..152d56cdf017 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -33,6 +33,9 @@ #include "power.h" +struct pbe *pagedir_nosave; +unsigned int nr_copy_pages; + #ifdef CONFIG_HIGHMEM struct highmem_page { char *data; @@ -244,7 +247,7 @@ static inline void fill_pb_page(struct pbe *pbpage) * of memory pages allocated with alloc_pagedir() */ -void create_pbe_list(struct pbe *pblist, unsigned int nr_pages) +static inline void create_pbe_list(struct pbe *pblist, unsigned int nr_pages) { struct pbe *pbpage, *p; unsigned int num = PBES_PER_PAGE; @@ -261,7 +264,6 @@ void create_pbe_list(struct pbe *pblist, unsigned int nr_pages) p->next = p + 1; p->next = NULL; } - pr_debug("create_pbe_list(): initialized %d PBEs\n", num); } /** @@ -332,7 +334,8 @@ struct pbe *alloc_pagedir(unsigned int nr_pages, gfp_t gfp_mask, int safe_needed if (!pbe) { /* get_zeroed_page() failed */ free_pagedir(pblist); pblist = NULL; - } + } else + 
create_pbe_list(pblist, nr_pages); return pblist; } @@ -395,7 +398,6 @@ static struct pbe *swsusp_alloc(unsigned int nr_pages) printk(KERN_ERR "suspend: Allocating pagedir failed.\n"); return NULL; } - create_pbe_list(pblist, nr_pages); if (alloc_data_pages(pblist, GFP_ATOMIC | __GFP_COLD, 0)) { printk(KERN_ERR "suspend: Allocating image pages failed.\n"); @@ -421,10 +423,6 @@ asmlinkage int swsusp_save(void) (nr_pages + PBES_PER_PAGE - 1) / PBES_PER_PAGE, PAGES_FOR_IO, nr_free_pages()); - /* This is needed because of the fixed size of swsusp_info */ - if (MAX_PBES < (nr_pages + PBES_PER_PAGE - 1) / PBES_PER_PAGE) - return -ENOSPC; - if (!enough_free_mem(nr_pages)) { printk(KERN_ERR "swsusp: Not enough free memory\n"); return -ENOMEM; diff --git a/kernel/power/swsusp.c b/kernel/power/swsusp.c index bd3097c583bf..b09bd7c0998d 100644 --- a/kernel/power/swsusp.c +++ b/kernel/power/swsusp.c @@ -30,6 +30,9 @@ * Alex Badea : * Fixed runaway init * + * Rafael J. Wysocki + * Added the swap map data structure and reworked the handling of swap + * * More state savers are welcome. Especially for the scsi layer... * * For TODOs,FIXMEs also look in Documentation/power/swsusp.txt @@ -76,18 +79,6 @@ static int restore_highmem(void) { return 0; } extern char resume_file[]; -/* Local variables that should not be affected by save */ -unsigned int nr_copy_pages __nosavedata = 0; - -/* Suspend pagedir is allocated before final copy, therefore it - must be freed after resume - - Warning: this is even more evil than it seems. Pagedirs this file - talks about are completely different from page directories used by - MMU hardware. - */ -suspend_pagedir_t *pagedir_nosave __nosavedata = NULL; - #define SWSUSP_SIG "S1SUSPEND" static struct swsusp_header { @@ -238,48 +229,205 @@ static int write_page(unsigned long addr, swp_entry_t *loc) } /** - * data_free - Free the swap entries used by the saved image. + * Swap map-handling functions + * + * The swap map is a data structure used for keeping track of each page + * written to the swap. It consists of many swap_map_page structures + * that contain each an array of MAP_PAGE_SIZE swap entries. + * These structures are linked together with the help of either the + * .next (in memory) or the .next_swap (in swap) member. * - * Walk the list of used swap entries and free each one. - * This is only used for cleanup when suspend fails. + * The swap map is created during suspend. At that time we need to keep + * it in memory, because we have to free all of the allocated swap + * entries if an error occurs. The memory needed is preallocated + * so that we know in advance if there's enough of it. + * + * The first swap_map_page structure is filled with the swap entries that + * correspond to the first MAP_PAGE_SIZE data pages written to swap and + * so on. After the all of the data pages have been written, the order + * of the swap_map_page structures in the map is reversed so that they + * can be read from swap in the original order. This causes the data + * pages to be loaded in exactly the same order in which they have been + * saved. + * + * During resume we only need to use one swap_map_page structure + * at a time, which means that we only need to use two memory pages for + * reading the image - one for reading the swap_map_page structures + * and the second for reading the data pages from swap. 
*/ -static void data_free(void) + +#define MAP_PAGE_SIZE ((PAGE_SIZE - sizeof(swp_entry_t) - sizeof(void *)) \ + / sizeof(swp_entry_t)) + +struct swap_map_page { + swp_entry_t entries[MAP_PAGE_SIZE]; + swp_entry_t next_swap; + struct swap_map_page *next; +}; + +static inline void free_swap_map(struct swap_map_page *swap_map) { - swp_entry_t entry; - struct pbe *p; + struct swap_map_page *swp; - for_each_pbe (p, pagedir_nosave) { - entry = p->swap_address; - if (entry.val) - swap_free(entry); - else - break; + while (swap_map) { + swp = swap_map->next; + free_page((unsigned long)swap_map); + swap_map = swp; + } +} + +static struct swap_map_page *alloc_swap_map(unsigned int nr_pages) +{ + struct swap_map_page *swap_map, *swp; + unsigned n = 0; + + if (!nr_pages) + return NULL; + + pr_debug("alloc_swap_map(): nr_pages = %d\n", nr_pages); + swap_map = (struct swap_map_page *)get_zeroed_page(GFP_ATOMIC); + swp = swap_map; + for (n = MAP_PAGE_SIZE; n < nr_pages; n += MAP_PAGE_SIZE) { + swp->next = (struct swap_map_page *)get_zeroed_page(GFP_ATOMIC); + swp = swp->next; + if (!swp) { + free_swap_map(swap_map); + return NULL; + } } + return swap_map; } /** - * data_write - Write saved image to swap. - * - * Walk the list of pages in the image and sync each one to swap. + * reverse_swap_map - reverse the order of pages in the swap map + * @swap_map */ -static int data_write(void) + +static inline struct swap_map_page *reverse_swap_map(struct swap_map_page *swap_map) { - int error = 0, i = 0; - unsigned int mod = nr_copy_pages / 100; - struct pbe *p; + struct swap_map_page *prev, *next; + + prev = NULL; + while (swap_map) { + next = swap_map->next; + swap_map->next = prev; + prev = swap_map; + swap_map = next; + } + return prev; +} - if (!mod) - mod = 1; +/** + * free_swap_map_entries - free the swap entries allocated to store + * the swap map @swap_map (this is only called in case of an error) + */ +static inline void free_swap_map_entries(struct swap_map_page *swap_map) +{ + while (swap_map) { + if (swap_map->next_swap.val) + swap_free(swap_map->next_swap); + swap_map = swap_map->next; + } +} - printk( "Writing data to swap (%d pages)... 
", nr_copy_pages ); - for_each_pbe (p, pagedir_nosave) { - if (!(i%mod)) - printk( "\b\b\b\b%3d%%", i / mod ); - if ((error = write_page(p->address, &p->swap_address))) +/** + * save_swap_map - save the swap map used for tracing the data pages + * stored in the swap + */ + +static int save_swap_map(struct swap_map_page *swap_map, swp_entry_t *start) +{ + swp_entry_t entry = (swp_entry_t){0}; + int error; + + while (swap_map) { + swap_map->next_swap = entry; + if ((error = write_page((unsigned long)swap_map, &entry))) return error; - i++; + swap_map = swap_map->next; } - printk("\b\b\b\bdone\n"); + *start = entry; + return 0; +} + +/** + * free_image_entries - free the swap entries allocated to store + * the image data pages (this is only called in case of an error) + */ + +static inline void free_image_entries(struct swap_map_page *swp) +{ + unsigned k; + + while (swp) { + for (k = 0; k < MAP_PAGE_SIZE; k++) + if (swp->entries[k].val) + swap_free(swp->entries[k]); + swp = swp->next; + } +} + +/** + * The swap_map_handle structure is used for handling the swap map in + * a file-alike way + */ + +struct swap_map_handle { + struct swap_map_page *cur; + unsigned int k; +}; + +static inline void init_swap_map_handle(struct swap_map_handle *handle, + struct swap_map_page *map) +{ + handle->cur = map; + handle->k = 0; +} + +static inline int swap_map_write_page(struct swap_map_handle *handle, + unsigned long addr) +{ + int error; + + error = write_page(addr, handle->cur->entries + handle->k); + if (error) + return error; + if (++handle->k >= MAP_PAGE_SIZE) { + handle->cur = handle->cur->next; + handle->k = 0; + } + return 0; +} + +/** + * save_image_data - save the data pages pointed to by the PBEs + * from the list @pblist using the swap map handle @handle + * (assume there are @nr_pages data pages to save) + */ + +static int save_image_data(struct pbe *pblist, + struct swap_map_handle *handle, + unsigned int nr_pages) +{ + unsigned int m; + struct pbe *p; + int error = 0; + + printk("Saving image data pages (%u pages) ... ", nr_pages); + m = nr_pages / 100; + if (!m) + m = 1; + nr_pages = 0; + for_each_pbe (p, pblist) { + error = swap_map_write_page(handle, p->address); + if (error) + break; + if (!(nr_pages % m)) + printk("\b\b\b\b%3d%%", nr_pages / m); + nr_pages++; + } + if (!error) + printk("\b\b\b\bdone\n"); return error; } @@ -295,19 +443,20 @@ static void dump_info(void) pr_debug(" swsusp: UTS Domain: %s\n",swsusp_info.uts.domainname); pr_debug(" swsusp: CPUs: %d\n",swsusp_info.cpus); pr_debug(" swsusp: Image: %ld Pages\n",swsusp_info.image_pages); - pr_debug(" swsusp: Pagedir: %ld Pages\n",swsusp_info.pagedir_pages); + pr_debug(" swsusp: Total: %ld Pages\n", swsusp_info.pages); } -static void init_header(void) +static void init_header(unsigned int nr_pages) { memset(&swsusp_info, 0, sizeof(swsusp_info)); swsusp_info.version_code = LINUX_VERSION_CODE; swsusp_info.num_physpages = num_physpages; memcpy(&swsusp_info.uts, &system_utsname, sizeof(system_utsname)); - swsusp_info.suspend_pagedir = pagedir_nosave; swsusp_info.cpus = num_online_cpus(); - swsusp_info.image_pages = nr_copy_pages; + swsusp_info.image_pages = nr_pages; + swsusp_info.pages = nr_pages + + ((nr_pages * sizeof(long) + PAGE_SIZE - 1) >> PAGE_SHIFT); } static int close_swap(void) @@ -326,39 +475,53 @@ static int close_swap(void) } /** - * free_pagedir_entries - Free pages used by the page directory. - * - * This is used during suspend for error recovery. 
+ * pack_orig_addresses - the .orig_address fields of the PBEs from the + * list starting at @pbe are stored in the array @buf[] (1 page) */ -static void free_pagedir_entries(void) +static inline struct pbe *pack_orig_addresses(unsigned long *buf, + struct pbe *pbe) { - int i; + int j; - for (i = 0; i < swsusp_info.pagedir_pages; i++) - swap_free(swsusp_info.pagedir[i]); + for (j = 0; j < PAGE_SIZE / sizeof(long) && pbe; j++) { + buf[j] = pbe->orig_address; + pbe = pbe->next; + } + if (!pbe) + for (; j < PAGE_SIZE / sizeof(long); j++) + buf[j] = 0; + return pbe; } - /** - * write_pagedir - Write the array of pages holding the page directory. - * @last: Last swap entry we write (needed for header). + * save_image_metadata - save the .orig_address fields of the PBEs + * from the list @pblist using the swap map handle @handle */ -static int write_pagedir(void) +static int save_image_metadata(struct pbe *pblist, + struct swap_map_handle *handle) { - int error = 0; + unsigned long *buf; unsigned int n = 0; - struct pbe *pbe; + struct pbe *p; + int error = 0; - printk( "Writing pagedir..."); - for_each_pb_page (pbe, pagedir_nosave) { - if ((error = write_page((unsigned long)pbe, &swsusp_info.pagedir[n++]))) - return error; + printk("Saving image metadata ... "); + buf = (unsigned long *)get_zeroed_page(GFP_ATOMIC); + if (!buf) + return -ENOMEM; + p = pblist; + while (p) { + p = pack_orig_addresses(buf, p); + error = swap_map_write_page(handle, (unsigned long)buf); + if (error) + break; + n++; } - - swsusp_info.pagedir_pages = n; - printk("done (%u pages)\n", n); + free_page((unsigned long)buf); + if (!error) + printk("done (%u pages saved)\n", n); return error; } @@ -384,33 +547,48 @@ static int enough_swap(unsigned int nr_pages) /** * write_suspend_image - Write entire image and metadata. - * */ -static int write_suspend_image(void) +static int write_suspend_image(struct pbe *pblist, unsigned int nr_pages) { + struct swap_map_page *swap_map; + struct swap_map_handle handle; int error; - if (!enough_swap(nr_copy_pages)) { + if (!enough_swap(nr_pages)) { printk(KERN_ERR "swsusp: Not enough free swap\n"); return -ENOSPC; } - init_header(); - if ((error = data_write())) - goto FreeData; + init_header(nr_pages); + swap_map = alloc_swap_map(swsusp_info.pages); + if (!swap_map) + return -ENOMEM; + init_swap_map_handle(&handle, swap_map); - if ((error = write_pagedir())) - goto FreePagedir; + error = save_image_metadata(pblist, &handle); + if (!error) + error = save_image_data(pblist, &handle, nr_pages); + if (error) + goto Free_image_entries; - if ((error = close_swap())) - goto FreePagedir; - Done: + swap_map = reverse_swap_map(swap_map); + error = save_swap_map(swap_map, &swsusp_info.start); + if (error) + goto Free_map_entries; + + error = close_swap(); + if (error) + goto Free_map_entries; + +Free_swap_map: + free_swap_map(swap_map); return error; - FreePagedir: - free_pagedir_entries(); - FreeData: - data_free(); - goto Done; + +Free_map_entries: + free_swap_map_entries(swap_map); +Free_image_entries: + free_image_entries(swap_map); + goto Free_swap_map; } /* It is important _NOT_ to umount filesystems at this point. We want @@ -418,7 +596,7 @@ static int write_suspend_image(void) * filesystem clean: it is not. (And it does not matter, if we resume * correctly, we'll mark system clean, anyway.) 
*/ -int swsusp_write(void) +int swsusp_write(struct pbe *pblist, unsigned int nr_pages) { int error; @@ -427,14 +605,12 @@ int swsusp_write(void) return error; } lock_swapdevices(); - error = write_suspend_image(); + error = write_suspend_image(pblist, nr_pages); /* This will unlock ignored swap devices since writing is finished */ lock_swapdevices(); return error; } - - int swsusp_suspend(void) { int error; @@ -531,7 +707,6 @@ static void copy_page_backup_list(struct pbe *dst, struct pbe *src) /* We assume both lists contain the same number of elements */ while (src) { dst->orig_address = src->orig_address; - dst->swap_address = src->swap_address; dst = dst->next; src = src->next; } @@ -611,6 +786,61 @@ static int bio_write_page(pgoff_t page_off, void *page) return submit(WRITE, page_off, page); } +/** + * The following functions allow us to read data using a swap map + * in a file-alike way + */ + +static inline void release_swap_map_reader(struct swap_map_handle *handle) +{ + if (handle->cur) + free_page((unsigned long)handle->cur); + handle->cur = NULL; +} + +static inline int get_swap_map_reader(struct swap_map_handle *handle, + swp_entry_t start) +{ + int error; + + if (!swp_offset(start)) + return -EINVAL; + handle->cur = (struct swap_map_page *)get_zeroed_page(GFP_ATOMIC); + if (!handle->cur) + return -ENOMEM; + error = bio_read_page(swp_offset(start), handle->cur); + if (error) { + release_swap_map_reader(handle); + return error; + } + handle->k = 0; + return 0; +} + +static inline int swap_map_read_page(struct swap_map_handle *handle, void *buf) +{ + unsigned long offset; + int error; + + if (!handle->cur) + return -EINVAL; + offset = swp_offset(handle->cur->entries[handle->k]); + if (!offset) + return -EINVAL; + error = bio_read_page(offset, buf); + if (error) + return error; + if (++handle->k >= MAP_PAGE_SIZE) { + handle->k = 0; + offset = swp_offset(handle->cur->next_swap); + if (!offset) + release_swap_map_reader(handle); + else + error = bio_read_page(offset, handle->cur); + } + return error; +} + /* * Sanity check if this image makes sense with this kernel/swap context * I really don't think that it's foolproof but more than nothing.. @@ -639,7 +869,6 @@ static const char *sanity_check(void) return NULL; } - static int check_header(void) { const char *reason = NULL; @@ -653,7 +882,6 @@ static int check_header(void) printk(KERN_ERR "swsusp: Resume mismatch: %s\n",reason); return -EPERM; } - nr_copy_pages = swsusp_info.image_pages; return error; } @@ -680,75 +908,88 @@ static int check_sig(void) } /** - * data_read - Read image pages from swap. - * - * You do not need to check for overlaps, check_pagedir() - * already did that. + * load_image_data - load the image data using the swap map handle + * @handle and store them using the page backup list @pblist + * (assume there are @nr_pages pages to load) */ -static int data_read(struct pbe *pblist) +static int load_image_data(struct pbe *pblist, + struct swap_map_handle *handle, + unsigned int nr_pages) { + int error; + unsigned int m; struct pbe *p; - int error = 0; - int i = 0; - int mod = swsusp_info.image_pages / 100; - - if (!mod) - mod = 1; - - printk("swsusp: Reading image data (%lu pages): ", - swsusp_info.image_pages); - - for_each_pbe (p, pblist) { - if (!(i % mod)) - printk("\b\b\b\b%3d%%", i / mod); - if ((error = bio_read_page(swp_offset(p->swap_address), - (void *)p->address))) - return error; - - i++; + if (!pblist) + return -EINVAL; + printk("Loading image data pages (%u pages) ... 
", nr_pages); + m = nr_pages / 100; + if (!m) + m = 1; + nr_pages = 0; + p = pblist; + while (p) { + error = swap_map_read_page(handle, (void *)p->address); + if (error) + break; + p = p->next; + if (!(nr_pages % m)) + printk("\b\b\b\b%3d%%", nr_pages / m); + nr_pages++; } - printk("\b\b\b\bdone\n"); + if (!error) + printk("\b\b\b\bdone\n"); return error; } /** - * read_pagedir - Read page backup list pages from swap + * unpack_orig_addresses - copy the elements of @buf[] (1 page) to + * the PBEs in the list starting at @pbe */ -static int read_pagedir(struct pbe *pblist) +static inline struct pbe *unpack_orig_addresses(unsigned long *buf, + struct pbe *pbe) { - struct pbe *pbpage, *p; - unsigned int i = 0; - int error; + int j; - if (!pblist) - return -EFAULT; + for (j = 0; j < PAGE_SIZE / sizeof(long) && pbe; j++) { + pbe->orig_address = buf[j]; + pbe = pbe->next; + } + return pbe; +} - printk("swsusp: Reading pagedir (%lu pages)\n", - swsusp_info.pagedir_pages); +/** + * load_image_metadata - load the image metadata using the swap map + * handle @handle and put them into the PBEs in the list @pblist + */ - for_each_pb_page (pbpage, pblist) { - unsigned long offset = swp_offset(swsusp_info.pagedir[i++]); +static int load_image_metadata(struct pbe *pblist, struct swap_map_handle *handle) +{ + struct pbe *p; + unsigned long *buf; + unsigned int n = 0; + int error = 0; - error = -EFAULT; - if (offset) { - p = (pbpage + PB_PAGE_SKIP)->next; - error = bio_read_page(offset, (void *)pbpage); - (pbpage + PB_PAGE_SKIP)->next = p; - } + printk("Loading image metadata ... "); + buf = (unsigned long *)get_zeroed_page(GFP_ATOMIC); + if (!buf) + return -ENOMEM; + p = pblist; + while (p) { + error = swap_map_read_page(handle, buf); if (error) break; + p = unpack_orig_addresses(buf, p); + n++; } - + free_page((unsigned long)buf); if (!error) - BUG_ON(i != swsusp_info.pagedir_pages); - + printk("done (%u pages loaded)\n", n); return error; } - static int check_suspend_image(void) { int error = 0; @@ -762,34 +1003,39 @@ static int check_suspend_image(void) return 0; } -static int read_suspend_image(void) +static int read_suspend_image(struct pbe **pblist_ptr) { int error = 0; - struct pbe *p; + struct pbe *p, *pblist; + struct swap_map_handle handle; + unsigned int nr_pages = swsusp_info.image_pages; - if (!(p = alloc_pagedir(nr_copy_pages, GFP_ATOMIC, 0))) + p = alloc_pagedir(nr_pages, GFP_ATOMIC, 0); + if (!p) return -ENOMEM; - - if ((error = read_pagedir(p))) + error = get_swap_map_reader(&handle, swsusp_info.start); + if (error) + /* The PBE list at p will be released by swsusp_free() */ return error; - create_pbe_list(p, nr_copy_pages); - mark_unsafe_pages(p); - pagedir_nosave = alloc_pagedir(nr_copy_pages, GFP_ATOMIC, 1); - if (pagedir_nosave) { - create_pbe_list(pagedir_nosave, nr_copy_pages); - copy_page_backup_list(pagedir_nosave, p); + error = load_image_metadata(p, &handle); + if (!error) { + mark_unsafe_pages(p); + pblist = alloc_pagedir(nr_pages, GFP_ATOMIC, 1); + if (pblist) + copy_page_backup_list(pblist, p); + free_pagedir(p); + if (!pblist) + error = -ENOMEM; + + /* Allocate memory for the image and read the data from swap */ + if (!error) + error = alloc_data_pages(pblist, GFP_ATOMIC, 1); + if (!error) + error = load_image_data(pblist, &handle, nr_pages); + if (!error) + *pblist_ptr = pblist; } - free_pagedir(p); - if (!pagedir_nosave) - return -ENOMEM; - - /* Allocate memory for the image and read the data from swap */ - - error = alloc_data_pages(pagedir_nosave, GFP_ATOMIC, 1); - - if 
(!error) - error = data_read(pagedir_nosave); - + release_swap_map_reader(&handle); return error; } @@ -821,7 +1067,7 @@ int swsusp_check(void) * swsusp_read - Read saved image from swap. */ -int swsusp_read(void) +int swsusp_read(struct pbe **pblist_ptr) { int error; @@ -830,7 +1076,7 @@ int swsusp_read(void) return PTR_ERR(resume_bdev); } - error = read_suspend_image(); + error = read_suspend_image(pblist_ptr); blkdev_put(resume_bdev); if (!error) -- cgit v1.2.2 From 72a97e08394a3b2e75481ff680ec2a0591e3cba4 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Fri, 6 Jan 2006 00:13:46 -0800 Subject: [PATCH] swsusp: improve freeing of memory MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This patch makes swsusp free only as much memory as needed to complete the suspend and not as much as possible.  In the most of cases this should speed up the suspend and make the system much more responsive after resume, especially if a GUI (eg. X Windows) is used. If needed, the old behavior (ie to free as much memory as possible during suspend) can be restored by unsetting FAST_FREE in power.h Signed-off-by: Rafael J. Wysocki Acked-by: Pavel Machek Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/power/disk.c | 30 +++-------------------- kernel/power/power.h | 14 ++++++++--- kernel/power/snapshot.c | 65 +++++++++++++++++++++++++++++++++++++++++++++---- kernel/power/swsusp.c | 52 ++++++++++++++++++++++++++++++++++++++- 4 files changed, 125 insertions(+), 36 deletions(-) (limited to 'kernel') diff --git a/kernel/power/disk.c b/kernel/power/disk.c index 76a5131b0e80..9e51cdf7b78d 100644 --- a/kernel/power/disk.c +++ b/kernel/power/disk.c @@ -24,6 +24,7 @@ extern suspend_disk_method_t pm_disk_mode; +extern int swsusp_shrink_memory(void); extern int swsusp_suspend(void); extern int swsusp_write(struct pbe *pblist, unsigned int nr_pages); extern int swsusp_check(void); @@ -73,31 +74,6 @@ static void power_down(suspend_disk_method_t mode) static int in_suspend __nosavedata = 0; -/** - * free_some_memory - Try to free as much memory as possible - * - * ... but do not OOM-kill anyone - * - * Notice: all userland should be stopped at this point, or - * livelock is possible. - */ - -static void free_some_memory(void) -{ - unsigned int i = 0; - unsigned int tmp; - unsigned long pages = 0; - char *p = "-\\|/"; - - printk("Freeing memory... "); - while ((tmp = shrink_all_memory(10000))) { - pages += tmp; - printk("\b%c", p[i++ % 4]); - } - printk("\bdone (%li pages freed)\n", pages); -} - - static inline void platform_finish(void) { if (pm_disk_mode == PM_DISK_PLATFORM) { @@ -127,8 +103,8 @@ static int prepare_processes(void) } /* Free memory before shutting down devices. */ - free_some_memory(); - return 0; + if (!(error = swsusp_shrink_memory())) + return 0; thaw: thaw_processes(); enable_nonboot_cpus(); diff --git a/kernel/power/power.h b/kernel/power/power.h index 977877c6dcfc..acdc83b3d890 100644 --- a/kernel/power/power.h +++ b/kernel/power/power.h @@ -49,18 +49,26 @@ extern void thaw_processes(void); extern int pm_prepare_console(void); extern void pm_restore_console(void); - /* References to section boundaries */ extern const void __nosave_begin, __nosave_end; extern unsigned int nr_copy_pages; -extern suspend_pagedir_t *pagedir_nosave; -extern suspend_pagedir_t *pagedir_save; +extern struct pbe *pagedir_nosave; + +/* + * This compilation switch determines the way in which memory will be freed + * during suspend. 
If defined, only as much memory will be freed as needed + * to complete the suspend, which will make it go faster. Otherwise, the + * largest possible amount of memory will be freed. + */ +#define FAST_FREE 1 extern asmlinkage int swsusp_arch_suspend(void); extern asmlinkage int swsusp_arch_resume(void); +extern unsigned int count_data_pages(void); extern void free_pagedir(struct pbe *pblist); +extern void release_eaten_pages(void); extern struct pbe *alloc_pagedir(unsigned nr_pages, gfp_t gfp_mask, int safe_needed); extern void swsusp_free(void); extern int alloc_data_pages(struct pbe *pblist, gfp_t gfp_mask, int safe_needed); diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index 152d56cdf017..e80d282dbf58 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -37,6 +37,31 @@ struct pbe *pagedir_nosave; unsigned int nr_copy_pages; #ifdef CONFIG_HIGHMEM +unsigned int count_highmem_pages(void) +{ + struct zone *zone; + unsigned long zone_pfn; + unsigned int n = 0; + + for_each_zone (zone) + if (is_highmem(zone)) { + mark_free_pages(zone); + for (zone_pfn = 0; zone_pfn < zone->spanned_pages; zone_pfn++) { + struct page *page; + unsigned long pfn = zone_pfn + zone->zone_start_pfn; + if (!pfn_valid(pfn)) + continue; + page = pfn_to_page(pfn); + if (PageReserved(page)) + continue; + if (PageNosaveFree(page)) + continue; + n++; + } + } + return n; +} + struct highmem_page { char *data; struct page *page; @@ -152,17 +177,15 @@ static int saveable(struct zone *zone, unsigned long *zone_pfn) BUG_ON(PageReserved(page) && PageNosave(page)); if (PageNosave(page)) return 0; - if (PageReserved(page) && pfn_is_nosave(pfn)) { - pr_debug("[nosave pfn 0x%lx]", pfn); + if (PageReserved(page) && pfn_is_nosave(pfn)) return 0; - } if (PageNosaveFree(page)) return 0; return 1; } -static unsigned count_data_pages(void) +unsigned int count_data_pages(void) { struct zone *zone; unsigned long zone_pfn; @@ -266,6 +289,35 @@ static inline void create_pbe_list(struct pbe *pblist, unsigned int nr_pages) } } +/** + * On resume it is necessary to trace and eventually free the unsafe + * pages that have been allocated, because they are needed for I/O + * (on x86-64 we likely will "eat" these pages once again while + * creating the temporary page translation tables) + */ + +struct eaten_page { + struct eaten_page *next; + char padding[PAGE_SIZE - sizeof(void *)]; +}; + +static struct eaten_page *eaten_pages = NULL; + +void release_eaten_pages(void) +{ + struct eaten_page *p, *q; + + p = eaten_pages; + while (p) { + q = p->next; + /* We don't want swsusp_free() to free this page again */ + ClearPageNosave(virt_to_page(p)); + free_page((unsigned long)p); + p = q; + } + eaten_pages = NULL; +} + /** * @safe_needed - on resume, for storing the PBE list and the image, * we can only use memory pages that do not conflict with the pages @@ -284,9 +336,12 @@ static inline void *alloc_image_page(gfp_t gfp_mask, int safe_needed) if (safe_needed) do { res = (void *)get_zeroed_page(gfp_mask); - if (res && PageNosaveFree(virt_to_page(res))) + if (res && PageNosaveFree(virt_to_page(res))) { /* This is for swsusp_free() */ SetPageNosave(virt_to_page(res)); + ((struct eaten_page *)res)->next = eaten_pages; + eaten_pages = res; + } } while (res && PageNosaveFree(virt_to_page(res))); else res = (void *)get_zeroed_page(gfp_mask); diff --git a/kernel/power/swsusp.c b/kernel/power/swsusp.c index b09bd7c0998d..f77f9397a364 100644 --- a/kernel/power/swsusp.c +++ b/kernel/power/swsusp.c @@ -70,11 +70,13 @@ #include 
"power.h" #ifdef CONFIG_HIGHMEM +unsigned int count_highmem_pages(void); int save_highmem(void); int restore_highmem(void); #else static int save_highmem(void) { return 0; } static int restore_highmem(void) { return 0; } +static unsigned int count_highmem_pages(void) { return 0; } #endif extern char resume_file[]; @@ -611,6 +613,52 @@ int swsusp_write(struct pbe *pblist, unsigned int nr_pages) return error; } +/** + * swsusp_shrink_memory - Try to free as much memory as needed + * + * ... but do not OOM-kill anyone + * + * Notice: all userland should be stopped before it is called, or + * livelock is possible. + */ + +#define SHRINK_BITE 10000 + +int swsusp_shrink_memory(void) +{ + long tmp; + struct zone *zone; + unsigned long pages = 0; + unsigned int i = 0; + char *p = "-\\|/"; + + printk("Shrinking memory... "); + do { +#ifdef FAST_FREE + tmp = 2 * count_highmem_pages(); + tmp += tmp / 50 + count_data_pages(); + tmp += (tmp + PBES_PER_PAGE - 1) / PBES_PER_PAGE + + PAGES_FOR_IO; + for_each_zone (zone) + if (!is_highmem(zone)) + tmp -= zone->free_pages; + if (tmp > 0) { + tmp = shrink_all_memory(SHRINK_BITE); + if (!tmp) + return -ENOMEM; + pages += tmp; + } +#else + tmp = shrink_all_memory(SHRINK_BITE); + pages += tmp; +#endif + printk("\b%c", p[i++%4]); + } while (tmp > 0); + printk("\bdone (%lu pages freed)\n", pages); + + return 0; +} + int swsusp_suspend(void) { int error; @@ -1030,8 +1078,10 @@ static int read_suspend_image(struct pbe **pblist_ptr) /* Allocate memory for the image and read the data from swap */ if (!error) error = alloc_data_pages(pblist, GFP_ATOMIC, 1); - if (!error) + if (!error) { + release_eaten_pages(); error = load_image_data(pblist, &handle, nr_pages); + } if (!error) *pblist_ptr = pblist; } -- cgit v1.2.2 From e5e2fa7857f6bf46605c77d949fa6698b9b0bc28 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Fri, 6 Jan 2006 00:14:20 -0800 Subject: [PATCH] swsusp: fix enough_free_mem This patch fixes a problem with the function enough_free_mem() used by swsusp to verify if there is a sufficient number of memory pages available to it to create and save the suspend image. Namely, enough_free_mem() uses nr_free_pages() to obtain the number of free memory pages, which is incorrect, because this function returns the total number of free pages, including free highmem pages, and the highmem pages cannot be used by swsusp for storing the image data. The patch makes enough_free_mem() avoid counting the free highmem pages as available to swsusp. Signed-off-by: Rafael J. 
Wysocki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/power/snapshot.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index e80d282dbf58..41f66365f0d8 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -428,8 +428,14 @@ void swsusp_free(void) static int enough_free_mem(unsigned int nr_pages) { - pr_debug("swsusp: available memory: %u pages\n", nr_free_pages()); - return nr_free_pages() > (nr_pages + PAGES_FOR_IO + + struct zone *zone; + unsigned int n = 0; + + for_each_zone (zone) + if (!is_highmem(zone)) + n += zone->free_pages; + pr_debug("swsusp: available memory: %u pages\n", n); + return n > (nr_pages + PAGES_FOR_IO + (nr_pages + PBES_PER_PAGE - 1) / PBES_PER_PAGE); } -- cgit v1.2.2 From c050ca78705592d440c22055865bf4de40fe2a4c Mon Sep 17 00:00:00 2001 From: Pavel Machek Date: Fri, 6 Jan 2006 00:15:21 -0800 Subject: [PATCH] swsusp: Drop duplicate prototypes These two prototypes are already present in sched.h, remove duplicate version. Signed-off-by: Pavel Machek Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/power/power.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'kernel') diff --git a/kernel/power/power.h b/kernel/power/power.h index acdc83b3d890..e521e61e0d95 100644 --- a/kernel/power/power.h +++ b/kernel/power/power.h @@ -43,9 +43,6 @@ static struct subsys_attribute _name##_attr = { \ extern struct subsystem power_subsys; -extern int freeze_processes(void); -extern void thaw_processes(void); - extern int pm_prepare_console(void); extern void pm_restore_console(void); -- cgit v1.2.2 From b3a93a255ec33a04776ec50efb30b7a99168dda2 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Fri, 6 Jan 2006 00:15:22 -0800 Subject: [PATCH] swsusp: limit image size Limit the size of the suspend image to approx. 500 MB, which should improve the overall performance of swsusp on systems with more than 1 GB of RAM. It introduces the constant IMAGE_SIZE that can be set to the preferred size of the image (in MB) and modifies the memory-shrinking part of swsusp to take this constant into account (500 is the default value of IMAGE_SIZE). Signed-off-by: Rafael J. Wysocki Acked-by: Pavel Machek Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/power/power.h | 8 +++----- kernel/power/swsusp.c | 17 ++++++++--------- 2 files changed, 11 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/power/power.h b/kernel/power/power.h index e521e61e0d95..9b0459903613 100644 --- a/kernel/power/power.h +++ b/kernel/power/power.h @@ -53,12 +53,10 @@ extern unsigned int nr_copy_pages; extern struct pbe *pagedir_nosave; /* - * This compilation switch determines the way in which memory will be freed - * during suspend. If defined, only as much memory will be freed as needed - * to complete the suspend, which will make it go faster. Otherwise, the - * largest possible amount of memory will be freed. 
+ * Preferred image size in MB (set it to zero to get the smallest + * image possible) */ -#define FAST_FREE 1 +#define IMAGE_SIZE 500 extern asmlinkage int swsusp_arch_suspend(void); extern asmlinkage int swsusp_arch_resume(void); diff --git a/kernel/power/swsusp.c b/kernel/power/swsusp.c index f77f9397a364..6d5ceaf4c364 100644 --- a/kernel/power/swsusp.c +++ b/kernel/power/swsusp.c @@ -626,7 +626,7 @@ int swsusp_write(struct pbe *pblist, unsigned int nr_pages) int swsusp_shrink_memory(void) { - long tmp; + long size, tmp; struct zone *zone; unsigned long pages = 0; unsigned int i = 0; @@ -634,11 +634,11 @@ int swsusp_shrink_memory(void) printk("Shrinking memory... "); do { -#ifdef FAST_FREE - tmp = 2 * count_highmem_pages(); - tmp += tmp / 50 + count_data_pages(); - tmp += (tmp + PBES_PER_PAGE - 1) / PBES_PER_PAGE + + size = 2 * count_highmem_pages(); + size += size / 50 + count_data_pages(); + size += (size + PBES_PER_PAGE - 1) / PBES_PER_PAGE + PAGES_FOR_IO; + tmp = size; for_each_zone (zone) if (!is_highmem(zone)) tmp -= zone->free_pages; @@ -647,11 +647,10 @@ int swsusp_shrink_memory(void) if (!tmp) return -ENOMEM; pages += tmp; + } else if (size > (IMAGE_SIZE * 1024 * 1024) / PAGE_SIZE) { + tmp = shrink_all_memory(SHRINK_BITE); + pages += tmp; } -#else - tmp = shrink_all_memory(SHRINK_BITE); - pages += tmp; -#endif printk("\b%c", p[i++%4]); } while (tmp > 0); printk("\bdone (%lu pages freed)\n", pages); -- cgit v1.2.2 From ca0aec0f7a94bf9f07fefa8bfd23282d4e8ceb8a Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Fri, 6 Jan 2006 00:15:56 -0800 Subject: [PATCH] swsusp: make image size limit tunable Make the suspend image size limit tunable via /sys/power/image_size. It is necessary for systems on which there is a limited amount of swap available for suspend. It can also be useful for optimizing performance of swsusp on systems with 1 GB of RAM or more. Signed-off-by: Rafael J. 
Wysocki Acked-by: Pavel Machek Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/power/disk.c | 20 ++++++++++++++++++++ kernel/power/power.h | 7 ++----- kernel/power/swsusp.c | 10 +++++++++- 3 files changed, 31 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/power/disk.c b/kernel/power/disk.c index 9e51cdf7b78d..e24446f8d8cd 100644 --- a/kernel/power/disk.c +++ b/kernel/power/disk.c @@ -365,9 +365,29 @@ out: power_attr(resume); +static ssize_t image_size_show(struct subsystem * subsys, char *buf) +{ + return sprintf(buf, "%u\n", image_size); +} + +static ssize_t image_size_store(struct subsystem * subsys, const char * buf, size_t n) +{ + unsigned int size; + + if (sscanf(buf, "%u", &size) == 1) { + image_size = size; + return n; + } + + return -EINVAL; +} + +power_attr(image_size); + static struct attribute * g[] = { &disk_attr.attr, &resume_attr.attr, + &image_size_attr.attr, NULL, }; diff --git a/kernel/power/power.h b/kernel/power/power.h index 9b0459903613..273a5b1d70be 100644 --- a/kernel/power/power.h +++ b/kernel/power/power.h @@ -52,11 +52,8 @@ extern const void __nosave_begin, __nosave_end; extern unsigned int nr_copy_pages; extern struct pbe *pagedir_nosave; -/* - * Preferred image size in MB (set it to zero to get the smallest - * image possible) - */ -#define IMAGE_SIZE 500 +/* Preferred image size in MB (default 500) */ +extern unsigned int image_size; extern asmlinkage int swsusp_arch_suspend(void); extern asmlinkage int swsusp_arch_resume(void); diff --git a/kernel/power/swsusp.c b/kernel/power/swsusp.c index 6d5ceaf4c364..d760a6a719f0 100644 --- a/kernel/power/swsusp.c +++ b/kernel/power/swsusp.c @@ -69,6 +69,14 @@ #include "power.h" +/* + * Preferred image size in MB (tunable via /sys/power/image_size). + * When it is set to N, swsusp will do its best to ensure the image + * size will not exceed N MB, but if that is impossible, it will + * try to create the smallest image possible. + */ +unsigned int image_size = 500; + #ifdef CONFIG_HIGHMEM unsigned int count_highmem_pages(void); int save_highmem(void); @@ -647,7 +655,7 @@ int swsusp_shrink_memory(void) if (!tmp) return -ENOMEM; pages += tmp; - } else if (size > (IMAGE_SIZE * 1024 * 1024) / PAGE_SIZE) { + } else if (size > (image_size * 1024 * 1024) / PAGE_SIZE) { tmp = shrink_all_memory(SHRINK_BITE); pages += tmp; } -- cgit v1.2.2 From 1adf6c8ea916bc4a2587a881ec7715fece63fb5e Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Fri, 6 Jan 2006 00:17:16 -0800 Subject: [PATCH] swsusp: improve handling of swap partitions This changes the handling of swap partitions by swsusp to avoid locking of the swap devices that are not used for suspend and, consequently, simplifies the code. Signed-off-by: Rafael J. Wysocki Acked-by: Pavel Machek Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/power/swsusp.c | 128 ++++++++++++++------------------------------------ 1 file changed, 36 insertions(+), 92 deletions(-) (limited to 'kernel') diff --git a/kernel/power/swsusp.c b/kernel/power/swsusp.c index d760a6a719f0..0479c9be7d71 100644 --- a/kernel/power/swsusp.c +++ b/kernel/power/swsusp.c @@ -104,13 +104,7 @@ static struct swsusp_info swsusp_info; * Saving part... 
*/ -/* We memorize in swapfile_used what swap devices are used for suspension */ -#define SWAPFILE_UNUSED 0 -#define SWAPFILE_SUSPEND 1 /* This is the suspending device */ -#define SWAPFILE_IGNORED 2 /* Those are other swap devices ignored for suspension */ - -static unsigned short swapfile_used[MAX_SWAPFILES]; -static unsigned short root_swap; +static unsigned short root_swap = 0xffff; static int mark_swapfiles(swp_entry_t prev) { @@ -146,7 +140,7 @@ static int mark_swapfiles(swp_entry_t prev) * devfs, since the resume code can only recognize the form /dev/hda4, * but the suspend code would see the long name.) */ -static int is_resume_device(const struct swap_info_struct *swap_info) +static inline int is_resume_device(const struct swap_info_struct *swap_info) { struct file *file = swap_info->swap_file; struct inode *inode = file->f_dentry->d_inode; @@ -156,55 +150,23 @@ static int is_resume_device(const struct swap_info_struct *swap_info) } static int swsusp_swap_check(void) /* This is called before saving image */ -{ - int i, len; - - len=strlen(resume_file); - root_swap = 0xFFFF; - - spin_lock(&swap_lock); - for (i=0; i (nr_pages + PAGES_FOR_IO + + pr_debug("swsusp: free swap pages: %u\n", free_swap); + return free_swap > (nr_pages + PAGES_FOR_IO + (nr_pages + PBES_PER_PAGE - 1) / PBES_PER_PAGE); } /** - * write_suspend_image - Write entire image and metadata. + * swsusp_write - Write entire image and metadata. + * + * It is important _NOT_ to umount filesystems at this point. We want + * them synced (in case something goes wrong) but we DO not want to mark + * filesystem clean: it is not. (And it does not matter, if we resume + * correctly, we'll mark system clean, anyway.) */ -static int write_suspend_image(struct pbe *pblist, unsigned int nr_pages) + +int swsusp_write(struct pbe *pblist, unsigned int nr_pages) { struct swap_map_page *swap_map; struct swap_map_handle handle; int error; + if ((error = swsusp_swap_check())) { + printk(KERN_ERR "swsusp: Cannot find swap device, try swapon -a.\n"); + return error; + } if (!enough_swap(nr_pages)) { printk(KERN_ERR "swsusp: Not enough free swap\n"); return -ENOSPC; @@ -601,26 +565,6 @@ Free_image_entries: goto Free_swap_map; } -/* It is important _NOT_ to umount filesystems at this point. We want - * them synced (in case something goes wrong) but we DO not want to mark - * filesystem clean: it is not. (And it does not matter, if we resume - * correctly, we'll mark system clean, anyway.) - */ -int swsusp_write(struct pbe *pblist, unsigned int nr_pages) -{ - int error; - - if ((error = swsusp_swap_check())) { - printk(KERN_ERR "swsusp: cannot find swap device, try swapon -a.\n"); - return error; - } - lock_swapdevices(); - error = write_suspend_image(pblist, nr_pages); - /* This will unlock ignored swap devices since writing is finished */ - lock_swapdevices(); - return error; -} - /** * swsusp_shrink_memory - Try to free as much memory as needed * -- cgit v1.2.2 From 277c6e2ad7369558dbd7ffbcc6dcbe16458bf723 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Fri, 6 Jan 2006 00:17:58 -0800 Subject: [PATCH] swsusp: save image header first MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This makes the swsusp_info structure become the header of the image in the literal sense (ie. it is saved to the swap and read before any other image data with the help of the swsusp's swap map structure, so generally it is treated in the same way as the rest of the image). 
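[Editor's sketch, not part of the original changelog: the on-disk ordering this commit produces, with offsets simplified for illustration:

        swap page 0:       swsusp_header { ..., image -> first swap map page }
        swap map page(s):  swap offsets of all image pages, chained together
        first image page:  swsusp_info (the image header proper)
        remaining pages:   page metadata, then the image data itself
]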
The main thing it does is to make swsusp_header contain the offset of the swap map used to track the image data pages rather than the offset of swsusp_info.  Simultaneously, swsusp_info becomes the first image page written to the swap. The other changes are generally consequences of the above with a few exceptions (there's some consolidation in the image reading part as a few functions turn into trivial wrappers around something else). Signed-off-by: Rafael J. Wysocki Acked-by: Pavel Machek Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/power/power.h | 1 - kernel/power/swsusp.c | 190 +++++++++++++++++--------------------------------- 2 files changed, 65 insertions(+), 126 deletions(-) (limited to 'kernel') diff --git a/kernel/power/power.h b/kernel/power/power.h index 273a5b1d70be..7e8492fd1423 100644 --- a/kernel/power/power.h +++ b/kernel/power/power.h @@ -16,7 +16,6 @@ struct swsusp_info { int cpus; unsigned long image_pages; unsigned long pages; - swp_entry_t start; } __attribute__((aligned(PAGE_SIZE))); diff --git a/kernel/power/swsusp.c b/kernel/power/swsusp.c index 0479c9be7d71..55a18d26abed 100644 --- a/kernel/power/swsusp.c +++ b/kernel/power/swsusp.c @@ -93,7 +93,7 @@ extern char resume_file[]; static struct swsusp_header { char reserved[PAGE_SIZE - 20 - sizeof(swp_entry_t)]; - swp_entry_t swsusp_info; + swp_entry_t image; char orig_sig[10]; char sig[10]; } __attribute__((packed, aligned(PAGE_SIZE))) swsusp_header; @@ -106,7 +106,7 @@ static struct swsusp_info swsusp_info; static unsigned short root_swap = 0xffff; -static int mark_swapfiles(swp_entry_t prev) +static int mark_swapfiles(swp_entry_t start) { int error; @@ -117,7 +117,7 @@ static int mark_swapfiles(swp_entry_t prev) !memcmp("SWAPSPACE2",swsusp_header.sig, 10)) { memcpy(swsusp_header.orig_sig,swsusp_header.sig, 10); memcpy(swsusp_header.sig,SWSUSP_SIG, 10); - swsusp_header.swsusp_info = prev; + swsusp_header.image = start; error = rw_swap_page_sync(WRITE, swp_entry(root_swap, 0), virt_to_page((unsigned long) @@ -423,22 +423,7 @@ static void init_header(unsigned int nr_pages) swsusp_info.cpus = num_online_cpus(); swsusp_info.image_pages = nr_pages; swsusp_info.pages = nr_pages + - ((nr_pages * sizeof(long) + PAGE_SIZE - 1) >> PAGE_SHIFT); -} - -static int close_swap(void) -{ - swp_entry_t entry; - int error; - - dump_info(); - error = write_page((unsigned long)&swsusp_info, &entry); - if (!error) { - printk( "S" ); - error = mark_swapfiles(entry); - printk( "|\n" ); - } - return error; + ((nr_pages * sizeof(long) + PAGE_SIZE - 1) >> PAGE_SHIFT) + 1; } /** @@ -522,6 +507,7 @@ int swsusp_write(struct pbe *pblist, unsigned int nr_pages) { struct swap_map_page *swap_map; struct swap_map_handle handle; + swp_entry_t start; int error; if ((error = swsusp_swap_check())) { @@ -539,18 +525,23 @@ int swsusp_write(struct pbe *pblist, unsigned int nr_pages) return -ENOMEM; init_swap_map_handle(&handle, swap_map); - error = save_image_metadata(pblist, &handle); + error = swap_map_write_page(&handle, (unsigned long)&swsusp_info); + if (!error) + error = save_image_metadata(pblist, &handle); if (!error) error = save_image_data(pblist, &handle, nr_pages); if (error) goto Free_image_entries; swap_map = reverse_swap_map(swap_map); - error = save_swap_map(swap_map, &swsusp_info.start); + error = save_swap_map(swap_map, &start); if (error) goto Free_map_entries; - error = close_swap(); + dump_info(); + printk( "S" ); + error = mark_swapfiles(start); + printk( "|\n" ); if (error) goto Free_map_entries; @@ -840,70 
+831,28 @@ static inline int swap_map_read_page(struct swap_map_handle *handle, void *buf) return error; } -/* - * Sanity check if this image makes sense with this kernel/swap context - * I really don't think that it's foolproof but more than nothing.. - */ - -static const char *sanity_check(void) +static int check_header(void) { + char *reason = NULL; + dump_info(); if (swsusp_info.version_code != LINUX_VERSION_CODE) - return "kernel version"; + reason = "kernel version"; if (swsusp_info.num_physpages != num_physpages) - return "memory size"; + reason = "memory size"; if (strcmp(swsusp_info.uts.sysname,system_utsname.sysname)) - return "system type"; + reason = "system type"; if (strcmp(swsusp_info.uts.release,system_utsname.release)) - return "kernel release"; + reason = "kernel release"; if (strcmp(swsusp_info.uts.version,system_utsname.version)) - return "version"; + reason = "version"; if (strcmp(swsusp_info.uts.machine,system_utsname.machine)) - return "machine"; -#if 0 - /* We can't use number of online CPUs when we use hotplug to remove them ;-))) */ - if (swsusp_info.cpus != num_possible_cpus()) - return "number of cpus"; -#endif - return NULL; -} - -static int check_header(void) -{ - const char *reason = NULL; - int error; - - if ((error = bio_read_page(swp_offset(swsusp_header.swsusp_info), &swsusp_info))) - return error; - - /* Is this same machine? */ - if ((reason = sanity_check())) { - printk(KERN_ERR "swsusp: Resume mismatch: %s\n",reason); + reason = "machine"; + if (reason) { + printk(KERN_ERR "swsusp: Resume mismatch: %s\n", reason); return -EPERM; } - return error; -} - -static int check_sig(void) -{ - int error; - - memset(&swsusp_header, 0, sizeof(swsusp_header)); - if ((error = bio_read_page(0, &swsusp_header))) - return error; - if (!memcmp(SWSUSP_SIG, swsusp_header.sig, 10)) { - memcpy(swsusp_header.sig, swsusp_header.orig_sig, 10); - - /* - * Reset swap signature now. 
- */ - error = bio_write_page(0, &swsusp_header); - } else { - return -EINVAL; - } - if (!error) - pr_debug("swsusp: Signature found, resuming\n"); - return error; + return 0; } /** @@ -989,33 +938,29 @@ static int load_image_metadata(struct pbe *pblist, struct swap_map_handle *handl return error; } -static int check_suspend_image(void) -{ - int error = 0; - - if ((error = check_sig())) - return error; - - if ((error = check_header())) - return error; - - return 0; -} - -static int read_suspend_image(struct pbe **pblist_ptr) +int swsusp_read(struct pbe **pblist_ptr) { - int error = 0; + int error; struct pbe *p, *pblist; struct swap_map_handle handle; - unsigned int nr_pages = swsusp_info.image_pages; + unsigned int nr_pages; + if (IS_ERR(resume_bdev)) { + pr_debug("swsusp: block device not initialised\n"); + return PTR_ERR(resume_bdev); + } + + error = get_swap_map_reader(&handle, swsusp_header.image); + if (!error) + error = swap_map_read_page(&handle, &swsusp_info); + if (!error) + error = check_header(); + if (error) + return error; + nr_pages = swsusp_info.image_pages; p = alloc_pagedir(nr_pages, GFP_ATOMIC, 0); if (!p) return -ENOMEM; - error = get_swap_map_reader(&handle, swsusp_info.start); - if (error) - /* The PBE list at p will be released by swsusp_free() */ - return error; error = load_image_metadata(p, &handle); if (!error) { mark_unsafe_pages(p); @@ -1037,11 +982,18 @@ static int read_suspend_image(struct pbe **pblist_ptr) *pblist_ptr = pblist; } release_swap_map_reader(&handle); + + blkdev_put(resume_bdev); + + if (!error) + pr_debug("swsusp: Reading resume file was successful\n"); + else + pr_debug("swsusp: Error %d resuming\n", error); return error; } /** - * swsusp_check - Check for saved image in swap + * swsusp_check - Check for swsusp signature in the resume device */ int swsusp_check(void) @@ -1051,39 +1003,27 @@ int swsusp_check(void) resume_bdev = open_by_devnum(swsusp_resume_device, FMODE_READ); if (!IS_ERR(resume_bdev)) { set_blocksize(resume_bdev, PAGE_SIZE); - error = check_suspend_image(); + memset(&swsusp_header, 0, sizeof(swsusp_header)); + if ((error = bio_read_page(0, &swsusp_header))) + return error; + if (!memcmp(SWSUSP_SIG, swsusp_header.sig, 10)) { + memcpy(swsusp_header.sig, swsusp_header.orig_sig, 10); + /* Reset swap signature now */ + error = bio_write_page(0, &swsusp_header); + } else { + return -EINVAL; + } if (error) - blkdev_put(resume_bdev); - } else + blkdev_put(resume_bdev); + else + pr_debug("swsusp: Signature found, resuming\n"); + } else { error = PTR_ERR(resume_bdev); - - if (!error) - pr_debug("swsusp: resume file found\n"); - else - pr_debug("swsusp: Error %d check for resume file\n", error); - return error; -} - -/** - * swsusp_read - Read saved image from swap. 
- */ - -int swsusp_read(struct pbe **pblist_ptr) -{ - int error; - - if (IS_ERR(resume_bdev)) { - pr_debug("swsusp: block device not initialised\n"); - return PTR_ERR(resume_bdev); } - error = read_suspend_image(pblist_ptr); - blkdev_put(resume_bdev); + if (error) + pr_debug("swsusp: Error %d check for resume file\n", error); - if (!error) - pr_debug("swsusp: Reading resume file was successful\n"); - else - pr_debug("swsusp: Error %d resuming\n", error); return error; } -- cgit v1.2.2 From 089545f0c71bab6511395c2a060d7f81a99bad58 Mon Sep 17 00:00:00 2001 From: Martin Schwidefsky Date: Fri, 6 Jan 2006 00:19:12 -0800 Subject: [PATCH] s390: cputime_t fixes There are some more places where the use of cputime_t instead of an integer type and the associated macros is necessary for the virtual cputime accounting on s390. Affected are the s390 specific appldata code and BSD process accounting. Signed-off-by: Martin Schwidefsky Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/acct.c | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/acct.c b/kernel/acct.c index 6312d6bd43e3..38d57fa6b78f 100644 --- a/kernel/acct.c +++ b/kernel/acct.c @@ -427,6 +427,7 @@ static void do_acct_process(long exitcode, struct file *file) u64 elapsed; u64 run_time; struct timespec uptime; + unsigned long jiffies; /* * First check to see if there is enough free_space to continue @@ -467,12 +468,12 @@ static void do_acct_process(long exitcode, struct file *file) #endif do_div(elapsed, AHZ); ac.ac_btime = xtime.tv_sec - elapsed; - ac.ac_utime = encode_comp_t(jiffies_to_AHZ( - current->signal->utime + - current->group_leader->utime)); - ac.ac_stime = encode_comp_t(jiffies_to_AHZ( - current->signal->stime + - current->group_leader->stime)); + jiffies = cputime_to_jiffies(cputime_add(current->group_leader->utime, + current->signal->utime)); + ac.ac_utime = encode_comp_t(jiffies_to_AHZ(jiffies)); + jiffies = cputime_to_jiffies(cputime_add(current->group_leader->stime, + current->signal->stime)); + ac.ac_stime = encode_comp_t(jiffies_to_AHZ(jiffies)); /* we really need to bite the bullet and change layout */ ac.ac_uid = current->uid; ac.ac_gid = current->gid; @@ -580,7 +581,8 @@ void acct_process(long exitcode) void acct_update_integrals(struct task_struct *tsk) { if (likely(tsk->mm)) { - long delta = tsk->stime - tsk->acct_stimexpd; + long delta = + cputime_to_jiffies(tsk->stime) - tsk->acct_stimexpd; if (delta == 0) return; -- cgit v1.2.2 From 347a8dc3b815f0c0fa62a1df075184ffe4cbdcf1 Mon Sep 17 00:00:00 2001 From: Martin Schwidefsky Date: Fri, 6 Jan 2006 00:19:28 -0800 Subject: [PATCH] s390: cleanup Kconfig Sanitize some s390 Kconfig options. We have ARCH_S390, ARCH_S390X, ARCH_S390_31, 64BIT, S390_SUPPORT and COMPAT. Replace these 6 options by S390, 64BIT and COMPAT. Signed-off-by: Martin Schwidefsky Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/panic.c | 4 ++-- kernel/sysctl.c | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/panic.c b/kernel/panic.c index aabc5f86fa3f..c5c4ab255834 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -60,7 +60,7 @@ NORET_TYPE void panic(const char * fmt, ...) long i; static char buf[1024]; va_list args; -#if defined(CONFIG_ARCH_S390) +#if defined(CONFIG_S390) unsigned long caller = (unsigned long) __builtin_return_address(0); #endif @@ -125,7 +125,7 @@ NORET_TYPE void panic(const char * fmt, ...) 
printk(KERN_EMERG "Press Stop-A (L1-A) to return to the boot prom\n"); } #endif -#if defined(CONFIG_ARCH_S390) +#if defined(CONFIG_S390) disabled_wait(caller); #endif local_irq_enable(); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 345f4a1d533f..a85047bb5739 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -108,7 +108,7 @@ extern int pwrsw_enabled; extern int unaligned_enabled; #endif -#ifdef CONFIG_ARCH_S390 +#ifdef CONFIG_S390 #ifdef CONFIG_MATHEMU extern int sysctl_ieee_emulation_warnings; #endif @@ -542,7 +542,7 @@ static ctl_table kern_table[] = { .extra1 = &minolduid, .extra2 = &maxolduid, }, -#ifdef CONFIG_ARCH_S390 +#ifdef CONFIG_S390 #ifdef CONFIG_MATHEMU { .ctl_name = KERN_IEEE_EMULATION_WARNINGS, @@ -644,7 +644,7 @@ static ctl_table kern_table[] = { .mode = 0644, .proc_handler = &proc_dointvec, }, -#if defined(CONFIG_ARCH_S390) +#if defined(CONFIG_S390) { .ctl_name = KERN_SPIN_RETRY, .procname = "spin_retry", -- cgit v1.2.2 From 6fe2e70bbed3995d930f39452fb6ce3be7dc47dc Mon Sep 17 00:00:00 2001 From: Jayachandran C Date: Fri, 6 Jan 2006 00:19:54 -0800 Subject: [PATCH] kernel/module.c: removed dead code This patch fixes an issue reported by Coverity in kernel/module.c Error reported: Cannot reach this line of code "else return ptr;" Patch description: This is the error path, so 'err' will be negative, the else case is not required, this patch removes it. Signed-off-by: Jayachandran C. Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/module.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index 2ea929d51ad0..4b06bbad49c2 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -1854,8 +1854,7 @@ static struct module *load_module(void __user *umod, kfree(args); free_hdr: vfree(hdr); - if (err < 0) return ERR_PTR(err); - else return ptr; + return ERR_PTR(err); truncated: printk(KERN_ERR "Module len %lu truncated\n", len); -- cgit v1.2.2 From 0aec63e67c69545ca757a73a66f5dcf05fa484bf Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Fri, 6 Jan 2006 15:36:48 -0800 Subject: [PATCH] Fix posix-cpu-timers sched_time accumulation I've spent the past 3 days digging into a glibc testsuite failure in current CVS, specifically libc/rt/tst-cputimer1.c The thr1 and thr2 timers fire too early in the second pass of this test. The second pass is noteworthy because it makes use of intervals, whereas the first pass does not. All throughout the posix-cpu-timers.c code, the calculation of the process sched_time sum is implemented roughly as: unsigned long long sum; sum = tsk->signal->sched_time; t = tsk; do { sum += t->sched_time; t = next_thread(t); } while (t != tsk); In fact this is the exact scheme used by check_process_timers(). In the case of check_process_timers(), current->sched_time has just been updated (via scheduler_tick(), which is invoked by update_process_times(), which subsequently invokes run_posix_cpu_timers()) So there is no special processing necessary wrt. that. In other contexts, we have to allot for the fact that tsk->sched_time might be a bit out of date if we are current. And the posix-cpu-timers.c code uses current_sched_time() to deal with that. Unfortunately it does so in an erroneous and inconsistent manner in one spot which is what results in the early timer firing. In cpu_clock_sample_group_locked(), it does this: cpu->sched = p->signal->sched_time; /* Add in each other live thread. 
*/ while ((t = next_thread(t)) != p) { cpu->sched += t->sched_time; } if (p->tgid == current->tgid) { /* * We're sampling ourselves, so include the * cycles not yet banked. We still omit * other threads running on other CPUs, * so the total can always be behind as * much as max(nthreads-1,ncpus) * (NSEC_PER_SEC/HZ). */ cpu->sched += current_sched_time(current); } else { cpu->sched += p->sched_time; } The problem is the "p->tgid == current->tgid" test. If "p" is not current, and the tgids are the same, we will add the process t->sched_time twice into cpu->sched and omit "p"'s sched_time which is very very very wrong. posix-cpu-timers.c has a helper function, sched_ns(p) which takes care of this, so my fix is to use that here instead of this special tgid test. The fact that current can be one of the sub-threads of "p" points out that we could make things a little bit more accurate, perhaps by using sched_ns() on every thread we process in these loops. It also points out that we don't use the most accurate value for threads in the group actively running other cpus (and this is mentioned in the comment). But that is a future enhancement, and this fix here definitely makes sense. Signed-off-by: David S. Miller Signed-off-by: Linus Torvalds --- kernel/posix-cpu-timers.c | 13 +------------ 1 file changed, 1 insertion(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c index cae4f5728997..4c68edff900b 100644 --- a/kernel/posix-cpu-timers.c +++ b/kernel/posix-cpu-timers.c @@ -238,18 +238,7 @@ static int cpu_clock_sample_group_locked(unsigned int clock_idx, while ((t = next_thread(t)) != p) { cpu->sched += t->sched_time; } - if (p->tgid == current->tgid) { - /* - * We're sampling ourselves, so include the - * cycles not yet banked. We still omit - * other threads running on other CPUs, - * so the total can always be behind as - * much as max(nthreads-1,ncpus) * (NSEC_PER_SEC/HZ). - */ - cpu->sched += current_sched_time(current); - } else { - cpu->sched += p->sched_time; - } + cpu->sched += sched_ns(p); break; } return 0; -- cgit v1.2.2 From 67207b9664a8d603138ef1556141e6d0a102bea7 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Tue, 15 Nov 2005 15:53:48 -0500 Subject: [PATCH] spufs: The SPU file system, base This is the current version of the spu file system, used for driving SPEs on the Cell Broadband Engine. This release is almost identical to the version for the 2.6.14 kernel posted earlier, which is available as part of the Cell BE Linux distribution from http://www.bsc.es/projects/deepcomputing/linuxoncell/. The first patch provides all the interfaces for running spu application, but does not have any support for debugging SPU tasks or for scheduling. Both these functionalities are added in the subsequent patches. See Documentation/filesystems/spufs.txt on how to use spufs. 
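A minimal userspace sketch of the two syscalls (not part of the original patch; it assumes spufs is mounted at /spu, a powerpc kernel providing __NR_spu_create/__NR_spu_run, and a context name made up for illustration):

        #include <sys/syscall.h>
        #include <unistd.h>

        int main(void)
        {
                unsigned int npc = 0, status = 0;
                /* spu_create() returns an open fd for the new context */
                int ctx = syscall(__NR_spu_create, "/spu/myctx", 0, 0755);

                if (ctx < 0)
                        return 1;
                /* A real caller would first load SPU code through the
                 * context's "mem" file; spu_run() then runs the SPU
                 * until it stops and reports npc/status. */
                syscall(__NR_spu_run, ctx, &npc, &status);
                return 0;
        }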
Signed-off-by: Arnd Bergmann Signed-off-by: Paul Mackerras --- kernel/sys_ni.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index 1ab2370e2efa..d4739a475d23 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -90,3 +90,5 @@ cond_syscall(sys_pciconfig_iobase); cond_syscall(sys32_ipc); cond_syscall(sys32_sysctl); cond_syscall(ppc_rtas); +cond_syscall(sys_spu_run); +cond_syscall(sys_spu_create); -- cgit v1.2.2 From 54c32021eb6feafc32e90104e960b38301521b7b Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Sun, 4 Dec 2005 18:39:51 +1100 Subject: [PATCH] powerpc: Add arch-dependent copy_oldmem_page Signed-off-by: Haren Myneni Signed-off-by: Michael Ellerman Signed-off-by: Paul Mackerras --- kernel/crash_dump.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/crash_dump.c b/kernel/crash_dump.c index 334c37f5218a..fccb27dbc623 100644 --- a/kernel/crash_dump.c +++ b/kernel/crash_dump.c @@ -14,10 +14,12 @@ #include #include +#include /* Stores the physical address of elf header of crash image. */ unsigned long long elfcorehdr_addr = ELFCORE_ADDR_MAX; +#ifndef HAVE_ARCH_COPY_OLDMEM_PAGE /** * copy_oldmem_page - copy one page from "oldmem" * @pfn: page frame number to be copied @@ -59,3 +61,4 @@ ssize_t copy_oldmem_page(unsigned long pfn, char *buf, kfree(page); return csize; } +#endif -- cgit v1.2.2 From 9d0243bca345d5ce25d3f4b74b7facb3a6df1232 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Sun, 8 Jan 2006 01:00:39 -0800 Subject: [PATCH] drop-pagecache Add /proc/sys/vm/drop_caches. When written to, this will cause the kernel to discard as much pagecache and/or reclaimable slab objects as it can. This operation requires root permissions. It won't drop dirty data, so the user should run `sync' first. Caveats: a) Holds inode_lock for exorbitant amounts of time. b) Needs to be taught about NUMA nodes: propagate these all the way through so the discarding can be controlled on a per-node basis. This is a debugging feature: useful for getting consistent results between filesystem benchmarks. We could possibly put it under a config option, but it's less than 300 bytes. Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sysctl.c | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index a85047bb5739..8dcf6fd5b0f9 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -68,6 +68,7 @@ extern int min_free_kbytes; extern int printk_ratelimit_jiffies; extern int printk_ratelimit_burst; extern int pid_max_min, pid_max_max; +extern int sysctl_drop_caches; #if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86) int unknown_nmi_panic; @@ -774,6 +775,15 @@ static ctl_table vm_table[] = { .proc_handler = &lowmem_reserve_ratio_sysctl_handler, .strategy = &sysctl_intvec, }, + { + .ctl_name = VM_DROP_PAGECACHE, + .procname = "drop_caches", + .data = &sysctl_drop_caches, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = drop_caches_sysctl_handler, + .strategy = &sysctl_intvec, + }, { .ctl_name = VM_MIN_FREE_KBYTES, .procname = "min_free_kbytes", -- cgit v1.2.2 From 8ad4b1fb8205340dba16b63467bb23efc27264d6 Mon Sep 17 00:00:00 2001 From: Rohit Seth Date: Sun, 8 Jan 2006 01:00:40 -0800 Subject: [PATCH] Make high and batch sizes of per_cpu_pagelists configurable As recently there has been a lot of traffic on the right values for batch and high water marks for per_cpu_pagelists.
This patch makes these two variables configurable through a /proc interface. A new tunable /proc/sys/vm/percpu_pagelist_fraction is added. This entry controls the maximum fraction of pages in each zone that may be allocated for any single per-cpu page list. The min value for this is 8. It means that we don't allow more than 1/8th of pages in each zone to be allocated in any single per_cpu_pagelist. The batch value of each per cpu pagelist is also updated as a result. It is set to pcp->high/4. The upper limit of batch is (PAGE_SHIFT * 8). Signed-off-by: Rohit Seth Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sysctl.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 8dcf6fd5b0f9..03b0598f2369 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -69,6 +69,7 @@ extern int printk_ratelimit_jiffies; extern int printk_ratelimit_burst; extern int pid_max_min, pid_max_max; extern int sysctl_drop_caches; +extern int percpu_pagelist_fraction; #if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86) int unknown_nmi_panic; @@ -79,6 +80,7 @@ extern int proc_unknown_nmi_panic(ctl_table *, int, struct file *, /* this is needed for the proc_dointvec_minmax for [fs_]overflow UID and GID */ static int maxolduid = 65535; static int minolduid; +static int min_percpu_pagelist_fract = 8; static int ngroups_max = NGROUPS_MAX; @@ -794,6 +796,16 @@ static ctl_table vm_table[] = { .strategy = &sysctl_intvec, .extra1 = &zero, }, + { + .ctl_name = VM_PERCPU_PAGELIST_FRACTION, + .procname = "percpu_pagelist_fraction", + .data = &percpu_pagelist_fraction, + .maxlen = sizeof(percpu_pagelist_fraction), + .mode = 0644, + .proc_handler = &percpu_pagelist_fraction_sysctl_handler, + .strategy = &sysctl_intvec, + .extra1 = &min_percpu_pagelist_fract, + }, #ifdef CONFIG_MMU { .ctl_name = VM_MAX_MAP_COUNT, -- cgit v1.2.2 From 15316ba81aee6775d6079fb46c66c801989e7d10 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Sun, 8 Jan 2006 01:00:43 -0800 Subject: [PATCH] add schedule_on_each_cpu() swap migration's isolate_lru_page() currently uses an IPI to notify other processors that the lru caches need to be drained if the page cannot be found on the LRU. The IPI interrupt may interrupt a processor that is just processing lru requests and cause a race condition. This patch introduces a new function schedule_on_each_cpu() that uses keventd to run the LRU draining on each processor. Processors disable preemption when dealing with the LRU caches (these are per processor) and thus executing LRU draining from another process is safe. Thanks to Lee Schermerhorn for finding this race condition.
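A kernel-side usage sketch matching the signature added below (the callback and wrapper names are hypothetical, not from the patch):

        /* func runs in keventd (process) context on every online CPU,
         * so it may safely disable preemption around per-CPU data. */
        static void drain_local_lru(void *info)
        {
                /* drain this CPU's LRU pagevecs here */
        }

        static int drain_all_lru(void)
        {
                /* 0 on success, -ENOMEM if the work array allocation fails */
                return schedule_on_each_cpu(drain_local_lru, NULL);
        }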
Signed-off-by: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/workqueue.c | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 2bd5aee1c736..62d47220696a 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -419,6 +419,25 @@ int schedule_delayed_work_on(int cpu, return ret; } +int schedule_on_each_cpu(void (*func) (void *info), void *info) +{ + int cpu; + struct work_struct *work; + + work = kmalloc(NR_CPUS * sizeof(struct work_struct), GFP_KERNEL); + + if (!work) + return -ENOMEM; + for_each_online_cpu(cpu) { + INIT_WORK(work + cpu, func, info); + __queue_work(per_cpu_ptr(keventd_wq->cpu_wq, cpu), + work + cpu); + } + flush_workqueue(keventd_wq); + kfree(work); + return 0; +} + void flush_scheduled_work(void) { flush_workqueue(keventd_wq); -- cgit v1.2.2 From 39743889aaf76725152f16aa90ca3c45f6d52da3 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Sun, 8 Jan 2006 01:00:51 -0800 Subject: [PATCH] Swap Migration V5: sys_migrate_pages interface sys_migrate_pages implementation using swap based page migration. This is the original API proposed by Ray Bryant in his posts during the first half of 2005 on linux-mm@kvack.org and linux-kernel@vger.kernel.org. The intent of sys_migrate_pages is to migrate memory of a process. A process may have migrated to another node. Memory was allocated optimally for the prior context. sys_migrate_pages allows shifting the memory to the new node. sys_migrate_pages is also useful if the process's available memory nodes have changed through cpuset operations, to manually move the process's memory. Paul Jackson is working on an automated mechanism that will allow an automatic migration if the cpuset of a process is changed. However, a user may decide to manually control the migration. This implementation is put into the policy layer since it uses concepts and functions that are also needed for mbind and friends. The patch also provides a do_migrate_pages function that may be useful for cpusets to automatically move memory. sys_migrate_pages does not modify policies in contrast to Ray's implementation. The current code here is based on the swap based page migration capability and thus is not able to preserve the physical layout relative to its containing nodeset (which may be a cpuset). When direct page migration becomes available then the implementation needs to be changed to do an isomorphic move of pages between different nodesets. The current implementation simply evicts all pages in the source nodeset that are not in the target nodeset. Patch supports ia64, i386 and x86_64.
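A userspace sketch of the new call (illustration only; it is invoked via syscall(2) since no libc wrapper exists yet, and the node numbers are made up):

        #include <sys/types.h>
        #include <sys/syscall.h>
        #include <unistd.h>

        /* Move the pages of process pid from node 0 to node 1. The masks
         * are bitmaps of node numbers; maxnode bounds the bits examined. */
        long migrate_to_node1(pid_t pid)
        {
                unsigned long old_nodes = 1UL << 0;
                unsigned long new_nodes = 1UL << 1;

                return syscall(__NR_migrate_pages, pid,
                               sizeof(old_nodes) * 8, &old_nodes, &new_nodes);
        }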
Signed-off-by: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sys_ni.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index 1ab2370e2efa..7a8bc7f60d91 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -82,6 +82,7 @@ cond_syscall(compat_sys_socketcall); cond_syscall(sys_inotify_init); cond_syscall(sys_inotify_add_watch); cond_syscall(sys_inotify_rm_watch); +cond_syscall(sys_migrate_pages); /* arch-specific weak syscall entries */ cond_syscall(sys_pciconfig_read); -- cgit v1.2.2 From 45b07ef31d1182d2cfde7711327e3afb268bb1ac Mon Sep 17 00:00:00 2001 From: Paul Jackson Date: Sun, 8 Jan 2006 01:00:56 -0800 Subject: [PATCH] cpusets: swap migration interface Add a boolean "memory_migrate" to each cpuset, represented by a file containing "0" or "1" in each directory below /dev/cpuset. It defaults to false (file contains "0"). It can be set true by writing "1" to the file. If true, then anytime that a task is attached to the cpuset so marked, the pages of that task will be moved to that cpuset, preserving, to the extent practical, the cpuset-relative placement of the pages. Also anytime that a cpuset so marked has its memory placement changed (by writing to its "mems" file), the tasks in that cpuset will have their pages moved to the cpusets new nodes, preserving, to the extent practical, the cpuset-relative placement of the moved pages. Signed-off-by: Paul Jackson Cc: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cpuset.c | 38 ++++++++++++++++++++++++++++++++++++-- 1 file changed, 36 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 7430640f9816..f63383e01ec7 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -87,6 +87,7 @@ struct cpuset { typedef enum { CS_CPU_EXCLUSIVE, CS_MEM_EXCLUSIVE, + CS_MEMORY_MIGRATE, CS_REMOVED, CS_NOTIFY_ON_RELEASE } cpuset_flagbits_t; @@ -112,6 +113,11 @@ static inline int notify_on_release(const struct cpuset *cs) return !!test_bit(CS_NOTIFY_ON_RELEASE, &cs->flags); } +static inline int is_memory_migrate(const struct cpuset *cs) +{ + return !!test_bit(CS_MEMORY_MIGRATE, &cs->flags); +} + /* * Increment this atomic integer everytime any cpuset changes its * mems_allowed value. 
Users of cpusets can track this generation @@ -602,16 +608,24 @@ static void refresh_mems(void) if (current->cpuset_mems_generation != my_cpusets_mem_gen) { struct cpuset *cs; nodemask_t oldmem = current->mems_allowed; + int migrate; down(&callback_sem); task_lock(current); cs = current->cpuset; + migrate = is_memory_migrate(cs); guarantee_online_mems(cs, ¤t->mems_allowed); current->cpuset_mems_generation = cs->mems_generation; task_unlock(current); up(&callback_sem); - if (!nodes_equal(oldmem, current->mems_allowed)) + if (!nodes_equal(oldmem, current->mems_allowed)) { numa_policy_rebind(&oldmem, ¤t->mems_allowed); + if (migrate) { + do_migrate_pages(current->mm, &oldmem, + ¤t->mems_allowed, + MPOL_MF_MOVE_ALL); + } + } } } @@ -795,7 +809,7 @@ static int update_nodemask(struct cpuset *cs, char *buf) /* * update_flag - read a 0 or a 1 in a file and update associated flag * bit: the bit to update (CS_CPU_EXCLUSIVE, CS_MEM_EXCLUSIVE, - * CS_NOTIFY_ON_RELEASE) + * CS_NOTIFY_ON_RELEASE, CS_MEMORY_MIGRATE) * cs: the cpuset to update * buf: the buffer where we read the 0 or 1 * @@ -848,6 +862,7 @@ static int attach_task(struct cpuset *cs, char *pidbuf, char **ppathbuf) struct task_struct *tsk; struct cpuset *oldcs; cpumask_t cpus; + nodemask_t from, to; if (sscanf(pidbuf, "%d", &pid) != 1) return -EIO; @@ -893,7 +908,12 @@ static int attach_task(struct cpuset *cs, char *pidbuf, char **ppathbuf) guarantee_online_cpus(cs, &cpus); set_cpus_allowed(tsk, cpus); + from = oldcs->mems_allowed; + to = cs->mems_allowed; + up(&callback_sem); + if (is_memory_migrate(cs)) + do_migrate_pages(tsk->mm, &from, &to, MPOL_MF_MOVE_ALL); put_task_struct(tsk); if (atomic_dec_and_test(&oldcs->count)) check_for_release(oldcs, ppathbuf); @@ -905,6 +925,7 @@ static int attach_task(struct cpuset *cs, char *pidbuf, char **ppathbuf) typedef enum { FILE_ROOT, FILE_DIR, + FILE_MEMORY_MIGRATE, FILE_CPULIST, FILE_MEMLIST, FILE_CPU_EXCLUSIVE, @@ -960,6 +981,9 @@ static ssize_t cpuset_common_file_write(struct file *file, const char __user *us case FILE_NOTIFY_ON_RELEASE: retval = update_flag(CS_NOTIFY_ON_RELEASE, cs, buffer); break; + case FILE_MEMORY_MIGRATE: + retval = update_flag(CS_MEMORY_MIGRATE, cs, buffer); + break; case FILE_TASKLIST: retval = attach_task(cs, buffer, &pathbuf); break; @@ -1060,6 +1084,9 @@ static ssize_t cpuset_common_file_read(struct file *file, char __user *buf, case FILE_NOTIFY_ON_RELEASE: *s++ = notify_on_release(cs) ? '1' : '0'; break; + case FILE_MEMORY_MIGRATE: + *s++ = is_memory_migrate(cs) ? '1' : '0'; + break; default: retval = -EINVAL; goto out; @@ -1408,6 +1435,11 @@ static struct cftype cft_notify_on_release = { .private = FILE_NOTIFY_ON_RELEASE, }; +static struct cftype cft_memory_migrate = { + .name = "memory_migrate", + .private = FILE_MEMORY_MIGRATE, +}; + static int cpuset_populate_dir(struct dentry *cs_dentry) { int err; @@ -1422,6 +1454,8 @@ static int cpuset_populate_dir(struct dentry *cs_dentry) return err; if ((err = cpuset_add_file(cs_dentry, &cft_notify_on_release)) < 0) return err; + if ((err = cpuset_add_file(cs_dentry, &cft_memory_migrate)) < 0) + return err; if ((err = cpuset_add_file(cs_dentry, &cft_tasks)) < 0) return err; return 0; -- cgit v1.2.2 From 22fc6eccbf4ce4eb6265e6ada7b50a7b9cc57d05 Mon Sep 17 00:00:00 2001 From: Ravikiran G Thirumalai Date: Sun, 8 Jan 2006 01:01:27 -0800 Subject: [PATCH] Change maxaligned_in_smp alignemnt macros to internodealigned_in_smp macros ____cacheline_maxaligned_in_smp is currently used to align critical structures and avoid false sharing. 
It uses per-arch L1_CACHE_SHIFT_MAX and people find L1_CACHE_SHIFT_MAX useless. However, we have been using ____cacheline_maxaligned_in_smp to align structures on the internode cacheline size. As per Andi's suggestion, following patch kills ____cacheline_maxaligned_in_smp and introduces INTERNODE_CACHE_SHIFT, which defaults to L1_CACHE_SHIFT for all arches. Arches needing L3/Internode cacheline alignment can define INTERNODE_CACHE_SHIFT in the arch asm/cache.h. Patch replaces ____cacheline_maxaligned_in_smp with ____cacheline_internodealigned_in_smp With this patch, L1_CACHE_SHIFT_MAX can be killed Signed-off-by: Ravikiran Thirumalai Signed-off-by: Shai Fultheim Signed-off-by: Andi Kleen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/rcupdate.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c index 48d3bce465b8..c9afc61240e4 100644 --- a/kernel/rcupdate.c +++ b/kernel/rcupdate.c @@ -61,9 +61,9 @@ struct rcu_state { /* for current batch to proceed. */ }; -static struct rcu_state rcu_state ____cacheline_maxaligned_in_smp = +static struct rcu_state rcu_state ____cacheline_internodealigned_in_smp = {.lock = SPIN_LOCK_UNLOCKED, .cpumask = CPU_MASK_NONE }; -static struct rcu_state rcu_bh_state ____cacheline_maxaligned_in_smp = +static struct rcu_state rcu_bh_state ____cacheline_internodealigned_in_smp = {.lock = SPIN_LOCK_UNLOCKED, .cpumask = CPU_MASK_NONE }; DEFINE_PER_CPU(struct rcu_data, rcu_data) = { 0L }; -- cgit v1.2.2 From e56d090310d7625ecb43a1eeebd479f04affb48b Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sun, 8 Jan 2006 01:01:37 -0800 Subject: [PATCH] RCU signal handling RCU tasklist_lock and RCU signal handling: send signals RCU-read-locked instead of tasklist_lock read-locked. This is a scalability improvement on SMP and a preemption-latency improvement under PREEMPT_RCU. Signed-off-by: Paul E. 
McKenney Signed-off-by: Ingo Molnar Acked-by: William Irwin Cc: Roland McGrath Cc: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/exit.c | 1 - kernel/fork.c | 10 +++++- kernel/pid.c | 22 ++++++------- kernel/rcupdate.c | 1 + kernel/sched.c | 7 ++++ kernel/signal.c | 97 +++++++++++++++++++++++++++++++++++++++++++++++-------- 6 files changed, 111 insertions(+), 27 deletions(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index ee515683b92d..c73a7eb26de3 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -72,7 +72,6 @@ repeat: __ptrace_unlink(p); BUG_ON(!list_empty(&p->ptrace_list) || !list_empty(&p->ptrace_children)); __exit_signal(p); - __exit_sighand(p); /* * Note that the fastpath in sys_times depends on __exit_signal having * updated the counters before a task is removed from the tasklist of diff --git a/kernel/fork.c b/kernel/fork.c index fb8572a42297..7fe3adfa65cb 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -743,6 +743,14 @@ int unshare_files(void) EXPORT_SYMBOL(unshare_files); +void sighand_free_cb(struct rcu_head *rhp) +{ + struct sighand_struct *sp; + + sp = container_of(rhp, struct sighand_struct, rcu); + kmem_cache_free(sighand_cachep, sp); +} + static inline int copy_sighand(unsigned long clone_flags, struct task_struct * tsk) { struct sighand_struct *sig; @@ -752,7 +760,7 @@ static inline int copy_sighand(unsigned long clone_flags, struct task_struct * t return 0; } sig = kmem_cache_alloc(sighand_cachep, GFP_KERNEL); - tsk->sighand = sig; + rcu_assign_pointer(tsk->sighand, sig); if (!sig) return -ENOMEM; spin_lock_init(&sig->siglock); diff --git a/kernel/pid.c b/kernel/pid.c index edba31c681ac..1acc07246991 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -136,7 +136,7 @@ struct pid * fastcall find_pid(enum pid_type type, int nr) struct hlist_node *elem; struct pid *pid; - hlist_for_each_entry(pid, elem, + hlist_for_each_entry_rcu(pid, elem, &pid_hash[type][pid_hashfn(nr)], pid_chain) { if (pid->nr == nr) return pid; @@ -150,15 +150,15 @@ int fastcall attach_pid(task_t *task, enum pid_type type, int nr) task_pid = &task->pids[type]; pid = find_pid(type, nr); + task_pid->nr = nr; if (pid == NULL) { - hlist_add_head(&task_pid->pid_chain, - &pid_hash[type][pid_hashfn(nr)]); INIT_LIST_HEAD(&task_pid->pid_list); + hlist_add_head_rcu(&task_pid->pid_chain, + &pid_hash[type][pid_hashfn(nr)]); } else { INIT_HLIST_NODE(&task_pid->pid_chain); - list_add_tail(&task_pid->pid_list, &pid->pid_list); + list_add_tail_rcu(&task_pid->pid_list, &pid->pid_list); } - task_pid->nr = nr; return 0; } @@ -170,20 +170,20 @@ static fastcall int __detach_pid(task_t *task, enum pid_type type) pid = &task->pids[type]; if (!hlist_unhashed(&pid->pid_chain)) { - hlist_del(&pid->pid_chain); - if (list_empty(&pid->pid_list)) + if (list_empty(&pid->pid_list)) { nr = pid->nr; - else { + hlist_del_rcu(&pid->pid_chain); + } else { pid_next = list_entry(pid->pid_list.next, struct pid, pid_list); /* insert next pid from pid_list to hash */ - hlist_add_head(&pid_next->pid_chain, - &pid_hash[type][pid_hashfn(pid_next->nr)]); + hlist_replace_rcu(&pid->pid_chain, + &pid_next->pid_chain); } } - list_del(&pid->pid_list); + list_del_rcu(&pid->pid_list); pid->nr = 0; return nr; diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c index c9afc61240e4..0a669bd2f6d1 100644 --- a/kernel/rcupdate.c +++ b/kernel/rcupdate.c @@ -35,6 +35,7 @@ #include #include #include +#include #include #include #include diff --git a/kernel/sched.c b/kernel/sched.c index 
6f46c94cc29e..92733091154c 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -176,6 +176,13 @@ static unsigned int task_timeslice(task_t *p) #define task_hot(p, now, sd) ((long long) ((now) - (p)->last_ran) \ < (long long) (sd)->cache_hot_time) +void __put_task_struct_cb(struct rcu_head *rhp) +{ + __put_task_struct(container_of(rhp, struct task_struct, rcu)); +} + +EXPORT_SYMBOL_GPL(__put_task_struct_cb); + /* * These are the runqueue data structures: */ diff --git a/kernel/signal.c b/kernel/signal.c index d7611f189ef7..64737c72dadd 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -329,13 +329,20 @@ void __exit_sighand(struct task_struct *tsk) /* Ok, we're done with the signal handlers */ tsk->sighand = NULL; if (atomic_dec_and_test(&sighand->count)) - kmem_cache_free(sighand_cachep, sighand); + sighand_free(sighand); } void exit_sighand(struct task_struct *tsk) { write_lock_irq(&tasklist_lock); - __exit_sighand(tsk); + rcu_read_lock(); + if (tsk->sighand != NULL) { + struct sighand_struct *sighand = rcu_dereference(tsk->sighand); + spin_lock(&sighand->siglock); + __exit_sighand(tsk); + spin_unlock(&sighand->siglock); + } + rcu_read_unlock(); write_unlock_irq(&tasklist_lock); } @@ -345,12 +352,14 @@ void exit_sighand(struct task_struct *tsk) void __exit_signal(struct task_struct *tsk) { struct signal_struct * sig = tsk->signal; - struct sighand_struct * sighand = tsk->sighand; + struct sighand_struct * sighand; if (!sig) BUG(); if (!atomic_read(&sig->count)) BUG(); + rcu_read_lock(); + sighand = rcu_dereference(tsk->sighand); spin_lock(&sighand->siglock); posix_cpu_timers_exit(tsk); if (atomic_dec_and_test(&sig->count)) { @@ -358,6 +367,7 @@ void __exit_signal(struct task_struct *tsk) if (tsk == sig->curr_target) sig->curr_target = next_thread(tsk); tsk->signal = NULL; + __exit_sighand(tsk); spin_unlock(&sighand->siglock); flush_sigqueue(&sig->shared_pending); } else { @@ -389,9 +399,11 @@ void __exit_signal(struct task_struct *tsk) sig->nvcsw += tsk->nvcsw; sig->nivcsw += tsk->nivcsw; sig->sched_time += tsk->sched_time; + __exit_sighand(tsk); spin_unlock(&sighand->siglock); sig = NULL; /* Marker for below. */ } + rcu_read_unlock(); clear_tsk_thread_flag(tsk,TIF_SIGPENDING); flush_sigqueue(&tsk->pending); if (sig) { @@ -1080,18 +1092,28 @@ void zap_other_threads(struct task_struct *p) } /* - * Must be called with the tasklist_lock held for reading! + * Must be called under rcu_read_lock() or with tasklist_lock read-held. 
*/ int group_send_sig_info(int sig, struct siginfo *info, struct task_struct *p) { unsigned long flags; + struct sighand_struct *sp; int ret; +retry: ret = check_kill_permission(sig, info, p); - if (!ret && sig && p->sighand) { - spin_lock_irqsave(&p->sighand->siglock, flags); + if (!ret && sig && (sp = p->sighand)) { + if (!get_task_struct_rcu(p)) + return -ESRCH; + spin_lock_irqsave(&sp->siglock, flags); + if (p->sighand != sp) { + spin_unlock_irqrestore(&sp->siglock, flags); + put_task_struct(p); + goto retry; + } ret = __group_send_sig_info(sig, info, p); - spin_unlock_irqrestore(&p->sighand->siglock, flags); + spin_unlock_irqrestore(&sp->siglock, flags); + put_task_struct(p); } return ret; @@ -1136,14 +1158,21 @@ int kill_proc_info(int sig, struct siginfo *info, pid_t pid) { int error; + int acquired_tasklist_lock = 0; struct task_struct *p; - read_lock(&tasklist_lock); + rcu_read_lock(); + if (unlikely(sig_kernel_stop(sig) || sig == SIGCONT)) { + read_lock(&tasklist_lock); + acquired_tasklist_lock = 1; + } p = find_task_by_pid(pid); error = -ESRCH; if (p) error = group_send_sig_info(sig, info, p); - read_unlock(&tasklist_lock); + if (unlikely(acquired_tasklist_lock)) + read_unlock(&tasklist_lock); + rcu_read_unlock(); return error; } @@ -1355,16 +1384,54 @@ send_sigqueue(int sig, struct sigqueue *q, struct task_struct *p) { unsigned long flags; int ret = 0; + struct sighand_struct *sh; BUG_ON(!(q->flags & SIGQUEUE_PREALLOC)); - read_lock(&tasklist_lock); + + /* + * The rcu based delayed sighand destroy makes it possible to + * run this without tasklist lock held. The task struct itself + * cannot go away as create_timer did get_task_struct(). + * + * We return -1, when the task is marked exiting, so + * posix_timer_event can redirect it to the group leader + */ + rcu_read_lock(); if (unlikely(p->flags & PF_EXITING)) { ret = -1; goto out_err; } - spin_lock_irqsave(&p->sighand->siglock, flags); +retry: + sh = rcu_dereference(p->sighand); + + spin_lock_irqsave(&sh->siglock, flags); + if (p->sighand != sh) { + /* We raced with exec() in a multithreaded process... */ + spin_unlock_irqrestore(&sh->siglock, flags); + goto retry; + } + + /* + * We do the check here again to handle the following scenario: + * + * CPU 0 CPU 1 + * send_sigqueue + * check PF_EXITING + * interrupt exit code running + * __exit_signal + * lock sighand->siglock + * unlock sighand->siglock + * lock sh->siglock + * add(tsk->pending) flush_sigqueue(tsk->pending) + * + */ + + if (unlikely(p->flags & PF_EXITING)) { + ret = -1; + goto out; + } if (unlikely(!list_empty(&q->list))) { /* @@ -1388,9 +1455,9 @@ send_sigqueue(int sig, struct sigqueue *q, struct task_struct *p) signal_wake_up(p, sig == SIGKILL); out: - spin_unlock_irqrestore(&p->sighand->siglock, flags); + spin_unlock_irqrestore(&sh->siglock, flags); out_err: - read_unlock(&tasklist_lock); + rcu_read_unlock(); return ret; } @@ -1402,7 +1469,9 @@ send_group_sigqueue(int sig, struct sigqueue *q, struct task_struct *p) int ret = 0; BUG_ON(!(q->flags & SIGQUEUE_PREALLOC)); + read_lock(&tasklist_lock); + /* Since it_lock is held, p->sighand cannot be NULL. */ spin_lock_irqsave(&p->sighand->siglock, flags); handle_stop_signal(sig, p); @@ -1436,7 +1505,7 @@ send_group_sigqueue(int sig, struct sigqueue *q, struct task_struct *p) out: spin_unlock_irqrestore(&p->sighand->siglock, flags); read_unlock(&tasklist_lock); - return(ret); + return ret; } /* -- cgit v1.2.2 From 2d89c929078588aa9b9c674ef03ee9aa816b59b8 Mon Sep 17 00:00:00 2001 From: "Paul E. 
McKenney" Date: Sun, 8 Jan 2006 01:01:38 -0800 Subject: [PATCH] Simpler signal-exit concurrency handling Some simplification in checking signal delivery against concurrent exit. Instead of using get_task_struct_rcu(), which increments the task_struct reference count, check the reference count after acquiring sighand lock. Signed-off-by: "Paul E. McKenney" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/signal.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index 64737c72dadd..9b6fda5e87f1 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -1102,18 +1102,19 @@ int group_send_sig_info(int sig, struct siginfo *info, struct task_struct *p) retry: ret = check_kill_permission(sig, info, p); - if (!ret && sig && (sp = p->sighand)) { - if (!get_task_struct_rcu(p)) - return -ESRCH; + if (!ret && sig && (sp = rcu_dereference(p->sighand))) { spin_lock_irqsave(&sp->siglock, flags); if (p->sighand != sp) { spin_unlock_irqrestore(&sp->siglock, flags); - put_task_struct(p); goto retry; } + if ((atomic_read(&sp->count) == 0) || + (atomic_read(&p->usage) == 0)) { + spin_unlock_irqrestore(&sp->siglock, flags); + return -ESRCH; + } ret = __group_send_sig_info(sig, info, p); spin_unlock_irqrestore(&sp->siglock, flags); - put_task_struct(p); } return ret; -- cgit v1.2.2 From 5966514db662fb24c9bb43226a80106bcffd51f8 Mon Sep 17 00:00:00 2001 From: Paul Jackson Date: Sun, 8 Jan 2006 01:01:47 -0800 Subject: [PATCH] cpuset: mempolicy one more nodemask conversion Finish converting mm/mempolicy.c from bitmaps to nodemasks. The previous conversion had left one routine using bitmaps, since it involved a corresponding change to kernel/cpuset.c Fix that interface by replacing with a simple macro that calls nodes_subset(), or if !CONFIG_CPUSET, returns (1). Signed-off-by: Paul Jackson Cc: Christoph Lameter Cc: Andi Kleen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cpuset.c | 10 ---------- 1 file changed, 10 deletions(-) (limited to 'kernel') diff --git a/kernel/cpuset.c b/kernel/cpuset.c index f63383e01ec7..6503c6da4c4f 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -1749,16 +1749,6 @@ done: refresh_mems(); } -/** - * cpuset_restrict_to_mems_allowed - limit nodes to current mems_allowed - * @nodes: pointer to a node bitmap that is and-ed with mems_allowed - */ -void cpuset_restrict_to_mems_allowed(unsigned long *nodes) -{ - bitmap_and(nodes, nodes, nodes_addr(current->mems_allowed), - MAX_NUMNODES); -} - /** * cpuset_zonelist_valid_mems_allowed - check zonelist vs. curremt mems_allowed * @zl: the zonelist to be checked -- cgit v1.2.2 From 3e0d98b9f1eb757fc98efc84e74e54a08308aa73 Mon Sep 17 00:00:00 2001 From: Paul Jackson Date: Sun, 8 Jan 2006 01:01:49 -0800 Subject: [PATCH] cpuset: memory pressure meter Provide a simple per-cpuset metric of memory pressure, tracking the -rate- that the tasks in a cpuset call try_to_free_pages(), the synchronous (direct) memory reclaim code. This enables batch managers monitoring jobs running in dedicated cpusets to efficiently detect what level of memory pressure that job is causing. 
This is useful both on tightly managed systems running a wide mix of submitted jobs, which may choose to terminate or reprioritize jobs that are trying to use more memory than allowed on the nodes assigned them, and with tightly coupled, long running, massively parallel scientific computing jobs that will dramatically fail to meet required performance goals if they start to use more memory than allowed to them. This patch just provides a very economical way for the batch manager to monitor a cpuset for signs of memory pressure. It's up to the batch manager or other user code to decide what to do about it and take action. ==> Unless this feature is enabled by writing "1" to the special file /dev/cpuset/memory_pressure_enabled, the hook in the rebalance code of __alloc_pages() for this metric reduces to simply noticing that the cpuset_memory_pressure_enabled flag is zero. So only systems that enable this feature will compute the metric. Why a per-cpuset, running average: Because this meter is per-cpuset, rather than per-task or mm, the system load imposed by a batch scheduler monitoring this metric is sharply reduced on large systems, because a scan of the tasklist can be avoided on each set of queries. Because this meter is a running average, instead of an accumulating counter, a batch scheduler can detect memory pressure with a single read, instead of having to read and accumulate results for a period of time. Because this meter is per-cpuset rather than per-task or mm, the batch scheduler can obtain the key information, memory pressure in a cpuset, with a single read, rather than having to query and accumulate results over all the (dynamically changing) set of tasks in the cpuset. A per-cpuset simple digital filter (requires a spinlock and 3 words of data per-cpuset) is kept, and updated by any task attached to that cpuset, if it enters the synchronous (direct) page reclaim code. A per-cpuset file provides an integer number representing the recent (half-life of 10 seconds) rate of direct page reclaims caused by the tasks in the cpuset, in units of reclaims attempted per second, times 1000. Signed-off-by: Paul Jackson Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cpuset.c | 193 +++++++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 191 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 6503c6da4c4f..5a06fef669f8 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -56,6 +56,15 @@ #define CPUSET_SUPER_MAGIC 0x27e0eb +/* See "Frequency meter" comments, below. */ + +struct fmeter { + int cnt; /* unprocessed events count */ + int val; /* most recent output value */ + time_t time; /* clock (secs) when val computed */ + spinlock_t lock; /* guards read or write of above */ +}; + struct cpuset { unsigned long flags; /* "unsigned long" so bitops work */ cpumask_t cpus_allowed; /* CPUs allowed to tasks in cpuset */ @@ -80,7 +89,9 @@ struct cpuset { * Copy of global cpuset_mems_generation as of the most * recent time this cpuset changed its mems_allowed. */ - int mems_generation; + int mems_generation; + + struct fmeter fmeter; /* memory_pressure filter */ }; /* bits in struct cpuset flags field */ @@ -149,7 +160,7 @@ static struct cpuset top_cpuset = { }; static struct vfsmount *cpuset_mount; -static struct super_block *cpuset_sb = NULL; +static struct super_block *cpuset_sb; /* * We have two global cpuset semaphores below. They can nest. 
@@ -806,6 +817,19 @@ static int update_nodemask(struct cpuset *cs, char *buf) return retval; } +/* + * Call with manage_sem held. + */ + +static int update_memory_pressure_enabled(struct cpuset *cs, char *buf) +{ + if (simple_strtoul(buf, NULL, 10) != 0) + cpuset_memory_pressure_enabled = 1; + else + cpuset_memory_pressure_enabled = 0; + return 0; +} + /* * update_flag - read a 0 or a 1 in a file and update associated flag * bit: the bit to update (CS_CPU_EXCLUSIVE, CS_MEM_EXCLUSIVE, @@ -847,6 +871,104 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, char *buf) return 0; } +/* + * Frequency meter - How fast is some event occuring? + * + * These routines manage a digitally filtered, constant time based, + * event frequency meter. There are four routines: + * fmeter_init() - initialize a frequency meter. + * fmeter_markevent() - called each time the event happens. + * fmeter_getrate() - returns the recent rate of such events. + * fmeter_update() - internal routine used to update fmeter. + * + * A common data structure is passed to each of these routines, + * which is used to keep track of the state required to manage the + * frequency meter and its digital filter. + * + * The filter works on the number of events marked per unit time. + * The filter is single-pole low-pass recursive (IIR). The time unit + * is 1 second. Arithmetic is done using 32-bit integers scaled to + * simulate 3 decimal digits of precision (multiplied by 1000). + * + * With an FM_COEF of 933, and a time base of 1 second, the filter + * has a half-life of 10 seconds, meaning that if the events quit + * happening, then the rate returned from the fmeter_getrate() + * will be cut in half each 10 seconds, until it converges to zero. + * + * It is not worth doing a real infinitely recursive filter. If more + * than FM_MAXTICKS ticks have elapsed since the last filter event, + * just compute FM_MAXTICKS ticks worth, by which point the level + * will be stable. + * + * Limit the count of unprocessed events to FM_MAXCNT, so as to avoid + * arithmetic overflow in the fmeter_update() routine. + * + * Given the simple 32 bit integer arithmetic used, this meter works + * best for reporting rates between one per millisecond (msec) and + * one per 32 (approx) seconds. At constant rates faster than one + * per msec it maxes out at values just under 1,000,000. At constant + * rates between one per msec, and one per second it will stabilize + * to a value N*1000, where N is the rate of events per second. + * At constant rates between one per second and one per 32 seconds, + * it will be choppy, moving up on the seconds that have an event, + * and then decaying until the next event. At rates slower than + * about one in 32 seconds, it decays all the way back to zero between + * each event. 
+ */ + +#define FM_COEF 933 /* coefficient for half-life of 10 secs */ +#define FM_MAXTICKS ((time_t)99) /* useless computing more ticks than this */ +#define FM_MAXCNT 1000000 /* limit cnt to avoid overflow */ +#define FM_SCALE 1000 /* faux fixed point scale */ + +/* Initialize a frequency meter */ +static void fmeter_init(struct fmeter *fmp) +{ + fmp->cnt = 0; + fmp->val = 0; + fmp->time = 0; + spin_lock_init(&fmp->lock); +} + +/* Internal meter update - process cnt events and update value */ +static void fmeter_update(struct fmeter *fmp) +{ + time_t now = get_seconds(); + time_t ticks = now - fmp->time; + + if (ticks == 0) + return; + + ticks = min(FM_MAXTICKS, ticks); + while (ticks-- > 0) + fmp->val = (FM_COEF * fmp->val) / FM_SCALE; + fmp->time = now; + + fmp->val += ((FM_SCALE - FM_COEF) * fmp->cnt) / FM_SCALE; + fmp->cnt = 0; +} + +/* Process any previous ticks, then bump cnt by one (times scale). */ +static void fmeter_markevent(struct fmeter *fmp) +{ + spin_lock(&fmp->lock); + fmeter_update(fmp); + fmp->cnt = min(FM_MAXCNT, fmp->cnt + FM_SCALE); + spin_unlock(&fmp->lock); +} + +/* Process any previous ticks, then return current value. */ +static int fmeter_getrate(struct fmeter *fmp) +{ + int val; + + spin_lock(&fmp->lock); + fmeter_update(fmp); + val = fmp->val; + spin_unlock(&fmp->lock); + return val; +} + /* * Attack task specified by pid in 'pidbuf' to cpuset 'cs', possibly * writing the path of the old cpuset in 'ppathbuf' if it needs to be @@ -931,6 +1053,8 @@ typedef enum { FILE_CPU_EXCLUSIVE, FILE_MEM_EXCLUSIVE, FILE_NOTIFY_ON_RELEASE, + FILE_MEMORY_PRESSURE_ENABLED, + FILE_MEMORY_PRESSURE, FILE_TASKLIST, } cpuset_filetype_t; @@ -984,6 +1108,12 @@ static ssize_t cpuset_common_file_write(struct file *file, const char __user *us case FILE_MEMORY_MIGRATE: retval = update_flag(CS_MEMORY_MIGRATE, cs, buffer); break; + case FILE_MEMORY_PRESSURE_ENABLED: + retval = update_memory_pressure_enabled(cs, buffer); + break; + case FILE_MEMORY_PRESSURE: + retval = -EACCES; + break; case FILE_TASKLIST: retval = attach_task(cs, buffer, &pathbuf); break; @@ -1087,6 +1217,12 @@ static ssize_t cpuset_common_file_read(struct file *file, char __user *buf, case FILE_MEMORY_MIGRATE: *s++ = is_memory_migrate(cs) ? '1' : '0'; break; + case FILE_MEMORY_PRESSURE_ENABLED: + *s++ = cpuset_memory_pressure_enabled ? 
'1' : '0'; + break; + case FILE_MEMORY_PRESSURE: + s += sprintf(s, "%d", fmeter_getrate(&cs->fmeter)); + break; default: retval = -EINVAL; goto out; @@ -1440,6 +1576,16 @@ static struct cftype cft_memory_migrate = { .private = FILE_MEMORY_MIGRATE, }; +static struct cftype cft_memory_pressure_enabled = { + .name = "memory_pressure_enabled", + .private = FILE_MEMORY_PRESSURE_ENABLED, +}; + +static struct cftype cft_memory_pressure = { + .name = "memory_pressure", + .private = FILE_MEMORY_PRESSURE, +}; + static int cpuset_populate_dir(struct dentry *cs_dentry) { int err; @@ -1456,6 +1602,8 @@ static int cpuset_populate_dir(struct dentry *cs_dentry) return err; if ((err = cpuset_add_file(cs_dentry, &cft_memory_migrate)) < 0) return err; + if ((err = cpuset_add_file(cs_dentry, &cft_memory_pressure)) < 0) + return err; if ((err = cpuset_add_file(cs_dentry, &cft_tasks)) < 0) return err; return 0; @@ -1491,6 +1639,7 @@ static long cpuset_create(struct cpuset *parent, const char *name, int mode) INIT_LIST_HEAD(&cs->children); atomic_inc(&cpuset_mems_generation); cs->mems_generation = atomic_read(&cpuset_mems_generation); + fmeter_init(&cs->fmeter); cs->parent = parent; @@ -1580,6 +1729,7 @@ int __init cpuset_init(void) top_cpuset.cpus_allowed = CPU_MASK_ALL; top_cpuset.mems_allowed = NODE_MASK_ALL; + fmeter_init(&top_cpuset.fmeter); atomic_inc(&cpuset_mems_generation); top_cpuset.mems_generation = atomic_read(&cpuset_mems_generation); @@ -1601,6 +1751,9 @@ int __init cpuset_init(void) top_cpuset.dentry = root; root->d_inode->i_op = &cpuset_dir_inode_operations; err = cpuset_populate_dir(root); + /* memory_pressure_enabled is in root cpuset only */ + if (err == 0) + err = cpuset_add_file(root, &cft_memory_pressure_enabled); out: return err; } @@ -1890,6 +2043,42 @@ done: return overlap; } +/* + * Collection of memory_pressure is suppressed unless + * this flag is enabled by writing "1" to the special + * cpuset file 'memory_pressure_enabled' in the root cpuset. + */ + +int cpuset_memory_pressure_enabled; + +/** + * cpuset_memory_pressure_bump - keep stats of per-cpuset reclaims. + * + * Keep a running average of the rate of synchronous (direct) + * page reclaim efforts initiated by tasks in each cpuset. + * + * This represents the rate at which some task in the cpuset + * ran low on memory on all nodes it was allowed to use, and + * had to enter the kernels page reclaim code in an effort to + * create more free memory by tossing clean pages or swapping + * or writing dirty pages. + * + * Display to user space in the per-cpuset read-only file + * "memory_pressure". Value displayed is an integer + * representing the recent rate of entry into the synchronous + * (direct) page reclaim by any task attached to the cpuset. + **/ + +void __cpuset_memory_pressure_bump(void) +{ + struct cpuset *cs; + + task_lock(current); + cs = current->cpuset; + fmeter_markevent(&cs->fmeter); + task_unlock(current); +} + /* * proc_cpuset_show() * - Print tasks cpuset path into seq_file. -- cgit v1.2.2 From c5b2aff89635495064592dc90da595f8a880ee87 Mon Sep 17 00:00:00 2001 From: Paul Jackson Date: Sun, 8 Jan 2006 01:01:51 -0800 Subject: [PATCH] cpuset: minor spacing initializer fixes Four trivial cpuset fixes: remove extra spaces, remove useless initializers, mark one __read_mostly. 
Signed-off-by: Paul Jackson Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cpuset.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 5a06fef669f8..f513dd937eee 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -54,7 +54,7 @@ #include #include -#define CPUSET_SUPER_MAGIC 0x27e0eb +#define CPUSET_SUPER_MAGIC 0x27e0eb /* See "Frequency meter" comments, below. */ @@ -154,9 +154,6 @@ static struct cpuset top_cpuset = { .count = ATOMIC_INIT(0), .sibling = LIST_HEAD_INIT(top_cpuset.sibling), .children = LIST_HEAD_INIT(top_cpuset.children), - .parent = NULL, - .dentry = NULL, - .mems_generation = 0, }; static struct vfsmount *cpuset_mount; @@ -1341,7 +1338,7 @@ static int cpuset_create_file(struct dentry *dentry, int mode) /* * cpuset_create_dir - create a directory for an object. - * cs: the cpuset we create the directory for. + * cs: the cpuset we create the directory for. * It must have a valid ->parent field * And we are going to fill its ->dentry field. * name: The name to give to the cpuset directory. Will be copied. @@ -2049,7 +2046,7 @@ done: * cpuset file 'memory_pressure_enabled' in the root cpuset. */ -int cpuset_memory_pressure_enabled; +int cpuset_memory_pressure_enabled __read_mostly; /** * cpuset_memory_pressure_bump - keep stats of per-cpuset reclaims. -- cgit v1.2.2 From 59dac16fb95f09253b8086134443abeb439703cd Mon Sep 17 00:00:00 2001 From: Paul Jackson Date: Sun, 8 Jan 2006 01:01:52 -0800 Subject: [PATCH] cpuset: update_nodemask code reformat Restructure code layout of the kernel/cpuset.c update_nodemask() routine, removing embedded returns and nested if's in favor of goto completion labels. This is being done in anticipation of adding more logic to this routine, which will favor the goto style structure. Signed-off-by: Paul Jackson Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cpuset.c | 25 +++++++++++++++---------- 1 file changed, 15 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/cpuset.c b/kernel/cpuset.c index f513dd937eee..3ea63da11d71 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -799,18 +799,23 @@ static int update_nodemask(struct cpuset *cs, char *buf) trialcs = *cs; retval = nodelist_parse(buf, trialcs.mems_allowed); if (retval < 0) - return retval; + goto done; nodes_and(trialcs.mems_allowed, trialcs.mems_allowed, node_online_map); - if (nodes_empty(trialcs.mems_allowed)) - return -ENOSPC; - retval = validate_change(cs, &trialcs); - if (retval == 0) { - down(&callback_sem); - cs->mems_allowed = trialcs.mems_allowed; - atomic_inc(&cpuset_mems_generation); - cs->mems_generation = atomic_read(&cpuset_mems_generation); - up(&callback_sem); + if (nodes_empty(trialcs.mems_allowed)) { + retval = -ENOSPC; + goto done; } + retval = validate_change(cs, &trialcs); + if (retval < 0) + goto done; + + down(&callback_sem); + cs->mems_allowed = trialcs.mems_allowed; + atomic_inc(&cpuset_mems_generation); + cs->mems_generation = atomic_read(&cpuset_mems_generation); + up(&callback_sem); + +done: return retval; } -- cgit v1.2.2 From b4b2641843db124637fa3d2cb2101982035dcc82 Mon Sep 17 00:00:00 2001 From: Paul Jackson Date: Sun, 8 Jan 2006 01:01:53 -0800 Subject: [PATCH] cpuset: fork hook fix Fix obscure, never seen in real life, cpuset fork race. 
The cpuset_fork() call in fork.c was setting up the correct task->cpuset pointer after the tasklist_lock was dropped, which briefly exposed the newly forked process with an unsafe (copied from parent without locks or usage counter increment) cpuset pointer. In theory, that exposed cpuset pointer could have been pointing at a cpuset that was already freed and removed, and in theory another task that had been sitting on the tasklist_lock waiting to scan the task list could have raced down the entire tasklist, found our new child at the far end, and dereferenced that bogus cpuset pointer. To fix, set up the correct cpuset pointer in the new child by calling cpuset_fork() before the new task is linked into the tasklist, and with that, add a fork failure case, to dereference that cpuset, if the fork fails along the way, after cpuset_fork() was called. Had to remove a BUG_ON() from cpuset_exit(), because it was no longer valid - the call to cpuset_exit() from a failed fork would not have PF_EXITING set. Signed-off-by: Paul Jackson Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cpuset.c | 4 +--- kernel/fork.c | 6 ++++-- 2 files changed, 5 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 3ea63da11d71..d9349cc48b95 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -1821,15 +1821,13 @@ void cpuset_fork(struct task_struct *child) * * We don't need to task_lock() this reference to tsk->cpuset, * because tsk is already marked PF_EXITING, so attach_task() won't - * mess with it. + * mess with it, or task is a failed fork, never visible to attach_task. **/ void cpuset_exit(struct task_struct *tsk) { struct cpuset *cs; - BUG_ON(!(tsk->flags & PF_EXITING)); - cs = tsk->cpuset; tsk->cpuset = NULL; diff --git a/kernel/fork.c b/kernel/fork.c index 7fe3adfa65cb..7992ee759d89 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -972,12 +972,13 @@ static task_t *copy_process(unsigned long clone_flags, p->io_context = NULL; p->io_wait = NULL; p->audit_context = NULL; + cpuset_fork(p); #ifdef CONFIG_NUMA p->mempolicy = mpol_copy(p->mempolicy); if (IS_ERR(p->mempolicy)) { retval = PTR_ERR(p->mempolicy); p->mempolicy = NULL; - goto bad_fork_cleanup; + goto bad_fork_cleanup_cpuset; } #endif @@ -1148,7 +1149,6 @@ static task_t *copy_process(unsigned long clone_flags, total_forks++; write_unlock_irq(&tasklist_lock); proc_fork_connector(p); - cpuset_fork(p); retval = 0; fork_out: @@ -1180,7 +1180,9 @@ bad_fork_cleanup_security: bad_fork_cleanup_policy: #ifdef CONFIG_NUMA mpol_free(p->mempolicy); +bad_fork_cleanup_cpuset: #endif + cpuset_exit(p); bad_fork_cleanup: if (p->binfmt) module_put(p->binfmt->module); -- cgit v1.2.2 From cf2a473c4089aa41c26f653200673f5a4cc25047 Mon Sep 17 00:00:00 2001 From: Paul Jackson Date: Sun, 8 Jan 2006 01:01:54 -0800 Subject: [PATCH] cpuset: combine refresh_mems and update_mems The important code paths through alloc_pages_current() and alloc_page_vma(), by which most kernel page allocations go, both called cpuset_update_current_mems_allowed(), which in turn called refresh_mems(). -Both- of these latter two routines did a tasklock, got the tasks cpuset pointer, and checked for out of date cpuset->mems_generation. That was a silly duplication of code and waste of CPU cycles on an important code path. Consolidated those two routines into a single routine, called cpuset_update_task_memory_state(), since it updates more than just mems_allowed.
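Illustration (not part of the patch): the consolidated routine reduces to a cheap generation compare on the hot path. A stand-alone sketch of that check, with invented names and a trivial harness; the real code follows in the diff below.

#include <stdio.h>

static int cpuset_mems_generation = 1;	/* bumped whenever placement changes */

struct task_model {
	int cpuset_mems_generation;	/* generation this task last saw */
};

static void update_task_memory_state(struct task_model *t)
{
	int cur = cpuset_mems_generation;	/* read the cpuset's generation */

	if (t->cpuset_mems_generation != cur) {
		/* placement changed behind our back: refresh mems_allowed,
		 * rebind the mempolicy, then record the generation we saw */
		t->cpuset_mems_generation = cur;
		puts("refreshed memory placement");
	}
}

int main(void)
{
	struct task_model t = { .cpuset_mems_generation = 0 };

	update_task_memory_state(&t);	/* stale: refreshes */
	update_task_memory_state(&t);	/* current: cheap no-op */
	return 0;
}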
Changed all callers of either routine to call the new consolidated routine. Signed-off-by: Paul Jackson Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cpuset.c | 95 +++++++++++++++++++++++++-------------------------------- 1 file changed, 41 insertions(+), 54 deletions(-) (limited to 'kernel') diff --git a/kernel/cpuset.c b/kernel/cpuset.c index d9349cc48b95..e9917d71628a 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -584,13 +584,26 @@ static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask) BUG_ON(!nodes_intersects(*pmask, node_online_map)); } -/* - * Refresh current tasks mems_allowed and mems_generation from current - * tasks cpuset. +/** + * cpuset_update_task_memory_state - update task memory placement * - * Call without callback_sem or task_lock() held. May be called with - * or without manage_sem held. Will acquire task_lock() and might - * acquire callback_sem during call. + * If the current tasks cpusets mems_allowed changed behind our + * backs, update current->mems_allowed, mems_generation and task NUMA + * mempolicy to the new value. + * + * Task mempolicy is updated by rebinding it relative to the + * current->cpuset if a task has its memory placement changed. + * Do not call this routine if in_interrupt(). + * + * Call without callback_sem or task_lock() held. May be called + * with or without manage_sem held. Except in early boot or + * an exiting task, when tsk->cpuset is NULL, this routine will + * acquire task_lock(). We don't need to use task_lock to guard + * against another task changing a non-NULL cpuset pointer to NULL, + * as that is only done by a task on itself, and if the current task + * is here, it is not simultaneously in the exit code NULL'ing its + * cpuset pointer. This routine also might acquire callback_sem and + * current->mm->mmap_sem during call. * * The task_lock() is required to dereference current->cpuset safely. * Without it, we could pick up the pointer value of current->cpuset @@ -605,32 +618,36 @@ static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask) * task has been modifying its cpuset. 
*/ -static void refresh_mems(void) +void cpuset_update_task_memory_state() { int my_cpusets_mem_gen; + struct task_struct *tsk = current; + struct cpuset *cs = tsk->cpuset; - task_lock(current); - my_cpusets_mem_gen = current->cpuset->mems_generation; - task_unlock(current); + if (unlikely(!cs)) + return; + + task_lock(tsk); + my_cpusets_mem_gen = cs->mems_generation; + task_unlock(tsk); - if (current->cpuset_mems_generation != my_cpusets_mem_gen) { - struct cpuset *cs; - nodemask_t oldmem = current->mems_allowed; + if (my_cpusets_mem_gen != tsk->cpuset_mems_generation) { + nodemask_t oldmem = tsk->mems_allowed; int migrate; down(&callback_sem); - task_lock(current); - cs = current->cpuset; + task_lock(tsk); + cs = tsk->cpuset; /* Maybe changed when task not locked */ migrate = is_memory_migrate(cs); - guarantee_online_mems(cs, ¤t->mems_allowed); - current->cpuset_mems_generation = cs->mems_generation; - task_unlock(current); + guarantee_online_mems(cs, &tsk->mems_allowed); + tsk->cpuset_mems_generation = cs->mems_generation; + task_unlock(tsk); up(&callback_sem); - if (!nodes_equal(oldmem, current->mems_allowed)) { - numa_policy_rebind(&oldmem, ¤t->mems_allowed); + numa_policy_rebind(&oldmem, &tsk->mems_allowed); + if (!nodes_equal(oldmem, tsk->mems_allowed)) { if (migrate) { - do_migrate_pages(current->mm, &oldmem, - ¤t->mems_allowed, + do_migrate_pages(tsk->mm, &oldmem, + &tsk->mems_allowed, MPOL_MF_MOVE_ALL); } } @@ -1630,7 +1647,7 @@ static long cpuset_create(struct cpuset *parent, const char *name, int mode) return -ENOMEM; down(&manage_sem); - refresh_mems(); + cpuset_update_task_memory_state(); cs->flags = 0; if (notify_on_release(parent)) set_bit(CS_NOTIFY_ON_RELEASE, &cs->flags); @@ -1688,7 +1705,7 @@ static int cpuset_rmdir(struct inode *unused_dir, struct dentry *dentry) /* the vfs holds both inode->i_sem already */ down(&manage_sem); - refresh_mems(); + cpuset_update_task_memory_state(); if (atomic_read(&cs->count) > 0) { up(&manage_sem); return -EBUSY; @@ -1872,36 +1889,6 @@ void cpuset_init_current_mems_allowed(void) current->mems_allowed = NODE_MASK_ALL; } -/** - * cpuset_update_current_mems_allowed - update mems parameters to new values - * - * If the current tasks cpusets mems_allowed changed behind our backs, - * update current->mems_allowed and mems_generation to the new value. - * Do not call this routine if in_interrupt(). - * - * Call without callback_sem or task_lock() held. May be called - * with or without manage_sem held. Unless exiting, it will acquire - * task_lock(). Also might acquire callback_sem during call to - * refresh_mems(). - */ - -void cpuset_update_current_mems_allowed(void) -{ - struct cpuset *cs; - int need_to_refresh = 0; - - task_lock(current); - cs = current->cpuset; - if (!cs) - goto done; - if (current->cpuset_mems_generation != cs->mems_generation) - need_to_refresh = 1; -done: - task_unlock(current); - if (need_to_refresh) - refresh_mems(); -} - /** * cpuset_zonelist_valid_mems_allowed - check zonelist vs. curremt mems_allowed * @zl: the zonelist to be checked -- cgit v1.2.2 From 909d75a3b77bdd8baa9429bad3b69a654d2954ce Mon Sep 17 00:00:00 2001 From: Paul Jackson Date: Sun, 8 Jan 2006 01:01:55 -0800 Subject: [PATCH] cpuset: implement cpuset_mems_allowed Provide a cpuset_mems_allowed() method, which the sys_migrate_pages() code needed, to obtain the mems_allowed vector of a cpuset, and replaced the workaround in sys_migrate_pages() to call this new method. 
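Illustration (not part of the patch): the kind of check the new method enables in a caller like sys_migrate_pages(). The kernel uses nodemask_t and nodes_subset(); a plain bitmask stands in for both here, and the node numbers are invented.

#include <stdio.h>

typedef unsigned long nodemask;	/* one bit per memory node */

int main(void)
{
	nodemask allowed = 0x5;	/* say the cpuset allows nodes 0 and 2 */
	nodemask target = 0x4;	/* request: migrate pages to node 2 */

	/* reject target nodes outside the tasks cpuset */
	if (target & ~allowed)
		puts("-EPERM: target outside mems_allowed");
	else
		puts("ok: target is a subset of mems_allowed");
	return 0;
}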
Signed-off-by: Paul Jackson Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cpuset.c | 29 ++++++++++++++++++++++++++--- 1 file changed, 26 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/cpuset.c b/kernel/cpuset.c index e9917d71628a..0d0dbbd6560a 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -1871,14 +1871,14 @@ void cpuset_exit(struct task_struct *tsk) * tasks cpuset. **/ -cpumask_t cpuset_cpus_allowed(const struct task_struct *tsk) +cpumask_t cpuset_cpus_allowed(struct task_struct *tsk) { cpumask_t mask; down(&callback_sem); - task_lock((struct task_struct *)tsk); + task_lock(tsk); guarantee_online_cpus(tsk->cpuset, &mask); - task_unlock((struct task_struct *)tsk); + task_unlock(tsk); up(&callback_sem); return mask; @@ -1889,6 +1889,29 @@ void cpuset_init_current_mems_allowed(void) current->mems_allowed = NODE_MASK_ALL; } +/** + * cpuset_mems_allowed - return mems_allowed mask from a tasks cpuset. + * @tsk: pointer to task_struct from which to obtain cpuset->mems_allowed. + * + * Description: Returns the nodemask_t mems_allowed of the cpuset + * attached to the specified @tsk. Guaranteed to return some non-empty + * subset of node_online_map, even if this means going outside the + * tasks cpuset. + **/ + +nodemask_t cpuset_mems_allowed(struct task_struct *tsk) +{ + nodemask_t mask; + + down(&callback_sem); + task_lock(tsk); + guarantee_online_mems(tsk->cpuset, &mask); + task_unlock(tsk); + up(&callback_sem); + + return mask; +} + /** * cpuset_zonelist_valid_mems_allowed - check zonelist vs. curremt mems_allowed * @zl: the zonelist to be checked -- cgit v1.2.2 From 74cb21553f4bf244185b9bec4c26e4e3169ad55e Mon Sep 17 00:00:00 2001 From: Paul Jackson Date: Sun, 8 Jan 2006 01:01:56 -0800 Subject: [PATCH] cpuset: numa_policy_rebind cleanup Cleanup, reorganize and make more robust the mempolicy.c code to rebind mempolicies relative to the containing cpuset after a tasks memory placement changes. The real motivator for this cleanup patch is to lay more groundwork for the upcoming patch to correctly rebind NUMA mempolicies that are attached to vma's after the containing cpuset memory placement changes. NUMA mempolicies are constrained by the cpuset their task is a member of. When either (1) a task is moved to a different cpuset, or (2) the 'mems' mems_allowed of a cpuset is changed, then the NUMA mempolicies have embedded node numbers (for MPOL_BIND, MPOL_INTERLEAVE and MPOL_PREFERRED) that need to be recalculated, relative to their new cpuset placement. The old code used an unreliable method of determining what was the old mems_allowed constraining the mempolicy. It just looked at the tasks mems_allowed value. This sort of worked with the present code, that just rebinds the -task- mempolicy, and leaves any -vma- mempolicies broken, referring to the old nodes. But in an upcoming patch, the vma mempolicies will be rebound as well. Then the order in which the various task and vma mempolicies are updated will no longer be deterministic, and one can no longer count on the task->mems_allowed holding the old value for as long as needed. It's not even clear if the current code was guaranteed to work reliably for task mempolicies. So I added a mems_allowed field to each mempolicy, stating exactly what mems_allowed the policy is relative to, and updated synchronously and reliably anytime that the mempolicy is rebound. 
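Illustration (not part of the patch): cpuset-relative rebinding amounts to mapping each node in the policy from its position in the old mems_allowed to the node at the same position in the new one. A tiny stand-alone model of that idea — the masks and the preferred node are invented, and the kernel does this with nodemask bitmap helpers rather than arrays.

#include <stdio.h>

int main(void)
{
	int old_mems[] = { 0, 1 };	/* old cpuset placement */
	int new_mems[] = { 4, 5 };	/* new cpuset placement */
	int node = 1;			/* MPOL_PREFERRED node, old numbering */
	int i;

	for (i = 0; i < 2; i++)
		if (old_mems[i] == node)
			node = new_mems[i];	/* keep the relative position */

	printf("policy node rebinds to %d\n", node);	/* prints 5 */
	return 0;
}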
Also removed a useless wrapper routine, numa_policy_rebind(), and had its caller, cpuset_update_task_memory_state(), call directly to the rewritten policy_rebind() routine, and made that rebind routine extern instead of static, and added a "mpol_" prefix to its name, making it mpol_rebind_policy(). Signed-off-by: Paul Jackson Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cpuset.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 0d0dbbd6560a..8f764de3a9e7 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -643,7 +643,7 @@ void cpuset_update_task_memory_state() tsk->cpuset_mems_generation = cs->mems_generation; task_unlock(tsk); up(&callback_sem); - numa_policy_rebind(&oldmem, &tsk->mems_allowed); + mpol_rebind_task(tsk, &tsk->mems_allowed); if (!nodes_equal(oldmem, tsk->mems_allowed)) { if (migrate) { do_migrate_pages(tsk->mm, &oldmem, -- cgit v1.2.2 From 202f72d5d1b5c2c084f63ef996c736d208b447b5 Mon Sep 17 00:00:00 2001 From: Paul Jackson Date: Sun, 8 Jan 2006 01:01:57 -0800 Subject: [PATCH] cpuset: number_of_cpusets optimization Easy little optimization hack to avoid actually having to call cpuset_zone_allowed() and check mems_allowed, in the main page allocation routine, __alloc_pages(). This saves several CPU cycles per page allocation on systems not using cpusets. A counter is updated each time a cpuset is created or removed, and whenever there is only one cpuset in the system, it must be the root cpuset, which contains all CPUs and all Memory Nodes. In that case, when the counter is one, all allocations are allowed. Signed-off-by: Paul Jackson Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cpuset.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 8f764de3a9e7..6004719f26ee 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -56,6 +56,13 @@ #define CPUSET_SUPER_MAGIC 0x27e0eb +/* + * Tracks how many cpusets are currently defined in system. + * When there is only one cpuset (the root cpuset) we can + * short circuit some hooks. + */ +int number_of_cpusets; + /* See "Frequency meter" comments, below. */ struct fmeter { @@ -1664,6 +1671,7 @@ static long cpuset_create(struct cpuset *parent, const char *name, int mode) down(&callback_sem); list_add(&cs->sibling, &cs->parent->children); + number_of_cpusets++; up(&callback_sem); err = cpuset_create_dir(cs, name, mode); @@ -1726,6 +1734,7 @@ static int cpuset_rmdir(struct inode *unused_dir, struct dentry *dentry) spin_unlock(&d->d_lock); cpuset_d_remove_dir(d); dput(d); + number_of_cpusets--; up(&callback_sem); if (list_empty(&parent->children)) check_for_release(parent, &pathbuf); @@ -1769,6 +1778,7 @@ int __init cpuset_init(void) root->d_inode->i_nlink++; top_cpuset.dentry = root; root->d_inode->i_op = &cpuset_dir_inode_operations; + number_of_cpusets = 1; err = cpuset_populate_dir(root); /* memory_pressure_enabled is in root cpuset only */ if (err == 0) @@ -1982,7 +1992,7 @@ static const struct cpuset *nearest_exclusive_ancestor(const struct cpuset *cs) * GFP_USER - only nodes in current tasks mems allowed ok. 
**/ -int cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask) +int __cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask) { int node; /* node that zone z is on */ const struct cpuset *cs; /* current cpuset ancestors */ -- cgit v1.2.2 From 4225399a66b315d4d1fb1cb61b75dda201c832e3 Mon Sep 17 00:00:00 2001 From: Paul Jackson Date: Sun, 8 Jan 2006 01:01:59 -0800 Subject: [PATCH] cpuset: rebind vma mempolicies fix Fix more of a longstanding bug in cpuset/mempolicy interaction. NUMA mempolicies (mm/mempolicy.c) are constrained by the current tasks cpuset to just the Memory Nodes allowed by that cpuset. The kernel maintains internal state for each mempolicy, tracking what nodes are used for the MPOL_INTERLEAVE, MPOL_BIND or MPOL_PREFERRED policies. When a tasks cpuset memory placement changes, whether because the cpuset changed, or because the task was attached to a different cpuset, then the tasks mempolicies have to be rebound to the new cpuset placement, so as to preserve the cpuset-relative numbering of the nodes in that policy. An earlier fix handled such mempolicy rebinding for mempolicies attached to a task. This fix rebinds mempolicies attached to vma's (address ranges in a tasks address space.) Due to the need to hold the task->mm->mmap_sem semaphore while updating vma's, the rebinding of vma mempolicies has to be done when the cpuset memory placement is changed, at which time mmap_sem can be safely acquired. The tasks mempolicy is rebound later, when the task next attempts to allocate memory and notices that its task->cpuset_mems_generation is out-of-date with its cpusets mems_generation. Because walking the tasklist to find all tasks attached to a changing cpuset requires holding tasklist_lock, a spinlock, one cannot update the vma's of the affected tasks while doing the tasklist scan. In general, one cannot acquire a semaphore (which can sleep) while already holding a spinlock (such as tasklist_lock). So a list of mm references has to be built up during the tasklist scan, then the tasklist lock dropped, then for each mm, its mmap_sem acquired, and the vma's in that mm rebound. Once the tasklist lock is dropped, affected tasks may fork new tasks, before their mm's are rebound. A kernel global 'cpuset_being_rebound' is set to point to the cpuset being rebound (there can only be one; cpuset modifications are done under a global 'manage_sem' semaphore), and the mpol_copy code that is used to copy a tasks mempolicies during fork catches such forking tasks, and ensures their children are also rebound. When a task is moved to a different cpuset, it is easier, as there is only one task involved. Its mm->vma's are scanned, using the same mpol_rebind_policy() as used above. It may happen that both the mpol_copy hook and the update done via the tasklist scan update the same mm twice. This is ok, as the mempolicies of each vma in an mm keep track of what mems_allowed they are relative to, and safely no-op a second request to rebind to the same nodes. Signed-off-by: Paul Jackson Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cpuset.c | 90 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 90 insertions(+) (limited to 'kernel') diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 6004719f26ee..19f87565be17 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -812,12 +812,24 @@ static int update_cpumask(struct cpuset *cs, char *buf) } /* + * Handle user request to change the 'mems' memory placement + * of a cpuset.
Needs to validate the request, update the + * cpusets mems_allowed and mems_generation, and for each + * task in the cpuset, rebind any vma mempolicies. + * * Call with manage_sem held. May take callback_sem during call. + * Will take tasklist_lock, scan tasklist for tasks in cpuset cs, + * lock each such tasks mm->mmap_sem, scan its vma's and rebind + * their mempolicies to the cpusets new mems_allowed. */ static int update_nodemask(struct cpuset *cs, char *buf) { struct cpuset trialcs; + struct task_struct *g, *p; + struct mm_struct **mmarray; + int i, n, ntasks; + int fudge; int retval; trialcs = *cs; @@ -839,6 +851,76 @@ static int update_nodemask(struct cpuset *cs, char *buf) cs->mems_generation = atomic_read(&cpuset_mems_generation); up(&callback_sem); + set_cpuset_being_rebound(cs); /* causes mpol_copy() rebind */ + + fudge = 10; /* spare mmarray[] slots */ + fudge += cpus_weight(cs->cpus_allowed); /* imagine one fork-bomb/cpu */ + retval = -ENOMEM; + + /* + * Allocate mmarray[] to hold mm reference for each task + * in cpuset cs. Can't kmalloc GFP_KERNEL while holding + * tasklist_lock. We could use GFP_ATOMIC, but with a + * few more lines of code, we can retry until we get a big + * enough mmarray[] w/o using GFP_ATOMIC. + */ + while (1) { + ntasks = atomic_read(&cs->count); /* guess */ + ntasks += fudge; + mmarray = kmalloc(ntasks * sizeof(*mmarray), GFP_KERNEL); + if (!mmarray) + goto done; + write_lock_irq(&tasklist_lock); /* block fork */ + if (atomic_read(&cs->count) <= ntasks) + break; /* got enough */ + write_unlock_irq(&tasklist_lock); /* try again */ + kfree(mmarray); + } + + n = 0; + + /* Load up mmarray[] with mm reference for each task in cpuset. */ + do_each_thread(g, p) { + struct mm_struct *mm; + + if (n >= ntasks) { + printk(KERN_WARNING + "Cpuset mempolicy rebind incomplete.\n"); + continue; + } + if (p->cpuset != cs) + continue; + mm = get_task_mm(p); + if (!mm) + continue; + mmarray[n++] = mm; + } while_each_thread(g, p); + write_unlock_irq(&tasklist_lock); + + /* + * Now that we've dropped the tasklist spinlock, we can + * rebind the vma mempolicies of each mm in mmarray[] to their + * new cpuset, and release that mm. The mpol_rebind_mm() + * call takes mmap_sem, which we couldn't take while holding + * tasklist_lock. Forks can happen again now - the mpol_copy() + * cpuset_being_rebound check will catch such forks, and rebind + * their vma mempolicies too. Because we still hold the global + * cpuset manage_sem, we know that no other rebind effort will + * be contending for the global variable cpuset_being_rebound. + * It's ok if we rebind the same mm twice; mpol_rebind_mm() + * is idempotent. + */ + for (i = 0; i < n; i++) { + struct mm_struct *mm = mmarray[i]; + + mpol_rebind_mm(mm, &cs->mems_allowed); + mmput(mm); + } + + /* We're done rebinding vma's to this cpusets new mems_allowed. 
+ kfree(mmarray); + set_cpuset_being_rebound(NULL); + retval = 0; done: return retval; } @@ -1011,6 +1093,7 @@ static int attach_task(struct cpuset *cs, char *pidbuf, char **ppathbuf) struct cpuset *oldcs; cpumask_t cpus; nodemask_t from, to; + struct mm_struct *mm; if (sscanf(pidbuf, "%d", &pid) != 1) return -EIO; @@ -1060,6 +1143,13 @@ static int attach_task(struct cpuset *cs, char *pidbuf, char **ppathbuf) to = cs->mems_allowed; up(&callback_sem); + + mm = get_task_mm(tsk); + if (mm) { + mpol_rebind_mm(mm, &to); + mmput(mm); + } + if (is_memory_migrate(cs)) do_migrate_pages(tsk->mm, &from, &to, MPOL_MF_MOVE_ALL); put_task_struct(tsk); -- cgit v1.2.2 From 04c19fa6f16047abff2288ddbc1f0798ede5a849 Mon Sep 17 00:00:00 2001 From: Paul Jackson Date: Sun, 8 Jan 2006 01:02:00 -0800 Subject: [PATCH] cpuset: migrate all tasks in cpuset at once Given the mechanism in the previous patch to handle rebinding the per-vma mempolicies of all tasks in a cpuset that changes its memory placement, it is now easier to handle the page migration requirements of such tasks at the same time. The previous code didn't actually attempt to migrate the pages of the tasks in a cpuset whose memory placement changed until the next time each such task tried to allocate memory. This was undesirable, as users invoking memory page migration expected it to happen when the placement changed, not some unspecified time later when the task needed more memory. It is now trivial to handle the page migration at the same time as the per-vma rebinding is done. The routine cpuset.c:update_nodemask(), which handles changing a cpusets memory placement ('mems'), now checks for the special case of being asked to write a placement that is the same as before. It was harmless enough before to just recompute everything again, even though nothing had changed. But page migration is a heavy weight operation - moving pages about. So now it is worth avoiding that if asked to move a cpuset to its current location. Signed-off-by: Paul Jackson Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cpuset.c | 29 ++++++++++++++++------------- 1 file changed, 16 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 19f87565be17..cf8203a5fa71 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -639,25 +639,14 @@ void cpuset_update_task_memory_state() task_unlock(tsk); if (my_cpusets_mem_gen != tsk->cpuset_mems_generation) { - nodemask_t oldmem = tsk->mems_allowed; - int migrate; - down(&callback_sem); task_lock(tsk); cs = tsk->cpuset; /* Maybe changed when task not locked */ - migrate = is_memory_migrate(cs); guarantee_online_mems(cs, &tsk->mems_allowed); tsk->cpuset_mems_generation = cs->mems_generation; task_unlock(tsk); up(&callback_sem); mpol_rebind_task(tsk, &tsk->mems_allowed); - if (!nodes_equal(oldmem, tsk->mems_allowed)) { - if (migrate) { - do_migrate_pages(tsk->mm, &oldmem, - &tsk->mems_allowed, - MPOL_MF_MOVE_ALL); - } - } } } @@ -815,7 +804,9 @@ static int update_cpumask(struct cpuset *cs, char *buf) * Handle user request to change the 'mems' memory placement * of a cpuset. Needs to validate the request, update the * cpusets mems_allowed and mems_generation, and for each - * task in the cpuset, rebind any vma mempolicies. + * task in the cpuset, rebind any vma mempolicies and if + * the cpuset is marked 'memory_migrate', migrate the tasks + * pages to the new memory. * * Call with manage_sem held. May take callback_sem during call.
* Will take tasklist_lock, scan tasklist for tasks in cpuset cs, @@ -826,9 +817,11 @@ static int update_cpumask(struct cpuset *cs, char *buf) static int update_nodemask(struct cpuset *cs, char *buf) { struct cpuset trialcs; + nodemask_t oldmem; struct task_struct *g, *p; struct mm_struct **mmarray; int i, n, ntasks; + int migrate; int fudge; int retval; @@ -837,6 +830,11 @@ static int update_nodemask(struct cpuset *cs, char *buf) if (retval < 0) goto done; nodes_and(trialcs.mems_allowed, trialcs.mems_allowed, node_online_map); + oldmem = cs->mems_allowed; + if (nodes_equal(oldmem, trialcs.mems_allowed)) { + retval = 0; /* Too easy - nothing to do */ + goto done; + } if (nodes_empty(trialcs.mems_allowed)) { retval = -ENOSPC; goto done; @@ -908,12 +906,17 @@ static int update_nodemask(struct cpuset *cs, char *buf) * cpuset manage_sem, we know that no other rebind effort will * be contending for the global variable cpuset_being_rebound. * It's ok if we rebind the same mm twice; mpol_rebind_mm() - * is idempotent. + * is idempotent. Also migrate pages in each mm to new nodes. */ + migrate = is_memory_migrate(cs); for (i = 0; i < n; i++) { struct mm_struct *mm = mmarray[i]; mpol_rebind_mm(mm, &cs->mems_allowed); + if (migrate) { + do_migrate_pages(mm, &oldmem, &cs->mems_allowed, + MPOL_MF_MOVE_ALL); + } mmput(mm); } -- cgit v1.2.2 From c417f0242ebe578924a30d4e53d35b5059fed4e7 Mon Sep 17 00:00:00 2001 From: Paul Jackson Date: Sun, 8 Jan 2006 01:02:01 -0800 Subject: [PATCH] cpuset: remove test for null cpuset from alloc code path Remove a couple of more lines of code from the cpuset hooks in the page allocation code path. There was a check for a NULL cpuset pointer in the routine cpuset_update_task_memory_state() that was only needed during system boot, after the memory subsystem was initialized, before the cpuset subsystem was initialized, to catch a NULL task->cpuset pointer. Add a cpuset_init_early() routine, just before the mem_init() call in init/main.c, that sets up just enough of the init tasks cpuset structure to render cpuset_update_task_memory_state() calls harmless. Signed-off-by: Paul Jackson Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cpuset.c | 22 ++++++++++++++++------ 1 file changed, 16 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/cpuset.c b/kernel/cpuset.c index cf8203a5fa71..fc949e4a625c 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -603,9 +603,7 @@ static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask) * Do not call this routine if in_interrupt(). * * Call without callback_sem or task_lock() held. May be called - * with or without manage_sem held. Except in early boot or - * an exiting task, when tsk->cpuset is NULL, this routine will - * acquire task_lock(). We don't need to use task_lock to guard + * with or without manage_sem held. 
Doesn't need task_lock to guard * against another task changing a non-NULL cpuset pointer to NULL, * as that is only done by a task on itself, and if the current task * is here, it is not simultaneously in the exit code NULL'ing its @@ -631,9 +629,6 @@ void cpuset_update_task_memory_state() struct task_struct *tsk = current; struct cpuset *cs = tsk->cpuset; - if (unlikely(!cs)) - return; - task_lock(tsk); my_cpusets_mem_gen = cs->mems_generation; task_unlock(tsk); @@ -1836,6 +1831,21 @@ static int cpuset_rmdir(struct inode *unused_dir, struct dentry *dentry) return 0; } +/* + * cpuset_init_early - just enough so that the calls to + * cpuset_update_task_memory_state() in early init code + * are harmless. + */ + +int __init cpuset_init_early(void) +{ + struct task_struct *tsk = current; + + tsk->cpuset = &top_cpuset; + tsk->cpuset->mems_generation = atomic_read(&cpuset_mems_generation); + return 0; +} + /** * cpuset_init - initialize cpusets at system boot * -- cgit v1.2.2 From 6b9c2603ce07f70de9c7a8d335ecd028e8ff11f3 Mon Sep 17 00:00:00 2001 From: Paul Jackson Date: Sun, 8 Jan 2006 01:02:02 -0800 Subject: [PATCH] cpuset: use rcu directly optimization Optimize the cpuset impact on page allocation, the most performance critical cpuset hook in the kernel. On each page allocation, the cpuset hook needs to check for a possible change in the current tasks cpuset. It can now handle the common case, of no change, without taking any spinlock or semaphore, thanks to RCU. Convert a spinlock on the current task to an rcu_read_lock(), saving approximately a memory barrier and an atomic op, depending on architecture. This is done by adding rcu_assign_pointer() and synchronize_rcu() calls to the write side of the task->cpuset pointer, in cpuset.c:attach_task(), to delay freeing up a detached cpuset until after any critical sections referencing that pointer. Thanks to Andi Kleen, Nick Piggin and Eric Dumazet for ideas. Signed-off-by: Paul Jackson Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cpuset.c | 40 ++++++++++++++++++++++++++++++---------- 1 file changed, 30 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/cpuset.c b/kernel/cpuset.c index fc949e4a625c..6fe28d6f282b 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -39,6 +39,7 @@ #include #include #include +#include #include #include #include @@ -248,6 +249,11 @@ static struct super_block *cpuset_sb; * a tasks cpuset pointer we use task_lock(), which acts on a spinlock * (task->alloc_lock) already in the task_struct routinely used for * such matters. + * + * P.S. One more locking exception. RCU is used to guard the + * update of a tasks cpuset pointer by attach_task() and the + * access of task->cpuset->mems_generation via that pointer in + * the routine cpuset_update_task_memory_state(). */ static DECLARE_MUTEX(manage_sem); @@ -610,12 +616,24 @@ static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask) * cpuset pointer. This routine also might acquire callback_sem and * current->mm->mmap_sem during call. * - * The task_lock() is required to dereference current->cpuset safely. - * Without it, we could pick up the pointer value of current->cpuset - * in one instruction, and then attach_task could give us a different - * cpuset, and then the cpuset we had could be removed and freed, - * and then on our next instruction, we could dereference a no longer - * valid cpuset pointer to get its mems_generation field. 
+ * Reading current->cpuset->mems_generation doesn't need task_lock + * to guard the current->cpuset derefence, because it is guarded + * from concurrent freeing of current->cpuset by attach_task(), + * using RCU. + * + * The rcu_dereference() is technically probably not needed, + * as I don't actually mind if I see a new cpuset pointer but + * an old value of mems_generation. However this really only + * matters on alpha systems using cpusets heavily. If I dropped + * that rcu_dereference(), it would save them a memory barrier. + * For all other arch's, rcu_dereference is a no-op anyway, and for + * alpha systems not using cpusets, another planned optimization, + * avoiding the rcu critical section for tasks in the root cpuset + * which is statically allocated, so can't vanish, will make this + * irrelevant. Better to use RCU as intended, than to engage in + * some cute trick to save a memory barrier that is impossible to + * test, for alpha systems using cpusets heavily, which might not + * even exist. * * This routine is needed to update the per-task mems_allowed data, * within the tasks context, when it is trying to allocate memory @@ -627,11 +645,12 @@ void cpuset_update_task_memory_state() { int my_cpusets_mem_gen; struct task_struct *tsk = current; - struct cpuset *cs = tsk->cpuset; + struct cpuset *cs; - task_lock(tsk); + rcu_read_lock(); + cs = rcu_dereference(tsk->cpuset); my_cpusets_mem_gen = cs->mems_generation; - task_unlock(tsk); + rcu_read_unlock(); if (my_cpusets_mem_gen != tsk->cpuset_mems_generation) { down(&callback_sem); @@ -1131,7 +1150,7 @@ static int attach_task(struct cpuset *cs, char *pidbuf, char **ppathbuf) return -ESRCH; } atomic_inc(&cs->count); - tsk->cpuset = cs; + rcu_assign_pointer(tsk->cpuset, cs); task_unlock(tsk); guarantee_online_cpus(cs, &cpus); @@ -1151,6 +1170,7 @@ static int attach_task(struct cpuset *cs, char *pidbuf, char **ppathbuf) if (is_memory_migrate(cs)) do_migrate_pages(tsk->mm, &from, &to, MPOL_MF_MOVE_ALL); put_task_struct(tsk); + synchronize_rcu(); if (atomic_dec_and_test(&oldcs->count)) check_for_release(oldcs, ppathbuf); return 0; -- cgit v1.2.2 From 7edc59628b2f5d6516b4677b3b56f5f056e45cd9 Mon Sep 17 00:00:00 2001 From: Paul Jackson Date: Sun, 8 Jan 2006 01:02:03 -0800 Subject: [PATCH] cpuset: mark number_of_cpusets read_mostly Mark cpuset global 'number_of_cpusets' as __read_mostly. This global is accessed everytime a zone is considered in the zonelist loops beneath __alloc_pages, looking for a free memory page. If number_of_cpusets is just one, then we can short circuit the mems_allowed check. Since this global is read alot on a hot path, and written rarely, it is an excellent candidate for __read_mostly. Thanks to Christoph Lameter for the suggestion. Signed-off-by: Paul Jackson Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cpuset.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 6fe28d6f282b..681a5d58d40d 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -62,7 +62,7 @@ * When there is only one cpuset (the root cpuset) we can * short circuit some hooks. */ -int number_of_cpusets; +int number_of_cpusets __read_mostly; /* See "Frequency meter" comments, below. 
*/ -- cgit v1.2.2 From 03a285f58064b8e0af08383e082e383753d9c33e Mon Sep 17 00:00:00 2001 From: Paul Jackson Date: Sun, 8 Jan 2006 01:02:04 -0800 Subject: [PATCH] cpuset: skip rcu check if task is in root cpuset For systems that aren't using cpusets, but have CONFIG_CPUSET enabled in their kernel (eventually this may be most distribution kernels), this patch removes even the minimal rcu_read_lock() from the memory page allocation path. Actually, it removes that rcu call for any task that is in the root cpuset (top_cpuset), which on systems not actively using cpusets, is all tasks. We don't need the rcu check for tasks in the top_cpuset, because the top_cpuset is statically allocated, so at no risk of being freed out from underneath us. Signed-off-by: Paul Jackson Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cpuset.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 681a5d58d40d..e04c2da9dadb 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -647,10 +647,15 @@ void cpuset_update_task_memory_state() struct task_struct *tsk = current; struct cpuset *cs; - rcu_read_lock(); - cs = rcu_dereference(tsk->cpuset); - my_cpusets_mem_gen = cs->mems_generation; - rcu_read_unlock(); + if (tsk->cpuset == &top_cpuset) { + /* Don't need rcu for top_cpuset. It's never freed. */ + my_cpusets_mem_gen = top_cpuset.mems_generation; + } else { + rcu_read_lock(); + cs = rcu_dereference(tsk->cpuset); + my_cpusets_mem_gen = cs->mems_generation; + rcu_read_unlock(); + } if (my_cpusets_mem_gen != tsk->cpuset_mems_generation) { down(&callback_sem); -- cgit v1.2.2 From 97a41e26124330e41aa10ef88cd1711bc3d17460 Mon Sep 17 00:00:00 2001 From: Adrian Bunk Date: Sun, 8 Jan 2006 01:02:17 -0800 Subject: [PATCH] kernel/: small cleanups This patch contains the following cleanups: - make needlessly global functions static - every file should include the headers containing the prototypes for its global functions Signed-off-by: Adrian Bunk Acked-by: "Paul E. McKenney" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/audit.c | 2 +- kernel/irq/proc.c | 2 ++ kernel/rcutorture.c | 2 +- kernel/timer.c | 1 + 4 files changed, 5 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index 32fa03ad1984..d13ab7d2d899 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -267,7 +267,7 @@ static int audit_set_failure(int state, uid_t loginuid) return old; } -int kauditd_thread(void *dummy) +static int kauditd_thread(void *dummy) { struct sk_buff *skb; diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c index 8a64a4844cde..d03b5eef8ce0 100644 --- a/kernel/irq/proc.c +++ b/kernel/irq/proc.c @@ -10,6 +10,8 @@ #include #include +#include "internals.h" + static struct proc_dir_entry *root_irq_dir, *irq_dir[NR_IRQS]; #ifdef CONFIG_SMP diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c index 49fbbeff201c..36efe088ad81 100644 --- a/kernel/rcutorture.c +++ b/kernel/rcutorture.c @@ -103,7 +103,7 @@ atomic_t n_rcu_torture_error; /* * Allocate an element from the rcu_tortures pool.
*/ -struct rcu_torture * +static struct rcu_torture * rcu_torture_alloc(void) { struct list_head *p; diff --git a/kernel/timer.c b/kernel/timer.c index fd74268d8663..074b4bd5cfd8 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -33,6 +33,7 @@ #include #include #include +#include #include #include -- cgit v1.2.2 From 095975da26dba21698582e91e96be10f7417333f Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Sun, 8 Jan 2006 01:02:19 -0800 Subject: [PATCH] rcu file: use atomic primitives Use atomic_inc_not_zero for rcu files instead of special case rcuref. Signed-off-by: Nick Piggin Cc: "Paul E. McKenney" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/rcupdate.c | 14 -------------- kernel/rcutorture.c | 1 - 2 files changed, 15 deletions(-) (limited to 'kernel') diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c index 0a669bd2f6d1..30b0bba03859 100644 --- a/kernel/rcupdate.c +++ b/kernel/rcupdate.c @@ -46,7 +46,6 @@ #include #include #include -#include #include /* Definition for rcupdate control block. */ @@ -74,19 +73,6 @@ DEFINE_PER_CPU(struct rcu_data, rcu_bh_data) = { 0L }; static DEFINE_PER_CPU(struct tasklet_struct, rcu_tasklet) = {NULL}; static int maxbatch = 10000; -#ifndef __HAVE_ARCH_CMPXCHG -/* - * We use an array of spinlocks for the rcurefs -- similar to ones in sparc - * 32 bit atomic_t implementations, and a hash function similar to that - * for our refcounting needs. - * Can't help multiprocessors which donot have cmpxchg :( - */ - -spinlock_t __rcuref_hash[RCUREF_HASH_SIZE] = { - [0 ... (RCUREF_HASH_SIZE-1)] = SPIN_LOCK_UNLOCKED -}; -#endif - /** * call_rcu - Queue an RCU callback for invocation after a grace period. * @head: structure to be used for queueing the RCU updates. diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c index 36efe088ad81..75174c81529a 100644 --- a/kernel/rcutorture.c +++ b/kernel/rcutorture.c @@ -39,7 +39,6 @@ #include #include #include -#include #include #include #include -- cgit v1.2.2 From 6b9c7ed84837753a436415097063232422e29a35 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sun, 8 Jan 2006 01:02:33 -0800 Subject: [PATCH] use ptrace_get_task_struct in various places The ptrace_get_task_struct() helper that I added as part of the ptrace consolidation is useful in a variety of places that currently opencode it. Switch them to the common helpers. Add a ptrace_traceme() helper that needs to be explicitly called, and simplify the ptrace_get_task_struct() interface. We don't need the request argument now, and we return the task_struct directly, using ERR_PTR() for error returns. It's a bit more code in the callers, but we have two sane routines that do one thing well now. Signed-off-by: Christoph Hellwig Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/ptrace.c | 77 ++++++++++++++++++++++++++++++++++----------------------- 1 file changed, 46 insertions(+), 31 deletions(-) (limited to 'kernel') diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 656476eedb1b..cceaf09ac413 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -408,54 +408,62 @@ int ptrace_request(struct task_struct *child, long request, return ret; } -#ifndef __ARCH_SYS_PTRACE -static int ptrace_get_task_struct(long request, long pid, - struct task_struct **childp) +/** + * ptrace_traceme -- helper for PTRACE_TRACEME + * + * Performs checks and sets PT_PTRACED. + * Should be used by all ptrace implementations for PTRACE_TRACEME.
+ */ +int ptrace_traceme(void) { - struct task_struct *child; int ret; /* - * Callers use child == NULL as an indication to exit early even - * when the return value is 0, so make sure it is non-NULL here. + * Are we already being traced? + */ + if (current->ptrace & PT_PTRACED) + return -EPERM; + ret = security_ptrace(current->parent, current); + if (ret) + return -EPERM; + /* + * Set the ptrace bit in the process ptrace flags. */ - *childp = NULL; + current->ptrace |= PT_PTRACED; + return 0; +} - if (request == PTRACE_TRACEME) { - /* - * Are we already being traced? - */ - if (current->ptrace & PT_PTRACED) - return -EPERM; - ret = security_ptrace(current->parent, current); - if (ret) - return -EPERM; - /* - * Set the ptrace bit in the process ptrace flags. - */ - current->ptrace |= PT_PTRACED; - return 0; - } +/** + * ptrace_get_task_struct -- grab a task struct reference for ptrace + * @pid: process id to grab a task_struct reference of + * + * This function is a helper for ptrace implementations. It checks + * permissions and then grabs a task struct for use of the actual + * ptrace implementation. + * + * Returns the task_struct for @pid or an ERR_PTR() on failure. + */ +struct task_struct *ptrace_get_task_struct(pid_t pid) +{ + struct task_struct *child; /* - * You may not mess with init + * Tracing init is not allowed. */ if (pid == 1) - return -EPERM; + return ERR_PTR(-EPERM); - ret = -ESRCH; read_lock(&tasklist_lock); child = find_task_by_pid(pid); if (child) get_task_struct(child); read_unlock(&tasklist_lock); if (!child) - return -ESRCH; - - *childp = child; - return 0; + return ERR_PTR(-ESRCH); + return child; } +#ifndef __ARCH_SYS_PTRACE asmlinkage long sys_ptrace(long request, long pid, long addr, long data) { struct task_struct *child; @@ -465,9 +473,16 @@ asmlinkage long sys_ptrace(long request, long pid, long addr, long data) * This lock_kernel fixes a subtle race with suid exec */ lock_kernel(); - ret = ptrace_get_task_struct(request, pid, &child); - if (!child) + if (request == PTRACE_TRACEME) { + ret = ptrace_traceme(); goto out; + } + + child = ptrace_get_task_struct(pid); + if (IS_ERR(child)) { + ret = PTR_ERR(child); + goto out; + } if (request == PTRACE_ATTACH) { ret = ptrace_attach(child); -- cgit v1.2.2 From 025510cd20f4c35c3958bea133d96c9bd7c6ef9e Mon Sep 17 00:00:00 2001 From: Guillaume Chazarain Date: Sun, 8 Jan 2006 01:02:41 -0800 Subject: [PATCH] printk return value: fix it What's the true meaning of the printk return value? Should it include the priority prefix length of 3? And what about the timing information? In both cases it was broken: strace -e write echo 1 > /dev/kmsg => write(1, "1\n", 2) = 5 strace -e write echo "<1>1" > /dev/kmsg => write(1, "<1>1\n", 5) = 8 The returned length was "length of input string + 3"; I made it "length of string output to the log buffer". Note that I couldn't find any printk caller in the kernel interested in its return value besides kmsg_write.
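Illustration (not part of the patch): the strace lines above can be reproduced with a small user-space program that reports what write() returns for a prefixed message. The demo harness is hypothetical; root privileges are typically needed to write /dev/kmsg.

#include <stdio.h>
#include <string.h>
#include <fcntl.h>
#include <unistd.h>

int main(void)
{
	const char *msg = "<1>1\n";	/* priority prefix plus payload */
	ssize_t n;
	int fd = open("/dev/kmsg", O_WRONLY);

	if (fd < 0) {
		perror("open /dev/kmsg");
		return 1;
	}
	/* returned 8 before this patch; with it, the count reflects what
	 * actually reached the log buffer */
	n = write(fd, msg, strlen(msg));
	printf("write returned %zd for %zu input bytes\n", n, strlen(msg));
	close(fd);
	return 0;
}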
Signed-off-by: Guillaume Chazarain Acked-By: Tim Bird Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/printk.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/printk.c b/kernel/printk.c index 5287be83e3e7..2251be80cd22 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -569,7 +569,7 @@ asmlinkage int vprintk(const char *fmt, va_list args) p[1] <= '7' && p[2] == '>') { loglev_char = p[1]; p += 3; - printed_len += 3; + printed_len -= 3; } else { loglev_char = default_message_loglevel + '0'; @@ -584,7 +584,7 @@ asmlinkage int vprintk(const char *fmt, va_list args) for (tp = tbuf; tp < tbuf + tlen; tp++) emit_log_char(*tp); - printed_len += tlen - 3; + printed_len += tlen; } else { if (p[0] != '<' || p[1] < '0' || p[1] > '7' || p[2] != '>') { @@ -592,8 +592,8 @@ asmlinkage int vprintk(const char *fmt, va_list args) emit_log_char(default_message_loglevel + '0'); emit_log_char('>'); + printed_len += 3; } - printed_len += 3; } log_level_unknown = 0; if (!*p) -- cgit v1.2.2 From 71fabd5e4835309b4feca6209122ce56c595c461 Mon Sep 17 00:00:00 2001 From: George Anzinger Date: Sun, 8 Jan 2006 01:02:48 -0800 Subject: [PATCH] sigaction should clear all signals on SIG_IGN, not just < 32 While rooting around in the signal code trying to understand how to fix the SIG_IGN ploy (set sig handler to SIG_IGN and flood system with high speed repeating timers) I came across what, I think, is a problem in sigaction() in that when processing a SIG_IGN request it flushes signals from 1 to SIGRTMIN and leaves the rest. Attempt to fix this. Signed-off-by: George Anzinger Cc: Roland McGrath Cc: Linus Torvalds Signed-off-by: Adrian Bunk Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/signal.c | 34 ++++++++++++++++++++++++++++++-- 1 file changed, 32 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index 9b6fda5e87f1..e20724af9b36 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -620,6 +620,33 @@ void signal_wake_up(struct task_struct *t, int resume) kick_process(t); } +/* + * Remove signals in mask from the pending set and queue. + * Returns 1 if any signals were found. + * + * All callers must be holding the siglock. + * + * This version takes a sigset mask and looks at all signals, + * not just those in the first mask word. + */ +static int rm_from_queue_full(sigset_t *mask, struct sigpending *s) +{ + struct sigqueue *q, *n; + sigset_t m; + + sigandsets(&m, mask, &s->signal); + if (sigisemptyset(&m)) + return 0; + + signandsets(&s->signal, &s->signal, mask); + list_for_each_entry_safe(q, n, &s->list, list) { + if (sigismember(mask, q->info.si_signo)) { + list_del_init(&q->list); + __sigqueue_free(q); + } + } + return 1; +} /* * Remove signals in mask from the pending set and queue. * Returns 1 if any signals were found.
@@ -2408,6 +2435,7 @@ int do_sigaction(int sig, const struct k_sigaction *act, struct k_sigaction *oact) { struct k_sigaction *k; + sigset_t mask; if (!valid_signal(sig) || sig < 1 || (act && sig_kernel_only(sig))) return -EINVAL; @@ -2455,9 +2483,11 @@ do_sigaction(int sig, const struct k_sigaction *act, struct k_sigaction *oact) *k = *act; sigdelsetmask(&k->sa.sa_mask, sigmask(SIGKILL) | sigmask(SIGSTOP)); - rm_from_queue(sigmask(sig), &t->signal->shared_pending); + sigemptyset(&mask); + sigaddset(&mask, sig); + rm_from_queue_full(&mask, &t->signal->shared_pending); do { - rm_from_queue(sigmask(sig), &t->pending); + rm_from_queue_full(&mask, &t->pending); recalc_sigpending_tsk(t); t = next_thread(t); } while (t != current); -- cgit v1.2.2 From 676121fcb66c861804e38d94214fd5670a1ef595 Mon Sep 17 00:00:00 2001 From: Ben Collins Date: Sun, 8 Jan 2006 01:03:04 -0800 Subject: [PATCH] Unchecked alloc_percpu() return in __create_workqueue() __create_workqueue() was not checking the return of alloc_percpu(), so a NULL dereference was possible. Signed-off-by: Ben Collins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/workqueue.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 62d47220696a..e72fb6478da6 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -315,6 +315,11 @@ struct workqueue_struct *__create_workqueue(const char *name, return NULL; wq->cpu_wq = alloc_percpu(struct cpu_workqueue_struct); + if (!wq->cpu_wq) { + kfree(wq); + return NULL; + } + wq->name = name; /* We don't need the distraction of CPUs appearing and vanishing. */ lock_cpu_hotplug(); -- cgit v1.2.2 From 0811af28ce49fab963e3e6b8abcf8c460f971cd4 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Sun, 8 Jan 2006 01:03:09 -0800 Subject: [PATCH] kill_proc_info_as_uid: don't use hardcoded constants Use symbolic names instead of hardcoded constants. Signed-off-by: Oleg Nesterov Acked-by: Harald Welte Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/signal.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index e20724af9b36..114cf9209bcd 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -1220,8 +1220,7 @@ int kill_proc_info_as_uid(int sig, struct siginfo *info, pid_t pid, ret = -ESRCH; goto out_unlock; } - if ((!info || ((unsigned long)info != 1 && - (unsigned long)info != 2 && SI_FROMUSER(info))) + if ((info == SEND_SIG_NOINFO || (!is_si_special(info) && SI_FROMUSER(info))) && (euid != p->suid) && (euid != p->uid) && (uid != p->suid) && (uid != p->uid)) { ret = -EPERM; -- cgit v1.2.2 From 485a6435abc3897934ce0dc530e31db93e9296a6 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Sun, 8 Jan 2006 01:03:14 -0800 Subject: [PATCH] little do_group_exit() cleanup zap_other_threads() sets SIGNAL_GROUP_EXIT at the very start, do_group_exit() doesn't need to do it. Signed-off-by: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/exit.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index c73a7eb26de3..a80824f6108b 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -925,7 +925,6 @@ do_group_exit(int exit_code) /* Another thread got here before we took the lock.
*/ exit_code = sig->group_exit_code; else { - sig->flags = SIGNAL_GROUP_EXIT; sig->group_exit_code = exit_code; zap_other_threads(current); } -- cgit v1.2.2 From 86174cdcb44c095ffd2e3ca69caa56244cf701d5 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Sun, 8 Jan 2006 01:03:28 -0800 Subject: [PATCH] remove unneeded sig->curr_target recalculation This patch removes unneeded sig->curr_target recalculation under 'if (atomic_dec_and_test(&sig->count))' in __exit_signal(). When sig->count == 0 the signal can't be sent to this task and next_thread(tsk) == tsk anyway. Signed-off-by: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/signal.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index 114cf9209bcd..08aa5b263f36 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -364,8 +364,6 @@ void __exit_signal(struct task_struct *tsk) posix_cpu_timers_exit(tsk); if (atomic_dec_and_test(&sig->count)) { posix_cpu_timers_exit_group(tsk); - if (tsk == sig->curr_target) - sig->curr_target = next_thread(tsk); tsk->signal = NULL; __exit_sighand(tsk); spin_unlock(&sighand->siglock); -- cgit v1.2.2 From 5160ee6fc891a9ca114be0e90fa6655647bb64b2 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sun, 8 Jan 2006 01:03:32 -0800 Subject: [PATCH] shrink dentry struct Some time ago, the dentry struct was carefully tuned so that on 32 bits UP, sizeof(struct dentry) was exactly 128, i.e. a power of 2, and a multiple of memory cache lines. Then RCU was added and the dentry struct was enlarged by two pointers, with nice results for SMP, but not so good on UP, because it broke the above tuning (128 + 8 = 136 bytes). This patch reverts this unwanted side effect, by using a union (d_u), where d_rcu and d_child are placed so that these two fields can share their memory needs. At the time d_free() is called (and d_rcu is really used), d_child is known to be empty and not touched by the dentry freeing. Lockless lookups only access d_name, d_parent, d_lock, d_op, d_flags (so the previous content of d_child is not needed if said dentry was unhashed but still accessed by a CPU because of RCU constraints). As the dentry cache easily contains millions of entries, a size reduction is worth the extra complexity of the ugly C union. Signed-off-by: Eric Dumazet Cc: Dipankar Sarma Cc: Maneesh Soni Cc: Miklos Szeredi Cc: "Paul E.
McKenney" Cc: Ian Kent Cc: Paul Jackson Cc: Al Viro Cc: Christoph Hellwig Cc: Trond Myklebust Cc: Neil Brown Cc: James Morris Cc: Stephen Smalley Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cpuset.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/cpuset.c b/kernel/cpuset.c index e04c2da9dadb..eab64e23bcae 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -331,7 +331,7 @@ static void cpuset_d_remove_dir(struct dentry *dentry) spin_lock(&dcache_lock); node = dentry->d_subdirs.next; while (node != &dentry->d_subdirs) { - struct dentry *d = list_entry(node, struct dentry, d_child); + struct dentry *d = list_entry(node, struct dentry, d_u.d_child); list_del_init(node); if (d->d_inode) { d = dget_locked(d); @@ -343,7 +343,7 @@ static void cpuset_d_remove_dir(struct dentry *dentry) } node = dentry->d_subdirs.next; } - list_del_init(&dentry->d_child); + list_del_init(&dentry->d_u.d_child); spin_unlock(&dcache_lock); remove_dir(dentry); } -- cgit v1.2.2 From 9841d61d75da5e46ed7a978bed4f50c78b1d87fd Mon Sep 17 00:00:00 2001 From: Dave Jones Date: Sun, 8 Jan 2006 01:03:41 -0800 Subject: [PATCH] Add tainting for proprietary helper modules Kernels that have had Windows drivers loaded into them are undebuggable. I've wasted a number of hours chasing bugs filed in Fedora bugzilla only to find out much later that the user had used such 'helpers', and their problems were unreproducable without them loaded. Acked-by: Arjan van de Ven Signed-off-by: Dave Jones Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/module.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index 4b06bbad49c2..5d9078d6f0fa 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -1715,6 +1715,11 @@ static struct module *load_module(void __user *umod, /* Set up license info based on the info section */ set_license(mod, get_modinfo(sechdrs, infoindex, "license")); + if (strcmp(mod->name, "ndiswrapper") == 0) + add_taint(TAINT_PROPRIETARY_MODULE); + if (strcmp(mod->name, "driverloader") == 0) + add_taint(TAINT_PROPRIETARY_MODULE); + #ifdef CONFIG_MODULE_UNLOAD /* Set up MODINFO_ATTR fields */ setup_modinfo(mod, sechdrs, infoindex); -- cgit v1.2.2 From d84f520348d77e61f936227a048403dbc349fff1 Mon Sep 17 00:00:00 2001 From: Srivatsa Vaddagiri Date: Sun, 8 Jan 2006 01:03:42 -0800 Subject: [PATCH] Extend RCU torture module to test tickless idle CPU This patch forces RCU torture threads off various CPUs in the system allowing them to become idle and go tickless. Meant to test support for such tickless idle CPU in RCU. Signed-off-by: Srivatsa Vaddagiri Cc: Dipankar Sarma Cc: "Paul E. McKenney" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/rcutorture.c | 96 ++++++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 91 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c index 75174c81529a..773219907dd8 100644 --- a/kernel/rcutorture.c +++ b/kernel/rcutorture.c @@ -48,9 +48,11 @@ MODULE_LICENSE("GPL"); static int nreaders = -1; /* # reader threads, defaults to 4*ncpus */ -static int stat_interval = 0; /* Interval between stats, in seconds. */ +static int stat_interval; /* Interval between stats, in seconds. */ /* Defaults to "only at end of test". */ -static int verbose = 0; /* Print more debug info. */ +static int verbose; /* Print more debug info. 
*/ +static int test_no_idle_hz; /* Test RCU's support for tickless idle CPUs. */ +static int shuffle_interval = 5; /* Interval between shuffles (in sec)*/ MODULE_PARM(nreaders, "i"); MODULE_PARM_DESC(nreaders, "Number of RCU reader threads"); @@ -58,6 +60,10 @@ MODULE_PARM(stat_interval, "i"); MODULE_PARM_DESC(stat_interval, "Number of seconds between stats printk()s"); MODULE_PARM(verbose, "i"); MODULE_PARM_DESC(verbose, "Enable verbose debugging printk()s"); +MODULE_PARM(test_no_idle_hz, "i"); +MODULE_PARM_DESC(test_no_idle_hz, "Test support for tickless idle CPUs"); +MODULE_PARM(shuffle_interval, "i"); +MODULE_PARM_DESC(shuffle_interval, "Number of seconds between shuffles"); #define TORTURE_FLAG "rcutorture: " #define PRINTK_STRING(s) \ do { printk(KERN_ALERT TORTURE_FLAG s "\n"); } while (0) @@ -72,6 +78,7 @@ static int nrealreaders; static struct task_struct *writer_task; static struct task_struct **reader_tasks; static struct task_struct *stats_task; +static struct task_struct *shuffler_task; #define RCU_TORTURE_PIPE_LEN 10 @@ -375,12 +382,77 @@ rcu_torture_stats(void *arg) return 0; } +static int rcu_idle_cpu; /* Force all torture tasks off this CPU */ + +/* Shuffle tasks such that we allow @rcu_idle_cpu to become idle. A special case + * is when @rcu_idle_cpu = -1, when we allow the tasks to run on all CPUs. + */ +void rcu_torture_shuffle_tasks(void) +{ + cpumask_t tmp_mask = CPU_MASK_ALL; + int i; + + lock_cpu_hotplug(); + + /* No point in shuffling if there is only one online CPU (ex: UP) */ + if (num_online_cpus() == 1) { + unlock_cpu_hotplug(); + return; + } + + if (rcu_idle_cpu != -1) + cpu_clear(rcu_idle_cpu, tmp_mask); + + set_cpus_allowed(current, tmp_mask); + + if (reader_tasks != NULL) { + for (i = 0; i < nrealreaders; i++) + if (reader_tasks[i]) + set_cpus_allowed(reader_tasks[i], tmp_mask); + } + + if (writer_task) + set_cpus_allowed(writer_task, tmp_mask); + + if (stats_task) + set_cpus_allowed(stats_task, tmp_mask); + + if (rcu_idle_cpu == -1) + rcu_idle_cpu = num_online_cpus() - 1; + else + rcu_idle_cpu--; + + unlock_cpu_hotplug(); +} + +/* Shuffle tasks across CPUs, with the intent of allowing each CPU in the + * system to become idle at a time and cut off its timer ticks. This is meant + * to test the support for such tickless idle CPU in RCU. + */ +static int +rcu_torture_shuffle(void *arg) +{ + VERBOSE_PRINTK_STRING("rcu_torture_shuffle task started"); + do { + schedule_timeout_interruptible(shuffle_interval * HZ); + rcu_torture_shuffle_tasks(); + } while (!kthread_should_stop()); + VERBOSE_PRINTK_STRING("rcu_torture_shuffle task stopping"); + return 0; +} + static void rcu_torture_cleanup(void) { int i; fullstop = 1; + if (shuffler_task != NULL) { + VERBOSE_PRINTK_STRING("Stopping rcu_torture_shuffle task"); + kthread_stop(shuffler_task); + } + shuffler_task = NULL; + if (writer_task != NULL) { VERBOSE_PRINTK_STRING("Stopping rcu_torture_writer task"); kthread_stop(writer_task); @@ -429,9 +501,11 @@ rcu_torture_init(void) nrealreaders = nreaders; else nrealreaders = 2 * num_online_cpus(); - printk(KERN_ALERT TORTURE_FLAG - "--- Start of test: nreaders=%d stat_interval=%d verbose=%d\n", - nrealreaders, stat_interval, verbose); + printk(KERN_ALERT TORTURE_FLAG "--- Start of test: nreaders=%d " + "stat_interval=%d verbose=%d test_no_idle_hz=%d " + "shuffle_interval = %d\n", + nrealreaders, stat_interval, verbose, test_no_idle_hz, + shuffle_interval); fullstop = 0; /* Set up the freelist. 
*/ @@ -501,6 +575,18 @@ rcu_torture_init(void) goto unwind; } } + if (test_no_idle_hz) { + rcu_idle_cpu = num_online_cpus() - 1; + /* Create the shuffler thread */ + shuffler_task = kthread_run(rcu_torture_shuffle, NULL, + "rcu_torture_shuffle"); + if (IS_ERR(shuffler_task)) { + firsterr = PTR_ERR(shuffler_task); + VERBOSE_PRINTK_ERRSTRING("Failed to create shuffler"); + shuffler_task = NULL; + goto unwind; + } + } return 0; unwind: -- cgit v1.2.2 From 5e38291d80086f6972f471c7caffa03184de0bf0 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Sun, 8 Jan 2006 01:03:46 -0800 Subject: [PATCH] Don't attempt to power off if power off is not implemented The problem. It is expected that /sbin/halt -p works exactly like /sbin/halt, when the kernel does not implement power off functionality. The kernel can do a lot of work in the reboot notifiers and in device_shutdown before we even get to machine_power_off. Some of that shutdown is not safe if you are leaving the power on, and it definitely gets in the way of using sysrq or pressing ctrl-alt-del. Since the shutdown happens in generic code there is no way to fix this in architecture specific code :( Some machines are kernel oopsing today because of this. The simple solution is to turn LINUX_REBOOT_CMD_POWER_OFF into LINUX_REBOOT_CMD_HALT if power_off functionality is not implemented. This has the unfortunate side effect of disabling the power off functionality on architectures that leave pm_power_off to null and still implement something in machine_power_off. And it will break the build on some architectures that don't have a pm_power_off variable at all. On both counts I say tough. For architectures like alpha that don't implement the pm_power_off variable pm_power_off is declared in linux/pm.h and it is a generic part of our power management code, and all architectures should implement it. For architectures like parisc that have a default power off method in machine_power_off if pm_power_off is not implemented or fails. It is easy enough to set the pm_power_off variable. And nothing bad happens there, the machines just stop powering off. The current semantics are impossible without a flag at the top level so we can avoid the problem code if a power off is not implemented. pm_power_off is as good a flag as any with the bonus that it works without modification on at least x86, x86_64, powerpc, and ppc today. Andrew can you pick this up and put this in the mm tree. Kernels that don't compile or don't power off seem saner than kernels that oops or panic. Until we get the arch specific patches for the problem architectures this probably isn't smart to push into the stable kernel. Unfortunately I don't have the time at the moment to walk through every architecture and make them work. And even if I did I couldn't test it :( From: Hirokazu Takata Add pm_power_off() for build fix of arch/m32r/kernel/process.c. From: Miklos Szeredi UML build fix Signed-off-by: Eric W. 
Biederman Signed-off-by: Hayato Fujiwara Signed-off-by: Hirokazu Takata Signed-off-by: Miklos Szeredi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sys.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'kernel') diff --git a/kernel/sys.c b/kernel/sys.c index eecf84526afe..d8e49e659027 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -489,6 +489,12 @@ asmlinkage long sys_reboot(int magic1, int magic2, unsigned int cmd, void __user magic2 != LINUX_REBOOT_MAGIC2C)) return -EINVAL; + /* Instead of trying to make the power_off code look like + * halt when pm_power_off is not set do it the easy way. + */ + if ((cmd == LINUX_REBOOT_CMD_POWER_OFF) && !pm_power_off) + cmd = LINUX_REBOOT_CMD_HALT; + lock_kernel(); switch (cmd) { case LINUX_REBOOT_CMD_RESTART: -- cgit v1.2.2 From 9a5d3023e626a0baf86ac6b892c983b3db13f22b Mon Sep 17 00:00:00 2001 From: Oren Laadan Date: Sun, 8 Jan 2006 01:03:51 -0800 Subject: [PATCH] fork: fix race in setting child's pgrp and tty In fork, the child should recopy the parent's pgrp/tty after it has taken tasklist_lock. Otherwise, following a setpgid() on the parent *after* copy_signal(), the child will own a stale pgrp (which may be reused), e.g. if copy_mm() sleeps a long while due to memory pressure. Similar issue for the tty. Signed-off-by: Oren Laadan Cc: Oleg Nesterov Cc: Roland McGrath Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/fork.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 7992ee759d89..4bc0bd8ef176 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -811,9 +811,6 @@ static inline int copy_signal(unsigned long clone_flags, struct task_struct * ts sig->it_prof_expires = cputime_zero; sig->it_prof_incr = cputime_zero; - sig->tty = current->signal->tty; - sig->pgrp = process_group(current); - sig->session = current->signal->session; sig->leader = 0; /* session leadership doesn't inherit */ sig->tty_old_pgrp = 0; @@ -1136,15 +1133,15 @@ static task_t *copy_process(unsigned long clone_flags, attach_pid(p, PIDTYPE_PID, p->pid); attach_pid(p, PIDTYPE_TGID, p->tgid); if (thread_group_leader(p)) { + p->signal->tty = current->signal->tty; + p->signal->pgrp = process_group(current); + p->signal->session = current->signal->session; attach_pid(p, PIDTYPE_PGID, process_group(p)); attach_pid(p, PIDTYPE_SID, p->signal->session); if (p->pid) __get_cpu_var(process_counts)++; } - if (!current->signal->tty && p->signal->tty) - p->signal->tty = NULL; - nr_threads++; total_forks++; write_unlock_irq(&tasklist_lock); -- cgit v1.2.2 From ee0acf90d320c29916ba8c5c1b2e908d81f5057d Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Sun, 8 Jan 2006 01:03:53 -0800 Subject: [PATCH] setpgid: should work for sub-threads setpgid(0, pgid) or setpgid(forked_child_pid, pgid) does not work unless the calling process is a thread_group_leader(). 'man setpgid' does not say anything about that, so I consider this behaviour a bug.
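A hypothetical user-space illustration of the change (the program is not from the patch): before it, pid 0 resolved to the calling thread's own pid, so a sub-thread calling setpgid(0, 0) failed with -EINVAL:

        #include <pthread.h>
        #include <stdio.h>
        #include <unistd.h>

        static void *worker(void *unused)
        {
                /* pid 0 means the calling process; with this change it
                 * resolves to the thread group leader even here. */
                if (setpgid(0, 0) < 0)
                        perror("setpgid from sub-thread");
                else
                        printf("setpgid ok, pgrp is now %d\n", getpgrp());
                return NULL;
        }

        int main(void)
        {
                pthread_t t;

                if (pthread_create(&t, NULL, worker, NULL))
                        return 1;
                pthread_join(t, NULL);
                return 0;
        }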
Signed-off-by: Oleg Nesterov Cc: Oren Laadan Cc: Roland McGrath Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sys.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/sys.c b/kernel/sys.c index d8e49e659027..43e557211d8d 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -1090,10 +1090,11 @@ asmlinkage long sys_times(struct tms __user * tbuf) asmlinkage long sys_setpgid(pid_t pid, pid_t pgid) { struct task_struct *p; + struct task_struct *group_leader = current->group_leader; int err = -EINVAL; if (!pid) - pid = current->pid; + pid = group_leader->pid; if (!pgid) pgid = pid; if (pgid < 0) @@ -1113,16 +1114,16 @@ asmlinkage long sys_setpgid(pid_t pid, pid_t pgid) if (!thread_group_leader(p)) goto out; - if (p->parent == current || p->real_parent == current) { + if (p->parent == current || p->real_parent == group_leader) { err = -EPERM; - if (p->signal->session != current->signal->session) + if (p->signal->session != group_leader->signal->session) goto out; err = -EACCES; if (p->did_exec) goto out; } else { err = -ESRCH; - if (p != current) + if (p != group_leader) goto out; } @@ -1134,7 +1135,7 @@ asmlinkage long sys_setpgid(pid_t pid, pid_t pgid) struct task_struct *p; do_each_task_pid(pgid, PIDTYPE_PGID, p) { - if (p->signal->session == current->signal->session) + if (p->signal->session == group_leader->signal->session) goto ok_pgid; } while_each_task_pid(pgid, PIDTYPE_PGID, p); goto out; -- cgit v1.2.2 From e19f247a3dbd85485ec13174817ae9c2478fe541 Mon Sep 17 00:00:00 2001 From: Oren Laadan Date: Sun, 8 Jan 2006 01:03:58 -0800 Subject: [PATCH] setsid: should work for sub-threads setsid() does not work unless the calling process is a thread_group_leader(). 'man setsid' does not say anything about that, so I consider this behaviour a bug.
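The companion illustration for setsid() (again hypothetical; note that setsid() still fails with -EPERM when the group leader already leads a process group, so the sketch runs in a freshly forked child):

        #include <pthread.h>
        #include <stdio.h>
        #include <stdlib.h>
        #include <sys/wait.h>
        #include <unistd.h>

        static void *worker(void *unused)
        {
                pid_t sid = setsid();   /* now acts on the whole thread group */

                if (sid < 0)
                        perror("setsid from sub-thread");
                else
                        printf("new session %d led by pid %d\n", sid, getpid());
                return NULL;
        }

        int main(void)
        {
                if (fork() == 0) {      /* child: not yet a pgrp leader */
                        pthread_t t;

                        pthread_create(&t, NULL, worker, NULL);
                        pthread_join(t, NULL);
                        exit(0);
                }
                wait(NULL);
                return 0;
        }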
Signed-off-by: Oren Laadan Cc: Oleg Nesterov Cc: Roland McGrath Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/exit.c | 2 +- kernel/sys.c | 16 +++++++--------- 2 files changed, 8 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index a80824f6108b..caceabf3f230 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -257,7 +257,7 @@ static inline void reparent_to_init(void) void __set_special_pids(pid_t session, pid_t pgrp) { - struct task_struct *curr = current; + struct task_struct *curr = current->group_leader; if (curr->signal->session != session) { detach_pid(curr, PIDTYPE_SID); diff --git a/kernel/sys.c b/kernel/sys.c index 43e557211d8d..f497bf56ada1 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -1215,24 +1215,22 @@ asmlinkage long sys_getsid(pid_t pid) asmlinkage long sys_setsid(void) { + struct task_struct *group_leader = current->group_leader; struct pid *pid; int err = -EPERM; - if (!thread_group_leader(current)) - return -EINVAL; - down(&tty_sem); write_lock_irq(&tasklist_lock); - pid = find_pid(PIDTYPE_PGID, current->pid); + pid = find_pid(PIDTYPE_PGID, group_leader->pid); if (pid) goto out; - current->signal->leader = 1; - __set_special_pids(current->pid, current->pid); - current->signal->tty = NULL; - current->signal->tty_old_pgrp = 0; - err = process_group(current); + group_leader->signal->leader = 1; + __set_special_pids(group_leader->pid, group_leader->pid); + group_leader->signal->tty = NULL; + group_leader->signal->tty_old_pgrp = 0; + err = process_group(group_leader); out: write_unlock_irq(&tasklist_lock); up(&tty_sem); -- cgit v1.2.2 From f7dd795e913656c390b6bde27790c518973feea1 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Sun, 8 Jan 2006 01:03:59 -0800 Subject: [PATCH] setpgid: should not accept ptraced children sys_setpgid() allows changing the ->pgrp of ptraced children. 'man setpgid' does not say anything about that, so I consider this behaviour a bug. Signed-off-by: Oleg Nesterov Cc: Oren Laadan Cc: Roland McGrath Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sys.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sys.c b/kernel/sys.c index f497bf56ada1..218937e837dc 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -1114,7 +1114,7 @@ asmlinkage long sys_setpgid(pid_t pid, pid_t pgid) if (!thread_group_leader(p)) goto out; - if (p->parent == current || p->real_parent == group_leader) { + if (p->real_parent == group_leader) { err = -EPERM; if (p->signal->session != group_leader->signal->session) goto out; -- cgit v1.2.2 From fe7d37d1fbf8ffe78abd72728b24fb0c64f7af55 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Sun, 8 Jan 2006 01:04:02 -0800 Subject: [PATCH] copy_process: error path cleanup This patch moves 'fork_out:' under 'bad_fork_free:', and removes the now-unneeded 'if (retval)' check.
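The error-path shape the cleanup converges on can be sketched in isolation (illustrative only; struct widget and widget_init() are made-up names): unwind labels run in reverse order of acquisition, and a single exit translates retval.

        #include <linux/slab.h>
        #include <linux/err.h>
        #include <linux/errno.h>

        struct widget {
                int id;
        };

        static int widget_init(struct widget *w)        /* hypothetical helper */
        {
                w->id = 0;
                return 0;
        }

        static struct widget *widget_create(void)
        {
                struct widget *w;
                int retval = -ENOMEM;

                w = kmalloc(sizeof(*w), GFP_KERNEL);
                if (!w)
                        goto fork_out;          /* nothing to undo yet */
                retval = widget_init(w);
                if (retval)
                        goto bad_free;          /* undo in reverse order */
                return w;                       /* success returns directly */

        bad_free:
                kfree(w);
        fork_out:
                return ERR_PTR(retval);         /* single retval translation */
        }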
Signed-off-by: Oleg Nesterov Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/fork.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 4bc0bd8ef176..72e3252c6763 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1146,11 +1146,6 @@ static task_t *copy_process(unsigned long clone_flags, total_forks++; write_unlock_irq(&tasklist_lock); proc_fork_connector(p); - retval = 0; - -fork_out: - if (retval) - return ERR_PTR(retval); return p; bad_fork_cleanup_namespace: @@ -1191,7 +1186,8 @@ bad_fork_cleanup_count: free_uid(p->user); bad_fork_free: free_task(p); - goto fork_out; +fork_out: + return ERR_PTR(retval); } struct pt_regs * __devinit __attribute__((weak)) idle_regs(struct pt_regs *regs) -- cgit v1.2.2 From eea8b54dc0dceb740da697a89c54d20dde340306 Mon Sep 17 00:00:00 2001 From: Ashutosh Naik Date: Sun, 8 Jan 2006 01:04:25 -0800 Subject: [PATCH] modules: prevent overriding of symbols Ensure that an exported symbol does not already exist in the kernel or in some other module's exported symbol table. This is done by checking the symbol tables for the exported symbol at the time of loading the module. Currently this is done after the relocation of the symbol. Signed-off-by: Ashutosh Naik Signed-off-by: Anand Krishnan Cc: Rusty Russell Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/module.c | 39 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 39 insertions(+) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index 5d9078d6f0fa..fb6f091f5940 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -1204,6 +1204,39 @@ void *__symbol_get(const char *symbol) } EXPORT_SYMBOL_GPL(__symbol_get); +/* + * Ensure that an exported symbol [global namespace] does not already exist + * in the Kernel or in some other modules exported symbol table. + */ +static int verify_export_symbols(struct module *mod) +{ + const char *name = NULL; + unsigned long i, ret = 0; + struct module *owner; + const unsigned long *crc; + + for (i = 0; i < mod->num_syms; i++) + if (__find_symbol(mod->syms[i].name, &owner, &crc, 1)) { + name = mod->syms[i].name; + ret = -ENOEXEC; + goto dup; + } + + for (i = 0; i < mod->num_gpl_syms; i++) + if (__find_symbol(mod->gpl_syms[i].name, &owner, &crc, 1)) { + name = mod->gpl_syms[i].name; + ret = -ENOEXEC; + goto dup; + } + +dup: + if (ret) + printk(KERN_ERR "%s: exports duplicate symbol %s (owned by %s)\n", + mod->name, name, module_name(owner)); + + return ret; +} + /* Change all symbols so that sh_value encodes the pointer directly. */ static int simplify_symbols(Elf_Shdr *sechdrs, unsigned int symindex, @@ -1772,6 +1805,12 @@ static struct module *load_module(void __user *umod, goto cleanup; } + /* Find duplicate symbols */ + err = verify_export_symbols(mod); + + if (err < 0) + goto cleanup; + /* Set up and sort exception table */ mod->num_exentries = sechdrs[exindex].sh_size / sizeof(*mod->extable); mod->extable = extable = (void *)sechdrs[exindex].sh_addr; -- cgit v1.2.2 From fb1697933a03ec47d794b38e2a4e3ccc2463fd22 Mon Sep 17 00:00:00 2001 From: Akinobu Mita Date: Sun, 8 Jan 2006 01:04:29 -0800 Subject: [PATCH] modules: mark TAINT_FORCED_RMMOD correctly Currently TAINT_FORCED_RMMOD is totally unused. Because it is marked as TAINT_FORCED_MODULE instead when user forced a module unload. 
This patch marks it correctly Signed-off-by: Akinobu Mita Cc: Rusty Russell Cc: Dave Jones Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/module.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index fb6f091f5940..f368812ac0e8 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -496,15 +496,15 @@ static void module_unload_free(struct module *mod) } #ifdef CONFIG_MODULE_FORCE_UNLOAD -static inline int try_force(unsigned int flags) +static inline int try_force_unload(unsigned int flags) { int ret = (flags & O_TRUNC); if (ret) - add_taint(TAINT_FORCED_MODULE); + add_taint(TAINT_FORCED_RMMOD); return ret; } #else -static inline int try_force(unsigned int flags) +static inline int try_force_unload(unsigned int flags) { return 0; } @@ -524,7 +524,7 @@ static int __try_stop_module(void *_sref) /* If it's not unused, quit unless we are told to block. */ if ((sref->flags & O_NONBLOCK) && module_refcount(sref->mod) != 0) { - if (!(*sref->forced = try_force(sref->flags))) + if (!(*sref->forced = try_force_unload(sref->flags))) return -EWOULDBLOCK; } @@ -609,7 +609,7 @@ sys_delete_module(const char __user *name_user, unsigned int flags) /* If it has an init func, it must have an exit func to unload */ if ((mod->init != NULL && mod->exit == NULL) || mod->unsafe) { - forced = try_force(flags); + forced = try_force_unload(flags); if (!forced) { /* This module can't be removed */ ret = -EBUSY; -- cgit v1.2.2 From eb46996f90a0826921f1a0d81535537a9c7f91b0 Mon Sep 17 00:00:00 2001 From: Ashutosh Naik Date: Sun, 8 Jan 2006 01:04:37 -0800 Subject: [PATCH] kernel/module.c: remove redundant spinlock in resolve_symbol() Remove the redundant spinlock in the function resolve_symbol() as we are not altering the module list, and we already hold the semaphore. Signed-off-by: Ashutosh Naik Cc: Rusty Russell Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/module.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index f368812ac0e8..e4276046a1b6 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -958,7 +958,6 @@ static unsigned long resolve_symbol(Elf_Shdr *sechdrs, unsigned long ret; const unsigned long *crc; - spin_lock_irq(&modlist_lock); ret = __find_symbol(name, &owner, &crc, mod->license_gplok); if (ret) { /* use_module can fail due to OOM, or module unloading */ @@ -966,7 +965,6 @@ static unsigned long resolve_symbol(Elf_Shdr *sechdrs, !use_module(mod, owner)) ret = 0; } - spin_unlock_irq(&modlist_lock); return ret; } -- cgit v1.2.2 From f756d5e256059018d753f0ba79980ebeb87a1bc0 Mon Sep 17 00:00:00 2001 From: Nathan Lynch Date: Sun, 8 Jan 2006 01:05:12 -0800 Subject: [PATCH] fix workqueue oops during cpu offline Use first_cpu(cpu_possible_map) for the single-thread workqueue case. We used to hardcode 0, but that broke on systems where !cpu_possible(0) when workqueue_struct->cpu_workqueue_struct was changed from a static array to alloc_percpu. Commit id bce61dd49d6ba7799be2de17c772e4c701558f14 ("Fix hardcoded cpu=0 in workqueue for per_cpu_ptr() calls") fixed that for Ben's funky sparc64 system, but it regressed my Power5. Offlining cpu 0 oopses upon the next call to queue_work for a single-thread workqueue, because now we try to manipulate per_cpu_ptr(wq->cpu_wq, 1), which is uninitialized. So we need to establish an unchanging "slot" for single-thread workqueues which will have a valid percpu allocation. 
Since alloc_percpu keys off of cpu_possible_map, which must not change after initialization, make this slot == first_cpu(cpu_possible_map). Signed-off-by: Nathan Lynch Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/workqueue.c | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index e72fb6478da6..82c4fa70595c 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -29,7 +29,8 @@ #include /* - * The per-CPU workqueue (if single thread, we always use cpu 0's). + * The per-CPU workqueue (if single thread, we always use the first + * possible cpu). * * The sequence counters are for flush_scheduled_work(). It wants to wait * until until all currently-scheduled works are completed, but it doesn't @@ -69,6 +70,8 @@ struct workqueue_struct { static DEFINE_SPINLOCK(workqueue_lock); static LIST_HEAD(workqueues); +static int singlethread_cpu; + /* If it's single threaded, it isn't in the list of workqueues. */ static inline int is_single_threaded(struct workqueue_struct *wq) { @@ -102,7 +105,7 @@ int fastcall queue_work(struct workqueue_struct *wq, struct work_struct *work) if (!test_and_set_bit(0, &work->pending)) { if (unlikely(is_single_threaded(wq))) - cpu = any_online_cpu(cpu_online_map); + cpu = singlethread_cpu; BUG_ON(!list_empty(&work->entry)); __queue_work(per_cpu_ptr(wq->cpu_wq, cpu), work); ret = 1; @@ -118,7 +121,7 @@ static void delayed_work_timer_fn(unsigned long __data) int cpu = smp_processor_id(); if (unlikely(is_single_threaded(wq))) - cpu = any_online_cpu(cpu_online_map); + cpu = singlethread_cpu; __queue_work(per_cpu_ptr(wq->cpu_wq, cpu), work); } @@ -267,7 +270,7 @@ void fastcall flush_workqueue(struct workqueue_struct *wq) if (is_single_threaded(wq)) { /* Always use first cpu's area. */ - flush_cpu_workqueue(per_cpu_ptr(wq->cpu_wq, any_online_cpu(cpu_online_map))); + flush_cpu_workqueue(per_cpu_ptr(wq->cpu_wq, singlethread_cpu)); } else { int cpu; @@ -325,7 +328,7 @@ struct workqueue_struct *__create_workqueue(const char *name, lock_cpu_hotplug(); if (singlethread) { INIT_LIST_HEAD(&wq->list); - p = create_workqueue_thread(wq, any_online_cpu(cpu_online_map)); + p = create_workqueue_thread(wq, singlethread_cpu); if (!p) destroy = 1; else @@ -379,7 +382,7 @@ void destroy_workqueue(struct workqueue_struct *wq) /* We don't need the distraction of CPUs appearing and vanishing. */ lock_cpu_hotplug(); if (is_single_threaded(wq)) - cleanup_workqueue_thread(wq, any_online_cpu(cpu_online_map)); + cleanup_workqueue_thread(wq, singlethread_cpu); else { for_each_online_cpu(cpu) cleanup_workqueue_thread(wq, cpu); @@ -567,6 +570,7 @@ static int __devinit workqueue_cpu_callback(struct notifier_block *nfb, void init_workqueues(void) { + singlethread_cpu = first_cpu(cpu_possible_map); hotcpu_notifier(workqueue_cpu_callback, 0); keventd_wq = create_workqueue("events"); BUG_ON(!keventd_wq); -- cgit v1.2.2 From 0f59cc4a35dbbc45c972daad0f1b063380cd9ea4 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Sun, 8 Jan 2006 01:05:15 -0800 Subject: [PATCH] simplify k_getrusage() Factor out common code for different RUSAGE_xxx cases. Don't take ->sighand->siglock in RUSAGE_SELF case, suggested by Ravikiran G Thirumalai . 
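The control flow of the factored k_getrusage() can be shown with a standalone sketch (simplified to one counter; the _X enumerators are stand-ins, not the kernel constants): RUSAGE_BOTH falls through from the children's totals into the self accumulation.

        #include <stdio.h>

        enum { RUSAGE_SELF_X, RUSAGE_CHILDREN_X, RUSAGE_BOTH_X };

        static long usage_sum(int who, long self, long children)
        {
                long total = 0;

                switch (who) {
                case RUSAGE_BOTH_X:
                case RUSAGE_CHILDREN_X:
                        total += children;      /* reaped children's totals */
                        if (who == RUSAGE_CHILDREN_X)
                                break;
                        /* fall through for RUSAGE_BOTH */
                case RUSAGE_SELF_X:
                        total += self;          /* the caller's own totals */
                        break;
                }
                return total;
        }

        int main(void)
        {
                printf("%ld %ld %ld\n",
                       usage_sum(RUSAGE_SELF_X, 3, 5),
                       usage_sum(RUSAGE_CHILDREN_X, 3, 5),
                       usage_sum(RUSAGE_BOTH_X, 3, 5));
                return 0;
        }

Run, this prints 3, 5 and 8: SELF counts only the caller, CHILDREN only the reaped children, and BOTH the sum, with the shared code written once.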
Signed-off-by: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sys.c | 29 +++++++++++------------------ 1 file changed, 11 insertions(+), 18 deletions(-) (limited to 'kernel') diff --git a/kernel/sys.c b/kernel/sys.c index 218937e837dc..b6941e06d5d5 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -1692,7 +1692,10 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r) if (unlikely(!p->signal)) return; + utime = stime = cputime_zero; + switch (who) { + case RUSAGE_BOTH: case RUSAGE_CHILDREN: spin_lock_irqsave(&p->sighand->siglock, flags); utime = p->signal->cutime; @@ -1702,22 +1705,11 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r) r->ru_minflt = p->signal->cmin_flt; r->ru_majflt = p->signal->cmaj_flt; spin_unlock_irqrestore(&p->sighand->siglock, flags); - cputime_to_timeval(utime, &r->ru_utime); - cputime_to_timeval(stime, &r->ru_stime); - break; + + if (who == RUSAGE_CHILDREN) + break; + case RUSAGE_SELF: - spin_lock_irqsave(&p->sighand->siglock, flags); - utime = stime = cputime_zero; - goto sum_group; - case RUSAGE_BOTH: - spin_lock_irqsave(&p->sighand->siglock, flags); - utime = p->signal->cutime; - stime = p->signal->cstime; - r->ru_nvcsw = p->signal->cnvcsw; - r->ru_nivcsw = p->signal->cnivcsw; - r->ru_minflt = p->signal->cmin_flt; - r->ru_majflt = p->signal->cmaj_flt; - sum_group: utime = cputime_add(utime, p->signal->utime); stime = cputime_add(stime, p->signal->stime); r->ru_nvcsw += p->signal->nvcsw; @@ -1734,13 +1726,14 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r) r->ru_majflt += t->maj_flt; t = next_thread(t); } while (t != p); - spin_unlock_irqrestore(&p->sighand->siglock, flags); - cputime_to_timeval(utime, &r->ru_utime); - cputime_to_timeval(stime, &r->ru_stime); break; + default: BUG(); } + + cputime_to_timeval(utime, &r->ru_utime); + cputime_to_timeval(stime, &r->ru_stime); } int getrusage(struct task_struct *p, int who, struct rusage __user *ru) -- cgit v1.2.2 From e585e47031751f4e393e10ffd922885508b958dd Mon Sep 17 00:00:00 2001 From: Matt Mackall Date: Sun, 8 Jan 2006 01:05:24 -0800 Subject: [PATCH] tiny: Make *[ug]id16 support optional Configurable 16-bit UID and friends support This allows turning off the legacy 16 bit UID interfaces on embedded platforms. text data bss dec hex filename 3330172 529036 190556 4049764 3dcb64 vmlinux-baseline 3328268 529040 190556 4047864 3dc3f8 vmlinux From: Adrian Bunk UID16 was accidentally disabled for !EMBEDDED.
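For context, each cond_syscall() line expands to roughly the following (a simplified sketch of the kernel/sys_ni.c convention of this era): the symbol becomes a weak alias of sys_ni_syscall, so a configuration that compiles the real implementation out still links, and callers get -ENOSYS at run time.

        #include <errno.h>

        long sys_ni_syscall(void)
        {
                return -ENOSYS;
        }

        /* cond_syscall(sys_getuid16) becomes, roughly: */
        long sys_getuid16(void) __attribute__((weak, alias("sys_ni_syscall")));

        /* With CONFIG_UID16=y the real sys_getuid16 is linked in and
         * overrides the weak alias; without it, the stub answers. */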
Signed-off-by: Matt Mackall Signed-off-by: Adrian Bunk Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sys_ni.c | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) (limited to 'kernel') diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index 7a8bc7f60d91..72cafc922d39 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -83,6 +83,25 @@ cond_syscall(sys_inotify_init); cond_syscall(sys_inotify_add_watch); cond_syscall(sys_inotify_rm_watch); cond_syscall(sys_migrate_pages); +cond_syscall(sys_chown16); +cond_syscall(sys_fchown16); +cond_syscall(sys_getegid16); +cond_syscall(sys_geteuid16); +cond_syscall(sys_getgid16); +cond_syscall(sys_getgroups16); +cond_syscall(sys_getresgid16); +cond_syscall(sys_getresuid16); +cond_syscall(sys_getuid16); +cond_syscall(sys_lchown16); +cond_syscall(sys_setfsgid16); +cond_syscall(sys_setfsuid16); +cond_syscall(sys_setgid16); +cond_syscall(sys_setgroups16); +cond_syscall(sys_setregid16); +cond_syscall(sys_setresgid16); +cond_syscall(sys_setresuid16); +cond_syscall(sys_setreuid16); +cond_syscall(sys_setuid16); /* arch-specific weak syscall entries */ cond_syscall(sys_pciconfig_read); -- cgit v1.2.2 From 64ca9004b819ab87648dbfc78f3ef49ee491343e Mon Sep 17 00:00:00 2001 From: Matt Mackall Date: Sun, 8 Jan 2006 01:05:26 -0800 Subject: [PATCH] Make vm86 support optional This adds an option to remove vm86 support under CONFIG_EMBEDDED. Saves about 5k. This version eliminates most of the #ifdefs of the previous version and instead uses function stubs in vm86.h. Also, release_vm86_irqs is moved from asm-i386/irq.h to a more appropriate home in vm86.h so that the stubs can live together. $ size vmlinux-baseline vmlinux-novm86 text data bss dec hex filename 2920821 523232 190652 3634705 377611 vmlinux-baseline 2916268 523100 190492 3629860 376324 vmlinux-novm86 Signed-off-by: Matt Mackall Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sys_ni.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index 72cafc922d39..bd3b9bfcfcec 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -102,6 +102,8 @@ cond_syscall(sys_setresgid16); cond_syscall(sys_setresuid16); cond_syscall(sys_setreuid16); cond_syscall(sys_setuid16); +cond_syscall(sys_vm86old); +cond_syscall(sys_vm86); /* arch-specific weak syscall entries */ cond_syscall(sys_pciconfig_read); -- cgit v1.2.2 From 677517771b7b6efaf8617e70f655b16f3cafcc9b Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Sun, 8 Jan 2006 22:19:16 +0300 Subject: [PATCH] rcu: uninline __rcu_pending() __rcu_pending() is rather fat and called twice from rcu_pending(). rcu_pending() has multiple callers, and is not that small either. This patch uninlines both of them. Signed-off-by: Oleg Nesterov Acked-by: Paul E. McKenney Signed-off-by: Linus Torvalds --- kernel/rcupdate.c | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) (limited to 'kernel') diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c index 30b0bba03859..ccc45d49ce71 100644 --- a/kernel/rcupdate.c +++ b/kernel/rcupdate.c @@ -429,6 +429,36 @@ static void rcu_process_callbacks(unsigned long unused) &__get_cpu_var(rcu_bh_data)); } +static int __rcu_pending(struct rcu_ctrlblk *rcp, struct rcu_data *rdp) +{ + /* This cpu has pending rcu entries and the grace period + * for them has completed.
+ */ + if (rdp->curlist && !rcu_batch_before(rcp->completed, rdp->batch)) + return 1; + + /* This cpu has no pending entries, but there are new entries */ + if (!rdp->curlist && rdp->nxtlist) + return 1; + + /* This cpu has finished callbacks to invoke */ + if (rdp->donelist) + return 1; + + /* The rcu core waits for a quiescent state from the cpu */ + if (rdp->quiescbatch != rcp->cur || rdp->qs_pending) + return 1; + + /* nothing to do */ + return 0; +} + +int rcu_pending(int cpu) +{ + return __rcu_pending(&rcu_ctrlblk, &per_cpu(rcu_data, cpu)) || + __rcu_pending(&rcu_bh_ctrlblk, &per_cpu(rcu_bh_data, cpu)); +} + void rcu_check_callbacks(int cpu, int user) { if (user || -- cgit v1.2.2 From 6053ee3b32e3437e8c1e72687850f436e779bd49 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 9 Jan 2006 15:59:19 -0800 Subject: [PATCH] mutex subsystem, core mutex implementation, core files: just the basic subsystem, no users of it. Signed-off-by: Ingo Molnar Signed-off-by: Arjan van de Ven --- kernel/Makefile | 2 +- kernel/mutex.c | 325 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++ kernel/mutex.h | 35 ++++++ 3 files changed, 361 insertions(+), 1 deletion(-) create mode 100644 kernel/mutex.c create mode 100644 kernel/mutex.h (limited to 'kernel') diff --git a/kernel/Makefile b/kernel/Makefile index 4f5a1453093a..de580b4d54a4 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -7,7 +7,7 @@ obj-y = sched.o fork.o exec_domain.o panic.o printk.o profile.o \ sysctl.o capability.o ptrace.o timer.o user.o \ signal.o sys.o kmod.o workqueue.o pid.o \ rcupdate.o intermodule.o extable.o params.o posix-timers.o \ - kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o + kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o obj-$(CONFIG_FUTEX) += futex.o obj-$(CONFIG_GENERIC_ISA_DMA) += dma.o diff --git a/kernel/mutex.c b/kernel/mutex.c new file mode 100644 index 000000000000..7eb960661441 --- /dev/null +++ b/kernel/mutex.c @@ -0,0 +1,325 @@ +/* + * kernel/mutex.c + * + * Mutexes: blocking mutual exclusion locks + * + * Started by Ingo Molnar: + * + * Copyright (C) 2004, 2005, 2006 Red Hat, Inc., Ingo Molnar + * + * Many thanks to Arjan van de Ven, Thomas Gleixner, Steven Rostedt and + * David Howells for suggestions and improvements. + * + * Also see Documentation/mutex-design.txt. + */ +#include +#include +#include +#include +#include + +/* + * In the DEBUG case we are using the "NULL fastpath" for mutexes, + * which forces all calls into the slowpath: + */ +#ifdef CONFIG_DEBUG_MUTEXES +# include "mutex-debug.h" +# include +#else +# include "mutex.h" +# include +#endif + +/*** + * mutex_init - initialize the mutex + * @lock: the mutex to be initialized + * + * Initialize the mutex to unlocked state. + * + * It is not allowed to initialize an already locked mutex. + */ +void fastcall __mutex_init(struct mutex *lock, const char *name) +{ + atomic_set(&lock->count, 1); + spin_lock_init(&lock->wait_lock); + INIT_LIST_HEAD(&lock->wait_list); + + debug_mutex_init(lock, name); +} + +EXPORT_SYMBOL(__mutex_init); + +/* + * We split the mutex lock/unlock logic into separate fastpath and + * slowpath functions, to reduce the register pressure on the fastpath. + * We also put the fastpath first in the kernel image, to make sure the + * branch is predicted by the CPU as default-untaken. 
+ */ +static void fastcall noinline __sched +__mutex_lock_slowpath(atomic_t *lock_count __IP_DECL__); + +/*** + * mutex_lock - acquire the mutex + * @lock: the mutex to be acquired + * + * Lock the mutex exclusively for this task. If the mutex is not + * available right now, it will sleep until it can get it. + * + * The mutex must later on be released by the same task that + * acquired it. Recursive locking is not allowed. The task + * may not exit without first unlocking the mutex. Also, kernel + * memory where the mutex resides mutex must not be freed with + * the mutex still locked. The mutex must first be initialized + * (or statically defined) before it can be locked. memset()-ing + * the mutex to 0 is not allowed. + * + * ( The CONFIG_DEBUG_MUTEXES .config option turns on debugging + * checks that will enforce the restrictions and will also do + * deadlock debugging. ) + * + * This function is similar to (but not equivalent to) down(). + */ +void fastcall __sched mutex_lock(struct mutex *lock) +{ + /* + * The locking fastpath is the 1->0 transition from + * 'unlocked' into 'locked' state. + * + * NOTE: if asm/mutex.h is included, then some architectures + * rely on mutex_lock() having _no other code_ here but this + * fastpath. That allows the assembly fastpath to do + * tail-merging optimizations. (If you want to put testcode + * here, do it under #ifndef CONFIG_MUTEX_DEBUG.) + */ + __mutex_fastpath_lock(&lock->count, __mutex_lock_slowpath); +} + +EXPORT_SYMBOL(mutex_lock); + +static void fastcall noinline __sched +__mutex_unlock_slowpath(atomic_t *lock_count __IP_DECL__); + +/*** + * mutex_unlock - release the mutex + * @lock: the mutex to be released + * + * Unlock a mutex that has been locked by this task previously. + * + * This function must not be used in interrupt context. Unlocking + * of a not locked mutex is not allowed. + * + * This function is similar to (but not equivalent to) up(). + */ +void fastcall __sched mutex_unlock(struct mutex *lock) +{ + /* + * The unlocking fastpath is the 0->1 transition from 'locked' + * into 'unlocked' state: + * + * NOTE: no other code must be here - see mutex_lock() . + */ + __mutex_fastpath_unlock(&lock->count, __mutex_unlock_slowpath); +} + +EXPORT_SYMBOL(mutex_unlock); + +/* + * Lock a mutex (possibly interruptible), slowpath: + */ +static inline int __sched +__mutex_lock_common(struct mutex *lock, long state __IP_DECL__) +{ + struct task_struct *task = current; + struct mutex_waiter waiter; + unsigned int old_val; + + debug_mutex_init_waiter(&waiter); + + spin_lock_mutex(&lock->wait_lock); + + debug_mutex_add_waiter(lock, &waiter, task->thread_info, ip); + + /* add waiting tasks to the end of the waitqueue (FIFO): */ + list_add_tail(&waiter.list, &lock->wait_list); + waiter.task = task; + + for (;;) { + /* + * Lets try to take the lock again - this is needed even if + * we get here for the first time (shortly after failing to + * acquire the lock), to make sure that we get a wakeup once + * it's unlocked. Later on, if we sleep, this is the + * operation that gives us the lock. We xchg it to -1, so + * that when we release the lock, we properly wake up the + * other waiters: + */ + old_val = atomic_xchg(&lock->count, -1); + if (old_val == 1) + break; + + /* + * got a signal? (This code gets eliminated in the + * TASK_UNINTERRUPTIBLE case.) 
+ */ + if (unlikely(state == TASK_INTERRUPTIBLE && + signal_pending(task))) { + mutex_remove_waiter(lock, &waiter, task->thread_info); + spin_unlock_mutex(&lock->wait_lock); + + debug_mutex_free_waiter(&waiter); + return -EINTR; + } + __set_task_state(task, state); + + /* didnt get the lock, go to sleep: */ + spin_unlock_mutex(&lock->wait_lock); + schedule(); + spin_lock_mutex(&lock->wait_lock); + } + + /* got the lock - rejoice! */ + mutex_remove_waiter(lock, &waiter, task->thread_info); + debug_mutex_set_owner(lock, task->thread_info __IP__); + + /* set it to 0 if there are no waiters left: */ + if (likely(list_empty(&lock->wait_list))) + atomic_set(&lock->count, 0); + + spin_unlock_mutex(&lock->wait_lock); + + debug_mutex_free_waiter(&waiter); + + DEBUG_WARN_ON(list_empty(&lock->held_list)); + DEBUG_WARN_ON(lock->owner != task->thread_info); + + return 0; +} + +static void fastcall noinline __sched +__mutex_lock_slowpath(atomic_t *lock_count __IP_DECL__) +{ + struct mutex *lock = container_of(lock_count, struct mutex, count); + + __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE __IP__); +} + +/* + * Release the lock, slowpath: + */ +static fastcall noinline void +__mutex_unlock_slowpath(atomic_t *lock_count __IP_DECL__) +{ + struct mutex *lock = container_of(lock_count, struct mutex, count); + + DEBUG_WARN_ON(lock->owner != current_thread_info()); + + spin_lock_mutex(&lock->wait_lock); + + /* + * some architectures leave the lock unlocked in the fastpath failure + * case, others need to leave it locked. In the later case we have to + * unlock it here + */ + if (__mutex_slowpath_needs_to_unlock()) + atomic_set(&lock->count, 1); + + debug_mutex_unlock(lock); + + if (!list_empty(&lock->wait_list)) { + /* get the first entry from the wait-list: */ + struct mutex_waiter *waiter = + list_entry(lock->wait_list.next, + struct mutex_waiter, list); + + debug_mutex_wake_waiter(lock, waiter); + + wake_up_process(waiter->task); + } + + debug_mutex_clear_owner(lock); + + spin_unlock_mutex(&lock->wait_lock); +} + +/* + * Here come the less common (and hence less performance-critical) APIs: + * mutex_lock_interruptible() and mutex_trylock(). + */ +static int fastcall noinline __sched +__mutex_lock_interruptible_slowpath(atomic_t *lock_count __IP_DECL__); + +/*** + * mutex_lock_interruptible - acquire the mutex, interruptable + * @lock: the mutex to be acquired + * + * Lock the mutex like mutex_lock(), and return 0 if the mutex has + * been acquired or sleep until the mutex becomes available. If a + * signal arrives while waiting for the lock then this function + * returns -EINTR. + * + * This function is similar to (but not equivalent to) down_interruptible(). 
+ */ +int fastcall __sched mutex_lock_interruptible(struct mutex *lock) +{ + /* NOTE: no other code must be here - see mutex_lock() */ + return __mutex_fastpath_lock_retval + (&lock->count, __mutex_lock_interruptible_slowpath); +} + +EXPORT_SYMBOL(mutex_lock_interruptible); + +static int fastcall noinline __sched +__mutex_lock_interruptible_slowpath(atomic_t *lock_count __IP_DECL__) +{ + struct mutex *lock = container_of(lock_count, struct mutex, count); + + return __mutex_lock_common(lock, TASK_INTERRUPTIBLE __IP__); +} + +/* + * Spinlock based trylock, we take the spinlock and check whether we + * can get the lock: + */ +static inline int __mutex_trylock_slowpath(atomic_t *lock_count) +{ + struct mutex *lock = container_of(lock_count, struct mutex, count); + int prev; + + spin_lock_mutex(&lock->wait_lock); + + prev = atomic_xchg(&lock->count, -1); + if (likely(prev == 1)) + debug_mutex_set_owner(lock, current_thread_info() __RET_IP__); + /* Set it back to 0 if there are no waiters: */ + if (likely(list_empty(&lock->wait_list))) + atomic_set(&lock->count, 0); + + spin_unlock_mutex(&lock->wait_lock); + + return prev == 1; +} + +/*** + * mutex_trylock - try acquire the mutex, without waiting + * @lock: the mutex to be acquired + * + * Try to acquire the mutex atomically. Returns 1 if the mutex + * has been acquired successfully, and 0 on contention. + * + * NOTE: this function follows the spin_trylock() convention, so + * it is negated to the down_trylock() return values! Be careful + * about this when converting semaphore users to mutexes. + * + * This function must not be used in interrupt context. The + * mutex must be released by the same task that acquired it. + */ +int fastcall mutex_trylock(struct mutex *lock) +{ + return __mutex_fastpath_trylock(&lock->count, + __mutex_trylock_slowpath); +} + +EXPORT_SYMBOL(mutex_trylock); + + + diff --git a/kernel/mutex.h b/kernel/mutex.h new file mode 100644 index 000000000000..00fe84e7b672 --- /dev/null +++ b/kernel/mutex.h @@ -0,0 +1,35 @@ +/* + * Mutexes: blocking mutual exclusion locks + * + * started by Ingo Molnar: + * + * Copyright (C) 2004, 2005, 2006 Red Hat, Inc., Ingo Molnar + * + * This file contains mutex debugging related internal prototypes, for the + * !CONFIG_DEBUG_MUTEXES case. Most of them are NOPs: + */ + +#define spin_lock_mutex(lock) spin_lock(lock) +#define spin_unlock_mutex(lock) spin_unlock(lock) +#define mutex_remove_waiter(lock, waiter, ti) \ + __list_del((waiter)->list.prev, (waiter)->list.next) + +#define DEBUG_WARN_ON(c) do { } while (0) +#define debug_mutex_set_owner(lock, new_owner) do { } while (0) +#define debug_mutex_clear_owner(lock) do { } while (0) +#define debug_mutex_init_waiter(waiter) do { } while (0) +#define debug_mutex_wake_waiter(lock, waiter) do { } while (0) +#define debug_mutex_free_waiter(waiter) do { } while (0) +#define debug_mutex_add_waiter(lock, waiter, ti, ip) do { } while (0) +#define debug_mutex_unlock(lock) do { } while (0) +#define debug_mutex_init(lock, name) do { } while (0) + +/* + * Return-address parameters/declarations. They are very useful for + * debugging, but add overhead in the !DEBUG case - so we go the + * trouble of using this not too elegant but zero-cost solution: + */ +#define __IP_DECL__ +#define __IP__ +#define __RET_IP__ + -- cgit v1.2.2 From 408894ee4dd4debfdedd472eb4d8414892fc90f6 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 9 Jan 2006 15:59:20 -0800 Subject: [PATCH] mutex subsystem, debugging code mutex implementation - add debugging code. 
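A hypothetical consumer of the new API, for orientation (it assumes the linux/mutex.h header and DEFINE_MUTEX initializer added elsewhere in this series, not in this patch); this is the pattern the debug code instruments with owner tracking, deadlock detection and held-lock checks at exit/free time:

        #include <linux/mutex.h>
        #include <linux/errno.h>

        static DEFINE_MUTEX(widget_lock);       /* hypothetical lock */
        static int widget_count;                /* protected by widget_lock */

        int widget_add(void)
        {
                /* the debug build records the owner and call site here */
                if (mutex_lock_interruptible(&widget_lock))
                        return -EINTR;
                widget_count++;                 /* critical section */
                mutex_unlock(&widget_lock);     /* debug build checks owner */
                return 0;
        }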
Signed-off-by: Ingo Molnar Signed-off-by: Arjan van de Ven --- kernel/Makefile | 1 + kernel/fork.c | 4 + kernel/mutex-debug.c | 464 +++++++++++++++++++++++++++++++++++++++++++++++++++ kernel/mutex-debug.h | 134 +++++++++++++++ 4 files changed, 603 insertions(+) create mode 100644 kernel/mutex-debug.c create mode 100644 kernel/mutex-debug.h (limited to 'kernel') diff --git a/kernel/Makefile b/kernel/Makefile index de580b4d54a4..a940bac02837 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -9,6 +9,7 @@ obj-y = sched.o fork.o exec_domain.o panic.o printk.o profile.o \ rcupdate.o intermodule.o extable.o params.o posix-timers.o \ kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o +obj-$(CONFIG_DEBUG_MUTEXES) += mutex-debug.o obj-$(CONFIG_FUTEX) += futex.o obj-$(CONFIG_GENERIC_ISA_DMA) += dma.o obj-$(CONFIG_SMP) += cpu.o spinlock.o diff --git a/kernel/fork.c b/kernel/fork.c index 72e3252c6763..b18d64554feb 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -979,6 +979,10 @@ static task_t *copy_process(unsigned long clone_flags, } #endif +#ifdef CONFIG_DEBUG_MUTEXES + p->blocked_on = NULL; /* not blocked yet */ +#endif + p->tgid = p->pid; if (clone_flags & CLONE_THREAD) p->tgid = current->tgid; diff --git a/kernel/mutex-debug.c b/kernel/mutex-debug.c new file mode 100644 index 000000000000..4fcb051a8b9e --- /dev/null +++ b/kernel/mutex-debug.c @@ -0,0 +1,464 @@ +/* + * kernel/mutex-debug.c + * + * Debugging code for mutexes + * + * Started by Ingo Molnar: + * + * Copyright (C) 2004, 2005, 2006 Red Hat, Inc., Ingo Molnar + * + * lock debugging, locking tree, deadlock detection started by: + * + * Copyright (C) 2004, LynuxWorks, Inc., Igor Manyilov, Bill Huey + * Released under the General Public License (GPL). + */ +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "mutex-debug.h" + +/* + * We need a global lock when we walk through the multi-process + * lock tree. Only used in the deadlock-debugging case. + */ +DEFINE_SPINLOCK(debug_mutex_lock); + +/* + * All locks held by all tasks, in a single global list: + */ +LIST_HEAD(debug_mutex_held_locks); + +/* + * In the debug case we carry the caller's instruction pointer into + * other functions, but we dont want the function argument overhead + * in the nondebug case - hence these macros: + */ +#define __IP_DECL__ , unsigned long ip +#define __IP__ , ip +#define __RET_IP__ , (unsigned long)__builtin_return_address(0) + +/* + * "mutex debugging enabled" flag. We turn it off when we detect + * the first problem because we dont want to recurse back + * into the tracing code when doing error printk or + * executing a BUG(): + */ +int debug_mutex_on = 1; + +static void printk_task(struct task_struct *p) +{ + if (p) + printk("%16s:%5d [%p, %3d]", p->comm, p->pid, p, p->prio); + else + printk(""); +} + +static void printk_ti(struct thread_info *ti) +{ + if (ti) + printk_task(ti->task); + else + printk(""); +} + +static void printk_task_short(struct task_struct *p) +{ + if (p) + printk("%s/%d [%p, %3d]", p->comm, p->pid, p, p->prio); + else + printk(""); +} + +static void printk_lock(struct mutex *lock, int print_owner) +{ + printk(" [%p] {%s}\n", lock, lock->name); + + if (print_owner && lock->owner) { + printk(".. held by: "); + printk_ti(lock->owner); + printk("\n"); + } + if (lock->owner) { + printk("... 
acquired at: "); + print_symbol("%s\n", lock->acquire_ip); + } +} + +/* + * printk locks held by a task: + */ +static void show_task_locks(struct task_struct *p) +{ + switch (p->state) { + case TASK_RUNNING: printk("R"); break; + case TASK_INTERRUPTIBLE: printk("S"); break; + case TASK_UNINTERRUPTIBLE: printk("D"); break; + case TASK_STOPPED: printk("T"); break; + case EXIT_ZOMBIE: printk("Z"); break; + case EXIT_DEAD: printk("X"); break; + default: printk("?"); break; + } + printk_task(p); + if (p->blocked_on) { + struct mutex *lock = p->blocked_on->lock; + + printk(" blocked on mutex:"); + printk_lock(lock, 1); + } else + printk(" (not blocked on mutex)\n"); +} + +/* + * printk all locks held in the system (if filter == NULL), + * or all locks belonging to a single task (if filter != NULL): + */ +void show_held_locks(struct task_struct *filter) +{ + struct list_head *curr, *cursor = NULL; + struct mutex *lock; + struct thread_info *t; + unsigned long flags; + int count = 0; + + if (filter) { + printk("------------------------------\n"); + printk("| showing all locks held by: | ("); + printk_task_short(filter); + printk("):\n"); + printk("------------------------------\n"); + } else { + printk("---------------------------\n"); + printk("| showing all locks held: |\n"); + printk("---------------------------\n"); + } + + /* + * Play safe and acquire the global trace lock. We + * cannot printk with that lock held so we iterate + * very carefully: + */ +next: + debug_spin_lock_save(&debug_mutex_lock, flags); + list_for_each(curr, &debug_mutex_held_locks) { + if (cursor && curr != cursor) + continue; + lock = list_entry(curr, struct mutex, held_list); + t = lock->owner; + if (filter && (t != filter->thread_info)) + continue; + count++; + cursor = curr->next; + debug_spin_lock_restore(&debug_mutex_lock, flags); + + printk("\n#%03d: ", count); + printk_lock(lock, filter ? 0 : 1); + goto next; + } + debug_spin_lock_restore(&debug_mutex_lock, flags); + printk("\n"); +} + +void mutex_debug_show_all_locks(void) +{ + struct task_struct *g, *p; + int count = 10; + int unlock = 1; + + printk("\nShowing all blocking locks in the system:\n"); + + /* + * Here we try to get the tasklist_lock as hard as possible, + * if not successful after 2 seconds we ignore it (but keep + * trying). This is to enable a debug printout even if a + * tasklist_lock-holding task deadlocks or crashes. + */ +retry: + if (!read_trylock(&tasklist_lock)) { + if (count == 10) + printk("hm, tasklist_lock locked, retrying... "); + if (count) { + count--; + printk(" #%d", 10-count); + mdelay(200); + goto retry; + } + printk(" ignoring it.\n"); + unlock = 0; + } + if (count != 10) + printk(" locked it.\n"); + + do_each_thread(g, p) { + show_task_locks(p); + if (!unlock) + if (read_trylock(&tasklist_lock)) + unlock = 1; + } while_each_thread(g, p); + + printk("\n"); + show_held_locks(NULL); + printk("=============================================\n\n"); + + if (unlock) + read_unlock(&tasklist_lock); +} + +static void report_deadlock(struct task_struct *task, struct mutex *lock, + struct mutex *lockblk, unsigned long ip) +{ + printk("\n%s/%d is trying to acquire this lock:\n", + current->comm, current->pid); + printk_lock(lock, 1); + printk("... 
trying at: "); + print_symbol("%s\n", ip); + show_held_locks(current); + + if (lockblk) { + printk("but %s/%d is deadlocking current task %s/%d!\n\n", + task->comm, task->pid, current->comm, current->pid); + printk("\n%s/%d is blocked on this lock:\n", + task->comm, task->pid); + printk_lock(lockblk, 1); + + show_held_locks(task); + + printk("\n%s/%d's [blocked] stackdump:\n\n", + task->comm, task->pid); + show_stack(task, NULL); + } + + printk("\n%s/%d's [current] stackdump:\n\n", + current->comm, current->pid); + dump_stack(); + mutex_debug_show_all_locks(); + printk("[ turning off deadlock detection. Please report this. ]\n\n"); + local_irq_disable(); +} + +/* + * Recursively check for mutex deadlocks: + */ +static int check_deadlock(struct mutex *lock, int depth, + struct thread_info *ti, unsigned long ip) +{ + struct mutex *lockblk; + struct task_struct *task; + + if (!debug_mutex_on) + return 0; + + ti = lock->owner; + if (!ti) + return 0; + + task = ti->task; + lockblk = NULL; + if (task->blocked_on) + lockblk = task->blocked_on->lock; + + /* Self-deadlock: */ + if (current == task) { + DEBUG_OFF(); + if (depth) + return 1; + printk("\n==========================================\n"); + printk( "[ BUG: lock recursion deadlock detected! |\n"); + printk( "------------------------------------------\n"); + report_deadlock(task, lock, NULL, ip); + return 0; + } + + /* Ugh, something corrupted the lock data structure? */ + if (depth > 20) { + DEBUG_OFF(); + printk("\n===========================================\n"); + printk( "[ BUG: infinite lock dependency detected!? |\n"); + printk( "-------------------------------------------\n"); + report_deadlock(task, lock, lockblk, ip); + return 0; + } + + /* Recursively check for dependencies: */ + if (lockblk && check_deadlock(lockblk, depth+1, ti, ip)) { + printk("\n============================================\n"); + printk( "[ BUG: circular locking deadlock detected! 
]\n"); + printk( "--------------------------------------------\n"); + report_deadlock(task, lock, lockblk, ip); + return 0; + } + return 0; +} + +/* + * Called when a task exits, this function checks whether the + * task is holding any locks, and reports the first one if so: + */ +void mutex_debug_check_no_locks_held(struct task_struct *task) +{ + struct list_head *curr, *next; + struct thread_info *t; + unsigned long flags; + struct mutex *lock; + + if (!debug_mutex_on) + return; + + debug_spin_lock_save(&debug_mutex_lock, flags); + list_for_each_safe(curr, next, &debug_mutex_held_locks) { + lock = list_entry(curr, struct mutex, held_list); + t = lock->owner; + if (t != task->thread_info) + continue; + list_del_init(curr); + DEBUG_OFF(); + debug_spin_lock_restore(&debug_mutex_lock, flags); + + printk("BUG: %s/%d, lock held at task exit time!\n", + task->comm, task->pid); + printk_lock(lock, 1); + if (lock->owner != task->thread_info) + printk("exiting task is not even the owner??\n"); + return; + } + debug_spin_lock_restore(&debug_mutex_lock, flags); +} + +/* + * Called when kernel memory is freed (or unmapped), or if a mutex + * is destroyed or reinitialized - this code checks whether there is + * any held lock in the memory range of to : + */ +void mutex_debug_check_no_locks_freed(const void *from, const void *to) +{ + struct list_head *curr, *next; + unsigned long flags; + struct mutex *lock; + void *lock_addr; + + if (!debug_mutex_on) + return; + + debug_spin_lock_save(&debug_mutex_lock, flags); + list_for_each_safe(curr, next, &debug_mutex_held_locks) { + lock = list_entry(curr, struct mutex, held_list); + lock_addr = lock; + if (lock_addr < from || lock_addr >= to) + continue; + list_del_init(curr); + DEBUG_OFF(); + debug_spin_lock_restore(&debug_mutex_lock, flags); + + printk("BUG: %s/%d, active lock [%p(%p-%p)] freed!\n", + current->comm, current->pid, lock, from, to); + dump_stack(); + printk_lock(lock, 1); + if (lock->owner != current_thread_info()) + printk("freeing task is not even the owner??\n"); + return; + } + debug_spin_lock_restore(&debug_mutex_lock, flags); +} + +/* + * Must be called with lock->wait_lock held. 
+ */ +void debug_mutex_set_owner(struct mutex *lock, + struct thread_info *new_owner __IP_DECL__) +{ + lock->owner = new_owner; + DEBUG_WARN_ON(!list_empty(&lock->held_list)); + if (debug_mutex_on) { + list_add_tail(&lock->held_list, &debug_mutex_held_locks); + lock->acquire_ip = ip; + } +} + +void debug_mutex_init_waiter(struct mutex_waiter *waiter) +{ + memset(waiter, 0x11, sizeof(*waiter)); + waiter->magic = waiter; + INIT_LIST_HEAD(&waiter->list); +} + +void debug_mutex_wake_waiter(struct mutex *lock, struct mutex_waiter *waiter) +{ + SMP_DEBUG_WARN_ON(!spin_is_locked(&lock->wait_lock)); + DEBUG_WARN_ON(list_empty(&lock->wait_list)); + DEBUG_WARN_ON(waiter->magic != waiter); + DEBUG_WARN_ON(list_empty(&waiter->list)); +} + +void debug_mutex_free_waiter(struct mutex_waiter *waiter) +{ + DEBUG_WARN_ON(!list_empty(&waiter->list)); + memset(waiter, 0x22, sizeof(*waiter)); +} + +void debug_mutex_add_waiter(struct mutex *lock, struct mutex_waiter *waiter, + struct thread_info *ti __IP_DECL__) +{ + SMP_DEBUG_WARN_ON(!spin_is_locked(&lock->wait_lock)); + check_deadlock(lock, 0, ti, ip); + /* Mark the current thread as blocked on the lock: */ + ti->task->blocked_on = waiter; + waiter->lock = lock; +} + +void mutex_remove_waiter(struct mutex *lock, struct mutex_waiter *waiter, + struct thread_info *ti) +{ + DEBUG_WARN_ON(list_empty(&waiter->list)); + DEBUG_WARN_ON(waiter->task != ti->task); + DEBUG_WARN_ON(ti->task->blocked_on != waiter); + ti->task->blocked_on = NULL; + + list_del_init(&waiter->list); + waiter->task = NULL; +} + +void debug_mutex_unlock(struct mutex *lock) +{ + DEBUG_WARN_ON(lock->magic != lock); + DEBUG_WARN_ON(!lock->wait_list.prev && !lock->wait_list.next); + DEBUG_WARN_ON(lock->owner != current_thread_info()); + if (debug_mutex_on) { + DEBUG_WARN_ON(list_empty(&lock->held_list)); + list_del_init(&lock->held_list); + } +} + +void debug_mutex_init(struct mutex *lock, const char *name) +{ + /* + * Make sure we are not reinitializing a held lock: + */ + mutex_debug_check_no_locks_freed((void *)lock, (void *)(lock + 1)); + lock->owner = NULL; + INIT_LIST_HEAD(&lock->held_list); + lock->name = name; + lock->magic = lock; +} + +/*** + * mutex_destroy - mark a mutex unusable + * @lock: the mutex to be destroyed + * + * This function marks the mutex uninitialized, and any subsequent + * use of the mutex is forbidden. The mutex must not be locked when + * this function is called. + */ +void fastcall mutex_destroy(struct mutex *lock) +{ + DEBUG_WARN_ON(mutex_is_locked(lock)); + lock->magic = NULL; +} + +EXPORT_SYMBOL_GPL(mutex_destroy); + diff --git a/kernel/mutex-debug.h b/kernel/mutex-debug.h new file mode 100644 index 000000000000..fd384050acb1 --- /dev/null +++ b/kernel/mutex-debug.h @@ -0,0 +1,134 @@ +/* + * Mutexes: blocking mutual exclusion locks + * + * started by Ingo Molnar: + * + * Copyright (C) 2004, 2005, 2006 Red Hat, Inc., Ingo Molnar + * + * This file contains mutex debugging related internal declarations, + * prototypes and inline functions, for the CONFIG_DEBUG_MUTEXES case. + * More details are in kernel/mutex-debug.c. 
+ */ + +extern spinlock_t debug_mutex_lock; +extern struct list_head debug_mutex_held_locks; +extern int debug_mutex_on; + +/* + * In the debug case we carry the caller's instruction pointer into + * other functions, but we dont want the function argument overhead + * in the nondebug case - hence these macros: + */ +#define __IP_DECL__ , unsigned long ip +#define __IP__ , ip +#define __RET_IP__ , (unsigned long)__builtin_return_address(0) + +/* + * This must be called with lock->wait_lock held. + */ +extern void debug_mutex_set_owner(struct mutex *lock, + struct thread_info *new_owner __IP_DECL__); + +static inline void debug_mutex_clear_owner(struct mutex *lock) +{ + lock->owner = NULL; +} + +extern void debug_mutex_init_waiter(struct mutex_waiter *waiter); +extern void debug_mutex_wake_waiter(struct mutex *lock, + struct mutex_waiter *waiter); +extern void debug_mutex_free_waiter(struct mutex_waiter *waiter); +extern void debug_mutex_add_waiter(struct mutex *lock, + struct mutex_waiter *waiter, + struct thread_info *ti __IP_DECL__); +extern void mutex_remove_waiter(struct mutex *lock, struct mutex_waiter *waiter, + struct thread_info *ti); +extern void debug_mutex_unlock(struct mutex *lock); +extern void debug_mutex_init(struct mutex *lock, const char *name); + +#define debug_spin_lock(lock) \ + do { \ + local_irq_disable(); \ + if (debug_mutex_on) \ + spin_lock(lock); \ + } while (0) + +#define debug_spin_unlock(lock) \ + do { \ + if (debug_mutex_on) \ + spin_unlock(lock); \ + local_irq_enable(); \ + preempt_check_resched(); \ + } while (0) + +#define debug_spin_lock_save(lock, flags) \ + do { \ + local_irq_save(flags); \ + if (debug_mutex_on) \ + spin_lock(lock); \ + } while (0) + +#define debug_spin_lock_restore(lock, flags) \ + do { \ + if (debug_mutex_on) \ + spin_unlock(lock); \ + local_irq_restore(flags); \ + preempt_check_resched(); \ + } while (0) + +#define spin_lock_mutex(lock) \ + do { \ + struct mutex *l = container_of(lock, struct mutex, wait_lock); \ + \ + DEBUG_WARN_ON(in_interrupt()); \ + debug_spin_lock(&debug_mutex_lock); \ + spin_lock(lock); \ + DEBUG_WARN_ON(l->magic != l); \ + } while (0) + +#define spin_unlock_mutex(lock) \ + do { \ + spin_unlock(lock); \ + debug_spin_unlock(&debug_mutex_lock); \ + } while (0) + +#define DEBUG_OFF() \ +do { \ + if (debug_mutex_on) { \ + debug_mutex_on = 0; \ + console_verbose(); \ + if (spin_is_locked(&debug_mutex_lock)) \ + spin_unlock(&debug_mutex_lock); \ + } \ +} while (0) + +#define DEBUG_BUG() \ +do { \ + if (debug_mutex_on) { \ + DEBUG_OFF(); \ + BUG(); \ + } \ +} while (0) + +#define DEBUG_WARN_ON(c) \ +do { \ + if (unlikely(c && debug_mutex_on)) { \ + DEBUG_OFF(); \ + WARN_ON(1); \ + } \ +} while (0) + +# define DEBUG_BUG_ON(c) \ +do { \ + if (unlikely(c)) \ + DEBUG_BUG(); \ +} while (0) + +#ifdef CONFIG_SMP +# define SMP_DEBUG_WARN_ON(c) DEBUG_WARN_ON(c) +# define SMP_DEBUG_BUG_ON(c) DEBUG_BUG_ON(c) +#else +# define SMP_DEBUG_WARN_ON(c) do { } while (0) +# define SMP_DEBUG_BUG_ON(c) do { } while (0) +#endif + -- cgit v1.2.2 From de5097c2e73f826302cd8957c225b3725e0c7553 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 9 Jan 2006 15:59:21 -0800 Subject: [PATCH] mutex subsystem, more debugging code more mutex debugging: check for held locks during memory freeing, task exit, enable sysrq printouts, etc. 
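The deadlock detection underlying these checks is the owner-chain walk done by check_deadlock() above. Here is a minimal standalone userspace sketch of that walk; struct task, struct mutex and would_deadlock() below are simplified stand-ins for the kernel's structures, not kernel code, and the kernel version additionally takes the global debug lock and prints a full report instead of returning a flag:

#include <stdio.h>
#include <stddef.h>

struct task;

struct mutex {
        struct task *owner;             /* NULL when the mutex is free */
        const char *name;
};

struct task {
        const char *comm;
        struct mutex *blocked_on;       /* mutex this task waits for, or NULL */
};

/* Follow owner -> blocked_on chains (bounded, like the depth > 20 check
 * above) and return nonzero if 'self' would end up waiting on itself. */
static int would_deadlock(struct task *self, struct mutex *lock)
{
        int depth;

        for (depth = 0; lock && depth < 20; depth++) {
                struct task *owner = lock->owner;

                if (!owner)
                        return 0;       /* chain ends at an unowned mutex */
                if (owner == self)
                        return 1;       /* circular wait back to the requester */
                lock = owner->blocked_on;
        }
        return 0;
}

int main(void)
{
        struct mutex a = { .owner = NULL, .name = "A" };
        struct mutex b = { .owner = NULL, .name = "B" };
        struct task t1 = { .comm = "t1", .blocked_on = NULL };
        struct task t2 = { .comm = "t2", .blocked_on = NULL };

        a.owner = &t1;                  /* t1 holds A */
        b.owner = &t2;                  /* t2 holds B */
        t1.blocked_on = &b;             /* t1 now waits for B */

        /* t2 asking for A closes the cycle t2 -> A -> t1 -> B -> t2: */
        printf("%s requesting %s: %s\n", t2.comm, a.name,
               would_deadlock(&t2, &a) ? "deadlock" : "ok");
        return 0;
}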
Signed-off-by: Ingo Molnar Signed-off-by: Arjan van de Ven --- kernel/exit.c | 5 +++++ kernel/sched.c | 1 + 2 files changed, 6 insertions(+) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index caceabf3f230..309a46fa16f8 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -29,6 +29,7 @@ #include #include #include +#include #include #include @@ -869,6 +870,10 @@ fastcall NORET_TYPE void do_exit(long code) mpol_free(tsk->mempolicy); tsk->mempolicy = NULL; #endif + /* + * If DEBUG_MUTEXES is on, make sure we are holding no locks: + */ + mutex_debug_check_no_locks_held(tsk); /* PF_DEAD causes final put_task_struct after we schedule. */ preempt_disable(); diff --git a/kernel/sched.c b/kernel/sched.c index 92733091154c..34a945bcc022 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -4386,6 +4386,7 @@ void show_state(void) } while_each_thread(g, p); read_unlock(&tasklist_lock); + mutex_debug_show_all_locks(); } /** -- cgit v1.2.2 From 1b1dcc1b57a49136f118a0f16367256ff9994a69 Mon Sep 17 00:00:00 2001 From: Jes Sorensen Date: Mon, 9 Jan 2006 15:59:24 -0800 Subject: [PATCH] mutex subsystem, semaphore to mutex: VFS, ->i_sem This patch converts the inode semaphore to a mutex. I have tested it on XFS and compiled as much as one can consider on an ia64. Anyway your luck with it might be different. Modified-by: Ingo Molnar (finished the conversion) Signed-off-by: Jes Sorensen Signed-off-by: Ingo Molnar --- kernel/cpuset.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/cpuset.c b/kernel/cpuset.c index eab64e23bcae..2a75e44e1a41 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -1513,7 +1513,7 @@ static int cpuset_add_file(struct dentry *dir, const struct cftype *cft) struct dentry *dentry; int error; - down(&dir->d_inode->i_sem); + mutex_lock(&dir->d_inode->i_mutex); dentry = cpuset_get_dentry(dir, cft->name); if (!IS_ERR(dentry)) { error = cpuset_create_file(dentry, 0644 | S_IFREG); @@ -1522,7 +1522,7 @@ static int cpuset_add_file(struct dentry *dir, const struct cftype *cft) dput(dentry); } else error = PTR_ERR(dentry); - up(&dir->d_inode->i_sem); + mutex_unlock(&dir->d_inode->i_mutex); return error; } @@ -1793,7 +1793,7 @@ static long cpuset_create(struct cpuset *parent, const char *name, int mode) /* * Release manage_sem before cpuset_populate_dir() because it - * will down() this new directory's i_sem and if we race with + * will down() this new directory's i_mutex and if we race with * another mkdir, we might deadlock. */ up(&manage_sem); @@ -1812,7 +1812,7 @@ static int cpuset_mkdir(struct inode *dir, struct dentry *dentry, int mode) { struct cpuset *c_parent = dentry->d_parent->d_fsdata; - /* the vfs holds inode->i_sem already */ + /* the vfs holds inode->i_mutex already */ return cpuset_create(c_parent, dentry->d_name.name, mode | S_IFDIR); } @@ -1823,7 +1823,7 @@ static int cpuset_rmdir(struct inode *unused_dir, struct dentry *dentry) struct cpuset *parent; char *pathbuf = NULL; - /* the vfs holds both inode->i_sem already */ + /* the vfs holds both inode->i_mutex already */ down(&manage_sem); cpuset_update_task_memory_state(); -- cgit v1.2.2 From dbc1651f0c5b7d13acc59d3b805a7224332fb1fb Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Sun, 8 Jan 2006 22:19:33 +0300 Subject: [PATCH] rcu: don't set ->next_pending in rcu_start_batch() I think it is better to set ->next_pending in the caller, when it is needed. 
This saves one parameter, and this coincides with cpu_quiet() behaviour, which sets ->completed = ->cur itself. Signed-off-by: Oleg Nesterov Acked-by: Paul E. McKenney Signed-off-by: Linus Torvalds --- kernel/rcupdate.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c index ccc45d49ce71..05ee48316f70 100644 --- a/kernel/rcupdate.c +++ b/kernel/rcupdate.c @@ -236,12 +236,8 @@ static void rcu_do_batch(struct rcu_data *rdp) * active batch and the batch to be registered has not already occurred. * Caller must hold rcu_state.lock. */ -static void rcu_start_batch(struct rcu_ctrlblk *rcp, struct rcu_state *rsp, - int next_pending) +static void rcu_start_batch(struct rcu_ctrlblk *rcp, struct rcu_state *rsp) { - if (next_pending) - rcp->next_pending = 1; - if (rcp->next_pending && rcp->completed == rcp->cur) { rcp->next_pending = 0; @@ -275,7 +271,7 @@ static void cpu_quiet(int cpu, struct rcu_ctrlblk *rcp, struct rcu_state *rsp) if (cpus_empty(rsp->cpumask)) { /* batch completed ! */ rcp->completed = rcp->cur; - rcu_start_batch(rcp, rsp, 0); + rcu_start_batch(rcp, rsp); } } @@ -410,7 +406,8 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp, if (!rcp->next_pending) { /* and start it/schedule start if it's a new batch */ spin_lock(&rsp->lock); - rcu_start_batch(rcp, rsp, 1); + rcp->next_pending = 1; + rcu_start_batch(rcp, rsp); spin_unlock(&rsp->lock); } } else { -- cgit v1.2.2 From ed653a640419cd9d1ad0f8dc176478e6634a694e Mon Sep 17 00:00:00 2001 From: "akpm@osdl.org" Date: Mon, 9 Jan 2006 20:51:38 -0800 Subject: [PATCH] Remove set_fs() in stop_machine() From: Brian Gerst Call sched_setscheduler() directly instead. Signed-off-by: Brian Gerst Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/stop_machine.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index b3d4dc858e35..dcfb5d731466 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c @@ -87,13 +87,9 @@ static int stop_machine(void) { int i, ret = 0; struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 }; - mm_segment_t old_fs = get_fs(); /* One high-prio thread per cpu. We'll do this one. */ - set_fs(KERNEL_DS); - sys_sched_setscheduler(current->pid, SCHED_FIFO, - (struct sched_param __user *)&param); - set_fs(old_fs); + sched_setscheduler(current, SCHED_FIFO, &param); atomic_set(&stopmachine_thread_ack, 0); stopmachine_num_threads = 0; -- cgit v1.2.2 From cc57165874e938ef684d71ba7d36e7088b551489 Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Mon, 9 Jan 2006 20:51:41 -0800 Subject: [PATCH] kdump: dynamic per cpu allocation of memory for saving cpu registers - In case of a system crash, the current state of the cpu registers is saved in memory in elf note format. So far, memory for storing elf notes was allocated statically for NR_CPUS. - This patch introduces dynamic allocation of memory for storing elf notes. It uses the alloc_percpu() interface. This should lead to better memory usage. - Introduced based on Andi Kleen's and Eric W. Biederman's suggestions. - This patch also moves memory allocation for elf notes from the architecture-dependent portion to the architecture-independent portion. Now crash_notes is architecture independent. The whole idea is that the size of the memory to be allocated per cpu (MAX_NOTE_BYTES) can be architecture-dependent while the allocation of this memory can be architecture-independent.
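As a rough plain-C analogue of that change (going from one static NR_CPUS-sized array to one dynamically allocated note buffer per present CPU), consider the sketch below. The kernel itself uses alloc_percpu(); alloc_crash_notes() and the 1024-byte MAX_NOTE_BYTES value here are purely illustrative, since the real size is architecture-dependent:

#include <stdio.h>
#include <stdlib.h>

#define MAX_NOTE_BYTES 1024     /* illustrative only; arch-dependent in reality */

typedef struct { char data[MAX_NOTE_BYTES]; } note_buf_t;

/* One buffer per CPU actually present, instead of a static
 * note_buf_t[NR_CPUS] array sized for the theoretical maximum. */
static note_buf_t **alloc_crash_notes(int nr_cpus)
{
        note_buf_t **notes = calloc(nr_cpus, sizeof(*notes));
        int cpu;

        if (!notes)
                return NULL;
        for (cpu = 0; cpu < nr_cpus; cpu++) {
                notes[cpu] = calloc(1, sizeof(note_buf_t));
                if (!notes[cpu])
                        return NULL;    /* a real version would unwind here */
        }
        return notes;
}

int main(void)
{
        int nr_cpus = 4, cpu;
        note_buf_t **crash_notes = alloc_crash_notes(nr_cpus);

        if (!crash_notes) {
                fprintf(stderr, "allocation failed\n");
                return 1;
        }
        /* At crash time each CPU would save its registers, in elf note
         * format, into its own buffer: */
        for (cpu = 0; cpu < nr_cpus; cpu++)
                printf("cpu%d notes at %p\n", cpu, (void *)crash_notes[cpu]);
        return 0;
}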
Signed-off-by: Vivek Goyal Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kexec.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) (limited to 'kernel') diff --git a/kernel/kexec.c b/kernel/kexec.c index 2c95848fbce8..1197de8b2a94 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -26,6 +26,9 @@ #include #include +/* Per cpu memory for storing cpu states in case of system crash. */ +note_buf_t* crash_notes; + /* Location of the reserved area for the crash kernel */ struct resource crashk_res = { .name = "Crash kernel", @@ -1060,3 +1063,16 @@ void crash_kexec(struct pt_regs *regs) xchg(&kexec_lock, 0); } } + +static int __init crash_notes_memory_init(void) +{ + /* Allocate memory for saving cpu registers. */ + crash_notes = alloc_percpu(note_buf_t); + if (!crash_notes) { + printk("Kexec: Memory allocation for saving cpu register" + " states failed\n"); + return -ENOMEM; + } + return 0; +} +module_init(crash_notes_memory_init) -- cgit v1.2.2 From 51be5606d9ff9eb27ed6514f6172fbd7578a25d6 Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Mon, 9 Jan 2006 20:51:42 -0800 Subject: [PATCH] kdump: export per cpu crash notes pointer through sysfs - Kexec-on-panic functionality allocates memory for saving cpu registers in case of a system crash. The address of this allocated memory needs to be exported to user space, where it is used by kexec-tools. - Previously, a single /sys/kernel/crash_notes entry was exported, as the allocated memory was a single contiguous array. Now that memory allocation is dynamic and per-cpu based, the address of each per-cpu buffer is exported through "/sys/devices/system/cpu/cpuX/crash_notes" Signed-off-by: Vivek Goyal Cc: Greg KH Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/ksysfs.c | 13 ------------- 1 file changed, 13 deletions(-) (limited to 'kernel') diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c index 99af8b05eeaa..d5eeae0fa5bc 100644 --- a/kernel/ksysfs.c +++ b/kernel/ksysfs.c @@ -51,16 +51,6 @@ static ssize_t uevent_helper_store(struct subsystem *subsys, const char *page, s KERNEL_ATTR_RW(uevent_helper); #endif -#ifdef CONFIG_KEXEC -#include - -static ssize_t crash_notes_show(struct subsystem *subsys, char *page) -{ - return sprintf(page, "%p\n", (void *)crash_notes); -} -KERNEL_ATTR_RO(crash_notes); -#endif - decl_subsys(kernel, NULL, NULL); EXPORT_SYMBOL_GPL(kernel_subsys); @@ -68,9 +58,6 @@ static struct attribute * kernel_attrs[] = { #ifdef CONFIG_HOTPLUG &uevent_seqnum_attr.attr, &uevent_helper_attr.attr, -#endif -#ifdef CONFIG_KEXEC - &crash_notes_attr.attr, #endif NULL }; -- cgit v1.2.2 From e996e58133c475bcf3a229f716b9457267cbaa0a Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Mon, 9 Jan 2006 20:51:44 -0800 Subject: [PATCH] kdump: save registers early (inline functions) - If the system panics, cpu register states are captured through the function crash_get_current_regs(). This is not an inline function, hence a stack frame is pushed onto the stack and then the cpu register state is captured. Later this frame is popped and new frames are pushed (machine_kexec). - In theory this is not quite right, as we are capturing register states for a frame that is no longer valid. This seems to have created backtrace problems for ppc64. - This patch fixes it up. The very first thing it does after entering crash_kexec() is to capture the register states. Anyway, we don't want the backtrace beyond crash_kexec().
crash_get_current_regs() has been made inline. - crash_setup_regs() is the top architecture-dependent function, responsible for capturing the register states as well as for doing some architecture-dependent tricks, e.g. fixing up ss and esp for i386. crash_setup_regs() has also been made inline to ensure no new call frame is pushed onto the stack. Signed-off-by: Vivek Goyal Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kexec.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/kexec.c b/kernel/kexec.c index 1197de8b2a94..de1441656efd 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -1057,7 +1057,9 @@ void crash_kexec(struct pt_regs *regs) if (!locked) { image = xchg(&kexec_crash_image, NULL); if (image) { - machine_crash_shutdown(regs); + struct pt_regs fixed_regs; + crash_setup_regs(&fixed_regs, regs); + machine_crash_shutdown(&fixed_regs); machine_kexec(image); } xchg(&kexec_lock, 0); -- cgit v1.2.2 From 4ae362be509306eafa6441603686d33fefe321c1 Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Mon, 9 Jan 2006 20:51:50 -0800 Subject: [PATCH] kdump: read previous kernel's memory - Move the crash_dump.c file to the architecture-dependent part, as kmap_atomic_pfn is specific to i386 and highmem may not exist on other architectures. - Use ioremap for x86_64 to map the previous kernel memory. - In copy_oldmem_page(), we now directly copy to the user/kernel buffer and avoid the unnecessary copy to a kmalloc'd page. Signed-off-by: Rachita Kothiyal Signed-off-by: Vivek Goyal Cc: Andi Kleen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/Makefile | 1 - kernel/crash_dump.c | 64 ----------------------------------------------------- 2 files changed, 65 deletions(-) delete mode 100644 kernel/crash_dump.c (limited to 'kernel') diff --git a/kernel/Makefile b/kernel/Makefile index a940bac02837..1e039700c0ad 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -30,7 +30,6 @@ obj-$(CONFIG_KPROBES) += kprobes.o obj-$(CONFIG_SYSFS) += ksysfs.o obj-$(CONFIG_DETECT_SOFTLOCKUP) += softlockup.o obj-$(CONFIG_GENERIC_HARDIRQS) += irq/ -obj-$(CONFIG_CRASH_DUMP) += crash_dump.o obj-$(CONFIG_SECCOMP) += seccomp.o obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o diff --git a/kernel/crash_dump.c b/kernel/crash_dump.c deleted file mode 100644 index fccb27dbc623..000000000000 --- a/kernel/crash_dump.c +++ /dev/null @@ -1,64 +0,0 @@ -/* - * kernel/crash_dump.c - Memory preserving reboot related code. - * - * Created by: Hariprasad Nellitheertha (hari@in.ibm.com) - * Copyright (C) IBM Corporation, 2004. All rights reserved - */ - -#include -#include -#include -#include -#include -#include - -#include -#include -#include - -/* Stores the physical address of elf header of crash image. */ -unsigned long long elfcorehdr_addr = ELFCORE_ADDR_MAX; - -#ifndef HAVE_ARCH_COPY_OLDMEM_PAGE -/** - * copy_oldmem_page - copy one page from "oldmem" - * @pfn: page frame number to be copied - * @buf: target memory address for the copy; this can be in kernel address - * space or user address space (see @userbuf) - * @csize: number of bytes to copy - * @offset: offset in bytes into the page (based on pfn) to begin the copy - * @userbuf: if set, @buf is in user address space, use copy_to_user(), - * otherwise @buf is in kernel address space, use memcpy(). - * - * Copy a page from "oldmem". For this page, there is no pte mapped - * in the current kernel. We stitch up a pte, similar to kmap_atomic.
- */ -ssize_t copy_oldmem_page(unsigned long pfn, char *buf, - size_t csize, unsigned long offset, int userbuf) -{ - void *page, *vaddr; - - if (!csize) - return 0; - - page = kmalloc(PAGE_SIZE, GFP_KERNEL); - if (!page) - return -ENOMEM; - - vaddr = kmap_atomic_pfn(pfn, KM_PTE0); - copy_page(page, vaddr); - kunmap_atomic(vaddr, KM_PTE0); - - if (userbuf) { - if (copy_to_user(buf, (page + offset), csize)) { - kfree(page); - return -EFAULT; - } - } else { - memcpy(buf, (page + offset), csize); - } - - kfree(page); - return csize; -} -#endif -- cgit v1.2.2 From 3a0f69d59ba41fbcad6a17b6e8aab02bf45e20ce Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 9 Jan 2006 20:52:08 -0800 Subject: [PATCH] common compat_sys_timer_create The comment in compat.c is wrong, every architecture provides a get_compat_sigevent() for the IPC compat code already. This basically moves the x86_64 version to common code and removes all the others. Signed-off-by: Christoph Hellwig Acked-by: Paul Mackerras Cc: Arnd Bergmann Cc: Benjamin Herrenschmidt Cc: Martin Schwidefsky Cc: "David S. Miller" Acked-by: Andi Kleen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/compat.c | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/compat.c b/kernel/compat.c index 102296e21ea8..256e5d9f0647 100644 --- a/kernel/compat.c +++ b/kernel/compat.c @@ -514,6 +514,24 @@ static int put_compat_itimerspec(struct compat_itimerspec __user *dst, return 0; } +long compat_sys_timer_create(clockid_t which_clock, + struct compat_sigevent __user *timer_event_spec, + timer_t __user *created_timer_id) +{ + struct sigevent __user *event = NULL; + + if (timer_event_spec) { + struct sigevent kevent; + + event = compat_alloc_user_space(sizeof(*event)); + if (get_compat_sigevent(&kevent, timer_event_spec) || + copy_to_user(event, &kevent, sizeof(*event))) + return -EFAULT; + } + + return sys_timer_create(which_clock, event, created_timer_id); +} + long compat_sys_timer_settime(timer_t timer_id, int flags, struct compat_itimerspec __user *new, struct compat_itimerspec __user *old) @@ -649,8 +667,6 @@ int get_compat_sigevent(struct sigevent *event, ? -EFAULT : 0; } -/* timer_create is architecture specific because it needs sigevent conversion */ - long compat_get_bitmap(unsigned long *mask, compat_ulong_t __user *umask, unsigned long bitmap_size) { -- cgit v1.2.2 From 67924be88607303a4459fe1c7dcd75527c338cf4 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 9 Jan 2006 20:52:21 -0800 Subject: [PATCH] hrtimer: remove duplicate div_long_long_rem implementation make posix-timers.c use the generic calc64.h facility Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/posix-timers.c | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c index 5870efb3e200..fdb710777439 100644 --- a/kernel/posix-timers.c +++ b/kernel/posix-timers.c @@ -35,6 +35,7 @@ #include #include #include +#include #include #include @@ -48,15 +49,6 @@ #include #include -#ifndef div_long_long_rem -#include - -#define div_long_long_rem(dividend,divisor,remainder) ({ \ - u64 result = dividend; \ - *remainder = do_div(result,divisor); \ - result; }) - -#endif #define CLOCK_REALTIME_RES TICK_NSEC /* In nano seconds. 
*/ static inline u64 mpy_l_X_l_ll(unsigned long mpy1,unsigned long mpy2) -- cgit v1.2.2 From 753be6222728996974e9e12c185108fcabbb7c6e Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 9 Jan 2006 20:52:22 -0800 Subject: [PATCH] hrtimer: deinline mktime and set_normalized_timespec mktime() and set_normalized_timespec() are large inline functions used in many places: deinline them. From: George Anzinger, off-by-1 bugfix Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/time.c | 61 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 61 insertions(+) (limited to 'kernel') diff --git a/kernel/time.c b/kernel/time.c index b94bfa8c03e0..fa569885e22b 100644 --- a/kernel/time.c +++ b/kernel/time.c @@ -583,6 +583,67 @@ void getnstimestamp(struct timespec *ts) } EXPORT_SYMBOL_GPL(getnstimestamp); +/* Converts Gregorian date to seconds since 1970-01-01 00:00:00. + * Assumes input in normal date format, i.e. 1980-12-31 23:59:59 + * => year=1980, mon=12, day=31, hour=23, min=59, sec=59. + * + * [For the Julian calendar (which was used in Russia before 1917, + * Britain & colonies before 1752, anywhere else before 1582, + * and is still in use by some communities) leave out the + * -year/100+year/400 terms, and add 10.] + * + * This algorithm was first published by Gauss (I think). + * + * WARNING: this function will overflow on 2106-02-07 06:28:16 on + * machines were long is 32-bit! (However, as time_t is signed, we + * will already get problems at other places on 2038-01-19 03:14:08) + */ +unsigned long +mktime (unsigned int year, unsigned int mon, + unsigned int day, unsigned int hour, + unsigned int min, unsigned int sec) +{ + if (0 >= (int) (mon -= 2)) { /* 1..12 -> 11,12,1..10 */ + mon += 12; /* Puts Feb last since it has leap day */ + year -= 1; + } + + return ((((unsigned long) + (year/4 - year/100 + year/400 + 367*mon/12 + day) + + year*365 - 719499 + )*24 + hour /* now have hours */ + )*60 + min /* now have minutes */ + )*60 + sec; /* finally seconds */ +} + +/** + * set_normalized_timespec - set timespec sec and nsec parts and normalize + * + * @ts: pointer to timespec variable to be set + * @sec: seconds to set + * @nsec: nanoseconds to set + * + * Set seconds and nanoseconds field of a timespec variable and + * normalize to the timespec storage format + * + * Note: The tv_nsec part is always in the range of + * 0 <= tv_nsec < NSEC_PER_SEC + * For negative values only the tv_sec field is negative ! 
+ */ +void set_normalized_timespec (struct timespec *ts, time_t sec, long nsec) +{ + while (nsec >= NSEC_PER_SEC) { + nsec -= NSEC_PER_SEC; + ++sec; + } + while (nsec < 0) { + nsec += NSEC_PER_SEC; + --sec; + } + ts->tv_sec = sec; + ts->tv_nsec = nsec; +} + #if (BITS_PER_LONG < 64) u64 get_jiffies_64(void) { -- cgit v1.2.2 From f4818900fa3ee1c56e96f6dede7cc4c05ed383d1 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 9 Jan 2006 20:52:23 -0800 Subject: [PATCH] hrtimer: clean up mktime and make arguments const add 'const' to mktime arguments, and clean it up a bit Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/time.c | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/time.c b/kernel/time.c index fa569885e22b..a0502aef43ce 100644 --- a/kernel/time.c +++ b/kernel/time.c @@ -599,12 +599,15 @@ EXPORT_SYMBOL_GPL(getnstimestamp); * will already get problems at other places on 2038-01-19 03:14:08) */ unsigned long -mktime (unsigned int year, unsigned int mon, - unsigned int day, unsigned int hour, - unsigned int min, unsigned int sec) +mktime(const unsigned int year0, const unsigned int mon0, + const unsigned int day, const unsigned int hour, + const unsigned int min, const unsigned int sec) { - if (0 >= (int) (mon -= 2)) { /* 1..12 -> 11,12,1..10 */ - mon += 12; /* Puts Feb last since it has leap day */ + unsigned int mon = mon0, year = year0; + + /* 1..12 -> 11,12,1..10 */ + if (0 >= (int) (mon -= 2)) { + mon += 12; /* Puts Feb last since it has leap day */ year -= 1; } @@ -630,7 +633,7 @@ mktime (unsigned int year, unsigned int mon, * 0 <= tv_nsec < NSEC_PER_SEC * For negative values only the tv_sec field is negative ! */ -void set_normalized_timespec (struct timespec *ts, time_t sec, long nsec) +void set_normalized_timespec(struct timespec *ts, time_t sec, long nsec) { while (nsec >= NSEC_PER_SEC) { nsec -= NSEC_PER_SEC; -- cgit v1.2.2 From 199e7056895c8427dec63e1d9841ceafcfaaf314 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Mon, 9 Jan 2006 20:52:24 -0800 Subject: [PATCH] hrtimer: export deinlined mktime This is now uninlined, but some modules use it. Make it a non-GPL export, since the inlined mktime() was also available that way. 
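The Gauss-style arithmetic above is easy to sanity-check in isolation. Below is a standalone copy of the formula (greg_to_epoch() is a local name, not a kernel symbol), verified against a few known epoch values:

#include <assert.h>
#include <stdio.h>

/* Same arithmetic as mktime() above, under a local name. */
static unsigned long greg_to_epoch(unsigned int year, unsigned int mon,
                                   unsigned int day, unsigned int hour,
                                   unsigned int min, unsigned int sec)
{
        if (0 >= (int) (mon -= 2)) {    /* 1..12 -> 11,12,1..10 */
                mon += 12;              /* Puts Feb last since it has leap day */
                year -= 1;
        }

        return ((((unsigned long)
                  (year/4 - year/100 + year/400 + 367*mon/12 + day) +
                  year*365 - 719499
                 )*24 + hour            /* now have hours */
                )*60 + min              /* now have minutes */
               )*60 + sec;              /* finally seconds */
}

int main(void)
{
        assert(greg_to_epoch(1970, 1, 1, 0, 0, 0) == 0);         /* the epoch */
        assert(greg_to_epoch(1970, 1, 2, 0, 0, 0) == 86400);     /* one day later */
        assert(greg_to_epoch(2000, 3, 1, 0, 0, 0) == 951868800); /* after a leap day */
        printf("mktime formula checks pass\n");
        return 0;
}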
Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/time.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/time.c b/kernel/time.c index a0502aef43ce..bbc8ca194b59 100644 --- a/kernel/time.c +++ b/kernel/time.c @@ -619,6 +619,8 @@ mktime(const unsigned int year0, const unsigned int mon0, )*60 + sec; /* finally seconds */ } +EXPORT_SYMBOL(mktime); + /** * set_normalized_timespec - set timespec sec and nsec parts and normalize * -- cgit v1.2.2 From a924b04ddea9788e09f387fe19ccbede5f09ddd8 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 9 Jan 2006 20:52:27 -0800 Subject: [PATCH] hrtimer: make clockid_t arguments const add const arguments to the posix-timers.h API functions Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/posix-cpu-timers.c | 40 ++++++++++++++++++++++------------------ kernel/posix-timers.c | 38 +++++++++++++++++++++----------------- 2 files changed, 43 insertions(+), 35 deletions(-) (limited to 'kernel') diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c index 4c68edff900b..abf6990c6eb5 100644 --- a/kernel/posix-cpu-timers.c +++ b/kernel/posix-cpu-timers.c @@ -7,7 +7,7 @@ #include #include -static int check_clock(clockid_t which_clock) +static int check_clock(const clockid_t which_clock) { int error = 0; struct task_struct *p; @@ -31,7 +31,7 @@ static int check_clock(clockid_t which_clock) } static inline union cpu_time_count -timespec_to_sample(clockid_t which_clock, const struct timespec *tp) +timespec_to_sample(const clockid_t which_clock, const struct timespec *tp) { union cpu_time_count ret; ret.sched = 0; /* high half always zero when .cpu used */ @@ -43,7 +43,7 @@ timespec_to_sample(clockid_t which_clock, const struct timespec *tp) return ret; } -static void sample_to_timespec(clockid_t which_clock, +static void sample_to_timespec(const clockid_t which_clock, union cpu_time_count cpu, struct timespec *tp) { @@ -55,7 +55,7 @@ static void sample_to_timespec(clockid_t which_clock, } } -static inline int cpu_time_before(clockid_t which_clock, +static inline int cpu_time_before(const clockid_t which_clock, union cpu_time_count now, union cpu_time_count then) { @@ -65,7 +65,7 @@ static inline int cpu_time_before(clockid_t which_clock, return cputime_lt(now.cpu, then.cpu); } } -static inline void cpu_time_add(clockid_t which_clock, +static inline void cpu_time_add(const clockid_t which_clock, union cpu_time_count *acc, union cpu_time_count val) { @@ -75,7 +75,7 @@ static inline void cpu_time_add(clockid_t which_clock, acc->cpu = cputime_add(acc->cpu, val.cpu); } } -static inline union cpu_time_count cpu_time_sub(clockid_t which_clock, +static inline union cpu_time_count cpu_time_sub(const clockid_t which_clock, union cpu_time_count a, union cpu_time_count b) { @@ -151,7 +151,7 @@ static inline unsigned long long sched_ns(struct task_struct *p) return (p == current) ? 
current_sched_time(p) : p->sched_time; } -int posix_cpu_clock_getres(clockid_t which_clock, struct timespec *tp) +int posix_cpu_clock_getres(const clockid_t which_clock, struct timespec *tp) { int error = check_clock(which_clock); if (!error) { @@ -169,7 +169,7 @@ int posix_cpu_clock_getres(clockid_t which_clock, struct timespec *tp) return error; } -int posix_cpu_clock_set(clockid_t which_clock, const struct timespec *tp) +int posix_cpu_clock_set(const clockid_t which_clock, const struct timespec *tp) { /* * You can never reset a CPU clock, but we check for other errors @@ -186,7 +186,7 @@ int posix_cpu_clock_set(clockid_t which_clock, const struct timespec *tp) /* * Sample a per-thread clock for the given task. */ -static int cpu_clock_sample(clockid_t which_clock, struct task_struct *p, +static int cpu_clock_sample(const clockid_t which_clock, struct task_struct *p, union cpu_time_count *cpu) { switch (CPUCLOCK_WHICH(which_clock)) { @@ -248,7 +248,7 @@ static int cpu_clock_sample_group_locked(unsigned int clock_idx, * Sample a process (thread group) clock for the given group_leader task. * Must be called with tasklist_lock held for reading. */ -static int cpu_clock_sample_group(clockid_t which_clock, +static int cpu_clock_sample_group(const clockid_t which_clock, struct task_struct *p, union cpu_time_count *cpu) { @@ -262,7 +262,7 @@ static int cpu_clock_sample_group(clockid_t which_clock, } -int posix_cpu_clock_get(clockid_t which_clock, struct timespec *tp) +int posix_cpu_clock_get(const clockid_t which_clock, struct timespec *tp) { const pid_t pid = CPUCLOCK_PID(which_clock); int error = -EINVAL; @@ -1399,7 +1399,7 @@ void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx, static long posix_cpu_clock_nanosleep_restart(struct restart_block *); -int posix_cpu_nsleep(clockid_t which_clock, int flags, +int posix_cpu_nsleep(const clockid_t which_clock, int flags, struct timespec *rqtp) { struct restart_block *restart_block = @@ -1503,11 +1503,13 @@ posix_cpu_clock_nanosleep_restart(struct restart_block *restart_block) #define PROCESS_CLOCK MAKE_PROCESS_CPUCLOCK(0, CPUCLOCK_SCHED) #define THREAD_CLOCK MAKE_THREAD_CPUCLOCK(0, CPUCLOCK_SCHED) -static int process_cpu_clock_getres(clockid_t which_clock, struct timespec *tp) +static int process_cpu_clock_getres(const clockid_t which_clock, + struct timespec *tp) { return posix_cpu_clock_getres(PROCESS_CLOCK, tp); } -static int process_cpu_clock_get(clockid_t which_clock, struct timespec *tp) +static int process_cpu_clock_get(const clockid_t which_clock, + struct timespec *tp) { return posix_cpu_clock_get(PROCESS_CLOCK, tp); } @@ -1516,16 +1518,18 @@ static int process_cpu_timer_create(struct k_itimer *timer) timer->it_clock = PROCESS_CLOCK; return posix_cpu_timer_create(timer); } -static int process_cpu_nsleep(clockid_t which_clock, int flags, +static int process_cpu_nsleep(const clockid_t which_clock, int flags, struct timespec *rqtp) { return posix_cpu_nsleep(PROCESS_CLOCK, flags, rqtp); } -static int thread_cpu_clock_getres(clockid_t which_clock, struct timespec *tp) +static int thread_cpu_clock_getres(const clockid_t which_clock, + struct timespec *tp) { return posix_cpu_clock_getres(THREAD_CLOCK, tp); } -static int thread_cpu_clock_get(clockid_t which_clock, struct timespec *tp) +static int thread_cpu_clock_get(const clockid_t which_clock, + struct timespec *tp) { return posix_cpu_clock_get(THREAD_CLOCK, tp); } @@ -1534,7 +1538,7 @@ static int thread_cpu_timer_create(struct k_itimer *timer) timer->it_clock = 
THREAD_CLOCK; return posix_cpu_timer_create(timer); } -static int thread_cpu_nsleep(clockid_t which_clock, int flags, +static int thread_cpu_nsleep(const clockid_t which_clock, int flags, struct timespec *rqtp) { return -EINVAL; diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c index fdb710777439..69d5a4b5395b 100644 --- a/kernel/posix-timers.c +++ b/kernel/posix-timers.c @@ -151,7 +151,7 @@ static void posix_timer_fn(unsigned long); static u64 do_posix_clock_monotonic_gettime_parts( struct timespec *tp, struct timespec *mo); int do_posix_clock_monotonic_gettime(struct timespec *tp); -static int do_posix_clock_monotonic_get(clockid_t, struct timespec *tp); +static int do_posix_clock_monotonic_get(const clockid_t, struct timespec *tp); static struct k_itimer *lock_timer(timer_t timer_id, unsigned long *flags); @@ -176,7 +176,7 @@ static inline void unlock_timer(struct k_itimer *timr, unsigned long flags) * the function pointer CALL in struct k_clock. */ -static inline int common_clock_getres(clockid_t which_clock, +static inline int common_clock_getres(const clockid_t which_clock, struct timespec *tp) { tp->tv_sec = 0; @@ -184,13 +184,15 @@ static inline int common_clock_getres(clockid_t which_clock, return 0; } -static inline int common_clock_get(clockid_t which_clock, struct timespec *tp) +static inline int common_clock_get(const clockid_t which_clock, + struct timespec *tp) { getnstimeofday(tp); return 0; } -static inline int common_clock_set(clockid_t which_clock, struct timespec *tp) +static inline int common_clock_set(const clockid_t which_clock, + struct timespec *tp) { return do_sys_settimeofday(tp, NULL); } @@ -207,7 +209,7 @@ static inline int common_timer_create(struct k_itimer *new_timer) /* * These ones are defined below. */ -static int common_nsleep(clockid_t, int flags, struct timespec *t); +static int common_nsleep(const clockid_t, int flags, struct timespec *t); static void common_timer_get(struct k_itimer *, struct itimerspec *); static int common_timer_set(struct k_itimer *, int, struct itimerspec *, struct itimerspec *); @@ -216,7 +218,7 @@ static int common_timer_del(struct k_itimer *timer); /* * Return nonzero iff we know a priori this clockid_t value is bogus. */ -static inline int invalid_clockid(clockid_t which_clock) +static inline int invalid_clockid(const clockid_t which_clock) { if (which_clock < 0) /* CPU clock, posix_cpu_* will check it */ return 0; @@ -522,7 +524,7 @@ static inline struct task_struct * good_sigevent(sigevent_t * event) return rtn; } -void register_posix_clock(clockid_t clock_id, struct k_clock *new_clock) +void register_posix_clock(const clockid_t clock_id, struct k_clock *new_clock) { if ((unsigned) clock_id >= MAX_CLOCKS) { printk("POSIX clock register failed for clock_id %d\n", @@ -568,7 +570,7 @@ static void release_posix_timer(struct k_itimer *tmr, int it_id_set) /* Create a POSIX.1b interval timer. 
*/ asmlinkage long -sys_timer_create(clockid_t which_clock, +sys_timer_create(const clockid_t which_clock, struct sigevent __user *timer_event_spec, timer_t __user * created_timer_id) { @@ -1195,7 +1197,8 @@ static u64 do_posix_clock_monotonic_gettime_parts( return jiff; } -static int do_posix_clock_monotonic_get(clockid_t clock, struct timespec *tp) +static int do_posix_clock_monotonic_get(const clockid_t clock, + struct timespec *tp) { struct timespec wall_to_mono; @@ -1212,7 +1215,7 @@ int do_posix_clock_monotonic_gettime(struct timespec *tp) return do_posix_clock_monotonic_get(CLOCK_MONOTONIC, tp); } -int do_posix_clock_nosettime(clockid_t clockid, struct timespec *tp) +int do_posix_clock_nosettime(const clockid_t clockid, struct timespec *tp) { return -EINVAL; } @@ -1224,7 +1227,8 @@ int do_posix_clock_notimer_create(struct k_itimer *timer) } EXPORT_SYMBOL_GPL(do_posix_clock_notimer_create); -int do_posix_clock_nonanosleep(clockid_t clock, int flags, struct timespec *t) +int do_posix_clock_nonanosleep(const clockid_t clock, int flags, + struct timespec *t) { #ifndef ENOTSUP return -EOPNOTSUPP; /* aka ENOTSUP in userland for POSIX */ @@ -1234,8 +1238,8 @@ int do_posix_clock_nonanosleep(clockid_t clock, int flags, struct timespec *t) } EXPORT_SYMBOL_GPL(do_posix_clock_nonanosleep); -asmlinkage long -sys_clock_settime(clockid_t which_clock, const struct timespec __user *tp) +asmlinkage long sys_clock_settime(const clockid_t which_clock, + const struct timespec __user *tp) { struct timespec new_tp; @@ -1248,7 +1252,7 @@ sys_clock_settime(clockid_t which_clock, const struct timespec __user *tp) } asmlinkage long -sys_clock_gettime(clockid_t which_clock, struct timespec __user *tp) +sys_clock_gettime(const clockid_t which_clock, struct timespec __user *tp) { struct timespec kernel_tp; int error; @@ -1265,7 +1269,7 @@ sys_clock_gettime(clockid_t which_clock, struct timespec __user *tp) } asmlinkage long -sys_clock_getres(clockid_t which_clock, struct timespec __user *tp) +sys_clock_getres(const clockid_t which_clock, struct timespec __user *tp) { struct timespec rtn_tp; int error; @@ -1387,7 +1391,7 @@ void clock_was_set(void) long clock_nanosleep_restart(struct restart_block *restart_block); asmlinkage long -sys_clock_nanosleep(clockid_t which_clock, int flags, +sys_clock_nanosleep(const clockid_t which_clock, int flags, const struct timespec __user *rqtp, struct timespec __user *rmtp) { @@ -1419,7 +1423,7 @@ sys_clock_nanosleep(clockid_t which_clock, int flags, } -static int common_nsleep(clockid_t which_clock, +static int common_nsleep(const clockid_t which_clock, int flags, struct timespec *tsave) { struct timespec t, dum; -- cgit v1.2.2 From 5f82b2b77e66d452c3037cc47f436d2d76fd5f06 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 9 Jan 2006 20:52:29 -0800 Subject: [PATCH] hrtimer: create and use timespec_valid macro add timespec_valid(ts) [returns false if the timespec is denorm] Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/posix-timers.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c index 69d5a4b5395b..6b851a1bf4b0 100644 --- a/kernel/posix-timers.c +++ b/kernel/posix-timers.c @@ -712,8 +712,7 @@ out: */ static int good_timespec(const struct timespec *ts) { - if ((!ts) || (ts->tv_sec < 0) || - ((unsigned) ts->tv_nsec >= NSEC_PER_SEC)) + if ((!ts) || !timespec_valid(ts)) return 0; return 1; } @@ -1406,7 
+1405,7 @@ sys_clock_nanosleep(const clockid_t which_clock, int flags, if (copy_from_user(&t, rqtp, sizeof (struct timespec))) return -EFAULT; - if ((unsigned) t.tv_nsec >= NSEC_PER_SEC || t.tv_sec < 0) + if (!timespec_valid(&t)) return -EINVAL; /* -- cgit v1.2.2 From 718bcceb5ad72a5def2c2fd1fa3ef964be32a857 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 9 Jan 2006 20:52:29 -0800 Subject: [PATCH] hrtimer: validate timespec of do_sys_settimeofday Check if the timespec which is provided from user space is normalized. Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/time.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/time.c b/kernel/time.c index bbc8ca194b59..c689b53297cf 100644 --- a/kernel/time.c +++ b/kernel/time.c @@ -154,6 +154,9 @@ int do_sys_settimeofday(struct timespec *tv, struct timezone *tz) static int firsttime = 1; int error = 0; + if (!timespec_valid(tv)) + return -EINVAL; + error = security_settime(tv, tz); if (error) return error; -- cgit v1.2.2 From f8f46da3b4cbb03b43a102b1eb92b63419e10f90 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 9 Jan 2006 20:52:30 -0800 Subject: [PATCH] hrtimer: introduce nsec_t type and conversion functions - introduce the nsec_t type - basic nsec conversion routines: timespec_to_ns(), timeval_to_ns(), ns_to_timespec(), ns_to_timeval(). Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/time.c | 36 ++++++++++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) (limited to 'kernel') diff --git a/kernel/time.c b/kernel/time.c index c689b53297cf..cf5a4582a672 100644 --- a/kernel/time.c +++ b/kernel/time.c @@ -652,6 +652,42 @@ void set_normalized_timespec(struct timespec *ts, time_t sec, long nsec) ts->tv_nsec = nsec; } +/** + * ns_to_timespec - Convert nanoseconds to timespec + * @nsec: the nanoseconds value to be converted + * + * Returns the timespec representation of the nsec parameter. + */ +inline struct timespec ns_to_timespec(const nsec_t nsec) +{ + struct timespec ts; + + if (nsec) + ts.tv_sec = div_long_long_rem_signed(nsec, NSEC_PER_SEC, + &ts.tv_nsec); + else + ts.tv_sec = ts.tv_nsec = 0; + + return ts; +} + +/** + * ns_to_timeval - Convert nanoseconds to timeval + * @nsec: the nanoseconds value to be converted + * + * Returns the timeval representation of the nsec parameter. + */ +struct timeval ns_to_timeval(const nsec_t nsec) +{ + struct timespec ts = ns_to_timespec(nsec); + struct timeval tv; + + tv.tv_sec = ts.tv_sec; + tv.tv_usec = (suseconds_t) ts.tv_nsec / 1000; + + return tv; +} + #if (BITS_PER_LONG < 64) u64 get_jiffies_64(void) { -- cgit v1.2.2 From c0a3132963db68f1fbbd0e316b73de100fee3f08 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 9 Jan 2006 20:52:32 -0800 Subject: [PATCH] hrtimer: hrtimer core code hrtimer subsystem core. It is initialized at bootup and expired by the timer interrupt, but is otherwise not utilized by any other subsystem yet. 
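Before the hrtimer core itself, here is a quick standalone check of the ns_to_timespec()/ns_to_timeval() conversions introduced above. This sketch covers non-negative inputs only, since the kernel helpers use div_long_long_rem_signed() to also handle negative values; the struct and function names are local stand-ins:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define NSEC_PER_SEC 1000000000LL

struct tspec { int64_t tv_sec; long tv_nsec; };
struct tval  { int64_t tv_sec; long tv_usec; };

/* nanoseconds -> seconds plus leftover nanoseconds */
static struct tspec ns_to_tspec(int64_t nsec)
{
        struct tspec ts;

        ts.tv_sec = nsec / NSEC_PER_SEC;
        ts.tv_nsec = (long)(nsec % NSEC_PER_SEC);
        return ts;
}

/* nanoseconds -> seconds plus leftover microseconds */
static struct tval ns_to_tval(int64_t nsec)
{
        struct tspec ts = ns_to_tspec(nsec);
        struct tval tv = { ts.tv_sec, ts.tv_nsec / 1000 };

        return tv;
}

int main(void)
{
        struct tspec ts = ns_to_tspec(1500000000LL);    /* 1.5 seconds */
        struct tval tv = ns_to_tval(1500000000LL);

        assert(ts.tv_sec == 1 && ts.tv_nsec == 500000000);
        assert(tv.tv_sec == 1 && tv.tv_usec == 500000);
        printf("1.5s -> {%lld s, %ld ns} and {%lld s, %ld us}\n",
               (long long)ts.tv_sec, ts.tv_nsec,
               (long long)tv.tv_sec, tv.tv_usec);
        return 0;
}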
Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/Makefile | 3 +- kernel/hrtimer.c | 679 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ kernel/timer.c | 1 + 3 files changed, 682 insertions(+), 1 deletion(-) create mode 100644 kernel/hrtimer.c (limited to 'kernel') diff --git a/kernel/Makefile b/kernel/Makefile index 1e039700c0ad..355126606d1b 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -7,7 +7,8 @@ obj-y = sched.o fork.o exec_domain.o panic.o printk.o profile.o \ sysctl.o capability.o ptrace.o timer.o user.o \ signal.o sys.o kmod.o workqueue.o pid.o \ rcupdate.o intermodule.o extable.o params.o posix-timers.o \ - kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o + kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \ + hrtimer.o obj-$(CONFIG_DEBUG_MUTEXES) += mutex-debug.o obj-$(CONFIG_FUTEX) += futex.o diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c new file mode 100644 index 000000000000..690efd9d9adf --- /dev/null +++ b/kernel/hrtimer.c @@ -0,0 +1,679 @@ +/* + * linux/kernel/hrtimer.c + * + * Copyright(C) 2005, Thomas Gleixner + * Copyright(C) 2005, Red Hat, Inc., Ingo Molnar + * + * High-resolution kernel timers + * + * In contrast to the low-resolution timeout API implemented in + * kernel/timer.c, hrtimers provide finer resolution and accuracy + * depending on system configuration and capabilities. + * + * These timers are currently used for: + * - itimers + * - POSIX timers + * - nanosleep + * - precise in-kernel timing + * + * Started by: Thomas Gleixner and Ingo Molnar + * + * Credits: + * based on kernel/timer.c + * + * For licencing details see kernel-base/COPYING + */ + +#include +#include +#include +#include +#include +#include +#include + +#include + +/** + * ktime_get - get the monotonic time in ktime_t format + * + * returns the time in ktime_t format + */ +static ktime_t ktime_get(void) +{ + struct timespec now; + + ktime_get_ts(&now); + + return timespec_to_ktime(now); +} + +/** + * ktime_get_real - get the real (wall-) time in ktime_t format + * + * returns the time in ktime_t format + */ +static ktime_t ktime_get_real(void) +{ + struct timespec now; + + getnstimeofday(&now); + + return timespec_to_ktime(now); +} + +EXPORT_SYMBOL_GPL(ktime_get_real); + +/* + * The timer bases: + */ + +#define MAX_HRTIMER_BASES 2 + +static DEFINE_PER_CPU(struct hrtimer_base, hrtimer_bases[MAX_HRTIMER_BASES]) = +{ + { + .index = CLOCK_REALTIME, + .get_time = &ktime_get_real, + .resolution = KTIME_REALTIME_RES, + }, + { + .index = CLOCK_MONOTONIC, + .get_time = &ktime_get, + .resolution = KTIME_MONOTONIC_RES, + }, +}; + +/** + * ktime_get_ts - get the monotonic clock in timespec format + * + * @ts: pointer to timespec variable + * + * The function calculates the monotonic clock from the realtime + * clock and the wall_to_monotonic offset and stores the result + * in normalized timespec format in the variable pointed to by ts. 
+ */ +void ktime_get_ts(struct timespec *ts) +{ + struct timespec tomono; + unsigned long seq; + + do { + seq = read_seqbegin(&xtime_lock); + getnstimeofday(ts); + tomono = wall_to_monotonic; + + } while (read_seqretry(&xtime_lock, seq)); + + set_normalized_timespec(ts, ts->tv_sec + tomono.tv_sec, + ts->tv_nsec + tomono.tv_nsec); +} + +/* + * Functions and macros which are different for UP/SMP systems are kept in a + * single place + */ +#ifdef CONFIG_SMP + +#define set_curr_timer(b, t) do { (b)->curr_timer = (t); } while (0) + +/* + * We are using hashed locking: holding per_cpu(hrtimer_bases)[n].lock + * means that all timers which are tied to this base via timer->base are + * locked, and the base itself is locked too. + * + * So __run_timers/migrate_timers can safely modify all timers which could + * be found on the lists/queues. + * + * When the timer's base is locked, and the timer removed from list, it is + * possible to set timer->base = NULL and drop the lock: the timer remains + * locked. + */ +static struct hrtimer_base *lock_hrtimer_base(const struct hrtimer *timer, + unsigned long *flags) +{ + struct hrtimer_base *base; + + for (;;) { + base = timer->base; + if (likely(base != NULL)) { + spin_lock_irqsave(&base->lock, *flags); + if (likely(base == timer->base)) + return base; + /* The timer has migrated to another CPU: */ + spin_unlock_irqrestore(&base->lock, *flags); + } + cpu_relax(); + } +} + +/* + * Switch the timer base to the current CPU when possible. + */ +static inline struct hrtimer_base * +switch_hrtimer_base(struct hrtimer *timer, struct hrtimer_base *base) +{ + struct hrtimer_base *new_base; + + new_base = &__get_cpu_var(hrtimer_bases[base->index]); + + if (base != new_base) { + /* + * We are trying to schedule the timer on the local CPU. + * However we can't change timer's base while it is running, + * so we keep it on the same CPU. No hassle vs. reprogramming + * the event source in the high resolution case. The softirq + * code will take care of this when the timer function has + * completed. There is no conflict as we hold the lock until + * the timer is enqueued. 
+ */ + if (unlikely(base->curr_timer == timer)) + return base; + + /* See the comment in lock_timer_base() */ + timer->base = NULL; + spin_unlock(&base->lock); + spin_lock(&new_base->lock); + timer->base = new_base; + } + return new_base; +} + +#else /* CONFIG_SMP */ + +#define set_curr_timer(b, t) do { } while (0) + +static inline struct hrtimer_base * +lock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags) +{ + struct hrtimer_base *base = timer->base; + + spin_lock_irqsave(&base->lock, *flags); + + return base; +} + +#define switch_hrtimer_base(t, b) (b) + +#endif /* !CONFIG_SMP */ + +/* + * Functions for the union type storage format of ktime_t which are + * too large for inlining: + */ +#if BITS_PER_LONG < 64 +# ifndef CONFIG_KTIME_SCALAR +/** + * ktime_add_ns - Add a scalar nanoseconds value to a ktime_t variable + * + * @kt: addend + * @nsec: the scalar nsec value to add + * + * Returns the sum of kt and nsec in ktime_t format + */ +ktime_t ktime_add_ns(const ktime_t kt, u64 nsec) +{ + ktime_t tmp; + + if (likely(nsec < NSEC_PER_SEC)) { + tmp.tv64 = nsec; + } else { + unsigned long rem = do_div(nsec, NSEC_PER_SEC); + + tmp = ktime_set((long)nsec, rem); + } + + return ktime_add(kt, tmp); +} + +#else /* CONFIG_KTIME_SCALAR */ + +# endif /* !CONFIG_KTIME_SCALAR */ + +/* + * Divide a ktime value by a nanosecond value + */ +static unsigned long ktime_divns(const ktime_t kt, nsec_t div) +{ + u64 dclc, inc, dns; + int sft = 0; + + dclc = dns = ktime_to_ns(kt); + inc = div; + /* Make sure the divisor is less than 2^32: */ + while (div >> 32) { + sft++; + div >>= 1; + } + dclc >>= sft; + do_div(dclc, (unsigned long) div); + + return (unsigned long) dclc; +} + +#else /* BITS_PER_LONG < 64 */ +# define ktime_divns(kt, div) (unsigned long)((kt).tv64 / (div)) +#endif /* BITS_PER_LONG >= 64 */ + +/* + * Counterpart to lock_timer_base above: + */ +static inline +void unlock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags) +{ + spin_unlock_irqrestore(&timer->base->lock, *flags); +} + +/** + * hrtimer_forward - forward the timer expiry + * + * @timer: hrtimer to forward + * @interval: the interval to forward + * + * Forward the timer expiry so it will expire in the future. + * The number of overruns is added to the overrun field. + */ +unsigned long +hrtimer_forward(struct hrtimer *timer, const ktime_t interval) +{ + unsigned long orun = 1; + ktime_t delta, now; + + now = timer->base->get_time(); + + delta = ktime_sub(now, timer->expires); + + if (delta.tv64 < 0) + return 0; + + if (unlikely(delta.tv64 >= interval.tv64)) { + nsec_t incr = ktime_to_ns(interval); + + orun = ktime_divns(delta, incr); + timer->expires = ktime_add_ns(timer->expires, incr * orun); + if (timer->expires.tv64 > now.tv64) + return orun; + /* + * This (and the ktime_add() below) is the + * correction for exact: + */ + orun++; + } + timer->expires = ktime_add(timer->expires, interval); + + return orun; +} + +/* + * enqueue_hrtimer - internal function to (re)start a timer + * + * The timer is inserted in expiry order. Insertion into the + * red black tree is O(log(n)). Must hold the base lock. + */ +static void enqueue_hrtimer(struct hrtimer *timer, struct hrtimer_base *base) +{ + struct rb_node **link = &base->active.rb_node; + struct list_head *prev = &base->pending; + struct rb_node *parent = NULL; + struct hrtimer *entry; + + /* + * Find the right place in the rbtree: + */ + while (*link) { + parent = *link; + entry = rb_entry(parent, struct hrtimer, node); + /* + * We dont care about collisions. 
Nodes with + * the same expiry time stay together. + */ + if (timer->expires.tv64 < entry->expires.tv64) + link = &(*link)->rb_left; + else { + link = &(*link)->rb_right; + prev = &entry->list; + } + } + + /* + * Insert the timer to the rbtree and to the sorted list: + */ + rb_link_node(&timer->node, parent, link); + rb_insert_color(&timer->node, &base->active); + list_add(&timer->list, prev); + + timer->state = HRTIMER_PENDING; +} + + +/* + * __remove_hrtimer - internal function to remove a timer + * + * Caller must hold the base lock. + */ +static void __remove_hrtimer(struct hrtimer *timer, struct hrtimer_base *base) +{ + /* + * Remove the timer from the sorted list and from the rbtree: + */ + list_del(&timer->list); + rb_erase(&timer->node, &base->active); +} + +/* + * remove hrtimer, called with base lock held + */ +static inline int +remove_hrtimer(struct hrtimer *timer, struct hrtimer_base *base) +{ + if (hrtimer_active(timer)) { + __remove_hrtimer(timer, base); + timer->state = HRTIMER_INACTIVE; + return 1; + } + return 0; +} + +/** + * hrtimer_start - (re)start an hrtimer on the current CPU + * + * @timer: the timer to be added + * @tim: expiry time + * @mode: expiry mode: absolute (HRTIMER_ABS) or relative (HRTIMER_REL) + * + * Returns: + * 0 on success + * 1 when the timer was active + */ +int +hrtimer_start(struct hrtimer *timer, ktime_t tim, const enum hrtimer_mode mode) +{ + struct hrtimer_base *base, *new_base; + unsigned long flags; + int ret; + + base = lock_hrtimer_base(timer, &flags); + + /* Remove an active timer from the queue: */ + ret = remove_hrtimer(timer, base); + + /* Switch the timer base, if necessary: */ + new_base = switch_hrtimer_base(timer, base); + + if (mode == HRTIMER_REL) + tim = ktime_add(tim, new_base->get_time()); + timer->expires = tim; + + enqueue_hrtimer(timer, new_base); + + unlock_hrtimer_base(timer, &flags); + + return ret; +} + +/** + * hrtimer_try_to_cancel - try to deactivate a timer + * + * @timer: hrtimer to stop + * + * Returns: + * 0 when the timer was not active + * 1 when the timer was active + * -1 when the timer is currently executing the callback function and + * cannot be stopped + */ +int hrtimer_try_to_cancel(struct hrtimer *timer) +{ + struct hrtimer_base *base; + unsigned long flags; + int ret = -1; + + base = lock_hrtimer_base(timer, &flags); + + if (base->curr_timer != timer) + ret = remove_hrtimer(timer, base); + + unlock_hrtimer_base(timer, &flags); + + return ret; + +}
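An illustrative usage sketch (hypothetical, not part of this patch series): how kernel code might drive the API above. The timer object, callback name and 500 usec period are invented; the callback signature, hrtimer_init()/hrtimer_start() and the HRTIMER_* constants are the ones defined in this file.

static struct hrtimer sample_timer;

static int sample_timer_fn(void *data)
{
	/* One-shot: do the deferred work, then decline re-arming. */
	return HRTIMER_NORESTART;
}

static void sample_timer_setup(void)
{
	hrtimer_init(&sample_timer, CLOCK_MONOTONIC);
	sample_timer.function = sample_timer_fn;
	sample_timer.data = NULL;

	/* HRTIMER_REL: expire 500 usec from now on the monotonic base. */
	hrtimer_start(&sample_timer, ktime_set(0, 500000), HRTIMER_REL);
}

A teardown path would use hrtimer_cancel() below, which additionally waits for a callback that is already running.

+ +/** + * hrtimer_cancel - cancel a timer and wait for the handler to finish.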
+ * + * @timer: the timer to be cancelled + * + * Returns: + * 0 when the timer was not active + * 1 when the timer was active + */ +int hrtimer_cancel(struct hrtimer *timer) +{ + for (;;) { + int ret = hrtimer_try_to_cancel(timer); + + if (ret >= 0) + return ret; + } +} + +/** + * hrtimer_get_remaining - get remaining time for the timer + * + * @timer: the timer to read + */ +ktime_t hrtimer_get_remaining(const struct hrtimer *timer) +{ + struct hrtimer_base *base; + unsigned long flags; + ktime_t rem; + + base = lock_hrtimer_base(timer, &flags); + rem = ktime_sub(timer->expires, timer->base->get_time()); + unlock_hrtimer_base(timer, &flags); + + return rem; +} + +/** + * hrtimer_rebase - rebase an initialized hrtimer to a different base + * + * @timer: the timer to be rebased + * @clock_id: the clock to be used + */ +void hrtimer_rebase(struct hrtimer *timer, const clockid_t clock_id) +{ + struct hrtimer_base *bases; + + bases = per_cpu(hrtimer_bases, raw_smp_processor_id()); + timer->base = &bases[clock_id]; +} + +/** + * hrtimer_init - initialize a timer to the given clock + * + * @timer: the timer to be initialized + * @clock_id: the clock to be used + */ +void hrtimer_init(struct hrtimer *timer, const clockid_t clock_id) +{ + memset(timer, 0, sizeof(struct hrtimer)); + hrtimer_rebase(timer, clock_id); +} + +/** + * hrtimer_get_res - get the timer resolution for a clock + * + * @which_clock: which clock to query + * @tp: pointer to timespec variable to store the resolution + * + * Store the resolution of the clock selected by which_clock in the + * variable pointed to by tp. + */ +int hrtimer_get_res(const clockid_t which_clock, struct timespec *tp) +{ + struct hrtimer_base *bases; + + tp->tv_sec = 0; + bases = per_cpu(hrtimer_bases, raw_smp_processor_id()); + tp->tv_nsec = bases[which_clock].resolution; + + return 0; +} + +/* + * Expire the per base hrtimer-queue: + */ +static inline void run_hrtimer_queue(struct hrtimer_base *base) +{ + ktime_t now = base->get_time(); + + spin_lock_irq(&base->lock); + + while (!list_empty(&base->pending)) { + struct hrtimer *timer; + int (*fn)(void *); + int restart; + void *data; + + timer = list_entry(base->pending.next, struct hrtimer, list); + if (now.tv64 <= timer->expires.tv64) + break; + + fn = timer->function; + data = timer->data; + set_curr_timer(base, timer); + __remove_hrtimer(timer, base); + spin_unlock_irq(&base->lock); + + /* + * fn == NULL is special case for the simplest timer + * variant - wake up process and do not restart: + */ + if (!fn) { + wake_up_process(data); + restart = HRTIMER_NORESTART; + } else + restart = fn(data); + + spin_lock_irq(&base->lock); + + if (restart == HRTIMER_RESTART) + enqueue_hrtimer(timer, base); + else + timer->state = HRTIMER_EXPIRED; + } + set_curr_timer(base, NULL); + spin_unlock_irq(&base->lock); +} + +/* + * Called from timer softirq every jiffy, expire hrtimers: + */ +void hrtimer_run_queues(void) +{ + struct hrtimer_base *base = __get_cpu_var(hrtimer_bases); + int i; + + for (i = 0; i < MAX_HRTIMER_BASES; i++) + run_hrtimer_queue(&base[i]); +} + +/* + * Functions related to boot-time initialization: + */ +static void __devinit init_hrtimers_cpu(int cpu) +{ + struct hrtimer_base *base = per_cpu(hrtimer_bases, cpu); + int i; + + for (i = 0; i < MAX_HRTIMER_BASES; i++) { + spin_lock_init(&base->lock); + INIT_LIST_HEAD(&base->pending); + base++; + } +} + +#ifdef CONFIG_HOTPLUG_CPU + +static void migrate_hrtimer_list(struct hrtimer_base *old_base, + struct hrtimer_base *new_base) +{ + struct hrtimer 
*timer; + struct rb_node *node; + + while ((node = rb_first(&old_base->active))) { + timer = rb_entry(node, struct hrtimer, node); + __remove_hrtimer(timer, old_base); + timer->base = new_base; + enqueue_hrtimer(timer, new_base); + } +} + +static void migrate_hrtimers(int cpu) +{ + struct hrtimer_base *old_base, *new_base; + int i; + + BUG_ON(cpu_online(cpu)); + old_base = per_cpu(hrtimer_bases, cpu); + new_base = get_cpu_var(hrtimer_bases); + + local_irq_disable(); + + for (i = 0; i < MAX_HRTIMER_BASES; i++) { + + spin_lock(&new_base->lock); + spin_lock(&old_base->lock); + + BUG_ON(old_base->curr_timer); + + migrate_hrtimer_list(old_base, new_base); + + spin_unlock(&old_base->lock); + spin_unlock(&new_base->lock); + old_base++; + new_base++; + } + + local_irq_enable(); + put_cpu_var(hrtimer_bases); +} +#endif /* CONFIG_HOTPLUG_CPU */ + +static int __devinit hrtimer_cpu_notify(struct notifier_block *self, + unsigned long action, void *hcpu) +{ + long cpu = (long)hcpu; + + switch (action) { + + case CPU_UP_PREPARE: + init_hrtimers_cpu(cpu); + break; + +#ifdef CONFIG_HOTPLUG_CPU + case CPU_DEAD: + migrate_hrtimers(cpu); + break; +#endif + + default: + break; + } + + return NOTIFY_OK; +} + +static struct notifier_block __devinitdata hrtimers_nb = { + .notifier_call = hrtimer_cpu_notify, +}; + +void __init hrtimers_init(void) +{ + hrtimer_cpu_notify(&hrtimers_nb, (unsigned long)CPU_UP_PREPARE, + (void *)(long)smp_processor_id()); + register_cpu_notifier(&hrtimers_nb); +} + diff --git a/kernel/timer.c b/kernel/timer.c index 074b4bd5cfd8..80bf2acf6b08 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -858,6 +858,7 @@ static void run_timer_softirq(struct softirq_action *h) { tvec_base_t *base = &__get_cpu_var(tvec_bases); + hrtimer_run_queues(); if (time_after_eq(jiffies, base->timer_jiffies)) __run_timers(base); } -- cgit v1.2.2 From 2ff678b8da6478d861c1b0ecb3ac14575760e906 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 9 Jan 2006 20:52:34 -0800 Subject: [PATCH] hrtimer: switch itimers to hrtimer switch itimers to a hrtimers-based implementation Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/exit.c | 2 +- kernel/fork.c | 6 ++-- kernel/itimer.c | 106 +++++++++++++++++++++++++++----------------------------- 3 files changed, 55 insertions(+), 59 deletions(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index 309a46fa16f8..e75a51f33768 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -842,7 +842,7 @@ fastcall NORET_TYPE void do_exit(long code) } group_dead = atomic_dec_and_test(&tsk->signal->live); if (group_dead) { - del_timer_sync(&tsk->signal->real_timer); + hrtimer_cancel(&tsk->signal->real_timer); exit_itimers(tsk->signal); acct_process(code); } diff --git a/kernel/fork.c b/kernel/fork.c index b18d64554feb..3bdcab49998d 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -801,10 +801,10 @@ static inline int copy_signal(unsigned long clone_flags, struct task_struct * ts init_sigpending(&sig->shared_pending); INIT_LIST_HEAD(&sig->posix_timers); - sig->it_real_value = sig->it_real_incr = 0; + hrtimer_init(&sig->real_timer, CLOCK_MONOTONIC); + sig->it_real_incr.tv64 = 0; sig->real_timer.function = it_real_fn; - sig->real_timer.data = (unsigned long) tsk; - init_timer(&sig->real_timer); + sig->real_timer.data = tsk; sig->it_virt_expires = cputime_zero; sig->it_virt_incr = cputime_zero; diff --git a/kernel/itimer.c b/kernel/itimer.c index 7c1b25e25e47..c2c05c4ff28d 100644 --- 
a/kernel/itimer.c +++ b/kernel/itimer.c @@ -12,36 +12,46 @@ #include #include #include +#include <linux/hrtimer.h> #include -static unsigned long it_real_value(struct signal_struct *sig) +/** + * itimer_get_remtime - get remaining time for the timer + * + * @timer: the timer to read + * + * Returns the delta between the expiry time and now, which can be + * less than zero or 1usec for a pending expired timer + */ +static struct timeval itimer_get_remtime(struct hrtimer *timer) { - unsigned long val = 0; - if (timer_pending(&sig->real_timer)) { - val = sig->real_timer.expires - jiffies; + ktime_t rem = hrtimer_get_remaining(timer); - /* look out for negative/zero itimer.. */ - if ((long) val <= 0) - val = 1; - } - return val; + /* + * Racy but safe: if the itimer expires after the above + * hrtimer_get_remaining() call but before this condition + * then we return 0 - which is correct. + */ + if (hrtimer_active(timer)) { + if (rem.tv64 <= 0) + rem.tv64 = NSEC_PER_USEC; + } else + rem.tv64 = 0; + + return ktime_to_timeval(rem); } int do_getitimer(int which, struct itimerval *value) { struct task_struct *tsk = current; - unsigned long interval, val; cputime_t cinterval, cval; switch (which) { case ITIMER_REAL: - spin_lock_irq(&tsk->sighand->siglock); - interval = tsk->signal->it_real_incr; - val = it_real_value(tsk->signal); - spin_unlock_irq(&tsk->sighand->siglock); - jiffies_to_timeval(val, &value->it_value); - jiffies_to_timeval(interval, &value->it_interval); + value->it_value = itimer_get_remtime(&tsk->signal->real_timer); + value->it_interval = + ktime_to_timeval(tsk->signal->it_real_incr); break; case ITIMER_VIRTUAL: read_lock(&tasklist_lock);
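For context, a hypothetical userspace snippet (not from the patch) showing the contract this code implements: setitimer() arms ITIMER_REAL, and getitimer() reads back the remaining time that itimer_get_remtime() above computes.

#include <signal.h>
#include <sys/time.h>

int main(void)
{
	struct itimerval itv = {
		.it_value    = { .tv_sec = 1, .tv_usec = 0 },      /* first expiry in 1s */
		.it_interval = { .tv_sec = 0, .tv_usec = 500000 }, /* then every 500ms */
	};
	struct itimerval left;

	signal(SIGALRM, SIG_IGN);           /* each expiry raises SIGALRM */
	setitimer(ITIMER_REAL, &itv, NULL);
	getitimer(ITIMER_REAL, &left);      /* left.it_value: time remaining,
					       reported as >= 1 usec while armed */
	return 0;
}

@@ -113,59 +123,45 @@ asmlinkage long sys_getitimer(int which, struct itimerval __user *value) } -void it_real_fn(unsigned long __data) +/* + * The timer is automagically restarted, when interval != 0 + */ +int it_real_fn(void *data) { - struct task_struct * p = (struct task_struct *) __data; - unsigned long inc = p->signal->it_real_incr; + struct task_struct *tsk = (struct task_struct *) data; - send_group_sig_info(SIGALRM, SEND_SIG_PRIV, p); + send_group_sig_info(SIGALRM, SEND_SIG_PRIV, tsk); - /* - * Now restart the timer if necessary. We don't need any locking - * here because do_setitimer makes sure we have finished running - * before it touches anything. - * Note, we KNOW we are (or should be) at a jiffie edge here so - * we don't need the +1 stuff. Also, we want to use the prior - * expire value so as to not "slip" a jiffie if we are late. - * Deal with requesting a time prior to "now" here rather than - * in add_timer.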
- */ - if (!inc) - return; - while (time_before_eq(p->signal->real_timer.expires, jiffies)) - p->signal->real_timer.expires += inc; - add_timer(&p->signal->real_timer); + if (tsk->signal->it_real_incr.tv64 != 0) { + hrtimer_forward(&tsk->signal->real_timer, + tsk->signal->it_real_incr); + + return HRTIMER_RESTART; + } + return HRTIMER_NORESTART; } int do_setitimer(int which, struct itimerval *value, struct itimerval *ovalue) { struct task_struct *tsk = current; - unsigned long val, interval, expires; + struct hrtimer *timer; + ktime_t expires; cputime_t cval, cinterval, nval, ninterval; switch (which) { case ITIMER_REAL: -again: - spin_lock_irq(&tsk->sighand->siglock); - interval = tsk->signal->it_real_incr; - val = it_real_value(tsk->signal); - /* We are sharing ->siglock with it_real_fn() */ - if (try_to_del_timer_sync(&tsk->signal->real_timer) < 0) { - spin_unlock_irq(&tsk->sighand->siglock); - goto again; - } - tsk->signal->it_real_incr = - timeval_to_jiffies(&value->it_interval); - expires = timeval_to_jiffies(&value->it_value); - if (expires) - mod_timer(&tsk->signal->real_timer, - jiffies + 1 + expires); - spin_unlock_irq(&tsk->sighand->siglock); + timer = &tsk->signal->real_timer; + hrtimer_cancel(timer); if (ovalue) { - jiffies_to_timeval(val, &ovalue->it_value); - jiffies_to_timeval(interval, - &ovalue->it_interval); + ovalue->it_value = itimer_get_remtime(timer); + ovalue->it_interval + = ktime_to_timeval(tsk->signal->it_real_incr); } + tsk->signal->it_real_incr = + timeval_to_ktime(value->it_interval); + expires = timeval_to_ktime(value->it_value); + if (expires.tv64 != 0) + hrtimer_start(timer, expires, HRTIMER_REL); break; case ITIMER_VIRTUAL: nval = timeval_to_cputime(&value->it_value); -- cgit v1.2.2 From 10c94ec16dd187f8d8dfdbb088e98330c05bf03c Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 9 Jan 2006 20:52:35 -0800 Subject: [PATCH] hrtimer: create hrtimer nanosleep API introduce the hrtimer_nanosleep() and hrtimer_nanosleep_real() APIs. Not yet used by any code. Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/hrtimer.c | 127 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 127 insertions(+) (limited to 'kernel') diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index 690efd9d9adf..64d37a3c5948 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -580,6 +580,133 @@ void hrtimer_run_queues(void) run_hrtimer_queue(&base[i]); } +/* + * Sleep related functions: + */ + +/** + * schedule_hrtimer - sleep until timeout + * + * @timer: hrtimer variable initialized with the correct clock base + * @mode: timeout value is abs/rel + * + * Make the current task sleep until @timeout is + * elapsed. + * + * You can set the task state as follows - + * + * %TASK_UNINTERRUPTIBLE - at least @timeout is guaranteed to + * pass before the routine returns. The routine will return 0 + * + * %TASK_INTERRUPTIBLE - the routine may return early if a signal is + * delivered to the current task. In this case the remaining time + * will be returned + * + * The current task state is guaranteed to be TASK_RUNNING when this + * routine returns. 
+ */ +static ktime_t __sched +schedule_hrtimer(struct hrtimer *timer, const enum hrtimer_mode mode) +{ + /* fn stays NULL, meaning single-shot wakeup: */ + timer->data = current; + + hrtimer_start(timer, timer->expires, mode); + + schedule(); + hrtimer_cancel(timer); + + /* Return the remaining time: */ + if (timer->state != HRTIMER_EXPIRED) + return ktime_sub(timer->expires, timer->base->get_time()); + else + return (ktime_t) {.tv64 = 0 }; +} + +static inline ktime_t __sched +schedule_hrtimer_interruptible(struct hrtimer *timer, + const enum hrtimer_mode mode) +{ + set_current_state(TASK_INTERRUPTIBLE); + + return schedule_hrtimer(timer, mode); +} + +static long __sched +nanosleep_restart(struct restart_block *restart, clockid_t clockid) +{ + struct timespec __user *rmtp, tu; + void *rfn_save = restart->fn; + struct hrtimer timer; + ktime_t rem; + + restart->fn = do_no_restart_syscall; + + hrtimer_init(&timer, clockid); + + timer.expires.tv64 = ((u64)restart->arg1 << 32) | (u64) restart->arg0; + + rem = schedule_hrtimer_interruptible(&timer, HRTIMER_ABS); + + if (rem.tv64 <= 0) + return 0; + + rmtp = (struct timespec __user *) restart->arg2; + tu = ktime_to_timespec(rem); + if (rmtp && copy_to_user(rmtp, &tu, sizeof(tu))) + return -EFAULT; + + restart->fn = rfn_save; + + /* The other values in restart are already filled in */ + return -ERESTART_RESTARTBLOCK; +} + +static long __sched nanosleep_restart_mono(struct restart_block *restart) +{ + return nanosleep_restart(restart, CLOCK_MONOTONIC); +} + +static long __sched nanosleep_restart_real(struct restart_block *restart) +{ + return nanosleep_restart(restart, CLOCK_REALTIME); +} + +long hrtimer_nanosleep(struct timespec *rqtp, struct timespec __user *rmtp, + const enum hrtimer_mode mode, const clockid_t clockid) +{ + struct restart_block *restart; + struct hrtimer timer; + struct timespec tu; + ktime_t rem; + + hrtimer_init(&timer, clockid); + + timer.expires = timespec_to_ktime(*rqtp); + + rem = schedule_hrtimer_interruptible(&timer, mode); + if (rem.tv64 <= 0) + return 0; + + /* Absolute timers do not update the rmtp value: */ + if (mode == HRTIMER_ABS) + return -ERESTARTNOHAND; + + tu = ktime_to_timespec(rem); + + if (rmtp && copy_to_user(rmtp, &tu, sizeof(tu))) + return -EFAULT; + + restart = &current_thread_info()->restart_block; + restart->fn = (clockid == CLOCK_MONOTONIC) ? 
+ nanosleep_restart_mono : nanosleep_restart_real; + restart->arg0 = timer.expires.tv64 & 0xFFFFFFFF; + restart->arg1 = timer.expires.tv64 >> 32; + restart->arg2 = (unsigned long) rmtp; + + return -ERESTART_RESTARTBLOCK; +} + /* * Functions related to boot-time initialization: */ -- cgit v1.2.2 From 6ba1b91213e81aa92b5cf7539f7d2a94ff54947c Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 9 Jan 2006 20:52:36 -0800 Subject: [PATCH] hrtimer: switch sys_nanosleep to hrtimer convert sys_nanosleep() to use hrtimer_nanosleep() Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/hrtimer.c | 14 ++++++++++++++ kernel/timer.c | 56 -------------------------------------------------------- 2 files changed, 14 insertions(+), 56 deletions(-) (limited to 'kernel') diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index 64d37a3c5948..11fde0caee56 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -707,6 +707,20 @@ long hrtimer_nanosleep(struct timespec *rqtp, struct timespec __user *rmtp, return -ERESTART_RESTARTBLOCK; } +asmlinkage long +sys_nanosleep(struct timespec __user *rqtp, struct timespec __user *rmtp) +{ + struct timespec tu; + + if (copy_from_user(&tu, rqtp, sizeof(tu))) + return -EFAULT; + + if (!timespec_valid(&tu)) + return -EINVAL; + + return hrtimer_nanosleep(&tu, rmtp, HRTIMER_REL, CLOCK_MONOTONIC); +} + /* * Functions related to boot-time initialization: */ diff --git a/kernel/timer.c b/kernel/timer.c index 80bf2acf6b08..4f1cb0ab5251 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -1120,62 +1120,6 @@ asmlinkage long sys_gettid(void) return current->pid; } -static long __sched nanosleep_restart(struct restart_block *restart) -{ - unsigned long expire = restart->arg0, now = jiffies; - struct timespec __user *rmtp = (struct timespec __user *) restart->arg1; - long ret; - - /* Did it expire while we handled signals? 
*/ - if (!time_after(expire, now)) - return 0; - - expire = schedule_timeout_interruptible(expire - now); - - ret = 0; - if (expire) { - struct timespec t; - jiffies_to_timespec(expire, &t); - - ret = -ERESTART_RESTARTBLOCK; - if (rmtp && copy_to_user(rmtp, &t, sizeof(t))) - ret = -EFAULT; - /* The 'restart' block is already filled in */ - } - return ret; -} - -asmlinkage long sys_nanosleep(struct timespec __user *rqtp, struct timespec __user *rmtp) -{ - struct timespec t; - unsigned long expire; - long ret; - - if (copy_from_user(&t, rqtp, sizeof(t))) - return -EFAULT; - - if ((t.tv_nsec >= 1000000000L) || (t.tv_nsec < 0) || (t.tv_sec < 0)) - return -EINVAL; - - expire = timespec_to_jiffies(&t) + (t.tv_sec || t.tv_nsec); - expire = schedule_timeout_interruptible(expire); - - ret = 0; - if (expire) { - struct restart_block *restart; - jiffies_to_timespec(expire, &t); - if (rmtp && copy_to_user(rmtp, &t, sizeof(t))) - return -EFAULT; - - restart = &current_thread_info()->restart_block; - restart->fn = nanosleep_restart; - restart->arg0 = jiffies + expire; - restart->arg1 = (unsigned long) rmtp; - ret = -ERESTART_RESTARTBLOCK; - } - return ret; -} - /* * sys_sysinfo - fill in sysinfo struct */ -- cgit v1.2.2 From 97735f25d2ba898ec5e13746451525580631c834 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 9 Jan 2006 20:52:37 -0800 Subject: [PATCH] hrtimer: switch clock_nanosleep to hrtimer nanosleep API Switch clock_nanosleep to use the new nanosleep functions in hrtimer.c Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/posix-cpu-timers.c | 23 ++++--- kernel/posix-timers.c | 151 +++++++++------------------------------------- 2 files changed, 41 insertions(+), 133 deletions(-) (limited to 'kernel') diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c index abf6990c6eb5..520f6c59948d 100644 --- a/kernel/posix-cpu-timers.c +++ b/kernel/posix-cpu-timers.c @@ -1400,7 +1400,7 @@ void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx, static long posix_cpu_clock_nanosleep_restart(struct restart_block *); int posix_cpu_nsleep(const clockid_t which_clock, int flags, - struct timespec *rqtp) + struct timespec *rqtp, struct timespec __user *rmtp) { struct restart_block *restart_block = &current_thread_info()->restart_block; @@ -1425,7 +1425,6 @@ int posix_cpu_nsleep(const clockid_t which_clock, int flags, error = posix_cpu_timer_create(&timer); timer.it_process = current; if (!error) { - struct timespec __user *rmtp; static struct itimerspec zero_it; struct itimerspec it = { .it_value = *rqtp, .it_interval = {} };
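A hypothetical userspace sketch (not from the patch) of the two sleep flavours this path serves, and why rmtp must now be passed down explicitly: the kernel only writes the remaining time back for interrupted relative sleeps.

#include <errno.h>
#include <time.h>

/* Absolute sleep: a restart resumes the same deadline, no rmtp needed. */
int sleep_until(const struct timespec *deadline)
{
	return clock_nanosleep(CLOCK_MONOTONIC, TIMER_ABSTIME, deadline, NULL);
}

/* Relative sleep: on EINTR the kernel reports the time left in rem. */
int sleep_for(struct timespec req)
{
	struct timespec rem;

	while (clock_nanosleep(CLOCK_MONOTONIC, 0, &req, &rem) == EINTR)
		req = rem;
	return 0;
}

@@ -1472,7 +1471,6 @@ int posix_cpu_nsleep(const clockid_t which_clock, int flags, /* * Report back to the user the time still remaining.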
*/ - rmtp = (struct timespec __user *) restart_block->arg1; if (rmtp != NULL && !(flags & TIMER_ABSTIME) && copy_to_user(rmtp, &it.it_value, sizeof *rmtp)) return -EFAULT; @@ -1480,6 +1478,7 @@ int posix_cpu_nsleep(const clockid_t which_clock, int flags, restart_block->fn = posix_cpu_clock_nanosleep_restart; /* Caller already set restart_block->arg1 */ restart_block->arg0 = which_clock; + restart_block->arg1 = (unsigned long) rmtp; restart_block->arg2 = rqtp->tv_sec; restart_block->arg3 = rqtp->tv_nsec; @@ -1493,10 +1492,15 @@ static long posix_cpu_clock_nanosleep_restart(struct restart_block *restart_block) { clockid_t which_clock = restart_block->arg0; - struct timespec t = { .tv_sec = restart_block->arg2, - .tv_nsec = restart_block->arg3 }; + struct timespec __user *rmtp; + struct timespec t; + + rmtp = (struct timespec __user *) restart_block->arg1; + t.tv_sec = restart_block->arg2; + t.tv_nsec = restart_block->arg3; + restart_block->fn = do_no_restart_syscall; - return posix_cpu_nsleep(which_clock, TIMER_ABSTIME, &t); + return posix_cpu_nsleep(which_clock, TIMER_ABSTIME, &t, rmtp); } @@ -1519,9 +1523,10 @@ static int process_cpu_timer_create(struct k_itimer *timer) return posix_cpu_timer_create(timer); } static int process_cpu_nsleep(const clockid_t which_clock, int flags, - struct timespec *rqtp) + struct timespec *rqtp, + struct timespec __user *rmtp) { - return posix_cpu_nsleep(PROCESS_CLOCK, flags, rqtp); + return posix_cpu_nsleep(PROCESS_CLOCK, flags, rqtp, rmtp); } static int thread_cpu_clock_getres(const clockid_t which_clock, struct timespec *tp) @@ -1539,7 +1544,7 @@ static int thread_cpu_timer_create(struct k_itimer *timer) return posix_cpu_timer_create(timer); } static int thread_cpu_nsleep(const clockid_t which_clock, int flags, - struct timespec *rqtp) + struct timespec *rqtp, struct timespec __user *rmtp) { return -EINVAL; } diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c index 6b851a1bf4b0..ba900587b815 100644 --- a/kernel/posix-timers.c +++ b/kernel/posix-timers.c @@ -209,7 +209,8 @@ static inline int common_timer_create(struct k_itimer *new_timer) /* * These ones are defined below. */ -static int common_nsleep(const clockid_t, int flags, struct timespec *t); +static int common_nsleep(const clockid_t, int flags, struct timespec *t, + struct timespec __user *rmtp); static void common_timer_get(struct k_itimer *, struct itimerspec *); static int common_timer_set(struct k_itimer *, int, struct itimerspec *, struct itimerspec *); @@ -1227,7 +1228,7 @@ int do_posix_clock_notimer_create(struct k_itimer *timer) EXPORT_SYMBOL_GPL(do_posix_clock_notimer_create); int do_posix_clock_nonanosleep(const clockid_t clock, int flags, - struct timespec *t) + struct timespec *t, struct timespec __user *r) { #ifndef ENOTSUP return -EOPNOTSUPP; /* aka ENOTSUP in userland for POSIX */ @@ -1387,7 +1388,28 @@ void clock_was_set(void) up(&clock_was_set_lock); } -long clock_nanosleep_restart(struct restart_block *restart_block); +/* + * nanosleep for monotonic and realtime clocks + */ +static int common_nsleep(const clockid_t which_clock, int flags, + struct timespec *tsave, struct timespec __user *rmtp) +{ + int mode = flags & TIMER_ABSTIME ? HRTIMER_ABS : HRTIMER_REL; + int clockid = which_clock; + + switch (which_clock) { + case CLOCK_REALTIME: + /* Posix madness. Only absolute timers on clock realtime + are affected by clock set. 
*/ + if (mode == HRTIMER_ABS) + clockid = CLOCK_MONOTONIC; + case CLOCK_MONOTONIC: + break; + default: + return -EINVAL; + } + return hrtimer_nanosleep(tsave, rmtp, mode, clockid); +} asmlinkage long sys_clock_nanosleep(const clockid_t which_clock, int flags, @@ -1395,9 +1417,6 @@ sys_clock_nanosleep(const clockid_t which_clock, int flags, struct timespec __user *rmtp) { struct timespec t; - struct restart_block *restart_block = - &(current_thread_info()->restart_block); - int ret; if (invalid_clockid(which_clock)) return -EINVAL; @@ -1408,122 +1427,6 @@ sys_clock_nanosleep(const clockid_t which_clock, int flags, if (!timespec_valid(&t)) return -EINVAL; - /* - * Do this here as nsleep function does not have the real address. - */ - restart_block->arg1 = (unsigned long)rmtp; - - ret = CLOCK_DISPATCH(which_clock, nsleep, (which_clock, flags, &t)); - - if ((ret == -ERESTART_RESTARTBLOCK) && rmtp && - copy_to_user(rmtp, &t, sizeof (t))) - return -EFAULT; - return ret; -} - - -static int common_nsleep(const clockid_t which_clock, - int flags, struct timespec *tsave) -{ - struct timespec t, dum; - DECLARE_WAITQUEUE(abs_wqueue, current); - u64 rq_time = (u64)0; - s64 left; - int abs; - struct restart_block *restart_block = - &current_thread_info()->restart_block; - - abs_wqueue.flags = 0; - abs = flags & TIMER_ABSTIME; - - if (restart_block->fn == clock_nanosleep_restart) { - /* - * Interrupted by a non-delivered signal, pick up remaining - * time and continue. Remaining time is in arg2 & 3. - */ - restart_block->fn = do_no_restart_syscall; - - rq_time = restart_block->arg3; - rq_time = (rq_time << 32) + restart_block->arg2; - if (!rq_time) - return -EINTR; - left = rq_time - get_jiffies_64(); - if (left <= (s64)0) - return 0; /* Already passed */ - } - - if (abs && (posix_clocks[which_clock].clock_get != - posix_clocks[CLOCK_MONOTONIC].clock_get)) - add_wait_queue(&nanosleep_abs_wqueue, &abs_wqueue); - - do { - t = *tsave; - if (abs || !rq_time) { - adjust_abs_time(&posix_clocks[which_clock], &t, abs, - &rq_time, &dum); - } - - left = rq_time - get_jiffies_64(); - if (left >= (s64)MAX_JIFFY_OFFSET) - left = (s64)MAX_JIFFY_OFFSET; - if (left < (s64)0) - break; - - schedule_timeout_interruptible(left); - - left = rq_time - get_jiffies_64(); - } while (left > (s64)0 && !test_thread_flag(TIF_SIGPENDING)); - - if (abs_wqueue.task_list.next) - finish_wait(&nanosleep_abs_wqueue, &abs_wqueue); - - if (left > (s64)0) { - - /* - * Always restart abs calls from scratch to pick up any - * clock shifting that happened while we are away. - */ - if (abs) - return -ERESTARTNOHAND; - - left *= TICK_NSEC; - tsave->tv_sec = div_long_long_rem(left, - NSEC_PER_SEC, - &tsave->tv_nsec); - /* - * Restart works by saving the time remaing in - * arg2 & 3 (it is 64-bits of jiffies). The other - * info we need is the clock_id (saved in arg0). - * The sys_call interface needs the users - * timespec return address which _it_ saves in arg1. - * Since we have cast the nanosleep call to a clock_nanosleep - * both can be restarted with the same code. - */ - restart_block->fn = clock_nanosleep_restart; - restart_block->arg0 = which_clock; - /* - * Caller sets arg1 - */ - restart_block->arg2 = rq_time & 0xffffffffLL; - restart_block->arg3 = rq_time >> 32; - - return -ERESTART_RESTARTBLOCK; - } - - return 0; -}
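The block removed here kept the remaining sleep time as 64 bits of jiffies in arg2/arg3. Its replacement, hrtimer_nanosleep() earlier in this series, records the absolute ktime_t deadline instead. A condensed sketch of that packing, lifted in spirit from the new code (fragmentary, not compilable on its own):

	/* Suspend: split the 64-bit absolute expiry across 32-bit args. */
	restart->arg0 = timer.expires.tv64 & 0xFFFFFFFF;	/* low half  */
	restart->arg1 = timer.expires.tv64 >> 32;		/* high half */
	restart->arg2 = (unsigned long) rmtp;			/* user rmtp */

	/* Resume: reassemble and sleep absolutely to the old deadline. */
	timer.expires.tv64 = ((u64)restart->arg1 << 32) | (u64)restart->arg0;

-/* - * This will restart clock_nanosleep.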
- */ -long -clock_nanosleep_restart(struct restart_block *restart_block) -{ - struct timespec t; - int ret = common_nsleep(restart_block->arg0, 0, &t); - - if ((ret == -ERESTART_RESTARTBLOCK) && restart_block->arg1 && - copy_to_user((struct timespec __user *)(restart_block->arg1), &t, - sizeof (t))) - return -EFAULT; - return ret; + return CLOCK_DISPATCH(which_clock, nsleep, + (which_clock, flags, &t, rmtp)); } -- cgit v1.2.2 From becf8b5d00f4b47e847f98322cdaf8cd16243861 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 9 Jan 2006 20:52:38 -0800 Subject: [PATCH] hrtimer: convert posix timers completely - convert posix-timers.c to use hrtimers - remove the now obsolete abslist code Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar Signed-off-by: Miklos Szeredi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/posix-timers.c | 717 ++++++++++---------------------------------------- 1 file changed, 135 insertions(+), 582 deletions(-) (limited to 'kernel') diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c index ba900587b815..9e66e614862a 100644 --- a/kernel/posix-timers.c +++ b/kernel/posix-timers.c @@ -35,7 +35,6 @@ #include #include #include -#include #include #include @@ -49,12 +48,6 @@ #include #include -#define CLOCK_REALTIME_RES TICK_NSEC /* In nano seconds. */ - -static inline u64 mpy_l_X_l_ll(unsigned long mpy1,unsigned long mpy2) -{ - return (u64)mpy1 * mpy2; -} /* * Management arrays for POSIX timers. Timers are kept in slab memory * Timer ids are allocated by an external routine that keeps track of the @@ -140,18 +133,18 @@ static DEFINE_SPINLOCK(idr_lock); */ static struct k_clock posix_clocks[MAX_CLOCKS]; + /* - * We only have one real clock that can be set so we need only one abs list, - * even if we should want to have several clocks with differing resolutions. + * These ones are defined below. 
*/ -static struct k_clock_abs abs_list = {.list = LIST_HEAD_INIT(abs_list.list), - .lock = SPIN_LOCK_UNLOCKED}; +static int common_nsleep(const clockid_t, int flags, struct timespec *t, + struct timespec __user *rmtp); +static void common_timer_get(struct k_itimer *, struct itimerspec *); +static int common_timer_set(struct k_itimer *, int, + struct itimerspec *, struct itimerspec *); +static int common_timer_del(struct k_itimer *timer); -static void posix_timer_fn(unsigned long); -static u64 do_posix_clock_monotonic_gettime_parts( - struct timespec *tp, struct timespec *mo); -int do_posix_clock_monotonic_gettime(struct timespec *tp); -static int do_posix_clock_monotonic_get(const clockid_t, struct timespec *tp); +static int posix_timer_fn(void *data); static struct k_itimer *lock_timer(timer_t timer_id, unsigned long *flags); @@ -184,10 +177,12 @@ static inline int common_clock_getres(const clockid_t which_clock, return 0; } -static inline int common_clock_get(const clockid_t which_clock, - struct timespec *tp) +/* + * Get real time for posix timers + */ +static int common_clock_get(clockid_t which_clock, struct timespec *tp) { - getnstimeofday(tp); + ktime_get_real_ts(tp); return 0; } @@ -199,25 +194,14 @@ static inline int common_clock_set(const clockid_t which_clock, static inline int common_timer_create(struct k_itimer *new_timer) { - INIT_LIST_HEAD(&new_timer->it.real.abs_timer_entry); - init_timer(&new_timer->it.real.timer); - new_timer->it.real.timer.data = (unsigned long) new_timer; + hrtimer_init(&new_timer->it.real.timer, new_timer->it_clock); + new_timer->it.real.timer.data = new_timer; new_timer->it.real.timer.function = posix_timer_fn; return 0; } /* - * These ones are defined below. - */ -static int common_nsleep(const clockid_t, int flags, struct timespec *t, - struct timespec __user *rmtp); -static void common_timer_get(struct k_itimer *, struct itimerspec *); -static int common_timer_set(struct k_itimer *, int, - struct itimerspec *, struct itimerspec *); -static int common_timer_del(struct k_itimer *timer); - -/* - * Return nonzero iff we know a priori this clockid_t value is bogus. + * Return nonzero if we know a priori this clockid_t value is bogus. 
*/ static inline int invalid_clockid(const clockid_t which_clock) { @@ -227,26 +211,32 @@ static inline int invalid_clockid(const clockid_t which_clock) return 1; if (posix_clocks[which_clock].clock_getres != NULL) return 0; -#ifndef CLOCK_DISPATCH_DIRECT if (posix_clocks[which_clock].res != 0) return 0; -#endif return 1; } +/* + * Get monotonic time for posix timers + */ +static int posix_ktime_get_ts(clockid_t which_clock, struct timespec *tp) +{ + ktime_get_ts(tp); + return 0; +} /* * Initialize everything, well, just everything in Posix clocks/timers ;) */ static __init int init_posix_timers(void) { - struct k_clock clock_realtime = {.res = CLOCK_REALTIME_RES, - .abs_struct = &abs_list + struct k_clock clock_realtime = { + .clock_getres = hrtimer_get_res, }; - struct k_clock clock_monotonic = {.res = CLOCK_REALTIME_RES, - .abs_struct = NULL, - .clock_get = do_posix_clock_monotonic_get, - .clock_set = do_posix_clock_nosettime + struct k_clock clock_monotonic = { + .clock_getres = hrtimer_get_res, + .clock_get = posix_ktime_get_ts, + .clock_set = do_posix_clock_nosettime, }; register_posix_clock(CLOCK_REALTIME, &clock_realtime); @@ -260,117 +250,17 @@ static __init int init_posix_timers(void) __initcall(init_posix_timers); -static void tstojiffie(struct timespec *tp, int res, u64 *jiff) -{ - long sec = tp->tv_sec; - long nsec = tp->tv_nsec + res - 1; - - if (nsec >= NSEC_PER_SEC) { - sec++; - nsec -= NSEC_PER_SEC; - } - - /* - * The scaling constants are defined in - * The difference between there and here is that we do the - * res rounding and compute a 64-bit result (well so does that - * but it then throws away the high bits). - */ - *jiff = (mpy_l_X_l_ll(sec, SEC_CONVERSION) + - (mpy_l_X_l_ll(nsec, NSEC_CONVERSION) >> - (NSEC_JIFFIE_SC - SEC_JIFFIE_SC))) >> SEC_JIFFIE_SC; -} - -/* - * This function adjusts the timer as needed as a result of the clock - * being set. It should only be called for absolute timers, and then - * under the abs_list lock. It computes the time difference and sets - * the new jiffies value in the timer. It also updates the timers - * reference wall_to_monotonic value. It is complicated by the fact - * that tstojiffies() only handles positive times and it needs to work - * with both positive and negative times. Also, for negative offsets, - * we need to defeat the res round up. - * - * Return is true if there is a new time, else false. - */ -static long add_clockset_delta(struct k_itimer *timr, - struct timespec *new_wall_to) -{ - struct timespec delta; - int sign = 0; - u64 exp; - - set_normalized_timespec(&delta, - new_wall_to->tv_sec - - timr->it.real.wall_to_prev.tv_sec, - new_wall_to->tv_nsec - - timr->it.real.wall_to_prev.tv_nsec); - if (likely(!(delta.tv_sec | delta.tv_nsec))) - return 0; - if (delta.tv_sec < 0) { - set_normalized_timespec(&delta, - -delta.tv_sec, - 1 - delta.tv_nsec - - posix_clocks[timr->it_clock].res); - sign++; - } - tstojiffie(&delta, posix_clocks[timr->it_clock].res, &exp); - timr->it.real.wall_to_prev = *new_wall_to; - timr->it.real.timer.expires += (sign ? -exp : exp); - return 1; -} - -static void remove_from_abslist(struct k_itimer *timr) -{ - if (!list_empty(&timr->it.real.abs_timer_entry)) { - spin_lock(&abs_list.lock); - list_del_init(&timr->it.real.abs_timer_entry); - spin_unlock(&abs_list.lock); - } -} - static void schedule_next_timer(struct k_itimer *timr) { - struct timespec new_wall_to; - struct now_struct now; - unsigned long seq; - - /* - * Set up the timer for the next interval (if there is one). 
- * Note: this code uses the abs_timer_lock to protect - * it.real.wall_to_prev and must hold it until exp is set, not exactly - * obvious... - - * This function is used for CLOCK_REALTIME* and - * CLOCK_MONOTONIC* timers. If we ever want to handle other - * CLOCKs, the calling code (do_schedule_next_timer) would need - * to pull the "clock" info from the timer and dispatch the - * "other" CLOCKs "next timer" code (which, I suppose should - * also be added to the k_clock structure). - */ - if (!timr->it.real.incr) + if (timr->it.real.interval.tv64 == 0) return; - do { - seq = read_seqbegin(&xtime_lock); - new_wall_to = wall_to_monotonic; - posix_get_now(&now); - } while (read_seqretry(&xtime_lock, seq)); - - if (!list_empty(&timr->it.real.abs_timer_entry)) { - spin_lock(&abs_list.lock); - add_clockset_delta(timr, &new_wall_to); - - posix_bump_timer(timr, now); - - spin_unlock(&abs_list.lock); - } else { - posix_bump_timer(timr, now); - } + timr->it_overrun += hrtimer_forward(&timr->it.real.timer, + timr->it.real.interval); timr->it_overrun_last = timr->it_overrun; timr->it_overrun = -1; ++timr->it_requeue_pending; - add_timer(&timr->it.real.timer); + hrtimer_restart(&timr->it.real.timer); } /* @@ -391,31 +281,23 @@ void do_schedule_next_timer(struct siginfo *info) timr = lock_timer(info->si_tid, &flags); - if (!timr || timr->it_requeue_pending != info->si_sys_private) - goto exit; + if (timr && timr->it_requeue_pending == info->si_sys_private) { + if (timr->it_clock < 0) + posix_cpu_timer_schedule(timr); + else + schedule_next_timer(timr); - if (timr->it_clock < 0) /* CPU clock */ - posix_cpu_timer_schedule(timr); - else - schedule_next_timer(timr); - info->si_overrun = timr->it_overrun_last; -exit: - if (timr) - unlock_timer(timr, flags); + info->si_overrun = timr->it_overrun_last; + } + + unlock_timer(timr, flags); } int posix_timer_event(struct k_itimer *timr,int si_private) { memset(&timr->sigq->info, 0, sizeof(siginfo_t)); timr->sigq->info.si_sys_private = si_private; - /* - * Send signal to the process that owns this timer. - - * This code assumes that all the possible abs_lists share the - * same lock (there is only one list at this time). If this is - * not the case, the CLOCK info would need to be used to find - * the proper abs list lock. - */ + /* Send signal to the process that owns this timer.*/ timr->sigq->info.si_signo = timr->it_sigev_signo; timr->sigq->info.si_errno = 0; @@ -449,64 +331,35 @@ EXPORT_SYMBOL_GPL(posix_timer_event); * This code is for CLOCK_REALTIME* and CLOCK_MONOTONIC* timers. 
*/ -static void posix_timer_fn(unsigned long __data) +static int posix_timer_fn(void *data) { - struct k_itimer *timr = (struct k_itimer *) __data; + struct k_itimer *timr = data; unsigned long flags; - unsigned long seq; - struct timespec delta, new_wall_to; - u64 exp = 0; - int do_notify = 1; + int si_private = 0; + int ret = HRTIMER_NORESTART; spin_lock_irqsave(&timr->it_lock, flags); - if (!list_empty(&timr->it.real.abs_timer_entry)) { - spin_lock(&abs_list.lock); - do { - seq = read_seqbegin(&xtime_lock); - new_wall_to = wall_to_monotonic; - } while (read_seqretry(&xtime_lock, seq)); - set_normalized_timespec(&delta, - new_wall_to.tv_sec - - timr->it.real.wall_to_prev.tv_sec, - new_wall_to.tv_nsec - - timr->it.real.wall_to_prev.tv_nsec); - if (likely((delta.tv_sec | delta.tv_nsec ) == 0)) { - /* do nothing, timer is on time */ - } else if (delta.tv_sec < 0) { - /* do nothing, timer is already late */ - } else { - /* timer is early due to a clock set */ - tstojiffie(&delta, - posix_clocks[timr->it_clock].res, - &exp); - timr->it.real.wall_to_prev = new_wall_to; - timr->it.real.timer.expires += exp; - add_timer(&timr->it.real.timer); - do_notify = 0; - } - spin_unlock(&abs_list.lock); - } - if (do_notify) { - int si_private=0; + if (timr->it.real.interval.tv64 != 0) + si_private = ++timr->it_requeue_pending; - if (timr->it.real.incr) - si_private = ++timr->it_requeue_pending; - else { - remove_from_abslist(timr); + if (posix_timer_event(timr, si_private)) { + /* + * signal was not sent because of sig_ignor + * we will not get a call back to restart it AND + * it should be restarted. + */ + if (timr->it.real.interval.tv64 != 0) { + timr->it_overrun += + hrtimer_forward(&timr->it.real.timer, + timr->it.real.interval); + ret = HRTIMER_RESTART; } - - if (posix_timer_event(timr, si_private)) - /* - * signal was not sent because of sig_ignor - * we will not get a call back to restart it AND - * it should be restarted. - */ - schedule_next_timer(timr); } - unlock_timer(timr, flags); /* hold thru abs lock to keep irq off */ -} + unlock_timer(timr, flags); + return ret; +} static inline struct task_struct * good_sigevent(sigevent_t * event) { @@ -597,8 +450,7 @@ sys_timer_create(const clockid_t which_clock, goto out; } spin_lock_irq(&idr_lock); - error = idr_get_new(&posix_timers_id, - (void *) new_timer, + error = idr_get_new(&posix_timers_id, (void *) new_timer, &new_timer_id); spin_unlock_irq(&idr_lock); if (error == -EAGAIN) @@ -698,26 +550,6 @@ out: return error; } -/* - * good_timespec - * - * This function checks the elements of a timespec structure. - * - * Arguments: - * ts : Pointer to the timespec structure to check - * - * Return value: - * If a NULL pointer was passed in, or the tv_nsec field was less than 0 - * or greater than NSEC_PER_SEC, or the tv_sec field was less than 0, - * this function returns 0. Otherwise it returns 1. - */ -static int good_timespec(const struct timespec *ts) -{ - if ((!ts) || !timespec_valid(ts)) - return 0; - return 1; -} - /* * Locking issues: We need to protect the result of the id look up until * we get the timer locked down so it is not deleted under us. 
The @@ -770,39 +602,39 @@ static struct k_itimer * lock_timer(timer_t timer_id, unsigned long *flags) static void common_timer_get(struct k_itimer *timr, struct itimerspec *cur_setting) { - unsigned long expires; - struct now_struct now; - - do - expires = timr->it.real.timer.expires; - while ((volatile long) (timr->it.real.timer.expires) != expires); - - posix_get_now(&now); - - if (expires && - ((timr->it_sigev_notify & ~SIGEV_THREAD_ID) == SIGEV_NONE) && - !timr->it.real.incr && - posix_time_before(&timr->it.real.timer, &now)) - timr->it.real.timer.expires = expires = 0; - if (expires) { - if (timr->it_requeue_pending & REQUEUE_PENDING || - (timr->it_sigev_notify & ~SIGEV_THREAD_ID) == SIGEV_NONE) { - posix_bump_timer(timr, now); - expires = timr->it.real.timer.expires; - } - else - if (!timer_pending(&timr->it.real.timer)) - expires = 0; - if (expires) - expires -= now.jiffies; - } - jiffies_to_timespec(expires, &cur_setting->it_value); - jiffies_to_timespec(timr->it.real.incr, &cur_setting->it_interval); + ktime_t remaining; + struct hrtimer *timer = &timr->it.real.timer; - if (cur_setting->it_value.tv_sec < 0) { - cur_setting->it_value.tv_nsec = 1; - cur_setting->it_value.tv_sec = 0; + memset(cur_setting, 0, sizeof(struct itimerspec)); + remaining = hrtimer_get_remaining(timer); + + /* Time left ? or timer pending */ + if (remaining.tv64 > 0 || hrtimer_active(timer)) + goto calci; + /* interval timer ? */ + if (timr->it.real.interval.tv64 == 0) + return; + /* + * When a requeue is pending or this is a SIGEV_NONE timer + * move the expiry time forward by intervals, so expiry is > + * now. + */ + if (timr->it_requeue_pending & REQUEUE_PENDING || + (timr->it_sigev_notify & ~SIGEV_THREAD_ID) == SIGEV_NONE) { + timr->it_overrun += + hrtimer_forward(timer, timr->it.real.interval); + remaining = hrtimer_get_remaining(timer); } + calci: + /* interval timer ? */ + if (timr->it.real.interval.tv64 != 0) + cur_setting->it_interval = + ktime_to_timespec(timr->it.real.interval); + /* Return 0 only, when the timer is expired and not pending */ + if (remaining.tv64 <= 0) + cur_setting->it_value.tv_nsec = 1; + else + cur_setting->it_value = ktime_to_timespec(remaining); } /* Get the time remaining on a POSIX.1b interval timer. */ @@ -826,6 +658,7 @@ sys_timer_gettime(timer_t timer_id, struct itimerspec __user *setting) return 0; } + /* * Get the number of overruns of a POSIX.1b interval timer. This is to * be the overrun of the timer last delivered. At the same time we are @@ -835,7 +668,6 @@ sys_timer_gettime(timer_t timer_id, struct itimerspec __user *setting) * the call back to do_schedule_next_timer(). So all we need to do is * to pick up the frozen overrun. */ - asmlinkage long sys_timer_getoverrun(timer_t timer_id) { @@ -852,84 +684,6 @@ sys_timer_getoverrun(timer_t timer_id) return overrun; } -/* - * Adjust for absolute time - * - * If absolute time is given and it is not CLOCK_MONOTONIC, we need to - * adjust for the offset between the timer clock (CLOCK_MONOTONIC) and - * what ever clock he is using. - * - * If it is relative time, we need to add the current (CLOCK_MONOTONIC) - * time to it to get the proper time for the timer. 
- */ -static int adjust_abs_time(struct k_clock *clock, struct timespec *tp, - int abs, u64 *exp, struct timespec *wall_to) -{ - struct timespec now; - struct timespec oc = *tp; - u64 jiffies_64_f; - int rtn =0; - - if (abs) { - /* - * The mask pick up the 4 basic clocks - */ - if (!((clock - &posix_clocks[0]) & ~CLOCKS_MASK)) { - jiffies_64_f = do_posix_clock_monotonic_gettime_parts( - &now, wall_to); - /* - * If we are doing a MONOTONIC clock - */ - if((clock - &posix_clocks[0]) & CLOCKS_MONO){ - now.tv_sec += wall_to->tv_sec; - now.tv_nsec += wall_to->tv_nsec; - } - } else { - /* - * Not one of the basic clocks - */ - clock->clock_get(clock - posix_clocks, &now); - jiffies_64_f = get_jiffies_64(); - } - /* - * Take away now to get delta and normalize - */ - set_normalized_timespec(&oc, oc.tv_sec - now.tv_sec, - oc.tv_nsec - now.tv_nsec); - }else{ - jiffies_64_f = get_jiffies_64(); - } - /* - * Check if the requested time is prior to now (if so set now) - */ - if (oc.tv_sec < 0) - oc.tv_sec = oc.tv_nsec = 0; - - if (oc.tv_sec | oc.tv_nsec) - set_normalized_timespec(&oc, oc.tv_sec, - oc.tv_nsec + clock->res); - tstojiffie(&oc, clock->res, exp); - - /* - * Check if the requested time is more than the timer code - * can handle (if so we error out but return the value too). - */ - if (*exp > ((u64)MAX_JIFFY_OFFSET)) - /* - * This is a considered response, not exactly in - * line with the standard (in fact it is silent on - * possible overflows). We assume such a large - * value is ALMOST always a programming error and - * try not to compound it by setting a really dumb - * value. - */ - rtn = -EINVAL; - /* - * return the actual jiffies expire time, full 64 bits - */ - *exp += jiffies_64_f; - return rtn; -} /* Set a POSIX.1b interval timer. */ /* timr->it_lock is taken. */ @@ -937,68 +691,48 @@ static inline int common_timer_set(struct k_itimer *timr, int flags, struct itimerspec *new_setting, struct itimerspec *old_setting) { - struct k_clock *clock = &posix_clocks[timr->it_clock]; - u64 expire_64; + struct hrtimer *timer = &timr->it.real.timer; if (old_setting) common_timer_get(timr, old_setting); /* disable the timer */ - timr->it.real.incr = 0; + timr->it.real.interval.tv64 = 0; /* * careful here. If smp we could be in the "fire" routine which will * be spinning as we hold the lock. But this is ONLY an SMP issue. */ - if (try_to_del_timer_sync(&timr->it.real.timer) < 0) { -#ifdef CONFIG_SMP - /* - * It can only be active if on an other cpu. Since - * we have cleared the interval stuff above, it should - * clear once we release the spin lock. Of course once - * we do that anything could happen, including the - * complete melt down of the timer. So return with - * a "retry" exit status. 
- */ + if (hrtimer_try_to_cancel(timer) < 0) return TIMER_RETRY; -#endif - } - - remove_from_abslist(timr); timr->it_requeue_pending = (timr->it_requeue_pending + 2) & ~REQUEUE_PENDING; timr->it_overrun_last = 0; - timr->it_overrun = -1; - /* - *switch off the timer when it_value is zero - */ - if (!new_setting->it_value.tv_sec && !new_setting->it_value.tv_nsec) { - timr->it.real.timer.expires = 0; - return 0; - } - if (adjust_abs_time(clock, - &new_setting->it_value, flags & TIMER_ABSTIME, - &expire_64, &(timr->it.real.wall_to_prev))) { - return -EINVAL; - } - timr->it.real.timer.expires = (unsigned long)expire_64; - tstojiffie(&new_setting->it_interval, clock->res, &expire_64); - timr->it.real.incr = (unsigned long)expire_64; + /* switch off the timer when it_value is zero */ + if (!new_setting->it_value.tv_sec && !new_setting->it_value.tv_nsec) return 0; - /* - * We do not even queue SIGEV_NONE timers! But we do put them - * in the abs list so we can do that right. + /* Posix madness. Only absolute CLOCK_REALTIME timers + * are affected by clock sets. So we must reinitialize + * the timer. */ - if (((timr->it_sigev_notify & ~SIGEV_THREAD_ID) != SIGEV_NONE)) - add_timer(&timr->it.real.timer); - - if (flags & TIMER_ABSTIME && clock->abs_struct) { - spin_lock(&clock->abs_struct->lock); - list_add_tail(&(timr->it.real.abs_timer_entry), - &(clock->abs_struct->list)); - spin_unlock(&clock->abs_struct->lock); - } + if (timr->it_clock == CLOCK_REALTIME && (flags & TIMER_ABSTIME)) + hrtimer_rebase(timer, CLOCK_REALTIME); + else + hrtimer_rebase(timer, CLOCK_MONOTONIC); + + timer->expires = timespec_to_ktime(new_setting->it_value); + + /* Convert interval */ + timr->it.real.interval = timespec_to_ktime(new_setting->it_interval); + + /* SIGEV_NONE timers are not queued ! See common_timer_get */ + if (((timr->it_sigev_notify & ~SIGEV_THREAD_ID) == SIGEV_NONE)) + return 0; + + hrtimer_start(timer, timer->expires, (flags & TIMER_ABSTIME) ? + HRTIMER_ABS : HRTIMER_REL); return 0; } @@ -1020,8 +754,8 @@ sys_timer_settime(timer_t timer_id, int flags, if (copy_from_user(&new_spec, new_setting, sizeof (new_spec))) return -EFAULT; - if ((!good_timespec(&new_spec.it_interval)) || - (!good_timespec(&new_spec.it_value))) + if (!timespec_valid(&new_spec.it_interval) || + !timespec_valid(&new_spec.it_value)) return -EINVAL; retry: timr = lock_timer(timer_id, &flag); @@ -1037,8 +771,8 @@ retry: goto retry; } - if (old_setting && !error && copy_to_user(old_setting, - &old_spec, sizeof (old_spec))) + if (old_setting && !error && + copy_to_user(old_setting, &old_spec, sizeof (old_spec))) error = -EFAULT; return error;
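For context, a hypothetical userspace counterpart (not from the patch): only an absolute CLOCK_REALTIME timer is supposed to track clock_settime()/settimeofday() changes, which is exactly why common_timer_set() above rebases the hrtimer onto CLOCK_REALTIME only for the TIMER_ABSTIME case.

#include <time.h>

/* Assumes tid was created with timer_create(CLOCK_REALTIME, ...). */
int arm_at_wall_time(timer_t tid, time_t deadline)
{
	struct itimerspec its = {
		.it_value    = { .tv_sec = deadline, .tv_nsec = 0 },
		.it_interval = { .tv_sec = 0, .tv_nsec = 0 },	/* one shot */
	};

	/* TIMER_ABSTIME: the expiry moves if the wall clock is set. */
	return timer_settime(tid, TIMER_ABSTIME, &its, NULL);
}

@@ -1046,24 +780,10 @@ retry: static inline int common_timer_del(struct k_itimer *timer) { - timer->it.real.incr = 0; + timer->it.real.interval.tv64 = 0; - if (try_to_del_timer_sync(&timer->it.real.timer) < 0) { -#ifdef CONFIG_SMP - /* - * It can only be active if on an other cpu. Since - * we have cleared the interval stuff above, it should - * clear once we release the spin lock. Of course once - * we do that anything could happen, including the - * complete melt down of the timer. So return with - * a "retry" exit status.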
- */ + if (hrtimer_try_to_cancel(&timer->it.real.timer) < 0) return TIMER_RETRY; -#endif - } - - remove_from_abslist(timer); - return 0; } @@ -1079,24 +799,16 @@ sys_timer_delete(timer_t timer_id) struct k_itimer *timer; long flags; -#ifdef CONFIG_SMP - int error; retry_delete: -#endif timer = lock_timer(timer_id, &flags); if (!timer) return -EINVAL; -#ifdef CONFIG_SMP - error = timer_delete_hook(timer); - - if (error == TIMER_RETRY) { + if (timer_delete_hook(timer) == TIMER_RETRY) { unlock_timer(timer, flags); goto retry_delete; } -#else - timer_delete_hook(timer); -#endif + spin_lock(¤t->sighand->siglock); list_del(&timer->list); spin_unlock(¤t->sighand->siglock); @@ -1113,6 +825,7 @@ retry_delete: release_posix_timer(timer, IT_ID_SET); return 0; } + /* * return timer owned by the process, used by exit_itimers */ @@ -1120,22 +833,13 @@ static inline void itimer_delete(struct k_itimer *timer) { unsigned long flags; -#ifdef CONFIG_SMP - int error; retry_delete: -#endif spin_lock_irqsave(&timer->it_lock, flags); -#ifdef CONFIG_SMP - error = timer_delete_hook(timer); - - if (error == TIMER_RETRY) { + if (timer_delete_hook(timer) == TIMER_RETRY) { unlock_timer(timer, flags); goto retry_delete; } -#else - timer_delete_hook(timer); -#endif list_del(&timer->list); /* * This keeps any tasks waiting on the spin lock from thinking @@ -1164,57 +868,7 @@ void exit_itimers(struct signal_struct *sig) } } -/* - * And now for the "clock" calls - * - * These functions are called both from timer functions (with the timer - * spin_lock_irq() held and from clock calls with no locking. They must - * use the save flags versions of locks. - */ - -/* - * We do ticks here to avoid the irq lock ( they take sooo long). - * The seqlock is great here. Since we a reader, we don't really care - * if we are interrupted since we don't take lock that will stall us or - * any other cpu. Voila, no irq lock is needed. - * - */ - -static u64 do_posix_clock_monotonic_gettime_parts( - struct timespec *tp, struct timespec *mo) -{ - u64 jiff; - unsigned int seq; - - do { - seq = read_seqbegin(&xtime_lock); - getnstimeofday(tp); - *mo = wall_to_monotonic; - jiff = jiffies_64; - - } while(read_seqretry(&xtime_lock, seq)); - - return jiff; -} - -static int do_posix_clock_monotonic_get(const clockid_t clock, - struct timespec *tp) -{ - struct timespec wall_to_mono; - - do_posix_clock_monotonic_gettime_parts(tp, &wall_to_mono); - - set_normalized_timespec(tp, tp->tv_sec + wall_to_mono.tv_sec, - tp->tv_nsec + wall_to_mono.tv_nsec); - - return 0; -} - -int do_posix_clock_monotonic_gettime(struct timespec *tp) -{ - return do_posix_clock_monotonic_get(CLOCK_MONOTONIC, tp); -} - +/* Not available / possible... functions */ int do_posix_clock_nosettime(const clockid_t clockid, struct timespec *tp) { return -EINVAL; @@ -1287,107 +941,6 @@ sys_clock_getres(const clockid_t which_clock, struct timespec __user *tp) return error; } -/* - * The standard says that an absolute nanosleep call MUST wake up at - * the requested time in spite of clock settings. Here is what we do: - * For each nanosleep call that needs it (only absolute and not on - * CLOCK_MONOTONIC* (as it can not be set)) we thread a little structure - * into the "nanosleep_abs_list". All we need is the task_struct pointer. - * When ever the clock is set we just wake up all those tasks. The rest - * is done by the while loop in clock_nanosleep(). 
- * - * On locking, clock_was_set() is called from update_wall_clock which - * holds (or has held for it) a write_lock_irq( xtime_lock) and is - * called from the timer bh code. Thus we need the irq save locks. - * - * Also, on the call from update_wall_clock, that is done as part of a - * softirq thing. We don't want to delay the system that much (possibly - * long list of timers to fix), so we defer that work to keventd. - */ - -static DECLARE_WAIT_QUEUE_HEAD(nanosleep_abs_wqueue); -static DECLARE_WORK(clock_was_set_work, (void(*)(void*))clock_was_set, NULL); - -static DECLARE_MUTEX(clock_was_set_lock); - -void clock_was_set(void) -{ - struct k_itimer *timr; - struct timespec new_wall_to; - LIST_HEAD(cws_list); - unsigned long seq; - - - if (unlikely(in_interrupt())) { - schedule_work(&clock_was_set_work); - return; - } - wake_up_all(&nanosleep_abs_wqueue); - - /* - * Check if there exist TIMER_ABSTIME timers to correct. - * - * Notes on locking: This code is run in task context with irq - * on. We CAN be interrupted! All other usage of the abs list - * lock is under the timer lock which holds the irq lock as - * well. We REALLY don't want to scan the whole list with the - * interrupt system off, AND we would like a sequence lock on - * this code as well. Since we assume that the clock will not - * be set often, it seems ok to take and release the irq lock - * for each timer. In fact add_timer will do this, so this is - * not an issue. So we know when we are done, we will move the - * whole list to a new location. Then as we process each entry, - * we will move it to the actual list again. This way, when our - * copy is empty, we are done. We are not all that concerned - * about preemption so we will use a semaphore lock to protect - * aginst reentry. This way we will not stall another - * processor. It is possible that this may delay some timers - * that should have expired, given the new clock, but even this - * will be minimal as we will always update to the current time, - * even if it was set by a task that is waiting for entry to - * this code. Timers that expire too early will be caught by - * the expire code and restarted. - - * Absolute timers that repeat are left in the abs list while - * waiting for the task to pick up the signal. This means we - * may find timers that are not in the "add_timer" list, but are - * in the abs list. We do the same thing for these, save - * putting them back in the "add_timer" list. (Note, these are - * left in the abs list mainly to indicate that they are - * ABSOLUTE timers, a fact that is used by the re-arm code, and - * for which we have no other flag.) - - */ - - down(&clock_was_set_lock); - spin_lock_irq(&abs_list.lock); - list_splice_init(&abs_list.list, &cws_list); - spin_unlock_irq(&abs_list.lock); - do { - do { - seq = read_seqbegin(&xtime_lock); - new_wall_to = wall_to_monotonic; - } while (read_seqretry(&xtime_lock, seq)); - - spin_lock_irq(&abs_list.lock); - if (list_empty(&cws_list)) { - spin_unlock_irq(&abs_list.lock); - break; - } - timr = list_entry(cws_list.next, struct k_itimer, - it.real.abs_timer_entry); - - list_del_init(&timr->it.real.abs_timer_entry); - if (add_clockset_delta(timr, &new_wall_to) && - del_timer(&timr->it.real.timer)) /* timer run yet? 
 */
-			add_timer(&timr->it.real.timer);
-		list_add(&timr->it.real.abs_timer_entry, &abs_list.list);
-		spin_unlock_irq(&abs_list.lock);
-	} while (1);
-
-	up(&clock_was_set_lock);
-}
-
 /*
  * nanosleep for monotonic and realtime clocks
  */
@@ -1401,7 +954,7 @@ static int common_nsleep(const clockid_t which_clock, int flags,
 	case CLOCK_REALTIME:
 		/* Posix madness. Only absolute timers on clock realtime
 		   are affected by clock set. */
-		if (mode == HRTIMER_ABS)
+		if (mode != HRTIMER_ABS)
 			clockid = CLOCK_MONOTONIC;
 	case CLOCK_MONOTONIC:
 		break;
-- 
cgit v1.2.2


From 69778e325c3007c8c8a653fcee2c298ffe071fd4 Mon Sep 17 00:00:00 2001
From: Matt Helsley
Date: Mon, 9 Jan 2006 20:52:39 -0800
Subject: [PATCH] Export ktime_get_ts()

This series removes the getnstimestamp() function from kernel/time.c in
favor of kernel/hrtimer.c's ktime_get_ts() function, which currently does
exactly the same thing: retrieves a high-resolution (ns) timespec
structure and performs the wall_to_monotonic adjustment.

This patch:

Export ktime_get_ts() to be used as a timestamp function since it uses
getnstimeofday() and does the wall_to_monotonic adjustment.

Signed-off-by: Matt Helsley
Cc: john stultz
Cc: Thomas Gleixner
Cc: Ingo Molnar
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 kernel/hrtimer.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'kernel')

diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index 11fde0caee56..f073a2461faa 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -108,6 +108,7 @@ void ktime_get_ts(struct timespec *ts)
 	set_normalized_timespec(ts, ts->tv_sec + tomono.tv_sec,
 				ts->tv_nsec + tomono.tv_nsec);
 }
+EXPORT_SYMBOL_GPL(ktime_get_ts);

 /*
  * Functions and macros which are different for UP/SMP systems are kept in a
-- 
cgit v1.2.2


From d1c0b8f835aeba85aa428aaec6d521ef4639c7fa Mon Sep 17 00:00:00 2001
From: Matt Helsley
Date: Mon, 9 Jan 2006 20:52:40 -0800
Subject: [PATCH] Remove getnstimestamp()

Remove getnstimestamp() in favor of ktime.h's ktime_get_ts()

Signed-off-by: Matt Helsley
Cc: john stultz
Cc: Thomas Gleixner
Cc: Ingo Molnar
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 kernel/time.c | 22 ----------------------
 1 file changed, 22 deletions(-)

(limited to 'kernel')

diff --git a/kernel/time.c b/kernel/time.c
index cf5a4582a672..169e8329e0b6 100644
--- a/kernel/time.c
+++ b/kernel/time.c
@@ -564,28 +564,6 @@ void getnstimeofday(struct timespec *tv)
 EXPORT_SYMBOL_GPL(getnstimeofday);
 #endif

-void getnstimestamp(struct timespec *ts)
-{
-	unsigned int seq;
-	struct timespec wall2mono;
-
-	/* synchronize with settimeofday() changes */
-	do {
-		seq = read_seqbegin(&xtime_lock);
-		getnstimeofday(ts);
-		wall2mono = wall_to_monotonic;
-	} while(unlikely(read_seqretry(&xtime_lock, seq)));
-
-	/* adjust to monotonicaly-increasing values */
-	ts->tv_sec += wall2mono.tv_sec;
-	ts->tv_nsec += wall2mono.tv_nsec;
-	while (unlikely(ts->tv_nsec >= NSEC_PER_SEC)) {
-		ts->tv_nsec -= NSEC_PER_SEC;
-		ts->tv_sec++;
-	}
-}
-EXPORT_SYMBOL_GPL(getnstimestamp);
-
 /* Converts Gregorian date to seconds since 1970-01-01 00:00:00.
  * Assumes input in normal date format, i.e. 1980-12-31 23:59:59
  * => year=1980, mon=12, day=31, hour=23, min=59, sec=59.
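For old getnstimestamp() callers the conversion is mechanical. A minimal
sketch of the replacement pattern (the sample_latency() and delta_ns()
helpers here are illustrative only, not part of the patch):

	#include <linux/kernel.h>
	#include <linux/time.h>
	#include <linux/hrtimer.h>	/* declares ktime_get_ts() */

	/* illustrative helper: nanoseconds between two timestamps */
	static s64 delta_ns(const struct timespec *a, const struct timespec *b)
	{
		return (s64)(b->tv_sec - a->tv_sec) * NSEC_PER_SEC +
			(b->tv_nsec - a->tv_nsec);
	}

	static void sample_latency(void)
	{
		struct timespec t0, t1;

		ktime_get_ts(&t0);	/* getnstimeofday() + wall_to_monotonic */
		/* ... the work being timed ... */
		ktime_get_ts(&t1);
		printk(KERN_DEBUG "took %lld ns\n",
			(long long)delta_ns(&t0, &t1));
	}

Since the result is monotonic, the delta is immune to settimeofday() and
NTP steps, which is exactly what a timestamp function wants.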
-- 
cgit v1.2.2


From 2d14e39da8712cff8a92298f464a25afb4283ccf Mon Sep 17 00:00:00 2001
From: Anil S Keshavamurthy
Date: Mon, 9 Jan 2006 20:52:41 -0800
Subject: [PATCH] kprobes: enable functions only for required arch

kernel/kprobes.c defines get_insn_slot() and free_insn_slot(), which are
currently required _only_ for x86_64 and powerpc (which have no-exec
support). For reference, the get/free_insn_slot() functions manage the
memory page which is mapped executable, as required for instruction
emulation.

This patch moves those two functions under __ARCH_WANT_KPROBES_INSN_SLOT
and defines __ARCH_WANT_KPROBES_INSN_SLOT in the arch-specific kprobes.h
files.

Signed-off-by: Anil S Keshavamurthy
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 kernel/kprobes.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'kernel')

diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 3bb71e63a37e..3897630d2335 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -52,6 +52,7 @@ static DEFINE_SPINLOCK(kprobe_lock);	/* Protects kprobe_table */
 DEFINE_SPINLOCK(kretprobe_lock);	/* Protects kretprobe_inst_table */
 static DEFINE_PER_CPU(struct kprobe *, kprobe_instance) = NULL;

+#ifdef __ARCH_WANT_KPROBES_INSN_SLOT
 /*
  * kprobe->ainsn.insn points to the copy of the instruction to be
  * single-stepped. x86_64, POWER4 and above have no-exec support and
@@ -151,6 +152,7 @@ void __kprobes free_insn_slot(kprobe_opcode_t *slot)
 		}
 	}
 }
+#endif

 /* We have preemption disabled.. so it is safe to use __ versions */
 static inline void set_kprobe_instance(struct kprobe *kp)
-- 
cgit v1.2.2


From 49a2a1b83ba6fa40c41968d6a28ba16e7ed0c3f7 Mon Sep 17 00:00:00 2001
From: Anil S Keshavamurthy
Date: Mon, 9 Jan 2006 20:52:43 -0800
Subject: [PATCH] kprobes: changed from using spinlock to mutex

Since the kprobes runtime exception handlers are now lock-free (this code
path now uses RCU to walk through the list), there is no need for
register/unregister_kprobe() to use spin_{lock/unlock}_irq{save/restore}.
The serialization during registration/unregistration is now possible
using just a mutex.

In the process, this patch also fixes a minor memory leak for x86_64 and
powerpc.
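In essence this is the classic RCU update pattern: readers traverse the
list with no locks at all, while writers only serialize against each
other with a sleeping lock. A condensed sketch of that pattern (obj,
handle() and the list are placeholders, not the kprobes data structures):

	#include <linux/list.h>
	#include <linux/rcupdate.h>
	#include <linux/slab.h>
	#include <asm/semaphore.h>

	static DECLARE_MUTEX(update_sem);  /* a "mutex" is still a semaphore here */
	static LIST_HEAD(obj_list);

	struct obj {
		struct list_head list;
	};

	static void handle(struct obj *o)
	{
		/* placeholder per-object work */
	}

	/* fast path: lock-free, may run from the exception handler */
	static void reader(void)
	{
		struct obj *o;

		rcu_read_lock();
		list_for_each_entry_rcu(o, &obj_list, list)
			handle(o);
		rcu_read_unlock();
	}

	/* slow path: writers only serialize among themselves */
	static void retire(struct obj *o)
	{
		down(&update_sem);
		list_del_rcu(&o->list);
		up(&update_sem);

		synchronize_sched();	/* wait out pre-existing readers */
		kfree(o);
	}

The kprobe handlers run with preemption disabled, which is why the
unregister path below can use synchronize_sched() to wait for them.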
Signed-off-by: Anil S Keshavamurthy Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kprobes.c | 91 ++++++++++++++++++++++++++------------------------------ 1 file changed, 43 insertions(+), 48 deletions(-) (limited to 'kernel') diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 3897630d2335..f14ccd35e9b6 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -48,7 +48,7 @@ static struct hlist_head kprobe_table[KPROBE_TABLE_SIZE]; static struct hlist_head kretprobe_inst_table[KPROBE_TABLE_SIZE]; -static DEFINE_SPINLOCK(kprobe_lock); /* Protects kprobe_table */ +static DECLARE_MUTEX(kprobe_mutex); /* Protects kprobe_table */ DEFINE_SPINLOCK(kretprobe_lock); /* Protects kretprobe_inst_table */ static DEFINE_PER_CPU(struct kprobe *, kprobe_instance) = NULL; @@ -167,7 +167,7 @@ static inline void reset_kprobe_instance(void) /* * This routine is called either: - * - under the kprobe_lock spinlock - during kprobe_[un]register() + * - under the kprobe_mutex - during kprobe_[un]register() * OR * - with preemption disabled - from arch/xxx/kernel/kprobes.c */ @@ -420,7 +420,6 @@ static inline void add_aggr_kprobe(struct kprobe *ap, struct kprobe *p) /* * This is the second or subsequent kprobe at the address - handle * the intricacies - * TODO: Move kcalloc outside the spin_lock */ static int __kprobes register_aggr_kprobe(struct kprobe *old_p, struct kprobe *p) @@ -442,25 +441,6 @@ static int __kprobes register_aggr_kprobe(struct kprobe *old_p, return ret; } -/* kprobe removal house-keeping routines */ -static inline void cleanup_kprobe(struct kprobe *p, unsigned long flags) -{ - arch_disarm_kprobe(p); - hlist_del_rcu(&p->hlist); - spin_unlock_irqrestore(&kprobe_lock, flags); - arch_remove_kprobe(p); -} - -static inline void cleanup_aggr_kprobe(struct kprobe *old_p, - struct kprobe *p, unsigned long flags) -{ - list_del_rcu(&p->list); - if (list_empty(&old_p->list)) - cleanup_kprobe(old_p, flags); - else - spin_unlock_irqrestore(&kprobe_lock, flags); -} - static int __kprobes in_kprobes_functions(unsigned long addr) { if (addr >= (unsigned long)__kprobes_text_start @@ -472,7 +452,6 @@ static int __kprobes in_kprobes_functions(unsigned long addr) int __kprobes register_kprobe(struct kprobe *p) { int ret = 0; - unsigned long flags = 0; struct kprobe *old_p; struct module *mod; @@ -484,18 +463,17 @@ int __kprobes register_kprobe(struct kprobe *p) (unlikely(!try_module_get(mod)))) return -EINVAL; - if ((ret = arch_prepare_kprobe(p)) != 0) - goto rm_kprobe; - p->nmissed = 0; - spin_lock_irqsave(&kprobe_lock, flags); + down(&kprobe_mutex); old_p = get_kprobe(p->addr); if (old_p) { ret = register_aggr_kprobe(old_p, p); goto out; } - arch_copy_kprobe(p); + if ((ret = arch_prepare_kprobe(p)) != 0) + goto out; + INIT_HLIST_NODE(&p->hlist); hlist_add_head_rcu(&p->hlist, &kprobe_table[hash_ptr(p->addr, KPROBE_HASH_BITS)]); @@ -503,10 +481,8 @@ int __kprobes register_kprobe(struct kprobe *p) arch_arm_kprobe(p); out: - spin_unlock_irqrestore(&kprobe_lock, flags); -rm_kprobe: - if (ret == -EEXIST) - arch_remove_kprobe(p); + up(&kprobe_mutex); + if (ret && mod) module_put(mod); return ret; @@ -514,29 +490,48 @@ rm_kprobe: void __kprobes unregister_kprobe(struct kprobe *p) { - unsigned long flags; - struct kprobe *old_p; struct module *mod; + struct kprobe *old_p, *cleanup_p; - spin_lock_irqsave(&kprobe_lock, flags); + down(&kprobe_mutex); old_p = get_kprobe(p->addr); - if (old_p) { - /* cleanup_*_kprobe() does the spin_unlock_irqrestore */ - if (old_p->pre_handler == aggr_pre_handler) - 
cleanup_aggr_kprobe(old_p, p, flags); - else - cleanup_kprobe(p, flags); + if (unlikely(!old_p)) { + up(&kprobe_mutex); + return; + } - synchronize_sched(); + if ((old_p->pre_handler == aggr_pre_handler) && + (p->list.next == &old_p->list) && + (p->list.prev == &old_p->list)) { + /* Only one element in the aggregate list */ + arch_disarm_kprobe(p); + hlist_del_rcu(&old_p->hlist); + cleanup_p = old_p; + } else if (old_p == p) { + /* Only one kprobe element in the hash list */ + arch_disarm_kprobe(p); + hlist_del_rcu(&p->hlist); + cleanup_p = p; + } else { + list_del_rcu(&p->list); + cleanup_p = NULL; + } - if ((mod = module_text_address((unsigned long)p->addr))) - module_put(mod); + up(&kprobe_mutex); - if (old_p->pre_handler == aggr_pre_handler && - list_empty(&old_p->list)) + synchronize_sched(); + if ((mod = module_text_address((unsigned long)p->addr))) + module_put(mod); + + if (cleanup_p) { + if (cleanup_p->pre_handler == aggr_pre_handler) { + list_del_rcu(&p->list); kfree(old_p); - } else - spin_unlock_irqrestore(&kprobe_lock, flags); + } + down(&kprobe_mutex); + arch_remove_kprobe(p); + up(&kprobe_mutex); + } } static struct notifier_block kprobe_exceptions_nb = { -- cgit v1.2.2 From f709b122343fb9a010b6cf2d5559641f1820f7c9 Mon Sep 17 00:00:00 2001 From: Keshavamurthy Anil S Date: Mon, 9 Jan 2006 20:52:44 -0800 Subject: [PATCH] kprobes-changed-from-using-spinlock-to-mutex fix Based on some feedback from Oleg Nesterov, I have made few changes to previously posted patch. Signed-off-by: Anil S Keshavamurthy Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kprobes.c | 32 ++++++++++++++++++-------------- 1 file changed, 18 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/kprobes.c b/kernel/kprobes.c index f14ccd35e9b6..f1c0e61a2cb4 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -431,7 +431,7 @@ static int __kprobes register_aggr_kprobe(struct kprobe *old_p, copy_kprobe(old_p, p); ret = add_new_kprobe(old_p, p); } else { - ap = kcalloc(1, sizeof(struct kprobe), GFP_ATOMIC); + ap = kcalloc(1, sizeof(struct kprobe), GFP_KERNEL); if (!ap) return -ENOMEM; add_aggr_kprobe(ap, old_p); @@ -491,7 +491,8 @@ out: void __kprobes unregister_kprobe(struct kprobe *p) { struct module *mod; - struct kprobe *old_p, *cleanup_p; + struct kprobe *old_p, *list_p; + int cleanup_p; down(&kprobe_mutex); old_p = get_kprobe(p->addr); @@ -499,22 +500,25 @@ void __kprobes unregister_kprobe(struct kprobe *p) up(&kprobe_mutex); return; } - - if ((old_p->pre_handler == aggr_pre_handler) && + if (p != old_p) { + list_for_each_entry_rcu(list_p, &old_p->list, list) + if (list_p == p) + /* kprobe p is a valid probe */ + goto valid_p; + up(&kprobe_mutex); + return; + } +valid_p: + if ((old_p == p) || ((old_p->pre_handler == aggr_pre_handler) && (p->list.next == &old_p->list) && - (p->list.prev == &old_p->list)) { - /* Only one element in the aggregate list */ + (p->list.prev == &old_p->list))) { + /* Only probe on the hash list */ arch_disarm_kprobe(p); hlist_del_rcu(&old_p->hlist); - cleanup_p = old_p; - } else if (old_p == p) { - /* Only one kprobe element in the hash list */ - arch_disarm_kprobe(p); - hlist_del_rcu(&p->hlist); - cleanup_p = p; + cleanup_p = 1; } else { list_del_rcu(&p->list); - cleanup_p = NULL; + cleanup_p = 0; } up(&kprobe_mutex); @@ -524,7 +528,7 @@ void __kprobes unregister_kprobe(struct kprobe *p) module_put(mod); if (cleanup_p) { - if (cleanup_p->pre_handler == aggr_pre_handler) { + if (p != old_p) { list_del_rcu(&p->list); kfree(old_p); } -- 
cgit v1.2.2


From e597c2984c64609c6e1e1ac803f00f7550705860 Mon Sep 17 00:00:00 2001
From: Anil S Keshavamurthy
Date: Mon, 9 Jan 2006 20:52:45 -0800
Subject: [PATCH] kprobes: arch_remove_kprobe

Currently arch_remove_kprobe() is only implemented/required for x86_64
and powerpc. All other architectures, such as IA64, i386 and sparc64,
implement a dummy function which is called from the arch-independent
kprobes.c file. This patch removes the dummy functions and replaces
them with

	#define arch_remove_kprobe(p, s) do { } while(0)

Signed-off-by: Anil S Keshavamurthy
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 kernel/kprobes.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index f1c0e61a2cb4..19c42cbf91a0 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -532,9 +532,7 @@ valid_p:
 		list_del_rcu(&p->list);
 		kfree(old_p);
 	}
-	down(&kprobe_mutex);
-	arch_remove_kprobe(p);
-	up(&kprobe_mutex);
+	arch_remove_kprobe(p, &kprobe_mutex);
 	}
 }
-- 
cgit v1.2.2


From 0498b63504f818e5ab39c818cd6f7b41319a1187 Mon Sep 17 00:00:00 2001
From: Ananth N Mavinakayanahalli
Date: Mon, 9 Jan 2006 20:52:46 -0800
Subject: [PATCH] kprobes: fix build breakage

The following patch (against 2.6.15-rc5-mm3) fixes a kprobes build break
due to changes introduced in the kprobe locking in 2.6.15-rc5-mm3. In
addition, the patch reverts the open-coding of kprobe_mutex.

Signed-off-by: Ananth N Mavinakayanahalli
Acked-by: Anil S Keshavamurthy
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 kernel/kprobes.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 19c42cbf91a0..f24cbab558f1 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -48,7 +48,7 @@ static struct hlist_head kprobe_table[KPROBE_TABLE_SIZE];
 static struct hlist_head kretprobe_inst_table[KPROBE_TABLE_SIZE];

-static DECLARE_MUTEX(kprobe_mutex);	/* Protects kprobe_table */
+DECLARE_MUTEX(kprobe_mutex);		/* Protects kprobe_table */
 DEFINE_SPINLOCK(kretprobe_lock);	/* Protects kretprobe_inst_table */
 static DEFINE_PER_CPU(struct kprobe *, kprobe_instance) = NULL;

@@ -532,7 +532,7 @@ valid_p:
 		list_del_rcu(&p->list);
 		kfree(old_p);
 	}
-	arch_remove_kprobe(p, &kprobe_mutex);
+	arch_remove_kprobe(p);
 	}
 }
-- 
cgit v1.2.2


From a0d50069ed3620e9a4d8939bbc3e0d5a692ab58a Mon Sep 17 00:00:00 2001
From: Keshavamurthy Anil S
Date: Mon, 9 Jan 2006 20:52:46 -0800
Subject: [PATCH] Kprobes: conversion from kcalloc to kzalloc

Signed-off-by: Anil S Keshavamurthy
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 kernel/kprobes.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index f24cbab558f1..34a885bb82e0 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -431,7 +431,7 @@ static int __kprobes register_aggr_kprobe(struct kprobe *old_p,
 		copy_kprobe(old_p, p);
 		ret = add_new_kprobe(old_p, p);
 	} else {
-		ap = kcalloc(1, sizeof(struct kprobe), GFP_KERNEL);
+		ap = kzalloc(sizeof(struct kprobe), GFP_KERNEL);
 		if (!ap)
 			return -ENOMEM;
 		add_aggr_kprobe(ap, old_p);
-- 
cgit v1.2.2


From 3795e1616f16905889761536cdc266ebc51855e5 Mon Sep 17 00:00:00 2001
From: Jesper Juhl
Date: Mon, 9 Jan 2006 20:54:39 -0800
Subject: [PATCH] Decrease number of pointer derefs in exit.c

Decrease the number of pointer derefs in kernel/exit.c

Benefits of the patch:
 - Fewer pointer dereferences should make the code slightly faster.
- Size of generated code is smaller - improved readability Signed-off-by: Jesper Juhl Acked-by: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/exit.c | 37 +++++++++++++++++++++---------------- 1 file changed, 21 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index e75a51f33768..802722814925 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -1071,6 +1071,9 @@ static int wait_task_zombie(task_t *p, int noreap, } if (likely(p->real_parent == p->parent) && likely(p->signal)) { + struct signal_struct *psig; + struct signal_struct *sig; + /* * The resource counters for the group leader are in its * own task_struct. Those for dead threads in the group @@ -1087,24 +1090,26 @@ static int wait_task_zombie(task_t *p, int noreap, * here reaping other children at the same time. */ spin_lock_irq(&p->parent->sighand->siglock); - p->parent->signal->cutime = - cputime_add(p->parent->signal->cutime, + psig = p->parent->signal; + sig = p->signal; + psig->cutime = + cputime_add(psig->cutime, cputime_add(p->utime, - cputime_add(p->signal->utime, - p->signal->cutime))); - p->parent->signal->cstime = - cputime_add(p->parent->signal->cstime, + cputime_add(sig->utime, + sig->cutime))); + psig->cstime = + cputime_add(psig->cstime, cputime_add(p->stime, - cputime_add(p->signal->stime, - p->signal->cstime))); - p->parent->signal->cmin_flt += - p->min_flt + p->signal->min_flt + p->signal->cmin_flt; - p->parent->signal->cmaj_flt += - p->maj_flt + p->signal->maj_flt + p->signal->cmaj_flt; - p->parent->signal->cnvcsw += - p->nvcsw + p->signal->nvcsw + p->signal->cnvcsw; - p->parent->signal->cnivcsw += - p->nivcsw + p->signal->nivcsw + p->signal->cnivcsw; + cputime_add(sig->stime, + sig->cstime))); + psig->cmin_flt += + p->min_flt + sig->min_flt + sig->cmin_flt; + psig->cmaj_flt += + p->maj_flt + sig->maj_flt + sig->cmaj_flt; + psig->cnvcsw += + p->nvcsw + sig->nvcsw + sig->cnvcsw; + psig->cnivcsw += + p->nivcsw + sig->nivcsw + sig->cnivcsw; spin_unlock_irq(&p->parent->sighand->siglock); } -- cgit v1.2.2 From d974837ae076101d33a59eefc6bfef923eaa0e32 Mon Sep 17 00:00:00 2001 From: Adrian Bunk Date: Mon, 9 Jan 2006 20:54:49 -0800 Subject: [PATCH] kernel/resource.c: __check_region(): remove pointless __deprecated If a __deprecated is desired it should go to the prototype in the header (where it currently isn't). But at this place it's pointless. Signed-off-by: Adrian Bunk Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/resource.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/resource.c b/kernel/resource.c index 92285d822de6..e3080fcc66a3 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -464,7 +464,7 @@ struct resource * __request_region(struct resource *parent, unsigned long start, EXPORT_SYMBOL(__request_region); -int __deprecated __check_region(struct resource *parent, unsigned long start, unsigned long n) +int __check_region(struct resource *parent, unsigned long start, unsigned long n) { struct resource * res; -- cgit v1.2.2 From 69a0b3157983925f14fe0bdc49622d5389538d8d Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 10 Jan 2006 16:48:02 +0300 Subject: [PATCH] rcu: join rcu_ctrlblk and rcu_state This patch moves rcu_state into the rcu_ctrlblk. I think there are no reasons why we should have 2 different variables to control rcu state. Every user of rcu_state has also "rcu_ctrlblk *rcp" in the parameter list. 
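For reference, the header side of the merge (include/linux/rcupdate.h,
which is outside this 'kernel' log) ends up with one self-contained
control block per flavor, approximately:

	/* approximate shape of the merged structure */
	struct rcu_ctrlblk {
		long		cur;		/* Current batch number.        */
		long		completed;	/* Last completed batch number. */
		int		next_pending;	/* Is the next batch waiting?   */

		/* Grace-period bookkeeping, formerly in struct rcu_state: */
		spinlock_t	lock;		/* Guards this struct.          */
		cpumask_t	cpumask;	/* CPUs that still need to pass */
						/* a quiescent state this batch. */
	};

Keeping one struct per flavor (rcu_ctrlblk and rcu_bh_ctrlblk, as in the
diff below) puts the batch numbers and the grace-period bitmap behind a
single lock, instead of splitting them across two parallel variables.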
Signed-off-by: Oleg Nesterov Acked-by: Paul E. McKenney Signed-off-by: Linus Torvalds --- kernel/rcupdate.c | 82 ++++++++++++++++++++++++++----------------------------- 1 file changed, 38 insertions(+), 44 deletions(-) (limited to 'kernel') diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c index 05ee48316f70..e18f9190eafa 100644 --- a/kernel/rcupdate.c +++ b/kernel/rcupdate.c @@ -49,22 +49,18 @@ #include /* Definition for rcupdate control block. */ -struct rcu_ctrlblk rcu_ctrlblk = - { .cur = -300, .completed = -300 }; -struct rcu_ctrlblk rcu_bh_ctrlblk = - { .cur = -300, .completed = -300 }; - -/* Bookkeeping of the progress of the grace period */ -struct rcu_state { - spinlock_t lock; /* Guard this struct and writes to rcu_ctrlblk */ - cpumask_t cpumask; /* CPUs that need to switch in order */ - /* for current batch to proceed. */ +struct rcu_ctrlblk rcu_ctrlblk = { + .cur = -300, + .completed = -300, + .lock = SPIN_LOCK_UNLOCKED, + .cpumask = CPU_MASK_NONE, +}; +struct rcu_ctrlblk rcu_bh_ctrlblk = { + .cur = -300, + .completed = -300, + .lock = SPIN_LOCK_UNLOCKED, + .cpumask = CPU_MASK_NONE, }; - -static struct rcu_state rcu_state ____cacheline_internodealigned_in_smp = - {.lock = SPIN_LOCK_UNLOCKED, .cpumask = CPU_MASK_NONE }; -static struct rcu_state rcu_bh_state ____cacheline_internodealigned_in_smp = - {.lock = SPIN_LOCK_UNLOCKED, .cpumask = CPU_MASK_NONE }; DEFINE_PER_CPU(struct rcu_data, rcu_data) = { 0L }; DEFINE_PER_CPU(struct rcu_data, rcu_bh_data) = { 0L }; @@ -220,13 +216,13 @@ static void rcu_do_batch(struct rcu_data *rdp) * This is done by rcu_start_batch. The start is not broadcasted to * all cpus, they must pick this up by comparing rcp->cur with * rdp->quiescbatch. All cpus are recorded in the - * rcu_state.cpumask bitmap. + * rcu_ctrlblk.cpumask bitmap. * - All cpus must go through a quiescent state. * Since the start of the grace period is not broadcasted, at least two * calls to rcu_check_quiescent_state are required: * The first call just notices that a new grace period is running. The * following calls check if there was a quiescent state since the beginning - * of the grace period. If so, it updates rcu_state.cpumask. If + * of the grace period. If so, it updates rcu_ctrlblk.cpumask. If * the bitmap is empty, then the grace period is completed. * rcu_check_quiescent_state calls rcu_start_batch(0) to start the next grace * period (if necessary). @@ -234,9 +230,9 @@ static void rcu_do_batch(struct rcu_data *rdp) /* * Register a new batch of callbacks, and start it up if there is currently no * active batch and the batch to be registered has not already occurred. - * Caller must hold rcu_state.lock. + * Caller must hold rcu_ctrlblk.lock. */ -static void rcu_start_batch(struct rcu_ctrlblk *rcp, struct rcu_state *rsp) +static void rcu_start_batch(struct rcu_ctrlblk *rcp) { if (rcp->next_pending && rcp->completed == rcp->cur) { @@ -251,11 +247,11 @@ static void rcu_start_batch(struct rcu_ctrlblk *rcp, struct rcu_state *rsp) /* * Accessing nohz_cpu_mask before incrementing rcp->cur needs a * Barrier Otherwise it can cause tickless idle CPUs to be - * included in rsp->cpumask, which will extend graceperiods + * included in rcp->cpumask, which will extend graceperiods * unnecessarily. 
*/ smp_mb(); - cpus_andnot(rsp->cpumask, cpu_online_map, nohz_cpu_mask); + cpus_andnot(rcp->cpumask, cpu_online_map, nohz_cpu_mask); } } @@ -265,13 +261,13 @@ static void rcu_start_batch(struct rcu_ctrlblk *rcp, struct rcu_state *rsp) * Clear it from the cpu mask and complete the grace period if it was the last * cpu. Start another grace period if someone has further entries pending */ -static void cpu_quiet(int cpu, struct rcu_ctrlblk *rcp, struct rcu_state *rsp) +static void cpu_quiet(int cpu, struct rcu_ctrlblk *rcp) { - cpu_clear(cpu, rsp->cpumask); - if (cpus_empty(rsp->cpumask)) { + cpu_clear(cpu, rcp->cpumask); + if (cpus_empty(rcp->cpumask)) { /* batch completed ! */ rcp->completed = rcp->cur; - rcu_start_batch(rcp, rsp); + rcu_start_batch(rcp); } } @@ -281,7 +277,7 @@ static void cpu_quiet(int cpu, struct rcu_ctrlblk *rcp, struct rcu_state *rsp) * quiescent cycle, then indicate that it has done so. */ static void rcu_check_quiescent_state(struct rcu_ctrlblk *rcp, - struct rcu_state *rsp, struct rcu_data *rdp) + struct rcu_data *rdp) { if (rdp->quiescbatch != rcp->cur) { /* start new grace period: */ @@ -306,15 +302,15 @@ static void rcu_check_quiescent_state(struct rcu_ctrlblk *rcp, return; rdp->qs_pending = 0; - spin_lock(&rsp->lock); + spin_lock(&rcp->lock); /* * rdp->quiescbatch/rcp->cur and the cpu bitmap can come out of sync * during cpu startup. Ignore the quiescent state. */ if (likely(rdp->quiescbatch == rcp->cur)) - cpu_quiet(rdp->cpu, rcp, rsp); + cpu_quiet(rdp->cpu, rcp); - spin_unlock(&rsp->lock); + spin_unlock(&rcp->lock); } @@ -335,16 +331,16 @@ static void rcu_move_batch(struct rcu_data *this_rdp, struct rcu_head *list, } static void __rcu_offline_cpu(struct rcu_data *this_rdp, - struct rcu_ctrlblk *rcp, struct rcu_state *rsp, struct rcu_data *rdp) + struct rcu_ctrlblk *rcp, struct rcu_data *rdp) { /* if the cpu going offline owns the grace period * we can block indefinitely waiting for it, so flush * it here */ - spin_lock_bh(&rsp->lock); + spin_lock_bh(&rcp->lock); if (rcp->cur != rcp->completed) - cpu_quiet(rdp->cpu, rcp, rsp); - spin_unlock_bh(&rsp->lock); + cpu_quiet(rdp->cpu, rcp); + spin_unlock_bh(&rcp->lock); rcu_move_batch(this_rdp, rdp->curlist, rdp->curtail); rcu_move_batch(this_rdp, rdp->nxtlist, rdp->nxttail); @@ -354,9 +350,9 @@ static void rcu_offline_cpu(int cpu) struct rcu_data *this_rdp = &get_cpu_var(rcu_data); struct rcu_data *this_bh_rdp = &get_cpu_var(rcu_bh_data); - __rcu_offline_cpu(this_rdp, &rcu_ctrlblk, &rcu_state, + __rcu_offline_cpu(this_rdp, &rcu_ctrlblk, &per_cpu(rcu_data, cpu)); - __rcu_offline_cpu(this_bh_rdp, &rcu_bh_ctrlblk, &rcu_bh_state, + __rcu_offline_cpu(this_bh_rdp, &rcu_bh_ctrlblk, &per_cpu(rcu_bh_data, cpu)); put_cpu_var(rcu_data); put_cpu_var(rcu_bh_data); @@ -375,7 +371,7 @@ static void rcu_offline_cpu(int cpu) * This does the RCU processing work from tasklet context. 
*/ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp, - struct rcu_state *rsp, struct rcu_data *rdp) + struct rcu_data *rdp) { if (rdp->curlist && !rcu_batch_before(rcp->completed, rdp->batch)) { *rdp->donetail = rdp->curlist; @@ -405,25 +401,23 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp, if (!rcp->next_pending) { /* and start it/schedule start if it's a new batch */ - spin_lock(&rsp->lock); + spin_lock(&rcp->lock); rcp->next_pending = 1; - rcu_start_batch(rcp, rsp); - spin_unlock(&rsp->lock); + rcu_start_batch(rcp); + spin_unlock(&rcp->lock); } } else { local_irq_enable(); } - rcu_check_quiescent_state(rcp, rsp, rdp); + rcu_check_quiescent_state(rcp, rdp); if (rdp->donelist) rcu_do_batch(rdp); } static void rcu_process_callbacks(unsigned long unused) { - __rcu_process_callbacks(&rcu_ctrlblk, &rcu_state, - &__get_cpu_var(rcu_data)); - __rcu_process_callbacks(&rcu_bh_ctrlblk, &rcu_bh_state, - &__get_cpu_var(rcu_bh_data)); + __rcu_process_callbacks(&rcu_ctrlblk, &__get_cpu_var(rcu_data)); + __rcu_process_callbacks(&rcu_bh_ctrlblk, &__get_cpu_var(rcu_bh_data)); } static int __rcu_pending(struct rcu_ctrlblk *rcp, struct rcu_data *rdp) -- cgit v1.2.2 From a9c828155ae8d1a1576f9648cc4d9677aea53f89 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 10 Jan 2006 17:24:53 +0300 Subject: [PATCH] rcu: fix hotplug-cpu ->donelist leak Pointed out by Srivatsa Vaddagiri . rcu_do_batch() stops after processing maxbatch callbacks on ->donelist leaving rcu_tasklet in TASKLET_STATE_SCHED state. If CPU_DEAD event happens remaining ->donelist entries are lost, rcu_offline_cpu() kills this tasklet. With this patch ->donelist migrates along with ->curlist and ->nxtlist to the current cpu. Compile tested. Signed-off-by: Oleg Nesterov Acked-by: Paul E. McKenney Cc: Srivatsa Vaddagiri Cc: Dipankar Sarma Signed-off-by: Linus Torvalds --- kernel/rcupdate.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c index e18f9190eafa..0cf8146bd585 100644 --- a/kernel/rcupdate.c +++ b/kernel/rcupdate.c @@ -343,8 +343,9 @@ static void __rcu_offline_cpu(struct rcu_data *this_rdp, spin_unlock_bh(&rcp->lock); rcu_move_batch(this_rdp, rdp->curlist, rdp->curtail); rcu_move_batch(this_rdp, rdp->nxtlist, rdp->nxttail); - + rcu_move_batch(this_rdp, rdp->donelist, rdp->donetail); } + static void rcu_offline_cpu(int cpu) { struct rcu_data *this_rdp = &get_cpu_var(rcu_data); -- cgit v1.2.2 From 042c904c3e35e95ac911e8a2bf4097099b059e1a Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 10 Jan 2006 15:48:59 -0500 Subject: [PATCH] remove unnecessary asm/mutex.h from kernel/mutex-debug.c Remove unnecessary (and incorrect) inclusion of asm/mutex.h, pointed out by David Howells. Signed-off-by: Ingo Molnar Signed-off-by: Linus Torvalds --- kernel/mutex-debug.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'kernel') diff --git a/kernel/mutex-debug.c b/kernel/mutex-debug.c index 4fcb051a8b9e..3dec75363786 100644 --- a/kernel/mutex-debug.c +++ b/kernel/mutex-debug.c @@ -20,8 +20,6 @@ #include #include -#include - #include "mutex-debug.h" /* -- cgit v1.2.2 From 73165b88ffd29813bf73b331eaf90d3521443236 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 10 Jan 2006 22:07:44 +0100 Subject: [PATCH] fix i386 mutex fastpath on FRAME_POINTER && !DEBUG_MUTEXES Call the mutex slowpath more conservatively - e.g. FRAME_POINTERS can change the calling convention, in which case a direct branch to the slowpath becomes illegal. 
Bug found by Hugh Dickins. Signed-off-by: Ingo Molnar Signed-off-by: Linus Torvalds --- kernel/mutex.c | 9 --------- 1 file changed, 9 deletions(-) (limited to 'kernel') diff --git a/kernel/mutex.c b/kernel/mutex.c index 7eb960661441..d3dcb8b44bac 100644 --- a/kernel/mutex.c +++ b/kernel/mutex.c @@ -84,12 +84,6 @@ void fastcall __sched mutex_lock(struct mutex *lock) /* * The locking fastpath is the 1->0 transition from * 'unlocked' into 'locked' state. - * - * NOTE: if asm/mutex.h is included, then some architectures - * rely on mutex_lock() having _no other code_ here but this - * fastpath. That allows the assembly fastpath to do - * tail-merging optimizations. (If you want to put testcode - * here, do it under #ifndef CONFIG_MUTEX_DEBUG.) */ __mutex_fastpath_lock(&lock->count, __mutex_lock_slowpath); } @@ -115,8 +109,6 @@ void fastcall __sched mutex_unlock(struct mutex *lock) /* * The unlocking fastpath is the 0->1 transition from 'locked' * into 'unlocked' state: - * - * NOTE: no other code must be here - see mutex_lock() . */ __mutex_fastpath_unlock(&lock->count, __mutex_unlock_slowpath); } @@ -261,7 +253,6 @@ __mutex_lock_interruptible_slowpath(atomic_t *lock_count __IP_DECL__); */ int fastcall __sched mutex_lock_interruptible(struct mutex *lock) { - /* NOTE: no other code must be here - see mutex_lock() */ return __mutex_fastpath_lock_retval (&lock->count, __mutex_lock_interruptible_slowpath); } -- cgit v1.2.2 From c544bdb1999e04eb10035f1c3262a103859d94b2 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 10 Jan 2006 22:10:36 +0100 Subject: [PATCH] mark mutex_lock*() as might_sleep() Mark mutex_lock() and mutex_lock_interruptible() as might_sleep() functions. Signed-off-by: Ingo Molnar Signed-off-by: Linus Torvalds --- kernel/mutex.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/mutex.c b/kernel/mutex.c index d3dcb8b44bac..5c2561844196 100644 --- a/kernel/mutex.c +++ b/kernel/mutex.c @@ -81,6 +81,7 @@ __mutex_lock_slowpath(atomic_t *lock_count __IP_DECL__); */ void fastcall __sched mutex_lock(struct mutex *lock) { + might_sleep(); /* * The locking fastpath is the 1->0 transition from * 'unlocked' into 'locked' state. 
@@ -253,6 +254,7 @@ __mutex_lock_interruptible_slowpath(atomic_t *lock_count __IP_DECL__);
  */
 int fastcall __sched mutex_lock_interruptible(struct mutex *lock)
 {
+	might_sleep();
 	return __mutex_fastpath_lock_retval
 			(&lock->count, __mutex_lock_interruptible_slowpath);
 }
-- 
cgit v1.2.2


From 02706647a49011ae1e7b4eca33e835d1681b094e Mon Sep 17 00:00:00 2001
From: Ingo Molnar
Date: Tue, 10 Jan 2006 23:15:02 +0100
Subject: [PATCH] mutex: trivial whitespace cleanups

Signed-off-by: Ingo Molnar
Signed-off-by: Linus Torvalds
---
 kernel/mutex-debug.c | 1 -
 kernel/mutex.c       | 5 +----
 2 files changed, 1 insertion(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/mutex-debug.c b/kernel/mutex-debug.c
index 3dec75363786..6f829058ae4a 100644
--- a/kernel/mutex-debug.c
+++ b/kernel/mutex-debug.c
@@ -459,4 +459,3 @@ void fastcall mutex_destroy(struct mutex *lock)
 }

 EXPORT_SYMBOL_GPL(mutex_destroy);
-
diff --git a/kernel/mutex.c b/kernel/mutex.c
index 5c2561844196..5449b210d9ed 100644
--- a/kernel/mutex.c
+++ b/kernel/mutex.c
@@ -202,7 +202,7 @@ __mutex_lock_slowpath(atomic_t *lock_count __IP_DECL__)
 static fastcall noinline void
 __mutex_unlock_slowpath(atomic_t *lock_count __IP_DECL__)
 {
-	struct mutex *lock = container_of(lock_count, struct mutex, count);
+	struct mutex *lock = container_of(lock_count, struct mutex, count);

 	DEBUG_WARN_ON(lock->owner != current_thread_info());

@@ -313,6 +313,3 @@ int fastcall mutex_trylock(struct mutex *lock)
 }

 EXPORT_SYMBOL(mutex_trylock);
-
-
-
-- 
cgit v1.2.2


From a4fc7ab1d065a9dd89ed0e74439ef87d4a16e980 Mon Sep 17 00:00:00 2001
From: David Woodhouse
Date: Wed, 11 Jan 2006 14:41:26 +0000
Subject: [PATCH] fix/simplify mutex debugging code

Let's switch mutex_debug_check_no_locks_freed() to take (addr, len) as
arguments instead, since all its callers were just calculating the 'to'
address for themselves anyway... (and sometimes doing so badly).

Signed-off-by: David Woodhouse
Acked-by: Ingo Molnar
Signed-off-by: Linus Torvalds
---
 kernel/mutex-debug.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/mutex-debug.c b/kernel/mutex-debug.c
index 6f829058ae4a..f4913c376950 100644
--- a/kernel/mutex-debug.c
+++ b/kernel/mutex-debug.c
@@ -333,9 +333,10 @@ void mutex_debug_check_no_locks_held(struct task_struct *task)
  * is destroyed or reinitialized - this code checks whether there is
  * any held lock in the memory range of <from> to <to>:
  */
-void mutex_debug_check_no_locks_freed(const void *from, const void *to)
+void mutex_debug_check_no_locks_freed(const void *from, unsigned long len)
 {
 	struct list_head *curr, *next;
+	const void *to = from + len;
 	unsigned long flags;
 	struct mutex *lock;
 	void *lock_addr;
@@ -437,7 +438,7 @@ void debug_mutex_init(struct mutex *lock, const char *name)
 	/*
 	 * Make sure we are not reinitializing a held lock:
 	 */
-	mutex_debug_check_no_locks_freed((void *)lock, (void *)(lock + 1));
+	mutex_debug_check_no_locks_freed((void *)lock, sizeof(*lock));
 	lock->owner = NULL;
 	INIT_LIST_HEAD(&lock->held_list);
 	lock->name = name;
-- 
cgit v1.2.2


From df019b1d8b893d0f0ee5a9b0f71486f0892561ae Mon Sep 17 00:00:00 2001
From: Keshavamurthy Anil S
Date: Wed, 11 Jan 2006 12:17:41 -0800
Subject: [PATCH] kprobes: fix unloading of self probed module

When a kprobes module is written in such a way that probes are inserted
on itself, then unloading that module was not possible due to reference
counting on the same module. The patch below adds a check and increments
the module refcount only if it is not a self-probed module.
We need to allow modules to probe themself for kprobes performance measurements This patch has been tested on several x86_64, ppc64 and IA64 architectures. Signed-off-by: Anil S Keshavamurthy Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kprobes.c | 42 ++++++++++++++++++++++++++++++++---------- 1 file changed, 32 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 34a885bb82e0..3ea6325228da 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -449,19 +449,32 @@ static int __kprobes in_kprobes_functions(unsigned long addr) return 0; } -int __kprobes register_kprobe(struct kprobe *p) +static int __kprobes __register_kprobe(struct kprobe *p, + unsigned long called_from) { int ret = 0; struct kprobe *old_p; - struct module *mod; + struct module *probed_mod; if ((!kernel_text_address((unsigned long) p->addr)) || in_kprobes_functions((unsigned long) p->addr)) return -EINVAL; - if ((mod = module_text_address((unsigned long) p->addr)) && - (unlikely(!try_module_get(mod)))) - return -EINVAL; + p->mod_refcounted = 0; + /* Check are we probing a module */ + if ((probed_mod = module_text_address((unsigned long) p->addr))) { + struct module *calling_mod = module_text_address(called_from); + /* We must allow modules to probe themself and + * in this case avoid incrementing the module refcount, + * so as to allow unloading of self probing modules. + */ + if (calling_mod && (calling_mod != probed_mod)) { + if (unlikely(!try_module_get(probed_mod))) + return -EINVAL; + p->mod_refcounted = 1; + } else + probed_mod = NULL; + } p->nmissed = 0; down(&kprobe_mutex); @@ -483,11 +496,17 @@ int __kprobes register_kprobe(struct kprobe *p) out: up(&kprobe_mutex); - if (ret && mod) - module_put(mod); + if (ret && probed_mod) + module_put(probed_mod); return ret; } +int __kprobes register_kprobe(struct kprobe *p) +{ + return __register_kprobe(p, + (unsigned long)__builtin_return_address(0)); +} + void __kprobes unregister_kprobe(struct kprobe *p) { struct module *mod; @@ -524,7 +543,8 @@ valid_p: up(&kprobe_mutex); synchronize_sched(); - if ((mod = module_text_address((unsigned long)p->addr))) + if (p->mod_refcounted && + (mod = module_text_address((unsigned long)p->addr))) module_put(mod); if (cleanup_p) { @@ -547,7 +567,8 @@ int __kprobes register_jprobe(struct jprobe *jp) jp->kp.pre_handler = setjmp_pre_handler; jp->kp.break_handler = longjmp_break_handler; - return register_kprobe(&jp->kp); + return __register_kprobe(&jp->kp, + (unsigned long)__builtin_return_address(0)); } void __kprobes unregister_jprobe(struct jprobe *jp) @@ -587,7 +608,8 @@ int __kprobes register_kretprobe(struct kretprobe *rp) rp->nmissed = 0; /* Establish function entry probe point */ - if ((ret = register_kprobe(&rp->kp)) != 0) + if ((ret = __register_kprobe(&rp->kp, + (unsigned long)__builtin_return_address(0))) != 0) free_rp_inst(rp); return ret; } -- cgit v1.2.2 From e16885c5ad624a6efe1b1bf764e075d75f65a788 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 11 Jan 2006 12:17:45 -0800 Subject: [PATCH] uninline capable() Uninline capable(). Saves 2K of kernel text on a generic .config, and 1K on a tiny config. 
In addition it makes the use of capable more consistent between CONFIG_SECURITY and !CONFIG_SECURITY Signed-off-by: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sys.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'kernel') diff --git a/kernel/sys.c b/kernel/sys.c index b6941e06d5d5..9ccf713491f9 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -223,6 +223,18 @@ int unregister_reboot_notifier(struct notifier_block * nb) EXPORT_SYMBOL(unregister_reboot_notifier); +#ifndef CONFIG_SECURITY +int capable(int cap) +{ + if (cap_raised(current->cap_effective, cap)) { + current->flags |= PF_SUPERPRIV; + return 1; + } + return 0; +} +EXPORT_SYMBOL(capable); +#endif + static int set_one_prio(struct task_struct *p, int niceval, int error) { int no_nice; -- cgit v1.2.2 From c59ede7b78db329949d9cdcd7064e22d357560ef Mon Sep 17 00:00:00 2001 From: "Randy.Dunlap" Date: Wed, 11 Jan 2006 12:17:46 -0800 Subject: [PATCH] move capable() to capability.h - Move capable() from sched.h to capability.h; - Use where capable() is used (in include/, block/, ipc/, kernel/, a few drivers/, mm/, security/, & sound/; many more drivers/ to go) Signed-off-by: Randy Dunlap Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/acct.c | 1 + kernel/capability.c | 1 + kernel/exit.c | 1 + kernel/fork.c | 1 + kernel/kexec.c | 1 + kernel/module.c | 1 + kernel/ptrace.c | 1 + kernel/sched.c | 1 + kernel/signal.c | 1 + kernel/sys.c | 1 + kernel/sysctl.c | 1 + kernel/time.c | 1 + kernel/uid16.c | 1 + 13 files changed, 13 insertions(+) (limited to 'kernel') diff --git a/kernel/acct.c b/kernel/acct.c index 38d57fa6b78f..065d8b4e51ef 100644 --- a/kernel/acct.c +++ b/kernel/acct.c @@ -47,6 +47,7 @@ #include #include #include +#include #include #include #include diff --git a/kernel/capability.c b/kernel/capability.c index 8986a37a67ea..bfa3c92e16f2 100644 --- a/kernel/capability.c +++ b/kernel/capability.c @@ -7,6 +7,7 @@ * 30 May 2002: Cleanup, Robert M. Love */ +#include #include #include #include diff --git a/kernel/exit.c b/kernel/exit.c index 802722814925..f8e609ff1893 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -10,6 +10,7 @@ #include #include #include +#include #include #include #include diff --git a/kernel/fork.c b/kernel/fork.c index 3bdcab49998d..16a776ec2c0b 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -28,6 +28,7 @@ #include #include #include +#include #include #include #include diff --git a/kernel/kexec.c b/kernel/kexec.c index de1441656efd..bf39d28e4c0e 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -6,6 +6,7 @@ * Version 2. See the file COPYING for more details. */ +#include #include #include #include diff --git a/kernel/module.c b/kernel/module.c index e4276046a1b6..618ed6e23ecc 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -28,6 +28,7 @@ #include #include #include +#include #include #include #include diff --git a/kernel/ptrace.c b/kernel/ptrace.c index cceaf09ac413..5f33cdb6fff5 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -7,6 +7,7 @@ * to continually duplicate across every architecture. 
*/ +#include #include #include #include diff --git a/kernel/sched.c b/kernel/sched.c index 34a945bcc022..d129e560cc0d 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -27,6 +27,7 @@ #include #include #include +#include #include #include #include diff --git a/kernel/signal.c b/kernel/signal.c index 08aa5b263f36..1da2e74beb97 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -25,6 +25,7 @@ #include #include #include +#include #include #include #include diff --git a/kernel/sys.c b/kernel/sys.c index 9ccf713491f9..d09cac23fdfd 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -19,6 +19,7 @@ #include #include #include +#include #include #include #include diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 03b0598f2369..62d4d9566876 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -25,6 +25,7 @@ #include #include #include +#include #include #include #include diff --git a/kernel/time.c b/kernel/time.c index 169e8329e0b6..7477b1d2079e 100644 --- a/kernel/time.c +++ b/kernel/time.c @@ -29,6 +29,7 @@ #include #include +#include #include #include #include diff --git a/kernel/uid16.c b/kernel/uid16.c index f669941e8b26..aa25605027c8 100644 --- a/kernel/uid16.c +++ b/kernel/uid16.c @@ -10,6 +10,7 @@ #include #include #include +#include #include #include #include -- cgit v1.2.2 From 4cef0c61380888af99b054937ed391fb28757836 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Wed, 11 Jan 2006 22:44:57 +0100 Subject: [PATCH] x86_64: Make the cpu_*_maps in kernel/sched.c read mostly They are referred to often so avoid potential false sharing for them. Signed-off-by: Andi Kleen Signed-off-by: Linus Torvalds --- kernel/sched.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index d129e560cc0d..c0c60c926d5e 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -3980,12 +3980,12 @@ asmlinkage long sys_sched_setaffinity(pid_t pid, unsigned int len, * method, such as ACPI for e.g. */ -cpumask_t cpu_present_map; +cpumask_t cpu_present_map __read_mostly; EXPORT_SYMBOL(cpu_present_map); #ifndef CONFIG_SMP -cpumask_t cpu_online_map = CPU_MASK_ALL; -cpumask_t cpu_possible_map = CPU_MASK_ALL; +cpumask_t cpu_online_map __read_mostly = CPU_MASK_ALL; +cpumask_t cpu_possible_map __read_mostly = CPU_MASK_ALL; #endif long sched_getaffinity(pid_t pid, cpumask_t *mask) -- cgit v1.2.2 From 5fd63b308569060ffa40af52ed122d9734111bff Mon Sep 17 00:00:00 2001 From: Ravikiran G Thirumalai Date: Wed, 11 Jan 2006 22:46:15 +0100 Subject: [PATCH] x86_64: Inclusion of ScaleMP vSMP architecture patches - vsmp_align vSMP specific alignment patch to 1. Define INTERNODE_CACHE_SHIFT for vSMP 2. Use this for alignment of critical structures 3. Use INTERNODE_CACHE_SHIFT for ARCH_MIN_TASKALIGN, and let the slab align task_struct allocations to the internode cacheline size 4. Introduce and use ARCH_MIN_MMSTRUCT_ALIGN for mm_struct slab allocations. 
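On the arch side (outside this 'kernel' log) the idea is that a platform
with a large inter-node coherence unit overrides the macro; the values
below are an illustrative sketch modeled on vSMP's 4KB internode
cacheline, not the exact hunks of this series:

	/* illustrative arch override, e.g. in an x86_64 cache header */
	#ifdef CONFIG_X86_VSMP
	#define INTERNODE_CACHE_SHIFT	12	/* 4096-byte internode line */
	#define ARCH_MIN_MMSTRUCT_ALIGN	(1 << INTERNODE_CACHE_SHIFT)
	#endif

With the fallback of 0 in the fork.c hunk below, kmem_cache_create()
keeps its default alignment everywhere else, while on vSMP each
mm_struct starts on an internode-line boundary, so hot fields of two
different mm_structs never share a line that is kept coherent across
nodes.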
Signed-off-by: Ravikiran Thirumalai Signed-off-by: Shai Fultheim Signed-off-by: Andi Kleen Signed-off-by: Linus Torvalds --- kernel/fork.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 16a776ec2c0b..4ae8cfc1c89c 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1298,6 +1298,10 @@ long do_fork(unsigned long clone_flags, return pid; } +#ifndef ARCH_MIN_MMSTRUCT_ALIGN +#define ARCH_MIN_MMSTRUCT_ALIGN 0 +#endif + void __init proc_caches_init(void) { sighand_cachep = kmem_cache_create("sighand_cache", @@ -1316,6 +1320,6 @@ void __init proc_caches_init(void) sizeof(struct vm_area_struct), 0, SLAB_PANIC, NULL, NULL); mm_cachep = kmem_cache_create("mm_struct", - sizeof(struct mm_struct), 0, + sizeof(struct mm_struct), ARCH_MIN_MMSTRUCT_ALIGN, SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL); } -- cgit v1.2.2 From 288867ec5c377db82933b64460ce050e5c998ee9 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 12 Jan 2006 11:25:54 +0100 Subject: [hrtimer] Remove listhead from hrtimer struct The list_head in the hrtimer structure was introduced for easy access to the first timer with the further extensions of real high resolution timers in mind, but it turned out in the course of development that it is not necessary for the standard use case. Remove the list head and access the first expiry timer by a datafield in the timer base. Signed-off-by: Thomas Gleixner --- kernel/hrtimer.c | 26 ++++++++++++++------------ 1 file changed, 14 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index f073a2461faa..e6e8278bcb18 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -314,7 +314,6 @@ hrtimer_forward(struct hrtimer *timer, const ktime_t interval) static void enqueue_hrtimer(struct hrtimer *timer, struct hrtimer_base *base) { struct rb_node **link = &base->active.rb_node; - struct list_head *prev = &base->pending; struct rb_node *parent = NULL; struct hrtimer *entry; @@ -330,22 +329,23 @@ static void enqueue_hrtimer(struct hrtimer *timer, struct hrtimer_base *base) */ if (timer->expires.tv64 < entry->expires.tv64) link = &(*link)->rb_left; - else { + else link = &(*link)->rb_right; - prev = &entry->list; - } } /* - * Insert the timer to the rbtree and to the sorted list: + * Insert the timer to the rbtree and check whether it + * replaces the first pending timer */ rb_link_node(&timer->node, parent, link); rb_insert_color(&timer->node, &base->active); - list_add(&timer->list, prev); timer->state = HRTIMER_PENDING; -} + if (!base->first || timer->expires.tv64 < + rb_entry(base->first, struct hrtimer, node)->expires.tv64) + base->first = &timer->node; +} /* * __remove_hrtimer - internal function to remove a timer @@ -355,9 +355,11 @@ static void enqueue_hrtimer(struct hrtimer *timer, struct hrtimer_base *base) static void __remove_hrtimer(struct hrtimer *timer, struct hrtimer_base *base) { /* - * Remove the timer from the sorted list and from the rbtree: + * Remove the timer from the rbtree and replace the + * first entry pointer if necessary. 
*/ - list_del(&timer->list); + if (base->first == &timer->node) + base->first = rb_next(&timer->node); rb_erase(&timer->node, &base->active); } @@ -529,16 +531,17 @@ int hrtimer_get_res(const clockid_t which_clock, struct timespec *tp) static inline void run_hrtimer_queue(struct hrtimer_base *base) { ktime_t now = base->get_time(); + struct rb_node *node; spin_lock_irq(&base->lock); - while (!list_empty(&base->pending)) { + while ((node = base->first)) { struct hrtimer *timer; int (*fn)(void *); int restart; void *data; - timer = list_entry(base->pending.next, struct hrtimer, list); + timer = rb_entry(node, struct hrtimer, node); if (now.tv64 <= timer->expires.tv64) break; @@ -732,7 +735,6 @@ static void __devinit init_hrtimers_cpu(int cpu) for (i = 0; i < MAX_HRTIMER_BASES; i++) { spin_lock_init(&base->lock); - INIT_LIST_HEAD(&base->pending); base++; } } -- cgit v1.2.2 From e2787630c1abb075c935cf47e91beb7c656f48c4 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 12 Jan 2006 11:36:14 +0100 Subject: [hrtimer] Change resolution storage to ktime_t format Change the storage format of the per base resolution to ktime_t to make it easier accessible in the hrtimers code. Change the resolution from (NSEC_PER_SEC/HZ) to TICK_NSEC as Roman pointed out. TICK_NSEC is closer to the real resolution. Signed-off-by: Thomas Gleixner --- kernel/hrtimer.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index e6e8278bcb18..76d759ce6231 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -518,9 +518,8 @@ int hrtimer_get_res(const clockid_t which_clock, struct timespec *tp) { struct hrtimer_base *bases; - tp->tv_sec = 0; bases = per_cpu(hrtimer_bases, raw_smp_processor_id()); - tp->tv_nsec = bases[which_clock].resolution; + *tp = ktime_to_timespec(bases[which_clock].resolution); return 0; } -- cgit v1.2.2 From c9db4fa11526affde83603fe52595bd1260c1354 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 12 Jan 2006 11:47:34 +0100 Subject: [hrtimer] Enforce resolution as lower limit of intervals Roman Zippel pointed out that the missing lower limit of intervals leads to an accounting error in the overrun count. Enforce the lower limit of intervals to resolution in the timer forwarding code. Signed-off-by: Thomas Gleixner --- kernel/hrtimer.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index 76d759ce6231..04ccab099e84 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -275,7 +275,7 @@ void unlock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags) * The number of overruns is added to the overrun field. */ unsigned long -hrtimer_forward(struct hrtimer *timer, const ktime_t interval) +hrtimer_forward(struct hrtimer *timer, ktime_t interval) { unsigned long orun = 1; ktime_t delta, now; @@ -287,6 +287,9 @@ hrtimer_forward(struct hrtimer *timer, const ktime_t interval) if (delta.tv64 < 0) return 0; + if (interval.tv64 < timer->base->resolution.tv64) + interval.tv64 = timer->base->resolution.tv64; + if (unlikely(delta.tv64 >= interval.tv64)) { nsec_t incr = ktime_to_ns(interval); -- cgit v1.2.2 From 198e2f181163233b379dc7ce8a6d7516b84042e7 Mon Sep 17 00:00:00 2001 From: "akpm@osdl.org" Date: Thu, 12 Jan 2006 01:05:30 -0800 Subject: [PATCH] scheduler cache-hot-autodetect ) From: Ingo Molnar This is the latest version of the scheduler cache-hot-auto-tune patch. 
The first problem was that detection time scaled with O(N^2), which is
unacceptable on larger SMP and NUMA systems. To solve this:

 - I've added a 'domain distance' function, which is used to cache
   measurement results. Each distance is only measured once. This means
   that e.g. on NUMA distances of 0, 1 and 2 might be measured, on HT
   distances 0 and 1, and on SMP distance 0 is measured. The code walks
   the domain tree to determine the distance, so it automatically follows
   whatever hierarchy an architecture sets up. This cuts down on the boot
   time significantly and removes the O(N^2) limit. The only assumption
   is that migration costs can be expressed as a function of domain
   distance - this covers the overwhelming majority of existing systems,
   and is a good guess even for more asymmetric systems.

   [ People hacking systems that have asymmetries that break this
     assumption (e.g. different CPU speeds) should experiment a bit with
     the cpu_distance() function. Adding a ->migration_distance factor to
     the domain structure would be one possible solution - but let's first
     see the problem systems, if they exist at all. Let's not overdesign. ]

Another problem was that only a single cache-size was used for measuring
the cost of migration, and most architectures didn't set that variable
up. Furthermore, a single cache-size does not fit NUMA hierarchies with
L3 caches and does not fit HT setups, where different CPUs will often
have different 'effective cache sizes'. To solve this problem:

 - Instead of relying on a single cache-size provided by the platform and
   sticking to it, the code now auto-detects the 'effective migration
   cost' between two measured CPUs, by iterating through a wide range of
   cachesizes. The code searches for the maximum migration cost, which
   occurs when the working set of the test-workload falls just below the
   'effective cache size'. I.e. real-life optimized search is done for
   the maximum migration cost, between two real CPUs.

This, amongst other things, has the positive effect that if e.g. two
CPUs share an L2/L3 cache, a different (and accurate) migration cost
will be found than between two CPUs on the same system that don't share
any caches. (The reliable measurement of migration costs is tricky - see
the source for details.)

Furthermore, I've added various boot-time options to override/tune
migration behavior.

Firstly, there's a blanket override for autodetection:

	migration_cost=1000,2000,3000

will override the depth 0/1/2 values with 1msec/2msec/3msec values.

Secondly, there's a global factor that can be used to increase (or
decrease) the autodetected values:

	migration_factor=120

will increase the autodetected values by 20%. This option is useful to
tune things in a workload-dependent way - e.g. if a workload is
cache-insensitive then CPU utilization can be maximized by specifying
migration_factor=0.

I've tested the autodetection code quite extensively on x86, on 3
P3/Xeon/2MB, and the autodetected values look pretty good:

Dual Celeron (128K L2 cache):

 ---------------------
 migration cost matrix (max_cache_size: 131072, cpu: 467 MHz):
 ---------------------
           [00]    [01]
 [00]:     -     1.7(1)
 [01]:   1.7(1)    -
 ---------------------
 cacheflush times [2]: 0.0 (0) 1.7 (1784008)
 ---------------------

Here the slow memory subsystem dominates system performance, and even
though caches are small, the migration cost is 1.7 msecs.
Dual HT P4 (512K L2 cache): --------------------- migration cost matrix (max_cache_size: 524288, cpu: 2379 MHz): --------------------- [00] [01] [02] [03] [00]: - 0.4(1) 0.0(0) 0.4(1) [01]: 0.4(1) - 0.4(1) 0.0(0) [02]: 0.0(0) 0.4(1) - 0.4(1) [03]: 0.4(1) 0.0(0) 0.4(1) - --------------------- cacheflush times [2]: 0.0 (33900) 0.4 (448514) --------------------- Here it can be seen that there is no migration cost between two HT siblings (CPU#0/2 and CPU#1/3 are separate physical CPUs). A fast memory system makes inter-physical-CPU migration pretty cheap: 0.4 msecs. 8-way P3/Xeon [2MB L2 cache]: --------------------- migration cost matrix (max_cache_size: 2097152, cpu: 700 MHz): --------------------- [00] [01] [02] [03] [04] [05] [06] [07] [00]: - 19.2(1) 19.2(1) 19.2(1) 19.2(1) 19.2(1) 19.2(1) 19.2(1) [01]: 19.2(1) - 19.2(1) 19.2(1) 19.2(1) 19.2(1) 19.2(1) 19.2(1) [02]: 19.2(1) 19.2(1) - 19.2(1) 19.2(1) 19.2(1) 19.2(1) 19.2(1) [03]: 19.2(1) 19.2(1) 19.2(1) - 19.2(1) 19.2(1) 19.2(1) 19.2(1) [04]: 19.2(1) 19.2(1) 19.2(1) 19.2(1) - 19.2(1) 19.2(1) 19.2(1) [05]: 19.2(1) 19.2(1) 19.2(1) 19.2(1) 19.2(1) - 19.2(1) 19.2(1) [06]: 19.2(1) 19.2(1) 19.2(1) 19.2(1) 19.2(1) 19.2(1) - 19.2(1) [07]: 19.2(1) 19.2(1) 19.2(1) 19.2(1) 19.2(1) 19.2(1) 19.2(1) - --------------------- cacheflush times [2]: 0.0 (0) 19.2 (19281756) --------------------- This one has huge caches and a relatively slow memory subsystem - so the migration cost is 19 msecs. Signed-off-by: Ingo Molnar Signed-off-by: Ashok Raj Signed-off-by: Ken Chen Cc: Signed-off-by: John Hawkes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sched.c | 468 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 468 insertions(+) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index c0c60c926d5e..98461de1ab65 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -34,6 +34,7 @@ #include #include #include +#include #include #include #include @@ -5082,7 +5083,470 @@ static void init_sched_build_groups(struct sched_group groups[], cpumask_t span, #define SD_NODES_PER_DOMAIN 16 +/* + * Self-tuning task migration cost measurement between source and target CPUs. + * + * This is done by measuring the cost of manipulating buffers of varying + * sizes. For a given buffer-size here are the steps that are taken: + * + * 1) the source CPU reads+dirties a shared buffer + * 2) the target CPU reads+dirties the same shared buffer + * + * We measure how long they take, in the following 4 scenarios: + * + * - source: CPU1, target: CPU2 | cost1 + * - source: CPU2, target: CPU1 | cost2 + * - source: CPU1, target: CPU1 | cost3 + * - source: CPU2, target: CPU2 | cost4 + * + * We then calculate the cost3+cost4-cost1-cost2 difference - this is + * the cost of migration. + * + * We then start off from a small buffer-size and iterate up to larger + * buffer sizes, in 5% steps - measuring each buffer-size separately, and + * doing a maximum search for the cost. (The maximum cost for a migration + * normally occurs when the working set size is around the effective cache + * size.) + */ +#define SEARCH_SCOPE 2 +#define MIN_CACHE_SIZE (64*1024U) +#define DEFAULT_CACHE_SIZE (5*1024*1024U) +#define ITERATIONS 2 +#define SIZE_THRESH 130 +#define COST_THRESH 130 + +/* + * The migration cost is a function of 'domain distance'. Domain + * distance is the number of steps a CPU has to iterate down its + * domain tree to share a domain with the other CPU. The farther + * two CPUs are from each other, the larger the distance gets. 
+ *
+ * Note that we use the distance only to cache measurement results,
+ * the distance value is not used numerically otherwise. When two
+ * CPUs have the same distance it is assumed that the migration
+ * cost is the same. (this is a simplification but quite practical)
+ */
+#define MAX_DOMAIN_DISTANCE 32
+
+static unsigned long long migration_cost[MAX_DOMAIN_DISTANCE] =
+	{ [ 0 ... MAX_DOMAIN_DISTANCE-1 ] = -1LL };
+
+/*
+ * Allow override of migration cost - in units of microseconds.
+ * E.g. migration_cost=1000,2000,3000 will set up a level-1 cost
+ * of 1 msec, level-2 cost of 2 msecs and level-3 cost of 3 msecs:
+ */
+static int __init migration_cost_setup(char *str)
+{
+	int ints[MAX_DOMAIN_DISTANCE+1], i;
+
+	str = get_options(str, ARRAY_SIZE(ints), ints);
+
+	printk("#ints: %d\n", ints[0]);
+	for (i = 1; i <= ints[0]; i++) {
+		migration_cost[i-1] = (unsigned long long)ints[i]*1000;
+		printk("migration_cost[%d]: %Ld\n", i-1, migration_cost[i-1]);
+	}
+	return 1;
+}
+
+__setup ("migration_cost=", migration_cost_setup);
+
+/*
+ * Global multiplier (divisor) for migration-cutoff values,
+ * in percentiles. E.g. use a value of 150 to get 1.5 times
+ * longer cache-hot cutoff times.
+ *
+ * (We scale it from 100 to 128 to make long long handling easier.)
+ */
+
+#define MIGRATION_FACTOR_SCALE 128
+
+static unsigned int migration_factor = MIGRATION_FACTOR_SCALE;
+
+static int __init setup_migration_factor(char *str)
+{
+	get_option(&str, &migration_factor);
+	migration_factor = migration_factor * MIGRATION_FACTOR_SCALE / 100;
+	return 1;
+}
+
+__setup("migration_factor=", setup_migration_factor);
+
+/*
+ * Estimated distance of two CPUs, measured via the number of domains
+ * we have to pass for the two CPUs to be in the same span:
+ */
+static unsigned long domain_distance(int cpu1, int cpu2)
+{
+	unsigned long distance = 0;
+	struct sched_domain *sd;
+
+	for_each_domain(cpu1, sd) {
+		WARN_ON(!cpu_isset(cpu1, sd->span));
+		if (cpu_isset(cpu2, sd->span))
+			return distance;
+		distance++;
+	}
+	if (distance >= MAX_DOMAIN_DISTANCE) {
+		WARN_ON(1);
+		distance = MAX_DOMAIN_DISTANCE-1;
+	}
+
+	return distance;
+}
+
+static unsigned int migration_debug;
+
+static int __init setup_migration_debug(char *str)
+{
+	get_option(&str, &migration_debug);
+	return 1;
+}
+
+__setup("migration_debug=", setup_migration_debug);
+
+/*
+ * Maximum cache-size that the scheduler should try to measure.
+ * Architectures with larger caches should tune this up during
+ * bootup. Gets used in the domain-setup code (i.e. during SMP
+ * bootup).
+ */
+unsigned int max_cache_size;
+
+static int __init setup_max_cache_size(char *str)
+{
+	get_option(&str, &max_cache_size);
+	return 1;
+}
+
+__setup("max_cache_size=", setup_max_cache_size);
+
+/*
+ * Dirty a big buffer in a hard-to-predict (for the L2 cache) way. This
+ * is the operation that is timed, so we try to generate unpredictable
+ * cachemisses that still end up filling the L2 cache:
+ */
+static void touch_cache(void *__cache, unsigned long __size)
+{
+	unsigned long size = __size/sizeof(long), chunk1 = size/3,
+			chunk2 = 2*size/3;
+	unsigned long *cache = __cache;
+	int i;
+
+	for (i = 0; i < size/6; i += 8) {
+		switch (i % 6) {
+			case 0: cache[i]++;
+			case 1: cache[size-1-i]++;
+			case 2: cache[chunk1-i]++;
+			case 3: cache[chunk1+i]++;
+			case 4: cache[chunk2-i]++;
+			case 5: cache[chunk2+i]++;
+		}
+	}
+}
+
+/*
+ * Measure the cache-cost of one task migration. Returns in units of nsec.
+ */
+static unsigned long long measure_one(void *cache, unsigned long size,
+				      int source, int target)
+{
+	cpumask_t mask, saved_mask;
+	unsigned long long t0, t1, t2, t3, cost;
+
+	saved_mask = current->cpus_allowed;
+
+	/*
+	 * Flush source caches to RAM and invalidate them:
+	 */
+	sched_cacheflush();
+
+	/*
+	 * Migrate to the source CPU:
+	 */
+	mask = cpumask_of_cpu(source);
+	set_cpus_allowed(current, mask);
+	WARN_ON(smp_processor_id() != source);
+
+	/*
+	 * Dirty the working set:
+	 */
+	t0 = sched_clock();
+	touch_cache(cache, size);
+	t1 = sched_clock();
+
+	/*
+	 * Migrate to the target CPU, dirty the L2 cache and access
+	 * the shared buffer. (which represents the working set
+	 * of a migrated task.)
+	 */
+	mask = cpumask_of_cpu(target);
+	set_cpus_allowed(current, mask);
+	WARN_ON(smp_processor_id() != target);
+
+	t2 = sched_clock();
+	touch_cache(cache, size);
+	t3 = sched_clock();
+
+	cost = t1-t0 + t3-t2;
+
+	if (migration_debug >= 2)
+		printk("[%d->%d]: %8Ld %8Ld %8Ld => %10Ld.\n",
+			source, target, t1-t0, t1-t0, t3-t2, cost);
+	/*
+	 * Flush target caches to RAM and invalidate them:
+	 */
+	sched_cacheflush();
+
+	set_cpus_allowed(current, saved_mask);
+
+	return cost;
+}
+
+/*
+ * Measure a series of task migrations and return the average
+ * result. Since this code runs early during bootup the system
+ * is 'undisturbed' and the average latency makes sense.
+ *
+ * The algorithm in essence auto-detects the relevant cache-size,
+ * so it will properly detect different cachesizes for different
+ * cache-hierarchies, depending on how the CPUs are connected.
+ *
+ * Architectures can prime the upper limit of the search range via
+ * max_cache_size, otherwise the search range defaults to 20MB...64K.
+ */
+static unsigned long long
+measure_cost(int cpu1, int cpu2, void *cache, unsigned int size)
+{
+	unsigned long long cost1, cost2;
+	int i;
+
+	/*
+	 * Measure the migration cost of 'size' bytes, over an
+	 * average of 10 runs:
+	 *
+	 * (We perturb the cache size by a small (0..4k)
+	 *  value to compensate size/alignment related artifacts.
+	 *  We also subtract the cost of the operation done on
+	 *  the same CPU.)
+	 */
+	cost1 = 0;
+
+	/*
+	 * dry run, to make sure we start off cache-cold on cpu1,
+	 * and to get any vmalloc pagefaults in advance:
+	 */
+	measure_one(cache, size, cpu1, cpu2);
+	for (i = 0; i < ITERATIONS; i++)
+		cost1 += measure_one(cache, size - i*1024, cpu1, cpu2);
+
+	measure_one(cache, size, cpu2, cpu1);
+	for (i = 0; i < ITERATIONS; i++)
+		cost1 += measure_one(cache, size - i*1024, cpu2, cpu1);
+
+	/*
+	 * (We measure the non-migrating [cached] cost on both
+	 *  cpu1 and cpu2, to handle CPUs with different speeds)
+	 */
+	cost2 = 0;
+
+	measure_one(cache, size, cpu1, cpu1);
+	for (i = 0; i < ITERATIONS; i++)
+		cost2 += measure_one(cache, size - i*1024, cpu1, cpu1);
+
+	measure_one(cache, size, cpu2, cpu2);
+	for (i = 0; i < ITERATIONS; i++)
+		cost2 += measure_one(cache, size - i*1024, cpu2, cpu2);
+
+	/*
+	 * Get the per-iteration migration cost:
+	 */
+	do_div(cost1, 2*ITERATIONS);
+	do_div(cost2, 2*ITERATIONS);
+
+	return cost1 - cost2;
+}
+
+static unsigned long long measure_migration_cost(int cpu1, int cpu2)
+{
+	unsigned long long max_cost = 0, fluct = 0, avg_fluct = 0;
+	unsigned int max_size, size, size_found = 0;
+	long long cost = 0, prev_cost;
+	void *cache;
+
+	/*
+	 * Search from max_cache_size*5 down to 64K - the real relevant
+	 * cachesize has to lie somewhere in between.
+	 */
+	if (max_cache_size) {
+		max_size = max(max_cache_size * SEARCH_SCOPE, MIN_CACHE_SIZE);
+		size = max(max_cache_size / SEARCH_SCOPE, MIN_CACHE_SIZE);
+	} else {
+		/*
+		 * Since we have no estimation about the relevant
+		 * search range
+		 */
+		max_size = DEFAULT_CACHE_SIZE * SEARCH_SCOPE;
+		size = MIN_CACHE_SIZE;
+	}
+
+	if (!cpu_online(cpu1) || !cpu_online(cpu2)) {
+		printk("cpu %d and %d not both online!\n", cpu1, cpu2);
+		return 0;
+	}
+
+	/*
+	 * Allocate the working set:
+	 */
+	cache = vmalloc(max_size);
+	if (!cache) {
+		printk("could not vmalloc %d bytes for cache!\n", 2*max_size);
+		return 1000000;	// return 1 msec on very small boxen
+	}
+
+	while (size <= max_size) {
+		prev_cost = cost;
+		cost = measure_cost(cpu1, cpu2, cache, size);
+
+		/*
+		 * Update the max:
+		 */
+		if (cost > 0) {
+			if (max_cost < cost) {
+				max_cost = cost;
+				size_found = size;
+			}
+		}
+		/*
+		 * Calculate average fluctuation, we use this to prevent
+		 * noise from triggering an early break out of the loop:
+		 */
+		fluct = abs(cost - prev_cost);
+		avg_fluct = (avg_fluct + fluct)/2;
+
+		if (migration_debug)
+			printk("-> [%d][%d][%7d] %3ld.%ld [%3ld.%ld] (%ld): (%8Ld %8Ld)\n",
+				cpu1, cpu2, size,
+				(long)cost / 1000000,
+				((long)cost / 100000) % 10,
+				(long)max_cost / 1000000,
+				((long)max_cost / 100000) % 10,
+				domain_distance(cpu1, cpu2),
+				cost, avg_fluct);
+
+		/*
+		 * If we iterated at least 20% past the previous maximum,
+		 * and the cost has dropped by more than 20% already,
+		 * (taking fluctuations into account) then we assume to
+		 * have found the maximum and break out of the loop early:
+		 */
+		if (size_found && (size*100 > size_found*SIZE_THRESH))
+			if (cost+avg_fluct <= 0 ||
+				max_cost*100 > (cost+avg_fluct)*COST_THRESH) {
+
+				if (migration_debug)
+					printk("-> found max.\n");
+				break;
+			}
+		/*
+		 * Increase the cachesize in 5% steps:
+		 */
+		size = size * 20 / 19;
+	}
+
+	if (migration_debug)
+		printk("[%d][%d] working set size found: %d, cost: %Ld\n",
+			cpu1, cpu2, size_found, max_cost);
+
+	vfree(cache);
+
+	/*
+	 * A task is considered 'cache cold' if at least 2 times
+	 * the worst-case cost of migration has passed.
+	 *
+	 * (this limit is only listened to if the load-balancing
+	 * situation is 'nice' - if there is a large imbalance we
+	 * ignore it for the sake of CPU utilization and
+	 * processing fairness.)
+	 */
+	return 2 * max_cost * migration_factor / MIGRATION_FACTOR_SCALE;
+}
+
+static void calibrate_migration_costs(const cpumask_t *cpu_map)
+{
+	int cpu1 = -1, cpu2 = -1, cpu, orig_cpu = raw_smp_processor_id();
+	unsigned long j0, j1, distance, max_distance = 0;
+	struct sched_domain *sd;
+
+	j0 = jiffies;
+
+	/*
+	 * First pass - calculate the cacheflush times:
+	 */
+	for_each_cpu_mask(cpu1, *cpu_map) {
+		for_each_cpu_mask(cpu2, *cpu_map) {
+			if (cpu1 == cpu2)
+				continue;
+			distance = domain_distance(cpu1, cpu2);
+			max_distance = max(max_distance, distance);
+			/*
+			 * No result cached yet?
+			 */
+			if (migration_cost[distance] == -1LL)
+				migration_cost[distance] =
+					measure_migration_cost(cpu1, cpu2);
+		}
+	}
+	/*
+	 * Second pass - update the sched domain hierarchy with
+	 * the new cache-hot-time estimations:
+	 */
+	for_each_cpu_mask(cpu, *cpu_map) {
+		distance = 0;
+		for_each_domain(cpu, sd) {
+			sd->cache_hot_time = migration_cost[distance];
+			distance++;
+		}
+	}
+	/*
+	 * Print the matrix:
+	 */
+	if (migration_debug)
+		printk("migration: max_cache_size: %d, cpu: %d MHz:\n",
+			max_cache_size,
+#ifdef CONFIG_X86
+			cpu_khz/1000
+#else
+			-1
+#endif
+		);
+	printk("migration_cost=");
+	for (distance = 0; distance <= max_distance; distance++) {
+		if (distance)
+			printk(",");
+		printk("%ld", (long)migration_cost[distance] / 1000);
+	}
+	printk("\n");
+	j1 = jiffies;
+	if (migration_debug)
+		printk("migration: %ld seconds\n", (j1-j0)/HZ);
+
+	/*
+	 * Move back to the original CPU. NUMA-Q gets confused
+	 * if we migrate to another quad during bootup.
+	 */
+	if (raw_smp_processor_id() != orig_cpu) {
+		cpumask_t mask = cpumask_of_cpu(orig_cpu),
+			saved_mask = current->cpus_allowed;
+
+		set_cpus_allowed(current, mask);
+		set_cpus_allowed(current, saved_mask);
+	}
+}
+
 #ifdef CONFIG_NUMA
+
 /**
  * find_next_best_node - find the next node to include in a sched_domain
  * @node: node whose sched_domain we're building
@@ -5448,6 +5912,10 @@ next_sg:
 #endif
 		cpu_attach_domain(sd, i);
 	}
+	/*
+	 * Tune cache-hot values:
+	 */
+	calibrate_migration_costs(cpu_map);
 }
 /*
  * Set up scheduler domains and groups. Callers must hold the hotplug lock.
--
cgit v1.2.2


From d7102e95b7b9c00277562c29aad421d2d521c5f6 Mon Sep 17 00:00:00 2001
From: "akpm@osdl.org"
Date: Thu, 12 Jan 2006 01:05:32 -0800
Subject: [PATCH] sched: filter affine wakeups

From: Nick Piggin

Track the last waker CPU, and only consider wakeup-balancing if there's
a match between the current waker CPU and the previous waker CPU.

This ensures that there is some correlation between two subsequent
wakeup events before we move the task. Should help random-wakeup
workloads on large SMP systems, by reducing the migration attempts by a
factor of nr_cpus.

Signed-off-by: Ingo Molnar
Signed-off-by: Nick Piggin
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 kernel/sched.c | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index 98461de1ab65..c9dec2aa1976 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -1290,6 +1290,9 @@ static int try_to_wake_up(task_t *p, unsigned int state, int sync)
 		}
 	}
 
+	if (p->last_waker_cpu != this_cpu)
+		goto out_set_cpu;
+
 	if (unlikely(!cpu_isset(this_cpu, p->cpus_allowed)))
 		goto out_set_cpu;
 
@@ -1360,6 +1363,8 @@ out_set_cpu:
 		cpu = task_cpu(p);
 	}
 
+	p->last_waker_cpu = this_cpu;
+
 out_activate:
 #endif /* CONFIG_SMP */
 	if (old_state == TASK_UNINTERRUPTIBLE) {
@@ -1441,9 +1446,12 @@ void fastcall sched_fork(task_t *p, int clone_flags)
 #ifdef CONFIG_SCHEDSTATS
 	memset(&p->sched_info, 0, sizeof(p->sched_info));
 #endif
-#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW)
+#if defined(CONFIG_SMP)
+	p->last_waker_cpu = cpu;
+#if defined(__ARCH_WANT_UNLOCKED_CTXSW)
 	p->oncpu = 0;
 #endif
+#endif
 #ifdef CONFIG_PREEMPT
 	/* Want to start with kernel preemption disabled. */
 	task_thread_info(p)->preempt_count = 1;
--
cgit v1.2.2
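The last-waker filter above boils down to a two-state check. Here is a
minimal stand-alone sketch of that logic (plain userspace C; the
task_sketch struct and wake_affine_candidate() name are ad hoc, not the
kernel's task_t or its wakeup path):

	#include <stdio.h>

	struct task_sketch {
		int last_waker_cpu;	/* CPU that last woke this task */
	};

	/* Only the second consecutive wakeup from the same CPU makes the
	 * task a candidate for wakeup balancing: */
	static int wake_affine_candidate(struct task_sketch *p, int this_cpu)
	{
		if (p->last_waker_cpu != this_cpu) {
			p->last_waker_cpu = this_cpu;	/* remember the new waker */
			return 0;	/* no correlation yet - leave the task alone */
		}
		return 1;	/* two wakeups in a row from this CPU */
	}

	int main(void)
	{
		struct task_sketch t = { .last_waker_cpu = 0 };

		printf("%d\n", wake_affine_candidate(&t, 1));	/* 0: first wake from CPU1 */
		printf("%d\n", wake_affine_candidate(&t, 1));	/* 1: repeated waker */
		return 0;
	}

Requiring the repeat before considering a migration is what reduces the
migration attempts by roughly a factor of nr_cpus on random-wakeup
workloads.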