From d55b5fdaf40846221d543937b786956e27837fda Mon Sep 17 00:00:00 2001
From: Zach Brown <zach.brown@oracle.com>
Date: Mon, 7 Nov 2005 00:59:31 -0800
Subject: [PATCH] aio: remove aio_max_nr accounting race

AIO was adding a new context's max requests to the global total before
testing if that resulting total was over the global limit.  This let
innocent tasks get their new limit tested along with a racing guilty task
that was crossing the limit.  This serializes the _nr accounting with a
spinlock It also switches to using unsigned long for the global totals.
Individual contexts are still limited to an unsigned int's worth of
requests by the syscall interface.

The problem and fix were verified with a simple program that spun creating
and destroying a context while holding on to another long lived context.
Before the patch a task creating a tiny context could get a spurious EAGAIN
if it raced with a task creating a very large context that overran the
limit.

Signed-off-by: Zach Brown <zach.brown@oracle.com>
Cc: Benjamin LaHaise <bcrl@kvack.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/sysctl.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel/sysctl.c')
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 8e56e2495542..e1351200ce85 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -952,7 +952,7 @@ static ctl_table fs_table[] = {
 		.data		= &aio_nr,
 		.maxlen		= sizeof(aio_nr),
 		.mode		= 0444,
-		.proc_handler	= &proc_dointvec,
+		.proc_handler	= &proc_doulongvec_minmax,
 	},
 	{
 		.ctl_name	= FS_AIO_MAX_NR,
@@ -960,7 +960,7 @@ static ctl_table fs_table[] = {
 		.data		= &aio_max_nr,
 		.maxlen		= sizeof(aio_max_nr),
 		.mode		= 0644,
-		.proc_handler	= &proc_dointvec,
+		.proc_handler	= &proc_doulongvec_minmax,
 	},
 #ifdef CONFIG_INOTIFY
 	{
-- 
cgit v1.2.2


From 1e5d533142c1c178a31d4cc81837eb078f9269bc Mon Sep 17 00:00:00 2001
From: Randy Dunlap <rdunlap@xenotime.net>
Date: Mon, 7 Nov 2005 01:01:06 -0800
Subject: [PATCH] more kernel-doc cleanups, additions

Various core kernel-doc cleanups:
- add missing function parameters in ipc, irq/manage, kernel/sys,
  kernel/sysctl, and mm/slab;
- move description to just above function for kernel_restart()

Signed-off-by: Randy Dunlap <rdunlap@xenotime.net>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/sysctl.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'kernel/sysctl.c')

diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index e1351200ce85..c4f35f96884d 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -1997,6 +1997,7 @@ int proc_dointvec_jiffies(ctl_table *table, int write, struct file *filp,
  * @filp: the file structure
  * @buffer: the user buffer
  * @lenp: the size of the user buffer
+ * @ppos: pointer to the file position
  *
  * Reads/writes up to table->maxlen/sizeof(unsigned int) integer
  * values from/to the user buffer, treated as an ASCII string. 
-- 
cgit v1.2.2


From 330d57fb98a916fa8e1363846540dd420e99499a Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Fri, 4 Nov 2005 10:18:40 +0000
Subject: [PATCH] Fix sysctl unregistration oops (CVE-2005-2709)

You could open the /proc/sys/net/ipv4/conf/<if>/<whatever> file, then
wait for interface to go away, try to grab as much memory as possible in
hope to hit the (kfreed) ctl_table.  Then fill it with pointers to your
function.  Then do read from file you've opened and if you are lucky,
you'll get it called as ->proc_handler() in kernel mode.

So this is at least an Oops and possibly more.  It does depend on an
interface going away though, so less of a security risk than it would
otherwise be.

Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/sysctl.c | 136 ++++++++++++++++++++++++++++++++++++++++++++------------
 1 file changed, 107 insertions(+), 29 deletions(-)

(limited to 'kernel/sysctl.c')

diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index c4f35f96884d..9990e10192e8 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -169,7 +169,7 @@ struct file_operations proc_sys_file_operations = {
 
 extern struct proc_dir_entry *proc_sys_root;
 
-static void register_proc_table(ctl_table *, struct proc_dir_entry *);
+static void register_proc_table(ctl_table *, struct proc_dir_entry *, void *);
 static void unregister_proc_table(ctl_table *, struct proc_dir_entry *);
 #endif
 
@@ -992,10 +992,51 @@ static ctl_table dev_table[] = {
 
 extern void init_irq_proc (void);
 
+static DEFINE_SPINLOCK(sysctl_lock);
+
+/* called under sysctl_lock */
+static int use_table(struct ctl_table_header *p)
+{
+	if (unlikely(p->unregistering))
+		return 0;
+	p->used++;
+	return 1;
+}
+
+/* called under sysctl_lock */
+static void unuse_table(struct ctl_table_header *p)
+{
+	if (!--p->used)
+		if (unlikely(p->unregistering))
+			complete(p->unregistering);
+}
+
+/* called under sysctl_lock, will reacquire if has to wait */
+static void start_unregistering(struct ctl_table_header *p)
+{
+	/*
+	 * if p->used is 0, nobody will ever touch that entry again;
+	 * we'll eliminate all paths to it before dropping sysctl_lock
+	 */
+	if (unlikely(p->used)) {
+		struct completion wait;
+		init_completion(&wait);
+		p->unregistering = &wait;
+		spin_unlock(&sysctl_lock);
+		wait_for_completion(&wait);
+		spin_lock(&sysctl_lock);
+	}
+	/*
+	 * do not remove from the list until nobody holds it; walking the
+	 * list in do_sysctl() relies on that.
+	 */
+	list_del_init(&p->ctl_entry);
+}
+
 void __init sysctl_init(void)
 {
 #ifdef CONFIG_PROC_FS
-	register_proc_table(root_table, proc_sys_root);
+	register_proc_table(root_table, proc_sys_root, &root_table_header);
 	init_irq_proc();
 #endif
 }
@@ -1004,6 +1045,7 @@ int do_sysctl(int __user *name, int nlen, void __user *oldval, size_t __user *ol
 	       void __user *newval, size_t newlen)
 {
 	struct list_head *tmp;
+	int error = -ENOTDIR;
 
 	if (nlen <= 0 || nlen >= CTL_MAXNAME)
 		return -ENOTDIR;
@@ -1012,20 +1054,30 @@ int do_sysctl(int __user *name, int nlen, void __user *oldval, size_t __user *ol
 		if (!oldlenp || get_user(old_len, oldlenp))
 			return -EFAULT;
 	}
+	spin_lock(&sysctl_lock);
 	tmp = &root_table_header.ctl_entry;
 	do {
 		struct ctl_table_header *head =
 			list_entry(tmp, struct ctl_table_header, ctl_entry);
 		void *context = NULL;
-		int error = parse_table(name, nlen, oldval, oldlenp, 
+
+		if (!use_table(head))
+			continue;
+
+		spin_unlock(&sysctl_lock);
+
+		error = parse_table(name, nlen, oldval, oldlenp, 
 					newval, newlen, head->ctl_table,
 					&context);
 		kfree(context);
+
+		spin_lock(&sysctl_lock);
+		unuse_table(head);
 		if (error != -ENOTDIR)
-			return error;
-		tmp = tmp->next;
-	} while (tmp != &root_table_header.ctl_entry);
-	return -ENOTDIR;
+			break;
+	} while ((tmp = tmp->next) != &root_table_header.ctl_entry);
+	spin_unlock(&sysctl_lock);
+	return error;
 }
 
 asmlinkage long sys_sysctl(struct __sysctl_args __user *args)
@@ -1236,12 +1288,16 @@ struct ctl_table_header *register_sysctl_table(ctl_table * table,
 		return NULL;
 	tmp->ctl_table = table;
 	INIT_LIST_HEAD(&tmp->ctl_entry);
+	tmp->used = 0;
+	tmp->unregistering = NULL;
+	spin_lock(&sysctl_lock);
 	if (insert_at_head)
 		list_add(&tmp->ctl_entry, &root_table_header.ctl_entry);
 	else
 		list_add_tail(&tmp->ctl_entry, &root_table_header.ctl_entry);
+	spin_unlock(&sysctl_lock);
 #ifdef CONFIG_PROC_FS
-	register_proc_table(table, proc_sys_root);
+	register_proc_table(table, proc_sys_root, tmp);
 #endif
 	return tmp;
 }
@@ -1255,10 +1311,13 @@ struct ctl_table_header *register_sysctl_table(ctl_table * table,
  */
 void unregister_sysctl_table(struct ctl_table_header * header)
 {
-	list_del(&header->ctl_entry);
+	might_sleep();
+	spin_lock(&sysctl_lock);
+	start_unregistering(header);
 #ifdef CONFIG_PROC_FS
 	unregister_proc_table(header->ctl_table, proc_sys_root);
 #endif
+	spin_unlock(&sysctl_lock);
 	kfree(header);
 }
 
@@ -1269,7 +1328,7 @@ void unregister_sysctl_table(struct ctl_table_header * header)
 #ifdef CONFIG_PROC_FS
 
 /* Scan the sysctl entries in table and add them all into /proc */
-static void register_proc_table(ctl_table * table, struct proc_dir_entry *root)
+static void register_proc_table(ctl_table * table, struct proc_dir_entry *root, void *set)
 {
 	struct proc_dir_entry *de;
 	int len;
@@ -1305,13 +1364,14 @@ static void register_proc_table(ctl_table * table, struct proc_dir_entry *root)
 			de = create_proc_entry(table->procname, mode, root);
 			if (!de)
 				continue;
+			de->set = set;
 			de->data = (void *) table;
 			if (table->proc_handler)
 				de->proc_fops = &proc_sys_file_operations;
 		}
 		table->de = de;
 		if (de->mode & S_IFDIR)
-			register_proc_table(table->child, de);
+			register_proc_table(table->child, de, set);
 	}
 }
 
@@ -1336,6 +1396,13 @@ static void unregister_proc_table(ctl_table * table, struct proc_dir_entry *root
 				continue;
 		}
 
+		/*
+		 * In any case, mark the entry as goner; we'll keep it
+		 * around if it's busy, but we'll know to do nothing with
+		 * its fields.  We are under sysctl_lock here.
+		 */
+		de->data = NULL;
+
 		/* Don't unregister proc entries that are still being used.. */
 		if (atomic_read(&de->count))
 			continue;
@@ -1349,27 +1416,38 @@ static ssize_t do_rw_proc(int write, struct file * file, char __user * buf,
 			  size_t count, loff_t *ppos)
 {
 	int op;
-	struct proc_dir_entry *de;
+	struct proc_dir_entry *de = PDE(file->f_dentry->d_inode);
 	struct ctl_table *table;
 	size_t res;
-	ssize_t error;
-	
-	de = PDE(file->f_dentry->d_inode);
-	if (!de || !de->data)
-		return -ENOTDIR;
-	table = (struct ctl_table *) de->data;
-	if (!table || !table->proc_handler)
-		return -ENOTDIR;
-	op = (write ? 002 : 004);
-	if (ctl_perm(table, op))
-		return -EPERM;
+	ssize_t error = -ENOTDIR;
 	
-	res = count;
-
-	error = (*table->proc_handler) (table, write, file, buf, &res, ppos);
-	if (error)
-		return error;
-	return res;
+	spin_lock(&sysctl_lock);
+	if (de && de->data && use_table(de->set)) {
+		/*
+		 * at that point we know that sysctl was not unregistered
+		 * and won't be until we finish
+		 */
+		spin_unlock(&sysctl_lock);
+		table = (struct ctl_table *) de->data;
+		if (!table || !table->proc_handler)
+			goto out;
+		error = -EPERM;
+		op = (write ? 002 : 004);
+		if (ctl_perm(table, op))
+			goto out;
+		
+		/* careful: calling conventions are nasty here */
+		res = count;
+		error = (*table->proc_handler)(table, write, file,
+						buf, &res, ppos);
+		if (!error)
+			error = res;
+	out:
+		spin_lock(&sysctl_lock);
+		unuse_table(de->set);
+	}
+	spin_unlock(&sysctl_lock);
+	return error;
 }
 
 static int proc_opensys(struct inode *inode, struct file *file)
-- 
cgit v1.2.2