From 4b5d37ac02954572e80e09255bb5737277aaee8e Mon Sep 17 00:00:00 2001
From: Giancarlo Formicuccia <giancarlo.formicuccia@gmail.com>
Date: Fri, 9 Sep 2005 13:01:22 -0700
Subject: [PATCH] Clear task_struct->fs_excl on fork()

An oversight.  We don't want to carry the IO scheduler's "we hold exclusive fs
resources" hint over to the child across fork().

Acked-by: Jens Axboe <axboe@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/fork.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'kernel/fork.c')

diff --git a/kernel/fork.c b/kernel/fork.c
index 7e1ead9a6ba4..dfeadf466f18 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -176,6 +176,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig)
 
 	/* One for us, one for whoever does the "release_task()" (usually parent) */
 	atomic_set(&tsk->usage,2);
+	atomic_set(&tsk->fs_excl, 0);
 	return tsk;
 }
 
-- 
cgit v1.2.2


From b0d62e6d5b3318b6b722121d945afa295f7201b5 Mon Sep 17 00:00:00 2001
From: Jason Baron <jbaron@redhat.com>
Date: Fri, 9 Sep 2005 13:02:01 -0700
Subject: [PATCH] fix disassociate_ctty vs. fork race

Race is as follows. Process A forks process B, both being part of the same
session. Then, A calls disassociate_ctty while B forks C:

A				B
====				====
				fork()
				  copy_signal()
dissasociate_ctty()		....
				  attach_pid(p, PIDTYPE_SID, p->signal->session);

Now, C can have current->signal->tty pointing to a freed tty structure, as
it hasn't yet been added to the session group (to have its controlling tty
cleared on the diassociate_ctty() call).

This has shown up as an oops but could be even more serious.  I haven't
tried to create a test case, but a customer has verified that the patch
below resolves the issue, which was occuring quite frequently.  I'll try
and post the test case if i can.

The patch simply checks for a NULL tty *after* it has been attached to the
proper session group and clears it as necessary.  Alternatively, we could
simply do the tty assignment after the the process is added to the proper
session group.

Signed-off-by: Jason Baron <jbaron@redhat.com>
Cc: Roland McGrath <roland@redhat.com>
Cc: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/fork.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'kernel/fork.c')

diff --git a/kernel/fork.c b/kernel/fork.c
index dfeadf466f18..b25802065031 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1116,6 +1116,9 @@ static task_t *copy_process(unsigned long clone_flags,
 			__get_cpu_var(process_counts)++;
 	}
 
+	if (!current->signal->tty && p->signal->tty)
+		p->signal->tty = NULL;
+
 	nr_threads++;
 	total_forks++;
 	write_unlock_irq(&tasklist_lock);
-- 
cgit v1.2.2


From badf16621c1f9d1ac753be056fce11b43d6e0be5 Mon Sep 17 00:00:00 2001
From: Dipankar Sarma <dipankar@in.ibm.com>
Date: Fri, 9 Sep 2005 13:04:10 -0700
Subject: [PATCH] files: break up files struct

In order for the RCU to work, the file table array, sets and their sizes must
be updated atomically.  Instead of ensuring this through too many memory
barriers, we put the arrays and their sizes in a separate structure.  This
patch takes the first step of putting the file table elements in a separate
structure fdtable that is embedded withing files_struct.  It also changes all
the users to refer to the file table using files_fdtable() macro.  Subsequent
applciation of RCU becomes easier after this.

Signed-off-by: Dipankar Sarma <dipankar@in.ibm.com>
Signed-Off-By: David Howells <dhowells@redhat.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/fork.c | 82 ++++++++++++++++++++++++++++++++++++-----------------------
 1 file changed, 50 insertions(+), 32 deletions(-)

(limited to 'kernel/fork.c')

diff --git a/kernel/fork.c b/kernel/fork.c
index b25802065031..ecc694debb50 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -568,21 +568,47 @@ static inline int copy_fs(unsigned long clone_flags, struct task_struct * tsk)
 static int count_open_files(struct files_struct *files, int size)
 {
 	int i;
+	struct fdtable *fdt;
 
 	/* Find the last open fd */
+	fdt = files_fdtable(files);
 	for (i = size/(8*sizeof(long)); i > 0; ) {
-		if (files->open_fds->fds_bits[--i])
+		if (fdt->open_fds->fds_bits[--i])
 			break;
 	}
 	i = (i+1) * 8 * sizeof(long);
 	return i;
 }
 
+static struct files_struct *alloc_files(void)
+{
+	struct files_struct *newf;
+	struct fdtable *fdt;
+
+	newf = kmem_cache_alloc(files_cachep, SLAB_KERNEL);
+	if (!newf)
+		goto out;
+
+	atomic_set(&newf->count, 1);
+
+	spin_lock_init(&newf->file_lock);
+	fdt = files_fdtable(newf);
+	fdt->next_fd = 0;
+	fdt->max_fds = NR_OPEN_DEFAULT;
+	fdt->max_fdset = __FD_SETSIZE;
+	fdt->close_on_exec = &newf->close_on_exec_init;
+	fdt->open_fds = &newf->open_fds_init;
+	fdt->fd = &newf->fd_array[0];
+out:
+	return newf;
+}
+
 static int copy_files(unsigned long clone_flags, struct task_struct * tsk)
 {
 	struct files_struct *oldf, *newf;
 	struct file **old_fds, **new_fds;
 	int open_files, size, i, error = 0, expand;
+	struct fdtable *old_fdt, *new_fdt;
 
 	/*
 	 * A background process may not have any files ...
@@ -603,35 +629,27 @@ static int copy_files(unsigned long clone_flags, struct task_struct * tsk)
 	 */
 	tsk->files = NULL;
 	error = -ENOMEM;
-	newf = kmem_cache_alloc(files_cachep, SLAB_KERNEL);
-	if (!newf) 
+	newf = alloc_files();
+	if (!newf)
 		goto out;
 
-	atomic_set(&newf->count, 1);
-
-	spin_lock_init(&newf->file_lock);
-	newf->next_fd	    = 0;
-	newf->max_fds	    = NR_OPEN_DEFAULT;
-	newf->max_fdset	    = __FD_SETSIZE;
-	newf->close_on_exec = &newf->close_on_exec_init;
-	newf->open_fds	    = &newf->open_fds_init;
-	newf->fd	    = &newf->fd_array[0];
-
 	spin_lock(&oldf->file_lock);
-
-	open_files = count_open_files(oldf, oldf->max_fdset);
+	old_fdt = files_fdtable(oldf);
+	new_fdt = files_fdtable(newf);
+	size = old_fdt->max_fdset;
+	open_files = count_open_files(oldf, old_fdt->max_fdset);
 	expand = 0;
 
 	/*
 	 * Check whether we need to allocate a larger fd array or fd set.
 	 * Note: we're not a clone task, so the open count won't  change.
 	 */
-	if (open_files > newf->max_fdset) {
-		newf->max_fdset = 0;
+	if (open_files > new_fdt->max_fdset) {
+		new_fdt->max_fdset = 0;
 		expand = 1;
 	}
-	if (open_files > newf->max_fds) {
-		newf->max_fds = 0;
+	if (open_files > new_fdt->max_fds) {
+		new_fdt->max_fds = 0;
 		expand = 1;
 	}
 
@@ -646,11 +664,11 @@ static int copy_files(unsigned long clone_flags, struct task_struct * tsk)
 		spin_lock(&oldf->file_lock);
 	}
 
-	old_fds = oldf->fd;
-	new_fds = newf->fd;
+	old_fds = old_fdt->fd;
+	new_fds = new_fdt->fd;
 
-	memcpy(newf->open_fds->fds_bits, oldf->open_fds->fds_bits, open_files/8);
-	memcpy(newf->close_on_exec->fds_bits, oldf->close_on_exec->fds_bits, open_files/8);
+	memcpy(new_fdt->open_fds->fds_bits, old_fdt->open_fds->fds_bits, open_files/8);
+	memcpy(new_fdt->close_on_exec->fds_bits, old_fdt->close_on_exec->fds_bits, open_files/8);
 
 	for (i = open_files; i != 0; i--) {
 		struct file *f = *old_fds++;
@@ -663,24 +681,24 @@ static int copy_files(unsigned long clone_flags, struct task_struct * tsk)
 			 * is partway through open().  So make sure that this
 			 * fd is available to the new process.
 			 */
-			FD_CLR(open_files - i, newf->open_fds);
+			FD_CLR(open_files - i, new_fdt->open_fds);
 		}
 		*new_fds++ = f;
 	}
 	spin_unlock(&oldf->file_lock);
 
 	/* compute the remainder to be cleared */
-	size = (newf->max_fds - open_files) * sizeof(struct file *);
+	size = (new_fdt->max_fds - open_files) * sizeof(struct file *);
 
 	/* This is long word aligned thus could use a optimized version */ 
 	memset(new_fds, 0, size); 
 
-	if (newf->max_fdset > open_files) {
-		int left = (newf->max_fdset-open_files)/8;
+	if (new_fdt->max_fdset > open_files) {
+		int left = (new_fdt->max_fdset-open_files)/8;
 		int start = open_files / (8 * sizeof(unsigned long));
 
-		memset(&newf->open_fds->fds_bits[start], 0, left);
-		memset(&newf->close_on_exec->fds_bits[start], 0, left);
+		memset(&new_fdt->open_fds->fds_bits[start], 0, left);
+		memset(&new_fdt->close_on_exec->fds_bits[start], 0, left);
 	}
 
 	tsk->files = newf;
@@ -689,9 +707,9 @@ out:
 	return error;
 
 out_release:
-	free_fdset (newf->close_on_exec, newf->max_fdset);
-	free_fdset (newf->open_fds, newf->max_fdset);
-	free_fd_array(newf->fd, newf->max_fds);
+	free_fdset (new_fdt->close_on_exec, new_fdt->max_fdset);
+	free_fdset (new_fdt->open_fds, new_fdt->max_fdset);
+	free_fd_array(new_fdt->fd, new_fdt->max_fds);
 	kmem_cache_free(files_cachep, newf);
 	goto out;
 }
-- 
cgit v1.2.2


From ab2af1f5005069321c5d130f09cce577b03f43ef Mon Sep 17 00:00:00 2001
From: Dipankar Sarma <dipankar@in.ibm.com>
Date: Fri, 9 Sep 2005 13:04:13 -0700
Subject: [PATCH] files: files struct with RCU

Patch to eliminate struct files_struct.file_lock spinlock on the reader side
and use rcu refcounting rcuref_xxx api for the f_count refcounter.  The
updates to the fdtable are done by allocating a new fdtable structure and
setting files->fdt to point to the new structure.  The fdtable structure is
protected by RCU thereby allowing lock-free lookup.  For fd arrays/sets that
are vmalloced, we use keventd to free them since RCU callbacks can't sleep.  A
global list of fdtable to be freed is not scalable, so we use a per-cpu list.
If keventd is already handling the current cpu's work, we use a timer to defer
queueing of that work.

Since the last publication, this patch has been re-written to avoid using
explicit memory barriers and use rcu_assign_pointer(), rcu_dereference()
premitives instead.  This required that the fd information is kept in a
separate structure (fdtable) and updated atomically.

Signed-off-by: Dipankar Sarma <dipankar@in.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/fork.c | 23 +++++++++++++++++------
 1 file changed, 17 insertions(+), 6 deletions(-)

(limited to 'kernel/fork.c')

diff --git a/kernel/fork.c b/kernel/fork.c
index ecc694debb50..8149f3602881 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -35,6 +35,7 @@
 #include <linux/syscalls.h>
 #include <linux/jiffies.h>
 #include <linux/futex.h>
+#include <linux/rcupdate.h>
 #include <linux/ptrace.h>
 #include <linux/mount.h>
 #include <linux/audit.h>
@@ -565,13 +566,12 @@ static inline int copy_fs(unsigned long clone_flags, struct task_struct * tsk)
 	return 0;
 }
 
-static int count_open_files(struct files_struct *files, int size)
+static int count_open_files(struct fdtable *fdt)
 {
+	int size = fdt->max_fdset;
 	int i;
-	struct fdtable *fdt;
 
 	/* Find the last open fd */
-	fdt = files_fdtable(files);
 	for (i = size/(8*sizeof(long)); i > 0; ) {
 		if (fdt->open_fds->fds_bits[--i])
 			break;
@@ -592,13 +592,17 @@ static struct files_struct *alloc_files(void)
 	atomic_set(&newf->count, 1);
 
 	spin_lock_init(&newf->file_lock);
-	fdt = files_fdtable(newf);
+	fdt = &newf->fdtab;
 	fdt->next_fd = 0;
 	fdt->max_fds = NR_OPEN_DEFAULT;
 	fdt->max_fdset = __FD_SETSIZE;
 	fdt->close_on_exec = &newf->close_on_exec_init;
 	fdt->open_fds = &newf->open_fds_init;
 	fdt->fd = &newf->fd_array[0];
+	INIT_RCU_HEAD(&fdt->rcu);
+	fdt->free_files = NULL;
+	fdt->next = NULL;
+	rcu_assign_pointer(newf->fdt, fdt);
 out:
 	return newf;
 }
@@ -637,7 +641,7 @@ static int copy_files(unsigned long clone_flags, struct task_struct * tsk)
 	old_fdt = files_fdtable(oldf);
 	new_fdt = files_fdtable(newf);
 	size = old_fdt->max_fdset;
-	open_files = count_open_files(oldf, old_fdt->max_fdset);
+	open_files = count_open_files(old_fdt);
 	expand = 0;
 
 	/*
@@ -661,7 +665,14 @@ static int copy_files(unsigned long clone_flags, struct task_struct * tsk)
 		spin_unlock(&newf->file_lock);
 		if (error < 0)
 			goto out_release;
+		new_fdt = files_fdtable(newf);
+		/*
+		 * Reacquire the oldf lock and a pointer to its fd table
+		 * who knows it may have a new bigger fd table. We need
+		 * the latest pointer.
+		 */
 		spin_lock(&oldf->file_lock);
+		old_fdt = files_fdtable(oldf);
 	}
 
 	old_fds = old_fdt->fd;
@@ -683,7 +694,7 @@ static int copy_files(unsigned long clone_flags, struct task_struct * tsk)
 			 */
 			FD_CLR(open_files - i, new_fdt->open_fds);
 		}
-		*new_fds++ = f;
+		rcu_assign_pointer(*new_fds++, f);
 	}
 	spin_unlock(&oldf->file_lock);
 
-- 
cgit v1.2.2