From 81c29a857d3c8d6ea9c4f20d196c36bf0a07c615 Mon Sep 17 00:00:00 2001
From: Ingo Molnar
Date: Tue, 7 Mar 2006 21:55:27 -0800
Subject: [PATCH] idle threads should have a sane ->timestamp value

Idle threads should have a sane ->timestamp value, to keep the init
kernel thread(s) from inheriting it and causing miscalculations in
try_to_wake_up().

Reported-by: Mike Galbraith
Signed-off-by: Ingo Molnar
Cc: Nick Piggin
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 kernel/sched.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index 3454bb869fd0..e82c99f1db64 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -4335,6 +4335,7 @@ void __devinit init_idle(task_t *idle, int cpu)
 	runqueue_t *rq = cpu_rq(cpu);
 	unsigned long flags;
 
+	idle->timestamp = sched_clock();
 	idle->sleep_avg = 0;
 	idle->array = NULL;
 	idle->prio = MAX_PRIO;
-- 
cgit v1.2.2
From 21a1ea9eb40411d4ee29448c53b9e4c0654d6ceb Mon Sep 17 00:00:00 2001
From: Dipankar Sarma
Date: Tue, 7 Mar 2006 21:55:33 -0800
Subject: [PATCH] rcu batch tuning

This patch adds new tunables for RCU queue and finished batches.  There
are two types of controls - the number of completed RCU updates invoked
in a batch (blimit), and monitoring for a high rate of incoming RCUs on
a cpu (qhimark, qlowmark).

By default, the per-cpu batch limit is set to a small value.  If the
input RCU rate exceeds the high watermark, we do two things - force a
quiescent state on all cpus, and set the batch limit of the CPU to
INTMAX.  Setting the batch limit to INTMAX forces all finished RCUs to
be processed in one shot.  If we have more than INTMAX RCUs queued up,
then we have bigger problems anyway.  Once the incoming queued RCUs
fall below the low watermark, the batch limit is set back to the
default.

Signed-off-by: Dipankar Sarma
Cc: "Paul E. McKenney"
Cc: "David S. Miller"
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 kernel/rcupdate.c | 76 ++++++++++++++++++++++++++++++++++++++++++-------------
 1 file changed, 58 insertions(+), 18 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c
index 0cf8146bd585..8cf15a569fcd 100644
--- a/kernel/rcupdate.c
+++ b/kernel/rcupdate.c
@@ -67,7 +67,43 @@ DEFINE_PER_CPU(struct rcu_data, rcu_bh_data) = { 0L };
 
 /* Fake initialization required by compiler */
 static DEFINE_PER_CPU(struct tasklet_struct, rcu_tasklet) = {NULL};
-static int maxbatch = 10000;
+static int blimit = 10;
+static int qhimark = 10000;
+static int qlowmark = 100;
+#ifdef CONFIG_SMP
+static int rsinterval = 1000;
+#endif
+
+static atomic_t rcu_barrier_cpu_count;
+static struct semaphore rcu_barrier_sema;
+static struct completion rcu_barrier_completion;
+
+#ifdef CONFIG_SMP
+static void force_quiescent_state(struct rcu_data *rdp,
+			struct rcu_ctrlblk *rcp)
+{
+	int cpu;
+	cpumask_t cpumask;
+	set_need_resched();
+	if (unlikely(rdp->qlen - rdp->last_rs_qlen > rsinterval)) {
+		rdp->last_rs_qlen = rdp->qlen;
+		/*
+		 * Don't send IPI to itself. With irqs disabled,
+		 * rdp->cpu is the current cpu.
+		 */
+		cpumask = rcp->cpumask;
+		cpu_clear(rdp->cpu, cpumask);
+		for_each_cpu_mask(cpu, cpumask)
+			smp_send_reschedule(cpu);
+	}
+}
+#else
+static inline void force_quiescent_state(struct rcu_data *rdp,
+			struct rcu_ctrlblk *rcp)
+{
+	set_need_resched();
+}
+#endif
 
 /**
  * call_rcu - Queue an RCU callback for invocation after a grace period.
@@ -92,17 +128,13 @@ void fastcall call_rcu(struct rcu_head *head,
 	rdp = &__get_cpu_var(rcu_data);
 	*rdp->nxttail = head;
 	rdp->nxttail = &head->next;
-
-	if (unlikely(++rdp->count > 10000))
-		set_need_resched();
-
+	if (unlikely(++rdp->qlen > qhimark)) {
+		rdp->blimit = INT_MAX;
+		force_quiescent_state(rdp, &rcu_ctrlblk);
+	}
 	local_irq_restore(flags);
 }
 
-static atomic_t rcu_barrier_cpu_count;
-static struct semaphore rcu_barrier_sema;
-static struct completion rcu_barrier_completion;
-
 /**
  * call_rcu_bh - Queue an RCU for invocation after a quicker grace period.
  * @head: structure to be used for queueing the RCU updates.
@@ -131,12 +163,12 @@ void fastcall call_rcu_bh(struct rcu_head *head,
 	rdp = &__get_cpu_var(rcu_bh_data);
 	*rdp->nxttail = head;
 	rdp->nxttail = &head->next;
-	rdp->count++;
-/*
- * Should we directly call rcu_do_batch() here ?
- * if (unlikely(rdp->count > 10000))
- *	rcu_do_batch(rdp);
- */
+
+	if (unlikely(++rdp->qlen > qhimark)) {
+		rdp->blimit = INT_MAX;
+		force_quiescent_state(rdp, &rcu_bh_ctrlblk);
+	}
+
 	local_irq_restore(flags);
 }
 
@@ -199,10 +231,12 @@ static void rcu_do_batch(struct rcu_data *rdp)
 		next = rdp->donelist = list->next;
 		list->func(list);
 		list = next;
-		rdp->count--;
-		if (++count >= maxbatch)
+		rdp->qlen--;
+		if (++count >= rdp->blimit)
 			break;
 	}
+	if (rdp->blimit == INT_MAX && rdp->qlen <= qlowmark)
+		rdp->blimit = blimit;
 	if (!rdp->donelist)
 		rdp->donetail = &rdp->donelist;
 	else
@@ -473,6 +507,7 @@ static void rcu_init_percpu_data(int cpu, struct rcu_ctrlblk *rcp,
 	rdp->quiescbatch = rcp->completed;
 	rdp->qs_pending = 0;
 	rdp->cpu = cpu;
+	rdp->blimit = blimit;
 }
 
 static void __devinit rcu_online_cpu(int cpu)
@@ -567,7 +602,12 @@ void synchronize_kernel(void)
 	synchronize_rcu();
 }
 
-module_param(maxbatch, int, 0);
+module_param(blimit, int, 0);
+module_param(qhimark, int, 0);
+module_param(qlowmark, int, 0);
+#ifdef CONFIG_SMP
+module_param(rsinterval, int, 0);
+#endif
 EXPORT_SYMBOL_GPL(rcu_batches_completed);
 EXPORT_SYMBOL(call_rcu);  /* WARNING: GPL-only in April 2006. */
 EXPORT_SYMBOL(call_rcu_bh);  /* WARNING: GPL-only in April 2006. */
-- 
cgit v1.2.2
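The watermark behaviour in the patch above is easier to see in
isolation.  The following is a minimal userspace sketch of the same
hysteresis; all names and values here are illustrative stand-ins, not
the kernel code itself:

#include <limits.h>
#include <stdio.h>

/* Illustrative tunables mirroring the patch's defaults. */
static const int blimit_default = 10;   /* callbacks per batch          */
static const int qhimark = 10000;       /* high watermark: force drain  */
static const int qlowmark = 100;        /* low watermark: restore limit */

struct cb_queue {
	int qlen;    /* callbacks currently queued */
	int blimit;  /* current per-batch limit    */
};

/* Enqueue path: past the high watermark, lift the batch limit. */
static void enqueue(struct cb_queue *q)
{
	if (++q->qlen > qhimark)
		q->blimit = INT_MAX;    /* drain everything in one shot */
}

/*
 * Batch path: invoke at most blimit callbacks, then restore the
 * default limit once the backlog has fallen below the low mark.
 */
static void do_batch(struct cb_queue *q)
{
	int count = 0;

	while (q->qlen > 0) {
		q->qlen--;              /* "invoke" one callback */
		if (++count >= q->blimit)
			break;
	}
	if (q->blimit == INT_MAX && q->qlen <= qlowmark)
		q->blimit = blimit_default;
}

int main(void)
{
	struct cb_queue q = { .qlen = 0, .blimit = blimit_default };

	for (int i = 0; i < 20000; i++)
		enqueue(&q);
	do_batch(&q);
	printf("qlen=%d blimit=%d\n", q.qlen, q.blimit);  /* qlen=0 blimit=10 */
	return 0;
}

Enqueueing past qhimark lifts the limit so the next batch drains the
whole backlog in one shot; once qlen is back at or below qlowmark, the
default limit is restored.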
From 529bf6be5c04f2e869d07bfdb122e9fd98ade714 Mon Sep 17 00:00:00 2001
From: Dipankar Sarma
Date: Tue, 7 Mar 2006 21:55:35 -0800
Subject: [PATCH] fix file counting

I have benchmarked this on an x86_64 NUMA system and see no significant
performance difference on kernbench.  Tested on both x86_64 and powerpc.

The way we do file struct accounting is not very suitable for batched
freeing.  For scalability reasons, file accounting was
constructor/destructor based.  This meant that nr_files was decremented
only when the object was removed from the slab cache.  This is
susceptible to slab fragmentation.  With the RCU-based file structure,
the consequent batched freeing, and a test program like Serge's, we just
speed this up and end up with a very fragmented slab:

llm22:~ # cat /proc/sys/fs/file-nr
587730  0       758844

At the same time, I see only 2000+ objects in the filp cache.  The
following patch fixes this problem.

This patch changes the file counting by removing the filp_count_lock.
Instead we use a separate percpu counter, nr_files, for now, and all
accesses to it are through the get_nr_files() API.  In the sysctl
handler for nr_files, we populate files_stat.nr_files before returning
to userspace.

Counting files as and when they are created and destroyed (as opposed
to inside slab) allows us to correctly count open files with RCU.

Signed-off-by: Dipankar Sarma
Cc: "Paul E. McKenney"
Cc: "David S. Miller"
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 kernel/sysctl.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index de2d9109194e..32b48e8ee36e 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -50,6 +50,9 @@
 #include
 #include
 
+extern int proc_nr_files(ctl_table *table, int write, struct file *filp,
+			 void __user *buffer, size_t *lenp, loff_t *ppos);
+
 #if defined(CONFIG_SYSCTL)
 
 /* External variables not in a header file. */
@@ -943,7 +946,7 @@ static ctl_table fs_table[] = {
 		.data		= &files_stat,
 		.maxlen		= 3*sizeof(int),
 		.mode		= 0444,
-		.proc_handler	= &proc_dointvec,
+		.proc_handler	= &proc_nr_files,
 	},
 	{
 		.ctl_name	= FS_MAXFILE,
-- 
cgit v1.2.2
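The percpu counting scheme that replaces the constructor/destructor
accounting can be sketched as follows.  This is an illustrative
userspace model with hypothetical names and a fixed "CPU" count, not
the actual fs/file_table.c code:

#include <stdio.h>

#define NR_CPUS 4   /* illustrative stand-in for the real CPU count */

/* One counter per "CPU": increments stay local and contention-free. */
static long nr_files_pcpu[NR_CPUS];

static void file_alloced(int cpu) { nr_files_pcpu[cpu]++; }
static void file_freed(int cpu)   { nr_files_pcpu[cpu]--; }

/*
 * get_nr_files(): fold the per-CPU values only when someone (e.g.
 * the sysctl handler filling files_stat.nr_files) asks for the total.
 */
static long get_nr_files(void)
{
	long sum = 0;

	for (int i = 0; i < NR_CPUS; i++)
		sum += nr_files_pcpu[i];
	return sum;
}

int main(void)
{
	file_alloced(0);
	file_alloced(1);
	file_freed(0);
	printf("open files: %ld\n", get_nr_files());   /* prints 1 */
	return 0;
}

Writes never bounce a shared cacheline between CPUs; the cost is that
reading the total is O(NR_CPUS), which is acceptable for an occasional
sysctl read.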
From 7cd9013be6c22f3ff6f777354f766c8c0b955e17 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig
Date: Sat, 11 Mar 2006 03:27:18 -0800
Subject: [PATCH] remove __put_task_struct_cb export again

The patch '[PATCH] RCU signal handling' [1] added an export for
__put_task_struct_cb, a put_task_struct helper newly introduced in that
patch.  But put_task_struct couldn't be used from modules previously,
as __put_task_struct wasn't exported.  There are no callers of it in
modular code, and it shouldn't be exported because we don't want
drivers to hold references to task_structs.

This patch removes the export and folds __put_task_struct into
__put_task_struct_cb, as there's no other caller.

[1] http://www2.kernel.org/git/gitweb.cgi?p=linux/kernel/git/torvalds/linux-2.6.git;a=commit;h=e56d090310d7625ecb43a1eeebd479f04affb48b

Signed-off-by: Christoph Hellwig
Acked-by: Paul E. McKenney
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 kernel/fork.c  | 4 +++-
 kernel/sched.c | 7 -------
 2 files changed, 3 insertions(+), 8 deletions(-)

(limited to 'kernel')

diff --git a/kernel/fork.c b/kernel/fork.c
index fbea12d7a943..a8eab86de7f1 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -108,8 +108,10 @@ void free_task(struct task_struct *tsk)
 }
 EXPORT_SYMBOL(free_task);
 
-void __put_task_struct(struct task_struct *tsk)
+void __put_task_struct_cb(struct rcu_head *rhp)
 {
+	struct task_struct *tsk = container_of(rhp, struct task_struct, rcu);
+
 	WARN_ON(!(tsk->exit_state & (EXIT_DEAD | EXIT_ZOMBIE)));
 	WARN_ON(atomic_read(&tsk->usage));
 	WARN_ON(tsk == current);
diff --git a/kernel/sched.c b/kernel/sched.c
index e82c99f1db64..4d46e90f59c3 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -178,13 +178,6 @@ static unsigned int task_timeslice(task_t *p)
 #define task_hot(p, now, sd) ((long long) ((now) - (p)->last_ran)	\
 			< (long long) (sd)->cache_hot_time)
 
-void __put_task_struct_cb(struct rcu_head *rhp)
-{
-	__put_task_struct(container_of(rhp, struct task_struct, rcu));
-}
-
-EXPORT_SYMBOL_GPL(__put_task_struct_cb);
-
 /*
  * These are the runqueue data structures:
  */
-- 
cgit v1.2.2

From f9a3879abf2f1a27c39915e6074b8ff15a24cb55 Mon Sep 17 00:00:00 2001
From: GOTO Masanori
Date: Mon, 13 Mar 2006 21:20:44 -0800
Subject: [PATCH] Fix sigaltstack corruption among cloned threads

This patch fixes alternate signal stack corruption among cloned threads
with CLONE_SIGHAND (and CLONE_VM) for linux-2.6.16-rc6.

The value of the alternate signal stack is currently inherited after a
call of clone(... CLONE_SIGHAND | CLONE_VM).  But if sigaltstack is set
by a parent thread, and multiple cloned child threads (+ parent
threads) then call a signal handler at the same time, the threads may
collide, because they share the same alternate signal stack region.
Finally they get SIGSEGV.  It's an undesirable race condition.  Note
that child threads created by NPTL pthread_create() also hit this
conflict when the parent thread uses sigaltstack, without my patch.

To fix this problem, this patch clears the child threads' sigaltstack
information, like exec() does.  This behavior follows the SUSv3
specification.  In SUSv3, pthread_create() says "The alternate stack
shall not be inherited (when new threads are initialized)".  It means
that sigaltstack should be cleared when the sigaltstack memory space is
shared by cloned threads with CLONE_SIGHAND.

Note that I chose the "if (clone_flags & CLONE_SIGHAND)" line because:
 - If the clone_flags test did not exist, fork() would not inherit
   sigaltstack.
 - CLONE_VM is another choice, but vfork() does not inherit
   sigaltstack.
 - CLONE_SIGHAND implies CLONE_VM, and it looks suitable.
 - CLONE_THREAD is another candidate, and includes CLONE_SIGHAND +
   CLONE_VM, but this flag has somewhat different semantics.
I decided to use CLONE_SIGHAND.

[ Changed to test for CLONE_VM && !CLONE_VFORK after discussion --Linus ]

Signed-off-by: GOTO Masanori
Cc: Roland McGrath
Cc: Ingo Molnar
Acked-by: Linus Torvalds
Cc: Ulrich Drepper
Cc: Jakub Jelinek
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 kernel/fork.c | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'kernel')

diff --git a/kernel/fork.c b/kernel/fork.c
index a8eab86de7f1..ccdfbb16c86d 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1061,6 +1061,12 @@ static task_t *copy_process(unsigned long clone_flags,
 	 */
 	p->clear_child_tid = (clone_flags & CLONE_CHILD_CLEARTID) ? child_tidptr: NULL;
 
+	/*
+	 * sigaltstack should be cleared when sharing the same VM
+	 */
+	if ((clone_flags & (CLONE_VM|CLONE_VFORK)) == CLONE_VM)
+		p->sas_ss_sp = p->sas_ss_size = 0;
+
 	/*
 	 * Syscall tracing should be turned off in the child regardless
 	 * of CLONE_PTRACE.
-- 
cgit v1.2.2

From e0e8eb54d8ae0c4cfd1d297f6351b08a7f635c5f Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman"
Date: Thu, 16 Mar 2006 10:31:38 -0700
Subject: [PATCH] unshare: Use rcu_assign_pointer when setting sighand

The sighand pointer only needs the rcu_read_lock on the read side.  So
depending only on task_lock protection when setting this pointer is not
enough.  We also need a memory barrier to ensure that the
initialization is seen first.

Use rcu_assign_pointer, as it does this for us and clearly documents
that we are setting an RCU-readable pointer.

Signed-off-by: Eric W. Biederman
Acked-by: Paul E. McKenney
Signed-off-by: Linus Torvalds
---
 kernel/fork.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/fork.c b/kernel/fork.c
index ccdfbb16c86d..46060cb24af0 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1569,7 +1569,7 @@ asmlinkage long sys_unshare(unsigned long unshare_flags)
 
 	if (new_sigh) {
 		sigh = current->sighand;
-		current->sighand = new_sigh;
+		rcu_assign_pointer(current->sighand, new_sigh);
 		new_sigh = sigh;
 	}
 
-- 
cgit v1.2.2
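The ordering guarantee that rcu_assign_pointer() adds in the patch
above can be illustrated with C11 atomics.  This is a userspace analogy
with a hypothetical structure and names, not the kernel implementation:

#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

struct sighand { int count; };   /* hypothetical stand-in structure */

/* The shared pointer that lockless readers may follow. */
static _Atomic(struct sighand *) cur_sighand;

/*
 * Publisher: fully initialize the object, *then* publish it with
 * release semantics so a reader can never observe a half-initialized
 * structure through the pointer.  This store ordering is what
 * rcu_assign_pointer() guarantees; the plain assignment it replaces
 * does not.
 */
static void publish(struct sighand *sh)
{
	sh->count = 1;                          /* initialization first */
	atomic_store_explicit(&cur_sighand, sh,
			      memory_order_release);  /* publication last */
}

/* Reader: the counterpart of rcu_dereference() on the read side. */
static int read_count(void)
{
	struct sighand *s = atomic_load_explicit(&cur_sighand,
						 memory_order_acquire);
	return s ? s->count : -1;
}

int main(void)
{
	struct sighand *sh = malloc(sizeof(*sh));

	publish(sh);
	printf("count=%d\n", read_count());     /* prints 1 */
	free(sh);
	return 0;
}

In the kernel, rcu_dereference() is weaker than the acquire load used
here (it relies on data-dependency ordering), but the publisher-side
release barrier is exactly the part this patch adds.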
From 67890d7084085e29c51afa2514036d42643fd3cf Mon Sep 17 00:00:00 2001
From: Christoph Lameter
Date: Thu, 16 Mar 2006 23:04:00 -0800
Subject: [PATCH] time_interpolator: add __read_mostly

The pointer to the current time interpolator and the current list of
time interpolators are typically only changed during bootup.  Adding
__read_mostly takes them away from possibly hot cachelines.

Signed-off-by: Christoph Lameter
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 kernel/timer.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/timer.c b/kernel/timer.c
index bf7c4193b936..2410c18dbeb1 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -1354,8 +1354,8 @@ void __init init_timers(void)
 
 #ifdef CONFIG_TIME_INTERPOLATION
 
-struct time_interpolator *time_interpolator;
-static struct time_interpolator *time_interpolator_list;
+struct time_interpolator *time_interpolator __read_mostly;
+static struct time_interpolator *time_interpolator_list __read_mostly;
 static DEFINE_SPINLOCK(time_interpolator_lock);
 
 static inline u64 time_interpolator_get_cycles(unsigned int src)
-- 
cgit v1.2.2

From a0a0c28c1a7109d7955815074c52cac079ab3ba5 Mon Sep 17 00:00:00 2001
From: Roman Zippel
Date: Thu, 16 Mar 2006 23:04:01 -0800
Subject: [PATCH] posix-timers: fix requeue accounting when signal is ignored

When the posix-timer signal is ignored, the timer is rearmed by the
callback function.  The requeue pending accounting has to be fixed up,
else the state might be wrong.

Signed-off-by: Roman Zippel
Signed-off-by: Thomas Gleixner
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 kernel/posix-timers.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'kernel')

diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c
index 216f574b5ffb..fa895fc2ecf5 100644
--- a/kernel/posix-timers.c
+++ b/kernel/posix-timers.c
@@ -353,6 +353,7 @@ static int posix_timer_fn(void *data)
 			hrtimer_forward(&timr->it.real.timer,
 					timr->it.real.interval);
 			ret = HRTIMER_RESTART;
+			++timr->it_requeue_pending;
 		}
 	}
 
-- 
cgit v1.2.2

From 2d61b86775a5676a8fba2ba2f0f869564e35c630 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov
Date: Sat, 18 Mar 2006 20:41:10 +0300
Subject: [PATCH] disable unshare(CLONE_VM) for now

sys_unshare() does mmput(new_mm).  This is not enough if we have
mm->core_waiters.

This patch is a temporary fix for the soon-to-be-released 2.6.16.

Signed-off-by: Oleg Nesterov

[ Checked with Uli: "I'm not planning to use unshare(CLONE_VM).  It's
  not needed for any functionality planned so far.  What we (as in Red
  Hat) need unshare() for now is the filesystem side." ]

Signed-off-by: Linus Torvalds
---
 kernel/fork.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/fork.c b/kernel/fork.c
index 46060cb24af0..b373322ca497 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1478,9 +1478,7 @@ static int unshare_vm(unsigned long unshare_flags, struct mm_struct **new_mmp)
 
 	if ((unshare_flags & CLONE_VM) &&
 	    (mm && atomic_read(&mm->mm_users) > 1)) {
-		*new_mmp = dup_mm(current);
-		if (!*new_mmp)
-			return -ENOMEM;
+		return -EINVAL;
 	}
 
 	return 0;
-- 
cgit v1.2.2
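The effect of this last patch can be observed from userspace with a
small test.  This is a hypothetical demo; it assumes a kernel in which
unshare(CLONE_VM) on a multithreaded process (mm_users > 1) is rejected,
as it is after this patch:

#define _GNU_SOURCE
#include <errno.h>
#include <pthread.h>
#include <sched.h>
#include <stdio.h>
#include <unistd.h>

static void *idle_thread(void *arg)
{
	(void)arg;
	pause();            /* keep a second user of the mm alive */
	return NULL;
}

int main(void)
{
	pthread_t t;

	pthread_create(&t, NULL, idle_thread, NULL);
	sleep(1);           /* crude: give the thread time to start */

	/* mm_users > 1, so unsharing the VM is rejected outright. */
	if (unshare(CLONE_VM) == -1)
		perror("unshare(CLONE_VM)");   /* expected: Invalid argument */
	else
		printf("unshare(CLONE_VM) succeeded\n");
	return 0;
}

Build with -pthread.  The exact errno depends on the kernel version;
this patch makes the multithreaded case fail with EINVAL instead of
attempting (and mishandling) mm duplication.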