From 83c7d09173fdb6b06b109e65895392db3e49ac9c Mon Sep 17 00:00:00 2001 From: Date: Fri, 29 Apr 2005 15:54:44 +0100 Subject: AUDIT: Avoid log pollution by untrusted strings. We log strings from userspace, such as arguments to open(). These could be formatted to contain \n followed by fake audit log entries. Provide a function for logging such strings, which gives a hex dump when the string contains anything but basic printable ASCII characters. Use it for logging filenames. Signed-off-by: David Woodhouse --- kernel/audit.c | 23 +++++++++++++++++++++++ kernel/auditsc.c | 7 ++++--- 2 files changed, 27 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index 0f84dd7af2..dca7b99615 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -720,6 +720,29 @@ void audit_log_format(struct audit_buffer *ab, const char *fmt, ...) va_end(args); } +void audit_log_hex(struct audit_buffer *ab, const unsigned char *buf, size_t len) +{ + int i; + + for (i=0; i<len; i++) + audit_log_format(ab, "%02x", buf[i]); +} + +void audit_log_untrustedstring(struct audit_buffer *ab, const char *string) +{ + const char *p = string; + + while (*p) { + if (*p == '"' || *p == ' ' || *p < 0x20 || *p > 0x7f) { + audit_log_hex(ab, string, strlen(string)); + return; + } + p++; + } + audit_log_format(ab, "\"%s\"", string); +} + + /* This is a helper-function to print the d_path without using a static * buffer or allocating another buffer in addition to the one in * audit_buffer. */ diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 6f1931381b..00e87ffff1 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -696,9 +696,10 @@ static void audit_log_exit(struct audit_context *context) if (!ab) continue; /* audit_panic has been called */ audit_log_format(ab, "item=%d", i); - if (context->names[i].name) - audit_log_format(ab, " name=%s", - context->names[i].name); + if (context->names[i].name) { + audit_log_format(ab, " name="); + audit_log_untrustedstring(ab, context->names[i].name); + } if (context->names[i].ino != (unsigned long)-1) audit_log_format(ab, " inode=%lu dev=%02x:%02x mode=%#o" " uid=%d gid=%d rdev=%02x:%02x", -- cgit v1.2.2 From 81b7854d52d35ed2353dd47033ae630d18322a2d Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Fri, 29 Apr 2005 15:59:11 +0100 Subject: audit_log_untrustedstring() warning fix kernel/audit.c: In function `audit_log_untrustedstring': kernel/audit.c:736: warning: comparison is always false due to limited range of data type Signed-off-by: Andrew Morton Signed-off-by: David Woodhouse --- kernel/audit.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index dca7b99615..e7bff8000d 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -730,7 +730,7 @@ void audit_log_hex(struct audit_buffer *ab, const unsigned char *buf, size_t len void audit_log_untrustedstring(struct audit_buffer *ab, const char *string) { - const char *p = string; + const unsigned char *p = string; while (*p) { if (*p == '"' || *p == ' ' || *p < 0x20 || *p > 0x7f) { -- cgit v1.2.2 From 2fd6f58ba6efc82ea2c9c2630f7ff5ed9eeaf34a Mon Sep 17 00:00:00 2001 From: Date: Fri, 29 Apr 2005 16:08:28 +0100 Subject: [AUDIT] Don't allow ptrace to fool auditing, log arch of audited syscalls. We were calling ptrace_notify() after auditing the syscall and arguments, but the debugger could have _changed_ them before the syscall was actually invoked. Reorder the calls to fix that. While we're touching every call to audit_syscall_entry(), we also make it take an extra argument: the architecture of the syscall which was made, because some architectures allow more than one type of syscall. 
Also add an explicit success/failure flag to audit_syscall_exit(), for the benefit of architectures which return that in a condition register rather than only returning a single register. Change type of syscall return value to 'long' not 'int'. Signed-off-by: David Woodhouse --- kernel/auditsc.c | 22 ++++++++++++++++------ 1 file changed, 16 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 00e87ffff1..77e92592de 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -123,7 +123,7 @@ struct audit_context { int major; /* syscall number */ unsigned long argv[4]; /* syscall arguments */ int return_valid; /* return code is valid */ - int return_code;/* syscall return code */ + long return_code;/* syscall return code */ int auditable; /* 1 if record should be written */ int name_count; struct audit_names names[AUDIT_NAMES]; @@ -135,6 +135,7 @@ struct audit_context { uid_t uid, euid, suid, fsuid; gid_t gid, egid, sgid, fsgid; unsigned long personality; + int arch; #if AUDIT_DEBUG int put_count; @@ -348,6 +349,10 @@ static int audit_filter_rules(struct task_struct *tsk, case AUDIT_PERS: result = (tsk->personality == value); break; + case AUDIT_ARCH: + if (ctx) + result = (ctx->arch == value); + break; case AUDIT_EXIT: if (ctx && ctx->return_valid) @@ -355,7 +360,7 @@ static int audit_filter_rules(struct task_struct *tsk, break; case AUDIT_SUCCESS: if (ctx && ctx->return_valid) - result = (ctx->return_code >= 0); + result = (ctx->return_valid == AUDITSC_SUCCESS); break; case AUDIT_DEVMAJOR: if (ctx) { @@ -648,8 +653,11 @@ static void audit_log_exit(struct audit_context *context) audit_log_format(ab, "syscall=%d", context->major); if (context->personality != PER_LINUX) audit_log_format(ab, " per=%lx", context->personality); + audit_log_format(ab, " arch=%x", context->arch); if (context->return_valid) - audit_log_format(ab, " exit=%d", context->return_code); + audit_log_format(ab, " success=%s exit=%ld", + (context->return_valid==AUDITSC_SUCCESS)?"yes":"no", + context->return_code); audit_log_format(ab, " a0=%lx a1=%lx a2=%lx a3=%lx items=%d" " pid=%d loginuid=%d uid=%d gid=%d" @@ -773,7 +781,7 @@ static inline unsigned int audit_serial(void) * then the record will be written at syscall exit time (otherwise, it * will only be written if another part of the kernel requests that it * be written). */ -void audit_syscall_entry(struct task_struct *tsk, int major, +void audit_syscall_entry(struct task_struct *tsk, int arch, int major, unsigned long a1, unsigned long a2, unsigned long a3, unsigned long a4) { @@ -827,6 +835,7 @@ void audit_syscall_entry(struct task_struct *tsk, int major, if (!audit_enabled) return; + context->arch = arch; context->major = major; context->argv[0] = a1; context->argv[1] = a2; @@ -850,13 +859,13 @@ void audit_syscall_entry(struct task_struct *tsk, int major, * filtering, or because some other part of the kernel write an audit * message), then write out the syscall information. In call cases, * free the names stored from getname(). 
*/ -void audit_syscall_exit(struct task_struct *tsk, int return_code) +void audit_syscall_exit(struct task_struct *tsk, int valid, long return_code) { struct audit_context *context; get_task_struct(tsk); task_lock(tsk); - context = audit_get_context(tsk, 1, return_code); + context = audit_get_context(tsk, valid, return_code); task_unlock(tsk); /* Not having a context here is ok, since the parent may have @@ -869,6 +878,7 @@ void audit_syscall_exit(struct task_struct *tsk, int return_code) context->in_syscall = 0; context->auditable = 0; + if (context->previous) { struct audit_context *new_context = context->previous; context->previous = NULL; -- cgit v1.2.2 From d812ddbb89e323d054a7d073466225966c8350c8 Mon Sep 17 00:00:00 2001 From: Steve Grubb Date: Fri, 29 Apr 2005 16:09:52 +0100 Subject: [AUDIT] Fix signedness of 'serial' in various routines. Attached is a patch that corrects a signed/unsigned warning. I also noticed that we needlessly init serial to 0. That only needs to occur if the kernel was compiled without the audit system. -Steve Grubb Signed-off-by: David Woodhouse --- kernel/audit.c | 6 ++++-- kernel/auditsc.c | 2 +- 2 files changed, 5 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index e7bff8000d..aa35422c0c 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -620,7 +620,7 @@ struct audit_buffer *audit_log_start(struct audit_context *ctx) struct audit_buffer *ab = NULL; unsigned long flags; struct timespec t; - int serial = 0; + unsigned int serial; if (!audit_initialized) return NULL; @@ -669,8 +669,10 @@ struct audit_buffer *audit_log_start(struct audit_context *ctx) audit_get_stamp(ab->ctx, &t, &serial); else #endif + { t = CURRENT_TIME; - + serial = 0; + } audit_log_format(ab, "audit(%lu.%03lu:%u): ", t.tv_sec, t.tv_nsec/1000000, serial); return ab; diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 77e92592de..49ecd707b9 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -992,7 +992,7 @@ void audit_inode(const char *name, const struct inode *inode) } void audit_get_stamp(struct audit_context *ctx, - struct timespec *t, int *serial) + struct timespec *t, unsigned int *serial) { if (ctx) { t->tv_sec = ctx->ctime.tv_sec; -- cgit v1.2.2 From c7fcb0ee74ef4cfdea02befacb55945c93641e44 Mon Sep 17 00:00:00 2001 From: Peter Martuccelli Date: Fri, 29 Apr 2005 16:10:24 +0100 Subject: [AUDIT] Avoid using %*.*s format strings. They don't seem to work correctly (investigation ongoing), but we don't actually need to do it anyway. Patch from Peter Martuccelli Signed-off-by: David Woodhouse --- kernel/audit.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index aa35422c0c..42ce282728 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -540,8 +540,8 @@ static inline int audit_log_drain(struct audit_buffer *ab) if (!audit_pid) { /* No daemon */ int offset = ab->nlh ? 
NLMSG_SPACE(0) : 0; int len = skb->len - offset; - printk(KERN_ERR "%*.*s\n", - len, len, skb->data + offset); + skb->data[offset + len] = '\0'; + printk(KERN_ERR "%s\n", skb->data + offset); } kfree_skb(skb); ab->nlh = NULL; -- cgit v1.2.2 From 85c8721ff3bc96b702427a440616079e8daf8a2f Mon Sep 17 00:00:00 2001 From: Date: Fri, 29 Apr 2005 16:23:29 +0100 Subject: audit: update pointer to userspace tools, remove emacs mode tags --- kernel/audit.c | 4 ++-- kernel/auditsc.c | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index 42ce282728..58c7d7e472 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -1,4 +1,4 @@ -/* audit.c -- Auditing support -*- linux-c -*- +/* audit.c -- Auditing support * Gateway between the kernel (e.g., selinux) and the user-space audit daemon. * System-call specific features have moved to auditsc.c * @@ -38,7 +38,7 @@ * 6) Support low-overhead kernel-based filtering to minimize the * information that must be passed to user-space. * - * Example user-space utilities: http://people.redhat.com/faith/audit/ + * Example user-space utilities: http://people.redhat.com/sgrubb/audit/ */ #include <linux/init.h> diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 49ecd707b9..9ff2c1b103 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -1,4 +1,4 @@ -/* auditsc.c -- System-call auditing support -*- linux-c -*- +/* auditsc.c -- System-call auditing support * Handles all system-call specific auditing features. * * Copyright 2003-2004 Red Hat Inc., Durham, North Carolina. -- cgit v1.2.2 From c94c257c88c517f251da273a15c654224c7b6e21 Mon Sep 17 00:00:00 2001 From: Serge Hallyn Date: Fri, 29 Apr 2005 16:27:17 +0100 Subject: Add audit uid to netlink credentials Most audit control messages are sent over netlink. In order to properly log the identity of the sender of audit control messages, we would like to add the loginuid to the netlink_creds structure, as per the attached patch. 
Signed-off-by: Serge Hallyn Signed-off-by: David Woodhouse --- kernel/audit.c | 46 +++++++++++++++++++++++++--------------------- kernel/auditsc.c | 5 ++++- 2 files changed, 29 insertions(+), 22 deletions(-) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index 58c7d7e472..587d3b2eba 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -239,36 +239,36 @@ void audit_log_lost(const char *message) } -static int audit_set_rate_limit(int limit) +static int audit_set_rate_limit(int limit, uid_t loginuid) { int old = audit_rate_limit; audit_rate_limit = limit; - audit_log(current->audit_context, "audit_rate_limit=%d old=%d", - audit_rate_limit, old); + audit_log(NULL, "audit_rate_limit=%d old=%d by auid %u", + audit_rate_limit, old, loginuid); return old; } -static int audit_set_backlog_limit(int limit) +static int audit_set_backlog_limit(int limit, uid_t loginuid) { int old = audit_backlog_limit; audit_backlog_limit = limit; - audit_log(current->audit_context, "audit_backlog_limit=%d old=%d", - audit_backlog_limit, old); + audit_log(NULL, "audit_backlog_limit=%d old=%d by auid %u", + audit_backlog_limit, old, loginuid); return old; } -static int audit_set_enabled(int state) +static int audit_set_enabled(int state, uid_t loginuid) { int old = audit_enabled; if (state != 0 && state != 1) return -EINVAL; audit_enabled = state; - audit_log(current->audit_context, "audit_enabled=%d old=%d", - audit_enabled, old); + audit_log(NULL, "audit_enabled=%d old=%d by auid %u", + audit_enabled, old, loginuid); return old; } -static int audit_set_failure(int state) +static int audit_set_failure(int state, uid_t loginuid) { int old = audit_failure; if (state != AUDIT_FAIL_SILENT @@ -276,8 +276,8 @@ static int audit_set_failure(int state) && state != AUDIT_FAIL_PANIC) return -EINVAL; audit_failure = state; - audit_log(current->audit_context, "audit_failure=%d old=%d", - audit_failure, old); + audit_log(NULL, "audit_failure=%d old=%d by auid %u", + audit_failure, old, loginuid); return old; } @@ -344,6 +344,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) int err; struct audit_buffer *ab; u16 msg_type = nlh->nlmsg_type; + uid_t loginuid; /* loginuid of sender */ err = audit_netlink_ok(NETLINK_CB(skb).eff_cap, msg_type); if (err) @@ -351,6 +352,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) pid = NETLINK_CREDS(skb)->pid; uid = NETLINK_CREDS(skb)->uid; + loginuid = NETLINK_CB(skb).loginuid; seq = nlh->nlmsg_seq; data = NLMSG_DATA(nlh); @@ -371,34 +373,36 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) return -EINVAL; status_get = (struct audit_status *)data; if (status_get->mask & AUDIT_STATUS_ENABLED) { - err = audit_set_enabled(status_get->enabled); + err = audit_set_enabled(status_get->enabled, loginuid); if (err < 0) return err; } if (status_get->mask & AUDIT_STATUS_FAILURE) { - err = audit_set_failure(status_get->failure); + err = audit_set_failure(status_get->failure, loginuid); if (err < 0) return err; } if (status_get->mask & AUDIT_STATUS_PID) { int old = audit_pid; audit_pid = status_get->pid; - audit_log(current->audit_context, - "audit_pid=%d old=%d", audit_pid, old); + audit_log(NULL, "audit_pid=%d old=%d by auid %u", + audit_pid, old, loginuid); } if (status_get->mask & AUDIT_STATUS_RATE_LIMIT) - audit_set_rate_limit(status_get->rate_limit); + audit_set_rate_limit(status_get->rate_limit, loginuid); if (status_get->mask & AUDIT_STATUS_BACKLOG_LIMIT) - audit_set_backlog_limit(status_get->backlog_limit); 
+ audit_set_backlog_limit(status_get->backlog_limit, + loginuid); break; case AUDIT_USER: ab = audit_log_start(NULL); if (!ab) break; /* audit_panic has been called */ audit_log_format(ab, - "user pid=%d uid=%d length=%d msg='%.1024s'", + "user pid=%d uid=%d length=%d loginuid=%u" + " msg='%.1024s'", pid, uid, (int)(nlh->nlmsg_len - ((char *)data - (char *)nlh)), - (char *)data); + loginuid, (char *)data); ab->type = AUDIT_USER; ab->pid = pid; audit_log_end(ab); @@ -411,7 +415,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) case AUDIT_LIST: #ifdef CONFIG_AUDITSYSCALL err = audit_receive_filter(nlh->nlmsg_type, NETLINK_CB(skb).pid, - uid, seq, data); + uid, seq, data, loginuid); #else err = -EOPNOTSUPP; #endif diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 9ff2c1b103..66148f81d7 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -251,7 +251,8 @@ static int audit_copy_rule(struct audit_rule *d, struct audit_rule *s) return 0; } -int audit_receive_filter(int type, int pid, int uid, int seq, void *data) +int audit_receive_filter(int type, int pid, int uid, int seq, void *data, + uid_t loginuid) { u32 flags; struct audit_entry *entry; @@ -286,6 +287,7 @@ int audit_receive_filter(int type, int pid, int uid, int seq, void *data) err = audit_add_rule(entry, &audit_entlist); if (!err && (flags & AUDIT_AT_EXIT)) err = audit_add_rule(entry, &audit_extlist); + audit_log(NULL, "auid %u added an audit rule\n", loginuid); break; case AUDIT_DEL: flags =((struct audit_rule *)data)->flags; @@ -295,6 +297,7 @@ int audit_receive_filter(int type, int pid, int uid, int seq, void *data) err = audit_del_rule(data, &audit_entlist); if (!err && (flags & AUDIT_AT_EXIT)) err = audit_del_rule(data, &audit_extlist); + audit_log(NULL, "auid %u removed an audit rule\n", loginuid); break; default: return -EINVAL; -- cgit v1.2.2 From 37509e749dc2072e667db806ef24b9e897f61b8a Mon Sep 17 00:00:00 2001 From: Chris Wright Date: Fri, 29 Apr 2005 17:19:14 +0100 Subject: [AUDIT] Requeue messages at head of queue, up to audit_backlog If netlink_unicast() fails, requeue the skb back at the head of the queue it just came from, instead of the tail. And do so unless we've exceeded the audit_backlog limit; not according to some other arbitrary limit. 
From: Chris Wright Signed-off-by: David Woodhouse --- kernel/audit.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index 587d3b2eba..4a697c73fa 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -142,7 +142,6 @@ struct audit_buffer { int total; int type; int pid; - int count; /* Times requeued */ }; void audit_set_type(struct audit_buffer *ab, int type) @@ -526,9 +525,9 @@ static inline int audit_log_drain(struct audit_buffer *ab) retval = netlink_unicast(audit_sock, skb, audit_pid, MSG_DONTWAIT); } - if (retval == -EAGAIN && ab->count < 5) { - ++ab->count; - skb_queue_tail(&ab->sklist, skb); + if (retval == -EAGAIN && + (atomic_read(&audit_backlog)) < audit_backlog_limit) { + skb_queue_head(&ab->sklist, skb); audit_log_end_irq(ab); return 1; } @@ -666,7 +665,6 @@ struct audit_buffer *audit_log_start(struct audit_context *ctx) ab->total = 0; ab->type = AUDIT_KERNEL; ab->pid = 0; - ab->count = 0; #ifdef CONFIG_AUDITSYSCALL if (ab->ctx) -- cgit v1.2.2 From 456be6cd90dbbb9b0ea01d56932d56d110d51cf7 Mon Sep 17 00:00:00 2001 From: Steve Grubb Date: Fri, 29 Apr 2005 17:30:07 +0100 Subject: [AUDIT] LOGIN message credentials Attached is a new patch that solves the issue of getting valid credentials into the LOGIN message. The current code was assuming that the audit context had already been copied. This is not always the case for LOGIN messages. To solve the problem, the patch passes the task struct to the function that emits the message where it can get valid credentials. Signed-off-by: Steve Grubb Signed-off-by: David Woodhouse --- kernel/auditsc.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 66148f81d7..37b3ac94bc 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -1010,20 +1010,21 @@ void audit_get_stamp(struct audit_context *ctx, extern int audit_set_type(struct audit_buffer *ab, int type); -int audit_set_loginuid(struct audit_context *ctx, uid_t loginuid) +int audit_set_loginuid(struct task_struct *task, uid_t loginuid) { - if (ctx) { + if (task->audit_context) { struct audit_buffer *ab; ab = audit_log_start(NULL); if (ab) { audit_log_format(ab, "login pid=%d uid=%u " "old loginuid=%u new loginuid=%u", - ctx->pid, ctx->uid, ctx->loginuid, loginuid); + task->pid, task->uid, + task->audit_context->loginuid, loginuid); audit_set_type(ab, AUDIT_LOGIN); audit_log_end(ab); } - ctx->loginuid = loginuid; + task->audit_context->loginuid = loginuid; } return 0; } -- cgit v1.2.2 From 0dd8e06bdaa0a97e706ee1a489a1f6176c4ddc64 Mon Sep 17 00:00:00 2001 From: Chris Wright Date: Tue, 3 May 2005 14:01:15 +0100 Subject: [PATCH] add new audit data to last skb When adding more formatted audit data to an skb for delivery to userspace, the kernel will attempt to reuse an skb that has spare room. However, if the audit message has already been fragmented to multiple skb's, the search for spare room in the skb uses the head of the list. This will corrupt the audit message with trailing bytes being placed midway through the stream. Fix is to look at the end of the list. 
Signed-off-by: Chris Wright Signed-off-by: David Woodhouse --- kernel/audit.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index 4a697c73fa..00455a9cf0 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -486,7 +486,7 @@ static void audit_log_move(struct audit_buffer *ab) if (ab->len == 0) return; - skb = skb_peek(&ab->sklist); + skb = skb_peek_tail(&ab->sklist); if (!skb || skb_tailroom(skb) <= ab->len + extra) { skb = alloc_skb(2 * ab->len + extra, GFP_ATOMIC); if (!skb) { -- cgit v1.2.2 From 012914dad25bd5cacf88af4429eecda62a06020d Mon Sep 17 00:00:00 2001 From: Russ Anderson <(rja@sgi.com)> Date: Sat, 23 Apr 2005 00:08:00 -0700 Subject: [patch] MCA recovery module undefined symbol fix The patch "MCA recovery improvements" added do_exit to mca_drv.c. That's fine when the mca recovery code is built in the kernel (CONFIG_IA64_MCA_RECOVERY=y) but breaks building the mca recovery code as a module (CONFIG_IA64_MCA_RECOVERY=m). Most users are currently building this as a module, as loading and unloading the module provides a very convenient way to turn on/off error recovery. This patch exports do_exit, so mca_drv.c can build as a module. Signed-off-by: Russ Anderson (rja@sgi.com) Signed-off-by: Tony Luck --- kernel/exit.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index 7be283d989..edaa50b5bb 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -846,6 +846,8 @@ fastcall NORET_TYPE void do_exit(long code) for (;;) ; } +EXPORT_SYMBOL_GPL(do_exit); + NORET_TYPE void complete_and_exit(struct completion *comp, long code) { if (comp) -- cgit v1.2.2 From 075d6eb16d273dab7b7b4b83fcee8bce4ee387ed Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Thu, 5 May 2005 16:15:09 -0700 Subject: [PATCH] ppc32: platform-specific functions missing from kallsyms. The PPC32 kernel puts platform-specific functions into separate sections so that unneeded parts of it can be freed when we've booted and actually worked out what we're running on today. This makes kallsyms ignore those functions, because they're not between _[se]text or _[se]inittext. Rather than teaching kallsyms about the various pmac/chrp/etc sections, this patch adds '_[se]extratext' markers for kallsyms. 
Signed-off-by: David Woodhouse Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kallsyms.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c index 1627f8d6e0..13bcec151b 100644 --- a/kernel/kallsyms.c +++ b/kernel/kallsyms.c @@ -46,6 +46,14 @@ static inline int is_kernel_inittext(unsigned long addr) return 0; } +static inline int is_kernel_extratext(unsigned long addr) +{ + if (addr >= (unsigned long)_sextratext + && addr <= (unsigned long)_eextratext) + return 1; + return 0; +} + static inline int is_kernel_text(unsigned long addr) { if (addr >= (unsigned long)_stext && addr <= (unsigned long)_etext) @@ -169,8 +177,9 @@ const char *kallsyms_lookup(unsigned long addr, namebuf[0] = 0; if ((all_var && is_kernel(addr)) || - (!all_var && (is_kernel_text(addr) || is_kernel_inittext(addr)))) { - unsigned long symbol_end=0; + (!all_var && (is_kernel_text(addr) || is_kernel_inittext(addr) || + is_kernel_extratext(addr)))) { + unsigned long symbol_end = 0; /* do a binary search on the sorted kallsyms_addresses array */ low = 0; -- cgit v1.2.2 From 7d12e522ba13ce718b7ec32b75803dece8adb072 Mon Sep 17 00:00:00 2001 From: Anton Blanchard Date: Thu, 5 May 2005 16:15:11 -0700 Subject: [PATCH] ppc64: remove hidden -fno-omit-frame-pointer for schedule.c While looking at code generated by gcc4.0 I noticed some functions still had frame pointers, even after we stopped ppc64 from defining CONFIG_FRAME_POINTER. It turns out kernel/Makefile hardwires -fno-omit-frame-pointer on when compiling schedule.c. Create CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER and define it on architectures that dont require frame pointers in sched.c code. (akpm: blame me for the name) Signed-off-by: Anton Blanchard Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/Makefile b/kernel/Makefile index eb88b446c2..b01d26fe8d 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -29,7 +29,7 @@ obj-$(CONFIG_SYSFS) += ksysfs.o obj-$(CONFIG_GENERIC_HARDIRQS) += irq/ obj-$(CONFIG_SECCOMP) += seccomp.o -ifneq ($(CONFIG_IA64),y) +ifneq ($(CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER),y) # According to Alan Modra , the -fno-omit-frame-pointer is # needed for x86 only. Why this used to be enabled for all architectures is beyond # me. I suspect most platforms don't need this, but until we know that for sure -- cgit v1.2.2 From 04dea5f93231204cc3ca0ab793ce76dbb10c86ba Mon Sep 17 00:00:00 2001 From: Prasanna S Panchamukhi Date: Thu, 5 May 2005 16:15:41 -0700 Subject: [PATCH] Kprobes: Oops! in unregister_kprobe() kernel oops! when unregister_kprobe() is called on a non-registered kprobe. This patch fixes the above problem by checking if the probe exists before unregistering. 
Signed-off-by: Prasanna S Panchamukhi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kprobes.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 1d5dd1337b..d8903e60c9 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -107,13 +107,17 @@ rm_kprobe: void unregister_kprobe(struct kprobe *p) { unsigned long flags; - arch_remove_kprobe(p); spin_lock_irqsave(&kprobe_lock, flags); + if (!get_kprobe(p->addr)) { + spin_unlock_irqrestore(&kprobe_lock, flags); + return; + } *p->addr = p->opcode; hlist_del(&p->hlist); flush_icache_range((unsigned long) p->addr, (unsigned long) p->addr + sizeof(kprobe_opcode_t)); spin_unlock_irqrestore(&kprobe_lock, flags); + arch_remove_kprobe(p); } static struct notifier_block kprobe_exceptions_nb = { -- cgit v1.2.2 From 64f562c6df3cfc5d1b2b4bdbcb7951457df9c237 Mon Sep 17 00:00:00 2001 From: Ananth N Mavinakayanahalli Date: Thu, 5 May 2005 16:15:42 -0700 Subject: [PATCH] kprobes: Allow multiple kprobes at the same address Allow registration of multiple kprobes at an address in an architecture agnostic way. Corresponding handlers will be invoked in a sequence. But, a kprobe and a jprobe can't (yet) co-exist at the same address. Signed-off-by: Ananth N Mavinakayanahalli Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kprobes.c | 144 ++++++++++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 131 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/kprobes.c b/kernel/kprobes.c index d8903e60c9..037142b72a 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -44,6 +44,7 @@ static struct hlist_head kprobe_table[KPROBE_TABLE_SIZE]; unsigned int kprobe_cpu = NR_CPUS; static DEFINE_SPINLOCK(kprobe_lock); +static struct kprobe *curr_kprobe; /* Locks kprobe: irqs must be disabled */ void lock_kprobes(void) @@ -73,22 +74,139 @@ struct kprobe *get_kprobe(void *addr) return NULL; } +/* + * Aggregate handlers for multiple kprobes support - these handlers + * take care of invoking the individual kprobe handlers on p->list + */ +int aggr_pre_handler(struct kprobe *p, struct pt_regs *regs) +{ + struct kprobe *kp; + + list_for_each_entry(kp, &p->list, list) { + if (kp->pre_handler) { + curr_kprobe = kp; + kp->pre_handler(kp, regs); + curr_kprobe = NULL; + } + } + return 0; +} + +void aggr_post_handler(struct kprobe *p, struct pt_regs *regs, + unsigned long flags) +{ + struct kprobe *kp; + + list_for_each_entry(kp, &p->list, list) { + if (kp->post_handler) { + curr_kprobe = kp; + kp->post_handler(kp, regs, flags); + curr_kprobe = NULL; + } + } + return; +} + +int aggr_fault_handler(struct kprobe *p, struct pt_regs *regs, int trapnr) +{ + /* + * if we faulted "during" the execution of a user specified + * probe handler, invoke just that probe's fault handler + */ + if (curr_kprobe && curr_kprobe->fault_handler) { + if (curr_kprobe->fault_handler(curr_kprobe, regs, trapnr)) + return 1; + } + return 0; +} + +/* + * Fill in the required fields of the "manager kprobe". 
Replace the + * earlier kprobe in the hlist with the manager kprobe + */ +static inline void add_aggr_kprobe(struct kprobe *ap, struct kprobe *p) +{ + ap->addr = p->addr; + ap->opcode = p->opcode; + memcpy(&ap->ainsn, &p->ainsn, sizeof(struct arch_specific_insn)); + + ap->pre_handler = aggr_pre_handler; + ap->post_handler = aggr_post_handler; + ap->fault_handler = aggr_fault_handler; + + INIT_LIST_HEAD(&ap->list); + list_add(&p->list, &ap->list); + + INIT_HLIST_NODE(&ap->hlist); + hlist_del(&p->hlist); + hlist_add_head(&ap->hlist, + &kprobe_table[hash_ptr(ap->addr, KPROBE_HASH_BITS)]); +} + +/* + * This is the second or subsequent kprobe at the address - handle + * the intricacies + * TODO: Move kcalloc outside the spinlock + */ +static int register_aggr_kprobe(struct kprobe *old_p, struct kprobe *p) +{ + int ret = 0; + struct kprobe *ap; + + if (old_p->break_handler || p->break_handler) { + ret = -EEXIST; /* kprobe and jprobe can't (yet) coexist */ + } else if (old_p->pre_handler == aggr_pre_handler) { + list_add(&p->list, &old_p->list); + } else { + ap = kcalloc(1, sizeof(struct kprobe), GFP_ATOMIC); + if (!ap) + return -ENOMEM; + add_aggr_kprobe(ap, old_p); + list_add(&p->list, &ap->list); + } + return ret; +} + +/* kprobe removal house-keeping routines */ +static inline void cleanup_kprobe(struct kprobe *p, unsigned long flags) +{ + *p->addr = p->opcode; + hlist_del(&p->hlist); + flush_icache_range((unsigned long) p->addr, + (unsigned long) p->addr + sizeof(kprobe_opcode_t)); + spin_unlock_irqrestore(&kprobe_lock, flags); + arch_remove_kprobe(p); +} + +static inline void cleanup_aggr_kprobe(struct kprobe *old_p, + struct kprobe *p, unsigned long flags) +{ + list_del(&p->list); + if (list_empty(&old_p->list)) { + cleanup_kprobe(old_p, flags); + kfree(old_p); + } else + spin_unlock_irqrestore(&kprobe_lock, flags); +} + int register_kprobe(struct kprobe *p) { int ret = 0; unsigned long flags = 0; + struct kprobe *old_p; if ((ret = arch_prepare_kprobe(p)) != 0) { goto rm_kprobe; } spin_lock_irqsave(&kprobe_lock, flags); - INIT_HLIST_NODE(&p->hlist); - if (get_kprobe(p->addr)) { - ret = -EEXIST; + old_p = get_kprobe(p->addr); + if (old_p) { + ret = register_aggr_kprobe(old_p, p); goto out; } - arch_copy_kprobe(p); + arch_copy_kprobe(p); + INIT_HLIST_NODE(&p->hlist); hlist_add_head(&p->hlist, &kprobe_table[hash_ptr(p->addr, KPROBE_HASH_BITS)]); @@ -107,17 +225,17 @@ rm_kprobe: void unregister_kprobe(struct kprobe *p) { unsigned long flags; + struct kprobe *old_p; + spin_lock_irqsave(&kprobe_lock, flags); - if (!get_kprobe(p->addr)) { + old_p = get_kprobe(p->addr); + if (old_p) { + if (old_p->pre_handler == aggr_pre_handler) + cleanup_aggr_kprobe(old_p, p, flags); + else + cleanup_kprobe(p, flags); + } else spin_unlock_irqrestore(&kprobe_lock, flags); - return; - } - *p->addr = p->opcode; - hlist_del(&p->hlist); - flush_icache_range((unsigned long) p->addr, - (unsigned long) p->addr + sizeof(kprobe_opcode_t)); - spin_unlock_irqrestore(&kprobe_lock, flags); - arch_remove_kprobe(p); } static struct notifier_block kprobe_exceptions_nb = { -- cgit v1.2.2 From b7e4e85337060354f8b860cc38066725559313a4 Mon Sep 17 00:00:00 2001 From: Paulo Marques Date: Thu, 5 May 2005 16:15:49 -0700 Subject: [PATCH] setitimer timer expires too early It seems that the code responsible for this is in kernel/itimer.c:126: p->signal->real_timer.expires = jiffies + interval; add_timer(&p->signal->real_timer); If you request an interval of, lets say 900 usecs, the interval given by timeval_to_jiffies will be 1. 
If you request this when we are half-way between two timer ticks, the interval will only give 400 usecs. If we want to guarantee that we never ever give intervals less than requested, the simple solution would be to change that to: p->signal->real_timer.expires = jiffies + interval + 1; This however will produce pathological cases, like having a idle system being requested 1 ms timeouts will give systematically 2 ms timeouts, whereas currently it simply gives a few usecs less than 1 ms. The complex (and more computationally expensive) solution would be to check the gettimeofday time, and compute the correct number of jiffies. This way, if we request a 300 usecs timer 200 usecs inside the timer tick, we can wait just one tick, but not if we are 800 usecs inside the tick. This would also mean that we would have to lock preemption during these computations to avoid races, etc. I've searched the archives but couldn't find this particular issue being discussed before. Attached is a patch to do the simple solution, in case anybody thinks that it should be used. Signed-Off-By: Paulo Marques Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/itimer.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/itimer.c b/kernel/itimer.c index e9a40e947e..1dc988e0d2 100644 --- a/kernel/itimer.c +++ b/kernel/itimer.c @@ -123,7 +123,11 @@ static inline void it_real_arm(struct task_struct *p, unsigned long interval) return; if (interval > (unsigned long) LONG_MAX) interval = LONG_MAX; - p->signal->real_timer.expires = jiffies + interval; + /* the "+ 1" below makes sure that the timer doesn't go off before + * the interval requested. This could happen if + * time requested % (usecs per jiffy) is more than the usecs left + * in the current jiffy */ + p->signal->real_timer.expires = jiffies + interval + 1; add_timer(&p->signal->real_timer); } -- cgit v1.2.2 From ebe8b54134314cc31331f6e26f42276cd947d1df Mon Sep 17 00:00:00 2001 From: Domen Puncer Date: Thu, 5 May 2005 16:16:19 -0700 Subject: [PATCH] correctly name the Shell sort As per http://www.nist.gov/dads/HTML/shellsort.html, this should be referred to as a Shell sort. Shell-Metzner is a misnomer. Signed-off-by: Daniel Dickman Signed-off-by: Domen Puncer Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sys.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sys.c b/kernel/sys.c index f64e97cabe..f006632c2b 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -1195,7 +1195,7 @@ static int groups_from_user(struct group_info *group_info, return 0; } -/* a simple shell-metzner sort */ +/* a simple Shell sort */ static void groups_sort(struct group_info *group_info) { int base, max, stride; -- cgit v1.2.2 From 4f167fb491725ca0be9df0d76b4b2dd862cdfe0b Mon Sep 17 00:00:00 2001 From: "Zhang, Yanmin" Date: Mon, 16 May 2005 21:53:43 -0700 Subject: [PATCH] spurious interrupt fix On my IA64 machine, after kernel 2.6.12-rc3 boots, an edge-triggered interrupt (IRQ 46) keeps triggered over and over again. There is no IRQ 46 interrupt action handler. It has lots of impact on performance. Kernel 2.6.10 and its prior versions have no the problem. Basically, kernel 2.6.10 will mask the spurious edge interrupt if the interrupt is triggered for the second time and its status includes IRQ_DISABLE|IRQ_PENDING. Originally, IA64 kernel has its own specific _irq_desc definitions in file arch/ia64/kernel/irq.c. 
The definition initiates _irq_desc[irq].status to IRQ_DISABLE. Since kernel 2.6.11, it was moved to architecture independent codes, i.e. kernel/irq/handle.c, but kernel/irq/handle.c initiates _irq_desc[irq].status to 0 instead of IRQ_DISABLE. Signed-off-by: Zhang Yanmin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/irq/handle.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index 2fb0e46e11..06b5a63239 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -30,6 +30,7 @@ */ irq_desc_t irq_desc[NR_IRQS] __cacheline_aligned = { [0 ... NR_IRQS-1] = { + .status = IRQ_DISABLED, .handler = &no_irq_type, .lock = SPIN_LOCK_UNLOCKED } -- cgit v1.2.2 From 3c0547ba8b3bbd8b26ae35e33ac17ff51f67f78c Mon Sep 17 00:00:00 2001 From: Matt Mackall Date: Mon, 16 May 2005 21:53:47 -0700 Subject: [PATCH] add_preferred_console() build fix Move add_preferred_console out of CONFIG_PRINTK so serial console does the right thing. Signed-off-by: Matt Mackall Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/printk.c | 72 ++++++++++++++++++++++++++++----------------------------- 1 file changed, 36 insertions(+), 36 deletions(-) (limited to 'kernel') diff --git a/kernel/printk.c b/kernel/printk.c index 290a07ce2c..01b58d7d17 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -160,42 +160,6 @@ static int __init console_setup(char *str) __setup("console=", console_setup); -/** - * add_preferred_console - add a device to the list of preferred consoles. - * - * The last preferred console added will be used for kernel messages - * and stdin/out/err for init. Normally this is used by console_setup - * above to handle user-supplied console arguments; however it can also - * be used by arch-specific code either to override the user or more - * commonly to provide a default console (ie from PROM variables) when - * the user has not supplied one. - */ -int __init add_preferred_console(char *name, int idx, char *options) -{ - struct console_cmdline *c; - int i; - - /* - * See if this tty is not yet registered, and - * if we have a slot free. - */ - for(i = 0; i < MAX_CMDLINECONSOLES && console_cmdline[i].name[0]; i++) - if (strcmp(console_cmdline[i].name, name) == 0 && - console_cmdline[i].index == idx) { - selected_console = i; - return 0; - } - if (i == MAX_CMDLINECONSOLES) - return -E2BIG; - selected_console = i; - c = &console_cmdline[i]; - memcpy(c->name, name, sizeof(c->name)); - c->name[sizeof(c->name) - 1] = 0; - c->options = options; - c->index = idx; - return 0; -} - static int __init log_buf_len_setup(char *str) { unsigned long size = memparse(str, &str); @@ -670,6 +634,42 @@ static void call_console_drivers(unsigned long start, unsigned long end) {} #endif +/** + * add_preferred_console - add a device to the list of preferred consoles. + * + * The last preferred console added will be used for kernel messages + * and stdin/out/err for init. Normally this is used by console_setup + * above to handle user-supplied console arguments; however it can also + * be used by arch-specific code either to override the user or more + * commonly to provide a default console (ie from PROM variables) when + * the user has not supplied one. + */ +int __init add_preferred_console(char *name, int idx, char *options) +{ + struct console_cmdline *c; + int i; + + /* + * See if this tty is not yet registered, and + * if we have a slot free. 
+ */ + for(i = 0; i < MAX_CMDLINECONSOLES && console_cmdline[i].name[0]; i++) + if (strcmp(console_cmdline[i].name, name) == 0 && + console_cmdline[i].index == idx) { + selected_console = i; + return 0; + } + if (i == MAX_CMDLINECONSOLES) + return -E2BIG; + selected_console = i; + c = &console_cmdline[i]; + memcpy(c->name, name, sizeof(c->name)); + c->name[sizeof(c->name) - 1] = 0; + c->options = options; + c->index = idx; + return 0; +} + /** * acquire_console_sem - lock the console system for exclusive use. * -- cgit v1.2.2 From dfaa9c94b13071c9b5f8578d0ae99acc76c60139 Mon Sep 17 00:00:00 2001 From: William Lee Irwin III Date: Mon, 16 May 2005 21:53:58 -0700 Subject: [PATCH] profile.c: `schedule' parsing fix profile=schedule parsing is not quite what it should be. First, str[7] is 'e', not ',', but then even if it did fall through, prof_on = SCHED_PROFILING would be clobbered inside if (get_option(...)) So a small amount of rearrangement is done in this patch to correct it. Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/profile.c | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/profile.c b/kernel/profile.c index 0221a50ca8..ad8cbb75ff 100644 --- a/kernel/profile.c +++ b/kernel/profile.c @@ -49,15 +49,19 @@ static DECLARE_MUTEX(profile_flip_mutex); static int __init profile_setup(char * str) { + static char __initdata schedstr[] = "schedule"; int par; - if (!strncmp(str, "schedule", 8)) { + if (!strncmp(str, schedstr, strlen(schedstr))) { prof_on = SCHED_PROFILING; - printk(KERN_INFO "kernel schedule profiling enabled\n"); - if (str[7] == ',') - str += 8; - } - if (get_option(&str,&par)) { + if (str[strlen(schedstr)] == ',') + str += strlen(schedstr) + 1; + if (get_option(&str, &par)) + prof_shift = par; + printk(KERN_INFO + "kernel schedule profiling enabled (shift: %ld)\n", + prof_shift); + } else if (get_option(&str, &par)) { prof_shift = par; prof_on = CPU_PROFILING; printk(KERN_INFO "kernel profiling enabled (shift: %ld)\n", -- cgit v1.2.2 From 82428b62aa6294ea640c7e920a9224ecaf46db65 Mon Sep 17 00:00:00 2001 From: David Brownell Date: Mon, 9 May 2005 08:07:00 -0700 Subject: [PATCH] Driver Core: pm diagnostics update, check for errors This patch includes various tweaks in the messaging that appears during system pm state transitions: * Warn about certain illegal calls in the device tree, like resuming child before parent or suspending parent before child. This could happen easily enough through sysfs, or in some cases when drivers use device_pm_set_parent(). * Be more consistent about dev_dbg() tracing ... do it for resume() and shutdown() too, and never if the driver doesn't have that method. * Say which type of system sleep state is being entered. Except for the warnings, these only affect debug messaging. 
Signed-off-by: David Brownell Acked-by: Pavel Machek Signed-off-by: Greg Kroah-Hartman --- kernel/power/main.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/power/main.c b/kernel/power/main.c index 7960ddf04a..4cdebc972f 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -156,14 +156,14 @@ static int enter_state(suspend_state_t state) goto Unlock; } - pr_debug("PM: Preparing system for suspend\n"); + pr_debug("PM: Preparing system for %s sleep\n", pm_states[state]); if ((error = suspend_prepare(state))) goto Unlock; - pr_debug("PM: Entering state.\n"); + pr_debug("PM: Entering %s sleep\n", pm_states[state]); error = suspend_enter(state); - pr_debug("PM: Finishing up.\n"); + pr_debug("PM: Finishing wakeup.\n"); suspend_finish(state); Unlock: up(&pm_sem); -- cgit v1.2.2 From b39c4fab259b216148e705344a892c96efe1946d Mon Sep 17 00:00:00 2001 From: Paul Jackson Date: Fri, 20 May 2005 13:59:15 -0700 Subject: [PATCH] cpusets+hotplug+preepmt broken This patch removes the entwining of cpusets and hotplug code in the "No more Mr. Nice Guy" case of sched.c move_task_off_dead_cpu(). Since the hotplug code is holding a spinlock at this point, we cannot take the cpuset semaphore, cpuset_sem, as would seem to be required either to update the tasks cpuset, or to scan up the nested cpuset chain, looking for the nearest cpuset ancestor that still has some CPUs that are online. So we just punt and blast the tasks cpus_allowed with all bits allowed. This reverts these lines of code to what they were before the cpuset patch. And it updates the cpuset Doc file, to match. The one known alternative to this that seems to work came from Dinakar Guniguntala, and required the hotplug code to take the cpuset_sem semaphore much earlier in its processing. So far as we know, the increased locking entanglement between cpusets and hot plug of this alternative approach is not worth doing in this case. Signed-off-by: Paul Jackson Acked-by: Nathan Lynch Acked-by: Dinakar Guniguntala Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sched.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 0dc3158667..66b2ed7848 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -4243,7 +4243,7 @@ static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *tsk) /* No more Mr. Nice Guy. */ if (dest_cpu == NR_CPUS) { - tsk->cpus_allowed = cpuset_cpus_allowed(tsk); + cpus_setall(tsk->cpus_allowed); dest_cpu = any_online_cpu(tsk->cpus_allowed); /* -- cgit v1.2.2 From 10f02d1c59e55f529140dda3a92f0099d748451c Mon Sep 17 00:00:00 2001 From: Samuel Thibault Date: Sat, 21 May 2005 17:50:15 +0200 Subject: [PATCH] spin_unlock_bh() and preempt_check_resched() In _spin_unlock_bh(lock): do { \ _raw_spin_unlock(lock); \ preempt_enable(); \ local_bh_enable(); \ __release(lock); \ } while (0) there is no reason for using preempt_enable() instead of a simple preempt_enable_no_resched() Since we know bottom halves are disabled, preempt_schedule() will always return at once (preempt_count!=0), and hence preempt_check_resched() is useless here... This fixes it by using "preempt_enable_no_resched()" instead of the "preempt_enable()", and thus avoids the useless preempt_check_resched() just before re-enabling bottom halves. 
Signed-off-by: Samuel Thibault Signed-off-by: Linus Torvalds --- kernel/spinlock.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/spinlock.c b/kernel/spinlock.c index e15ed17863..0c3f9d8bbe 100644 --- a/kernel/spinlock.c +++ b/kernel/spinlock.c @@ -294,7 +294,7 @@ EXPORT_SYMBOL(_spin_unlock_irq); void __lockfunc _spin_unlock_bh(spinlock_t *lock) { _raw_spin_unlock(lock); - preempt_enable(); + preempt_enable_no_resched(); local_bh_enable(); } EXPORT_SYMBOL(_spin_unlock_bh); @@ -318,7 +318,7 @@ EXPORT_SYMBOL(_read_unlock_irq); void __lockfunc _read_unlock_bh(rwlock_t *lock) { _raw_read_unlock(lock); - preempt_enable(); + preempt_enable_no_resched(); local_bh_enable(); } EXPORT_SYMBOL(_read_unlock_bh); @@ -342,7 +342,7 @@ EXPORT_SYMBOL(_write_unlock_irq); void __lockfunc _write_unlock_bh(rwlock_t *lock) { _raw_write_unlock(lock); - preempt_enable(); + preempt_enable_no_resched(); local_bh_enable(); } EXPORT_SYMBOL(_write_unlock_bh); @@ -354,7 +354,7 @@ int __lockfunc _spin_trylock_bh(spinlock_t *lock) if (_raw_spin_trylock(lock)) return 1; - preempt_enable(); + preempt_enable_no_resched(); local_bh_enable(); return 0; } -- cgit v1.2.2 From c33880aaddbbab1ccf36f4457ed1090621f2e39a Mon Sep 17 00:00:00 2001 From: Kirill Korotaev Date: Tue, 24 May 2005 19:29:47 -0700 Subject: [PATCH] sigkill priority fix If SIGKILL does not have priority, we cannot instantly kill task before it makes some unexpected job. It can be critical, but we were unable to reproduce this easily until Heiko Carstens reported this problem on LKML. Signed-Off-By: Kirill Korotaev Signed-Off-By: Alexey Kuznetsov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/signal.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index 8f3debc77c..b3c24c732c 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -522,7 +522,16 @@ static int __dequeue_signal(struct sigpending *pending, sigset_t *mask, { int sig = 0; - sig = next_signal(pending, mask); + /* SIGKILL must have priority, otherwise it is quite easy + * to create an unkillable process, sending sig < SIGKILL + * to self */ + if (unlikely(sigismember(&pending->signal, SIGKILL))) { + if (!sigismember(mask, SIGKILL)) + sig = SIGKILL; + } + + if (likely(!sig)) + sig = next_signal(pending, mask); if (sig) { if (current->notifier) { if (sigismember(current->notifier_mask, sig)) { -- cgit v1.2.2 From 2efe86b809d97debaaf9fcc13b041aedf15bd3d2 Mon Sep 17 00:00:00 2001 From: Paul Jackson Date: Fri, 27 May 2005 02:02:43 -0700 Subject: [PATCH] cpuset exit NULL dereference fix There is a race in the kernel cpuset code, between the code to handle notify_on_release, and the code to remove a cpuset. The notify_on_release code can end up trying to access a cpuset that has been removed. In the most common case, this causes a NULL pointer dereference from the routine cpuset_path. However all manner of bad things are possible, in theory at least. The existing code decrements the cpuset use count, and if the count goes to zero, processes the notify_on_release request, if appropriate. However, once the count goes to zero, unless we are holding the global cpuset_sem semaphore, there is nothing to stop another task from immediately removing the cpuset entirely, and recycling its memory. The obvious fix would be to always hold the cpuset_sem semaphore while decrementing the use count and dealing with notify_on_release. 
However we don't want to force a global semaphore into the mainline task exit path, as that might create a scaling problem. The actual fix is almost as easy - since this is only an issue for cpusets using notify_on_release, which the top level big cpusets don't normally need to use, only take the cpuset_sem for cpusets using notify_on_release. This code has been run for hours without a hiccup, while running a cpuset create/destroy stress test that could crash the existing kernel in seconds. This patch applies to the current -linus git kernel. Signed-off-by: Paul Jackson Acked-by: Simon Derr Acked-by: Dinakar Guniguntala Signed-off-by: Linus Torvalds --- kernel/cpuset.c | 24 +++++++++++++++++++----- 1 file changed, 19 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 961d74044d..00e8f25755 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -166,9 +166,8 @@ static struct super_block *cpuset_sb = NULL; * The hooks from fork and exit, cpuset_fork() and cpuset_exit(), don't * (usually) grab cpuset_sem. These are the two most performance * critical pieces of code here. The exception occurs on exit(), - * if the last task using a cpuset exits, and the cpuset was marked - * notify_on_release. In that case, the cpuset_sem is taken, the - * path to the released cpuset calculated, and a usermode call made + * when a task in a notify_on_release cpuset exits. Then cpuset_sem + * is taken, and if the cpuset count is zero, a usermode call made * to /sbin/cpuset_release_agent with the name of the cpuset (path * relative to the root of cpuset file system) as the argument. * @@ -1404,6 +1403,18 @@ void cpuset_fork(struct task_struct *tsk) * * Description: Detach cpuset from @tsk and release it. * + * Note that cpusets marked notify_on_release force every task + * in them to take the global cpuset_sem semaphore when exiting. + * This could impact scaling on very large systems. Be reluctant + * to use notify_on_release cpusets where very high task exit + * scaling is required on large systems. + * + * Don't even think about derefencing 'cs' after the cpuset use + * count goes to zero, except inside a critical section guarded + * by the cpuset_sem semaphore. If you don't hold cpuset_sem, + * then a zero cpuset use count is a license to any other task to + * nuke the cpuset immediately. + * **/ void cpuset_exit(struct task_struct *tsk) @@ -1415,10 +1426,13 @@ void cpuset_exit(struct task_struct *tsk) tsk->cpuset = NULL; task_unlock(tsk); - if (atomic_dec_and_test(&cs->count)) { + if (notify_on_release(cs)) { down(&cpuset_sem); - check_for_release(cs); + if (atomic_dec_and_test(&cs->count)) + check_for_release(cs); up(&cpuset_sem); + } else { + atomic_dec(&cs->count); } } -- cgit v1.2.2 From b60c1f6ffd88850079ae419aa933ab0eddbd5535 Mon Sep 17 00:00:00 2001 From: John Hawkes Date: Fri, 27 May 2005 12:53:00 -0700 Subject: [PATCH] drop note_interrupt() for per-CPU for proper scaling The "unhandled interrupts" catcher, note_interrupt(), increments a global desc->irq_count and grossly damages scaling of very large systems, e.g., >192p ia64 Altix, because of this highly contented cacheline, especially for timer interrupts. 384p is severely crippled, and 512p is unuseable. All calls to note_interrupt() can be disabled by booting with "noirqdebug", but this disables the useful interrupt checking for all interrupts. I propose eliminating note_interrupt() for all per-CPU interrupts. 
This was the behavior of linux-2.6.10 and earlier, but in 2.6.11 a code restructuring added a call to note_interrupt() for per-CPU interrupts. Besides, note_interrupt() is a bit racy for concurrent CPU calls anyway, as the desc->irq_count++ increment isn't atomic (which, if done, would make scaling even worse). Signed-off-by: John Hawkes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/irq/handle.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index 06b5a63239..436c7d93c0 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -119,8 +119,6 @@ fastcall unsigned int __do_IRQ(unsigned int irq, struct pt_regs *regs) */ desc->handler->ack(irq); action_ret = handle_IRQ_event(irq, regs, desc->action); - if (!noirqdebug) - note_interrupt(irq, desc, action_ret); desc->handler->end(irq); return 1; } -- cgit v1.2.2 From ae92ef8a442421356950a0a8dfdc35e8e783000e Mon Sep 17 00:00:00 2001 From: Roman Zippel Date: Tue, 31 May 2005 14:39:29 -0700 Subject: [PATCH] flush icache in correct context flush_icache_range() is used in two different situation - in binfmt_elf.c & co for user space mappings and module.c for kernel modules. On m68k flush_icache_range() doesn't know which data to flush, as it has separate address spaces and the pointer argument can be valid in either address space. First I considered splitting flush_icache_range(), but this patch is simpler. Setting the correct context gives flush_icache_range() enough information to flush the correct data. Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/module.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index 5734ab09d3..83b3d37670 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -1758,6 +1758,7 @@ sys_init_module(void __user *umod, const char __user *uargs) { struct module *mod; + mm_segment_t old_fs = get_fs(); int ret = 0; /* Must have permission */ @@ -1775,6 +1776,9 @@ sys_init_module(void __user *umod, return PTR_ERR(mod); } + /* flush the icache in correct context */ + set_fs(KERNEL_DS); + /* Flush the instruction cache, since we've played with text */ if (mod->module_init) flush_icache_range((unsigned long)mod->module_init, @@ -1783,6 +1787,8 @@ sys_init_module(void __user *umod, flush_icache_range((unsigned long)mod->module_core, (unsigned long)mod->module_core + mod->core_size); + set_fs(old_fs); + /* Now sew it into the lists. They won't access us, since strong_try_module_get() will fail. */ stop_machine_run(__link_module, mod, NR_CPUS); -- cgit v1.2.2 From 6df3cecbb95345981718b38d357c50bc3425420a Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Mon, 13 Jun 2005 15:52:32 -0700 Subject: [PATCH] cond_resched_lock() fix On one path, cond_resched_lock() fails to return true if it dropped the lock. We think this might be causing the crashes in JBD's log_do_checkpoint(). 
Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sched.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 66b2ed7848..f12a0c8a7d 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -3755,19 +3755,22 @@ EXPORT_SYMBOL(cond_resched); */ int cond_resched_lock(spinlock_t * lock) { + int ret = 0; + if (need_lockbreak(lock)) { spin_unlock(lock); cpu_relax(); + ret = 1; spin_lock(lock); } if (need_resched()) { _raw_spin_unlock(lock); preempt_enable_no_resched(); __cond_resched(); + ret = 1; spin_lock(lock); - return 1; } - return 0; + return ret; } EXPORT_SYMBOL(cond_resched_lock); -- cgit v1.2.2
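
For reference, the escaping rule introduced by the first patch in this series (quote plain printable ASCII, hex-encode any string containing a quote, space, control or 8-bit character) can be tried out in user space. The sketch below mirrors that logic, with stdout standing in for the audit buffer; the log_hex() and log_untrusted() helper names are made up for this illustration and are not part of the kernel API.

#include <stdio.h>
#include <string.h>

/* Counterpart of the patch's audit_log_hex(): emit each byte as two hex digits. */
static void log_hex(const unsigned char *buf, size_t len)
{
	size_t i;

	for (i = 0; i < len; i++)
		printf("%02x", buf[i]);
}

/* Counterpart of audit_log_untrustedstring(): quote the string if it is plain
 * printable ASCII, otherwise hex-encode the whole thing. */
static void log_untrusted(const char *string)
{
	const unsigned char *p = (const unsigned char *)string;

	while (*p) {
		if (*p == '"' || *p == ' ' || *p < 0x20 || *p > 0x7f) {
			log_hex((const unsigned char *)string, strlen(string));
			return;
		}
		p++;
	}
	printf("\"%s\"", string);
}

int main(void)
{
	log_untrusted("/etc/passwd");           /* harmless: printed quoted */
	putchar('\n');
	log_untrusted("/tmp/x\nname=forged");   /* embedded newline: printed as hex */
	putchar('\n');
	return 0;
}

Walking the string with an unsigned char pointer matters here for the same reason it did in the follow-up warning fix above: with a plain (signed) char, the *p > 0x7f test could never be true.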