From 83c7d09173fdb6b06b109e65895392db3e49ac9c Mon Sep 17 00:00:00 2001 From: Date: Fri, 29 Apr 2005 15:54:44 +0100 Subject: AUDIT: Avoid log pollution by untrusted strings. We log strings from userspace, such as arguments to open(). These could be formatted to contain \n followed by fake audit log entries. Provide a function for logging such strings, which gives a hex dump when the string contains anything but basic printable ASCII characters. Use it for logging filenames. Signed-off-by: David Woodhouse --- kernel/audit.c | 23 +++++++++++++++++++++++ kernel/auditsc.c | 7 ++++--- 2 files changed, 27 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index 0f84dd7af2..dca7b99615 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -720,6 +720,29 @@ void audit_log_format(struct audit_buffer *ab, const char *fmt, ...) va_end(args); } +void audit_log_hex(struct audit_buffer *ab, const unsigned char *buf, size_t len) +{ + int i; + + for (i=0; i<len; i++) + audit_log_format(ab, "%02x", buf[i]); +} + +void audit_log_untrustedstring(struct audit_buffer *ab, const char *string) +{ + const char *p = string; + + while (*p) { + if (*p == '"' || *p == ' ' || *p < 0x20 || *p > 0x7f) { + audit_log_hex(ab, string, strlen(string)); + return; + } + p++; + } + audit_log_format(ab, "\"%s\"", string); +} + + /* This is a helper-function to print the d_path without using a static * buffer or allocating another buffer in addition to the one in * audit_buffer. */ diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 6f1931381b..00e87ffff1 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -696,9 +696,10 @@ static void audit_log_exit(struct audit_context *context) if (!ab) continue; /* audit_panic has been called */ audit_log_format(ab, "item=%d", i); - if (context->names[i].name) - audit_log_format(ab, " name=%s", - context->names[i].name); + if (context->names[i].name) { + audit_log_format(ab, " name="); + audit_log_untrustedstring(ab, context->names[i].name); + } if (context->names[i].ino != (unsigned long)-1) audit_log_format(ab, " inode=%lu dev=%02x:%02x mode=%#o" " uid=%d gid=%d rdev=%02x:%02x", -- cgit v1.2.2 From 81b7854d52d35ed2353dd47033ae630d18322a2d Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Fri, 29 Apr 2005 15:59:11 +0100 Subject: audit_log_untrustedstring() warning fix kernel/audit.c: In function `audit_log_untrustedstring': kernel/audit.c:736: warning: comparison is always false due to limited range of data type Signed-off-by: Andrew Morton Signed-off-by: David Woodhouse --- kernel/audit.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index dca7b99615..e7bff8000d 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -730,7 +730,7 @@ void audit_log_hex(struct audit_buffer *ab, const unsigned char *buf, size_t len void audit_log_untrustedstring(struct audit_buffer *ab, const char *string) { - const char *p = string; + const unsigned char *p = string; while (*p) { if (*p == '"' || *p == ' ' || *p < 0x20 || *p > 0x7f) { -- cgit v1.2.2 From 2fd6f58ba6efc82ea2c9c2630f7ff5ed9eeaf34a Mon Sep 17 00:00:00 2001 From: Date: Fri, 29 Apr 2005 16:08:28 +0100 Subject: [AUDIT] Don't allow ptrace to fool auditing, log arch of audited syscalls. We were calling ptrace_notify() after auditing the syscall and arguments, but the debugger could have _changed_ them before the syscall was actually invoked. Reorder the calls to fix that. While we're touching every call to audit_syscall_entry(), we also make it take an extra argument: the architecture of the syscall which was made, because some architectures allow more than one type of syscall. 
Also add an explicit success/failure flag to audit_syscall_exit(), for the benefit of architectures which return that in a condition register rather than only returning a single register. Change type of syscall return value to 'long' not 'int'. Signed-off-by: David Woodhouse --- kernel/auditsc.c | 22 ++++++++++++++++------ 1 file changed, 16 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 00e87ffff1..77e92592de 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -123,7 +123,7 @@ struct audit_context { int major; /* syscall number */ unsigned long argv[4]; /* syscall arguments */ int return_valid; /* return code is valid */ - int return_code;/* syscall return code */ + long return_code;/* syscall return code */ int auditable; /* 1 if record should be written */ int name_count; struct audit_names names[AUDIT_NAMES]; @@ -135,6 +135,7 @@ struct audit_context { uid_t uid, euid, suid, fsuid; gid_t gid, egid, sgid, fsgid; unsigned long personality; + int arch; #if AUDIT_DEBUG int put_count; @@ -348,6 +349,10 @@ static int audit_filter_rules(struct task_struct *tsk, case AUDIT_PERS: result = (tsk->personality == value); break; + case AUDIT_ARCH: + if (ctx) + result = (ctx->arch == value); + break; case AUDIT_EXIT: if (ctx && ctx->return_valid) @@ -355,7 +360,7 @@ static int audit_filter_rules(struct task_struct *tsk, break; case AUDIT_SUCCESS: if (ctx && ctx->return_valid) - result = (ctx->return_code >= 0); + result = (ctx->return_valid == AUDITSC_SUCCESS); break; case AUDIT_DEVMAJOR: if (ctx) { @@ -648,8 +653,11 @@ static void audit_log_exit(struct audit_context *context) audit_log_format(ab, "syscall=%d", context->major); if (context->personality != PER_LINUX) audit_log_format(ab, " per=%lx", context->personality); + audit_log_format(ab, " arch=%x", context->arch); if (context->return_valid) - audit_log_format(ab, " exit=%d", context->return_code); + audit_log_format(ab, " success=%s exit=%ld", + (context->return_valid==AUDITSC_SUCCESS)?"yes":"no", + context->return_code); audit_log_format(ab, " a0=%lx a1=%lx a2=%lx a3=%lx items=%d" " pid=%d loginuid=%d uid=%d gid=%d" @@ -773,7 +781,7 @@ static inline unsigned int audit_serial(void) * then the record will be written at syscall exit time (otherwise, it * will only be written if another part of the kernel requests that it * be written). */ -void audit_syscall_entry(struct task_struct *tsk, int major, +void audit_syscall_entry(struct task_struct *tsk, int arch, int major, unsigned long a1, unsigned long a2, unsigned long a3, unsigned long a4) { @@ -827,6 +835,7 @@ void audit_syscall_entry(struct task_struct *tsk, int major, if (!audit_enabled) return; + context->arch = arch; context->major = major; context->argv[0] = a1; context->argv[1] = a2; @@ -850,13 +859,13 @@ void audit_syscall_entry(struct task_struct *tsk, int major, * filtering, or because some other part of the kernel write an audit * message), then write out the syscall information. In call cases, * free the names stored from getname(). 
*/ -void audit_syscall_exit(struct task_struct *tsk, int return_code) +void audit_syscall_exit(struct task_struct *tsk, int valid, long return_code) { struct audit_context *context; get_task_struct(tsk); task_lock(tsk); - context = audit_get_context(tsk, 1, return_code); + context = audit_get_context(tsk, valid, return_code); task_unlock(tsk); /* Not having a context here is ok, since the parent may have @@ -869,6 +878,7 @@ void audit_syscall_exit(struct task_struct *tsk, int return_code) context->in_syscall = 0; context->auditable = 0; + if (context->previous) { struct audit_context *new_context = context->previous; context->previous = NULL; -- cgit v1.2.2 From d812ddbb89e323d054a7d073466225966c8350c8 Mon Sep 17 00:00:00 2001 From: Steve Grubb Date: Fri, 29 Apr 2005 16:09:52 +0100 Subject: [AUDIT] Fix signedness of 'serial' in various routines. Attached is a patch that corrects a signed/unsigned warning. I also noticed that we needlessly init serial to 0. That only needs to occur if the kernel was compiled without the audit system. -Steve Grubb Signed-off-by: David Woodhouse --- kernel/audit.c | 6 ++++-- kernel/auditsc.c | 2 +- 2 files changed, 5 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index e7bff8000d..aa35422c0c 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -620,7 +620,7 @@ struct audit_buffer *audit_log_start(struct audit_context *ctx) struct audit_buffer *ab = NULL; unsigned long flags; struct timespec t; - int serial = 0; + unsigned int serial; if (!audit_initialized) return NULL; @@ -669,8 +669,10 @@ struct audit_buffer *audit_log_start(struct audit_context *ctx) audit_get_stamp(ab->ctx, &t, &serial); else #endif + { t = CURRENT_TIME; - + serial = 0; + } audit_log_format(ab, "audit(%lu.%03lu:%u): ", t.tv_sec, t.tv_nsec/1000000, serial); return ab; diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 77e92592de..49ecd707b9 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -992,7 +992,7 @@ void audit_inode(const char *name, const struct inode *inode) } void audit_get_stamp(struct audit_context *ctx, - struct timespec *t, int *serial) + struct timespec *t, unsigned int *serial) { if (ctx) { t->tv_sec = ctx->ctime.tv_sec; -- cgit v1.2.2 From c7fcb0ee74ef4cfdea02befacb55945c93641e44 Mon Sep 17 00:00:00 2001 From: Peter Martuccelli Date: Fri, 29 Apr 2005 16:10:24 +0100 Subject: [AUDIT] Avoid using %*.*s format strings. They don't seem to work correctly (investigation ongoing), but we don't actually need to do it anyway. Patch from Peter Martuccelli Signed-off-by: David Woodhouse --- kernel/audit.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index aa35422c0c..42ce282728 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -540,8 +540,8 @@ static inline int audit_log_drain(struct audit_buffer *ab) if (!audit_pid) { /* No daemon */ int offset = ab->nlh ? 
NLMSG_SPACE(0) : 0; int len = skb->len - offset; - printk(KERN_ERR "%*.*s\n", - len, len, skb->data + offset); + skb->data[offset + len] = '\0'; + printk(KERN_ERR "%s\n", skb->data + offset); } kfree_skb(skb); ab->nlh = NULL; -- cgit v1.2.2 From 85c8721ff3bc96b702427a440616079e8daf8a2f Mon Sep 17 00:00:00 2001 From: Date: Fri, 29 Apr 2005 16:23:29 +0100 Subject: audit: update pointer to userspace tools, remove emacs mode tags --- kernel/audit.c | 4 ++-- kernel/auditsc.c | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index 42ce282728..58c7d7e472 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -1,4 +1,4 @@ -/* audit.c -- Auditing support -*- linux-c -*- +/* audit.c -- Auditing support * Gateway between the kernel (e.g., selinux) and the user-space audit daemon. * System-call specific features have moved to auditsc.c * @@ -38,7 +38,7 @@ * 6) Support low-overhead kernel-based filtering to minimize the * information that must be passed to user-space. * - * Example user-space utilities: http://people.redhat.com/faith/audit/ + * Example user-space utilities: http://people.redhat.com/sgrubb/audit/ */ #include <linux/init.h> diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 49ecd707b9..9ff2c1b103 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -1,4 +1,4 @@ -/* auditsc.c -- System-call auditing support -*- linux-c -*- +/* auditsc.c -- System-call auditing support * Handles all system-call specific auditing features. * * Copyright 2003-2004 Red Hat Inc., Durham, North Carolina. -- cgit v1.2.2 From c94c257c88c517f251da273a15c654224c7b6e21 Mon Sep 17 00:00:00 2001 From: Serge Hallyn Date: Fri, 29 Apr 2005 16:27:17 +0100 Subject: Add audit uid to netlink credentials Most audit control messages are sent over netlink. In order to properly log the identity of the sender of audit control messages, we would like to add the loginuid to the netlink_creds structure, as per the attached patch. 
Signed-off-by: Serge Hallyn Signed-off-by: David Woodhouse --- kernel/audit.c | 46 +++++++++++++++++++++++++--------------------- kernel/auditsc.c | 5 ++++- 2 files changed, 29 insertions(+), 22 deletions(-) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index 58c7d7e472..587d3b2eba 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -239,36 +239,36 @@ void audit_log_lost(const char *message) } -static int audit_set_rate_limit(int limit) +static int audit_set_rate_limit(int limit, uid_t loginuid) { int old = audit_rate_limit; audit_rate_limit = limit; - audit_log(current->audit_context, "audit_rate_limit=%d old=%d", - audit_rate_limit, old); + audit_log(NULL, "audit_rate_limit=%d old=%d by auid %u", + audit_rate_limit, old, loginuid); return old; } -static int audit_set_backlog_limit(int limit) +static int audit_set_backlog_limit(int limit, uid_t loginuid) { int old = audit_backlog_limit; audit_backlog_limit = limit; - audit_log(current->audit_context, "audit_backlog_limit=%d old=%d", - audit_backlog_limit, old); + audit_log(NULL, "audit_backlog_limit=%d old=%d by auid %u", + audit_backlog_limit, old, loginuid); return old; } -static int audit_set_enabled(int state) +static int audit_set_enabled(int state, uid_t loginuid) { int old = audit_enabled; if (state != 0 && state != 1) return -EINVAL; audit_enabled = state; - audit_log(current->audit_context, "audit_enabled=%d old=%d", - audit_enabled, old); + audit_log(NULL, "audit_enabled=%d old=%d by auid %u", + audit_enabled, old, loginuid); return old; } -static int audit_set_failure(int state) +static int audit_set_failure(int state, uid_t loginuid) { int old = audit_failure; if (state != AUDIT_FAIL_SILENT @@ -276,8 +276,8 @@ static int audit_set_failure(int state) && state != AUDIT_FAIL_PANIC) return -EINVAL; audit_failure = state; - audit_log(current->audit_context, "audit_failure=%d old=%d", - audit_failure, old); + audit_log(NULL, "audit_failure=%d old=%d by auid %u", + audit_failure, old, loginuid); return old; } @@ -344,6 +344,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) int err; struct audit_buffer *ab; u16 msg_type = nlh->nlmsg_type; + uid_t loginuid; /* loginuid of sender */ err = audit_netlink_ok(NETLINK_CB(skb).eff_cap, msg_type); if (err) @@ -351,6 +352,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) pid = NETLINK_CREDS(skb)->pid; uid = NETLINK_CREDS(skb)->uid; + loginuid = NETLINK_CB(skb).loginuid; seq = nlh->nlmsg_seq; data = NLMSG_DATA(nlh); @@ -371,34 +373,36 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) return -EINVAL; status_get = (struct audit_status *)data; if (status_get->mask & AUDIT_STATUS_ENABLED) { - err = audit_set_enabled(status_get->enabled); + err = audit_set_enabled(status_get->enabled, loginuid); if (err < 0) return err; } if (status_get->mask & AUDIT_STATUS_FAILURE) { - err = audit_set_failure(status_get->failure); + err = audit_set_failure(status_get->failure, loginuid); if (err < 0) return err; } if (status_get->mask & AUDIT_STATUS_PID) { int old = audit_pid; audit_pid = status_get->pid; - audit_log(current->audit_context, - "audit_pid=%d old=%d", audit_pid, old); + audit_log(NULL, "audit_pid=%d old=%d by auid %u", + audit_pid, old, loginuid); } if (status_get->mask & AUDIT_STATUS_RATE_LIMIT) - audit_set_rate_limit(status_get->rate_limit); + audit_set_rate_limit(status_get->rate_limit, loginuid); if (status_get->mask & AUDIT_STATUS_BACKLOG_LIMIT) - audit_set_backlog_limit(status_get->backlog_limit); 
+ audit_set_backlog_limit(status_get->backlog_limit, + loginuid); break; case AUDIT_USER: ab = audit_log_start(NULL); if (!ab) break; /* audit_panic has been called */ audit_log_format(ab, - "user pid=%d uid=%d length=%d msg='%.1024s'", + "user pid=%d uid=%d length=%d loginuid=%u" + " msg='%.1024s'", pid, uid, (int)(nlh->nlmsg_len - ((char *)data - (char *)nlh)), - (char *)data); + loginuid, (char *)data); ab->type = AUDIT_USER; ab->pid = pid; audit_log_end(ab); @@ -411,7 +415,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) case AUDIT_LIST: #ifdef CONFIG_AUDITSYSCALL err = audit_receive_filter(nlh->nlmsg_type, NETLINK_CB(skb).pid, - uid, seq, data); + uid, seq, data, loginuid); #else err = -EOPNOTSUPP; #endif diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 9ff2c1b103..66148f81d7 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -251,7 +251,8 @@ static int audit_copy_rule(struct audit_rule *d, struct audit_rule *s) return 0; } -int audit_receive_filter(int type, int pid, int uid, int seq, void *data) +int audit_receive_filter(int type, int pid, int uid, int seq, void *data, + uid_t loginuid) { u32 flags; struct audit_entry *entry; @@ -286,6 +287,7 @@ int audit_receive_filter(int type, int pid, int uid, int seq, void *data) err = audit_add_rule(entry, &audit_entlist); if (!err && (flags & AUDIT_AT_EXIT)) err = audit_add_rule(entry, &audit_extlist); + audit_log(NULL, "auid %u added an audit rule\n", loginuid); break; case AUDIT_DEL: flags =((struct audit_rule *)data)->flags; @@ -295,6 +297,7 @@ int audit_receive_filter(int type, int pid, int uid, int seq, void *data) err = audit_del_rule(data, &audit_entlist); if (!err && (flags & AUDIT_AT_EXIT)) err = audit_del_rule(data, &audit_extlist); + audit_log(NULL, "auid %u removed an audit rule\n", loginuid); break; default: return -EINVAL; -- cgit v1.2.2 From 37509e749dc2072e667db806ef24b9e897f61b8a Mon Sep 17 00:00:00 2001 From: Chris Wright Date: Fri, 29 Apr 2005 17:19:14 +0100 Subject: [AUDIT] Requeue messages at head of queue, up to audit_backlog If netlink_unicast() fails, requeue the skb back at the head of the queue it just came from, instead of the tail. And do so unless we've exceeded the audit_backlog limit; not according to some other arbitrary limit. 
From: Chris Wright Signed-off-by: David Woodhouse --- kernel/audit.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index 587d3b2eba..4a697c73fa 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -142,7 +142,6 @@ struct audit_buffer { int total; int type; int pid; - int count; /* Times requeued */ }; void audit_set_type(struct audit_buffer *ab, int type) @@ -526,9 +525,9 @@ static inline int audit_log_drain(struct audit_buffer *ab) retval = netlink_unicast(audit_sock, skb, audit_pid, MSG_DONTWAIT); } - if (retval == -EAGAIN && ab->count < 5) { - ++ab->count; - skb_queue_tail(&ab->sklist, skb); + if (retval == -EAGAIN && + (atomic_read(&audit_backlog)) < audit_backlog_limit) { + skb_queue_head(&ab->sklist, skb); audit_log_end_irq(ab); return 1; } @@ -666,7 +665,6 @@ struct audit_buffer *audit_log_start(struct audit_context *ctx) ab->total = 0; ab->type = AUDIT_KERNEL; ab->pid = 0; - ab->count = 0; #ifdef CONFIG_AUDITSYSCALL if (ab->ctx) -- cgit v1.2.2 From 456be6cd90dbbb9b0ea01d56932d56d110d51cf7 Mon Sep 17 00:00:00 2001 From: Steve Grubb Date: Fri, 29 Apr 2005 17:30:07 +0100 Subject: [AUDIT] LOGIN message credentials Attached is a new patch that solves the issue of getting valid credentials into the LOGIN message. The current code was assuming that the audit context had already been copied. This is not always the case for LOGIN messages. To solve the problem, the patch passes the task struct to the function that emits the message where it can get valid credentials. Signed-off-by: Steve Grubb Signed-off-by: David Woodhouse --- kernel/auditsc.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 66148f81d7..37b3ac94bc 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -1010,20 +1010,21 @@ void audit_get_stamp(struct audit_context *ctx, extern int audit_set_type(struct audit_buffer *ab, int type); -int audit_set_loginuid(struct audit_context *ctx, uid_t loginuid) +int audit_set_loginuid(struct task_struct *task, uid_t loginuid) { - if (ctx) { + if (task->audit_context) { struct audit_buffer *ab; ab = audit_log_start(NULL); if (ab) { audit_log_format(ab, "login pid=%d uid=%u " "old loginuid=%u new loginuid=%u", - ctx->pid, ctx->uid, ctx->loginuid, loginuid); + task->pid, task->uid, + task->audit_context->loginuid, loginuid); audit_set_type(ab, AUDIT_LOGIN); audit_log_end(ab); } - ctx->loginuid = loginuid; + task->audit_context->loginuid = loginuid; } return 0; } -- cgit v1.2.2 From 0dd8e06bdaa0a97e706ee1a489a1f6176c4ddc64 Mon Sep 17 00:00:00 2001 From: Chris Wright Date: Tue, 3 May 2005 14:01:15 +0100 Subject: [PATCH] add new audit data to last skb When adding more formatted audit data to an skb for delivery to userspace, the kernel will attempt to reuse an skb that has spare room. However, if the audit message has already been fragmented to multiple skb's, the search for spare room in the skb uses the head of the list. This will corrupt the audit message with trailing bytes being placed midway through the stream. Fix is to look at the end of the list. 
Signed-off-by: Chris Wright Signed-off-by: David Woodhouse --- kernel/audit.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index 4a697c73fa..00455a9cf0 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -486,7 +486,7 @@ static void audit_log_move(struct audit_buffer *ab) if (ab->len == 0) return; - skb = skb_peek(&ab->sklist); + skb = skb_peek_tail(&ab->sklist); if (!skb || skb_tailroom(skb) <= ab->len + extra) { skb = alloc_skb(2 * ab->len + extra, GFP_ATOMIC); if (!skb) { -- cgit v1.2.2 From 012914dad25bd5cacf88af4429eecda62a06020d Mon Sep 17 00:00:00 2001 From: Russ Anderson <(rja@sgi.com)> Date: Sat, 23 Apr 2005 00:08:00 -0700 Subject: [patch] MCA recovery module undefined symbol fix The patch "MCA recovery improvements" added do_exit to mca_drv.c. That's fine when the mca recovery code is built in the kernel (CONFIG_IA64_MCA_RECOVERY=y) but breaks building the mca recovery code as a module (CONFIG_IA64_MCA_RECOVERY=m). Most users are currently building this as a module, as loading and unloading the module provides a very convenient way to turn on/off error recovery. This patch exports do_exit, so mca_drv.c can build as a module. Signed-off-by: Russ Anderson (rja@sgi.com) Signed-off-by: Tony Luck --- kernel/exit.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index 7be283d989..edaa50b5bb 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -846,6 +846,8 @@ fastcall NORET_TYPE void do_exit(long code) for (;;) ; } +EXPORT_SYMBOL_GPL(do_exit); + NORET_TYPE void complete_and_exit(struct completion *comp, long code) { if (comp) -- cgit v1.2.2 From 075d6eb16d273dab7b7b4b83fcee8bce4ee387ed Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Thu, 5 May 2005 16:15:09 -0700 Subject: [PATCH] ppc32: platform-specific functions missing from kallsyms. The PPC32 kernel puts platform-specific functions into separate sections so that unneeded parts of it can be freed when we've booted and actually worked out what we're running on today. This makes kallsyms ignore those functions, because they're not between _[se]text or _[se]inittext. Rather than teaching kallsyms about the various pmac/chrp/etc sections, this patch adds '_[se]extratext' markers for kallsyms. 
Signed-off-by: David Woodhouse Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kallsyms.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c index 1627f8d6e0..13bcec151b 100644 --- a/kernel/kallsyms.c +++ b/kernel/kallsyms.c @@ -46,6 +46,14 @@ static inline int is_kernel_inittext(unsigned long addr) return 0; } +static inline int is_kernel_extratext(unsigned long addr) +{ + if (addr >= (unsigned long)_sextratext + && addr <= (unsigned long)_eextratext) + return 1; + return 0; +} + static inline int is_kernel_text(unsigned long addr) { if (addr >= (unsigned long)_stext && addr <= (unsigned long)_etext) @@ -169,8 +177,9 @@ const char *kallsyms_lookup(unsigned long addr, namebuf[0] = 0; if ((all_var && is_kernel(addr)) || - (!all_var && (is_kernel_text(addr) || is_kernel_inittext(addr)))) { - unsigned long symbol_end=0; + (!all_var && (is_kernel_text(addr) || is_kernel_inittext(addr) || + is_kernel_extratext(addr)))) { + unsigned long symbol_end = 0; /* do a binary search on the sorted kallsyms_addresses array */ low = 0; -- cgit v1.2.2 From 7d12e522ba13ce718b7ec32b75803dece8adb072 Mon Sep 17 00:00:00 2001 From: Anton Blanchard Date: Thu, 5 May 2005 16:15:11 -0700 Subject: [PATCH] ppc64: remove hidden -fno-omit-frame-pointer for schedule.c While looking at code generated by gcc4.0 I noticed some functions still had frame pointers, even after we stopped ppc64 from defining CONFIG_FRAME_POINTER. It turns out kernel/Makefile hardwires -fno-omit-frame-pointer on when compiling schedule.c. Create CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER and define it on architectures that dont require frame pointers in sched.c code. (akpm: blame me for the name) Signed-off-by: Anton Blanchard Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/Makefile b/kernel/Makefile index eb88b446c2..b01d26fe8d 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -29,7 +29,7 @@ obj-$(CONFIG_SYSFS) += ksysfs.o obj-$(CONFIG_GENERIC_HARDIRQS) += irq/ obj-$(CONFIG_SECCOMP) += seccomp.o -ifneq ($(CONFIG_IA64),y) +ifneq ($(CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER),y) # According to Alan Modra , the -fno-omit-frame-pointer is # needed for x86 only. Why this used to be enabled for all architectures is beyond # me. I suspect most platforms don't need this, but until we know that for sure -- cgit v1.2.2 From 04dea5f93231204cc3ca0ab793ce76dbb10c86ba Mon Sep 17 00:00:00 2001 From: Prasanna S Panchamukhi Date: Thu, 5 May 2005 16:15:41 -0700 Subject: [PATCH] Kprobes: Oops! in unregister_kprobe() kernel oops! when unregister_kprobe() is called on a non-registered kprobe. This patch fixes the above problem by checking if the probe exists before unregistering. 
Signed-off-by: Prasanna S Panchamukhi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kprobes.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 1d5dd1337b..d8903e60c9 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -107,13 +107,17 @@ rm_kprobe: void unregister_kprobe(struct kprobe *p) { unsigned long flags; - arch_remove_kprobe(p); spin_lock_irqsave(&kprobe_lock, flags); + if (!get_kprobe(p->addr)) { + spin_unlock_irqrestore(&kprobe_lock, flags); + return; + } *p->addr = p->opcode; hlist_del(&p->hlist); flush_icache_range((unsigned long) p->addr, (unsigned long) p->addr + sizeof(kprobe_opcode_t)); spin_unlock_irqrestore(&kprobe_lock, flags); + arch_remove_kprobe(p); } static struct notifier_block kprobe_exceptions_nb = { -- cgit v1.2.2 From 64f562c6df3cfc5d1b2b4bdbcb7951457df9c237 Mon Sep 17 00:00:00 2001 From: Ananth N Mavinakayanahalli Date: Thu, 5 May 2005 16:15:42 -0700 Subject: [PATCH] kprobes: Allow multiple kprobes at the same address Allow registration of multiple kprobes at an address in an architecture agnostic way. Corresponding handlers will be invoked in a sequence. But, a kprobe and a jprobe can't (yet) co-exist at the same address. Signed-off-by: Ananth N Mavinakayanahalli Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kprobes.c | 144 ++++++++++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 131 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/kprobes.c b/kernel/kprobes.c index d8903e60c9..037142b72a 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -44,6 +44,7 @@ static struct hlist_head kprobe_table[KPROBE_TABLE_SIZE]; unsigned int kprobe_cpu = NR_CPUS; static DEFINE_SPINLOCK(kprobe_lock); +static struct kprobe *curr_kprobe; /* Locks kprobe: irqs must be disabled */ void lock_kprobes(void) @@ -73,22 +74,139 @@ struct kprobe *get_kprobe(void *addr) return NULL; } +/* + * Aggregate handlers for multiple kprobes support - these handlers + * take care of invoking the individual kprobe handlers on p->list + */ +int aggr_pre_handler(struct kprobe *p, struct pt_regs *regs) +{ + struct kprobe *kp; + + list_for_each_entry(kp, &p->list, list) { + if (kp->pre_handler) { + curr_kprobe = kp; + kp->pre_handler(kp, regs); + curr_kprobe = NULL; + } + } + return 0; +} + +void aggr_post_handler(struct kprobe *p, struct pt_regs *regs, + unsigned long flags) +{ + struct kprobe *kp; + + list_for_each_entry(kp, &p->list, list) { + if (kp->post_handler) { + curr_kprobe = kp; + kp->post_handler(kp, regs, flags); + curr_kprobe = NULL; + } + } + return; +} + +int aggr_fault_handler(struct kprobe *p, struct pt_regs *regs, int trapnr) +{ + /* + * if we faulted "during" the execution of a user specified + * probe handler, invoke just that probe's fault handler + */ + if (curr_kprobe && curr_kprobe->fault_handler) { + if (curr_kprobe->fault_handler(curr_kprobe, regs, trapnr)) + return 1; + } + return 0; +} + +/* + * Fill in the required fields of the "manager kprobe". 
Replace the + * earlier kprobe in the hlist with the manager kprobe + */ +static inline void add_aggr_kprobe(struct kprobe *ap, struct kprobe *p) +{ + ap->addr = p->addr; + ap->opcode = p->opcode; + memcpy(&ap->ainsn, &p->ainsn, sizeof(struct arch_specific_insn)); + + ap->pre_handler = aggr_pre_handler; + ap->post_handler = aggr_post_handler; + ap->fault_handler = aggr_fault_handler; + + INIT_LIST_HEAD(&ap->list); + list_add(&p->list, &ap->list); + + INIT_HLIST_NODE(&ap->hlist); + hlist_del(&p->hlist); + hlist_add_head(&ap->hlist, + &kprobe_table[hash_ptr(ap->addr, KPROBE_HASH_BITS)]); +} + +/* + * This is the second or subsequent kprobe at the address - handle + * the intricacies + * TODO: Move kcalloc outside the spinlock + */ +static int register_aggr_kprobe(struct kprobe *old_p, struct kprobe *p) +{ + int ret = 0; + struct kprobe *ap; + + if (old_p->break_handler || p->break_handler) { + ret = -EEXIST; /* kprobe and jprobe can't (yet) coexist */ + } else if (old_p->pre_handler == aggr_pre_handler) { + list_add(&p->list, &old_p->list); + } else { + ap = kcalloc(1, sizeof(struct kprobe), GFP_ATOMIC); + if (!ap) + return -ENOMEM; + add_aggr_kprobe(ap, old_p); + list_add(&p->list, &ap->list); + } + return ret; +} + +/* kprobe removal house-keeping routines */ +static inline void cleanup_kprobe(struct kprobe *p, unsigned long flags) +{ + *p->addr = p->opcode; + hlist_del(&p->hlist); + flush_icache_range((unsigned long) p->addr, + (unsigned long) p->addr + sizeof(kprobe_opcode_t)); + spin_unlock_irqrestore(&kprobe_lock, flags); + arch_remove_kprobe(p); +} + +static inline void cleanup_aggr_kprobe(struct kprobe *old_p, + struct kprobe *p, unsigned long flags) +{ + list_del(&p->list); + if (list_empty(&old_p->list)) { + cleanup_kprobe(old_p, flags); + kfree(old_p); + } else + spin_unlock_irqrestore(&kprobe_lock, flags); +} + int register_kprobe(struct kprobe *p) { int ret = 0; unsigned long flags = 0; + struct kprobe *old_p; if ((ret = arch_prepare_kprobe(p)) != 0) { goto rm_kprobe; } spin_lock_irqsave(&kprobe_lock, flags); - INIT_HLIST_NODE(&p->hlist); - if (get_kprobe(p->addr)) { - ret = -EEXIST; + old_p = get_kprobe(p->addr); + if (old_p) { + ret = register_aggr_kprobe(old_p, p); goto out; } - arch_copy_kprobe(p); + arch_copy_kprobe(p); + INIT_HLIST_NODE(&p->hlist); hlist_add_head(&p->hlist, &kprobe_table[hash_ptr(p->addr, KPROBE_HASH_BITS)]); @@ -107,17 +225,17 @@ rm_kprobe: void unregister_kprobe(struct kprobe *p) { unsigned long flags; + struct kprobe *old_p; + spin_lock_irqsave(&kprobe_lock, flags); - if (!get_kprobe(p->addr)) { + old_p = get_kprobe(p->addr); + if (old_p) { + if (old_p->pre_handler == aggr_pre_handler) + cleanup_aggr_kprobe(old_p, p, flags); + else + cleanup_kprobe(p, flags); + } else spin_unlock_irqrestore(&kprobe_lock, flags); - return; - } - *p->addr = p->opcode; - hlist_del(&p->hlist); - flush_icache_range((unsigned long) p->addr, - (unsigned long) p->addr + sizeof(kprobe_opcode_t)); - spin_unlock_irqrestore(&kprobe_lock, flags); - arch_remove_kprobe(p); } static struct notifier_block kprobe_exceptions_nb = { -- cgit v1.2.2 From b7e4e85337060354f8b860cc38066725559313a4 Mon Sep 17 00:00:00 2001 From: Paulo Marques Date: Thu, 5 May 2005 16:15:49 -0700 Subject: [PATCH] setitimer timer expires too early It seems that the code responsible for this is in kernel/itimer.c:126: p->signal->real_timer.expires = jiffies + interval; add_timer(&p->signal->real_timer); If you request an interval of, lets say 900 usecs, the interval given by timeval_to_jiffies will be 1. 
If you request this when we are half-way between two timer ticks, the interval will only give 400 usecs. If we want to guarantee that we never ever give intervals less than requested, the simple solution would be to change that to: p->signal->real_timer.expires = jiffies + interval + 1; This however will produce pathological cases, like having a idle system being requested 1 ms timeouts will give systematically 2 ms timeouts, whereas currently it simply gives a few usecs less than 1 ms. The complex (and more computationally expensive) solution would be to check the gettimeofday time, and compute the correct number of jiffies. This way, if we request a 300 usecs timer 200 usecs inside the timer tick, we can wait just one tick, but not if we are 800 usecs inside the tick. This would also mean that we would have to lock preemption during these computations to avoid races, etc. I've searched the archives but couldn't find this particular issue being discussed before. Attached is a patch to do the simple solution, in case anybody thinks that it should be used. Signed-Off-By: Paulo Marques Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/itimer.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/itimer.c b/kernel/itimer.c index e9a40e947e..1dc988e0d2 100644 --- a/kernel/itimer.c +++ b/kernel/itimer.c @@ -123,7 +123,11 @@ static inline void it_real_arm(struct task_struct *p, unsigned long interval) return; if (interval > (unsigned long) LONG_MAX) interval = LONG_MAX; - p->signal->real_timer.expires = jiffies + interval; + /* the "+ 1" below makes sure that the timer doesn't go off before + * the interval requested. This could happen if + * time requested % (usecs per jiffy) is more than the usecs left + * in the current jiffy */ + p->signal->real_timer.expires = jiffies + interval + 1; add_timer(&p->signal->real_timer); } -- cgit v1.2.2 From ebe8b54134314cc31331f6e26f42276cd947d1df Mon Sep 17 00:00:00 2001 From: Domen Puncer Date: Thu, 5 May 2005 16:16:19 -0700 Subject: [PATCH] correctly name the Shell sort As per http://www.nist.gov/dads/HTML/shellsort.html, this should be referred to as a Shell sort. Shell-Metzner is a misnomer. Signed-off-by: Daniel Dickman Signed-off-by: Domen Puncer Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sys.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sys.c b/kernel/sys.c index f64e97cabe..f006632c2b 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -1195,7 +1195,7 @@ static int groups_from_user(struct group_info *group_info, return 0; } -/* a simple shell-metzner sort */ +/* a simple Shell sort */ static void groups_sort(struct group_info *group_info) { int base, max, stride; -- cgit v1.2.2 From 4f167fb491725ca0be9df0d76b4b2dd862cdfe0b Mon Sep 17 00:00:00 2001 From: "Zhang, Yanmin" Date: Mon, 16 May 2005 21:53:43 -0700 Subject: [PATCH] spurious interrupt fix On my IA64 machine, after kernel 2.6.12-rc3 boots, an edge-triggered interrupt (IRQ 46) keeps triggered over and over again. There is no IRQ 46 interrupt action handler. It has lots of impact on performance. Kernel 2.6.10 and its prior versions have no the problem. Basically, kernel 2.6.10 will mask the spurious edge interrupt if the interrupt is triggered for the second time and its status includes IRQ_DISABLE|IRQ_PENDING. Originally, IA64 kernel has its own specific _irq_desc definitions in file arch/ia64/kernel/irq.c. 
The definition initiates _irq_desc[irq].status to IRQ_DISABLE. Since kernel 2.6.11, it was moved to architecture independent codes, i.e. kernel/irq/handle.c, but kernel/irq/handle.c initiates _irq_desc[irq].status to 0 instead of IRQ_DISABLE. Signed-off-by: Zhang Yanmin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/irq/handle.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index 2fb0e46e11..06b5a63239 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -30,6 +30,7 @@ */ irq_desc_t irq_desc[NR_IRQS] __cacheline_aligned = { [0 ... NR_IRQS-1] = { + .status = IRQ_DISABLED, .handler = &no_irq_type, .lock = SPIN_LOCK_UNLOCKED } -- cgit v1.2.2 From 3c0547ba8b3bbd8b26ae35e33ac17ff51f67f78c Mon Sep 17 00:00:00 2001 From: Matt Mackall Date: Mon, 16 May 2005 21:53:47 -0700 Subject: [PATCH] add_preferred_console() build fix Move add_preferred_console out of CONFIG_PRINTK so serial console does the right thing. Signed-off-by: Matt Mackall Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/printk.c | 72 ++++++++++++++++++++++++++++----------------------------- 1 file changed, 36 insertions(+), 36 deletions(-) (limited to 'kernel') diff --git a/kernel/printk.c b/kernel/printk.c index 290a07ce2c..01b58d7d17 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -160,42 +160,6 @@ static int __init console_setup(char *str) __setup("console=", console_setup); -/** - * add_preferred_console - add a device to the list of preferred consoles. - * - * The last preferred console added will be used for kernel messages - * and stdin/out/err for init. Normally this is used by console_setup - * above to handle user-supplied console arguments; however it can also - * be used by arch-specific code either to override the user or more - * commonly to provide a default console (ie from PROM variables) when - * the user has not supplied one. - */ -int __init add_preferred_console(char *name, int idx, char *options) -{ - struct console_cmdline *c; - int i; - - /* - * See if this tty is not yet registered, and - * if we have a slot free. - */ - for(i = 0; i < MAX_CMDLINECONSOLES && console_cmdline[i].name[0]; i++) - if (strcmp(console_cmdline[i].name, name) == 0 && - console_cmdline[i].index == idx) { - selected_console = i; - return 0; - } - if (i == MAX_CMDLINECONSOLES) - return -E2BIG; - selected_console = i; - c = &console_cmdline[i]; - memcpy(c->name, name, sizeof(c->name)); - c->name[sizeof(c->name) - 1] = 0; - c->options = options; - c->index = idx; - return 0; -} - static int __init log_buf_len_setup(char *str) { unsigned long size = memparse(str, &str); @@ -670,6 +634,42 @@ static void call_console_drivers(unsigned long start, unsigned long end) {} #endif +/** + * add_preferred_console - add a device to the list of preferred consoles. + * + * The last preferred console added will be used for kernel messages + * and stdin/out/err for init. Normally this is used by console_setup + * above to handle user-supplied console arguments; however it can also + * be used by arch-specific code either to override the user or more + * commonly to provide a default console (ie from PROM variables) when + * the user has not supplied one. + */ +int __init add_preferred_console(char *name, int idx, char *options) +{ + struct console_cmdline *c; + int i; + + /* + * See if this tty is not yet registered, and + * if we have a slot free. 
+ */ + for(i = 0; i < MAX_CMDLINECONSOLES && console_cmdline[i].name[0]; i++) + if (strcmp(console_cmdline[i].name, name) == 0 && + console_cmdline[i].index == idx) { + selected_console = i; + return 0; + } + if (i == MAX_CMDLINECONSOLES) + return -E2BIG; + selected_console = i; + c = &console_cmdline[i]; + memcpy(c->name, name, sizeof(c->name)); + c->name[sizeof(c->name) - 1] = 0; + c->options = options; + c->index = idx; + return 0; +} + /** * acquire_console_sem - lock the console system for exclusive use. * -- cgit v1.2.2 From dfaa9c94b13071c9b5f8578d0ae99acc76c60139 Mon Sep 17 00:00:00 2001 From: William Lee Irwin III Date: Mon, 16 May 2005 21:53:58 -0700 Subject: [PATCH] profile.c: `schedule' parsing fix profile=schedule parsing is not quite what it should be. First, str[7] is 'e', not ',', but then even if it did fall through, prof_on = SCHED_PROFILING would be clobbered inside if (get_option(...)) So a small amount of rearrangement is done in this patch to correct it. Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/profile.c | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/profile.c b/kernel/profile.c index 0221a50ca8..ad8cbb75ff 100644 --- a/kernel/profile.c +++ b/kernel/profile.c @@ -49,15 +49,19 @@ static DECLARE_MUTEX(profile_flip_mutex); static int __init profile_setup(char * str) { + static char __initdata schedstr[] = "schedule"; int par; - if (!strncmp(str, "schedule", 8)) { + if (!strncmp(str, schedstr, strlen(schedstr))) { prof_on = SCHED_PROFILING; - printk(KERN_INFO "kernel schedule profiling enabled\n"); - if (str[7] == ',') - str += 8; - } - if (get_option(&str,&par)) { + if (str[strlen(schedstr)] == ',') + str += strlen(schedstr) + 1; + if (get_option(&str, &par)) + prof_shift = par; + printk(KERN_INFO + "kernel schedule profiling enabled (shift: %ld)\n", + prof_shift); + } else if (get_option(&str, &par)) { prof_shift = par; prof_on = CPU_PROFILING; printk(KERN_INFO "kernel profiling enabled (shift: %ld)\n", -- cgit v1.2.2 From 82428b62aa6294ea640c7e920a9224ecaf46db65 Mon Sep 17 00:00:00 2001 From: David Brownell Date: Mon, 9 May 2005 08:07:00 -0700 Subject: [PATCH] Driver Core: pm diagnostics update, check for errors This patch includes various tweaks in the messaging that appears during system pm state transitions: * Warn about certain illegal calls in the device tree, like resuming child before parent or suspending parent before child. This could happen easily enough through sysfs, or in some cases when drivers use device_pm_set_parent(). * Be more consistent about dev_dbg() tracing ... do it for resume() and shutdown() too, and never if the driver doesn't have that method. * Say which type of system sleep state is being entered. Except for the warnings, these only affect debug messaging. 
Signed-off-by: David Brownell Acked-by: Pavel Machek Signed-off-by: Greg Kroah-Hartman --- kernel/power/main.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/power/main.c b/kernel/power/main.c index 7960ddf04a..4cdebc972f 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -156,14 +156,14 @@ static int enter_state(suspend_state_t state) goto Unlock; } - pr_debug("PM: Preparing system for suspend\n"); + pr_debug("PM: Preparing system for %s sleep\n", pm_states[state]); if ((error = suspend_prepare(state))) goto Unlock; - pr_debug("PM: Entering state.\n"); + pr_debug("PM: Entering %s sleep\n", pm_states[state]); error = suspend_enter(state); - pr_debug("PM: Finishing up.\n"); + pr_debug("PM: Finishing wakeup.\n"); suspend_finish(state); Unlock: up(&pm_sem); -- cgit v1.2.2 From b39c4fab259b216148e705344a892c96efe1946d Mon Sep 17 00:00:00 2001 From: Paul Jackson Date: Fri, 20 May 2005 13:59:15 -0700 Subject: [PATCH] cpusets+hotplug+preepmt broken This patch removes the entwining of cpusets and hotplug code in the "No more Mr. Nice Guy" case of sched.c move_task_off_dead_cpu(). Since the hotplug code is holding a spinlock at this point, we cannot take the cpuset semaphore, cpuset_sem, as would seem to be required either to update the tasks cpuset, or to scan up the nested cpuset chain, looking for the nearest cpuset ancestor that still has some CPUs that are online. So we just punt and blast the tasks cpus_allowed with all bits allowed. This reverts these lines of code to what they were before the cpuset patch. And it updates the cpuset Doc file, to match. The one known alternative to this that seems to work came from Dinakar Guniguntala, and required the hotplug code to take the cpuset_sem semaphore much earlier in its processing. So far as we know, the increased locking entanglement between cpusets and hot plug of this alternative approach is not worth doing in this case. Signed-off-by: Paul Jackson Acked-by: Nathan Lynch Acked-by: Dinakar Guniguntala Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sched.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 0dc3158667..66b2ed7848 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -4243,7 +4243,7 @@ static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *tsk) /* No more Mr. Nice Guy. */ if (dest_cpu == NR_CPUS) { - tsk->cpus_allowed = cpuset_cpus_allowed(tsk); + cpus_setall(tsk->cpus_allowed); dest_cpu = any_online_cpu(tsk->cpus_allowed); /* -- cgit v1.2.2 From 10f02d1c59e55f529140dda3a92f0099d748451c Mon Sep 17 00:00:00 2001 From: Samuel Thibault Date: Sat, 21 May 2005 17:50:15 +0200 Subject: [PATCH] spin_unlock_bh() and preempt_check_resched() In _spin_unlock_bh(lock): do { \ _raw_spin_unlock(lock); \ preempt_enable(); \ local_bh_enable(); \ __release(lock); \ } while (0) there is no reason for using preempt_enable() instead of a simple preempt_enable_no_resched() Since we know bottom halves are disabled, preempt_schedule() will always return at once (preempt_count!=0), and hence preempt_check_resched() is useless here... This fixes it by using "preempt_enable_no_resched()" instead of the "preempt_enable()", and thus avoids the useless preempt_check_resched() just before re-enabling bottom halves. 
Signed-off-by: Samuel Thibault Signed-off-by: Linus Torvalds --- kernel/spinlock.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/spinlock.c b/kernel/spinlock.c index e15ed17863..0c3f9d8bbe 100644 --- a/kernel/spinlock.c +++ b/kernel/spinlock.c @@ -294,7 +294,7 @@ EXPORT_SYMBOL(_spin_unlock_irq); void __lockfunc _spin_unlock_bh(spinlock_t *lock) { _raw_spin_unlock(lock); - preempt_enable(); + preempt_enable_no_resched(); local_bh_enable(); } EXPORT_SYMBOL(_spin_unlock_bh); @@ -318,7 +318,7 @@ EXPORT_SYMBOL(_read_unlock_irq); void __lockfunc _read_unlock_bh(rwlock_t *lock) { _raw_read_unlock(lock); - preempt_enable(); + preempt_enable_no_resched(); local_bh_enable(); } EXPORT_SYMBOL(_read_unlock_bh); @@ -342,7 +342,7 @@ EXPORT_SYMBOL(_write_unlock_irq); void __lockfunc _write_unlock_bh(rwlock_t *lock) { _raw_write_unlock(lock); - preempt_enable(); + preempt_enable_no_resched(); local_bh_enable(); } EXPORT_SYMBOL(_write_unlock_bh); @@ -354,7 +354,7 @@ int __lockfunc _spin_trylock_bh(spinlock_t *lock) if (_raw_spin_trylock(lock)) return 1; - preempt_enable(); + preempt_enable_no_resched(); local_bh_enable(); return 0; } -- cgit v1.2.2 From c33880aaddbbab1ccf36f4457ed1090621f2e39a Mon Sep 17 00:00:00 2001 From: Kirill Korotaev Date: Tue, 24 May 2005 19:29:47 -0700 Subject: [PATCH] sigkill priority fix If SIGKILL does not have priority, we cannot instantly kill task before it makes some unexpected job. It can be critical, but we were unable to reproduce this easily until Heiko Carstens reported this problem on LKML. Signed-Off-By: Kirill Korotaev Signed-Off-By: Alexey Kuznetsov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/signal.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index 8f3debc77c..b3c24c732c 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -522,7 +522,16 @@ static int __dequeue_signal(struct sigpending *pending, sigset_t *mask, { int sig = 0; - sig = next_signal(pending, mask); + /* SIGKILL must have priority, otherwise it is quite easy + * to create an unkillable process, sending sig < SIGKILL + * to self */ + if (unlikely(sigismember(&pending->signal, SIGKILL))) { + if (!sigismember(mask, SIGKILL)) + sig = SIGKILL; + } + + if (likely(!sig)) + sig = next_signal(pending, mask); if (sig) { if (current->notifier) { if (sigismember(current->notifier_mask, sig)) { -- cgit v1.2.2 From 2efe86b809d97debaaf9fcc13b041aedf15bd3d2 Mon Sep 17 00:00:00 2001 From: Paul Jackson Date: Fri, 27 May 2005 02:02:43 -0700 Subject: [PATCH] cpuset exit NULL dereference fix There is a race in the kernel cpuset code, between the code to handle notify_on_release, and the code to remove a cpuset. The notify_on_release code can end up trying to access a cpuset that has been removed. In the most common case, this causes a NULL pointer dereference from the routine cpuset_path. However all manner of bad things are possible, in theory at least. The existing code decrements the cpuset use count, and if the count goes to zero, processes the notify_on_release request, if appropriate. However, once the count goes to zero, unless we are holding the global cpuset_sem semaphore, there is nothing to stop another task from immediately removing the cpuset entirely, and recycling its memory. The obvious fix would be to always hold the cpuset_sem semaphore while decrementing the use count and dealing with notify_on_release. 
However we don't want to force a global semaphore into the mainline task exit path, as that might create a scaling problem. The actual fix is almost as easy - since this is only an issue for cpusets using notify_on_release, which the top level big cpusets don't normally need to use, only take the cpuset_sem for cpusets using notify_on_release. This code has been run for hours without a hiccup, while running a cpuset create/destroy stress test that could crash the existing kernel in seconds. This patch applies to the current -linus git kernel. Signed-off-by: Paul Jackson Acked-by: Simon Derr Acked-by: Dinakar Guniguntala Signed-off-by: Linus Torvalds --- kernel/cpuset.c | 24 +++++++++++++++++++----- 1 file changed, 19 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 961d74044d..00e8f25755 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -166,9 +166,8 @@ static struct super_block *cpuset_sb = NULL; * The hooks from fork and exit, cpuset_fork() and cpuset_exit(), don't * (usually) grab cpuset_sem. These are the two most performance * critical pieces of code here. The exception occurs on exit(), - * if the last task using a cpuset exits, and the cpuset was marked - * notify_on_release. In that case, the cpuset_sem is taken, the - * path to the released cpuset calculated, and a usermode call made + * when a task in a notify_on_release cpuset exits. Then cpuset_sem + * is taken, and if the cpuset count is zero, a usermode call made * to /sbin/cpuset_release_agent with the name of the cpuset (path * relative to the root of cpuset file system) as the argument. * @@ -1404,6 +1403,18 @@ void cpuset_fork(struct task_struct *tsk) * * Description: Detach cpuset from @tsk and release it. * + * Note that cpusets marked notify_on_release force every task + * in them to take the global cpuset_sem semaphore when exiting. + * This could impact scaling on very large systems. Be reluctant + * to use notify_on_release cpusets where very high task exit + * scaling is required on large systems. + * + * Don't even think about derefencing 'cs' after the cpuset use + * count goes to zero, except inside a critical section guarded + * by the cpuset_sem semaphore. If you don't hold cpuset_sem, + * then a zero cpuset use count is a license to any other task to + * nuke the cpuset immediately. + * **/ void cpuset_exit(struct task_struct *tsk) @@ -1415,10 +1426,13 @@ void cpuset_exit(struct task_struct *tsk) tsk->cpuset = NULL; task_unlock(tsk); - if (atomic_dec_and_test(&cs->count)) { + if (notify_on_release(cs)) { down(&cpuset_sem); - check_for_release(cs); + if (atomic_dec_and_test(&cs->count)) + check_for_release(cs); up(&cpuset_sem); + } else { + atomic_dec(&cs->count); } } -- cgit v1.2.2 From b60c1f6ffd88850079ae419aa933ab0eddbd5535 Mon Sep 17 00:00:00 2001 From: John Hawkes Date: Fri, 27 May 2005 12:53:00 -0700 Subject: [PATCH] drop note_interrupt() for per-CPU for proper scaling The "unhandled interrupts" catcher, note_interrupt(), increments a global desc->irq_count and grossly damages scaling of very large systems, e.g., >192p ia64 Altix, because of this highly contented cacheline, especially for timer interrupts. 384p is severely crippled, and 512p is unuseable. All calls to note_interrupt() can be disabled by booting with "noirqdebug", but this disables the useful interrupt checking for all interrupts. I propose eliminating note_interrupt() for all per-CPU interrupts. 
This was the behavior of linux-2.6.10 and earlier, but in 2.6.11 a code restructuring added a call to note_interrupt() for per-CPU interrupts. Besides, note_interrupt() is a bit racy for concurrent CPU calls anyway, as the desc->irq_count++ increment isn't atomic (which, if done, would make scaling even worse). Signed-off-by: John Hawkes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/irq/handle.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index 06b5a63239..436c7d93c0 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -119,8 +119,6 @@ fastcall unsigned int __do_IRQ(unsigned int irq, struct pt_regs *regs) */ desc->handler->ack(irq); action_ret = handle_IRQ_event(irq, regs, desc->action); - if (!noirqdebug) - note_interrupt(irq, desc, action_ret); desc->handler->end(irq); return 1; } -- cgit v1.2.2 From ae92ef8a442421356950a0a8dfdc35e8e783000e Mon Sep 17 00:00:00 2001 From: Roman Zippel Date: Tue, 31 May 2005 14:39:29 -0700 Subject: [PATCH] flush icache in correct context flush_icache_range() is used in two different situation - in binfmt_elf.c & co for user space mappings and module.c for kernel modules. On m68k flush_icache_range() doesn't know which data to flush, as it has separate address spaces and the pointer argument can be valid in either address space. First I considered splitting flush_icache_range(), but this patch is simpler. Setting the correct context gives flush_icache_range() enough information to flush the correct data. Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/module.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index 5734ab09d3..83b3d37670 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -1758,6 +1758,7 @@ sys_init_module(void __user *umod, const char __user *uargs) { struct module *mod; + mm_segment_t old_fs = get_fs(); int ret = 0; /* Must have permission */ @@ -1775,6 +1776,9 @@ sys_init_module(void __user *umod, return PTR_ERR(mod); } + /* flush the icache in correct context */ + set_fs(KERNEL_DS); + /* Flush the instruction cache, since we've played with text */ if (mod->module_init) flush_icache_range((unsigned long)mod->module_init, @@ -1783,6 +1787,8 @@ sys_init_module(void __user *umod, flush_icache_range((unsigned long)mod->module_core, (unsigned long)mod->module_core + mod->core_size); + set_fs(old_fs); + /* Now sew it into the lists. They won't access us, since strong_try_module_get() will fail. */ stop_machine_run(__link_module, mod, NR_CPUS); -- cgit v1.2.2 From 6df3cecbb95345981718b38d357c50bc3425420a Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Mon, 13 Jun 2005 15:52:32 -0700 Subject: [PATCH] cond_resched_lock() fix On one path, cond_resched_lock() fails to return true if it dropped the lock. We think this might be causing the crashes in JBD's log_do_checkpoint(). 
Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sched.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 66b2ed7848..f12a0c8a7d 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -3755,19 +3755,22 @@ EXPORT_SYMBOL(cond_resched); */ int cond_resched_lock(spinlock_t * lock) { + int ret = 0; + if (need_lockbreak(lock)) { spin_unlock(lock); cpu_relax(); + ret = 1; spin_lock(lock); } if (need_resched()) { _raw_spin_unlock(lock); preempt_enable_no_resched(); __cond_resched(); + ret = 1; spin_lock(lock); - return 1; } - return 0; + return ret; } EXPORT_SYMBOL(cond_resched_lock); -- cgit v1.2.2
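
For reference, the escaping rule introduced by the first patch in this series (quote plain printable ASCII, hex-encode any string containing a quote, space, control or 8-bit character) can be tried out in user space. The sketch below mirrors that logic, with stdout standing in for the audit buffer; the log_hex() and log_untrusted() helper names are made up for this illustration and are not part of the kernel API.

#include <stdio.h>
#include <string.h>

/* Counterpart of the patch's audit_log_hex(): emit each byte as two hex digits. */
static void log_hex(const unsigned char *buf, size_t len)
{
	size_t i;

	for (i = 0; i < len; i++)
		printf("%02x", buf[i]);
}

/* Counterpart of audit_log_untrustedstring(): quote the string if it is plain
 * printable ASCII, otherwise hex-encode the whole thing. */
static void log_untrusted(const char *string)
{
	const unsigned char *p = (const unsigned char *)string;

	while (*p) {
		if (*p == '"' || *p == ' ' || *p < 0x20 || *p > 0x7f) {
			log_hex((const unsigned char *)string, strlen(string));
			return;
		}
		p++;
	}
	printf("\"%s\"", string);
}

int main(void)
{
	log_untrusted("/etc/passwd");           /* harmless: printed quoted */
	putchar('\n');
	log_untrusted("/tmp/x\nname=forged");   /* embedded newline: printed as hex */
	putchar('\n');
	return 0;
}

Walking the string with an unsigned char pointer matters here for the same reason it did in the follow-up warning fix above: with a plain (signed) char, the *p > 0x7f test could never be true.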