Auto merge with /home/aegl/GIT/linus

author: Tony Luck <tony.luck@intel.com> 2005-06-15 17:06:48 -0400
committer: Tony Luck <tony.luck@intel.com> 2005-06-15 17:06:48 -0400
commit: f2cbb4f01936a3e4225692e03b084b78c56d386d (patch)
tree: f89f3d8baa250589a38a4dd2df56f84cddae3c76 /kernel
parent: 325a479c4c110db278ef3361460a48c4093252cc (diff)
parent: 1016888fb69662936b32ab767c7419a3be9a69d3 (diff)
17 files changed, 338 insertions, 124 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index eb88b446c2..b01d26fe8d 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -29,7 +29,7 @@ obj-$(CONFIG_SYSFS) += ksysfs.o
 obj-$(CONFIG_GENERIC_HARDIRQS) += irq/
 obj-$(CONFIG_SECCOMP) += seccomp.o
-ifneq ($(CONFIG_IA64),y)
+ifneq ($(CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER),y)
 # According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is
 # needed for x86 only.  Why this used to be enabled for all architectures is beyond
 # me.  I suspect most platforms don't need this, but until we know that for sure
diff --git a/kernel/audit.c b/kernel/audit.c
index ac26d4d960..9c4f1af0c7 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -1,4 +1,4 @@
-/* audit.c -- Auditing support -*- linux-c -*-
+/* audit.c -- Auditing support
 * Gateway between the kernel (e.g., selinux) and the user-space audit daemon.
 * System-call specific features have moved to auditsc.c
 *
@@ -38,7 +38,7 @@
 *        6) Support low-overhead kernel-based filtering to minimize the
 *           information that must be passed to user-space.
 *
- * Example user-space utilities: http://people.redhat.com/faith/audit/
+ * Example user-space utilities: http://people.redhat.com/sgrubb/audit/
 */
 #include <linux/init.h>
@@ -142,7 +142,6 @@ struct audit_buffer {
        int                  total;
        int                  type;
        int                  pid;
-        int                  count; /* Times requeued */
 };
 void audit_set_type(struct audit_buffer *ab, int type)
@@ -239,36 +238,36 @@ void audit_log_lost(const char *message)
 }
-static int audit_set_rate_limit(int limit)
+static int audit_set_rate_limit(int limit, uid_t loginuid)
 {
        int old          = audit_rate_limit;
        audit_rate_limit = limit;
-        audit_log(current->audit_context, "audit_rate_limit=%d old=%d",
+        audit_log(NULL, "audit_rate_limit=%d old=%d by auid %u",
-                  audit_rate_limit, old);
+                        audit_rate_limit, old, loginuid);
        return old;
 }
-static int audit_set_backlog_limit(int limit)
+static int audit_set_backlog_limit(int limit, uid_t loginuid)
 {
        int old          = audit_backlog_limit;
        audit_backlog_limit = limit;
-        audit_log(current->audit_context, "audit_backlog_limit=%d old=%d",
+        audit_log(NULL, "audit_backlog_limit=%d old=%d by auid %u",
-                  audit_backlog_limit, old);
+                        audit_backlog_limit, old, loginuid);
        return old;
 }
-static int audit_set_enabled(int state)
+static int audit_set_enabled(int state, uid_t loginuid)
 {
        int old          = audit_enabled;
        if (state != 0 && state != 1)
                return -EINVAL;
        audit_enabled = state;
-        audit_log(current->audit_context, "audit_enabled=%d old=%d",
+        audit_log(NULL, "audit_enabled=%d old=%d by auid %u",
-                  audit_enabled, old);
+                  audit_enabled, old, loginuid);
        return old;
 }
-static int audit_set_failure(int state)
+static int audit_set_failure(int state, uid_t loginuid)
 {
        int old          = audit_failure;
        if (state != AUDIT_FAIL_SILENT
@@ -276,8 +275,8 @@ static int audit_set_failure(int state)
            && state != AUDIT_FAIL_PANIC)
                return -EINVAL;
        audit_failure = state;
-        audit_log(current->audit_context, "audit_failure=%d old=%d",
+        audit_log(NULL, "audit_failure=%d old=%d by auid %u",
-                  audit_failure, old);
+                  audit_failure, old, loginuid);
        return old;
 }
@@ -344,6 +343,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
        int                     err;
        struct audit_buffer     *ab;
        u16                     msg_type = nlh->nlmsg_type;
+        uid_t                   loginuid; /* loginuid of sender */
        err = audit_netlink_ok(NETLINK_CB(skb).eff_cap, msg_type);
        if (err)
@@ -351,6 +351,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
        pid  = NETLINK_CREDS(skb)->pid;
        uid  = NETLINK_CREDS(skb)->uid;
+        loginuid = NETLINK_CB(skb).loginuid;
        seq  = nlh->nlmsg_seq;
        data = NLMSG_DATA(nlh);
@@ -371,34 +372,36 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
                        return -EINVAL;
                status_get   = (struct audit_status *)data;
                if (status_get->mask & AUDIT_STATUS_ENABLED) {
-                        err = audit_set_enabled(status_get->enabled);
+                        err = audit_set_enabled(status_get->enabled, loginuid);
                        if (err < 0) return err;
                }
                if (status_get->mask & AUDIT_STATUS_FAILURE) {
-                        err = audit_set_failure(status_get->failure);
+                        err = audit_set_failure(status_get->failure, loginuid);
                        if (err < 0) return err;
                }
                if (status_get->mask & AUDIT_STATUS_PID) {
                        int old   = audit_pid;
                        audit_pid = status_get->pid;
-                        audit_log(current->audit_context,
+                        audit_log(NULL, "audit_pid=%d old=%d by auid %u",
-                                  "audit_pid=%d old=%d", audit_pid, old);
+                                  audit_pid, old, loginuid);
                }
                if (status_get->mask & AUDIT_STATUS_RATE_LIMIT)
-                        audit_set_rate_limit(status_get->rate_limit);
+                        audit_set_rate_limit(status_get->rate_limit, loginuid);
                if (status_get->mask & AUDIT_STATUS_BACKLOG_LIMIT)
-                        audit_set_backlog_limit(status_get->backlog_limit);
+                        audit_set_backlog_limit(status_get->backlog_limit,
+                                                        loginuid);
                break;
        case AUDIT_USER:
                ab = audit_log_start(NULL);
                if (!ab)
                        break;  /* audit_panic has been called */
                audit_log_format(ab,
-                                 "user pid=%d uid=%d length=%d msg='%.1024s'",
+                                 "user pid=%d uid=%d length=%d loginuid=%u"
+                                 " msg='%.1024s'",
                                 pid, uid,
                                 (int)(nlh->nlmsg_len
                                       - ((char *)data - (char *)nlh)),
-                                 (char *)data);
+                                 loginuid, (char *)data);
                ab->type = AUDIT_USER;
                ab->pid  = pid;
                audit_log_end(ab);
@@ -411,7 +414,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
        case AUDIT_LIST:
 #ifdef CONFIG_AUDITSYSCALL
                err = audit_receive_filter(nlh->nlmsg_type, NETLINK_CB(skb).pid,
-                                           uid, seq, data);
+                                           uid, seq, data, loginuid);
 #else
                err = -EOPNOTSUPP;
 #endif
@@ -480,7 +483,7 @@ static void audit_log_move(struct audit_buffer *ab)
        if (ab->len == 0)
                return;
-        skb = skb_peek(&ab->sklist);
+        skb = skb_peek_tail(&ab->sklist);
        if (!skb || skb_tailroom(skb) <= ab->len + extra) {
                skb = alloc_skb(2 * ab->len + extra, GFP_ATOMIC);
                if (!skb) {
@@ -519,9 +522,9 @@ static inline int audit_log_drain(struct audit_buffer *ab)
                        retval = netlink_unicast(audit_sock, skb, audit_pid,
                                                 MSG_DONTWAIT);
                }
-                if (retval == -EAGAIN && ab->count < 5) {
+                if (retval == -EAGAIN &&
-                        ++ab->count;
+                    (atomic_read(&audit_backlog)) < audit_backlog_limit) {
-                        skb_queue_tail(&ab->sklist, skb);
+                        skb_queue_head(&ab->sklist, skb);
                        audit_log_end_irq(ab);
                        return 1;
                }
@@ -537,8 +540,8 @@ static inline int audit_log_drain(struct audit_buffer *ab)
                if (!audit_pid) { /* No daemon */
                        int offset = ab->nlh ? NLMSG_SPACE(0) : 0;
                        int len    = skb->len - offset;
-                        printk(KERN_ERR "%*.*s\n",
+                        skb->data[offset + len] = '\0';
-                               len, len, skb->data + offset);
+                        printk(KERN_ERR "%s\n", skb->data + offset);
                }
                kfree_skb(skb);
                ab->nlh = NULL;
@@ -617,7 +620,7 @@ struct audit_buffer *audit_log_start(struct audit_context *ctx)
        struct audit_buffer     *ab     = NULL;
        unsigned long           flags;
        struct timespec         t;
-        int                     serial  = 0;
+        unsigned int            serial;
        if (!audit_initialized)
                return NULL;
@@ -659,15 +662,16 @@ struct audit_buffer *audit_log_start(struct audit_context *ctx)
        ab->total = 0;
        ab->type  = AUDIT_KERNEL;
        ab->pid   = 0;
-        ab->count = 0;
 #ifdef CONFIG_AUDITSYSCALL
        if (ab->ctx)
                audit_get_stamp(ab->ctx, &t, &serial);
        else
 #endif
+        {
                t = CURRENT_TIME;
+                serial = 0;
+        }
        audit_log_format(ab, "audit(%lu.%03lu:%u): ",
                         t.tv_sec, t.tv_nsec/1000000, serial);
        return ab;
@@ -717,6 +721,29 @@ void audit_log_format(struct audit_buffer *ab, const char *fmt, ...)
        va_end(args);
 }
+/*
+ * Append len bytes from buf to the audit buffer as hexadecimal text:
+ * two lower-case digits per byte ("%02x"), with no separators.
+ *
+ * ab:  audit buffer the digits are appended to
+ * buf: bytes to encode
+ * len: number of bytes to read from buf
+ */
+void audit_log_hex(struct audit_buffer *ab, const unsigned char *buf, size_t len)
+{
+        size_t i;       /* same type as len: no signed/unsigned comparison */
+        for (i = 0; i < len; i++)
+                audit_log_format(ab, "%02x", buf[i]);
+}
+/*
+ * Log a string whose contents are not trusted.
+ *
+ * If every byte is printable ASCII other than space and double quote,
+ * the string is logged verbatim between double quotes.  Otherwise the
+ * entire string is logged unquoted as hex via audit_log_hex(), so that
+ * embedded quotes, blanks or control bytes cannot be mistaken for record
+ * syntax.
+ *
+ * ab:     audit buffer to append to
+ * string: NUL-terminated string to log
+ */
+void audit_log_untrustedstring(struct audit_buffer *ab, const char *string)
+{
+        const unsigned char *p = (const unsigned char *)string;
+        while (*p) {
+                /* 0x7f (DEL) is a control character as well, so the
+                 * printable range ends at 0x7e. */
+                if (*p == '"' || *p == ' ' || *p < 0x20 || *p > 0x7e) {
+                        audit_log_hex(ab, (const unsigned char *)string,
+                                      strlen(string));
+                        return;
+                }
+                p++;
+        }
+        audit_log_format(ab, "\"%s\"", string);
+}
 /* This is a helper-function to print the d_path without using a static
 * buffer or allocating another buffer in addition to the one in
 * audit_buffer. */
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 6f1931381b..37b3ac94bc 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -1,4 +1,4 @@
-/* auditsc.c -- System-call auditing support -*- linux-c -*-
+/* auditsc.c -- System-call auditing support
 * Handles all system-call specific auditing features.
 *
 * Copyright 2003-2004 Red Hat Inc., Durham, North Carolina.
@@ -123,7 +123,7 @@ struct audit_context {
        int                 major;      /* syscall number */
        unsigned long       argv[4];    /* syscall arguments */
        int                 return_valid; /* return code is valid */
-        int                 return_code;/* syscall return code */
+        long                return_code;/* syscall return code */
        int                 auditable;  /* 1 if record should be written */
        int                 name_count;
        struct audit_names  names[AUDIT_NAMES];
@@ -135,6 +135,7 @@ struct audit_context {
        uid_t               uid, euid, suid, fsuid;
        gid_t               gid, egid, sgid, fsgid;
        unsigned long       personality;
+        int                 arch;
 #if AUDIT_DEBUG
        int                 put_count;
@@ -250,7 +251,8 @@ static int audit_copy_rule(struct audit_rule *d, struct audit_rule *s)
        return 0;
 }
-int audit_receive_filter(int type, int pid, int uid, int seq, void *data)
+int audit_receive_filter(int type, int pid, int uid, int seq, void *data,
+                                                        uid_t loginuid)
 {
        u32                flags;
        struct audit_entry *entry;
@@ -285,6 +287,7 @@ int audit_receive_filter(int type, int pid, int uid, int seq, void *data)
                        err = audit_add_rule(entry, &audit_entlist);
                if (!err && (flags & AUDIT_AT_EXIT))
                        err = audit_add_rule(entry, &audit_extlist);
+                audit_log(NULL, "auid %u added an audit rule\n", loginuid);
                break;
        case AUDIT_DEL:
                flags =((struct audit_rule *)data)->flags;
@@ -294,6 +297,7 @@ int audit_receive_filter(int type, int pid, int uid, int seq, void *data)
                        err = audit_del_rule(data, &audit_entlist);
                if (!err && (flags & AUDIT_AT_EXIT))
                        err = audit_del_rule(data, &audit_extlist);
+                audit_log(NULL, "auid %u removed an audit rule\n", loginuid);
                break;
        default:
                return -EINVAL;
@@ -348,6 +352,10 @@ static int audit_filter_rules(struct task_struct *tsk,
                case AUDIT_PERS:
                        result = (tsk->personality == value);
                        break;
+                case AUDIT_ARCH:
+                        if (ctx) 
+                                result = (ctx->arch == value);
+                        break;
                case AUDIT_EXIT:
                        if (ctx && ctx->return_valid)
@@ -355,7 +363,7 @@ static int audit_filter_rules(struct task_struct *tsk,
                        break;
                case AUDIT_SUCCESS:
                        if (ctx && ctx->return_valid)
-                                result = (ctx->return_code >= 0);
+                                result = (ctx->return_valid == AUDITSC_SUCCESS);
                        break;
                case AUDIT_DEVMAJOR:
                        if (ctx) {
@@ -648,8 +656,11 @@ static void audit_log_exit(struct audit_context *context)
        audit_log_format(ab, "syscall=%d", context->major);
        if (context->personality != PER_LINUX)
                audit_log_format(ab, " per=%lx", context->personality);
+        audit_log_format(ab, " arch=%x", context->arch);
        if (context->return_valid)
-                audit_log_format(ab, " exit=%d", context->return_code);
+                audit_log_format(ab, " success=%s exit=%ld", 
+                                 (context->return_valid==AUDITSC_SUCCESS)?"yes":"no",
+                                 context->return_code);
        audit_log_format(ab,
                  " a0=%lx a1=%lx a2=%lx a3=%lx items=%d"
                  " pid=%d loginuid=%d uid=%d gid=%d"
@@ -696,9 +707,10 @@ static void audit_log_exit(struct audit_context *context)
                if (!ab)
                        continue; /* audit_panic has been called */
                audit_log_format(ab, "item=%d", i);
-                if (context->names[i].name)
+                if (context->names[i].name) {
-                        audit_log_format(ab, " name=%s",
+                        audit_log_format(ab, " name=");
-                                         context->names[i].name);
+                        audit_log_untrustedstring(ab, context->names[i].name);
+                }
                if (context->names[i].ino != (unsigned long)-1)
                        audit_log_format(ab, " inode=%lu dev=%02x:%02x mode=%#o"
                                             " uid=%d gid=%d rdev=%02x:%02x",
@@ -772,7 +784,7 @@ static inline unsigned int audit_serial(void)
 * then the record will be written at syscall exit time (otherwise, it
 * will only be written if another part of the kernel requests that it
 * be written). */
-void audit_syscall_entry(struct task_struct *tsk, int major,
+void audit_syscall_entry(struct task_struct *tsk, int arch, int major,
                         unsigned long a1, unsigned long a2,
                         unsigned long a3, unsigned long a4)
 {
@@ -826,6 +838,7 @@ void audit_syscall_entry(struct task_struct *tsk, int major,
        if (!audit_enabled)
                return;
+        context->arch       = arch;
        context->major      = major;
        context->argv[0]    = a1;
        context->argv[1]    = a2;
@@ -849,13 +862,13 @@ void audit_syscall_entry(struct task_struct *tsk, int major,
 * filtering, or because some other part of the kernel wrote an audit
 * message), then write out the syscall information.  In all cases,
 * free the names stored from getname(). */
-void audit_syscall_exit(struct task_struct *tsk, int return_code)
+void audit_syscall_exit(struct task_struct *tsk, int valid, long return_code)
 {
        struct audit_context *context;
        get_task_struct(tsk);
        task_lock(tsk);
-        context = audit_get_context(tsk, 1, return_code);
+        context = audit_get_context(tsk, valid, return_code);
        task_unlock(tsk);
        /* Not having a context here is ok, since the parent may have
@@ -868,6 +881,7 @@ void audit_syscall_exit(struct task_struct *tsk, int return_code)
        context->in_syscall = 0;
        context->auditable  = 0;
        if (context->previous) {
                struct audit_context *new_context = context->previous;
                context->previous  = NULL;
@@ -981,7 +995,7 @@ void audit_inode(const char *name, const struct inode *inode)
 }
 void audit_get_stamp(struct audit_context *ctx,
-                     struct timespec *t, int *serial)
+                     struct timespec *t, unsigned int *serial)
 {
        if (ctx) {
                t->tv_sec  = ctx->ctime.tv_sec;
@@ -996,20 +1010,21 @@ void audit_get_stamp(struct audit_context *ctx,
 extern int audit_set_type(struct audit_buffer *ab, int type);
-int audit_set_loginuid(struct audit_context *ctx, uid_t loginuid)
+int audit_set_loginuid(struct task_struct *task, uid_t loginuid)
 {
-        if (ctx) {
+        if (task->audit_context) {
                struct audit_buffer *ab;
                ab = audit_log_start(NULL);
                if (ab) {
                        audit_log_format(ab, "login pid=%d uid=%u "
                                "old loginuid=%u new loginuid=%u",
-                                ctx->pid, ctx->uid, ctx->loginuid, loginuid);
+                                task->pid, task->uid, 
+                                task->audit_context->loginuid, loginuid);
                        audit_set_type(ab, AUDIT_LOGIN);
                        audit_log_end(ab);
                }
-                ctx->loginuid = loginuid;
+                task->audit_context->loginuid = loginuid;
        }
        return 0;
 }
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 961d74044d..00e8f25755 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -166,9 +166,8 @@ static struct super_block *cpuset_sb = NULL;
 * The hooks from fork and exit, cpuset_fork() and cpuset_exit(), don't
 * (usually) grab cpuset_sem.  These are the two most performance
 * critical pieces of code here.  The exception occurs on exit(),
- * if the last task using a cpuset exits, and the cpuset was marked
+ * when a task in a notify_on_release cpuset exits.  Then cpuset_sem
- * notify_on_release.  In that case, the cpuset_sem is taken, the
+ * is taken, and if the cpuset count is zero, a usermode call made
- * path to the released cpuset calculated, and a usermode call made
 * to /sbin/cpuset_release_agent with the name of the cpuset (path
 * relative to the root of cpuset file system) as the argument.
 *
@@ -1404,6 +1403,18 @@ void cpuset_fork(struct task_struct *tsk)
 *
 * Description: Detach cpuset from @tsk and release it.
 *
+ * Note that cpusets marked notify_on_release force every task
+ * in them to take the global cpuset_sem semaphore when exiting.
+ * This could impact scaling on very large systems.  Be reluctant
+ * to use notify_on_release cpusets where very high task exit
+ * scaling is required on large systems.
+ *
+ * Don't even think about dereferencing 'cs' after the cpuset use
+ * count goes to zero, except inside a critical section guarded
+ * by the cpuset_sem semaphore.  If you don't hold cpuset_sem,
+ * then a zero cpuset use count is a license to any other task to
+ * nuke the cpuset immediately.
+ *
 **/
 void cpuset_exit(struct task_struct *tsk)
@@ -1415,10 +1426,13 @@ void cpuset_exit(struct task_struct *tsk)
        tsk->cpuset = NULL;
        task_unlock(tsk);
-        if (atomic_dec_and_test(&cs->count)) {
+        if (notify_on_release(cs)) {
                down(&cpuset_sem);
-                check_for_release(cs);
+                if (atomic_dec_and_test(&cs->count))
+                        check_for_release(cs);
                up(&cpuset_sem);
+        } else {
+                atomic_dec(&cs->count);
        }
 }
diff --git a/kernel/exit.c b/kernel/exit.c
index 7be283d989..edaa50b5bb 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -846,6 +846,8 @@ fastcall NORET_TYPE void do_exit(long code)
        for (;;) ;
 }
+EXPORT_SYMBOL_GPL(do_exit);
 NORET_TYPE void complete_and_exit(struct completion *comp, long code)
 {
        if (comp)
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index 2fb0e46e11..436c7d93c0 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -30,6 +30,7 @@
 */
 irq_desc_t irq_desc[NR_IRQS] __cacheline_aligned = {
        [0 ... NR_IRQS-1] = {
+                .status = IRQ_DISABLED,
                .handler = &no_irq_type,
                .lock = SPIN_LOCK_UNLOCKED
        }
@@ -118,8 +119,6 @@ fastcall unsigned int __do_IRQ(unsigned int irq, struct pt_regs *regs)
                 */
                desc->handler->ack(irq);
                action_ret = handle_IRQ_event(irq, regs, desc->action);
-                if (!noirqdebug)
-                        note_interrupt(irq, desc, action_ret);
                desc->handler->end(irq);
                return 1;
        }
diff --git a/kernel/itimer.c b/kernel/itimer.c
index e9a40e947e..1dc988e0d2 100644
--- a/kernel/itimer.c
+++ b/kernel/itimer.c
@@ -123,7 +123,11 @@ static inline void it_real_arm(struct task_struct *p, unsigned long interval)
                return;
        if (interval > (unsigned long) LONG_MAX)
                interval = LONG_MAX;
-        p->signal->real_timer.expires = jiffies + interval;
+        /* the "+ 1" below makes sure that the timer doesn't go off before
+         * the interval requested. This could happen if
+         * time requested % (usecs per jiffy) is more than the usecs left
+         * in the current jiffy */
+        p->signal->real_timer.expires = jiffies + interval + 1;
        add_timer(&p->signal->real_timer);
 }
diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c
index 1627f8d6e0..13bcec151b 100644
--- a/kernel/kallsyms.c
+++ b/kernel/kallsyms.c
@@ -46,6 +46,14 @@ static inline int is_kernel_inittext(unsigned long addr)
        return 0;
 }
+/* Return 1 when addr lies in [_sextratext, _eextratext], otherwise 0. */
+static inline int is_kernel_extratext(unsigned long addr)
+{
+        unsigned long first = (unsigned long)_sextratext;
+        unsigned long last = (unsigned long)_eextratext;
+        return (first <= addr && addr <= last) ? 1 : 0;
+}
 static inline int is_kernel_text(unsigned long addr)
 {
        if (addr >= (unsigned long)_stext && addr <= (unsigned long)_etext)
@@ -169,8 +177,9 @@ const char *kallsyms_lookup(unsigned long addr,
        namebuf[0] = 0;
        if ((all_var && is_kernel(addr)) ||
-            (!all_var && (is_kernel_text(addr) || is_kernel_inittext(addr)))) {
+            (!all_var && (is_kernel_text(addr) || is_kernel_inittext(addr) ||
-                unsigned long symbol_end=0;
+                                is_kernel_extratext(addr)))) {
+                unsigned long symbol_end = 0;
                /* do a binary search on the sorted kallsyms_addresses array */
                low = 0;
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 1d5dd1337b..037142b72a 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -44,6 +44,7 @@ static struct hlist_head kprobe_table[KPROBE_TABLE_SIZE];
 unsigned int kprobe_cpu = NR_CPUS;
 static DEFINE_SPINLOCK(kprobe_lock);
+static struct kprobe *curr_kprobe;
 /* Locks kprobe: irqs must be disabled */
 void lock_kprobes(void)
@@ -73,22 +74,139 @@ struct kprobe *get_kprobe(void *addr)
        return NULL;
 }
+/*
+ * Aggregate handlers: used when several kprobes share one address.
+ * Each one dispatches to the individual kprobe handlers linked on
+ * p->list.
+ *
+ * aggr_pre_handler runs every member's pre_handler in list order,
+ * recording the member being run in curr_kprobe for the duration of
+ * the call.  Member return values are ignored; always returns 0.
+ */
+int aggr_pre_handler(struct kprobe *p, struct pt_regs *regs)
+{
+        struct kprobe *member;
+        list_for_each_entry(member, &p->list, list) {
+                if (!member->pre_handler)
+                        continue;
+                curr_kprobe = member;
+                member->pre_handler(member, regs);
+                curr_kprobe = NULL;
+        }
+        return 0;
+}
+/*
+ * Run the post_handler of every kprobe linked on p->list, in list
+ * order, recording the member being run in curr_kprobe for the
+ * duration of each call.
+ */
+void aggr_post_handler(struct kprobe *p, struct pt_regs *regs,
+                unsigned long flags)
+{
+        struct kprobe *member;
+        list_for_each_entry(member, &p->list, list) {
+                if (!member->post_handler)
+                        continue;
+                curr_kprobe = member;
+                member->post_handler(member, regs, flags);
+                curr_kprobe = NULL;
+        }
+}
+/*
+ * Fault handler for an aggregate kprobe.  A fault is only forwarded
+ * when it happened while one of the member handlers was executing
+ * (curr_kprobe is set) and that member supplies a fault_handler.
+ * Returns 1 if that handler returned non-zero, otherwise 0.
+ */
+int aggr_fault_handler(struct kprobe *p, struct pt_regs *regs, int trapnr)
+{
+        if (!curr_kprobe || !curr_kprobe->fault_handler)
+                return 0;
+        return curr_kprobe->fault_handler(curr_kprobe, regs, trapnr) ? 1 : 0;
+}
+/*
+ * Fill in the required fields of the "manager kprobe". Replace the
+ * earlier kprobe in the hlist with the manager kprobe
+ *
+ * ap: the manager (aggregate) kprobe being set up
+ * p:  the kprobe currently in the hash table for this address; it
+ *     becomes the first entry on ap->list
+ */
+static inline void add_aggr_kprobe(struct kprobe *ap, struct kprobe *p)
+{
+        /* The manager mirrors the probe point of the kprobe it replaces. */
+        ap->addr = p->addr;
+        ap->opcode = p->opcode;
+        memcpy(&ap->ainsn, &p->ainsn, sizeof(struct arch_specific_insn));
+        /* The manager's handlers fan out to the kprobes on ap->list. */
+        ap->pre_handler = aggr_pre_handler;
+        ap->post_handler = aggr_post_handler;
+        ap->fault_handler = aggr_fault_handler;
+        INIT_LIST_HEAD(&ap->list);
+        list_add(&p->list, &ap->list);
+        /* Swap p for ap in the hash bucket selected by the probe address. */
+        INIT_HLIST_NODE(&ap->hlist);
+        hlist_del(&p->hlist);
+        hlist_add_head(&ap->hlist,
+                &kprobe_table[hash_ptr(ap->addr, KPROBE_HASH_BITS)]);
+}
+/*
+ * Register p at an address that already has a kprobe (old_p).
+ *
+ * Returns -EEXIST if either probe has a break_handler (kprobe and
+ * jprobe can't (yet) coexist), -ENOMEM if a manager kprobe is needed
+ * but cannot be allocated, and 0 on success.
+ * TODO: Move kcalloc outside the spinlock
+ */
+static int register_aggr_kprobe(struct kprobe *old_p, struct kprobe *p)
+{
+        struct kprobe *ap;
+        if (old_p->break_handler || p->break_handler)
+                return -EEXIST;
+        /* old_p is already a manager: just link the newcomer onto it. */
+        if (old_p->pre_handler == aggr_pre_handler) {
+                list_add(&p->list, &old_p->list);
+                return 0;
+        }
+        /* First collision at this address: create a manager for both. */
+        ap = kcalloc(1, sizeof(struct kprobe), GFP_ATOMIC);
+        if (!ap)
+                return -ENOMEM;
+        add_aggr_kprobe(ap, old_p);
+        list_add(&p->list, &ap->list);
+        return 0;
+}
+/* kprobe removal house-keeping routines */
+/*
+ * Remove one kprobe: write p->opcode back to the probed address,
+ * unlink p from its hash list and flush the icache for that
+ * instruction.  kprobe_lock is released here (restoring flags) before
+ * arch_remove_kprobe() is called, so the function returns with the
+ * lock dropped.
+ */
+static inline void cleanup_kprobe(struct kprobe *p, unsigned long flags)
+{
+        *p->addr = p->opcode;
+        hlist_del(&p->hlist);
+        flush_icache_range((unsigned long) p->addr,
+                   (unsigned long) p->addr + sizeof(kprobe_opcode_t));
+        spin_unlock_irqrestore(&kprobe_lock, flags);
+        arch_remove_kprobe(p);
+}
+/*
+ * Unlink p from the manager kprobe old_p.  If other kprobes remain on
+ * the manager's list, only kprobe_lock is released.  If p was the last
+ * one, the manager itself is torn down with cleanup_kprobe() (which
+ * releases the lock) and then freed.
+ */
+static inline void cleanup_aggr_kprobe(struct kprobe *old_p,
+                struct kprobe *p, unsigned long flags)
+{
+        list_del(&p->list);
+        if (!list_empty(&old_p->list)) {
+                spin_unlock_irqrestore(&kprobe_lock, flags);
+                return;
+        }
+        cleanup_kprobe(old_p, flags);
+        kfree(old_p);
+}
 int register_kprobe(struct kprobe *p)
 {
        int ret = 0;
        unsigned long flags = 0;
+        struct kprobe *old_p;
        if ((ret = arch_prepare_kprobe(p)) != 0) {
                goto rm_kprobe;
        }
        spin_lock_irqsave(&kprobe_lock, flags);
-        INIT_HLIST_NODE(&p->hlist);
+        old_p = get_kprobe(p->addr);
-        if (get_kprobe(p->addr)) {
+        if (old_p) {
-                ret = -EEXIST;
+                ret = register_aggr_kprobe(old_p, p);
                goto out;
        }
-        arch_copy_kprobe(p);
+        arch_copy_kprobe(p);
+        INIT_HLIST_NODE(&p->hlist);
        hlist_add_head(&p->hlist,
                       &kprobe_table[hash_ptr(p->addr, KPROBE_HASH_BITS)]);
@@ -107,13 +225,17 @@ rm_kprobe:
 void unregister_kprobe(struct kprobe *p)
 {
        unsigned long flags;
-        arch_remove_kprobe(p);
+        struct kprobe *old_p;
        spin_lock_irqsave(&kprobe_lock, flags);
-        *p->addr = p->opcode;
+        old_p = get_kprobe(p->addr);
-        hlist_del(&p->hlist);
+        if (old_p) {
-        flush_icache_range((unsigned long) p->addr,
+                if (old_p->pre_handler == aggr_pre_handler)
-                           (unsigned long) p->addr + sizeof(kprobe_opcode_t));
+                        cleanup_aggr_kprobe(old_p, p, flags);
-        spin_unlock_irqrestore(&kprobe_lock, flags);
+                else
+                        cleanup_kprobe(p, flags);
+        } else
+                spin_unlock_irqrestore(&kprobe_lock, flags);
 }
 static struct notifier_block kprobe_exceptions_nb = {
diff --git a/kernel/module.c b/kernel/module.c
index 5734ab09d3..83b3d37670 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -1758,6 +1758,7 @@ sys_init_module(void __user *umod,
                const char __user *uargs)
 {
        struct module *mod;
+        mm_segment_t old_fs = get_fs();
        int ret = 0;
        /* Must have permission */
@@ -1775,6 +1776,9 @@ sys_init_module(void __user *umod,
                return PTR_ERR(mod);
        }
+        /* flush the icache in correct context */
+        set_fs(KERNEL_DS);
        /* Flush the instruction cache, since we've played with text */
        if (mod->module_init)
                flush_icache_range((unsigned long)mod->module_init,
@@ -1783,6 +1787,8 @@ sys_init_module(void __user *umod,
        flush_icache_range((unsigned long)mod->module_core,
                           (unsigned long)mod->module_core + mod->core_size);
+        set_fs(old_fs);
        /* Now sew it into the lists.  They won't access us, since
           strong_try_module_get() will fail. */
        stop_machine_run(__link_module, mod, NR_CPUS);
diff --git a/kernel/power/main.c b/kernel/power/main.c
index 7960ddf04a..4cdebc972f 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -156,14 +156,14 @@ static int enter_state(suspend_state_t state)
                goto Unlock;
        }
-        pr_debug("PM: Preparing system for suspend\n");
+        pr_debug("PM: Preparing system for %s sleep\n", pm_states[state]);
        if ((error = suspend_prepare(state)))
                goto Unlock;
-        pr_debug("PM: Entering state.\n");
+        pr_debug("PM: Entering %s sleep\n", pm_states[state]);
        error = suspend_enter(state);
-        pr_debug("PM: Finishing up.\n");
+        pr_debug("PM: Finishing wakeup.\n");
        suspend_finish(state);
 Unlock:
        up(&pm_sem);
diff --git a/kernel/printk.c b/kernel/printk.c
index 290a07ce2c..01b58d7d17 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -160,42 +160,6 @@ static int __init console_setup(char *str)
 __setup("console=", console_setup);
-/**
- * add_preferred_console - add a device to the list of preferred consoles.
- *
- * The last preferred console added will be used for kernel messages
- * and stdin/out/err for init.  Normally this is used by console_setup
- * above to handle user-supplied console arguments; however it can also
- * be used by arch-specific code either to override the user or more
- * commonly to provide a default console (ie from PROM variables) when
- * the user has not supplied one.
- */
-int __init add_preferred_console(char *name, int idx, char *options)
-{
-        struct console_cmdline *c;
-        int i;
-        /*
-         *      See if this tty is not yet registered, and
-         *      if we have a slot free.
-         */
-        for(i = 0; i < MAX_CMDLINECONSOLES && console_cmdline[i].name[0]; i++)
-                if (strcmp(console_cmdline[i].name, name) == 0 &&
-                          console_cmdline[i].index == idx) {
-                                selected_console = i;
-                                return 0;
-                }
-        if (i == MAX_CMDLINECONSOLES)
-                return -E2BIG;
-        selected_console = i;
-        c = &console_cmdline[i];
-        memcpy(c->name, name, sizeof(c->name));
-        c->name[sizeof(c->name) - 1] = 0;
-        c->options = options;
-        c->index = idx;
-        return 0;
-}
 static int __init log_buf_len_setup(char *str)
 {
        unsigned long size = memparse(str, &str);
@@ -671,6 +635,42 @@ static void call_console_drivers(unsigned long start, unsigned long end) {}
 #endif
 /**
+ * add_preferred_console - add a device to the list of preferred consoles.
+ *
+ * The last preferred console added will be used for kernel messages
+ * and stdin/out/err for init.  Normally this is used by console_setup
+ * above to handle user-supplied console arguments; however it can also
+ * be used by arch-specific code either to override the user or more
+ * commonly to provide a default console (ie from PROM variables) when
+ * the user has not supplied one.
+ */
+int __init add_preferred_console(char *name, int idx, char *options)
+{
+        struct console_cmdline *c;
+        int i;
+        /*
+         *      See if this tty is not yet registered, and
+         *      if we have a slot free.
+         */
+        for(i = 0; i < MAX_CMDLINECONSOLES && console_cmdline[i].name[0]; i++)
+                if (strcmp(console_cmdline[i].name, name) == 0 &&
+                          console_cmdline[i].index == idx) {
+                                selected_console = i;
+                                return 0;
+                }
+        if (i == MAX_CMDLINECONSOLES)
+                return -E2BIG;
+        selected_console = i;
+        c = &console_cmdline[i];
+        memcpy(c->name, name, sizeof(c->name));
+        c->name[sizeof(c->name) - 1] = 0;
+        c->options = options;
+        c->index = idx;
+        return 0;
+}
+/**
 * acquire_console_sem - lock the console system for exclusive use.
 *
 * Acquires a semaphore which guarantees that the caller has
diff --git a/kernel/profile.c b/kernel/profile.c
index 0221a50ca8..ad8cbb75ff 100644
--- a/kernel/profile.c
+++ b/kernel/profile.c
@@ -49,15 +49,19 @@ static DECLARE_MUTEX(profile_flip_mutex);
 static int __init profile_setup(char * str)
 {
+        static char __initdata schedstr[] = "schedule";
        int par;
-        if (!strncmp(str, "schedule", 8)) {
+        if (!strncmp(str, schedstr, strlen(schedstr))) {
                prof_on = SCHED_PROFILING;
-                printk(KERN_INFO "kernel schedule profiling enabled\n");
+                if (str[strlen(schedstr)] == ',')
-                if (str[7] == ',')
+                        str += strlen(schedstr) + 1;
-                        str += 8;
+                if (get_option(&str, &par))
-        }
+                        prof_shift = par;
-        if (get_option(&str,&par)) {
+                printk(KERN_INFO
+                        "kernel schedule profiling enabled (shift: %ld)\n",
+                        prof_shift);
+        } else if (get_option(&str, &par)) {
                prof_shift = par;
                prof_on = CPU_PROFILING;
                printk(KERN_INFO "kernel profiling enabled (shift: %ld)\n",
diff --git a/kernel/sched.c b/kernel/sched.c
index 0dc3158667..f12a0c8a7d 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -3755,19 +3755,22 @@ EXPORT_SYMBOL(cond_resched);
 */
 int cond_resched_lock(spinlock_t * lock)
 {
+        int ret = 0;
        if (need_lockbreak(lock)) {
                spin_unlock(lock);
                cpu_relax();
+                ret = 1;
                spin_lock(lock);
        }
        if (need_resched()) {
                _raw_spin_unlock(lock);
                preempt_enable_no_resched();
                __cond_resched();
+                ret = 1;
                spin_lock(lock);
-                return 1;
        }
-        return 0;
+        return ret;
 }
 EXPORT_SYMBOL(cond_resched_lock);
@@ -4243,7 +4246,7 @@ static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *tsk)
        /* No more Mr. Nice Guy. */
        if (dest_cpu == NR_CPUS) {
-                tsk->cpus_allowed = cpuset_cpus_allowed(tsk);
+                cpus_setall(tsk->cpus_allowed);
                dest_cpu = any_online_cpu(tsk->cpus_allowed);
                /*
diff --git a/kernel/signal.c b/kernel/signal.c
index 8f3debc77c..b3c24c732c 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -522,7 +522,16 @@ static int __dequeue_signal(struct sigpending *pending, sigset_t *mask,
 {
        int sig = 0;
-        sig = next_signal(pending, mask);
+        /* SIGKILL must have priority, otherwise it is quite easy
+         * to create an unkillable process, sending sig < SIGKILL
+         * to self */
+        if (unlikely(sigismember(&pending->signal, SIGKILL))) {
+                if (!sigismember(mask, SIGKILL))
+                        sig = SIGKILL;
+        }
+        if (likely(!sig))
+                sig = next_signal(pending, mask);
        if (sig) {
                if (current->notifier) {
                        if (sigismember(current->notifier_mask, sig)) {
diff --git a/kernel/spinlock.c b/kernel/spinlock.c
index e15ed17863..0c3f9d8bbe 100644
--- a/kernel/spinlock.c
+++ b/kernel/spinlock.c
@@ -294,7 +294,7 @@ EXPORT_SYMBOL(_spin_unlock_irq);
 void __lockfunc _spin_unlock_bh(spinlock_t *lock)
 {
        _raw_spin_unlock(lock);
-        preempt_enable();
+        preempt_enable_no_resched();
        local_bh_enable();
 }
 EXPORT_SYMBOL(_spin_unlock_bh);
@@ -318,7 +318,7 @@ EXPORT_SYMBOL(_read_unlock_irq);
 void __lockfunc _read_unlock_bh(rwlock_t *lock)
 {
        _raw_read_unlock(lock);
-        preempt_enable();
+        preempt_enable_no_resched();
        local_bh_enable();
 }
 EXPORT_SYMBOL(_read_unlock_bh);
@@ -342,7 +342,7 @@ EXPORT_SYMBOL(_write_unlock_irq);
 void __lockfunc _write_unlock_bh(rwlock_t *lock)
 {
        _raw_write_unlock(lock);
-        preempt_enable();
+        preempt_enable_no_resched();
        local_bh_enable();
 }
 EXPORT_SYMBOL(_write_unlock_bh);
@@ -354,7 +354,7 @@ int __lockfunc _spin_trylock_bh(spinlock_t *lock)
        if (_raw_spin_trylock(lock))
                return 1;
-        preempt_enable();
+        preempt_enable_no_resched();
        local_bh_enable();
        return 0;
 }
diff --git a/kernel/sys.c b/kernel/sys.c
index f64e97cabe..f006632c2b 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -1195,7 +1195,7 @@ static int groups_from_user(struct group_info *group_info,
        return 0;
 }
-/* a simple shell-metzner sort */
+/* a simple Shell sort */
 static void groups_sort(struct group_info *group_info)
 {
        int base, max, stride;
author	Tony Luck <tony.luck@intel.com>	2005-06-15 17:06:48 -0400
committer	Tony Luck <tony.luck@intel.com>	2005-06-15 17:06:48 -0400
commit	f2cbb4f01936a3e4225692e03b084b78c56d386d (patch)
tree	f89f3d8baa250589a38a4dd2df56f84cddae3c76 /kernel
parent	325a479c4c110db278ef3361460a48c4093252cc (diff)
parent	1016888fb69662936b32ab767c7419a3be9a69d3 (diff)