path: root/kernel
author     Steve French <sfrench@us.ibm.com>   2008-05-06 13:55:32 -0400
committer  Steve French <sfrench@us.ibm.com>   2008-05-06 13:55:32 -0400
commit     a815752ac0ffdb910e92958d41d28f4fb28e5296 (patch)
tree       a3aa16a282354da0debe8e3a3a7ed8aac6e54001 /kernel
parent     5ade9deaaa3e1f7291467d97b238648e43eae15e (diff)
parent     a15306365a16380f3bafee9e181ba01231d4acd7 (diff)
Merge branch 'master' of /pub/scm/linux/kernel/git/torvalds/linux-2.6
Diffstat (limited to 'kernel')
-rw-r--r--  kernel/Makefile              |    4
-rw-r--r--  kernel/audit.c               |  249
-rw-r--r--  kernel/audit.h               |   13
-rw-r--r--  kernel/auditfilter.c         |   55
-rw-r--r--  kernel/auditsc.c             |   40
-rw-r--r--  kernel/bounds.c              |    6
-rw-r--r--  kernel/cgroup.c              |  333
-rw-r--r--  kernel/cgroup_debug.c        |   20
-rw-r--r--  kernel/compat.c              |    6
-rw-r--r--  kernel/configs.c             |    7
-rw-r--r--  kernel/cpu.c                 |   44
-rw-r--r--  kernel/cpuset.c              |  358
-rw-r--r--  kernel/dma.c                 |    7
-rw-r--r--  kernel/exit.c                |  150
-rw-r--r--  kernel/fork.c                |   51
-rw-r--r--  kernel/futex.c               |  193
-rw-r--r--  kernel/hrtimer.c             |  205
-rw-r--r--  kernel/irq/devres.c          |    1
-rw-r--r--  kernel/irq/manage.c          |   50
-rw-r--r--  kernel/irq/spurious.c        |    4
-rw-r--r--  kernel/kallsyms.c            |    6
-rw-r--r--  kernel/kexec.c               |    2
-rw-r--r--  kernel/kgdb.c                |    8
-rw-r--r--  kernel/kmod.c                |    1
-rw-r--r--  kernel/kthread.c             |    4
-rw-r--r--  kernel/latencytop.c          |    9
-rw-r--r--  kernel/lockdep_proc.c        |   16
-rw-r--r--  kernel/marker.c              |    3
-rw-r--r--  kernel/module.c              |  363
-rw-r--r--  kernel/notifier.c            |   38
-rw-r--r--  kernel/ns_cgroup.c           |    2
-rw-r--r--  kernel/nsproxy.c             |   12
-rw-r--r--  kernel/panic.c               |    8
-rw-r--r--  kernel/pid.c                 |   41
-rw-r--r--  kernel/pid_namespace.c       |    4
-rw-r--r--  kernel/posix-cpu-timers.c    |   11
-rw-r--r--  kernel/posix-timers.c        |    6
-rw-r--r--  kernel/power/Kconfig         |   10
-rw-r--r--  kernel/power/Makefile        |    1
-rw-r--r--  kernel/power/pm.c            |  205
-rw-r--r--  kernel/printk.c              |  122
-rw-r--r--  kernel/profile.c             |    4
-rw-r--r--  kernel/ptrace.c              |   22
-rw-r--r--  kernel/rcutorture.c          |    1
-rw-r--r--  kernel/relay.c               |   37
-rw-r--r--  kernel/res_counter.c         |   10
-rw-r--r--  kernel/resource.c            |   10
-rw-r--r--  kernel/sched.c               |  387
-rw-r--r--  kernel/sched_clock.c         |  236
-rw-r--r--  kernel/sched_debug.c         |   16
-rw-r--r--  kernel/sched_fair.c          |   39
-rw-r--r--  kernel/sched_idletask.c      |    2
-rw-r--r--  kernel/sched_rt.c            |    9
-rw-r--r--  kernel/signal.c              |  646
-rw-r--r--  kernel/softirq.c             |   20
-rw-r--r--  kernel/sys.c                 |  110
-rw-r--r--  kernel/sysctl.c              |  176
-rw-r--r--  kernel/taskstats.c           |    6
-rw-r--r--  kernel/time.c                |   63
-rw-r--r--  kernel/time/clocksource.c    |    4
-rw-r--r--  kernel/time/ntp.c            |  398
-rw-r--r--  kernel/time/timekeeping.c    |   17
-rw-r--r--  kernel/time/timer_list.c     |    5
-rw-r--r--  kernel/time/timer_stats.c    |    5
-rw-r--r--  kernel/timeconst.pl          |  120
-rw-r--r--  kernel/timer.c               |  153
-rw-r--r--  kernel/user.c                |   27
-rw-r--r--  kernel/user_namespace.c      |    2
-rw-r--r--  kernel/utsname.c             |    1
-rw-r--r--  kernel/workqueue.c           |   32
70 files changed, 2816 insertions, 2410 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index 6c5f081132a4..1c9938addb9d 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -9,9 +9,9 @@ obj-y = sched.o fork.o exec_domain.o panic.o printk.o profile.o \
 	    rcupdate.o extable.o params.o posix-timers.o \
 	    kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \
 	    hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \
-	    notifier.o ksysfs.o pm_qos_params.o
+	    notifier.o ksysfs.o pm_qos_params.o sched_clock.o
 
-obj-$(CONFIG_SYSCTL) += sysctl_check.o
+obj-$(CONFIG_SYSCTL_SYSCALL_CHECK) += sysctl_check.o
 obj-$(CONFIG_STACKTRACE) += stacktrace.o
 obj-y += time/
 obj-$(CONFIG_DEBUG_MUTEXES) += mutex-debug.o
diff --git a/kernel/audit.c b/kernel/audit.c
index a7b16086d36f..b7d3709cc452 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -126,6 +126,8 @@ static int audit_freelist_count;
 static LIST_HEAD(audit_freelist);
 
 static struct sk_buff_head audit_skb_queue;
+/* queue of skbs to send to auditd when/if it comes back */
+static struct sk_buff_head audit_skb_hold_queue;
 static struct task_struct *kauditd_task;
 static DECLARE_WAIT_QUEUE_HEAD(kauditd_wait);
 static DECLARE_WAIT_QUEUE_HEAD(audit_backlog_wait);
@@ -154,6 +156,11 @@ struct audit_buffer {
 	gfp_t		     gfp_mask;
 };
 
+struct audit_reply {
+	int pid;
+	struct sk_buff *skb;
+};
+
 static void audit_set_pid(struct audit_buffer *ab, pid_t pid)
 {
 	if (ab) {
@@ -252,14 +259,15 @@ void audit_log_lost(const char *message)
 }
 
 static int audit_log_config_change(char *function_name, int new, int old,
-				   uid_t loginuid, u32 sid, int allow_changes)
+				   uid_t loginuid, u32 sessionid, u32 sid,
+				   int allow_changes)
 {
 	struct audit_buffer *ab;
 	int rc = 0;
 
 	ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE);
-	audit_log_format(ab, "%s=%d old=%d by auid=%u", function_name, new,
-			 old, loginuid);
+	audit_log_format(ab, "%s=%d old=%d auid=%u ses=%u", function_name, new,
+			 old, loginuid, sessionid);
 	if (sid) {
 		char *ctx = NULL;
 		u32 len;
@@ -279,7 +287,8 @@ static int audit_log_config_change(char *function_name, int new, int old,
 }
 
 static int audit_do_config_change(char *function_name, int *to_change,
-				  int new, uid_t loginuid, u32 sid)
+				  int new, uid_t loginuid, u32 sessionid,
+				  u32 sid)
 {
 	int allow_changes, rc = 0, old = *to_change;
 
@@ -290,8 +299,8 @@ static int audit_do_config_change(char *function_name, int *to_change,
 		allow_changes = 1;
 
 	if (audit_enabled != AUDIT_OFF) {
-		rc = audit_log_config_change(function_name, new, old,
-					     loginuid, sid, allow_changes);
+		rc = audit_log_config_change(function_name, new, old, loginuid,
+					     sessionid, sid, allow_changes);
 		if (rc)
 			allow_changes = 0;
 	}
@@ -305,26 +314,28 @@ static int audit_do_config_change(char *function_name, int *to_change,
 	return rc;
 }
 
-static int audit_set_rate_limit(int limit, uid_t loginuid, u32 sid)
+static int audit_set_rate_limit(int limit, uid_t loginuid, u32 sessionid,
+				u32 sid)
 {
 	return audit_do_config_change("audit_rate_limit", &audit_rate_limit,
-				      limit, loginuid, sid);
+				      limit, loginuid, sessionid, sid);
 }
 
-static int audit_set_backlog_limit(int limit, uid_t loginuid, u32 sid)
+static int audit_set_backlog_limit(int limit, uid_t loginuid, u32 sessionid,
+				   u32 sid)
 {
 	return audit_do_config_change("audit_backlog_limit", &audit_backlog_limit,
-				      limit, loginuid, sid);
+				      limit, loginuid, sessionid, sid);
 }
 
-static int audit_set_enabled(int state, uid_t loginuid, u32 sid)
+static int audit_set_enabled(int state, uid_t loginuid, u32 sessionid, u32 sid)
 {
 	int rc;
 	if (state < AUDIT_OFF || state > AUDIT_LOCKED)
 		return -EINVAL;
 
 	rc = audit_do_config_change("audit_enabled", &audit_enabled, state,
-				    loginuid, sid);
+				    loginuid, sessionid, sid);
 
 	if (!rc)
 		audit_ever_enabled |= !!state;
@@ -332,7 +343,7 @@ static int audit_set_enabled(int state, uid_t loginuid, u32 sid)
 	return rc;
 }
 
-static int audit_set_failure(int state, uid_t loginuid, u32 sid)
+static int audit_set_failure(int state, uid_t loginuid, u32 sessionid, u32 sid)
 {
 	if (state != AUDIT_FAIL_SILENT
 	    && state != AUDIT_FAIL_PRINTK
@@ -340,7 +351,43 @@ static int audit_set_failure(int state, uid_t loginuid, u32 sid)
 		return -EINVAL;
 
 	return audit_do_config_change("audit_failure", &audit_failure, state,
-				      loginuid, sid);
+				      loginuid, sessionid, sid);
+}
+
+/*
+ * Queue skbs to be sent to auditd when/if it comes back.  These skbs should
+ * already have been sent via prink/syslog and so if these messages are dropped
+ * it is not a huge concern since we already passed the audit_log_lost()
+ * notification and stuff.  This is just nice to get audit messages during
+ * boot before auditd is running or messages generated while auditd is stopped.
+ * This only holds messages is audit_default is set, aka booting with audit=1
+ * or building your kernel that way.
+ */
+static void audit_hold_skb(struct sk_buff *skb)
+{
+	if (audit_default &&
+	    skb_queue_len(&audit_skb_hold_queue) < audit_backlog_limit)
+		skb_queue_tail(&audit_skb_hold_queue, skb);
+	else
+		kfree_skb(skb);
+}
+
+static void kauditd_send_skb(struct sk_buff *skb)
+{
+	int err;
+	/* take a reference in case we can't send it and we want to hold it */
+	skb_get(skb);
+	err = netlink_unicast(audit_sock, skb, audit_nlk_pid, 0);
+	if (err < 0) {
+		BUG_ON(err != -ECONNREFUSED); /* Shoudn't happen */
+		printk(KERN_ERR "audit: *NO* daemon at audit_pid=%d\n", audit_pid);
+		audit_log_lost("auditd dissapeared\n");
+		audit_pid = 0;
+		/* we might get lucky and get this in the next auditd */
+		audit_hold_skb(skb);
+	} else
+		/* drop the extra reference if sent ok */
+		kfree_skb(skb);
 }
 
 static int kauditd_thread(void *dummy)
@@ -349,24 +396,41 @@ static int kauditd_thread(void *dummy)
 
 	set_freezable();
 	while (!kthread_should_stop()) {
+		/*
+		 * if auditd just started drain the queue of messages already
+		 * sent to syslog/printk.  remember loss here is ok.  we already
+		 * called audit_log_lost() if it didn't go out normally.  so the
+		 * race between the skb_dequeue and the next check for audit_pid
+		 * doesn't matter.
+		 *
+		 * if you ever find kauditd to be too slow we can get a perf win
+		 * by doing our own locking and keeping better track if there
+		 * are messages in this queue.  I don't see the need now, but
+		 * in 5 years when I want to play with this again I'll see this
+		 * note and still have no friggin idea what i'm thinking today.
+		 */
+		if (audit_default && audit_pid) {
+			skb = skb_dequeue(&audit_skb_hold_queue);
+			if (unlikely(skb)) {
+				while (skb && audit_pid) {
+					kauditd_send_skb(skb);
+					skb = skb_dequeue(&audit_skb_hold_queue);
+				}
+			}
+		}
+
 		skb = skb_dequeue(&audit_skb_queue);
 		wake_up(&audit_backlog_wait);
 		if (skb) {
-			if (audit_pid) {
-				int err = netlink_unicast(audit_sock, skb, audit_nlk_pid, 0);
-				if (err < 0) {
-					BUG_ON(err != -ECONNREFUSED); /* Shoudn't happen */
-					printk(KERN_ERR "audit: *NO* daemon at audit_pid=%d\n", audit_pid);
-					audit_log_lost("auditd dissapeared\n");
-					audit_pid = 0;
-				}
-			} else {
+			if (audit_pid)
+				kauditd_send_skb(skb);
+			else {
 				if (printk_ratelimit())
-					printk(KERN_NOTICE "%s\n", skb->data +
-						NLMSG_SPACE(0));
+					printk(KERN_NOTICE "%s\n", skb->data + NLMSG_SPACE(0));
 				else
 					audit_log_lost("printk limit exceeded\n");
-				kfree_skb(skb);
+
+				audit_hold_skb(skb);
 			}
 		} else {
 			DECLARE_WAITQUEUE(wait, current);
@@ -385,13 +449,13 @@ static int kauditd_thread(void *dummy)
 	return 0;
 }
 
-static int audit_prepare_user_tty(pid_t pid, uid_t loginuid)
+static int audit_prepare_user_tty(pid_t pid, uid_t loginuid, u32 sessionid)
 {
 	struct task_struct *tsk;
 	int err;
 
 	read_lock(&tasklist_lock);
-	tsk = find_task_by_pid(pid);
+	tsk = find_task_by_vpid(pid);
 	err = -ESRCH;
 	if (!tsk)
 		goto out;
@@ -404,7 +468,7 @@ static int audit_prepare_user_tty(pid_t pid, uid_t loginuid)
 	if (err)
 		goto out;
 
-	tty_audit_push_task(tsk, loginuid);
+	tty_audit_push_task(tsk, loginuid, sessionid);
 out:
 	read_unlock(&tasklist_lock);
 	return err;
@@ -469,6 +533,19 @@ nlmsg_failure: /* Used by NLMSG_PUT */
 	return NULL;
 }
 
+static int audit_send_reply_thread(void *arg)
+{
+	struct audit_reply *reply = (struct audit_reply *)arg;
+
+	mutex_lock(&audit_cmd_mutex);
+	mutex_unlock(&audit_cmd_mutex);
+
+	/* Ignore failure. It'll only happen if the sender goes away,
+	   because our timeout is set to infinite. */
+	netlink_unicast(audit_sock, reply->skb, reply->pid, 0);
+	kfree(reply);
+	return 0;
+}
 /**
  * audit_send_reply - send an audit reply message via netlink
  * @pid: process id to send reply to
@@ -485,14 +562,26 @@ nlmsg_failure: /* Used by NLMSG_PUT */
 void audit_send_reply(int pid, int seq, int type, int done, int multi,
 		      void *payload, int size)
 {
 	struct sk_buff *skb;
+	struct task_struct *tsk;
+	struct audit_reply *reply = kmalloc(sizeof(struct audit_reply),
+					    GFP_KERNEL);
+
+	if (!reply)
+		return;
+
 	skb = audit_make_reply(pid, seq, type, done, multi, payload, size);
 	if (!skb)
 		return;
-	/* Ignore failure. It'll only happen if the sender goes away,
-	   because our timeout is set to infinite. */
-	netlink_unicast(audit_sock, skb, pid, 0);
-	return;
+
+	reply->pid = pid;
+	reply->skb = skb;
+
+	tsk = kthread_run(audit_send_reply_thread, reply, "audit_send_reply");
+	if (IS_ERR(tsk)) {
+		kfree(reply);
+		kfree_skb(skb);
+	}
 }
 
 /*
@@ -534,7 +623,8 @@ static int audit_netlink_ok(struct sk_buff *skb, u16 msg_type)
 }
 
 static int audit_log_common_recv_msg(struct audit_buffer **ab, u16 msg_type,
-				     u32 pid, u32 uid, uid_t auid, u32 sid)
+				     u32 pid, u32 uid, uid_t auid, u32 ses,
+				     u32 sid)
 {
 	int rc = 0;
 	char *ctx = NULL;
@@ -546,8 +636,8 @@ static int audit_log_common_recv_msg(struct audit_buffer **ab, u16 msg_type,
 	}
 
 	*ab = audit_log_start(NULL, GFP_KERNEL, msg_type);
-	audit_log_format(*ab, "user pid=%d uid=%u auid=%u",
-			 pid, uid, auid);
+	audit_log_format(*ab, "user pid=%d uid=%u auid=%u ses=%u",
+			 pid, uid, auid, ses);
 	if (sid) {
 		rc = security_secid_to_secctx(sid, &ctx, &len);
 		if (rc)
@@ -570,6 +660,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
 	struct audit_buffer	*ab;
 	u16			msg_type = nlh->nlmsg_type;
 	uid_t			loginuid; /* loginuid of sender */
+	u32			sessionid;
 	struct audit_sig_info	*sig_data;
 	char			*ctx = NULL;
 	u32			len;
@@ -591,6 +682,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
 	pid  = NETLINK_CREDS(skb)->pid;
 	uid  = NETLINK_CREDS(skb)->uid;
 	loginuid = NETLINK_CB(skb).loginuid;
+	sessionid = NETLINK_CB(skb).sessionid;
 	sid  = NETLINK_CB(skb).sid;
 	seq  = nlh->nlmsg_seq;
 	data = NLMSG_DATA(nlh);
@@ -613,12 +705,12 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
 		status_get   = (struct audit_status *)data;
 		if (status_get->mask & AUDIT_STATUS_ENABLED) {
 			err = audit_set_enabled(status_get->enabled,
-						loginuid, sid);
+						loginuid, sessionid, sid);
 			if (err < 0) return err;
 		}
 		if (status_get->mask & AUDIT_STATUS_FAILURE) {
 			err = audit_set_failure(status_get->failure,
-						loginuid, sid);
+						loginuid, sessionid, sid);
 			if (err < 0) return err;
 		}
 		if (status_get->mask & AUDIT_STATUS_PID) {
@@ -627,17 +719,17 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
 			if (audit_enabled != AUDIT_OFF)
 				audit_log_config_change("audit_pid", new_pid,
 							audit_pid, loginuid,
-							sid, 1);
+							sessionid, sid, 1);
 
 			audit_pid = new_pid;
 			audit_nlk_pid = NETLINK_CB(skb).pid;
 		}
 		if (status_get->mask & AUDIT_STATUS_RATE_LIMIT)
 			err = audit_set_rate_limit(status_get->rate_limit,
-						   loginuid, sid);
+						   loginuid, sessionid, sid);
 		if (status_get->mask & AUDIT_STATUS_BACKLOG_LIMIT)
 			err = audit_set_backlog_limit(status_get->backlog_limit,
-						      loginuid, sid);
+						      loginuid, sessionid, sid);
 		break;
 	case AUDIT_USER:
 	case AUDIT_FIRST_USER_MSG ... AUDIT_LAST_USER_MSG:
@@ -649,12 +741,13 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
 		if (err == 1) {
 			err = 0;
 			if (msg_type == AUDIT_USER_TTY) {
-				err = audit_prepare_user_tty(pid, loginuid);
+				err = audit_prepare_user_tty(pid, loginuid,
+							     sessionid);
 				if (err)
 					break;
 			}
 			audit_log_common_recv_msg(&ab, msg_type, pid, uid,
-						  loginuid, sid);
+						  loginuid, sessionid, sid);
 
 			if (msg_type != AUDIT_USER_TTY)
 				audit_log_format(ab, " msg='%.1024s'",
@@ -664,8 +757,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
 
 				audit_log_format(ab, " msg=");
 				size = nlmsg_len(nlh);
-				audit_log_n_untrustedstring(ab, size,
-							    data);
+				audit_log_n_untrustedstring(ab, data, size);
 			}
 			audit_set_pid(ab, pid);
 			audit_log_end(ab);
@@ -677,7 +769,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
 			return -EINVAL;
 		if (audit_enabled == AUDIT_LOCKED) {
 			audit_log_common_recv_msg(&ab, AUDIT_CONFIG_CHANGE, pid,
-						  uid, loginuid, sid);
+						  uid, loginuid, sessionid, sid);
 
 			audit_log_format(ab, " audit_enabled=%d res=0",
 					 audit_enabled);
@@ -688,7 +780,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
 	case AUDIT_LIST:
 		err = audit_receive_filter(nlh->nlmsg_type, NETLINK_CB(skb).pid,
 					   uid, seq, data, nlmsg_len(nlh),
-					   loginuid, sid);
+					   loginuid, sessionid, sid);
 		break;
 	case AUDIT_ADD_RULE:
 	case AUDIT_DEL_RULE:
@@ -696,7 +788,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
 			return -EINVAL;
 		if (audit_enabled == AUDIT_LOCKED) {
 			audit_log_common_recv_msg(&ab, AUDIT_CONFIG_CHANGE, pid,
-						  uid, loginuid, sid);
+						  uid, loginuid, sessionid, sid);
 
 			audit_log_format(ab, " audit_enabled=%d res=0",
 					 audit_enabled);
@@ -707,13 +799,13 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
 	case AUDIT_LIST_RULES:
 		err = audit_receive_filter(nlh->nlmsg_type, NETLINK_CB(skb).pid,
 					   uid, seq, data, nlmsg_len(nlh),
-					   loginuid, sid);
+					   loginuid, sessionid, sid);
 		break;
 	case AUDIT_TRIM:
 		audit_trim_trees();
 
 		audit_log_common_recv_msg(&ab, AUDIT_CONFIG_CHANGE, pid,
-					  uid, loginuid, sid);
+					  uid, loginuid, sessionid, sid);
 
 		audit_log_format(ab, " op=trim res=1");
 		audit_log_end(ab);
@@ -721,21 +813,21 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
 	case AUDIT_MAKE_EQUIV: {
 		void *bufp = data;
 		u32 sizes[2];
-		size_t len = nlmsg_len(nlh);
+		size_t msglen = nlmsg_len(nlh);
 		char *old, *new;
 
 		err = -EINVAL;
-		if (len < 2 * sizeof(u32))
+		if (msglen < 2 * sizeof(u32))
 			break;
 		memcpy(sizes, bufp, 2 * sizeof(u32));
 		bufp += 2 * sizeof(u32);
-		len -= 2 * sizeof(u32);
-		old = audit_unpack_string(&bufp, &len, sizes[0]);
+		msglen -= 2 * sizeof(u32);
+		old = audit_unpack_string(&bufp, &msglen, sizes[0]);
 		if (IS_ERR(old)) {
 			err = PTR_ERR(old);
 			break;
 		}
-		new = audit_unpack_string(&bufp, &len, sizes[1]);
+		new = audit_unpack_string(&bufp, &msglen, sizes[1]);
 		if (IS_ERR(new)) {
 			err = PTR_ERR(new);
 			kfree(old);
@@ -745,7 +837,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
 		err = audit_tag_tree(old, new);
 
 		audit_log_common_recv_msg(&ab, AUDIT_CONFIG_CHANGE, pid,
-					  uid, loginuid, sid);
+					  uid, loginuid, sessionid, sid);
 
 		audit_log_format(ab, " op=make_equiv old=");
 		audit_log_untrustedstring(ab, old);
@@ -779,7 +871,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
 		struct task_struct *tsk;
 
 		read_lock(&tasklist_lock);
-		tsk = find_task_by_pid(pid);
+		tsk = find_task_by_vpid(pid);
 		if (!tsk)
 			err = -ESRCH;
 		else {
@@ -802,7 +894,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
 		if (s->enabled != 0 && s->enabled != 1)
 			return -EINVAL;
 		read_lock(&tasklist_lock);
-		tsk = find_task_by_pid(pid);
+		tsk = find_task_by_vpid(pid);
 		if (!tsk)
 			err = -ESRCH;
 		else {
@@ -877,6 +969,7 @@ static int __init audit_init(void)
 	audit_sock->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT;
 
 	skb_queue_head_init(&audit_skb_queue);
+	skb_queue_head_init(&audit_skb_hold_queue);
 	audit_initialized = 1;
 	audit_enabled = audit_default;
 	audit_ever_enabled |= !!audit_default;
@@ -1199,7 +1292,7 @@ void audit_log_format(struct audit_buffer *ab, const char *fmt, ...)
  * This function will take the passed buf and convert it into a string of
  * ascii hex digits. The new string is placed onto the skb.
  */
-void audit_log_hex(struct audit_buffer *ab, const unsigned char *buf,
+void audit_log_n_hex(struct audit_buffer *ab, const unsigned char *buf,
 		size_t len)
 {
 	int i, avail, new_len;
@@ -1235,8 +1328,8 @@ void audit_log_hex(struct audit_buffer *ab, const unsigned char *buf,
  * Format a string of no more than slen characters into the audit buffer,
 * enclosed in quote marks.
 */
-static void audit_log_n_string(struct audit_buffer *ab, size_t slen,
-			       const char *string)
+void audit_log_n_string(struct audit_buffer *ab, const char *string,
+			size_t slen)
 {
 	int avail, new_len;
 	unsigned char *ptr;
@@ -1292,13 +1385,13 @@ int audit_string_contains_control(const char *string, size_t len)
 * The caller specifies the number of characters in the string to log, which may
 * or may not be the entire string.
 */
-void audit_log_n_untrustedstring(struct audit_buffer *ab, size_t len,
-				 const char *string)
+void audit_log_n_untrustedstring(struct audit_buffer *ab, const char *string,
+				 size_t len)
 {
 	if (audit_string_contains_control(string, len))
-		audit_log_hex(ab, string, len);
+		audit_log_n_hex(ab, string, len);
 	else
-		audit_log_n_string(ab, len, string);
+		audit_log_n_string(ab, string, len);
 }
 
 /**
@@ -1311,7 +1404,7 @@ void audit_log_n_untrustedstring(struct audit_buffer *ab, size_t len,
 */
 void audit_log_untrustedstring(struct audit_buffer *ab, const char *string)
 {
-	audit_log_n_untrustedstring(ab, strlen(string), string);
+	audit_log_n_untrustedstring(ab, string, strlen(string));
 }
 
 /* This is a helper-function to print the escaped d_path */
@@ -1355,19 +1448,23 @@ void audit_log_end(struct audit_buffer *ab)
 		audit_log_lost("rate limit exceeded");
 	} else {
 		struct nlmsghdr *nlh = nlmsg_hdr(ab->skb);
+		nlh->nlmsg_len = ab->skb->len - NLMSG_SPACE(0);
+
 		if (audit_pid) {
-			nlh->nlmsg_len = ab->skb->len - NLMSG_SPACE(0);
 			skb_queue_tail(&audit_skb_queue, ab->skb);
-			ab->skb = NULL;
 			wake_up_interruptible(&kauditd_wait);
-		} else if (nlh->nlmsg_type != AUDIT_EOE) {
-			if (printk_ratelimit()) {
-				printk(KERN_NOTICE "type=%d %s\n",
-					nlh->nlmsg_type,
-					ab->skb->data + NLMSG_SPACE(0));
-			} else
-				audit_log_lost("printk limit exceeded\n");
+		} else {
+			if (nlh->nlmsg_type != AUDIT_EOE) {
+				if (printk_ratelimit()) {
+					printk(KERN_NOTICE "type=%d %s\n",
+						nlh->nlmsg_type,
+						ab->skb->data + NLMSG_SPACE(0));
+				} else
+					audit_log_lost("printk limit exceeded\n");
+			}
+			audit_hold_skb(ab->skb);
 		}
+		ab->skb = NULL;
 	}
 	audit_buffer_free(ab);
 }
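The kernel/audit.c hunks above introduce a bounded hold queue: records produced while auditd is absent are kept on audit_skb_hold_queue (up to audit_backlog_limit) and drained by kauditd once a daemon registers again. Below is a minimal user-space sketch of that pattern, assuming a plain singly linked FIFO and an illustrative backlog limit; it is not kernel code, and none of these names come from the commit.

/* Standalone illustration of the audit "hold queue" pattern: records produced
 * while the consumer (auditd) is absent are kept in a bounded FIFO and drained
 * once the consumer registers again.  Plain C, no kernel APIs; all names are
 * illustrative.
 */
#include <stdio.h>
#include <stdlib.h>

struct record {
	struct record *next;
	char msg[64];
};

static struct record *head, *tail;          /* FIFO of held records */
static unsigned int held, backlog_limit = 4; /* illustrative limit */
static int consumer_present;                 /* stand-in for audit_pid != 0 */

static void hold_record(const char *msg)
{
	struct record *r;

	if (held >= backlog_limit) {         /* bounded: drop when full */
		fprintf(stderr, "dropped: %s\n", msg);
		return;
	}
	r = calloc(1, sizeof(*r));
	if (!r)
		return;
	snprintf(r->msg, sizeof(r->msg), "%s", msg);
	if (tail)
		tail->next = r;
	else
		head = r;
	tail = r;
	held++;
}

static void emit(const char *msg)
{
	if (consumer_present)
		printf("delivered: %s\n", msg); /* netlink_unicast() analogue */
	else
		hold_record(msg);               /* audit_hold_skb() analogue */
}

static void drain(void)                         /* run when consumer returns */
{
	while (head) {
		struct record *r = head;
		head = r->next;
		printf("delivered (late): %s\n", r->msg);
		free(r);
	}
	tail = NULL;
	held = 0;
}

int main(void)
{
	emit("boot message 1");
	emit("boot message 2");
	consumer_present = 1;   /* auditd started */
	drain();
	emit("runtime message");
	return 0;
}

Dropping when the queue is full mirrors the kernel's choice: these records were already passed to printk/syslog, so losing them from the hold queue is acceptable.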
diff --git a/kernel/audit.h b/kernel/audit.h
index 3cfc54ee3e1f..9d6717412fec 100644
--- a/kernel/audit.h
+++ b/kernel/audit.h
@@ -74,6 +74,11 @@ struct audit_entry {
 	struct audit_krule	rule;
 };
 
+#ifdef CONFIG_AUDIT
+extern int audit_enabled;
+extern int audit_ever_enabled;
+#endif
+
 extern int audit_pid;
 
 #define AUDIT_INODE_BUCKETS	32
@@ -104,6 +109,9 @@ struct audit_netlink_list {
 int audit_send_list(void *);
 
 struct inotify_watch;
+/* Inotify handle */
+extern struct inotify_handle *audit_ih;
+
 extern void audit_free_parent(struct inotify_watch *);
 extern void audit_handle_ievent(struct inotify_watch *, u32, u32, u32,
 				const char *, struct inode *);
@@ -111,6 +119,7 @@ extern int selinux_audit_rule_update(void);
 
 extern struct mutex audit_filter_mutex;
 extern void audit_free_rule_rcu(struct rcu_head *);
+extern struct list_head audit_filter_list[];
 
 #ifdef CONFIG_AUDIT_TREE
 extern struct audit_chunk *audit_tree_lookup(const struct inode *);
@@ -137,6 +146,10 @@ extern void audit_put_tree(struct audit_tree *);
 
 extern char *audit_unpack_string(void **, size_t *, size_t);
 
+extern pid_t audit_sig_pid;
+extern uid_t audit_sig_uid;
+extern u32 audit_sig_sid;
+
 #ifdef CONFIG_AUDITSYSCALL
 extern int __audit_signal_info(int sig, struct task_struct *t);
 static inline int audit_signal_info(int sig, struct task_struct *t)
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index 28fef6bf8534..0e0bd27e6512 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -89,14 +89,9 @@ struct list_head audit_filter_list[AUDIT_NR_FILTERS] = {
 
 DEFINE_MUTEX(audit_filter_mutex);
 
-/* Inotify handle */
-extern struct inotify_handle *audit_ih;
-
 /* Inotify events we care about. */
 #define AUDIT_IN_WATCH IN_MOVE|IN_CREATE|IN_DELETE|IN_DELETE_SELF|IN_MOVE_SELF
 
-extern int audit_enabled;
-
 void audit_free_parent(struct inotify_watch *i_watch)
 {
 	struct audit_parent *parent;
@@ -272,7 +267,7 @@ static int audit_to_watch(struct audit_krule *krule, char *path, int len,
 		return -EINVAL;
 
 	watch = audit_init_watch(path);
-	if (unlikely(IS_ERR(watch)))
+	if (IS_ERR(watch))
 		return PTR_ERR(watch);
 
 	audit_get_watch(watch);
@@ -422,7 +417,7 @@ exit_err:
 static struct audit_entry *audit_rule_to_entry(struct audit_rule *rule)
 {
 	struct audit_entry *entry;
-	struct audit_field *f;
+	struct audit_field *ino_f;
 	int err = 0;
 	int i;
 
@@ -483,6 +478,10 @@ static struct audit_entry *audit_rule_to_entry(struct audit_rule *rule)
 			if (f->val & ~15)
 				goto exit_free;
 			break;
+		case AUDIT_FILETYPE:
+			if ((f->val & ~S_IFMT) > S_IFMT)
+				goto exit_free;
+			break;
 		case AUDIT_INODE:
 			err = audit_to_inode(&entry->rule, f);
 			if (err)
@@ -504,9 +503,9 @@ static struct audit_entry *audit_rule_to_entry(struct audit_rule *rule)
 		}
 	}
 
-	f = entry->rule.inode_f;
-	if (f) {
-		switch(f->op) {
+	ino_f = entry->rule.inode_f;
+	if (ino_f) {
+		switch(ino_f->op) {
 		case AUDIT_NOT_EQUAL:
 			entry->rule.inode_f = NULL;
 		case AUDIT_EQUAL:
@@ -531,7 +530,7 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data,
 {
 	int err = 0;
 	struct audit_entry *entry;
-	struct audit_field *f;
+	struct audit_field *ino_f;
 	void *bufp;
 	size_t remain = datasz - sizeof(struct audit_rule_data);
 	int i;
@@ -654,14 +653,18 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data,
 			if (f->val & ~15)
 				goto exit_free;
 			break;
+		case AUDIT_FILETYPE:
+			if ((f->val & ~S_IFMT) > S_IFMT)
+				goto exit_free;
+			break;
 		default:
 			goto exit_free;
 		}
 	}
 
-	f = entry->rule.inode_f;
-	if (f) {
-		switch(f->op) {
+	ino_f = entry->rule.inode_f;
+	if (ino_f) {
+		switch(ino_f->op) {
 		case AUDIT_NOT_EQUAL:
 			entry->rule.inode_f = NULL;
 		case AUDIT_EQUAL:
@@ -848,7 +851,7 @@ static struct audit_watch *audit_dupe_watch(struct audit_watch *old)
 		return ERR_PTR(-ENOMEM);
 
 	new = audit_init_watch(path);
-	if (unlikely(IS_ERR(new))) {
+	if (IS_ERR(new)) {
 		kfree(path);
 		goto out;
 	}
@@ -989,7 +992,7 @@ static void audit_update_watch(struct audit_parent *parent,
 			audit_set_auditable(current->audit_context);
 
 	nwatch = audit_dupe_watch(owatch);
-	if (unlikely(IS_ERR(nwatch))) {
+	if (IS_ERR(nwatch)) {
 		mutex_unlock(&audit_filter_mutex);
 		audit_panic("error updating watch, skipping");
 		return;
@@ -1004,7 +1007,7 @@ static void audit_update_watch(struct audit_parent *parent,
 		list_del_rcu(&oentry->list);
 
 		nentry = audit_dupe_rule(&oentry->rule, nwatch);
-		if (unlikely(IS_ERR(nentry)))
+		if (IS_ERR(nentry))
 			audit_panic("error updating watch, removing");
 		else {
 			int h = audit_hash_ino((u32)ino);
@@ -1500,8 +1503,9 @@ static void audit_list_rules(int pid, int seq, struct sk_buff_head *q)
 }
 
 /* Log rule additions and removals */
-static void audit_log_rule_change(uid_t loginuid, u32 sid, char *action,
-				  struct audit_krule *rule, int res)
+static void audit_log_rule_change(uid_t loginuid, u32 sessionid, u32 sid,
+				  char *action, struct audit_krule *rule,
+				  int res)
 {
 	struct audit_buffer *ab;
 
@@ -1511,7 +1515,7 @@ static void audit_log_rule_change(uid_t loginuid, u32 sid, char *action,
 	ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE);
 	if (!ab)
 		return;
-	audit_log_format(ab, "auid=%u", loginuid);
+	audit_log_format(ab, "auid=%u ses=%u", loginuid, sessionid);
 	if (sid) {
 		char *ctx = NULL;
 		u32 len;
@@ -1543,7 +1547,7 @@ static void audit_log_rule_change(uid_t loginuid, u32 sid, char *action,
 * @sid: SE Linux Security ID of sender
 */
 int audit_receive_filter(int type, int pid, int uid, int seq, void *data,
-			 size_t datasz, uid_t loginuid, u32 sid)
+			 size_t datasz, uid_t loginuid, u32 sessionid, u32 sid)
 {
 	struct task_struct *tsk;
 	struct audit_netlink_list *dest;
@@ -1590,7 +1594,8 @@ int audit_receive_filter(int type, int pid, int uid, int seq, void *data,
 
 		err = audit_add_rule(entry,
 				     &audit_filter_list[entry->rule.listnr]);
-		audit_log_rule_change(loginuid, sid, "add", &entry->rule, !err);
+		audit_log_rule_change(loginuid, sessionid, sid, "add",
+				      &entry->rule, !err);
 
 		if (err)
 			audit_free_rule(entry);
@@ -1606,8 +1611,8 @@ int audit_receive_filter(int type, int pid, int uid, int seq, void *data,
 
 		err = audit_del_rule(entry,
 				     &audit_filter_list[entry->rule.listnr]);
-		audit_log_rule_change(loginuid, sid, "remove", &entry->rule,
-				      !err);
+		audit_log_rule_change(loginuid, sessionid, sid, "remove",
				      &entry->rule, !err);
 
 		audit_free_rule(entry);
 		break;
@@ -1785,7 +1790,7 @@ int audit_update_lsm_rules(void)
 		watch = entry->rule.watch;
 		tree = entry->rule.tree;
 		nentry = audit_dupe_rule(&entry->rule, watch);
-		if (unlikely(IS_ERR(nentry))) {
+		if (IS_ERR(nentry)) {
 			/* save the first error encountered for the
 			 * return value */
 			if (!err)
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 56e56ed594a8..c10e7aae04d7 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -68,9 +68,6 @@
 
 #include "audit.h"
 
-extern struct list_head audit_filter_list[];
-extern int audit_ever_enabled;
-
 /* AUDIT_NAMES is the number of slots we reserve in the audit_context
 * for saving names from getname(). */
 #define AUDIT_NAMES    20
@@ -283,6 +280,19 @@ static int audit_match_perm(struct audit_context *ctx, int mask)
 	}
 }
 
+static int audit_match_filetype(struct audit_context *ctx, int which)
+{
+	unsigned index = which & ~S_IFMT;
+	mode_t mode = which & S_IFMT;
+	if (index >= ctx->name_count)
+		return 0;
+	if (ctx->names[index].ino == -1)
+		return 0;
+	if ((ctx->names[index].mode ^ mode) & S_IFMT)
+		return 0;
+	return 1;
+}
+
 /*
 * We keep a linked list of fixed-sized (31 pointer) arrays of audit_chunk *;
 * ->first_trees points to its beginning, ->trees - to the current end of data.
@@ -592,6 +602,9 @@ static int audit_filter_rules(struct task_struct *tsk,
 		case AUDIT_PERM:
 			result = audit_match_perm(ctx, f->val);
 			break;
+		case AUDIT_FILETYPE:
+			result = audit_match_filetype(ctx, f->val);
+			break;
 		}
 
 		if (!result)
@@ -1095,7 +1108,7 @@ static int audit_log_single_execve_arg(struct audit_context *context,
 		audit_log_format(*ab, "[%d]", i);
 		audit_log_format(*ab, "=");
 		if (has_cntl)
-			audit_log_hex(*ab, buf, to_send);
+			audit_log_n_hex(*ab, buf, to_send);
 		else
 			audit_log_format(*ab, "\"%s\"", buf);
 		audit_log_format(*ab, "\n");
@@ -1296,7 +1309,6 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
 		break; }
 
 	case AUDIT_SOCKETCALL: {
-		int i;
 		struct audit_aux_data_socketcall *axs = (void *)aux;
 		audit_log_format(ab, "nargs=%d", axs->nargs);
 		for (i=0; i<axs->nargs; i++)
@@ -1307,7 +1319,7 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
 		struct audit_aux_data_sockaddr *axs = (void *)aux;
 
 		audit_log_format(ab, "saddr=");
-		audit_log_hex(ab, axs->a, axs->len);
+		audit_log_n_hex(ab, axs->a, axs->len);
 		break; }
 
 	case AUDIT_FD_PAIR: {
@@ -1321,7 +1333,6 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
 
 	for (aux = context->aux_pids; aux; aux = aux->next) {
 		struct audit_aux_data_pids *axs = (void *)aux;
-		int i;
 
 		for (i = 0; i < axs->pid_count; i++)
 			if (audit_log_pid_context(context, axs->target_pid[i],
@@ -1371,8 +1382,8 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
 			default:
 				/* log the name's directory component */
 				audit_log_format(ab, " name=");
-				audit_log_n_untrustedstring(ab, n->name_len,
-							    n->name);
+				audit_log_n_untrustedstring(ab, n->name,
+							    n->name_len);
 			}
 		} else
 			audit_log_format(ab, " name=(null)");
@@ -1596,7 +1607,7 @@ static inline void handle_one(const struct inode *inode)
 	if (likely(put_tree_ref(context, chunk)))
 		return;
 	if (unlikely(!grow_tree_refs(context))) {
-		printk(KERN_WARNING "out of memory, audit has lost a tree reference");
+		printk(KERN_WARNING "out of memory, audit has lost a tree reference\n");
 		audit_set_auditable(context);
 		audit_put_chunk(chunk);
 		unroll_tree_refs(context, p, count);
@@ -1656,7 +1667,7 @@ retry:
 	}
 	/* too bad */
 	printk(KERN_WARNING
-		"out of memory, audit has lost a tree reference");
+		"out of memory, audit has lost a tree reference\n");
 	unroll_tree_refs(context, p, count);
 	audit_set_auditable(context);
 	return;
@@ -1752,13 +1763,13 @@ static int audit_inc_name_count(struct audit_context *context,
 	if (context->name_count >= AUDIT_NAMES) {
 		if (inode)
 			printk(KERN_DEBUG "name_count maxed, losing inode data: "
-			       "dev=%02x:%02x, inode=%lu",
+			       "dev=%02x:%02x, inode=%lu\n",
 			       MAJOR(inode->i_sb->s_dev),
 			       MINOR(inode->i_sb->s_dev),
 			       inode->i_ino);
 
 		else
-			printk(KERN_DEBUG "name_count maxed, losing inode data");
+			printk(KERN_DEBUG "name_count maxed, losing inode data\n");
 		return 1;
 	}
 	context->name_count++;
@@ -2361,9 +2372,6 @@ int __audit_signal_info(int sig, struct task_struct *t)
 	struct audit_aux_data_pids *axp;
 	struct task_struct *tsk = current;
 	struct audit_context *ctx = tsk->audit_context;
-	extern pid_t audit_sig_pid;
-	extern uid_t audit_sig_uid;
-	extern u32 audit_sig_sid;
 
 	if (audit_pid && t->tgid == audit_pid) {
 		if (sig == SIGTERM || sig == SIGHUP || sig == SIGUSR1) {
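The kernel/auditsc.c hunks above add AUDIT_FILETYPE matching: only the S_IFMT bits of a recorded inode mode are compared against the file type encoded in the rule value. A small user-space sketch of that comparison follows; it is not kernel code, and the example path is an assumption used only for illustration.

/* Standalone sketch of the AUDIT_FILETYPE test added above: a rule value
 * carries a file type in its S_IFMT bits, and a recorded mode matches when
 * those bits agree.  User-space C; the path below is illustrative.
 */
#include <stdio.h>
#include <sys/stat.h>

static int filetype_matches(mode_t recorded_mode, mode_t rule_val)
{
	/* same test as the kernel hunk: bits differ only outside S_IFMT => match */
	return ((recorded_mode ^ rule_val) & S_IFMT) == 0;
}

int main(void)
{
	struct stat st;

	if (stat("/etc/passwd", &st) != 0) {
		perror("stat");
		return 1;
	}
	printf("regular-file rule matches: %d\n",
	       filetype_matches(st.st_mode, S_IFREG));
	printf("directory rule matches:    %d\n",
	       filetype_matches(st.st_mode, S_IFDIR));
	return 0;
}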
diff --git a/kernel/bounds.c b/kernel/bounds.c
index c3c55544db2f..3c5301381837 100644
--- a/kernel/bounds.c
+++ b/kernel/bounds.c
@@ -8,11 +8,7 @@
 /* Include headers that define the enum constants of interest */
 #include <linux/page-flags.h>
 #include <linux/mmzone.h>
-
-#define DEFINE(sym, val) \
-	asm volatile("\n->" #sym " %0 " #val : : "i" (val))
-
-#define BLANK() asm volatile("\n->" : : )
+#include <linux/kbuild.h>
 
 void foo(void)
 {
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 6d8de051382b..fbc6fc8949b4 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -44,6 +44,7 @@
44#include <linux/kmod.h> 44#include <linux/kmod.h>
45#include <linux/delayacct.h> 45#include <linux/delayacct.h>
46#include <linux/cgroupstats.h> 46#include <linux/cgroupstats.h>
47#include <linux/hash.h>
47 48
48#include <asm/atomic.h> 49#include <asm/atomic.h>
49 50
@@ -118,17 +119,7 @@ static int root_count;
118 * be called. 119 * be called.
119 */ 120 */
120static int need_forkexit_callback; 121static int need_forkexit_callback;
121 122static int need_mm_owner_callback __read_mostly;
122/* bits in struct cgroup flags field */
123enum {
124 /* Control Group is dead */
125 CGRP_REMOVED,
126 /* Control Group has previously had a child cgroup or a task,
127 * but no longer (only if CGRP_NOTIFY_ON_RELEASE is set) */
128 CGRP_RELEASABLE,
129 /* Control Group requires release notifications to userspace */
130 CGRP_NOTIFY_ON_RELEASE,
131};
132 123
133/* convenient tests for these bits */ 124/* convenient tests for these bits */
134inline int cgroup_is_removed(const struct cgroup *cgrp) 125inline int cgroup_is_removed(const struct cgroup *cgrp)
@@ -204,6 +195,27 @@ static struct cg_cgroup_link init_css_set_link;
204static DEFINE_RWLOCK(css_set_lock); 195static DEFINE_RWLOCK(css_set_lock);
205static int css_set_count; 196static int css_set_count;
206 197
198/* hash table for cgroup groups. This improves the performance to
199 * find an existing css_set */
200#define CSS_SET_HASH_BITS 7
201#define CSS_SET_TABLE_SIZE (1 << CSS_SET_HASH_BITS)
202static struct hlist_head css_set_table[CSS_SET_TABLE_SIZE];
203
204static struct hlist_head *css_set_hash(struct cgroup_subsys_state *css[])
205{
206 int i;
207 int index;
208 unsigned long tmp = 0UL;
209
210 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++)
211 tmp += (unsigned long)css[i];
212 tmp = (tmp >> 16) ^ tmp;
213
214 index = hash_long(tmp, CSS_SET_HASH_BITS);
215
216 return &css_set_table[index];
217}
218
207/* We don't maintain the lists running through each css_set to its 219/* We don't maintain the lists running through each css_set to its
208 * task until after the first call to cgroup_iter_start(). This 220 * task until after the first call to cgroup_iter_start(). This
209 * reduces the fork()/exit() overhead for people who have cgroups 221 * reduces the fork()/exit() overhead for people who have cgroups
@@ -230,7 +242,7 @@ static int use_task_css_set_links;
230static void unlink_css_set(struct css_set *cg) 242static void unlink_css_set(struct css_set *cg)
231{ 243{
232 write_lock(&css_set_lock); 244 write_lock(&css_set_lock);
233 list_del(&cg->list); 245 hlist_del(&cg->hlist);
234 css_set_count--; 246 css_set_count--;
235 while (!list_empty(&cg->cg_links)) { 247 while (!list_empty(&cg->cg_links)) {
236 struct cg_cgroup_link *link; 248 struct cg_cgroup_link *link;
@@ -295,9 +307,7 @@ static inline void put_css_set_taskexit(struct css_set *cg)
295/* 307/*
296 * find_existing_css_set() is a helper for 308 * find_existing_css_set() is a helper for
297 * find_css_set(), and checks to see whether an existing 309 * find_css_set(), and checks to see whether an existing
298 * css_set is suitable. This currently walks a linked-list for 310 * css_set is suitable.
299 * simplicity; a later patch will use a hash table for better
300 * performance
301 * 311 *
302 * oldcg: the cgroup group that we're using before the cgroup 312 * oldcg: the cgroup group that we're using before the cgroup
303 * transition 313 * transition
@@ -314,7 +324,9 @@ static struct css_set *find_existing_css_set(
314{ 324{
315 int i; 325 int i;
316 struct cgroupfs_root *root = cgrp->root; 326 struct cgroupfs_root *root = cgrp->root;
317 struct list_head *l = &init_css_set.list; 327 struct hlist_head *hhead;
328 struct hlist_node *node;
329 struct css_set *cg;
318 330
319 /* Built the set of subsystem state objects that we want to 331 /* Built the set of subsystem state objects that we want to
320 * see in the new css_set */ 332 * see in the new css_set */
@@ -331,18 +343,13 @@ static struct css_set *find_existing_css_set(
331 } 343 }
332 } 344 }
333 345
334 /* Look through existing cgroup groups to find one to reuse */ 346 hhead = css_set_hash(template);
335 do { 347 hlist_for_each_entry(cg, node, hhead, hlist) {
336 struct css_set *cg =
337 list_entry(l, struct css_set, list);
338
339 if (!memcmp(template, cg->subsys, sizeof(cg->subsys))) { 348 if (!memcmp(template, cg->subsys, sizeof(cg->subsys))) {
340 /* All subsystems matched */ 349 /* All subsystems matched */
341 return cg; 350 return cg;
342 } 351 }
343 /* Try the next cgroup group */ 352 }
344 l = l->next;
345 } while (l != &init_css_set.list);
346 353
347 /* No existing cgroup group matched */ 354 /* No existing cgroup group matched */
348 return NULL; 355 return NULL;
@@ -404,6 +411,8 @@ static struct css_set *find_css_set(
404 struct list_head tmp_cg_links; 411 struct list_head tmp_cg_links;
405 struct cg_cgroup_link *link; 412 struct cg_cgroup_link *link;
406 413
414 struct hlist_head *hhead;
415
407 /* First see if we already have a cgroup group that matches 416 /* First see if we already have a cgroup group that matches
408 * the desired set */ 417 * the desired set */
409 write_lock(&css_set_lock); 418 write_lock(&css_set_lock);
@@ -428,6 +437,7 @@ static struct css_set *find_css_set(
428 kref_init(&res->ref); 437 kref_init(&res->ref);
429 INIT_LIST_HEAD(&res->cg_links); 438 INIT_LIST_HEAD(&res->cg_links);
430 INIT_LIST_HEAD(&res->tasks); 439 INIT_LIST_HEAD(&res->tasks);
440 INIT_HLIST_NODE(&res->hlist);
431 441
432 /* Copy the set of subsystem state objects generated in 442 /* Copy the set of subsystem state objects generated in
433 * find_existing_css_set() */ 443 * find_existing_css_set() */
@@ -467,9 +477,12 @@ static struct css_set *find_css_set(
467 477
468 BUG_ON(!list_empty(&tmp_cg_links)); 478 BUG_ON(!list_empty(&tmp_cg_links));
469 479
470 /* Link this cgroup group into the list */
471 list_add(&res->list, &init_css_set.list);
472 css_set_count++; 480 css_set_count++;
481
482 /* Add this cgroup group to the hash table */
483 hhead = css_set_hash(res->subsys);
484 hlist_add_head(&res->hlist, hhead);
485
473 write_unlock(&css_set_lock); 486 write_unlock(&css_set_lock);
474 487
475 return res; 488 return res;
@@ -562,7 +575,7 @@ static struct inode_operations cgroup_dir_inode_operations;
562static struct file_operations proc_cgroupstats_operations; 575static struct file_operations proc_cgroupstats_operations;
563 576
564static struct backing_dev_info cgroup_backing_dev_info = { 577static struct backing_dev_info cgroup_backing_dev_info = {
565 .capabilities = BDI_CAP_NO_ACCT_DIRTY | BDI_CAP_NO_WRITEBACK, 578 .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK,
566}; 579};
567 580
568static struct inode *cgroup_new_inode(mode_t mode, struct super_block *sb) 581static struct inode *cgroup_new_inode(mode_t mode, struct super_block *sb)
@@ -948,7 +961,7 @@ static int cgroup_get_sb(struct file_system_type *fs_type,
948 int ret = 0; 961 int ret = 0;
949 struct super_block *sb; 962 struct super_block *sb;
950 struct cgroupfs_root *root; 963 struct cgroupfs_root *root;
951 struct list_head tmp_cg_links, *l; 964 struct list_head tmp_cg_links;
952 INIT_LIST_HEAD(&tmp_cg_links); 965 INIT_LIST_HEAD(&tmp_cg_links);
953 966
954 /* First find the desired set of subsystems */ 967 /* First find the desired set of subsystems */
@@ -990,6 +1003,7 @@ static int cgroup_get_sb(struct file_system_type *fs_type,
990 /* New superblock */ 1003 /* New superblock */
991 struct cgroup *cgrp = &root->top_cgroup; 1004 struct cgroup *cgrp = &root->top_cgroup;
992 struct inode *inode; 1005 struct inode *inode;
1006 int i;
993 1007
994 BUG_ON(sb->s_root != NULL); 1008 BUG_ON(sb->s_root != NULL);
995 1009
@@ -1034,22 +1048,25 @@ static int cgroup_get_sb(struct file_system_type *fs_type,
1034 /* Link the top cgroup in this hierarchy into all 1048 /* Link the top cgroup in this hierarchy into all
1035 * the css_set objects */ 1049 * the css_set objects */
1036 write_lock(&css_set_lock); 1050 write_lock(&css_set_lock);
1037 l = &init_css_set.list; 1051 for (i = 0; i < CSS_SET_TABLE_SIZE; i++) {
1038 do { 1052 struct hlist_head *hhead = &css_set_table[i];
1053 struct hlist_node *node;
1039 struct css_set *cg; 1054 struct css_set *cg;
1040 struct cg_cgroup_link *link; 1055
1041 cg = list_entry(l, struct css_set, list); 1056 hlist_for_each_entry(cg, node, hhead, hlist) {
1042 BUG_ON(list_empty(&tmp_cg_links)); 1057 struct cg_cgroup_link *link;
1043 link = list_entry(tmp_cg_links.next, 1058
1044 struct cg_cgroup_link, 1059 BUG_ON(list_empty(&tmp_cg_links));
1045 cgrp_link_list); 1060 link = list_entry(tmp_cg_links.next,
1046 list_del(&link->cgrp_link_list); 1061 struct cg_cgroup_link,
1047 link->cg = cg; 1062 cgrp_link_list);
1048 list_add(&link->cgrp_link_list, 1063 list_del(&link->cgrp_link_list);
1049 &root->top_cgroup.css_sets); 1064 link->cg = cg;
1050 list_add(&link->cg_link_list, &cg->cg_links); 1065 list_add(&link->cgrp_link_list,
1051 l = l->next; 1066 &root->top_cgroup.css_sets);
1052 } while (l != &init_css_set.list); 1067 list_add(&link->cg_link_list, &cg->cg_links);
1068 }
1069 }
1053 write_unlock(&css_set_lock); 1070 write_unlock(&css_set_lock);
1054 1071
1055 free_cg_links(&tmp_cg_links); 1072 free_cg_links(&tmp_cg_links);
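
With no global css_set list left, "visit every css_set" becomes "walk every bucket of css_set_table", which is what the rewritten cgroup_get_sb() loop above does under css_set_lock. The same traversal pattern in isolation, as a sketch; the helper name is hypothetical, while css_set_table, struct css_set and the 2.6.25-era four-argument hlist_for_each_entry() are used exactly as in the hunk.

/* Visit every css_set in the system by walking each hash bucket. */
static void for_each_css_set_sketch(void (*fn)(struct css_set *cg))
{
	int i;

	for (i = 0; i < CSS_SET_TABLE_SIZE; i++) {
		struct hlist_head *hhead = &css_set_table[i];
		struct hlist_node *node;
		struct css_set *cg;

		hlist_for_each_entry(cg, node, hhead, hlist)
			fn(cg);	/* caller is expected to hold css_set_lock */
	}
}
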
@@ -1307,18 +1324,16 @@ enum cgroup_filetype {
1307 FILE_DIR, 1324 FILE_DIR,
1308 FILE_TASKLIST, 1325 FILE_TASKLIST,
1309 FILE_NOTIFY_ON_RELEASE, 1326 FILE_NOTIFY_ON_RELEASE,
1310 FILE_RELEASABLE,
1311 FILE_RELEASE_AGENT, 1327 FILE_RELEASE_AGENT,
1312}; 1328};
1313 1329
1314static ssize_t cgroup_write_uint(struct cgroup *cgrp, struct cftype *cft, 1330static ssize_t cgroup_write_X64(struct cgroup *cgrp, struct cftype *cft,
1315 struct file *file, 1331 struct file *file,
1316 const char __user *userbuf, 1332 const char __user *userbuf,
1317 size_t nbytes, loff_t *unused_ppos) 1333 size_t nbytes, loff_t *unused_ppos)
1318{ 1334{
1319 char buffer[64]; 1335 char buffer[64];
1320 int retval = 0; 1336 int retval = 0;
1321 u64 val;
1322 char *end; 1337 char *end;
1323 1338
1324 if (!nbytes) 1339 if (!nbytes)
@@ -1329,16 +1344,18 @@ static ssize_t cgroup_write_uint(struct cgroup *cgrp, struct cftype *cft,
1329 return -EFAULT; 1344 return -EFAULT;
1330 1345
1331 buffer[nbytes] = 0; /* nul-terminate */ 1346 buffer[nbytes] = 0; /* nul-terminate */
1332 1347 strstrip(buffer);
1333 /* strip newline if necessary */ 1348 if (cft->write_u64) {
1334 if (nbytes && (buffer[nbytes-1] == '\n')) 1349 u64 val = simple_strtoull(buffer, &end, 0);
1335 buffer[nbytes-1] = 0; 1350 if (*end)
1336 val = simple_strtoull(buffer, &end, 0); 1351 return -EINVAL;
1337 if (*end) 1352 retval = cft->write_u64(cgrp, cft, val);
1338 return -EINVAL; 1353 } else {
1339 1354 s64 val = simple_strtoll(buffer, &end, 0);
1340 /* Pass to subsystem */ 1355 if (*end)
1341 retval = cft->write_uint(cgrp, cft, val); 1356 return -EINVAL;
1357 retval = cft->write_s64(cgrp, cft, val);
1358 }
1342 if (!retval) 1359 if (!retval)
1343 retval = nbytes; 1360 retval = nbytes;
1344 return retval; 1361 return retval;
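
cgroup_write_uint() becomes cgroup_write_X64(): it strips whitespace, parses with simple_strtoull() or simple_strtoll() depending on whether the cftype supplies write_u64 or write_s64, rejects trailing garbage with -EINVAL, and on success the caller reports the whole write as consumed. Below is a sketch of a hypothetical signed handler pair that would plug into this path; the foo_* names are illustrative, not from this patch.

/* Hypothetical per-cgroup signed knob served through read_s64/write_s64. */
static s64 foo_nice_read(struct cgroup *cgrp, struct cftype *cft)
{
	return 0;	/* placeholder: return the stored value */
}

static int foo_nice_write(struct cgroup *cgrp, struct cftype *cft, s64 val)
{
	if (val < -20 || val > 19)
		return -EINVAL;	/* propagated back to the writer */
	/* store val in the subsystem's per-cgroup state here */
	return 0;		/* 0 lets the caller return nbytes */
}

static struct cftype foo_nice_file = {
	.name = "nice",
	.read_s64 = foo_nice_read,
	.write_s64 = foo_nice_write,
};
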
@@ -1419,23 +1436,39 @@ static ssize_t cgroup_file_write(struct file *file, const char __user *buf,
1419 return -ENODEV; 1436 return -ENODEV;
1420 if (cft->write) 1437 if (cft->write)
1421 return cft->write(cgrp, cft, file, buf, nbytes, ppos); 1438 return cft->write(cgrp, cft, file, buf, nbytes, ppos);
1422 if (cft->write_uint) 1439 if (cft->write_u64 || cft->write_s64)
1423 return cgroup_write_uint(cgrp, cft, file, buf, nbytes, ppos); 1440 return cgroup_write_X64(cgrp, cft, file, buf, nbytes, ppos);
1441 if (cft->trigger) {
1442 int ret = cft->trigger(cgrp, (unsigned int)cft->private);
1443 return ret ? ret : nbytes;
1444 }
1424 return -EINVAL; 1445 return -EINVAL;
1425} 1446}
1426 1447
1427static ssize_t cgroup_read_uint(struct cgroup *cgrp, struct cftype *cft, 1448static ssize_t cgroup_read_u64(struct cgroup *cgrp, struct cftype *cft,
1428 struct file *file, 1449 struct file *file,
1429 char __user *buf, size_t nbytes, 1450 char __user *buf, size_t nbytes,
1430 loff_t *ppos) 1451 loff_t *ppos)
1431{ 1452{
1432 char tmp[64]; 1453 char tmp[64];
1433 u64 val = cft->read_uint(cgrp, cft); 1454 u64 val = cft->read_u64(cgrp, cft);
1434 int len = sprintf(tmp, "%llu\n", (unsigned long long) val); 1455 int len = sprintf(tmp, "%llu\n", (unsigned long long) val);
1435 1456
1436 return simple_read_from_buffer(buf, nbytes, ppos, tmp, len); 1457 return simple_read_from_buffer(buf, nbytes, ppos, tmp, len);
1437} 1458}
1438 1459
1460static ssize_t cgroup_read_s64(struct cgroup *cgrp, struct cftype *cft,
1461 struct file *file,
1462 char __user *buf, size_t nbytes,
1463 loff_t *ppos)
1464{
1465 char tmp[64];
1466 s64 val = cft->read_s64(cgrp, cft);
1467 int len = sprintf(tmp, "%lld\n", (long long) val);
1468
1469 return simple_read_from_buffer(buf, nbytes, ppos, tmp, len);
1470}
1471
1439static ssize_t cgroup_common_file_read(struct cgroup *cgrp, 1472static ssize_t cgroup_common_file_read(struct cgroup *cgrp,
1440 struct cftype *cft, 1473 struct cftype *cft,
1441 struct file *file, 1474 struct file *file,
@@ -1490,11 +1523,56 @@ static ssize_t cgroup_file_read(struct file *file, char __user *buf,
1490 1523
1491 if (cft->read) 1524 if (cft->read)
1492 return cft->read(cgrp, cft, file, buf, nbytes, ppos); 1525 return cft->read(cgrp, cft, file, buf, nbytes, ppos);
1493 if (cft->read_uint) 1526 if (cft->read_u64)
1494 return cgroup_read_uint(cgrp, cft, file, buf, nbytes, ppos); 1527 return cgroup_read_u64(cgrp, cft, file, buf, nbytes, ppos);
1528 if (cft->read_s64)
1529 return cgroup_read_s64(cgrp, cft, file, buf, nbytes, ppos);
1495 return -EINVAL; 1530 return -EINVAL;
1496} 1531}
1497 1532
1533/*
1534 * seqfile ops/methods for returning structured data. Currently just
1535 * supports string->u64 maps, but can be extended in future.
1536 */
1537
1538struct cgroup_seqfile_state {
1539 struct cftype *cft;
1540 struct cgroup *cgroup;
1541};
1542
1543static int cgroup_map_add(struct cgroup_map_cb *cb, const char *key, u64 value)
1544{
1545 struct seq_file *sf = cb->state;
1546 return seq_printf(sf, "%s %llu\n", key, (unsigned long long)value);
1547}
1548
1549static int cgroup_seqfile_show(struct seq_file *m, void *arg)
1550{
1551 struct cgroup_seqfile_state *state = m->private;
1552 struct cftype *cft = state->cft;
1553 if (cft->read_map) {
1554 struct cgroup_map_cb cb = {
1555 .fill = cgroup_map_add,
1556 .state = m,
1557 };
1558 return cft->read_map(state->cgroup, cft, &cb);
1559 }
1560 return cft->read_seq_string(state->cgroup, cft, m);
1561}
1562
1563int cgroup_seqfile_release(struct inode *inode, struct file *file)
1564{
1565 struct seq_file *seq = file->private_data;
1566 kfree(seq->private);
1567 return single_release(inode, file);
1568}
1569
1570static struct file_operations cgroup_seqfile_operations = {
1571 .read = seq_read,
1572 .llseek = seq_lseek,
1573 .release = cgroup_seqfile_release,
1574};
1575
1498static int cgroup_file_open(struct inode *inode, struct file *file) 1576static int cgroup_file_open(struct inode *inode, struct file *file)
1499{ 1577{
1500 int err; 1578 int err;
@@ -1507,7 +1585,18 @@ static int cgroup_file_open(struct inode *inode, struct file *file)
1507 cft = __d_cft(file->f_dentry); 1585 cft = __d_cft(file->f_dentry);
1508 if (!cft) 1586 if (!cft)
1509 return -ENODEV; 1587 return -ENODEV;
1510 if (cft->open) 1588 if (cft->read_map || cft->read_seq_string) {
1589 struct cgroup_seqfile_state *state =
1590 kzalloc(sizeof(*state), GFP_USER);
1591 if (!state)
1592 return -ENOMEM;
1593 state->cft = cft;
1594 state->cgroup = __d_cgrp(file->f_dentry->d_parent);
1595 file->f_op = &cgroup_seqfile_operations;
1596 err = single_open(file, cgroup_seqfile_show, state);
1597 if (err < 0)
1598 kfree(state);
1599 } else if (cft->open)
1511 err = cft->open(inode, file); 1600 err = cft->open(inode, file);
1512 else 1601 else
1513 err = 0; 1602 err = 0;
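
The seqfile support added above gives cftypes two new read hooks: read_map for key/value output and read_seq_string for free-form seq_file output. cgroup_file_open() swaps in cgroup_seqfile_operations for any file that sets either one, and cgroup_seqfile_release() frees the per-open state. A sketch of a hypothetical read_map handler; the bar_* names are illustrative, not from this patch.

/* Emit a few named counters; each fill() becomes one "key value" line. */
static int bar_stat_read_map(struct cgroup *cgrp, struct cftype *cft,
			     struct cgroup_map_cb *cb)
{
	cb->fill(cb, "hits", 0);	/* placeholder values */
	cb->fill(cb, "misses", 0);
	return 0;
}

static struct cftype bar_stat_file = {
	.name = "stat",
	.read_map = bar_stat_read_map,
};
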
@@ -1715,7 +1804,7 @@ static void cgroup_advance_iter(struct cgroup *cgrp,
1715 * The tasklist_lock is not held here, as do_each_thread() and 1804 * The tasklist_lock is not held here, as do_each_thread() and
1716 * while_each_thread() are protected by RCU. 1805 * while_each_thread() are protected by RCU.
1717 */ 1806 */
1718void cgroup_enable_task_cg_lists(void) 1807static void cgroup_enable_task_cg_lists(void)
1719{ 1808{
1720 struct task_struct *p, *g; 1809 struct task_struct *p, *g;
1721 write_lock(&css_set_lock); 1810 write_lock(&css_set_lock);
@@ -1913,14 +2002,14 @@ int cgroup_scan_tasks(struct cgroup_scanner *scan)
1913 2002
1914 if (heap->size) { 2003 if (heap->size) {
1915 for (i = 0; i < heap->size; i++) { 2004 for (i = 0; i < heap->size; i++) {
1916 struct task_struct *p = heap->ptrs[i]; 2005 struct task_struct *q = heap->ptrs[i];
1917 if (i == 0) { 2006 if (i == 0) {
1918 latest_time = p->start_time; 2007 latest_time = q->start_time;
1919 latest_task = p; 2008 latest_task = q;
1920 } 2009 }
1921 /* Process the task per the caller's callback */ 2010 /* Process the task per the caller's callback */
1922 scan->process_task(p, scan); 2011 scan->process_task(q, scan);
1923 put_task_struct(p); 2012 put_task_struct(q);
1924 } 2013 }
1925 /* 2014 /*
1926 * If we had to process any tasks at all, scan again 2015 * If we had to process any tasks at all, scan again
@@ -2138,11 +2227,6 @@ static u64 cgroup_read_notify_on_release(struct cgroup *cgrp,
2138 return notify_on_release(cgrp); 2227 return notify_on_release(cgrp);
2139} 2228}
2140 2229
2141static u64 cgroup_read_releasable(struct cgroup *cgrp, struct cftype *cft)
2142{
2143 return test_bit(CGRP_RELEASABLE, &cgrp->flags);
2144}
2145
2146/* 2230/*
2147 * for the common functions, 'private' gives the type of file 2231 * for the common functions, 'private' gives the type of file
2148 */ 2232 */
@@ -2158,16 +2242,10 @@ static struct cftype files[] = {
2158 2242
2159 { 2243 {
2160 .name = "notify_on_release", 2244 .name = "notify_on_release",
2161 .read_uint = cgroup_read_notify_on_release, 2245 .read_u64 = cgroup_read_notify_on_release,
2162 .write = cgroup_common_file_write, 2246 .write = cgroup_common_file_write,
2163 .private = FILE_NOTIFY_ON_RELEASE, 2247 .private = FILE_NOTIFY_ON_RELEASE,
2164 }, 2248 },
2165
2166 {
2167 .name = "releasable",
2168 .read_uint = cgroup_read_releasable,
2169 .private = FILE_RELEASABLE,
2170 }
2171}; 2249};
2172 2250
2173static struct cftype cft_release_agent = { 2251static struct cftype cft_release_agent = {
@@ -2401,10 +2479,9 @@ static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry)
2401 return 0; 2479 return 0;
2402} 2480}
2403 2481
2404static void cgroup_init_subsys(struct cgroup_subsys *ss) 2482static void __init cgroup_init_subsys(struct cgroup_subsys *ss)
2405{ 2483{
2406 struct cgroup_subsys_state *css; 2484 struct cgroup_subsys_state *css;
2407 struct list_head *l;
2408 2485
2409 printk(KERN_INFO "Initializing cgroup subsys %s\n", ss->name); 2486 printk(KERN_INFO "Initializing cgroup subsys %s\n", ss->name);
2410 2487
@@ -2415,34 +2492,19 @@ static void cgroup_init_subsys(struct cgroup_subsys *ss)
2415 BUG_ON(IS_ERR(css)); 2492 BUG_ON(IS_ERR(css));
2416 init_cgroup_css(css, ss, dummytop); 2493 init_cgroup_css(css, ss, dummytop);
2417 2494
2418 /* Update all cgroup groups to contain a subsys 2495 /* Update the init_css_set to contain a subsys
2419 * pointer to this state - since the subsystem is 2496 * pointer to this state - since the subsystem is
2420 * newly registered, all tasks and hence all cgroup 2497 * newly registered, all tasks and hence the
2421 * groups are in the subsystem's top cgroup. */ 2498 * init_css_set is in the subsystem's top cgroup. */
2422 write_lock(&css_set_lock); 2499 init_css_set.subsys[ss->subsys_id] = dummytop->subsys[ss->subsys_id];
2423 l = &init_css_set.list;
2424 do {
2425 struct css_set *cg =
2426 list_entry(l, struct css_set, list);
2427 cg->subsys[ss->subsys_id] = dummytop->subsys[ss->subsys_id];
2428 l = l->next;
2429 } while (l != &init_css_set.list);
2430 write_unlock(&css_set_lock);
2431
2432 /* If this subsystem requested that it be notified with fork
2433 * events, we should send it one now for every process in the
2434 * system */
2435 if (ss->fork) {
2436 struct task_struct *g, *p;
2437
2438 read_lock(&tasklist_lock);
2439 do_each_thread(g, p) {
2440 ss->fork(ss, p);
2441 } while_each_thread(g, p);
2442 read_unlock(&tasklist_lock);
2443 }
2444 2500
2445 need_forkexit_callback |= ss->fork || ss->exit; 2501 need_forkexit_callback |= ss->fork || ss->exit;
2502 need_mm_owner_callback |= !!ss->mm_owner_changed;
2503
2504 /* At system boot, before all subsystems have been
2505 * registered, no tasks have been forked, so we don't
2506 * need to invoke fork callbacks here. */
2507 BUG_ON(!list_empty(&init_task.tasks));
2446 2508
2447 ss->active = 1; 2509 ss->active = 1;
2448} 2510}
@@ -2458,9 +2520,9 @@ int __init cgroup_init_early(void)
2458 int i; 2520 int i;
2459 kref_init(&init_css_set.ref); 2521 kref_init(&init_css_set.ref);
2460 kref_get(&init_css_set.ref); 2522 kref_get(&init_css_set.ref);
2461 INIT_LIST_HEAD(&init_css_set.list);
2462 INIT_LIST_HEAD(&init_css_set.cg_links); 2523 INIT_LIST_HEAD(&init_css_set.cg_links);
2463 INIT_LIST_HEAD(&init_css_set.tasks); 2524 INIT_LIST_HEAD(&init_css_set.tasks);
2525 INIT_HLIST_NODE(&init_css_set.hlist);
2464 css_set_count = 1; 2526 css_set_count = 1;
2465 init_cgroup_root(&rootnode); 2527 init_cgroup_root(&rootnode);
2466 list_add(&rootnode.root_list, &roots); 2528 list_add(&rootnode.root_list, &roots);
@@ -2473,6 +2535,9 @@ int __init cgroup_init_early(void)
2473 list_add(&init_css_set_link.cg_link_list, 2535 list_add(&init_css_set_link.cg_link_list,
2474 &init_css_set.cg_links); 2536 &init_css_set.cg_links);
2475 2537
2538 for (i = 0; i < CSS_SET_TABLE_SIZE; i++)
2539 INIT_HLIST_HEAD(&css_set_table[i]);
2540
2476 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 2541 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
2477 struct cgroup_subsys *ss = subsys[i]; 2542 struct cgroup_subsys *ss = subsys[i];
2478 2543
@@ -2502,7 +2567,7 @@ int __init cgroup_init(void)
2502{ 2567{
2503 int err; 2568 int err;
2504 int i; 2569 int i;
2505 struct proc_dir_entry *entry; 2570 struct hlist_head *hhead;
2506 2571
2507 err = bdi_init(&cgroup_backing_dev_info); 2572 err = bdi_init(&cgroup_backing_dev_info);
2508 if (err) 2573 if (err)
@@ -2514,13 +2579,15 @@ int __init cgroup_init(void)
2514 cgroup_init_subsys(ss); 2579 cgroup_init_subsys(ss);
2515 } 2580 }
2516 2581
2582 /* Add init_css_set to the hash table */
2583 hhead = css_set_hash(init_css_set.subsys);
2584 hlist_add_head(&init_css_set.hlist, hhead);
2585
2517 err = register_filesystem(&cgroup_fs_type); 2586 err = register_filesystem(&cgroup_fs_type);
2518 if (err < 0) 2587 if (err < 0)
2519 goto out; 2588 goto out;
2520 2589
2521 entry = create_proc_entry("cgroups", 0, NULL); 2590 proc_create("cgroups", 0, NULL, &proc_cgroupstats_operations);
2522 if (entry)
2523 entry->proc_fops = &proc_cgroupstats_operations;
2524 2591
2525out: 2592out:
2526 if (err) 2593 if (err)
@@ -2683,6 +2750,34 @@ void cgroup_fork_callbacks(struct task_struct *child)
2683 } 2750 }
2684} 2751}
2685 2752
2753#ifdef CONFIG_MM_OWNER
2754/**
2755 * cgroup_mm_owner_callbacks - run callbacks when the mm->owner changes
2756 * @p: the new owner
2757 *
2758 * Called on every change to mm->owner. mm_init_owner() does not
2759 * invoke this routine, since it assigns the mm->owner the first time
2760 * and does not change it.
2761 */
2762void cgroup_mm_owner_callbacks(struct task_struct *old, struct task_struct *new)
2763{
2764 struct cgroup *oldcgrp, *newcgrp;
2765
2766 if (need_mm_owner_callback) {
2767 int i;
2768 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
2769 struct cgroup_subsys *ss = subsys[i];
2770 oldcgrp = task_cgroup(old, ss->subsys_id);
2771 newcgrp = task_cgroup(new, ss->subsys_id);
2772 if (oldcgrp == newcgrp)
2773 continue;
2774 if (ss->mm_owner_changed)
2775 ss->mm_owner_changed(ss, oldcgrp, newcgrp);
2776 }
2777 }
2778}
2779#endif /* CONFIG_MM_OWNER */
2780
2686/** 2781/**
2687 * cgroup_post_fork - called on a new task after adding it to the task list 2782 * cgroup_post_fork - called on a new task after adding it to the task list
2688 * @child: the task in question 2783 * @child: the task in question
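
cgroup_mm_owner_callbacks() above fans an mm->owner change out to every subsystem that registered an mm_owner_changed hook; need_mm_owner_callback is set in cgroup_init_subsys() earlier in this diff, and kernel/exit.c below calls into this path from mm_update_next_owner(). The sketch below shows how a hypothetical subsystem might wire up that hook; the memctl_* names are illustrative, and the hook is assumed to return void since only its call site appears in these hunks.

/* Hypothetical subsystem reacting to mm->owner moving between cgroups. */
static void memctl_mm_owner_changed(struct cgroup_subsys *ss,
				    struct cgroup *old, struct cgroup *new)
{
	/*
	 * Both cgroups were already resolved with task_cgroup() by the
	 * caller; accounting state could be migrated from old to new here.
	 */
}

struct cgroup_subsys memctl_subsys = {
	.name			= "memctl",
	.mm_owner_changed	= memctl_mm_owner_changed,
	/* .create, .destroy, .subsys_id etc. omitted from this sketch */
};
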
diff --git a/kernel/cgroup_debug.c b/kernel/cgroup_debug.c
index 37301e877cb0..c3dc3aba4c02 100644
--- a/kernel/cgroup_debug.c
+++ b/kernel/cgroup_debug.c
@@ -1,5 +1,5 @@
1/* 1/*
2 * kernel/ccontainer_debug.c - Example cgroup subsystem that 2 * kernel/cgroup_debug.c - Example cgroup subsystem that
3 * exposes debug info 3 * exposes debug info
4 * 4 *
5 * Copyright (C) Google Inc, 2007 5 * Copyright (C) Google Inc, 2007
@@ -62,25 +62,35 @@ static u64 current_css_set_refcount_read(struct cgroup *cont,
62 return count; 62 return count;
63} 63}
64 64
65static u64 releasable_read(struct cgroup *cgrp, struct cftype *cft)
66{
67 return test_bit(CGRP_RELEASABLE, &cgrp->flags);
68}
69
65static struct cftype files[] = { 70static struct cftype files[] = {
66 { 71 {
67 .name = "cgroup_refcount", 72 .name = "cgroup_refcount",
68 .read_uint = cgroup_refcount_read, 73 .read_u64 = cgroup_refcount_read,
69 }, 74 },
70 { 75 {
71 .name = "taskcount", 76 .name = "taskcount",
72 .read_uint = taskcount_read, 77 .read_u64 = taskcount_read,
73 }, 78 },
74 79
75 { 80 {
76 .name = "current_css_set", 81 .name = "current_css_set",
77 .read_uint = current_css_set_read, 82 .read_u64 = current_css_set_read,
78 }, 83 },
79 84
80 { 85 {
81 .name = "current_css_set_refcount", 86 .name = "current_css_set_refcount",
82 .read_uint = current_css_set_refcount_read, 87 .read_u64 = current_css_set_refcount_read,
83 }, 88 },
89
90 {
91 .name = "releasable",
92 .read_u64 = releasable_read,
93 }
84}; 94};
85 95
86static int debug_populate(struct cgroup_subsys *ss, struct cgroup *cont) 96static int debug_populate(struct cgroup_subsys *ss, struct cgroup *cont)
diff --git a/kernel/compat.c b/kernel/compat.c
index e1ef04870c2a..32c254a8ab9a 100644
--- a/kernel/compat.c
+++ b/kernel/compat.c
@@ -898,7 +898,7 @@ asmlinkage long compat_sys_rt_sigsuspend(compat_sigset_t __user *unewset, compat
898 898
899 current->state = TASK_INTERRUPTIBLE; 899 current->state = TASK_INTERRUPTIBLE;
900 schedule(); 900 schedule();
901 set_thread_flag(TIF_RESTORE_SIGMASK); 901 set_restore_sigmask();
902 return -ERESTARTNOHAND; 902 return -ERESTARTNOHAND;
903} 903}
904#endif /* __ARCH_WANT_COMPAT_SYS_RT_SIGSUSPEND */ 904#endif /* __ARCH_WANT_COMPAT_SYS_RT_SIGSUSPEND */
@@ -955,7 +955,8 @@ asmlinkage long compat_sys_adjtimex(struct compat_timex __user *utp)
955 __put_user(txc.jitcnt, &utp->jitcnt) || 955 __put_user(txc.jitcnt, &utp->jitcnt) ||
956 __put_user(txc.calcnt, &utp->calcnt) || 956 __put_user(txc.calcnt, &utp->calcnt) ||
957 __put_user(txc.errcnt, &utp->errcnt) || 957 __put_user(txc.errcnt, &utp->errcnt) ||
958 __put_user(txc.stbcnt, &utp->stbcnt)) 958 __put_user(txc.stbcnt, &utp->stbcnt) ||
959 __put_user(txc.tai, &utp->tai))
959 ret = -EFAULT; 960 ret = -EFAULT;
960 961
961 return ret; 962 return ret;
@@ -1080,4 +1081,3 @@ compat_sys_sysinfo(struct compat_sysinfo __user *info)
1080 1081
1081 return 0; 1082 return 0;
1082} 1083}
1083
diff --git a/kernel/configs.c b/kernel/configs.c
index e84d3f9c6c7b..4c345210ed8c 100644
--- a/kernel/configs.c
+++ b/kernel/configs.c
@@ -79,12 +79,11 @@ static int __init ikconfig_init(void)
79 struct proc_dir_entry *entry; 79 struct proc_dir_entry *entry;
80 80
81 /* create the current config file */ 81 /* create the current config file */
82 entry = create_proc_entry("config.gz", S_IFREG | S_IRUGO, 82 entry = proc_create("config.gz", S_IFREG | S_IRUGO, NULL,
83 &proc_root); 83 &ikconfig_file_ops);
84 if (!entry) 84 if (!entry)
85 return -ENOMEM; 85 return -ENOMEM;
86 86
87 entry->proc_fops = &ikconfig_file_ops;
88 entry->size = kernel_config_data_size; 87 entry->size = kernel_config_data_size;
89 88
90 return 0; 89 return 0;
@@ -95,7 +94,7 @@ static int __init ikconfig_init(void)
95 94
96static void __exit ikconfig_cleanup(void) 95static void __exit ikconfig_cleanup(void)
97{ 96{
98 remove_proc_entry("config.gz", &proc_root); 97 remove_proc_entry("config.gz", NULL);
99} 98}
100 99
101module_init(ikconfig_init); 100module_init(ikconfig_init);
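
This is the same create_proc_entry() to proc_create() conversion that recurs elsewhere in the series (the cgroups entry above and kernel/dma.c below): the file_operations pointer is handed over at creation time rather than patched into the entry afterwards, so the entry is never visible without its fops. A minimal sketch of the new idiom with a seq_file-backed read; the example_* names are illustrative, not from this patch.

#include <linux/module.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>

static int example_show(struct seq_file *m, void *v)
{
	seq_printf(m, "hello\n");
	return 0;
}

static int example_open(struct inode *inode, struct file *file)
{
	return single_open(file, example_show, NULL);
}

static const struct file_operations example_proc_fops = {
	.open		= example_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= single_release,
};

static int __init example_init(void)
{
	/* fops supplied up front; a NULL parent means the /proc root */
	if (!proc_create("example", 0444, NULL, &example_proc_fops))
		return -ENOMEM;
	return 0;
}

static void __exit example_exit(void)
{
	remove_proc_entry("example", NULL);
}

module_init(example_init);
module_exit(example_exit);
MODULE_LICENSE("GPL");
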
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 2011ad8d2697..c77bc3a1c722 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -33,17 +33,13 @@ static struct {
33 * an ongoing cpu hotplug operation. 33 * an ongoing cpu hotplug operation.
34 */ 34 */
35 int refcount; 35 int refcount;
36 wait_queue_head_t writer_queue;
37} cpu_hotplug; 36} cpu_hotplug;
38 37
39#define writer_exists() (cpu_hotplug.active_writer != NULL)
40
41void __init cpu_hotplug_init(void) 38void __init cpu_hotplug_init(void)
42{ 39{
43 cpu_hotplug.active_writer = NULL; 40 cpu_hotplug.active_writer = NULL;
44 mutex_init(&cpu_hotplug.lock); 41 mutex_init(&cpu_hotplug.lock);
45 cpu_hotplug.refcount = 0; 42 cpu_hotplug.refcount = 0;
46 init_waitqueue_head(&cpu_hotplug.writer_queue);
47} 43}
48 44
49#ifdef CONFIG_HOTPLUG_CPU 45#ifdef CONFIG_HOTPLUG_CPU
@@ -65,11 +61,8 @@ void put_online_cpus(void)
65 if (cpu_hotplug.active_writer == current) 61 if (cpu_hotplug.active_writer == current)
66 return; 62 return;
67 mutex_lock(&cpu_hotplug.lock); 63 mutex_lock(&cpu_hotplug.lock);
68 cpu_hotplug.refcount--; 64 if (!--cpu_hotplug.refcount && unlikely(cpu_hotplug.active_writer))
69 65 wake_up_process(cpu_hotplug.active_writer);
70 if (unlikely(writer_exists()) && !cpu_hotplug.refcount)
71 wake_up(&cpu_hotplug.writer_queue);
72
73 mutex_unlock(&cpu_hotplug.lock); 66 mutex_unlock(&cpu_hotplug.lock);
74 67
75} 68}
@@ -98,8 +91,8 @@ void cpu_maps_update_done(void)
98 * Note that during a cpu-hotplug operation, the new readers, if any, 91 * Note that during a cpu-hotplug operation, the new readers, if any,
99 * will be blocked by the cpu_hotplug.lock 92 * will be blocked by the cpu_hotplug.lock
100 * 93 *
101 * Since cpu_maps_update_begin is always called after invoking 94 * Since cpu_hotplug_begin() is always called after invoking
102 * cpu_maps_update_begin, we can be sure that only one writer is active. 95 * cpu_maps_update_begin(), we can be sure that only one writer is active.
103 * 96 *
104 * Note that theoretically, there is a possibility of a livelock: 97 * Note that theoretically, there is a possibility of a livelock:
105 * - Refcount goes to zero, last reader wakes up the sleeping 98 * - Refcount goes to zero, last reader wakes up the sleeping
@@ -115,19 +108,16 @@ void cpu_maps_update_done(void)
115 */ 108 */
116static void cpu_hotplug_begin(void) 109static void cpu_hotplug_begin(void)
117{ 110{
118 DECLARE_WAITQUEUE(wait, current);
119
120 mutex_lock(&cpu_hotplug.lock);
121
122 cpu_hotplug.active_writer = current; 111 cpu_hotplug.active_writer = current;
123 add_wait_queue_exclusive(&cpu_hotplug.writer_queue, &wait); 112
124 while (cpu_hotplug.refcount) { 113 for (;;) {
125 set_current_state(TASK_UNINTERRUPTIBLE); 114 mutex_lock(&cpu_hotplug.lock);
115 if (likely(!cpu_hotplug.refcount))
116 break;
117 __set_current_state(TASK_UNINTERRUPTIBLE);
126 mutex_unlock(&cpu_hotplug.lock); 118 mutex_unlock(&cpu_hotplug.lock);
127 schedule(); 119 schedule();
128 mutex_lock(&cpu_hotplug.lock);
129 } 120 }
130 remove_wait_queue_locked(&cpu_hotplug.writer_queue, &wait);
131} 121}
132 122
133static void cpu_hotplug_done(void) 123static void cpu_hotplug_done(void)
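
The rewrite above drops the dedicated writer_queue: the would-be writer publishes itself in cpu_hotplug.active_writer and sleeps in TASK_UNINTERRUPTIBLE until put_online_cpus() (earlier hunk) drops the last reference and calls wake_up_process() on it. The same sleep-until-a-refcount-drains shape in isolation, as an illustrative sketch with generic names; the reader-acquire side mirrors what get_online_cpus() is assumed to do and is not taken from the hunks shown here.

#include <linux/mutex.h>
#include <linux/sched.h>

struct drain {
	struct mutex		lock;
	int			refcount;
	struct task_struct	*waiter;	/* parked writer, or NULL */
};

static void drain_get(struct drain *d)		/* reader side (assumed) */
{
	mutex_lock(&d->lock);
	d->refcount++;
	mutex_unlock(&d->lock);
}

static void drain_put(struct drain *d)		/* last reader wakes writer */
{
	mutex_lock(&d->lock);
	if (!--d->refcount && d->waiter)
		wake_up_process(d->waiter);
	mutex_unlock(&d->lock);
}

static void drain_wait(struct drain *d)		/* writer side */
{
	d->waiter = current;
	for (;;) {
		mutex_lock(&d->lock);
		if (!d->refcount)
			break;			/* returns with lock held */
		__set_current_state(TASK_UNINTERRUPTIBLE);
		mutex_unlock(&d->lock);
		schedule();			/* woken by the last put */
	}
}
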
@@ -136,7 +126,7 @@ static void cpu_hotplug_done(void)
136 mutex_unlock(&cpu_hotplug.lock); 126 mutex_unlock(&cpu_hotplug.lock);
137} 127}
138/* Need to know about CPUs going up/down? */ 128/* Need to know about CPUs going up/down? */
139int __cpuinit register_cpu_notifier(struct notifier_block *nb) 129int __ref register_cpu_notifier(struct notifier_block *nb)
140{ 130{
141 int ret; 131 int ret;
142 cpu_maps_update_begin(); 132 cpu_maps_update_begin();
@@ -149,7 +139,7 @@ int __cpuinit register_cpu_notifier(struct notifier_block *nb)
149 139
150EXPORT_SYMBOL(register_cpu_notifier); 140EXPORT_SYMBOL(register_cpu_notifier);
151 141
152void unregister_cpu_notifier(struct notifier_block *nb) 142void __ref unregister_cpu_notifier(struct notifier_block *nb)
153{ 143{
154 cpu_maps_update_begin(); 144 cpu_maps_update_begin();
155 raw_notifier_chain_unregister(&cpu_chain, nb); 145 raw_notifier_chain_unregister(&cpu_chain, nb);
@@ -180,7 +170,7 @@ struct take_cpu_down_param {
180}; 170};
181 171
182/* Take this CPU down. */ 172/* Take this CPU down. */
183static int take_cpu_down(void *_param) 173static int __ref take_cpu_down(void *_param)
184{ 174{
185 struct take_cpu_down_param *param = _param; 175 struct take_cpu_down_param *param = _param;
186 int err; 176 int err;
@@ -199,7 +189,7 @@ static int take_cpu_down(void *_param)
199} 189}
200 190
201/* Requires cpu_add_remove_lock to be held */ 191/* Requires cpu_add_remove_lock to be held */
202static int _cpu_down(unsigned int cpu, int tasks_frozen) 192static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
203{ 193{
204 int err, nr_calls = 0; 194 int err, nr_calls = 0;
205 struct task_struct *p; 195 struct task_struct *p;
@@ -225,7 +215,7 @@ static int _cpu_down(unsigned int cpu, int tasks_frozen)
225 __raw_notifier_call_chain(&cpu_chain, CPU_DOWN_FAILED | mod, 215 __raw_notifier_call_chain(&cpu_chain, CPU_DOWN_FAILED | mod,
226 hcpu, nr_calls, NULL); 216 hcpu, nr_calls, NULL);
227 printk("%s: attempt to take down CPU %u failed\n", 217 printk("%s: attempt to take down CPU %u failed\n",
228 __FUNCTION__, cpu); 218 __func__, cpu);
229 err = -EINVAL; 219 err = -EINVAL;
230 goto out_release; 220 goto out_release;
231 } 221 }
@@ -274,7 +264,7 @@ out_release:
274 return err; 264 return err;
275} 265}
276 266
277int cpu_down(unsigned int cpu) 267int __ref cpu_down(unsigned int cpu)
278{ 268{
279 int err = 0; 269 int err = 0;
280 270
@@ -305,7 +295,7 @@ static int __cpuinit _cpu_up(unsigned int cpu, int tasks_frozen)
305 if (ret == NOTIFY_BAD) { 295 if (ret == NOTIFY_BAD) {
306 nr_calls--; 296 nr_calls--;
307 printk("%s: attempt to bring up CPU %u failed\n", 297 printk("%s: attempt to bring up CPU %u failed\n",
308 __FUNCTION__, cpu); 298 __func__, cpu);
309 ret = -EINVAL; 299 ret = -EINVAL;
310 goto out_notify; 300 goto out_notify;
311 } 301 }
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 024888bb9814..8da627d33804 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -127,6 +127,7 @@ struct cpuset_hotplug_scanner {
127typedef enum { 127typedef enum {
128 CS_CPU_EXCLUSIVE, 128 CS_CPU_EXCLUSIVE,
129 CS_MEM_EXCLUSIVE, 129 CS_MEM_EXCLUSIVE,
130 CS_MEM_HARDWALL,
130 CS_MEMORY_MIGRATE, 131 CS_MEMORY_MIGRATE,
131 CS_SCHED_LOAD_BALANCE, 132 CS_SCHED_LOAD_BALANCE,
132 CS_SPREAD_PAGE, 133 CS_SPREAD_PAGE,
@@ -144,6 +145,11 @@ static inline int is_mem_exclusive(const struct cpuset *cs)
144 return test_bit(CS_MEM_EXCLUSIVE, &cs->flags); 145 return test_bit(CS_MEM_EXCLUSIVE, &cs->flags);
145} 146}
146 147
148static inline int is_mem_hardwall(const struct cpuset *cs)
149{
150 return test_bit(CS_MEM_HARDWALL, &cs->flags);
151}
152
147static inline int is_sched_load_balance(const struct cpuset *cs) 153static inline int is_sched_load_balance(const struct cpuset *cs)
148{ 154{
149 return test_bit(CS_SCHED_LOAD_BALANCE, &cs->flags); 155 return test_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
@@ -735,7 +741,8 @@ static inline int started_after(void *p1, void *p2)
735 * Return nonzero if this tasks's cpus_allowed mask should be changed (in other 741 * Return nonzero if this tasks's cpus_allowed mask should be changed (in other
736 * words, if its mask is not equal to its cpuset's mask). 742 * words, if its mask is not equal to its cpuset's mask).
737 */ 743 */
738int cpuset_test_cpumask(struct task_struct *tsk, struct cgroup_scanner *scan) 744static int cpuset_test_cpumask(struct task_struct *tsk,
745 struct cgroup_scanner *scan)
739{ 746{
740 return !cpus_equal(tsk->cpus_allowed, 747 return !cpus_equal(tsk->cpus_allowed,
741 (cgroup_cs(scan->cg))->cpus_allowed); 748 (cgroup_cs(scan->cg))->cpus_allowed);
@@ -752,7 +759,8 @@ int cpuset_test_cpumask(struct task_struct *tsk, struct cgroup_scanner *scan)
752 * We don't need to re-check for the cgroup/cpuset membership, since we're 759 * We don't need to re-check for the cgroup/cpuset membership, since we're
753 * holding cgroup_lock() at this point. 760 * holding cgroup_lock() at this point.
754 */ 761 */
755void cpuset_change_cpumask(struct task_struct *tsk, struct cgroup_scanner *scan) 762static void cpuset_change_cpumask(struct task_struct *tsk,
763 struct cgroup_scanner *scan)
756{ 764{
757 set_cpus_allowed_ptr(tsk, &((cgroup_cs(scan->cg))->cpus_allowed)); 765 set_cpus_allowed_ptr(tsk, &((cgroup_cs(scan->cg))->cpus_allowed));
758} 766}
@@ -1023,19 +1031,6 @@ int current_cpuset_is_being_rebound(void)
1023 return task_cs(current) == cpuset_being_rebound; 1031 return task_cs(current) == cpuset_being_rebound;
1024} 1032}
1025 1033
1026/*
1027 * Call with cgroup_mutex held.
1028 */
1029
1030static int update_memory_pressure_enabled(struct cpuset *cs, char *buf)
1031{
1032 if (simple_strtoul(buf, NULL, 10) != 0)
1033 cpuset_memory_pressure_enabled = 1;
1034 else
1035 cpuset_memory_pressure_enabled = 0;
1036 return 0;
1037}
1038
1039static int update_relax_domain_level(struct cpuset *cs, char *buf) 1034static int update_relax_domain_level(struct cpuset *cs, char *buf)
1040{ 1035{
1041 int val = simple_strtol(buf, NULL, 10); 1036 int val = simple_strtol(buf, NULL, 10);
@@ -1053,25 +1048,20 @@ static int update_relax_domain_level(struct cpuset *cs, char *buf)
1053 1048
1054/* 1049/*
1055 * update_flag - read a 0 or a 1 in a file and update associated flag 1050 * update_flag - read a 0 or a 1 in a file and update associated flag
1056 * bit: the bit to update (CS_CPU_EXCLUSIVE, CS_MEM_EXCLUSIVE, 1051 * bit: the bit to update (see cpuset_flagbits_t)
1057 * CS_SCHED_LOAD_BALANCE, 1052 * cs: the cpuset to update
1058 * CS_NOTIFY_ON_RELEASE, CS_MEMORY_MIGRATE, 1053 * turning_on: whether the flag is being set or cleared
1059 * CS_SPREAD_PAGE, CS_SPREAD_SLAB)
1060 * cs: the cpuset to update
1061 * buf: the buffer where we read the 0 or 1
1062 * 1054 *
1063 * Call with cgroup_mutex held. 1055 * Call with cgroup_mutex held.
1064 */ 1056 */
1065 1057
1066static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, char *buf) 1058static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
1059 int turning_on)
1067{ 1060{
1068 int turning_on;
1069 struct cpuset trialcs; 1061 struct cpuset trialcs;
1070 int err; 1062 int err;
1071 int cpus_nonempty, balance_flag_changed; 1063 int cpus_nonempty, balance_flag_changed;
1072 1064
1073 turning_on = (simple_strtoul(buf, NULL, 10) != 0);
1074
1075 trialcs = *cs; 1065 trialcs = *cs;
1076 if (turning_on) 1066 if (turning_on)
1077 set_bit(bit, &trialcs.flags); 1067 set_bit(bit, &trialcs.flags);
@@ -1241,6 +1231,7 @@ typedef enum {
1241 FILE_MEMLIST, 1231 FILE_MEMLIST,
1242 FILE_CPU_EXCLUSIVE, 1232 FILE_CPU_EXCLUSIVE,
1243 FILE_MEM_EXCLUSIVE, 1233 FILE_MEM_EXCLUSIVE,
1234 FILE_MEM_HARDWALL,
1244 FILE_SCHED_LOAD_BALANCE, 1235 FILE_SCHED_LOAD_BALANCE,
1245 FILE_SCHED_RELAX_DOMAIN_LEVEL, 1236 FILE_SCHED_RELAX_DOMAIN_LEVEL,
1246 FILE_MEMORY_PRESSURE_ENABLED, 1237 FILE_MEMORY_PRESSURE_ENABLED,
@@ -1265,7 +1256,8 @@ static ssize_t cpuset_common_file_write(struct cgroup *cont,
1265 return -E2BIG; 1256 return -E2BIG;
1266 1257
1267 /* +1 for nul-terminator */ 1258 /* +1 for nul-terminator */
1268 if ((buffer = kmalloc(nbytes + 1, GFP_KERNEL)) == 0) 1259 buffer = kmalloc(nbytes + 1, GFP_KERNEL);
1260 if (!buffer)
1269 return -ENOMEM; 1261 return -ENOMEM;
1270 1262
1271 if (copy_from_user(buffer, userbuf, nbytes)) { 1263 if (copy_from_user(buffer, userbuf, nbytes)) {
@@ -1288,46 +1280,71 @@ static ssize_t cpuset_common_file_write(struct cgroup *cont,
1288 case FILE_MEMLIST: 1280 case FILE_MEMLIST:
1289 retval = update_nodemask(cs, buffer); 1281 retval = update_nodemask(cs, buffer);
1290 break; 1282 break;
1283 case FILE_SCHED_RELAX_DOMAIN_LEVEL:
1284 retval = update_relax_domain_level(cs, buffer);
1285 break;
1286 default:
1287 retval = -EINVAL;
1288 goto out2;
1289 }
1290
1291 if (retval == 0)
1292 retval = nbytes;
1293out2:
1294 cgroup_unlock();
1295out1:
1296 kfree(buffer);
1297 return retval;
1298}
1299
1300static int cpuset_write_u64(struct cgroup *cgrp, struct cftype *cft, u64 val)
1301{
1302 int retval = 0;
1303 struct cpuset *cs = cgroup_cs(cgrp);
1304 cpuset_filetype_t type = cft->private;
1305
1306 cgroup_lock();
1307
1308 if (cgroup_is_removed(cgrp)) {
1309 cgroup_unlock();
1310 return -ENODEV;
1311 }
1312
1313 switch (type) {
1291 case FILE_CPU_EXCLUSIVE: 1314 case FILE_CPU_EXCLUSIVE:
1292 retval = update_flag(CS_CPU_EXCLUSIVE, cs, buffer); 1315 retval = update_flag(CS_CPU_EXCLUSIVE, cs, val);
1293 break; 1316 break;
1294 case FILE_MEM_EXCLUSIVE: 1317 case FILE_MEM_EXCLUSIVE:
1295 retval = update_flag(CS_MEM_EXCLUSIVE, cs, buffer); 1318 retval = update_flag(CS_MEM_EXCLUSIVE, cs, val);
1296 break; 1319 break;
1297 case FILE_SCHED_LOAD_BALANCE: 1320 case FILE_MEM_HARDWALL:
1298 retval = update_flag(CS_SCHED_LOAD_BALANCE, cs, buffer); 1321 retval = update_flag(CS_MEM_HARDWALL, cs, val);
1299 break; 1322 break;
1300 case FILE_SCHED_RELAX_DOMAIN_LEVEL: 1323 case FILE_SCHED_LOAD_BALANCE:
1301 retval = update_relax_domain_level(cs, buffer); 1324 retval = update_flag(CS_SCHED_LOAD_BALANCE, cs, val);
1302 break; 1325 break;
1303 case FILE_MEMORY_MIGRATE: 1326 case FILE_MEMORY_MIGRATE:
1304 retval = update_flag(CS_MEMORY_MIGRATE, cs, buffer); 1327 retval = update_flag(CS_MEMORY_MIGRATE, cs, val);
1305 break; 1328 break;
1306 case FILE_MEMORY_PRESSURE_ENABLED: 1329 case FILE_MEMORY_PRESSURE_ENABLED:
1307 retval = update_memory_pressure_enabled(cs, buffer); 1330 cpuset_memory_pressure_enabled = !!val;
1308 break; 1331 break;
1309 case FILE_MEMORY_PRESSURE: 1332 case FILE_MEMORY_PRESSURE:
1310 retval = -EACCES; 1333 retval = -EACCES;
1311 break; 1334 break;
1312 case FILE_SPREAD_PAGE: 1335 case FILE_SPREAD_PAGE:
1313 retval = update_flag(CS_SPREAD_PAGE, cs, buffer); 1336 retval = update_flag(CS_SPREAD_PAGE, cs, val);
1314 cs->mems_generation = cpuset_mems_generation++; 1337 cs->mems_generation = cpuset_mems_generation++;
1315 break; 1338 break;
1316 case FILE_SPREAD_SLAB: 1339 case FILE_SPREAD_SLAB:
1317 retval = update_flag(CS_SPREAD_SLAB, cs, buffer); 1340 retval = update_flag(CS_SPREAD_SLAB, cs, val);
1318 cs->mems_generation = cpuset_mems_generation++; 1341 cs->mems_generation = cpuset_mems_generation++;
1319 break; 1342 break;
1320 default: 1343 default:
1321 retval = -EINVAL; 1344 retval = -EINVAL;
1322 goto out2; 1345 break;
1323 } 1346 }
1324
1325 if (retval == 0)
1326 retval = nbytes;
1327out2:
1328 cgroup_unlock(); 1347 cgroup_unlock();
1329out1:
1330 kfree(buffer);
1331 return retval; 1348 return retval;
1332} 1349}
1333 1350
@@ -1389,33 +1406,9 @@ static ssize_t cpuset_common_file_read(struct cgroup *cont,
1389 case FILE_MEMLIST: 1406 case FILE_MEMLIST:
1390 s += cpuset_sprintf_memlist(s, cs); 1407 s += cpuset_sprintf_memlist(s, cs);
1391 break; 1408 break;
1392 case FILE_CPU_EXCLUSIVE:
1393 *s++ = is_cpu_exclusive(cs) ? '1' : '0';
1394 break;
1395 case FILE_MEM_EXCLUSIVE:
1396 *s++ = is_mem_exclusive(cs) ? '1' : '0';
1397 break;
1398 case FILE_SCHED_LOAD_BALANCE:
1399 *s++ = is_sched_load_balance(cs) ? '1' : '0';
1400 break;
1401 case FILE_SCHED_RELAX_DOMAIN_LEVEL: 1409 case FILE_SCHED_RELAX_DOMAIN_LEVEL:
1402 s += sprintf(s, "%d", cs->relax_domain_level); 1410 s += sprintf(s, "%d", cs->relax_domain_level);
1403 break; 1411 break;
1404 case FILE_MEMORY_MIGRATE:
1405 *s++ = is_memory_migrate(cs) ? '1' : '0';
1406 break;
1407 case FILE_MEMORY_PRESSURE_ENABLED:
1408 *s++ = cpuset_memory_pressure_enabled ? '1' : '0';
1409 break;
1410 case FILE_MEMORY_PRESSURE:
1411 s += sprintf(s, "%d", fmeter_getrate(&cs->fmeter));
1412 break;
1413 case FILE_SPREAD_PAGE:
1414 *s++ = is_spread_page(cs) ? '1' : '0';
1415 break;
1416 case FILE_SPREAD_SLAB:
1417 *s++ = is_spread_slab(cs) ? '1' : '0';
1418 break;
1419 default: 1412 default:
1420 retval = -EINVAL; 1413 retval = -EINVAL;
1421 goto out; 1414 goto out;
@@ -1428,121 +1421,137 @@ out:
1428 return retval; 1421 return retval;
1429} 1422}
1430 1423
1431 1424static u64 cpuset_read_u64(struct cgroup *cont, struct cftype *cft)
1432 1425{
1426 struct cpuset *cs = cgroup_cs(cont);
1427 cpuset_filetype_t type = cft->private;
1428 switch (type) {
1429 case FILE_CPU_EXCLUSIVE:
1430 return is_cpu_exclusive(cs);
1431 case FILE_MEM_EXCLUSIVE:
1432 return is_mem_exclusive(cs);
1433 case FILE_MEM_HARDWALL:
1434 return is_mem_hardwall(cs);
1435 case FILE_SCHED_LOAD_BALANCE:
1436 return is_sched_load_balance(cs);
1437 case FILE_MEMORY_MIGRATE:
1438 return is_memory_migrate(cs);
1439 case FILE_MEMORY_PRESSURE_ENABLED:
1440 return cpuset_memory_pressure_enabled;
1441 case FILE_MEMORY_PRESSURE:
1442 return fmeter_getrate(&cs->fmeter);
1443 case FILE_SPREAD_PAGE:
1444 return is_spread_page(cs);
1445 case FILE_SPREAD_SLAB:
1446 return is_spread_slab(cs);
1447 default:
1448 BUG();
1449 }
1450}
1433 1451
1434 1452
1435/* 1453/*
1436 * for the common functions, 'private' gives the type of file 1454 * for the common functions, 'private' gives the type of file
1437 */ 1455 */
1438 1456
1439static struct cftype cft_cpus = { 1457static struct cftype files[] = {
1440 .name = "cpus", 1458 {
1441 .read = cpuset_common_file_read, 1459 .name = "cpus",
1442 .write = cpuset_common_file_write, 1460 .read = cpuset_common_file_read,
1443 .private = FILE_CPULIST, 1461 .write = cpuset_common_file_write,
1444}; 1462 .private = FILE_CPULIST,
1445 1463 },
1446static struct cftype cft_mems = { 1464
1447 .name = "mems", 1465 {
1448 .read = cpuset_common_file_read, 1466 .name = "mems",
1449 .write = cpuset_common_file_write, 1467 .read = cpuset_common_file_read,
1450 .private = FILE_MEMLIST, 1468 .write = cpuset_common_file_write,
1451}; 1469 .private = FILE_MEMLIST,
1452 1470 },
1453static struct cftype cft_cpu_exclusive = { 1471
1454 .name = "cpu_exclusive", 1472 {
1455 .read = cpuset_common_file_read, 1473 .name = "cpu_exclusive",
1456 .write = cpuset_common_file_write, 1474 .read_u64 = cpuset_read_u64,
1457 .private = FILE_CPU_EXCLUSIVE, 1475 .write_u64 = cpuset_write_u64,
1458}; 1476 .private = FILE_CPU_EXCLUSIVE,
1459 1477 },
1460static struct cftype cft_mem_exclusive = { 1478
1461 .name = "mem_exclusive", 1479 {
1462 .read = cpuset_common_file_read, 1480 .name = "mem_exclusive",
1463 .write = cpuset_common_file_write, 1481 .read_u64 = cpuset_read_u64,
1464 .private = FILE_MEM_EXCLUSIVE, 1482 .write_u64 = cpuset_write_u64,
1465}; 1483 .private = FILE_MEM_EXCLUSIVE,
1466 1484 },
1467static struct cftype cft_sched_load_balance = { 1485
1468 .name = "sched_load_balance", 1486 {
1469 .read = cpuset_common_file_read, 1487 .name = "mem_hardwall",
1470 .write = cpuset_common_file_write, 1488 .read_u64 = cpuset_read_u64,
1471 .private = FILE_SCHED_LOAD_BALANCE, 1489 .write_u64 = cpuset_write_u64,
1472}; 1490 .private = FILE_MEM_HARDWALL,
1473 1491 },
1474static struct cftype cft_sched_relax_domain_level = { 1492
1475 .name = "sched_relax_domain_level", 1493 {
1476 .read = cpuset_common_file_read, 1494 .name = "sched_load_balance",
1477 .write = cpuset_common_file_write, 1495 .read_u64 = cpuset_read_u64,
1478 .private = FILE_SCHED_RELAX_DOMAIN_LEVEL, 1496 .write_u64 = cpuset_write_u64,
1479}; 1497 .private = FILE_SCHED_LOAD_BALANCE,
1480 1498 },
1481static struct cftype cft_memory_migrate = { 1499
1482 .name = "memory_migrate", 1500 {
1483 .read = cpuset_common_file_read, 1501 .name = "sched_relax_domain_level",
1484 .write = cpuset_common_file_write, 1502 .read_u64 = cpuset_read_u64,
1485 .private = FILE_MEMORY_MIGRATE, 1503 .write_u64 = cpuset_write_u64,
1504 .private = FILE_SCHED_RELAX_DOMAIN_LEVEL,
1505 },
1506
1507 {
1508 .name = "memory_migrate",
1509 .read_u64 = cpuset_read_u64,
1510 .write_u64 = cpuset_write_u64,
1511 .private = FILE_MEMORY_MIGRATE,
1512 },
1513
1514 {
1515 .name = "memory_pressure",
1516 .read_u64 = cpuset_read_u64,
1517 .write_u64 = cpuset_write_u64,
1518 .private = FILE_MEMORY_PRESSURE,
1519 },
1520
1521 {
1522 .name = "memory_spread_page",
1523 .read_u64 = cpuset_read_u64,
1524 .write_u64 = cpuset_write_u64,
1525 .private = FILE_SPREAD_PAGE,
1526 },
1527
1528 {
1529 .name = "memory_spread_slab",
1530 .read_u64 = cpuset_read_u64,
1531 .write_u64 = cpuset_write_u64,
1532 .private = FILE_SPREAD_SLAB,
1533 },
1486}; 1534};
1487 1535
1488static struct cftype cft_memory_pressure_enabled = { 1536static struct cftype cft_memory_pressure_enabled = {
1489 .name = "memory_pressure_enabled", 1537 .name = "memory_pressure_enabled",
1490 .read = cpuset_common_file_read, 1538 .read_u64 = cpuset_read_u64,
1491 .write = cpuset_common_file_write, 1539 .write_u64 = cpuset_write_u64,
1492 .private = FILE_MEMORY_PRESSURE_ENABLED, 1540 .private = FILE_MEMORY_PRESSURE_ENABLED,
1493}; 1541};
1494 1542
1495static struct cftype cft_memory_pressure = {
1496 .name = "memory_pressure",
1497 .read = cpuset_common_file_read,
1498 .write = cpuset_common_file_write,
1499 .private = FILE_MEMORY_PRESSURE,
1500};
1501
1502static struct cftype cft_spread_page = {
1503 .name = "memory_spread_page",
1504 .read = cpuset_common_file_read,
1505 .write = cpuset_common_file_write,
1506 .private = FILE_SPREAD_PAGE,
1507};
1508
1509static struct cftype cft_spread_slab = {
1510 .name = "memory_spread_slab",
1511 .read = cpuset_common_file_read,
1512 .write = cpuset_common_file_write,
1513 .private = FILE_SPREAD_SLAB,
1514};
1515
1516static int cpuset_populate(struct cgroup_subsys *ss, struct cgroup *cont) 1543static int cpuset_populate(struct cgroup_subsys *ss, struct cgroup *cont)
1517{ 1544{
1518 int err; 1545 int err;
1519 1546
1520 if ((err = cgroup_add_file(cont, ss, &cft_cpus)) < 0) 1547 err = cgroup_add_files(cont, ss, files, ARRAY_SIZE(files));
1521 return err; 1548 if (err)
1522 if ((err = cgroup_add_file(cont, ss, &cft_mems)) < 0)
1523 return err;
1524 if ((err = cgroup_add_file(cont, ss, &cft_cpu_exclusive)) < 0)
1525 return err;
1526 if ((err = cgroup_add_file(cont, ss, &cft_mem_exclusive)) < 0)
1527 return err;
1528 if ((err = cgroup_add_file(cont, ss, &cft_memory_migrate)) < 0)
1529 return err;
1530 if ((err = cgroup_add_file(cont, ss, &cft_sched_load_balance)) < 0)
1531 return err;
1532 if ((err = cgroup_add_file(cont, ss,
1533 &cft_sched_relax_domain_level)) < 0)
1534 return err;
1535 if ((err = cgroup_add_file(cont, ss, &cft_memory_pressure)) < 0)
1536 return err;
1537 if ((err = cgroup_add_file(cont, ss, &cft_spread_page)) < 0)
1538 return err;
1539 if ((err = cgroup_add_file(cont, ss, &cft_spread_slab)) < 0)
1540 return err; 1549 return err;
1541 /* memory_pressure_enabled is in root cpuset only */ 1550 /* memory_pressure_enabled is in root cpuset only */
1542 if (err == 0 && !cont->parent) 1551 if (!cont->parent)
1543 err = cgroup_add_file(cont, ss, 1552 err = cgroup_add_file(cont, ss,
1544 &cft_memory_pressure_enabled); 1553 &cft_memory_pressure_enabled);
1545 return 0; 1554 return err;
1546} 1555}
1547 1556
1548/* 1557/*
@@ -1642,7 +1651,7 @@ static void cpuset_destroy(struct cgroup_subsys *ss, struct cgroup *cont)
1642 cpuset_update_task_memory_state(); 1651 cpuset_update_task_memory_state();
1643 1652
1644 if (is_sched_load_balance(cs)) 1653 if (is_sched_load_balance(cs))
1645 update_flag(CS_SCHED_LOAD_BALANCE, cs, "0"); 1654 update_flag(CS_SCHED_LOAD_BALANCE, cs, 0);
1646 1655
1647 number_of_cpusets--; 1656 number_of_cpusets--;
1648 kfree(cs); 1657 kfree(cs);
@@ -1707,7 +1716,8 @@ int __init cpuset_init(void)
1707 * Called by cgroup_scan_tasks() for each task in a cgroup. 1716 * Called by cgroup_scan_tasks() for each task in a cgroup.
1708 * Return nonzero to stop the walk through the tasks. 1717 * Return nonzero to stop the walk through the tasks.
1709 */ 1718 */
1710void cpuset_do_move_task(struct task_struct *tsk, struct cgroup_scanner *scan) 1719static void cpuset_do_move_task(struct task_struct *tsk,
1720 struct cgroup_scanner *scan)
1711{ 1721{
1712 struct cpuset_hotplug_scanner *chsp; 1722 struct cpuset_hotplug_scanner *chsp;
1713 1723
@@ -1969,14 +1979,14 @@ int cpuset_nodemask_valid_mems_allowed(nodemask_t *nodemask)
1969} 1979}
1970 1980
1971/* 1981/*
1972 * nearest_exclusive_ancestor() - Returns the nearest mem_exclusive 1982 * nearest_hardwall_ancestor() - Returns the nearest mem_exclusive or
1973 * ancestor to the specified cpuset. Call holding callback_mutex. 1983 * mem_hardwall ancestor to the specified cpuset. Call holding
1974 * If no ancestor is mem_exclusive (an unusual configuration), then 1984 * callback_mutex. If no ancestor is mem_exclusive or mem_hardwall
1975 * returns the root cpuset. 1985 * (an unusual configuration), then returns the root cpuset.
1976 */ 1986 */
1977static const struct cpuset *nearest_exclusive_ancestor(const struct cpuset *cs) 1987static const struct cpuset *nearest_hardwall_ancestor(const struct cpuset *cs)
1978{ 1988{
1979 while (!is_mem_exclusive(cs) && cs->parent) 1989 while (!(is_mem_exclusive(cs) || is_mem_hardwall(cs)) && cs->parent)
1980 cs = cs->parent; 1990 cs = cs->parent;
1981 return cs; 1991 return cs;
1982} 1992}
@@ -1990,7 +2000,7 @@ static const struct cpuset *nearest_exclusive_ancestor(const struct cpuset *cs)
1990 * __GFP_THISNODE is set, yes, we can always allocate. If zone 2000 * __GFP_THISNODE is set, yes, we can always allocate. If zone
1991 * z's node is in our tasks mems_allowed, yes. If it's not a 2001 * z's node is in our tasks mems_allowed, yes. If it's not a
1992 * __GFP_HARDWALL request and this zone's nodes is in the nearest 2002 * __GFP_HARDWALL request and this zone's nodes is in the nearest
1993 * mem_exclusive cpuset ancestor to this tasks cpuset, yes. 2003 * hardwalled cpuset ancestor to this tasks cpuset, yes.
1994 * If the task has been OOM killed and has access to memory reserves 2004 * If the task has been OOM killed and has access to memory reserves
1995 * as specified by the TIF_MEMDIE flag, yes. 2005 * as specified by the TIF_MEMDIE flag, yes.
1996 * Otherwise, no. 2006 * Otherwise, no.
@@ -2013,7 +2023,7 @@ static const struct cpuset *nearest_exclusive_ancestor(const struct cpuset *cs)
2013 * and do not allow allocations outside the current tasks cpuset 2023 * and do not allow allocations outside the current tasks cpuset
2014 * unless the task has been OOM killed as is marked TIF_MEMDIE. 2024 * unless the task has been OOM killed as is marked TIF_MEMDIE.
2015 * GFP_KERNEL allocations are not so marked, so can escape to the 2025 * GFP_KERNEL allocations are not so marked, so can escape to the
2016 * nearest enclosing mem_exclusive ancestor cpuset. 2026 * nearest enclosing hardwalled ancestor cpuset.
2017 * 2027 *
2018 * Scanning up parent cpusets requires callback_mutex. The 2028 * Scanning up parent cpusets requires callback_mutex. The
2019 * __alloc_pages() routine only calls here with __GFP_HARDWALL bit 2029 * __alloc_pages() routine only calls here with __GFP_HARDWALL bit
@@ -2036,7 +2046,7 @@ static const struct cpuset *nearest_exclusive_ancestor(const struct cpuset *cs)
2036 * in_interrupt - any node ok (current task context irrelevant) 2046 * in_interrupt - any node ok (current task context irrelevant)
2037 * GFP_ATOMIC - any node ok 2047 * GFP_ATOMIC - any node ok
2038 * TIF_MEMDIE - any node ok 2048 * TIF_MEMDIE - any node ok
2039 * GFP_KERNEL - any node in enclosing mem_exclusive cpuset ok 2049 * GFP_KERNEL - any node in enclosing hardwalled cpuset ok
2040 * GFP_USER - only nodes in current tasks mems allowed ok. 2050 * GFP_USER - only nodes in current tasks mems allowed ok.
2041 * 2051 *
2042 * Rule: 2052 * Rule:
@@ -2073,7 +2083,7 @@ int __cpuset_zone_allowed_softwall(struct zone *z, gfp_t gfp_mask)
2073 mutex_lock(&callback_mutex); 2083 mutex_lock(&callback_mutex);
2074 2084
2075 task_lock(current); 2085 task_lock(current);
2076 cs = nearest_exclusive_ancestor(task_cs(current)); 2086 cs = nearest_hardwall_ancestor(task_cs(current));
2077 task_unlock(current); 2087 task_unlock(current);
2078 2088
2079 allowed = node_isset(node, cs->mems_allowed); 2089 allowed = node_isset(node, cs->mems_allowed);
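
Besides the new mem_hardwall flag and the nearest_hardwall_ancestor() rename, the cpuset conversion above collapses ten per-file cftype singletons into one files[] array registered with a single cgroup_add_files() call in cpuset_populate(). A sketch of a hypothetical subsystem using the same array-based registration; the baz_* names are illustrative, not from this patch.

/* Hypothetical control file plus array-based registration. */
static u64 baz_enabled_read(struct cgroup *cgrp, struct cftype *cft)
{
	return 0;	/* placeholder: report the stored flag */
}

static int baz_enabled_write(struct cgroup *cgrp, struct cftype *cft, u64 val)
{
	return 0;	/* placeholder: accept and store !!val */
}

static struct cftype baz_files[] = {
	{
		.name = "enabled",
		.read_u64 = baz_enabled_read,
		.write_u64 = baz_enabled_write,
	},
};

static int baz_populate(struct cgroup_subsys *ss, struct cgroup *cont)
{
	return cgroup_add_files(cont, ss, baz_files, ARRAY_SIZE(baz_files));
}
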
diff --git a/kernel/dma.c b/kernel/dma.c
index 6a82bb716dac..d2c60a822790 100644
--- a/kernel/dma.c
+++ b/kernel/dma.c
@@ -149,12 +149,7 @@ static const struct file_operations proc_dma_operations = {
149 149
150static int __init proc_dma_init(void) 150static int __init proc_dma_init(void)
151{ 151{
152 struct proc_dir_entry *e; 152 proc_create("dma", 0, NULL, &proc_dma_operations);
153
154 e = create_proc_entry("dma", 0, NULL);
155 if (e)
156 e->proc_fops = &proc_dma_operations;
157
158 return 0; 153 return 0;
159} 154}
160 155
diff --git a/kernel/exit.c b/kernel/exit.c
index 2a9d98c641ac..1510f78a0ffa 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -19,6 +19,7 @@
19#include <linux/acct.h> 19#include <linux/acct.h>
20#include <linux/tsacct_kern.h> 20#include <linux/tsacct_kern.h>
21#include <linux/file.h> 21#include <linux/file.h>
22#include <linux/fdtable.h>
22#include <linux/binfmts.h> 23#include <linux/binfmts.h>
23#include <linux/nsproxy.h> 24#include <linux/nsproxy.h>
24#include <linux/pid_namespace.h> 25#include <linux/pid_namespace.h>
@@ -52,6 +53,11 @@
52 53
53static void exit_mm(struct task_struct * tsk); 54static void exit_mm(struct task_struct * tsk);
54 55
56static inline int task_detached(struct task_struct *p)
57{
58 return p->exit_signal == -1;
59}
60
55static void __unhash_process(struct task_struct *p) 61static void __unhash_process(struct task_struct *p)
56{ 62{
57 nr_threads--; 63 nr_threads--;
@@ -160,7 +166,7 @@ repeat:
160 zap_leader = 0; 166 zap_leader = 0;
161 leader = p->group_leader; 167 leader = p->group_leader;
162 if (leader != p && thread_group_empty(leader) && leader->exit_state == EXIT_ZOMBIE) { 168 if (leader != p && thread_group_empty(leader) && leader->exit_state == EXIT_ZOMBIE) {
163 BUG_ON(leader->exit_signal == -1); 169 BUG_ON(task_detached(leader));
164 do_notify_parent(leader, leader->exit_signal); 170 do_notify_parent(leader, leader->exit_signal);
165 /* 171 /*
166 * If we were the last child thread and the leader has 172 * If we were the last child thread and the leader has
@@ -170,7 +176,7 @@ repeat:
170 * do_notify_parent() will have marked it self-reaping in 176 * do_notify_parent() will have marked it self-reaping in
171 * that case. 177 * that case.
172 */ 178 */
173 zap_leader = (leader->exit_signal == -1); 179 zap_leader = task_detached(leader);
174 } 180 }
175 181
176 write_unlock_irq(&tasklist_lock); 182 write_unlock_irq(&tasklist_lock);
@@ -329,13 +335,11 @@ void __set_special_pids(struct pid *pid)
329 pid_t nr = pid_nr(pid); 335 pid_t nr = pid_nr(pid);
330 336
331 if (task_session(curr) != pid) { 337 if (task_session(curr) != pid) {
332 detach_pid(curr, PIDTYPE_SID); 338 change_pid(curr, PIDTYPE_SID, pid);
333 attach_pid(curr, PIDTYPE_SID, pid);
334 set_task_session(curr, nr); 339 set_task_session(curr, nr);
335 } 340 }
336 if (task_pgrp(curr) != pid) { 341 if (task_pgrp(curr) != pid) {
337 detach_pid(curr, PIDTYPE_PGID); 342 change_pid(curr, PIDTYPE_PGID, pid);
338 attach_pid(curr, PIDTYPE_PGID, pid);
339 set_task_pgrp(curr, nr); 343 set_task_pgrp(curr, nr);
340 } 344 }
341} 345}
@@ -557,6 +561,88 @@ void exit_fs(struct task_struct *tsk)
557 561
558EXPORT_SYMBOL_GPL(exit_fs); 562EXPORT_SYMBOL_GPL(exit_fs);
559 563
564#ifdef CONFIG_MM_OWNER
565/*
566 * Task p is exiting and it owned mm, lets find a new owner for it
567 */
568static inline int
569mm_need_new_owner(struct mm_struct *mm, struct task_struct *p)
570{
571 /*
572 * If there are other users of the mm and the owner (us) is exiting
573 * we need to find a new owner to take on the responsibility.
574 */
575 if (!mm)
576 return 0;
577 if (atomic_read(&mm->mm_users) <= 1)
578 return 0;
579 if (mm->owner != p)
580 return 0;
581 return 1;
582}
583
584void mm_update_next_owner(struct mm_struct *mm)
585{
586 struct task_struct *c, *g, *p = current;
587
588retry:
589 if (!mm_need_new_owner(mm, p))
590 return;
591
592 read_lock(&tasklist_lock);
593 /*
594 * Search in the children
595 */
596 list_for_each_entry(c, &p->children, sibling) {
597 if (c->mm == mm)
598 goto assign_new_owner;
599 }
600
601 /*
602 * Search in the siblings
603 */
604 list_for_each_entry(c, &p->parent->children, sibling) {
605 if (c->mm == mm)
606 goto assign_new_owner;
607 }
608
609 /*
610 * Search through everything else. We should not get
611 * here often
612 */
613 do_each_thread(g, c) {
614 if (c->mm == mm)
615 goto assign_new_owner;
616 } while_each_thread(g, c);
617
618 read_unlock(&tasklist_lock);
619 return;
620
621assign_new_owner:
622 BUG_ON(c == p);
623 get_task_struct(c);
624 /*
625 * The task_lock protects c->mm from changing.
626 * We always want mm->owner->mm == mm
627 */
628 task_lock(c);
629 /*
630 * Delay read_unlock() till we have the task_lock()
631 * to ensure that c does not slip away underneath us
632 */
633 read_unlock(&tasklist_lock);
634 if (c->mm != mm) {
635 task_unlock(c);
636 put_task_struct(c);
637 goto retry;
638 }
639 cgroup_mm_owner_callbacks(mm->owner, c);
640 mm->owner = c;
641 task_unlock(c);
642 put_task_struct(c);
643}
644#endif /* CONFIG_MM_OWNER */
645
560/* 646/*
561 * Turn us into a lazy TLB process if we 647 * Turn us into a lazy TLB process if we
562 * aren't already.. 648 * aren't already..
@@ -596,6 +682,7 @@ static void exit_mm(struct task_struct * tsk)
596 /* We don't want this task to be frozen prematurely */ 682 /* We don't want this task to be frozen prematurely */
597 clear_freeze_flag(tsk); 683 clear_freeze_flag(tsk);
598 task_unlock(tsk); 684 task_unlock(tsk);
685 mm_update_next_owner(mm);
599 mmput(mm); 686 mmput(mm);
600} 687}
601 688
@@ -610,7 +697,7 @@ reparent_thread(struct task_struct *p, struct task_struct *father, int traced)
610 if (unlikely(traced)) { 697 if (unlikely(traced)) {
611 /* Preserve ptrace links if someone else is tracing this child. */ 698 /* Preserve ptrace links if someone else is tracing this child. */
612 list_del_init(&p->ptrace_list); 699 list_del_init(&p->ptrace_list);
613 if (p->parent != p->real_parent) 700 if (ptrace_reparented(p))
614 list_add(&p->ptrace_list, &p->real_parent->ptrace_children); 701 list_add(&p->ptrace_list, &p->real_parent->ptrace_children);
615 } else { 702 } else {
616 /* If this child is being traced, then we're the one tracing it 703 /* If this child is being traced, then we're the one tracing it
@@ -634,18 +721,18 @@ reparent_thread(struct task_struct *p, struct task_struct *father, int traced)
634 /* If this is a threaded reparent there is no need to 721 /* If this is a threaded reparent there is no need to
635 * notify anyone anything has happened. 722 * notify anyone anything has happened.
636 */ 723 */
637 if (p->real_parent->group_leader == father->group_leader) 724 if (same_thread_group(p->real_parent, father))
638 return; 725 return;
639 726
640 /* We don't want people slaying init. */ 727 /* We don't want people slaying init. */
641 if (p->exit_signal != -1) 728 if (!task_detached(p))
642 p->exit_signal = SIGCHLD; 729 p->exit_signal = SIGCHLD;
643 730
644 /* If we'd notified the old parent about this child's death, 731 /* If we'd notified the old parent about this child's death,
645 * also notify the new parent. 732 * also notify the new parent.
646 */ 733 */
647 if (!traced && p->exit_state == EXIT_ZOMBIE && 734 if (!traced && p->exit_state == EXIT_ZOMBIE &&
648 p->exit_signal != -1 && thread_group_empty(p)) 735 !task_detached(p) && thread_group_empty(p))
649 do_notify_parent(p, p->exit_signal); 736 do_notify_parent(p, p->exit_signal);
650 737
651 kill_orphaned_pgrp(p, father); 738 kill_orphaned_pgrp(p, father);
@@ -698,18 +785,18 @@ static void forget_original_parent(struct task_struct *father)
698 } else { 785 } else {
699 /* reparent ptraced task to its real parent */ 786 /* reparent ptraced task to its real parent */
700 __ptrace_unlink (p); 787 __ptrace_unlink (p);
701 if (p->exit_state == EXIT_ZOMBIE && p->exit_signal != -1 && 788 if (p->exit_state == EXIT_ZOMBIE && !task_detached(p) &&
702 thread_group_empty(p)) 789 thread_group_empty(p))
703 do_notify_parent(p, p->exit_signal); 790 do_notify_parent(p, p->exit_signal);
704 } 791 }
705 792
706 /* 793 /*
707 * if the ptraced child is a zombie with exit_signal == -1 794 * if the ptraced child is a detached zombie we must collect
708 * we must collect it before we exit, or it will remain 795 * it before we exit, or it will remain zombie forever since
709 * zombie forever since we prevented it from self-reap itself 796 * we prevented it from self-reap itself while it was being
710 * while it was being traced by us, to be able to see it in wait4. 797 * traced by us, to be able to see it in wait4.
711 */ 798 */
712 if (unlikely(ptrace && p->exit_state == EXIT_ZOMBIE && p->exit_signal == -1)) 799 if (unlikely(ptrace && p->exit_state == EXIT_ZOMBIE && task_detached(p)))
713 list_add(&p->ptrace_list, &ptrace_dead); 800 list_add(&p->ptrace_list, &ptrace_dead);
714 } 801 }
715 802
@@ -766,29 +853,30 @@ static void exit_notify(struct task_struct *tsk, int group_dead)
766 * we have changed execution domain as these two values started 853 * we have changed execution domain as these two values started
767 * the same after a fork. 854 * the same after a fork.
768 */ 855 */
769 if (tsk->exit_signal != SIGCHLD && tsk->exit_signal != -1 && 856 if (tsk->exit_signal != SIGCHLD && !task_detached(tsk) &&
770 (tsk->parent_exec_id != tsk->real_parent->self_exec_id || 857 (tsk->parent_exec_id != tsk->real_parent->self_exec_id ||
771 tsk->self_exec_id != tsk->parent_exec_id) 858 tsk->self_exec_id != tsk->parent_exec_id) &&
772 && !capable(CAP_KILL)) 859 !capable(CAP_KILL))
773 tsk->exit_signal = SIGCHLD; 860 tsk->exit_signal = SIGCHLD;
774 861
775
776 /* If something other than our normal parent is ptracing us, then 862 /* If something other than our normal parent is ptracing us, then
777 * send it a SIGCHLD instead of honoring exit_signal. exit_signal 863 * send it a SIGCHLD instead of honoring exit_signal. exit_signal
778 * only has special meaning to our real parent. 864 * only has special meaning to our real parent.
779 */ 865 */
780 if (tsk->exit_signal != -1 && thread_group_empty(tsk)) { 866 if (!task_detached(tsk) && thread_group_empty(tsk)) {
781 int signal = tsk->parent == tsk->real_parent ? tsk->exit_signal : SIGCHLD; 867 int signal = ptrace_reparented(tsk) ?
868 SIGCHLD : tsk->exit_signal;
782 do_notify_parent(tsk, signal); 869 do_notify_parent(tsk, signal);
783 } else if (tsk->ptrace) { 870 } else if (tsk->ptrace) {
784 do_notify_parent(tsk, SIGCHLD); 871 do_notify_parent(tsk, SIGCHLD);
785 } 872 }
786 873
787 state = EXIT_ZOMBIE; 874 state = EXIT_ZOMBIE;
788 if (tsk->exit_signal == -1 && likely(!tsk->ptrace)) 875 if (task_detached(tsk) && likely(!tsk->ptrace))
789 state = EXIT_DEAD; 876 state = EXIT_DEAD;
790 tsk->exit_state = state; 877 tsk->exit_state = state;
791 878
879 /* mt-exec, de_thread() is waiting for us */
792 if (thread_group_leader(tsk) && 880 if (thread_group_leader(tsk) &&
793 tsk->signal->notify_count < 0 && 881 tsk->signal->notify_count < 0 &&
794 tsk->signal->group_exit_task) 882 tsk->signal->group_exit_task)
@@ -1032,12 +1120,13 @@ asmlinkage long sys_exit(int error_code)
1032NORET_TYPE void 1120NORET_TYPE void
1033do_group_exit(int exit_code) 1121do_group_exit(int exit_code)
1034{ 1122{
1123 struct signal_struct *sig = current->signal;
1124
1035 BUG_ON(exit_code & 0x80); /* core dumps don't get here */ 1125 BUG_ON(exit_code & 0x80); /* core dumps don't get here */
1036 1126
1037 if (current->signal->flags & SIGNAL_GROUP_EXIT) 1127 if (signal_group_exit(sig))
1038 exit_code = current->signal->group_exit_code; 1128 exit_code = sig->group_exit_code;
1039 else if (!thread_group_empty(current)) { 1129 else if (!thread_group_empty(current)) {
1040 struct signal_struct *const sig = current->signal;
1041 struct sighand_struct *const sighand = current->sighand; 1130 struct sighand_struct *const sighand = current->sighand;
1042 spin_lock_irq(&sighand->siglock); 1131 spin_lock_irq(&sighand->siglock);
1043 if (signal_group_exit(sig)) 1132 if (signal_group_exit(sig))
@@ -1089,7 +1178,7 @@ static int eligible_child(enum pid_type type, struct pid *pid, int options,
1089 * Do not consider detached threads that are 1178 * Do not consider detached threads that are
1090 * not ptraced: 1179 * not ptraced:
1091 */ 1180 */
1092 if (p->exit_signal == -1 && !p->ptrace) 1181 if (task_detached(p) && !p->ptrace)
1093 return 0; 1182 return 0;
1094 1183
1095 /* Wait for all children (clone and not) if __WALL is set; 1184 /* Wait for all children (clone and not) if __WALL is set;
@@ -1179,8 +1268,7 @@ static int wait_task_zombie(struct task_struct *p, int noreap,
1179 return 0; 1268 return 0;
1180 } 1269 }
1181 1270
1182 /* traced means p->ptrace, but not vice versa */ 1271 traced = ptrace_reparented(p);
1183 traced = (p->real_parent != p->parent);
1184 1272
1185 if (likely(!traced)) { 1273 if (likely(!traced)) {
1186 struct signal_struct *psig; 1274 struct signal_struct *psig;
@@ -1281,9 +1369,9 @@ static int wait_task_zombie(struct task_struct *p, int noreap,
1281 * If it's still not detached after that, don't release 1369 * If it's still not detached after that, don't release
1282 * it now. 1370 * it now.
1283 */ 1371 */
1284 if (p->exit_signal != -1) { 1372 if (!task_detached(p)) {
1285 do_notify_parent(p, p->exit_signal); 1373 do_notify_parent(p, p->exit_signal);
1286 if (p->exit_signal != -1) { 1374 if (!task_detached(p)) {
1287 p->exit_state = EXIT_ZOMBIE; 1375 p->exit_state = EXIT_ZOMBIE;
1288 p = NULL; 1376 p = NULL;
1289 } 1377 }
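The exit.c hunks above swap open-coded tests for small predicate helpers. Judging from the expressions they replace on the old side of the diff, the helpers amount to roughly the following; a minimal sketch of their intent, not the exact header definitions (same_thread_group() and signal_group_exit() are analogous wrappers over the group-leader and SIGNAL_GROUP_EXIT checks they supersede):

    static inline int task_detached(struct task_struct *p)
    {
            /* exit_signal == -1 means "never notify the parent" */
            return p->exit_signal == -1;
    }

    static inline int ptrace_reparented(struct task_struct *p)
    {
            /* traced by someone other than the real parent */
            return p->real_parent != p->parent;
    }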
diff --git a/kernel/fork.c b/kernel/fork.c
index 6067e429f281..933e60ebccae 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -22,6 +22,7 @@
22#include <linux/mempolicy.h> 22#include <linux/mempolicy.h>
23#include <linux/sem.h> 23#include <linux/sem.h>
24#include <linux/file.h> 24#include <linux/file.h>
25#include <linux/fdtable.h>
25#include <linux/key.h> 26#include <linux/key.h>
26#include <linux/binfmts.h> 27#include <linux/binfmts.h>
27#include <linux/mman.h> 28#include <linux/mman.h>
@@ -381,14 +382,13 @@ static struct mm_struct * mm_init(struct mm_struct * mm, struct task_struct *p)
381 mm->ioctx_list = NULL; 382 mm->ioctx_list = NULL;
382 mm->free_area_cache = TASK_UNMAPPED_BASE; 383 mm->free_area_cache = TASK_UNMAPPED_BASE;
383 mm->cached_hole_size = ~0UL; 384 mm->cached_hole_size = ~0UL;
384 mm_init_cgroup(mm, p); 385 mm_init_owner(mm, p);
385 386
386 if (likely(!mm_alloc_pgd(mm))) { 387 if (likely(!mm_alloc_pgd(mm))) {
387 mm->def_flags = 0; 388 mm->def_flags = 0;
388 return mm; 389 return mm;
389 } 390 }
390 391
391 mm_free_cgroup(mm);
392 free_mm(mm); 392 free_mm(mm);
393 return NULL; 393 return NULL;
394} 394}
@@ -432,13 +432,13 @@ void mmput(struct mm_struct *mm)
432 if (atomic_dec_and_test(&mm->mm_users)) { 432 if (atomic_dec_and_test(&mm->mm_users)) {
433 exit_aio(mm); 433 exit_aio(mm);
434 exit_mmap(mm); 434 exit_mmap(mm);
435 set_mm_exe_file(mm, NULL);
435 if (!list_empty(&mm->mmlist)) { 436 if (!list_empty(&mm->mmlist)) {
436 spin_lock(&mmlist_lock); 437 spin_lock(&mmlist_lock);
437 list_del(&mm->mmlist); 438 list_del(&mm->mmlist);
438 spin_unlock(&mmlist_lock); 439 spin_unlock(&mmlist_lock);
439 } 440 }
440 put_swap_token(mm); 441 put_swap_token(mm);
441 mm_free_cgroup(mm);
442 mmdrop(mm); 442 mmdrop(mm);
443 } 443 }
444} 444}
@@ -545,6 +545,8 @@ struct mm_struct *dup_mm(struct task_struct *tsk)
545 if (init_new_context(tsk, mm)) 545 if (init_new_context(tsk, mm))
546 goto fail_nocontext; 546 goto fail_nocontext;
547 547
548 dup_mm_exe_file(oldmm, mm);
549
548 err = dup_mmap(mm, oldmm); 550 err = dup_mmap(mm, oldmm);
549 if (err) 551 if (err)
550 goto free_pt; 552 goto free_pt;
@@ -891,7 +893,7 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
891 sig->group_exit_code = 0; 893 sig->group_exit_code = 0;
892 sig->group_exit_task = NULL; 894 sig->group_exit_task = NULL;
893 sig->group_stop_count = 0; 895 sig->group_stop_count = 0;
894 sig->curr_target = NULL; 896 sig->curr_target = tsk;
895 init_sigpending(&sig->shared_pending); 897 init_sigpending(&sig->shared_pending);
896 INIT_LIST_HEAD(&sig->posix_timers); 898 INIT_LIST_HEAD(&sig->posix_timers);
897 899
@@ -982,6 +984,13 @@ static void rt_mutex_init_task(struct task_struct *p)
982#endif 984#endif
983} 985}
984 986
987#ifdef CONFIG_MM_OWNER
988void mm_init_owner(struct mm_struct *mm, struct task_struct *p)
989{
990 mm->owner = p;
991}
992#endif /* CONFIG_MM_OWNER */
993
985/* 994/*
986 * This creates a new process as a copy of the old one, 995 * This creates a new process as a copy of the old one,
987 * but does not actually start it yet. 996 * but does not actually start it yet.
@@ -1664,18 +1673,6 @@ static int unshare_fd(unsigned long unshare_flags, struct files_struct **new_fdp
1664} 1673}
1665 1674
1666/* 1675/*
1667 * Unsharing of semundo for tasks created with CLONE_SYSVSEM is not
1668 * supported yet
1669 */
1670static int unshare_semundo(unsigned long unshare_flags, struct sem_undo_list **new_ulistp)
1671{
1672 if (unshare_flags & CLONE_SYSVSEM)
1673 return -EINVAL;
1674
1675 return 0;
1676}
1677
1678/*
1679 * unshare allows a process to 'unshare' part of the process 1676 * unshare allows a process to 'unshare' part of the process
1680 * context which was originally shared using clone. copy_* 1677 * context which was originally shared using clone. copy_*
1681 * functions used by do_fork() cannot be used here directly 1678 * functions used by do_fork() cannot be used here directly
@@ -1690,8 +1687,8 @@ asmlinkage long sys_unshare(unsigned long unshare_flags)
1690 struct sighand_struct *new_sigh = NULL; 1687 struct sighand_struct *new_sigh = NULL;
1691 struct mm_struct *mm, *new_mm = NULL, *active_mm = NULL; 1688 struct mm_struct *mm, *new_mm = NULL, *active_mm = NULL;
1692 struct files_struct *fd, *new_fd = NULL; 1689 struct files_struct *fd, *new_fd = NULL;
1693 struct sem_undo_list *new_ulist = NULL;
1694 struct nsproxy *new_nsproxy = NULL; 1690 struct nsproxy *new_nsproxy = NULL;
1691 int do_sysvsem = 0;
1695 1692
1696 check_unshare_flags(&unshare_flags); 1693 check_unshare_flags(&unshare_flags);
1697 1694
@@ -1703,6 +1700,13 @@ asmlinkage long sys_unshare(unsigned long unshare_flags)
1703 CLONE_NEWNET)) 1700 CLONE_NEWNET))
1704 goto bad_unshare_out; 1701 goto bad_unshare_out;
1705 1702
1703 /*
1704 * CLONE_NEWIPC must also detach from the undolist: after switching
1705 * to a new ipc namespace, the semaphore arrays from the old
1706 * namespace are unreachable.
1707 */
1708 if (unshare_flags & (CLONE_NEWIPC|CLONE_SYSVSEM))
1709 do_sysvsem = 1;
1706 if ((err = unshare_thread(unshare_flags))) 1710 if ((err = unshare_thread(unshare_flags)))
1707 goto bad_unshare_out; 1711 goto bad_unshare_out;
1708 if ((err = unshare_fs(unshare_flags, &new_fs))) 1712 if ((err = unshare_fs(unshare_flags, &new_fs)))
@@ -1713,13 +1717,17 @@ asmlinkage long sys_unshare(unsigned long unshare_flags)
1713 goto bad_unshare_cleanup_sigh; 1717 goto bad_unshare_cleanup_sigh;
1714 if ((err = unshare_fd(unshare_flags, &new_fd))) 1718 if ((err = unshare_fd(unshare_flags, &new_fd)))
1715 goto bad_unshare_cleanup_vm; 1719 goto bad_unshare_cleanup_vm;
1716 if ((err = unshare_semundo(unshare_flags, &new_ulist)))
1717 goto bad_unshare_cleanup_fd;
1718 if ((err = unshare_nsproxy_namespaces(unshare_flags, &new_nsproxy, 1720 if ((err = unshare_nsproxy_namespaces(unshare_flags, &new_nsproxy,
1719 new_fs))) 1721 new_fs)))
1720 goto bad_unshare_cleanup_semundo; 1722 goto bad_unshare_cleanup_fd;
1721 1723
1722 if (new_fs || new_mm || new_fd || new_ulist || new_nsproxy) { 1724 if (new_fs || new_mm || new_fd || do_sysvsem || new_nsproxy) {
1725 if (do_sysvsem) {
1726 /*
1727 * CLONE_SYSVSEM is equivalent to sys_exit().
1728 */
1729 exit_sem(current);
1730 }
1723 1731
1724 if (new_nsproxy) { 1732 if (new_nsproxy) {
1725 switch_task_namespaces(current, new_nsproxy); 1733 switch_task_namespaces(current, new_nsproxy);
@@ -1755,7 +1763,6 @@ asmlinkage long sys_unshare(unsigned long unshare_flags)
1755 if (new_nsproxy) 1763 if (new_nsproxy)
1756 put_nsproxy(new_nsproxy); 1764 put_nsproxy(new_nsproxy);
1757 1765
1758bad_unshare_cleanup_semundo:
1759bad_unshare_cleanup_fd: 1766bad_unshare_cleanup_fd:
1760 if (new_fd) 1767 if (new_fd)
1761 put_files_struct(new_fd); 1768 put_files_struct(new_fd);
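With unshare_semundo() gone, sys_unshare() accepts CLONE_SYSVSEM and treats it (and, per the new comment, CLONE_NEWIPC) as a request to drop the caller's SysV semaphore undo list via exit_sem(). A minimal userspace check of the flag; on kernels without this change the same call fails with EINVAL, which is what the removed unshare_semundo() returned:

    /* gcc -o unshare_sysvsem unshare_sysvsem.c */
    #define _GNU_SOURCE
    #include <sched.h>
    #include <stdio.h>
    #include <stdlib.h>

    int main(void)
    {
            /* Detach from the SysV semaphore undo list, as if this process
             * had already exited as far as SEM_UNDO bookkeeping goes. */
            if (unshare(CLONE_SYSVSEM) != 0) {
                    perror("unshare(CLONE_SYSVSEM)");  /* EINVAL before this change */
                    return EXIT_FAILURE;
            }
            puts("SysV semaphore undo list detached");
            return EXIT_SUCCESS;
    }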
diff --git a/kernel/futex.c b/kernel/futex.c
index e43945e995f5..449def8074fe 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -104,10 +104,6 @@ struct futex_q {
104 /* Key which the futex is hashed on: */ 104 /* Key which the futex is hashed on: */
105 union futex_key key; 105 union futex_key key;
106 106
107 /* For fd, sigio sent using these: */
108 int fd;
109 struct file *filp;
110
111 /* Optional priority inheritance state: */ 107 /* Optional priority inheritance state: */
112 struct futex_pi_state *pi_state; 108 struct futex_pi_state *pi_state;
113 struct task_struct *task; 109 struct task_struct *task;
@@ -126,9 +122,6 @@ struct futex_hash_bucket {
126 122
127static struct futex_hash_bucket futex_queues[1<<FUTEX_HASHBITS]; 123static struct futex_hash_bucket futex_queues[1<<FUTEX_HASHBITS];
128 124
129/* Futex-fs vfsmount entry: */
130static struct vfsmount *futex_mnt;
131
132/* 125/*
133 * Take mm->mmap_sem, when futex is shared 126 * Take mm->mmap_sem, when futex is shared
134 */ 127 */
@@ -610,8 +603,6 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb,
610static void wake_futex(struct futex_q *q) 603static void wake_futex(struct futex_q *q)
611{ 604{
612 plist_del(&q->list, &q->list.plist); 605 plist_del(&q->list, &q->list.plist);
613 if (q->filp)
614 send_sigio(&q->filp->f_owner, q->fd, POLL_IN);
615 /* 606 /*
616 * The lock in wake_up_all() is a crucial memory barrier after the 607 * The lock in wake_up_all() is a crucial memory barrier after the
617 * plist_del() and also before assigning to q->lock_ptr. 608 * plist_del() and also before assigning to q->lock_ptr.
@@ -988,14 +979,10 @@ out:
988} 979}
989 980
990/* The key must be already stored in q->key. */ 981/* The key must be already stored in q->key. */
991static inline struct futex_hash_bucket * 982static inline struct futex_hash_bucket *queue_lock(struct futex_q *q)
992queue_lock(struct futex_q *q, int fd, struct file *filp)
993{ 983{
994 struct futex_hash_bucket *hb; 984 struct futex_hash_bucket *hb;
995 985
996 q->fd = fd;
997 q->filp = filp;
998
999 init_waitqueue_head(&q->waiters); 986 init_waitqueue_head(&q->waiters);
1000 987
1001 get_futex_key_refs(&q->key); 988 get_futex_key_refs(&q->key);
@@ -1006,7 +993,7 @@ queue_lock(struct futex_q *q, int fd, struct file *filp)
1006 return hb; 993 return hb;
1007} 994}
1008 995
1009static inline void __queue_me(struct futex_q *q, struct futex_hash_bucket *hb) 996static inline void queue_me(struct futex_q *q, struct futex_hash_bucket *hb)
1010{ 997{
1011 int prio; 998 int prio;
1012 999
@@ -1041,15 +1028,6 @@ queue_unlock(struct futex_q *q, struct futex_hash_bucket *hb)
1041 * exactly once. They are called with the hashed spinlock held. 1028 * exactly once. They are called with the hashed spinlock held.
1042 */ 1029 */
1043 1030
1044/* The key must be already stored in q->key. */
1045static void queue_me(struct futex_q *q, int fd, struct file *filp)
1046{
1047 struct futex_hash_bucket *hb;
1048
1049 hb = queue_lock(q, fd, filp);
1050 __queue_me(q, hb);
1051}
1052
1053/* Return 1 if we were still queued (ie. 0 means we were woken) */ 1031/* Return 1 if we were still queued (ie. 0 means we were woken) */
1054static int unqueue_me(struct futex_q *q) 1032static int unqueue_me(struct futex_q *q)
1055{ 1033{
@@ -1194,7 +1172,7 @@ static int futex_wait(u32 __user *uaddr, struct rw_semaphore *fshared,
1194 if (unlikely(ret != 0)) 1172 if (unlikely(ret != 0))
1195 goto out_release_sem; 1173 goto out_release_sem;
1196 1174
1197 hb = queue_lock(&q, -1, NULL); 1175 hb = queue_lock(&q);
1198 1176
1199 /* 1177 /*
1200 * Access the page AFTER the futex is queued. 1178 * Access the page AFTER the futex is queued.
@@ -1238,7 +1216,7 @@ static int futex_wait(u32 __user *uaddr, struct rw_semaphore *fshared,
1238 goto out_unlock_release_sem; 1216 goto out_unlock_release_sem;
1239 1217
1240 /* Only actually queue if *uaddr contained val. */ 1218 /* Only actually queue if *uaddr contained val. */
1241 __queue_me(&q, hb); 1219 queue_me(&q, hb);
1242 1220
1243 /* 1221 /*
1244 * Now the futex is queued and we have checked the data, we 1222 * Now the futex is queued and we have checked the data, we
@@ -1266,11 +1244,13 @@ static int futex_wait(u32 __user *uaddr, struct rw_semaphore *fshared,
1266 if (!abs_time) 1244 if (!abs_time)
1267 schedule(); 1245 schedule();
1268 else { 1246 else {
1269 hrtimer_init(&t.timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); 1247 hrtimer_init_on_stack(&t.timer, CLOCK_MONOTONIC,
1248 HRTIMER_MODE_ABS);
1270 hrtimer_init_sleeper(&t, current); 1249 hrtimer_init_sleeper(&t, current);
1271 t.timer.expires = *abs_time; 1250 t.timer.expires = *abs_time;
1272 1251
1273 hrtimer_start(&t.timer, t.timer.expires, HRTIMER_MODE_ABS); 1252 hrtimer_start(&t.timer, t.timer.expires,
1253 HRTIMER_MODE_ABS);
1274 if (!hrtimer_active(&t.timer)) 1254 if (!hrtimer_active(&t.timer))
1275 t.task = NULL; 1255 t.task = NULL;
1276 1256
@@ -1286,6 +1266,8 @@ static int futex_wait(u32 __user *uaddr, struct rw_semaphore *fshared,
1286 1266
1287 /* Flag if a timeout occured */ 1267 /* Flag if a timeout occured */
1288 rem = (t.task == NULL); 1268 rem = (t.task == NULL);
1269
1270 destroy_hrtimer_on_stack(&t.timer);
1289 } 1271 }
1290 } 1272 }
1291 __set_current_state(TASK_RUNNING); 1273 __set_current_state(TASK_RUNNING);
@@ -1367,7 +1349,8 @@ static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared,
1367 1349
1368 if (time) { 1350 if (time) {
1369 to = &timeout; 1351 to = &timeout;
1370 hrtimer_init(&to->timer, CLOCK_REALTIME, HRTIMER_MODE_ABS); 1352 hrtimer_init_on_stack(&to->timer, CLOCK_REALTIME,
1353 HRTIMER_MODE_ABS);
1371 hrtimer_init_sleeper(to, current); 1354 hrtimer_init_sleeper(to, current);
1372 to->timer.expires = *time; 1355 to->timer.expires = *time;
1373 } 1356 }
@@ -1381,7 +1364,7 @@ static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared,
1381 goto out_release_sem; 1364 goto out_release_sem;
1382 1365
1383 retry_unlocked: 1366 retry_unlocked:
1384 hb = queue_lock(&q, -1, NULL); 1367 hb = queue_lock(&q);
1385 1368
1386 retry_locked: 1369 retry_locked:
1387 ret = lock_taken = 0; 1370 ret = lock_taken = 0;
@@ -1494,7 +1477,7 @@ static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared,
1494 /* 1477 /*
1495 * Only actually queue now that the atomic ops are done: 1478 * Only actually queue now that the atomic ops are done:
1496 */ 1479 */
1497 __queue_me(&q, hb); 1480 queue_me(&q, hb);
1498 1481
1499 /* 1482 /*
1500 * Now the futex is queued and we have checked the data, we 1483 * Now the futex is queued and we have checked the data, we
@@ -1581,6 +1564,8 @@ static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared,
1581 unqueue_me_pi(&q); 1564 unqueue_me_pi(&q);
1582 futex_unlock_mm(fshared); 1565 futex_unlock_mm(fshared);
1583 1566
1567 if (to)
1568 destroy_hrtimer_on_stack(&to->timer);
1584 return ret != -EINTR ? ret : -ERESTARTNOINTR; 1569 return ret != -EINTR ? ret : -ERESTARTNOINTR;
1585 1570
1586 out_unlock_release_sem: 1571 out_unlock_release_sem:
@@ -1588,6 +1573,8 @@ static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared,
1588 1573
1589 out_release_sem: 1574 out_release_sem:
1590 futex_unlock_mm(fshared); 1575 futex_unlock_mm(fshared);
1576 if (to)
1577 destroy_hrtimer_on_stack(&to->timer);
1591 return ret; 1578 return ret;
1592 1579
1593 uaddr_faulted: 1580 uaddr_faulted:
@@ -1615,6 +1602,8 @@ static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared,
1615 if (!ret && (uval != -EFAULT)) 1602 if (!ret && (uval != -EFAULT))
1616 goto retry; 1603 goto retry;
1617 1604
1605 if (to)
1606 destroy_hrtimer_on_stack(&to->timer);
1618 return ret; 1607 return ret;
1619} 1608}
1620 1609
@@ -1735,121 +1724,6 @@ pi_faulted:
1735 return ret; 1724 return ret;
1736} 1725}
1737 1726
1738static int futex_close(struct inode *inode, struct file *filp)
1739{
1740 struct futex_q *q = filp->private_data;
1741
1742 unqueue_me(q);
1743 kfree(q);
1744
1745 return 0;
1746}
1747
1748/* This is one-shot: once it's gone off you need a new fd */
1749static unsigned int futex_poll(struct file *filp,
1750 struct poll_table_struct *wait)
1751{
1752 struct futex_q *q = filp->private_data;
1753 int ret = 0;
1754
1755 poll_wait(filp, &q->waiters, wait);
1756
1757 /*
1758 * plist_node_empty() is safe here without any lock.
1759 * q->lock_ptr != 0 is not safe, because of ordering against wakeup.
1760 */
1761 if (plist_node_empty(&q->list))
1762 ret = POLLIN | POLLRDNORM;
1763
1764 return ret;
1765}
1766
1767static const struct file_operations futex_fops = {
1768 .release = futex_close,
1769 .poll = futex_poll,
1770};
1771
1772/*
1773 * Signal allows caller to avoid the race which would occur if they
1774 * set the sigio stuff up afterwards.
1775 */
1776static int futex_fd(u32 __user *uaddr, int signal)
1777{
1778 struct futex_q *q;
1779 struct file *filp;
1780 int ret, err;
1781 struct rw_semaphore *fshared;
1782 static unsigned long printk_interval;
1783
1784 if (printk_timed_ratelimit(&printk_interval, 60 * 60 * 1000)) {
1785 printk(KERN_WARNING "Process `%s' used FUTEX_FD, which "
1786 "will be removed from the kernel in June 2007\n",
1787 current->comm);
1788 }
1789
1790 ret = -EINVAL;
1791 if (!valid_signal(signal))
1792 goto out;
1793
1794 ret = get_unused_fd();
1795 if (ret < 0)
1796 goto out;
1797 filp = get_empty_filp();
1798 if (!filp) {
1799 put_unused_fd(ret);
1800 ret = -ENFILE;
1801 goto out;
1802 }
1803 filp->f_op = &futex_fops;
1804 filp->f_path.mnt = mntget(futex_mnt);
1805 filp->f_path.dentry = dget(futex_mnt->mnt_root);
1806 filp->f_mapping = filp->f_path.dentry->d_inode->i_mapping;
1807
1808 if (signal) {
1809 err = __f_setown(filp, task_pid(current), PIDTYPE_PID, 1);
1810 if (err < 0) {
1811 goto error;
1812 }
1813 filp->f_owner.signum = signal;
1814 }
1815
1816 q = kmalloc(sizeof(*q), GFP_KERNEL);
1817 if (!q) {
1818 err = -ENOMEM;
1819 goto error;
1820 }
1821 q->pi_state = NULL;
1822
1823 fshared = &current->mm->mmap_sem;
1824 down_read(fshared);
1825 err = get_futex_key(uaddr, fshared, &q->key);
1826
1827 if (unlikely(err != 0)) {
1828 up_read(fshared);
1829 kfree(q);
1830 goto error;
1831 }
1832
1833 /*
1834 * queue_me() must be called before releasing mmap_sem, because
1835 * key->shared.inode needs to be referenced while holding it.
1836 */
1837 filp->private_data = q;
1838
1839 queue_me(q, ret, filp);
1840 up_read(fshared);
1841
1842 /* Now we map fd to filp, so userspace can access it */
1843 fd_install(ret, filp);
1844out:
1845 return ret;
1846error:
1847 put_unused_fd(ret);
1848 put_filp(filp);
1849 ret = err;
1850 goto out;
1851}
1852
1853/* 1727/*
1854 * Support for robust futexes: the kernel cleans up held futexes at 1728 * Support for robust futexes: the kernel cleans up held futexes at
1855 * thread exit time. 1729 * thread exit time.
@@ -2081,10 +1955,6 @@ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,
2081 case FUTEX_WAKE_BITSET: 1955 case FUTEX_WAKE_BITSET:
2082 ret = futex_wake(uaddr, fshared, val, val3); 1956 ret = futex_wake(uaddr, fshared, val, val3);
2083 break; 1957 break;
2084 case FUTEX_FD:
2085 /* non-zero val means F_SETOWN(getpid()) & F_SETSIG(val) */
2086 ret = futex_fd(uaddr, val);
2087 break;
2088 case FUTEX_REQUEUE: 1958 case FUTEX_REQUEUE:
2089 ret = futex_requeue(uaddr, fshared, uaddr2, val, val2, NULL); 1959 ret = futex_requeue(uaddr, fshared, uaddr2, val, val2, NULL);
2090 break; 1960 break;
@@ -2145,19 +2015,6 @@ asmlinkage long sys_futex(u32 __user *uaddr, int op, u32 val,
2145 return do_futex(uaddr, op, val, tp, uaddr2, val2, val3); 2015 return do_futex(uaddr, op, val, tp, uaddr2, val2, val3);
2146} 2016}
2147 2017
2148static int futexfs_get_sb(struct file_system_type *fs_type,
2149 int flags, const char *dev_name, void *data,
2150 struct vfsmount *mnt)
2151{
2152 return get_sb_pseudo(fs_type, "futex", NULL, FUTEXFS_SUPER_MAGIC, mnt);
2153}
2154
2155static struct file_system_type futex_fs_type = {
2156 .name = "futexfs",
2157 .get_sb = futexfs_get_sb,
2158 .kill_sb = kill_anon_super,
2159};
2160
2161static int __init futex_init(void) 2018static int __init futex_init(void)
2162{ 2019{
2163 u32 curval; 2020 u32 curval;
@@ -2182,16 +2039,6 @@ static int __init futex_init(void)
2182 spin_lock_init(&futex_queues[i].lock); 2039 spin_lock_init(&futex_queues[i].lock);
2183 } 2040 }
2184 2041
2185 i = register_filesystem(&futex_fs_type);
2186 if (i)
2187 return i;
2188
2189 futex_mnt = kern_mount(&futex_fs_type);
2190 if (IS_ERR(futex_mnt)) {
2191 unregister_filesystem(&futex_fs_type);
2192 return PTR_ERR(futex_mnt);
2193 }
2194
2195 return 0; 2042 return 0;
2196} 2043}
2197__initcall(futex_init); 2044__initcall(futex_init);
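This drops the FUTEX_FD multiplexer op together with futexfs, the futex file_operations and the fd/filp plumbing in struct futex_q; queue_lock() and queue_me() lose their fd arguments accordingly. What remains is the plain futex() system call, used roughly like this from userspace (futex() has no glibc wrapper, hence syscall()):

    #include <linux/futex.h>
    #include <sys/syscall.h>
    #include <unistd.h>

    /* Sleep only if *addr still equals 'expected'; returns when woken,
     * or immediately with EWOULDBLOCK if the value already changed. */
    static long fwait(int *addr, int expected)
    {
            return syscall(SYS_futex, addr, FUTEX_WAIT, expected, NULL, NULL, 0);
    }

    /* Wake at most 'nr' tasks blocked on addr. */
    static long fwake(int *addr, int nr)
    {
            return syscall(SYS_futex, addr, FUTEX_WAKE, nr, NULL, NULL, 0);
    }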
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index e379ef0e9c20..421be5fe5cc7 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -43,6 +43,7 @@
43#include <linux/tick.h> 43#include <linux/tick.h>
44#include <linux/seq_file.h> 44#include <linux/seq_file.h>
45#include <linux/err.h> 45#include <linux/err.h>
46#include <linux/debugobjects.h>
46 47
47#include <asm/uaccess.h> 48#include <asm/uaccess.h>
48 49
@@ -153,15 +154,6 @@ static void hrtimer_get_softirq_time(struct hrtimer_cpu_base *base)
153} 154}
154 155
155/* 156/*
156 * Helper function to check, whether the timer is running the callback
157 * function
158 */
159static inline int hrtimer_callback_running(struct hrtimer *timer)
160{
161 return timer->state & HRTIMER_STATE_CALLBACK;
162}
163
164/*
165 * Functions and macros which are different for UP/SMP systems are kept in a 157 * Functions and macros which are different for UP/SMP systems are kept in a
166 * single place 158 * single place
167 */ 159 */
@@ -342,6 +334,115 @@ ktime_t ktime_add_safe(const ktime_t lhs, const ktime_t rhs)
342 return res; 334 return res;
343} 335}
344 336
337#ifdef CONFIG_DEBUG_OBJECTS_TIMERS
338
339static struct debug_obj_descr hrtimer_debug_descr;
340
341/*
342 * fixup_init is called when:
343 * - an active object is initialized
344 */
345static int hrtimer_fixup_init(void *addr, enum debug_obj_state state)
346{
347 struct hrtimer *timer = addr;
348
349 switch (state) {
350 case ODEBUG_STATE_ACTIVE:
351 hrtimer_cancel(timer);
352 debug_object_init(timer, &hrtimer_debug_descr);
353 return 1;
354 default:
355 return 0;
356 }
357}
358
359/*
360 * fixup_activate is called when:
361 * - an active object is activated
362 * - an unknown object is activated (might be a statically initialized object)
363 */
364static int hrtimer_fixup_activate(void *addr, enum debug_obj_state state)
365{
366 switch (state) {
367
368 case ODEBUG_STATE_NOTAVAILABLE:
369 WARN_ON_ONCE(1);
370 return 0;
371
372 case ODEBUG_STATE_ACTIVE:
373 WARN_ON(1);
374
375 default:
376 return 0;
377 }
378}
379
380/*
381 * fixup_free is called when:
382 * - an active object is freed
383 */
384static int hrtimer_fixup_free(void *addr, enum debug_obj_state state)
385{
386 struct hrtimer *timer = addr;
387
388 switch (state) {
389 case ODEBUG_STATE_ACTIVE:
390 hrtimer_cancel(timer);
391 debug_object_free(timer, &hrtimer_debug_descr);
392 return 1;
393 default:
394 return 0;
395 }
396}
397
398static struct debug_obj_descr hrtimer_debug_descr = {
399 .name = "hrtimer",
400 .fixup_init = hrtimer_fixup_init,
401 .fixup_activate = hrtimer_fixup_activate,
402 .fixup_free = hrtimer_fixup_free,
403};
404
405static inline void debug_hrtimer_init(struct hrtimer *timer)
406{
407 debug_object_init(timer, &hrtimer_debug_descr);
408}
409
410static inline void debug_hrtimer_activate(struct hrtimer *timer)
411{
412 debug_object_activate(timer, &hrtimer_debug_descr);
413}
414
415static inline void debug_hrtimer_deactivate(struct hrtimer *timer)
416{
417 debug_object_deactivate(timer, &hrtimer_debug_descr);
418}
419
420static inline void debug_hrtimer_free(struct hrtimer *timer)
421{
422 debug_object_free(timer, &hrtimer_debug_descr);
423}
424
425static void __hrtimer_init(struct hrtimer *timer, clockid_t clock_id,
426 enum hrtimer_mode mode);
427
428void hrtimer_init_on_stack(struct hrtimer *timer, clockid_t clock_id,
429 enum hrtimer_mode mode)
430{
431 debug_object_init_on_stack(timer, &hrtimer_debug_descr);
432 __hrtimer_init(timer, clock_id, mode);
433}
434
435void destroy_hrtimer_on_stack(struct hrtimer *timer)
436{
437 debug_object_free(timer, &hrtimer_debug_descr);
438}
439
440#else
441static inline void debug_hrtimer_init(struct hrtimer *timer) { }
442static inline void debug_hrtimer_activate(struct hrtimer *timer) { }
443static inline void debug_hrtimer_deactivate(struct hrtimer *timer) { }
444#endif
445
345/* 446/*
346 * Check, whether the timer is on the callback pending list 447 * Check, whether the timer is on the callback pending list
347 */ 448 */
@@ -567,6 +668,7 @@ static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer,
567 /* Timer is expired, act upon the callback mode */ 668 /* Timer is expired, act upon the callback mode */
568 switch(timer->cb_mode) { 669 switch(timer->cb_mode) {
569 case HRTIMER_CB_IRQSAFE_NO_RESTART: 670 case HRTIMER_CB_IRQSAFE_NO_RESTART:
671 debug_hrtimer_deactivate(timer);
570 /* 672 /*
571 * We can call the callback from here. No restart 673 * We can call the callback from here. No restart
572 * happens, so no danger of recursion 674 * happens, so no danger of recursion
@@ -581,6 +683,7 @@ static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer,
581 * the tick timer in the softirq ! The calling site 683 * the tick timer in the softirq ! The calling site
582 * takes care of this. 684 * takes care of this.
583 */ 685 */
686 debug_hrtimer_deactivate(timer);
584 return 1; 687 return 1;
585 case HRTIMER_CB_IRQSAFE: 688 case HRTIMER_CB_IRQSAFE:
586 case HRTIMER_CB_SOFTIRQ: 689 case HRTIMER_CB_SOFTIRQ:
@@ -590,7 +693,6 @@ static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer,
590 list_add_tail(&timer->cb_entry, 693 list_add_tail(&timer->cb_entry,
591 &base->cpu_base->cb_pending); 694 &base->cpu_base->cb_pending);
592 timer->state = HRTIMER_STATE_PENDING; 695 timer->state = HRTIMER_STATE_PENDING;
593 raise_softirq(HRTIMER_SOFTIRQ);
594 return 1; 696 return 1;
595 default: 697 default:
596 BUG(); 698 BUG();
@@ -633,6 +735,11 @@ static int hrtimer_switch_to_hres(void)
633 return 1; 735 return 1;
634} 736}
635 737
738static inline void hrtimer_raise_softirq(void)
739{
740 raise_softirq(HRTIMER_SOFTIRQ);
741}
742
636#else 743#else
637 744
638static inline int hrtimer_hres_active(void) { return 0; } 745static inline int hrtimer_hres_active(void) { return 0; }
@@ -651,6 +758,7 @@ static inline int hrtimer_reprogram(struct hrtimer *timer,
651{ 758{
652 return 0; 759 return 0;
653} 760}
761static inline void hrtimer_raise_softirq(void) { }
654 762
655#endif /* CONFIG_HIGH_RES_TIMERS */ 763#endif /* CONFIG_HIGH_RES_TIMERS */
656 764
@@ -730,6 +838,8 @@ static void enqueue_hrtimer(struct hrtimer *timer,
730 struct hrtimer *entry; 838 struct hrtimer *entry;
731 int leftmost = 1; 839 int leftmost = 1;
732 840
841 debug_hrtimer_activate(timer);
842
733 /* 843 /*
734 * Find the right place in the rbtree: 844 * Find the right place in the rbtree:
735 */ 845 */
@@ -826,6 +936,7 @@ remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base)
826 * reprogramming happens in the interrupt handler. This is a 936 * reprogramming happens in the interrupt handler. This is a
827 * rare case and less expensive than a smp call. 937 * rare case and less expensive than a smp call.
828 */ 938 */
939 debug_hrtimer_deactivate(timer);
829 timer_stats_hrtimer_clear_start_info(timer); 940 timer_stats_hrtimer_clear_start_info(timer);
830 reprogram = base->cpu_base == &__get_cpu_var(hrtimer_bases); 941 reprogram = base->cpu_base == &__get_cpu_var(hrtimer_bases);
831 __remove_hrtimer(timer, base, HRTIMER_STATE_INACTIVE, 942 __remove_hrtimer(timer, base, HRTIMER_STATE_INACTIVE,
@@ -850,7 +961,7 @@ hrtimer_start(struct hrtimer *timer, ktime_t tim, const enum hrtimer_mode mode)
850{ 961{
851 struct hrtimer_clock_base *base, *new_base; 962 struct hrtimer_clock_base *base, *new_base;
852 unsigned long flags; 963 unsigned long flags;
853 int ret; 964 int ret, raise;
854 965
855 base = lock_hrtimer_base(timer, &flags); 966 base = lock_hrtimer_base(timer, &flags);
856 967
@@ -873,6 +984,7 @@ hrtimer_start(struct hrtimer *timer, ktime_t tim, const enum hrtimer_mode mode)
873 tim = ktime_add_safe(tim, base->resolution); 984 tim = ktime_add_safe(tim, base->resolution);
874#endif 985#endif
875 } 986 }
987
876 timer->expires = tim; 988 timer->expires = tim;
877 989
878 timer_stats_hrtimer_set_start_info(timer); 990 timer_stats_hrtimer_set_start_info(timer);
@@ -884,8 +996,18 @@ hrtimer_start(struct hrtimer *timer, ktime_t tim, const enum hrtimer_mode mode)
884 enqueue_hrtimer(timer, new_base, 996 enqueue_hrtimer(timer, new_base,
885 new_base->cpu_base == &__get_cpu_var(hrtimer_bases)); 997 new_base->cpu_base == &__get_cpu_var(hrtimer_bases));
886 998
999 /*
1000 * The timer may be expired and moved to the cb_pending
1001 * list. We can not raise the softirq with base lock held due
1002 * to a possible deadlock with runqueue lock.
1003 */
1004 raise = timer->state == HRTIMER_STATE_PENDING;
1005
887 unlock_hrtimer_base(timer, &flags); 1006 unlock_hrtimer_base(timer, &flags);
888 1007
1008 if (raise)
1009 hrtimer_raise_softirq();
1010
889 return ret; 1011 return ret;
890} 1012}
891EXPORT_SYMBOL_GPL(hrtimer_start); 1013EXPORT_SYMBOL_GPL(hrtimer_start);
@@ -996,14 +1118,8 @@ ktime_t hrtimer_get_next_event(void)
996} 1118}
997#endif 1119#endif
998 1120
999/** 1121static void __hrtimer_init(struct hrtimer *timer, clockid_t clock_id,
1000 * hrtimer_init - initialize a timer to the given clock 1122 enum hrtimer_mode mode)
1001 * @timer: the timer to be initialized
1002 * @clock_id: the clock to be used
1003 * @mode: timer mode abs/rel
1004 */
1005void hrtimer_init(struct hrtimer *timer, clockid_t clock_id,
1006 enum hrtimer_mode mode)
1007{ 1123{
1008 struct hrtimer_cpu_base *cpu_base; 1124 struct hrtimer_cpu_base *cpu_base;
1009 1125
@@ -1024,6 +1140,19 @@ void hrtimer_init(struct hrtimer *timer, clockid_t clock_id,
1024 memset(timer->start_comm, 0, TASK_COMM_LEN); 1140 memset(timer->start_comm, 0, TASK_COMM_LEN);
1025#endif 1141#endif
1026} 1142}
1143
1144/**
1145 * hrtimer_init - initialize a timer to the given clock
1146 * @timer: the timer to be initialized
1147 * @clock_id: the clock to be used
1148 * @mode: timer mode abs/rel
1149 */
1150void hrtimer_init(struct hrtimer *timer, clockid_t clock_id,
1151 enum hrtimer_mode mode)
1152{
1153 debug_hrtimer_init(timer);
1154 __hrtimer_init(timer, clock_id, mode);
1155}
1027EXPORT_SYMBOL_GPL(hrtimer_init); 1156EXPORT_SYMBOL_GPL(hrtimer_init);
1028 1157
1029/** 1158/**
@@ -1057,6 +1186,7 @@ static void run_hrtimer_pending(struct hrtimer_cpu_base *cpu_base)
1057 timer = list_entry(cpu_base->cb_pending.next, 1186 timer = list_entry(cpu_base->cb_pending.next,
1058 struct hrtimer, cb_entry); 1187 struct hrtimer, cb_entry);
1059 1188
1189 debug_hrtimer_deactivate(timer);
1060 timer_stats_account_hrtimer(timer); 1190 timer_stats_account_hrtimer(timer);
1061 1191
1062 fn = timer->function; 1192 fn = timer->function;
@@ -1105,6 +1235,7 @@ static void __run_hrtimer(struct hrtimer *timer)
1105 enum hrtimer_restart (*fn)(struct hrtimer *); 1235 enum hrtimer_restart (*fn)(struct hrtimer *);
1106 int restart; 1236 int restart;
1107 1237
1238 debug_hrtimer_deactivate(timer);
1108 __remove_hrtimer(timer, base, HRTIMER_STATE_CALLBACK, 0); 1239 __remove_hrtimer(timer, base, HRTIMER_STATE_CALLBACK, 0);
1109 timer_stats_account_hrtimer(timer); 1240 timer_stats_account_hrtimer(timer);
1110 1241
@@ -1363,22 +1494,27 @@ long __sched hrtimer_nanosleep_restart(struct restart_block *restart)
1363{ 1494{
1364 struct hrtimer_sleeper t; 1495 struct hrtimer_sleeper t;
1365 struct timespec __user *rmtp; 1496 struct timespec __user *rmtp;
1497 int ret = 0;
1366 1498
1367 hrtimer_init(&t.timer, restart->nanosleep.index, HRTIMER_MODE_ABS); 1499 hrtimer_init_on_stack(&t.timer, restart->nanosleep.index,
1500 HRTIMER_MODE_ABS);
1368 t.timer.expires.tv64 = restart->nanosleep.expires; 1501 t.timer.expires.tv64 = restart->nanosleep.expires;
1369 1502
1370 if (do_nanosleep(&t, HRTIMER_MODE_ABS)) 1503 if (do_nanosleep(&t, HRTIMER_MODE_ABS))
1371 return 0; 1504 goto out;
1372 1505
1373 rmtp = restart->nanosleep.rmtp; 1506 rmtp = restart->nanosleep.rmtp;
1374 if (rmtp) { 1507 if (rmtp) {
1375 int ret = update_rmtp(&t.timer, rmtp); 1508 ret = update_rmtp(&t.timer, rmtp);
1376 if (ret <= 0) 1509 if (ret <= 0)
1377 return ret; 1510 goto out;
1378 } 1511 }
1379 1512
1380 /* The other values in restart are already filled in */ 1513 /* The other values in restart are already filled in */
1381 return -ERESTART_RESTARTBLOCK; 1514 ret = -ERESTART_RESTARTBLOCK;
1515out:
1516 destroy_hrtimer_on_stack(&t.timer);
1517 return ret;
1382} 1518}
1383 1519
1384long hrtimer_nanosleep(struct timespec *rqtp, struct timespec __user *rmtp, 1520long hrtimer_nanosleep(struct timespec *rqtp, struct timespec __user *rmtp,
@@ -1386,20 +1522,23 @@ long hrtimer_nanosleep(struct timespec *rqtp, struct timespec __user *rmtp,
1386{ 1522{
1387 struct restart_block *restart; 1523 struct restart_block *restart;
1388 struct hrtimer_sleeper t; 1524 struct hrtimer_sleeper t;
1525 int ret = 0;
1389 1526
1390 hrtimer_init(&t.timer, clockid, mode); 1527 hrtimer_init_on_stack(&t.timer, clockid, mode);
1391 t.timer.expires = timespec_to_ktime(*rqtp); 1528 t.timer.expires = timespec_to_ktime(*rqtp);
1392 if (do_nanosleep(&t, mode)) 1529 if (do_nanosleep(&t, mode))
1393 return 0; 1530 goto out;
1394 1531
1395 /* Absolute timers do not update the rmtp value and restart: */ 1532 /* Absolute timers do not update the rmtp value and restart: */
1396 if (mode == HRTIMER_MODE_ABS) 1533 if (mode == HRTIMER_MODE_ABS) {
1397 return -ERESTARTNOHAND; 1534 ret = -ERESTARTNOHAND;
1535 goto out;
1536 }
1398 1537
1399 if (rmtp) { 1538 if (rmtp) {
1400 int ret = update_rmtp(&t.timer, rmtp); 1539 ret = update_rmtp(&t.timer, rmtp);
1401 if (ret <= 0) 1540 if (ret <= 0)
1402 return ret; 1541 goto out;
1403 } 1542 }
1404 1543
1405 restart = &current_thread_info()->restart_block; 1544 restart = &current_thread_info()->restart_block;
@@ -1408,7 +1547,10 @@ long hrtimer_nanosleep(struct timespec *rqtp, struct timespec __user *rmtp,
1408 restart->nanosleep.rmtp = rmtp; 1547 restart->nanosleep.rmtp = rmtp;
1409 restart->nanosleep.expires = t.timer.expires.tv64; 1548 restart->nanosleep.expires = t.timer.expires.tv64;
1410 1549
1411 return -ERESTART_RESTARTBLOCK; 1550 ret = -ERESTART_RESTARTBLOCK;
1551out:
1552 destroy_hrtimer_on_stack(&t.timer);
1553 return ret;
1412} 1554}
1413 1555
1414asmlinkage long 1556asmlinkage long
@@ -1453,6 +1595,7 @@ static void migrate_hrtimer_list(struct hrtimer_clock_base *old_base,
1453 while ((node = rb_first(&old_base->active))) { 1595 while ((node = rb_first(&old_base->active))) {
1454 timer = rb_entry(node, struct hrtimer, node); 1596 timer = rb_entry(node, struct hrtimer, node);
1455 BUG_ON(hrtimer_callback_running(timer)); 1597 BUG_ON(hrtimer_callback_running(timer));
1598 debug_hrtimer_deactivate(timer);
1456 __remove_hrtimer(timer, old_base, HRTIMER_STATE_INACTIVE, 0); 1599 __remove_hrtimer(timer, old_base, HRTIMER_STATE_INACTIVE, 0);
1457 timer->base = new_base; 1600 timer->base = new_base;
1458 /* 1601 /*
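The new CONFIG_DEBUG_OBJECTS_TIMERS hooks track each hrtimer's lifetime, so timers that live on the stack must be announced with hrtimer_init_on_stack() and retired with destroy_hrtimer_on_stack() before the frame goes away; the futex and nanosleep conversions above do exactly that on every exit path. A condensed sketch of the pattern those call sites follow (restart and error handling omitted):

    static void sleep_until(ktime_t abs_expiry)
    {
            struct hrtimer_sleeper t;

            /* On-stack timer: tell the debugobjects code about it explicitly. */
            hrtimer_init_on_stack(&t.timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
            hrtimer_init_sleeper(&t, current);
            t.timer.expires = abs_expiry;

            set_current_state(TASK_UNINTERRUPTIBLE);
            hrtimer_start(&t.timer, t.timer.expires, HRTIMER_MODE_ABS);
            if (hrtimer_active(&t.timer))
                    schedule();             /* the sleeper callback wakes us */
            hrtimer_cancel(&t.timer);
            __set_current_state(TASK_RUNNING);

            /* Must happen before the stack frame vanishes. */
            destroy_hrtimer_on_stack(&t.timer);
    }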
diff --git a/kernel/irq/devres.c b/kernel/irq/devres.c
index 6d9204f3a370..38a25b8d8bff 100644
--- a/kernel/irq/devres.c
+++ b/kernel/irq/devres.c
@@ -1,6 +1,7 @@
1#include <linux/module.h> 1#include <linux/module.h>
2#include <linux/interrupt.h> 2#include <linux/interrupt.h>
3#include <linux/device.h> 3#include <linux/device.h>
4#include <linux/gfp.h>
4 5
5/* 6/*
6 * Device resource management aware IRQ request/free implementation. 7 * Device resource management aware IRQ request/free implementation.
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 438a01464287..46d6611a33bb 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -11,6 +11,7 @@
11#include <linux/module.h> 11#include <linux/module.h>
12#include <linux/random.h> 12#include <linux/random.h>
13#include <linux/interrupt.h> 13#include <linux/interrupt.h>
14#include <linux/slab.h>
14 15
15#include "internals.h" 16#include "internals.h"
16 17
@@ -149,6 +150,26 @@ void disable_irq(unsigned int irq)
149} 150}
150EXPORT_SYMBOL(disable_irq); 151EXPORT_SYMBOL(disable_irq);
151 152
153static void __enable_irq(struct irq_desc *desc, unsigned int irq)
154{
155 switch (desc->depth) {
156 case 0:
157 printk(KERN_WARNING "Unbalanced enable for IRQ %d\n", irq);
158 WARN_ON(1);
159 break;
160 case 1: {
161 unsigned int status = desc->status & ~IRQ_DISABLED;
162
163 /* Prevent probing on this irq: */
164 desc->status = status | IRQ_NOPROBE;
165 check_irq_resend(desc, irq);
166 /* fall-through */
167 }
168 default:
169 desc->depth--;
170 }
171}
172
152/** 173/**
153 * enable_irq - enable handling of an irq 174 * enable_irq - enable handling of an irq
154 * @irq: Interrupt to enable 175 * @irq: Interrupt to enable
@@ -168,22 +189,7 @@ void enable_irq(unsigned int irq)
168 return; 189 return;
169 190
170 spin_lock_irqsave(&desc->lock, flags); 191 spin_lock_irqsave(&desc->lock, flags);
171 switch (desc->depth) { 192 __enable_irq(desc, irq);
172 case 0:
173 printk(KERN_WARNING "Unbalanced enable for IRQ %d\n", irq);
174 WARN_ON(1);
175 break;
176 case 1: {
177 unsigned int status = desc->status & ~IRQ_DISABLED;
178
179 /* Prevent probing on this irq: */
180 desc->status = status | IRQ_NOPROBE;
181 check_irq_resend(desc, irq);
182 /* fall-through */
183 }
184 default:
185 desc->depth--;
186 }
187 spin_unlock_irqrestore(&desc->lock, flags); 193 spin_unlock_irqrestore(&desc->lock, flags);
188} 194}
189EXPORT_SYMBOL(enable_irq); 195EXPORT_SYMBOL(enable_irq);
@@ -364,7 +370,7 @@ int setup_irq(unsigned int irq, struct irqaction *new)
364 compat_irq_chip_set_default_handler(desc); 370 compat_irq_chip_set_default_handler(desc);
365 371
366 desc->status &= ~(IRQ_AUTODETECT | IRQ_WAITING | 372 desc->status &= ~(IRQ_AUTODETECT | IRQ_WAITING |
367 IRQ_INPROGRESS); 373 IRQ_INPROGRESS | IRQ_SPURIOUS_DISABLED);
368 374
369 if (!(desc->status & IRQ_NOAUTOEN)) { 375 if (!(desc->status & IRQ_NOAUTOEN)) {
370 desc->depth = 0; 376 desc->depth = 0;
@@ -380,6 +386,16 @@ int setup_irq(unsigned int irq, struct irqaction *new)
380 /* Reset broken irq detection when installing new handler */ 386 /* Reset broken irq detection when installing new handler */
381 desc->irq_count = 0; 387 desc->irq_count = 0;
382 desc->irqs_unhandled = 0; 388 desc->irqs_unhandled = 0;
389
390 /*
391 * Check whether we disabled the irq via the spurious handler
392 * before. Reenable it and give it another chance.
393 */
394 if (shared && (desc->status & IRQ_SPURIOUS_DISABLED)) {
395 desc->status &= ~IRQ_SPURIOUS_DISABLED;
396 __enable_irq(desc, irq);
397 }
398
383 spin_unlock_irqrestore(&desc->lock, flags); 399 spin_unlock_irqrestore(&desc->lock, flags);
384 400
385 new->irq = irq; 401 new->irq = irq;
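The depth handling moves out of enable_irq() into __enable_irq() so setup_irq() can reuse it: when a new shared handler is registered on a line the spurious-IRQ detector had shut down (now flagged IRQ_SPURIOUS_DISABLED, see the spurious.c hunk below), the line is given another chance. The depth counter also means driver-side disable/enable calls nest; roughly, for a hypothetical device:

    static void mydev_reprogram(struct mydev *dev)   /* hypothetical driver */
    {
            disable_irq(dev->irq);    /* depth 0 -> 1: line masked, handlers finished */
            disable_irq(dev->irq);    /* depth 1 -> 2: nests, still masked */

            mydev_write_config(dev);  /* no interrupt handler can run here */

            enable_irq(dev->irq);     /* depth 2 -> 1: still masked */
            enable_irq(dev->irq);     /* depth 1 -> 0: line live again */
            /* one more enable_irq() would hit the "Unbalanced enable
             * for IRQ" warning in __enable_irq() */
    }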
diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c
index 088dabbf2d6a..c66d3f10e853 100644
--- a/kernel/irq/spurious.c
+++ b/kernel/irq/spurious.c
@@ -209,8 +209,8 @@ void note_interrupt(unsigned int irq, struct irq_desc *desc,
209 * Now kill the IRQ 209 * Now kill the IRQ
210 */ 210 */
211 printk(KERN_EMERG "Disabling IRQ #%d\n", irq); 211 printk(KERN_EMERG "Disabling IRQ #%d\n", irq);
212 desc->status |= IRQ_DISABLED; 212 desc->status |= IRQ_DISABLED | IRQ_SPURIOUS_DISABLED;
213 desc->depth = 1; 213 desc->depth++;
214 desc->chip->disable(irq); 214 desc->chip->disable(irq);
215 } 215 }
216 desc->irqs_unhandled = 0; 216 desc->irqs_unhandled = 0;
diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c
index f091d13def00..6fc0040f3e3a 100644
--- a/kernel/kallsyms.c
+++ b/kernel/kallsyms.c
@@ -472,11 +472,7 @@ static const struct file_operations kallsyms_operations = {
472 472
473static int __init kallsyms_init(void) 473static int __init kallsyms_init(void)
474{ 474{
475 struct proc_dir_entry *entry; 475 proc_create("kallsyms", 0444, NULL, &kallsyms_operations);
476
477 entry = create_proc_entry("kallsyms", 0444, NULL);
478 if (entry)
479 entry->proc_fops = &kallsyms_operations;
480 return 0; 476 return 0;
481} 477}
482__initcall(kallsyms_init); 478__initcall(kallsyms_init);
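This hunk, like the latencytop.c and lockdep_proc.c hunks further down, folds the two-step create_proc_entry() plus proc_fops assignment into a single proc_create() call, so the /proc entry is never visible without its file operations. The shape of the conversion, shown for a hypothetical "example" entry:

    static int __init example_init(void)
    {
            /* Before: the entry existed before proc_fops was filled in.
             *     entry = create_proc_entry("example", 0444, NULL);
             *     if (entry)
             *             entry->proc_fops = &example_fops;
             */

            /* After: name, mode, parent and fops are registered in one call. */
            proc_create("example", 0444, NULL, &example_fops);
            return 0;
    }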
diff --git a/kernel/kexec.c b/kernel/kexec.c
index cb85c79989b4..1c5fcacbcf33 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -1217,7 +1217,7 @@ static int __init parse_crashkernel_mem(char *cmdline,
1217 } 1217 }
1218 1218
1219 /* match ? */ 1219 /* match ? */
1220 if (system_ram >= start && system_ram <= end) { 1220 if (system_ram >= start && system_ram < end) {
1221 *crash_size = size; 1221 *crash_size = size;
1222 break; 1222 break;
1223 } 1223 }
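The comparison change makes each crashkernel=<range>:<size> entry a half-open interval [start, end), so a machine whose RAM size lands exactly on a boundary is matched by the following range instead of the earlier one. With the documented example syntax (the sizes below are only for illustration):

    crashkernel=512M-2G:64M,2G-:128M    with    system_ram == 2G

        old test (system_ram <= end):  2G matches 512M-2G  ->  reserve  64M
        new test (system_ram <  end):  2G matches 2G-      ->  reserve 128M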
diff --git a/kernel/kgdb.c b/kernel/kgdb.c
index 1bd0ec1c80b2..39e31a036f5b 100644
--- a/kernel/kgdb.c
+++ b/kernel/kgdb.c
@@ -61,7 +61,7 @@ struct kgdb_state {
61 int err_code; 61 int err_code;
62 int cpu; 62 int cpu;
63 int pass_exception; 63 int pass_exception;
64 long threadid; 64 unsigned long threadid;
65 long kgdb_usethreadid; 65 long kgdb_usethreadid;
66 struct pt_regs *linux_regs; 66 struct pt_regs *linux_regs;
67}; 67};
@@ -146,7 +146,7 @@ atomic_t kgdb_cpu_doing_single_step = ATOMIC_INIT(-1);
146 * the other CPUs might interfere with your debugging context, so 146 * the other CPUs might interfere with your debugging context, so
147 * use this with care: 147 * use this with care:
148 */ 148 */
149int kgdb_do_roundup = 1; 149static int kgdb_do_roundup = 1;
150 150
151static int __init opt_nokgdbroundup(char *str) 151static int __init opt_nokgdbroundup(char *str)
152{ 152{
@@ -438,7 +438,7 @@ int kgdb_hex2mem(char *buf, char *mem, int count)
438 * While we find nice hex chars, build a long_val. 438 * While we find nice hex chars, build a long_val.
439 * Return number of chars processed. 439 * Return number of chars processed.
440 */ 440 */
441int kgdb_hex2long(char **ptr, long *long_val) 441int kgdb_hex2long(char **ptr, unsigned long *long_val)
442{ 442{
443 int hex_val; 443 int hex_val;
444 int num = 0; 444 int num = 0;
@@ -709,7 +709,7 @@ int kgdb_isremovedbreak(unsigned long addr)
709 return 0; 709 return 0;
710} 710}
711 711
712int remove_all_break(void) 712static int remove_all_break(void)
713{ 713{
714 unsigned long addr; 714 unsigned long addr;
715 int error; 715 int error;
diff --git a/kernel/kmod.c b/kernel/kmod.c
index e2764047ec03..8df97d3dfda8 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -27,6 +27,7 @@
27#include <linux/mnt_namespace.h> 27#include <linux/mnt_namespace.h>
28#include <linux/completion.h> 28#include <linux/completion.h>
29#include <linux/file.h> 29#include <linux/file.h>
30#include <linux/fdtable.h>
30#include <linux/workqueue.h> 31#include <linux/workqueue.h>
31#include <linux/security.h> 32#include <linux/security.h>
32#include <linux/mount.h> 33#include <linux/mount.h>
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 92cf6930ab51..bd1b9ea024e1 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -98,7 +98,7 @@ static void create_kthread(struct kthread_create_info *create)
98 struct sched_param param = { .sched_priority = 0 }; 98 struct sched_param param = { .sched_priority = 0 };
99 wait_for_completion(&create->started); 99 wait_for_completion(&create->started);
100 read_lock(&tasklist_lock); 100 read_lock(&tasklist_lock);
101 create->result = find_task_by_pid(pid); 101 create->result = find_task_by_pid_ns(pid, &init_pid_ns);
102 read_unlock(&tasklist_lock); 102 read_unlock(&tasklist_lock);
103 /* 103 /*
104 * root may have changed our (kthreadd's) priority or CPU mask. 104 * root may have changed our (kthreadd's) priority or CPU mask.
@@ -144,9 +144,9 @@ struct task_struct *kthread_create(int (*threadfn)(void *data),
144 144
145 spin_lock(&kthread_create_lock); 145 spin_lock(&kthread_create_lock);
146 list_add_tail(&create.list, &kthread_create_list); 146 list_add_tail(&create.list, &kthread_create_list);
147 wake_up_process(kthreadd_task);
148 spin_unlock(&kthread_create_lock); 147 spin_unlock(&kthread_create_lock);
149 148
149 wake_up_process(kthreadd_task);
150 wait_for_completion(&create.done); 150 wait_for_completion(&create.done);
151 151
152 if (!IS_ERR(create.result)) { 152 if (!IS_ERR(create.result)) {
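Besides the pid-namespace-aware lookup, kthread_create() now queues its request under kthread_create_lock but wakes kthreadd only after dropping the lock, so the freshly woken thread does not immediately contend on the spinlock its waker still holds. The general shape of that change, with hypothetical names:

    static void submit_request(struct request *req)   /* illustrative only */
    {
            /* Publish the work under the lock... */
            spin_lock(&pending_lock);
            list_add_tail(&req->list, &pending_list);
            spin_unlock(&pending_lock);

            /* ...and wake the consumer after dropping it, so it can run
             * right away without bouncing off pending_lock. */
            wake_up_process(consumer_task);
    }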
diff --git a/kernel/latencytop.c b/kernel/latencytop.c
index 7c74dab0d21b..5e7b45c56923 100644
--- a/kernel/latencytop.c
+++ b/kernel/latencytop.c
@@ -233,14 +233,7 @@ static struct file_operations lstats_fops = {
233 233
234static int __init init_lstats_procfs(void) 234static int __init init_lstats_procfs(void)
235{ 235{
236 struct proc_dir_entry *pe; 236 proc_create("latency_stats", 0644, NULL, &lstats_fops);
237
238 pe = create_proc_entry("latency_stats", 0644, NULL);
239 if (!pe)
240 return -ENOMEM;
241
242 pe->proc_fops = &lstats_fops;
243
244 return 0; 237 return 0;
245} 238}
246__initcall(init_lstats_procfs); 239__initcall(init_lstats_procfs);
diff --git a/kernel/lockdep_proc.c b/kernel/lockdep_proc.c
index 8a135bd163c2..dc5d29648d85 100644
--- a/kernel/lockdep_proc.c
+++ b/kernel/lockdep_proc.c
@@ -660,20 +660,12 @@ static const struct file_operations proc_lock_stat_operations = {
660 660
661static int __init lockdep_proc_init(void) 661static int __init lockdep_proc_init(void)
662{ 662{
663 struct proc_dir_entry *entry; 663 proc_create("lockdep", S_IRUSR, NULL, &proc_lockdep_operations);
664 664 proc_create("lockdep_stats", S_IRUSR, NULL,
665 entry = create_proc_entry("lockdep", S_IRUSR, NULL); 665 &proc_lockdep_stats_operations);
666 if (entry)
667 entry->proc_fops = &proc_lockdep_operations;
668
669 entry = create_proc_entry("lockdep_stats", S_IRUSR, NULL);
670 if (entry)
671 entry->proc_fops = &proc_lockdep_stats_operations;
672 666
673#ifdef CONFIG_LOCK_STAT 667#ifdef CONFIG_LOCK_STAT
674 entry = create_proc_entry("lock_stat", S_IRUSR, NULL); 668 proc_create("lock_stat", S_IRUSR, NULL, &proc_lock_stat_operations);
675 if (entry)
676 entry->proc_fops = &proc_lock_stat_operations;
677#endif 669#endif
678 670
679 return 0; 671 return 0;
diff --git a/kernel/marker.c b/kernel/marker.c
index 005b95954593..b5a9fe1d50d5 100644
--- a/kernel/marker.c
+++ b/kernel/marker.c
@@ -23,12 +23,13 @@
23#include <linux/rcupdate.h> 23#include <linux/rcupdate.h>
24#include <linux/marker.h> 24#include <linux/marker.h>
25#include <linux/err.h> 25#include <linux/err.h>
26#include <linux/slab.h>
26 27
27extern struct marker __start___markers[]; 28extern struct marker __start___markers[];
28extern struct marker __stop___markers[]; 29extern struct marker __stop___markers[];
29 30
30/* Set to 1 to enable marker debug output */ 31/* Set to 1 to enable marker debug output */
31const int marker_debug; 32static const int marker_debug;
32 33
33/* 34/*
34 * markers_mutex nests inside module_mutex. Markers mutex protects the builtin 35 * markers_mutex nests inside module_mutex. Markers mutex protects the builtin
diff --git a/kernel/module.c b/kernel/module.c
index 8d6cccc6c3cf..8e4528c9909f 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -164,131 +164,140 @@ static const struct kernel_symbol *lookup_symbol(const char *name,
164 return NULL; 164 return NULL;
165} 165}
166 166
167static void printk_unused_warning(const char *name) 167static bool always_ok(bool gplok, bool warn, const char *name)
168{ 168{
169 printk(KERN_WARNING "Symbol %s is marked as UNUSED, " 169 return true;
170 "however this module is using it.\n", name);
171 printk(KERN_WARNING "This symbol will go away in the future.\n");
172 printk(KERN_WARNING "Please evalute if this is the right api to use, "
173 "and if it really is, submit a report the linux kernel "
174 "mailinglist together with submitting your code for "
175 "inclusion.\n");
176} 170}
177 171
178/* Find a symbol, return value, crc and module which owns it */ 172static bool printk_unused_warning(bool gplok, bool warn, const char *name)
179static unsigned long __find_symbol(const char *name,
180 struct module **owner,
181 const unsigned long **crc,
182 int gplok)
183{ 173{
184 struct module *mod; 174 if (warn) {
185 const struct kernel_symbol *ks; 175 printk(KERN_WARNING "Symbol %s is marked as UNUSED, "
186 176 "however this module is using it.\n", name);
187 /* Core kernel first. */ 177 printk(KERN_WARNING
188 *owner = NULL; 178 "This symbol will go away in the future.\n");
189 ks = lookup_symbol(name, __start___ksymtab, __stop___ksymtab); 179 printk(KERN_WARNING
190 if (ks) { 180 "Please evalute if this is the right api to use and if "
191 *crc = symversion(__start___kcrctab, (ks - __start___ksymtab)); 181 "it really is, submit a report the linux kernel "
192 return ks->value; 182 "mailinglist together with submitting your code for "
193 } 183 "inclusion.\n");
194 if (gplok) {
195 ks = lookup_symbol(name, __start___ksymtab_gpl,
196 __stop___ksymtab_gpl);
197 if (ks) {
198 *crc = symversion(__start___kcrctab_gpl,
199 (ks - __start___ksymtab_gpl));
200 return ks->value;
201 }
202 } 184 }
203 ks = lookup_symbol(name, __start___ksymtab_gpl_future, 185 return true;
204 __stop___ksymtab_gpl_future); 186}
205 if (ks) { 187
206 if (!gplok) { 188static bool gpl_only_unused_warning(bool gplok, bool warn, const char *name)
207 printk(KERN_WARNING "Symbol %s is being used " 189{
208 "by a non-GPL module, which will not " 190 if (!gplok)
209 "be allowed in the future\n", name); 191 return false;
210 printk(KERN_WARNING "Please see the file " 192 return printk_unused_warning(gplok, warn, name);
211 "Documentation/feature-removal-schedule.txt " 193}
212 "in the kernel source tree for more " 194
213 "details.\n"); 195static bool gpl_only(bool gplok, bool warn, const char *name)
214 } 196{
215 *crc = symversion(__start___kcrctab_gpl_future, 197 return gplok;
216 (ks - __start___ksymtab_gpl_future)); 198}
217 return ks->value; 199
200static bool warn_if_not_gpl(bool gplok, bool warn, const char *name)
201{
202 if (!gplok && warn) {
203 printk(KERN_WARNING "Symbol %s is being used "
204 "by a non-GPL module, which will not "
205 "be allowed in the future\n", name);
206 printk(KERN_WARNING "Please see the file "
207 "Documentation/feature-removal-schedule.txt "
208 "in the kernel source tree for more details.\n");
218 } 209 }
210 return true;
211}
219 212
220 ks = lookup_symbol(name, __start___ksymtab_unused, 213struct symsearch {
221 __stop___ksymtab_unused); 214 const struct kernel_symbol *start, *stop;
222 if (ks) { 215 const unsigned long *crcs;
223 printk_unused_warning(name); 216 bool (*check)(bool gplok, bool warn, const char *name);
224 *crc = symversion(__start___kcrctab_unused, 217};
225 (ks - __start___ksymtab_unused)); 218
226 return ks->value; 219/* Look through this array of symbol tables for a symbol match which
220 * passes the check function. */
221static const struct kernel_symbol *search_symarrays(const struct symsearch *arr,
222 unsigned int num,
223 const char *name,
224 bool gplok,
225 bool warn,
226 const unsigned long **crc)
227{
228 unsigned int i;
229 const struct kernel_symbol *ks;
230
231 for (i = 0; i < num; i++) {
232 ks = lookup_symbol(name, arr[i].start, arr[i].stop);
233 if (!ks || !arr[i].check(gplok, warn, name))
234 continue;
235
236 if (crc)
237 *crc = symversion(arr[i].crcs, ks - arr[i].start);
238 return ks;
227 } 239 }
240 return NULL;
241}
228 242
229 if (gplok) 243/* Find a symbol, return value, (optional) crc and (optional) module
230 ks = lookup_symbol(name, __start___ksymtab_unused_gpl, 244 * which owns it */
231 __stop___ksymtab_unused_gpl); 245static unsigned long find_symbol(const char *name,
246 struct module **owner,
247 const unsigned long **crc,
248 bool gplok,
249 bool warn)
250{
251 struct module *mod;
252 const struct kernel_symbol *ks;
253 const struct symsearch arr[] = {
254 { __start___ksymtab, __stop___ksymtab, __start___kcrctab,
255 always_ok },
256 { __start___ksymtab_gpl, __stop___ksymtab_gpl,
257 __start___kcrctab_gpl, gpl_only },
258 { __start___ksymtab_gpl_future, __stop___ksymtab_gpl_future,
259 __start___kcrctab_gpl_future, warn_if_not_gpl },
260 { __start___ksymtab_unused, __stop___ksymtab_unused,
261 __start___kcrctab_unused, printk_unused_warning },
262 { __start___ksymtab_unused_gpl, __stop___ksymtab_unused_gpl,
263 __start___kcrctab_unused_gpl, gpl_only_unused_warning },
264 };
265
266 /* Core kernel first. */
267 ks = search_symarrays(arr, ARRAY_SIZE(arr), name, gplok, warn, crc);
232 if (ks) { 268 if (ks) {
233 printk_unused_warning(name); 269 if (owner)
234 *crc = symversion(__start___kcrctab_unused_gpl, 270 *owner = NULL;
235 (ks - __start___ksymtab_unused_gpl));
236 return ks->value; 271 return ks->value;
237 } 272 }
238 273
239 /* Now try modules. */ 274 /* Now try modules. */
240 list_for_each_entry(mod, &modules, list) { 275 list_for_each_entry(mod, &modules, list) {
241 *owner = mod; 276 struct symsearch arr[] = {
242 ks = lookup_symbol(name, mod->syms, mod->syms + mod->num_syms); 277 { mod->syms, mod->syms + mod->num_syms, mod->crcs,
278 always_ok },
279 { mod->gpl_syms, mod->gpl_syms + mod->num_gpl_syms,
280 mod->gpl_crcs, gpl_only },
281 { mod->gpl_future_syms,
282 mod->gpl_future_syms + mod->num_gpl_future_syms,
283 mod->gpl_future_crcs, warn_if_not_gpl },
284 { mod->unused_syms,
285 mod->unused_syms + mod->num_unused_syms,
286 mod->unused_crcs, printk_unused_warning },
287 { mod->unused_gpl_syms,
288 mod->unused_gpl_syms + mod->num_unused_gpl_syms,
289 mod->unused_gpl_crcs, gpl_only_unused_warning },
290 };
291
292 ks = search_symarrays(arr, ARRAY_SIZE(arr),
293 name, gplok, warn, crc);
243 if (ks) { 294 if (ks) {
244 *crc = symversion(mod->crcs, (ks - mod->syms)); 295 if (owner)
245 return ks->value; 296 *owner = mod;
246 }
247
248 if (gplok) {
249 ks = lookup_symbol(name, mod->gpl_syms,
250 mod->gpl_syms + mod->num_gpl_syms);
251 if (ks) {
252 *crc = symversion(mod->gpl_crcs,
253 (ks - mod->gpl_syms));
254 return ks->value;
255 }
256 }
257 ks = lookup_symbol(name, mod->unused_syms, mod->unused_syms + mod->num_unused_syms);
258 if (ks) {
259 printk_unused_warning(name);
260 *crc = symversion(mod->unused_crcs, (ks - mod->unused_syms));
261 return ks->value;
262 }
263
264 if (gplok) {
265 ks = lookup_symbol(name, mod->unused_gpl_syms,
266 mod->unused_gpl_syms + mod->num_unused_gpl_syms);
267 if (ks) {
268 printk_unused_warning(name);
269 *crc = symversion(mod->unused_gpl_crcs,
270 (ks - mod->unused_gpl_syms));
271 return ks->value;
272 }
273 }
274 ks = lookup_symbol(name, mod->gpl_future_syms,
275 (mod->gpl_future_syms +
276 mod->num_gpl_future_syms));
277 if (ks) {
278 if (!gplok) {
279 printk(KERN_WARNING "Symbol %s is being used "
280 "by a non-GPL module, which will not "
281 "be allowed in the future\n", name);
282 printk(KERN_WARNING "Please see the file "
283 "Documentation/feature-removal-schedule.txt "
284 "in the kernel source tree for more "
285 "details.\n");
286 }
287 *crc = symversion(mod->gpl_future_crcs,
288 (ks - mod->gpl_future_syms));
289 return ks->value; 297 return ks->value;
290 } 298 }
291 } 299 }
300
292 DEBUGP("Failed to find symbol %s\n", name); 301 DEBUGP("Failed to find symbol %s\n", name);
293 return -ENOENT; 302 return -ENOENT;
294} 303}
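
The hunk above replaces five nearly identical lookup blocks with one table of {start, stop, crcs, check} entries walked by search_symarrays(). A minimal userspace sketch of the same table-driven pattern, with illustrative types and check callbacks rather than the kernel's:

/* Sketch: table-driven lookup over several symbol arrays, in the spirit
 * of the search_symarrays() refactor above.  Userspace approximation. */
#include <stdio.h>
#include <stdbool.h>
#include <stddef.h>
#include <string.h>

struct sym { const char *name; unsigned long value; };

struct symsearch {
    const struct sym *start, *stop;
    bool (*check)(bool gplok, const char *name);   /* per-table policy */
};

static bool always_ok(bool gplok, const char *name)
{
    (void)gplok; (void)name;
    return true;
}

static bool gpl_only(bool gplok, const char *name)
{
    (void)name;
    return gplok;
}

static const struct sym *lookup(const char *name,
                                const struct sym *start, const struct sym *stop)
{
    for (const struct sym *s = start; s < stop; s++)
        if (strcmp(s->name, name) == 0)
            return s;
    return NULL;
}

/* One loop replaces a chain of copy-pasted per-table blocks. */
static const struct sym *search_symarrays(const struct symsearch *arr,
                                          size_t num, const char *name,
                                          bool gplok)
{
    for (size_t i = 0; i < num; i++) {
        const struct sym *s = lookup(name, arr[i].start, arr[i].stop);
        if (s && arr[i].check(gplok, name))
            return s;
    }
    return NULL;
}

int main(void)
{
    static const struct sym plain[] = { { "plain_sym", 1 } };
    static const struct sym gpl[]   = { { "gpl_sym",   2 } };
    const struct symsearch arr[] = {
        { plain, plain + 1, always_ok },
        { gpl,   gpl + 1,   gpl_only  },
    };
    const struct sym *s = search_symarrays(arr, 2, "gpl_sym", false);

    printf("%s\n", s ? "found" : "not found (non-GPL caller)");
    return 0;
}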
@@ -736,12 +745,13 @@ sys_delete_module(const char __user *name_user, unsigned int flags)
736 if (!forced && module_refcount(mod) != 0) 745 if (!forced && module_refcount(mod) != 0)
737 wait_for_zero_refcount(mod); 746 wait_for_zero_refcount(mod);
738 747
748 mutex_unlock(&module_mutex);
739 /* Final destruction now noone is using it. */ 749 /* Final destruction now noone is using it. */
740 if (mod->exit != NULL) { 750 if (mod->exit != NULL)
741 mutex_unlock(&module_mutex);
742 mod->exit(); 751 mod->exit();
743 mutex_lock(&module_mutex); 752 blocking_notifier_call_chain(&module_notify_list,
744 } 753 MODULE_STATE_GOING, mod);
754 mutex_lock(&module_mutex);
745 /* Store the name of the last unloaded module for diagnostic purposes */ 755 /* Store the name of the last unloaded module for diagnostic purposes */
746 strlcpy(last_unloaded_module, mod->name, sizeof(last_unloaded_module)); 756 strlcpy(last_unloaded_module, mod->name, sizeof(last_unloaded_module));
747 free_module(mod); 757 free_module(mod);
@@ -777,10 +787,9 @@ static void print_unload_info(struct seq_file *m, struct module *mod)
777void __symbol_put(const char *symbol) 787void __symbol_put(const char *symbol)
778{ 788{
779 struct module *owner; 789 struct module *owner;
780 const unsigned long *crc;
781 790
782 preempt_disable(); 791 preempt_disable();
783 if (IS_ERR_VALUE(__find_symbol(symbol, &owner, &crc, 1))) 792 if (IS_ERR_VALUE(find_symbol(symbol, &owner, NULL, true, false)))
784 BUG(); 793 BUG();
785 module_put(owner); 794 module_put(owner);
786 preempt_enable(); 795 preempt_enable();
@@ -881,6 +890,19 @@ static struct module_attribute *modinfo_attrs[] = {
881 890
882static const char vermagic[] = VERMAGIC_STRING; 891static const char vermagic[] = VERMAGIC_STRING;
883 892
893static int try_to_force_load(struct module *mod, const char *symname)
894{
895#ifdef CONFIG_MODULE_FORCE_LOAD
896 if (!(tainted & TAINT_FORCED_MODULE))
897 printk("%s: no version for \"%s\" found: kernel tainted.\n",
898 mod->name, symname);
899 add_taint_module(mod, TAINT_FORCED_MODULE);
900 return 0;
901#else
902 return -ENOEXEC;
903#endif
904}
905
884#ifdef CONFIG_MODVERSIONS 906#ifdef CONFIG_MODVERSIONS
885static int check_version(Elf_Shdr *sechdrs, 907static int check_version(Elf_Shdr *sechdrs,
886 unsigned int versindex, 908 unsigned int versindex,
@@ -905,18 +927,18 @@ static int check_version(Elf_Shdr *sechdrs,
905 927
906 if (versions[i].crc == *crc) 928 if (versions[i].crc == *crc)
907 return 1; 929 return 1;
908 printk("%s: disagrees about version of symbol %s\n",
909 mod->name, symname);
910 DEBUGP("Found checksum %lX vs module %lX\n", 930 DEBUGP("Found checksum %lX vs module %lX\n",
911 *crc, versions[i].crc); 931 *crc, versions[i].crc);
912 return 0; 932 goto bad_version;
913 } 933 }
914 /* Not in module's version table. OK, but that taints the kernel. */ 934
915 if (!(tainted & TAINT_FORCED_MODULE)) 935 if (!try_to_force_load(mod, symname))
916 printk("%s: no version for \"%s\" found: kernel tainted.\n", 936 return 1;
917 mod->name, symname); 937
918 add_taint_module(mod, TAINT_FORCED_MODULE); 938bad_version:
919 return 1; 939 printk("%s: disagrees about version of symbol %s\n",
940 mod->name, symname);
941 return 0;
920} 942}
921 943
922static inline int check_modstruct_version(Elf_Shdr *sechdrs, 944static inline int check_modstruct_version(Elf_Shdr *sechdrs,
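
try_to_force_load() above centralizes the "taint and continue, or refuse" decision that check_version() and load_module() previously open-coded, gated on CONFIG_MODULE_FORCE_LOAD. A hedged userspace sketch of that compile-time gate; the flag names mimic the kernel's, the rest is invented for illustration:

/* Sketch: force-load either taints and continues, or is refused outright,
 * depending on a build-time option. */
#include <stdio.h>

/* #define CONFIG_MODULE_FORCE_LOAD */   /* define to allow forced loads */

#define TAINT_FORCED_MODULE (1UL << 1)

static unsigned long tainted;

static int try_to_force_load(const char *mod, const char *what)
{
#ifdef CONFIG_MODULE_FORCE_LOAD
    if (!(tainted & TAINT_FORCED_MODULE))
        printf("%s: no version for \"%s\" found: kernel tainted.\n",
               mod, what);
    tainted |= TAINT_FORCED_MODULE;
    return 0;                       /* carry on, but remember the taint */
#else
    (void)mod; (void)what;
    return -1;                      /* the kernel returns -ENOEXEC here */
#endif
}

int main(void)
{
    int err = try_to_force_load("example", "magic");

    printf("%s, taint mask %#lx\n",
           err ? "load refused" : "loaded with taint", tainted);
    return 0;
}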
@@ -924,13 +946,10 @@ static inline int check_modstruct_version(Elf_Shdr *sechdrs,
924 struct module *mod) 946 struct module *mod)
925{ 947{
926 const unsigned long *crc; 948 const unsigned long *crc;
927 struct module *owner;
928 949
929 if (IS_ERR_VALUE(__find_symbol("struct_module", 950 if (IS_ERR_VALUE(find_symbol("struct_module", NULL, &crc, true, false)))
930 &owner, &crc, 1)))
931 BUG(); 951 BUG();
932 return check_version(sechdrs, versindex, "struct_module", mod, 952 return check_version(sechdrs, versindex, "struct_module", mod, crc);
933 crc);
934} 953}
935 954
936/* First part is kernel version, which we ignore. */ 955/* First part is kernel version, which we ignore. */
@@ -974,8 +993,8 @@ static unsigned long resolve_symbol(Elf_Shdr *sechdrs,
974 unsigned long ret; 993 unsigned long ret;
975 const unsigned long *crc; 994 const unsigned long *crc;
976 995
977 ret = __find_symbol(name, &owner, &crc, 996 ret = find_symbol(name, &owner, &crc,
978 !(mod->taints & TAINT_PROPRIETARY_MODULE)); 997 !(mod->taints & TAINT_PROPRIETARY_MODULE), true);
979 if (!IS_ERR_VALUE(ret)) { 998 if (!IS_ERR_VALUE(ret)) {
980 /* use_module can fail due to OOM, 999 /* use_module can fail due to OOM,
981 or module initialization or unloading */ 1000 or module initialization or unloading */
@@ -991,6 +1010,20 @@ static unsigned long resolve_symbol(Elf_Shdr *sechdrs,
991 * J. Corbet <corbet@lwn.net> 1010 * J. Corbet <corbet@lwn.net>
992 */ 1011 */
993#if defined(CONFIG_KALLSYMS) && defined(CONFIG_SYSFS) 1012#if defined(CONFIG_KALLSYMS) && defined(CONFIG_SYSFS)
1013struct module_sect_attr
1014{
1015 struct module_attribute mattr;
1016 char *name;
1017 unsigned long address;
1018};
1019
1020struct module_sect_attrs
1021{
1022 struct attribute_group grp;
1023 unsigned int nsections;
1024 struct module_sect_attr attrs[0];
1025};
1026
994static ssize_t module_sect_show(struct module_attribute *mattr, 1027static ssize_t module_sect_show(struct module_attribute *mattr,
995 struct module *mod, char *buf) 1028 struct module *mod, char *buf)
996{ 1029{
@@ -1001,7 +1034,7 @@ static ssize_t module_sect_show(struct module_attribute *mattr,
1001 1034
1002static void free_sect_attrs(struct module_sect_attrs *sect_attrs) 1035static void free_sect_attrs(struct module_sect_attrs *sect_attrs)
1003{ 1036{
1004 int section; 1037 unsigned int section;
1005 1038
1006 for (section = 0; section < sect_attrs->nsections; section++) 1039 for (section = 0; section < sect_attrs->nsections; section++)
1007 kfree(sect_attrs->attrs[section].name); 1040 kfree(sect_attrs->attrs[section].name);
@@ -1362,10 +1395,9 @@ void *__symbol_get(const char *symbol)
1362{ 1395{
1363 struct module *owner; 1396 struct module *owner;
1364 unsigned long value; 1397 unsigned long value;
1365 const unsigned long *crc;
1366 1398
1367 preempt_disable(); 1399 preempt_disable();
1368 value = __find_symbol(symbol, &owner, &crc, 1); 1400 value = find_symbol(symbol, &owner, NULL, true, true);
1369 if (IS_ERR_VALUE(value)) 1401 if (IS_ERR_VALUE(value))
1370 value = 0; 1402 value = 0;
1371 else if (strong_try_module_get(owner)) 1403 else if (strong_try_module_get(owner))
@@ -1382,33 +1414,33 @@ EXPORT_SYMBOL_GPL(__symbol_get);
1382 */ 1414 */
1383static int verify_export_symbols(struct module *mod) 1415static int verify_export_symbols(struct module *mod)
1384{ 1416{
1385 const char *name = NULL; 1417 unsigned int i;
1386 unsigned long i, ret = 0;
1387 struct module *owner; 1418 struct module *owner;
1388 const unsigned long *crc; 1419 const struct kernel_symbol *s;
1389 1420 struct {
1390 for (i = 0; i < mod->num_syms; i++) 1421 const struct kernel_symbol *sym;
1391 if (!IS_ERR_VALUE(__find_symbol(mod->syms[i].name, 1422 unsigned int num;
1392 &owner, &crc, 1))) { 1423 } arr[] = {
1393 name = mod->syms[i].name; 1424 { mod->syms, mod->num_syms },
1394 ret = -ENOEXEC; 1425 { mod->gpl_syms, mod->num_gpl_syms },
1395 goto dup; 1426 { mod->gpl_future_syms, mod->num_gpl_future_syms },
1396 } 1427 { mod->unused_syms, mod->num_unused_syms },
1428 { mod->unused_gpl_syms, mod->num_unused_gpl_syms },
1429 };
1397 1430
1398 for (i = 0; i < mod->num_gpl_syms; i++) 1431 for (i = 0; i < ARRAY_SIZE(arr); i++) {
1399 if (!IS_ERR_VALUE(__find_symbol(mod->gpl_syms[i].name, 1432 for (s = arr[i].sym; s < arr[i].sym + arr[i].num; s++) {
1400 &owner, &crc, 1))) { 1433 if (!IS_ERR_VALUE(find_symbol(s->name, &owner,
1401 name = mod->gpl_syms[i].name; 1434 NULL, true, false))) {
1402 ret = -ENOEXEC; 1435 printk(KERN_ERR
1403 goto dup; 1436 "%s: exports duplicate symbol %s"
1437 " (owned by %s)\n",
1438 mod->name, s->name, module_name(owner));
1439 return -ENOEXEC;
1440 }
1404 } 1441 }
1405 1442 }
1406dup: 1443 return 0;
1407 if (ret)
1408 printk(KERN_ERR "%s: exports duplicate symbol %s (owned by %s)\n",
1409 mod->name, name, module_name(owner));
1410
1411 return ret;
1412} 1444}
1413 1445
1414/* Change all symbols so that st_value encodes the pointer directly. */ 1446/* Change all symbols so that st_value encodes the pointer directly. */
@@ -1814,8 +1846,9 @@ static struct module *load_module(void __user *umod,
1814 unwindex = find_sec(hdr, sechdrs, secstrings, ARCH_UNWIND_SECTION_NAME); 1846 unwindex = find_sec(hdr, sechdrs, secstrings, ARCH_UNWIND_SECTION_NAME);
1815#endif 1847#endif
1816 1848
1817 /* Don't keep modinfo section */ 1849 /* Don't keep modinfo and version sections. */
1818 sechdrs[infoindex].sh_flags &= ~(unsigned long)SHF_ALLOC; 1850 sechdrs[infoindex].sh_flags &= ~(unsigned long)SHF_ALLOC;
1851 sechdrs[versindex].sh_flags &= ~(unsigned long)SHF_ALLOC;
1819#ifdef CONFIG_KALLSYMS 1852#ifdef CONFIG_KALLSYMS
1820 /* Keep symbol and string tables for decoding later. */ 1853 /* Keep symbol and string tables for decoding later. */
1821 sechdrs[symindex].sh_flags |= SHF_ALLOC; 1854 sechdrs[symindex].sh_flags |= SHF_ALLOC;
@@ -1833,9 +1866,9 @@ static struct module *load_module(void __user *umod,
1833 modmagic = get_modinfo(sechdrs, infoindex, "vermagic"); 1866 modmagic = get_modinfo(sechdrs, infoindex, "vermagic");
1834 /* This is allowed: modprobe --force will invalidate it. */ 1867 /* This is allowed: modprobe --force will invalidate it. */
1835 if (!modmagic) { 1868 if (!modmagic) {
1836 add_taint_module(mod, TAINT_FORCED_MODULE); 1869 err = try_to_force_load(mod, "magic");
1837 printk(KERN_WARNING "%s: no version magic, tainting kernel.\n", 1870 if (err)
1838 mod->name); 1871 goto free_hdr;
1839 } else if (!same_magic(modmagic, vermagic)) { 1872 } else if (!same_magic(modmagic, vermagic)) {
1840 printk(KERN_ERR "%s: version magic '%s' should be '%s'\n", 1873 printk(KERN_ERR "%s: version magic '%s' should be '%s'\n",
1841 mod->name, modmagic, vermagic); 1874 mod->name, modmagic, vermagic);
@@ -1977,7 +2010,8 @@ static struct module *load_module(void __user *umod,
1977 mod->unused_crcs = (void *)sechdrs[unusedcrcindex].sh_addr; 2010 mod->unused_crcs = (void *)sechdrs[unusedcrcindex].sh_addr;
1978 mod->unused_gpl_syms = (void *)sechdrs[unusedgplindex].sh_addr; 2011 mod->unused_gpl_syms = (void *)sechdrs[unusedgplindex].sh_addr;
1979 if (unusedgplcrcindex) 2012 if (unusedgplcrcindex)
1980 mod->unused_crcs = (void *)sechdrs[unusedgplcrcindex].sh_addr; 2013 mod->unused_gpl_crcs
2014 = (void *)sechdrs[unusedgplcrcindex].sh_addr;
1981 2015
1982#ifdef CONFIG_MODVERSIONS 2016#ifdef CONFIG_MODVERSIONS
1983 if ((mod->num_syms && !crcindex) || 2017 if ((mod->num_syms && !crcindex) ||
@@ -1985,9 +2019,10 @@ static struct module *load_module(void __user *umod,
1985 (mod->num_gpl_future_syms && !gplfuturecrcindex) || 2019 (mod->num_gpl_future_syms && !gplfuturecrcindex) ||
1986 (mod->num_unused_syms && !unusedcrcindex) || 2020 (mod->num_unused_syms && !unusedcrcindex) ||
1987 (mod->num_unused_gpl_syms && !unusedgplcrcindex)) { 2021 (mod->num_unused_gpl_syms && !unusedgplcrcindex)) {
1988 printk(KERN_WARNING "%s: No versions for exported symbols." 2022 printk(KERN_WARNING "%s: No versions for exported symbols.\n", mod->name);
1989 " Tainting kernel.\n", mod->name); 2023 err = try_to_force_load(mod, "nocrc");
1990 add_taint_module(mod, TAINT_FORCED_MODULE); 2024 if (err)
2025 goto cleanup;
1991 } 2026 }
1992#endif 2027#endif
1993 markersindex = find_sec(hdr, sechdrs, secstrings, "__markers"); 2028 markersindex = find_sec(hdr, sechdrs, secstrings, "__markers");
@@ -2171,6 +2206,8 @@ sys_init_module(void __user *umod,
2171 mod->state = MODULE_STATE_GOING; 2206 mod->state = MODULE_STATE_GOING;
2172 synchronize_sched(); 2207 synchronize_sched();
2173 module_put(mod); 2208 module_put(mod);
2209 blocking_notifier_call_chain(&module_notify_list,
2210 MODULE_STATE_GOING, mod);
2174 mutex_lock(&module_mutex); 2211 mutex_lock(&module_mutex);
2175 free_module(mod); 2212 free_module(mod);
2176 mutex_unlock(&module_mutex); 2213 mutex_unlock(&module_mutex);
diff --git a/kernel/notifier.c b/kernel/notifier.c
index 643360d1bb14..823be11584ef 100644
--- a/kernel/notifier.c
+++ b/kernel/notifier.c
@@ -31,6 +31,21 @@ static int notifier_chain_register(struct notifier_block **nl,
31 return 0; 31 return 0;
32} 32}
33 33
34static int notifier_chain_cond_register(struct notifier_block **nl,
35 struct notifier_block *n)
36{
37 while ((*nl) != NULL) {
38 if ((*nl) == n)
39 return 0;
40 if (n->priority > (*nl)->priority)
41 break;
42 nl = &((*nl)->next);
43 }
44 n->next = *nl;
45 rcu_assign_pointer(*nl, n);
46 return 0;
47}
48
34static int notifier_chain_unregister(struct notifier_block **nl, 49static int notifier_chain_unregister(struct notifier_block **nl,
35 struct notifier_block *n) 50 struct notifier_block *n)
36{ 51{
@@ -205,6 +220,29 @@ int blocking_notifier_chain_register(struct blocking_notifier_head *nh,
205EXPORT_SYMBOL_GPL(blocking_notifier_chain_register); 220EXPORT_SYMBOL_GPL(blocking_notifier_chain_register);
206 221
207/** 222/**
223 * blocking_notifier_chain_cond_register - Cond add notifier to a blocking notifier chain
224 * @nh: Pointer to head of the blocking notifier chain
225 * @n: New entry in notifier chain
226 *
227 * Adds a notifier to a blocking notifier chain, only if not already
228 * present in the chain.
229 * Must be called in process context.
230 *
231 * Currently always returns zero.
232 */
233int blocking_notifier_chain_cond_register(struct blocking_notifier_head *nh,
234 struct notifier_block *n)
235{
236 int ret;
237
238 down_write(&nh->rwsem);
239 ret = notifier_chain_cond_register(&nh->head, n);
240 up_write(&nh->rwsem);
241 return ret;
242}
243EXPORT_SYMBOL_GPL(blocking_notifier_chain_cond_register);
244
245/**
208 * blocking_notifier_chain_unregister - Remove notifier from a blocking notifier chain 246 * blocking_notifier_chain_unregister - Remove notifier from a blocking notifier chain
209 * @nh: Pointer to head of the blocking notifier chain 247 * @nh: Pointer to head of the blocking notifier chain
210 * @n: Entry to remove from notifier chain 248 * @n: Entry to remove from notifier chain
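
notifier_chain_cond_register() above walks the chain in priority order and bails out if the block is already registered, so repeated registrations are harmless. A userspace sketch of that insert-unless-present pattern on a plain singly linked list (no RCU, no locking):

/* Sketch of "insert by priority unless already present", as used by
 * notifier_chain_cond_register() above. */
#include <stdio.h>
#include <stddef.h>

struct nb { int priority; struct nb *next; };

static int cond_register(struct nb **nl, struct nb *n)
{
    while (*nl != NULL) {
        if (*nl == n)
            return 0;               /* already on the chain: nothing to do */
        if (n->priority > (*nl)->priority)
            break;                  /* keep the list sorted, highest first */
        nl = &(*nl)->next;
    }
    n->next = *nl;
    *nl = n;                        /* the kernel uses rcu_assign_pointer() */
    return 0;
}

int main(void)
{
    struct nb a = { .priority = 10 }, b = { .priority = 5 };
    struct nb *head = NULL;

    cond_register(&head, &b);
    cond_register(&head, &a);
    cond_register(&head, &a);       /* duplicate: silently ignored */

    for (struct nb *p = head; p; p = p->next)
        printf("prio %d\n", p->priority);   /* prints 10, then 5 */
    return 0;
}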
diff --git a/kernel/ns_cgroup.c b/kernel/ns_cgroup.c
index aead4d69f62b..48d7ed6fc3a4 100644
--- a/kernel/ns_cgroup.c
+++ b/kernel/ns_cgroup.c
@@ -7,6 +7,8 @@
7#include <linux/module.h> 7#include <linux/module.h>
8#include <linux/cgroup.h> 8#include <linux/cgroup.h>
9#include <linux/fs.h> 9#include <linux/fs.h>
10#include <linux/slab.h>
11#include <linux/nsproxy.h>
10 12
11struct ns_cgroup { 13struct ns_cgroup {
12 struct cgroup_subsys_state css; 14 struct cgroup_subsys_state css;
diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
index f5d332cf8c63..adc785146a1c 100644
--- a/kernel/nsproxy.c
+++ b/kernel/nsproxy.c
@@ -139,6 +139,18 @@ int copy_namespaces(unsigned long flags, struct task_struct *tsk)
139 goto out; 139 goto out;
140 } 140 }
141 141
142 /*
143 * CLONE_NEWIPC must detach from the undolist: after switching
144 * to a new ipc namespace, the semaphore arrays from the old
145 * namespace are unreachable. In clone parlance, CLONE_SYSVSEM
146 * means share undolist with parent, so we must forbid using
147 * it along with CLONE_NEWIPC.
148 */
149 if ((flags & CLONE_NEWIPC) && (flags & CLONE_SYSVSEM)) {
150 err = -EINVAL;
151 goto out;
152 }
153
142 new_ns = create_new_namespaces(flags, tsk, tsk->fs); 154 new_ns = create_new_namespaces(flags, tsk, tsk->fs);
143 if (IS_ERR(new_ns)) { 155 if (IS_ERR(new_ns)) {
144 err = PTR_ERR(new_ns); 156 err = PTR_ERR(new_ns);
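
The copy_namespaces() check above rejects CLONE_NEWIPC together with CLONE_SYSVSEM, since an undo list shared with the parent would point into a namespace the child can no longer reach. A small userspace sketch of that mutual-exclusion check; the flag values are shown for illustration only:

/* Sketch: refusing mutually exclusive clone flags, as the CLONE_NEWIPC
 * vs CLONE_SYSVSEM check added above does. */
#include <stdio.h>
#include <errno.h>

#define CLONE_SYSVSEM  0x00040000UL   /* share SysV semaphore undo list */
#define CLONE_NEWIPC   0x08000000UL   /* new IPC namespace */

static int check_clone_flags(unsigned long flags)
{
    /* A fresh IPC namespace makes the old semaphore arrays unreachable,
     * so sharing the undo list with the parent is refused. */
    if ((flags & CLONE_NEWIPC) && (flags & CLONE_SYSVSEM))
        return -EINVAL;
    return 0;
}

int main(void)
{
    printf("%d\n", check_clone_flags(CLONE_NEWIPC | CLONE_SYSVSEM)); /* -EINVAL */
    printf("%d\n", check_clone_flags(CLONE_NEWIPC));                 /* 0 */
    return 0;
}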
diff --git a/kernel/panic.c b/kernel/panic.c
index 24af9f8bac99..425567f45b9f 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -153,6 +153,8 @@ EXPORT_SYMBOL(panic);
153 * 'M' - System experienced a machine check exception. 153 * 'M' - System experienced a machine check exception.
154 * 'B' - System has hit bad_page. 154 * 'B' - System has hit bad_page.
155 * 'U' - Userspace-defined naughtiness. 155 * 'U' - Userspace-defined naughtiness.
156 * 'A' - ACPI table overridden.
157 * 'W' - Taint on warning.
156 * 158 *
157 * The string is overwritten by the next call to print_taint(). 159 * The string is overwritten by the next call to print_taint().
158 */ 160 */
@@ -161,7 +163,7 @@ const char *print_tainted(void)
161{ 163{
162 static char buf[20]; 164 static char buf[20];
163 if (tainted) { 165 if (tainted) {
164 snprintf(buf, sizeof(buf), "Tainted: %c%c%c%c%c%c%c%c%c", 166 snprintf(buf, sizeof(buf), "Tainted: %c%c%c%c%c%c%c%c%c%c",
165 tainted & TAINT_PROPRIETARY_MODULE ? 'P' : 'G', 167 tainted & TAINT_PROPRIETARY_MODULE ? 'P' : 'G',
166 tainted & TAINT_FORCED_MODULE ? 'F' : ' ', 168 tainted & TAINT_FORCED_MODULE ? 'F' : ' ',
167 tainted & TAINT_UNSAFE_SMP ? 'S' : ' ', 169 tainted & TAINT_UNSAFE_SMP ? 'S' : ' ',
@@ -170,7 +172,8 @@ const char *print_tainted(void)
170 tainted & TAINT_BAD_PAGE ? 'B' : ' ', 172 tainted & TAINT_BAD_PAGE ? 'B' : ' ',
171 tainted & TAINT_USER ? 'U' : ' ', 173 tainted & TAINT_USER ? 'U' : ' ',
172 tainted & TAINT_DIE ? 'D' : ' ', 174 tainted & TAINT_DIE ? 'D' : ' ',
173 tainted & TAINT_OVERRIDDEN_ACPI_TABLE ? 'A' : ' '); 175 tainted & TAINT_OVERRIDDEN_ACPI_TABLE ? 'A' : ' ',
176 tainted & TAINT_WARN ? 'W' : ' ');
174 } 177 }
175 else 178 else
176 snprintf(buf, sizeof(buf), "Not tainted"); 179 snprintf(buf, sizeof(buf), "Not tainted");
@@ -312,6 +315,7 @@ void warn_on_slowpath(const char *file, int line)
312 print_modules(); 315 print_modules();
313 dump_stack(); 316 dump_stack();
314 print_oops_end_marker(); 317 print_oops_end_marker();
318 add_taint(TAINT_WARN);
315} 319}
316EXPORT_SYMBOL(warn_on_slowpath); 320EXPORT_SYMBOL(warn_on_slowpath);
317#endif 321#endif
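
print_tainted() above gains a tenth flag character, 'W' for TAINT_WARN, set whenever warn_on_slowpath() fires. A hedged sketch of producing such a string from a flag table rather than one long format string; the bit values and buffer size here are illustrative only:

/* Sketch: building a taint string from a bitmask via a flag table. */
#include <stdio.h>

#define T_PROPRIETARY (1UL << 0)
#define T_FORCED      (1UL << 1)
#define T_WARN        (1UL << 9)

static const struct { unsigned long bit; char set, clear; } flags[] = {
    { T_PROPRIETARY, 'P', 'G' },
    { T_FORCED,      'F', ' ' },
    { T_WARN,        'W', ' ' },
};

static const char *print_tainted(unsigned long tainted)
{
    static char buf[32];
    size_t i, n = 0;

    if (!tainted)
        return "Not tainted";
    n += snprintf(buf + n, sizeof(buf) - n, "Tainted: ");
    for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++)
        n += snprintf(buf + n, sizeof(buf) - n, "%c",
                      (tainted & flags[i].bit) ? flags[i].set : flags[i].clear);
    return buf;
}

int main(void)
{
    printf("%s\n", print_tainted(T_FORCED | T_WARN));   /* "Tainted: GFW" */
    return 0;
}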
diff --git a/kernel/pid.c b/kernel/pid.c
index 477691576b33..20d59fa2d493 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -111,10 +111,11 @@ EXPORT_SYMBOL(is_container_init);
111 111
112static __cacheline_aligned_in_smp DEFINE_SPINLOCK(pidmap_lock); 112static __cacheline_aligned_in_smp DEFINE_SPINLOCK(pidmap_lock);
113 113
114static void free_pidmap(struct pid_namespace *pid_ns, int pid) 114static void free_pidmap(struct upid *upid)
115{ 115{
116 struct pidmap *map = pid_ns->pidmap + pid / BITS_PER_PAGE; 116 int nr = upid->nr;
117 int offset = pid & BITS_PER_PAGE_MASK; 117 struct pidmap *map = upid->ns->pidmap + nr / BITS_PER_PAGE;
118 int offset = nr & BITS_PER_PAGE_MASK;
118 119
119 clear_bit(offset, map->page); 120 clear_bit(offset, map->page);
120 atomic_inc(&map->nr_free); 121 atomic_inc(&map->nr_free);
@@ -232,7 +233,7 @@ void free_pid(struct pid *pid)
232 spin_unlock_irqrestore(&pidmap_lock, flags); 233 spin_unlock_irqrestore(&pidmap_lock, flags);
233 234
234 for (i = 0; i <= pid->level; i++) 235 for (i = 0; i <= pid->level; i++)
235 free_pidmap(pid->numbers[i].ns, pid->numbers[i].nr); 236 free_pidmap(pid->numbers + i);
236 237
237 call_rcu(&pid->rcu, delayed_put_pid); 238 call_rcu(&pid->rcu, delayed_put_pid);
238} 239}
@@ -278,8 +279,8 @@ out:
278 return pid; 279 return pid;
279 280
280out_free: 281out_free:
281 for (i++; i <= ns->level; i++) 282 while (++i <= ns->level)
282 free_pidmap(pid->numbers[i].ns, pid->numbers[i].nr); 283 free_pidmap(pid->numbers + i);
283 284
284 kmem_cache_free(ns->pid_cachep, pid); 285 kmem_cache_free(ns->pid_cachep, pid);
285 pid = NULL; 286 pid = NULL;
@@ -316,7 +317,7 @@ EXPORT_SYMBOL_GPL(find_pid);
316/* 317/*
317 * attach_pid() must be called with the tasklist_lock write-held. 318 * attach_pid() must be called with the tasklist_lock write-held.
318 */ 319 */
319int attach_pid(struct task_struct *task, enum pid_type type, 320void attach_pid(struct task_struct *task, enum pid_type type,
320 struct pid *pid) 321 struct pid *pid)
321{ 322{
322 struct pid_link *link; 323 struct pid_link *link;
@@ -324,11 +325,10 @@ int attach_pid(struct task_struct *task, enum pid_type type,
324 link = &task->pids[type]; 325 link = &task->pids[type];
325 link->pid = pid; 326 link->pid = pid;
326 hlist_add_head_rcu(&link->node, &pid->tasks[type]); 327 hlist_add_head_rcu(&link->node, &pid->tasks[type]);
327
328 return 0;
329} 328}
330 329
331void detach_pid(struct task_struct *task, enum pid_type type) 330static void __change_pid(struct task_struct *task, enum pid_type type,
331 struct pid *new)
332{ 332{
333 struct pid_link *link; 333 struct pid_link *link;
334 struct pid *pid; 334 struct pid *pid;
@@ -338,7 +338,7 @@ void detach_pid(struct task_struct *task, enum pid_type type)
338 pid = link->pid; 338 pid = link->pid;
339 339
340 hlist_del_rcu(&link->node); 340 hlist_del_rcu(&link->node);
341 link->pid = NULL; 341 link->pid = new;
342 342
343 for (tmp = PIDTYPE_MAX; --tmp >= 0; ) 343 for (tmp = PIDTYPE_MAX; --tmp >= 0; )
344 if (!hlist_empty(&pid->tasks[tmp])) 344 if (!hlist_empty(&pid->tasks[tmp]))
@@ -347,13 +347,24 @@ void detach_pid(struct task_struct *task, enum pid_type type)
347 free_pid(pid); 347 free_pid(pid);
348} 348}
349 349
350void detach_pid(struct task_struct *task, enum pid_type type)
351{
352 __change_pid(task, type, NULL);
353}
354
355void change_pid(struct task_struct *task, enum pid_type type,
356 struct pid *pid)
357{
358 __change_pid(task, type, pid);
359 attach_pid(task, type, pid);
360}
361
350/* transfer_pid is an optimization of attach_pid(new), detach_pid(old) */ 362/* transfer_pid is an optimization of attach_pid(new), detach_pid(old) */
351void transfer_pid(struct task_struct *old, struct task_struct *new, 363void transfer_pid(struct task_struct *old, struct task_struct *new,
352 enum pid_type type) 364 enum pid_type type)
353{ 365{
354 new->pids[type].pid = old->pids[type].pid; 366 new->pids[type].pid = old->pids[type].pid;
355 hlist_replace_rcu(&old->pids[type].node, &new->pids[type].node); 367 hlist_replace_rcu(&old->pids[type].node, &new->pids[type].node);
356 old->pids[type].pid = NULL;
357} 368}
358 369
359struct task_struct *pid_task(struct pid *pid, enum pid_type type) 370struct task_struct *pid_task(struct pid *pid, enum pid_type type)
@@ -380,12 +391,6 @@ struct task_struct *find_task_by_pid_type_ns(int type, int nr,
380 391
381EXPORT_SYMBOL(find_task_by_pid_type_ns); 392EXPORT_SYMBOL(find_task_by_pid_type_ns);
382 393
383struct task_struct *find_task_by_pid(pid_t nr)
384{
385 return find_task_by_pid_type_ns(PIDTYPE_PID, nr, &init_pid_ns);
386}
387EXPORT_SYMBOL(find_task_by_pid);
388
389struct task_struct *find_task_by_vpid(pid_t vnr) 394struct task_struct *find_task_by_vpid(pid_t vnr)
390{ 395{
391 return find_task_by_pid_type_ns(PIDTYPE_PID, vnr, 396 return find_task_by_pid_type_ns(PIDTYPE_PID, vnr,
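
The pid.c changes above fold detach_pid() into a more general __change_pid(task, type, new), with detach being the new == NULL case and change_pid() being "replace, then attach". A much-simplified userspace sketch of that shape; the struct layout and reference counting here are invented stand-ins, not the kernel's:

/* Sketch: detach expressed as change-to-NULL, mirroring the refactor. */
#include <stdio.h>
#include <stdlib.h>

struct pid  { int nr; int users; };
struct task { struct pid *pid; };

static struct pid *get_pid(struct pid *p)
{
    if (p)
        p->users++;
    return p;
}

static void put_pid(struct pid *p)
{
    if (p && --p->users == 0) {
        printf("freeing pid %d\n", p->nr);
        free(p);
    }
}

static void attach_pid(struct task *t, struct pid *p)
{
    t->pid = get_pid(p);
}

static void __change_pid(struct task *t, struct pid *new)
{
    struct pid *old = t->pid;

    t->pid = new;               /* new == NULL is the detach case */
    put_pid(old);
}

static void detach_pid(struct task *t)               { __change_pid(t, NULL); }
static void change_pid(struct task *t, struct pid *p)
{
    __change_pid(t, p);
    attach_pid(t, p);
}

int main(void)
{
    struct pid *a = calloc(1, sizeof(*a));
    struct pid *b = calloc(1, sizeof(*b));
    struct task t = { NULL };

    a->nr = 100;
    b->nr = 200;

    attach_pid(&t, a);          /* task holds pid 100 */
    change_pid(&t, b);          /* frees pid 100, task now holds pid 200 */
    detach_pid(&t);             /* frees pid 200 */
    return 0;
}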
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
index 6d792b66d854..98702b4b8851 100644
--- a/kernel/pid_namespace.c
+++ b/kernel/pid_namespace.c
@@ -66,7 +66,7 @@ err_alloc:
66 return NULL; 66 return NULL;
67} 67}
68 68
69static struct pid_namespace *create_pid_namespace(int level) 69static struct pid_namespace *create_pid_namespace(unsigned int level)
70{ 70{
71 struct pid_namespace *ns; 71 struct pid_namespace *ns;
72 int i; 72 int i;
@@ -92,7 +92,7 @@ static struct pid_namespace *create_pid_namespace(int level)
92 atomic_set(&ns->pidmap[0].nr_free, BITS_PER_PAGE - 1); 92 atomic_set(&ns->pidmap[0].nr_free, BITS_PER_PAGE - 1);
93 93
94 for (i = 1; i < PIDMAP_ENTRIES; i++) { 94 for (i = 1; i < PIDMAP_ENTRIES; i++) {
95 ns->pidmap[i].page = 0; 95 ns->pidmap[i].page = NULL;
96 atomic_set(&ns->pidmap[i].nr_free, BITS_PER_PAGE); 96 atomic_set(&ns->pidmap[i].nr_free, BITS_PER_PAGE);
97 } 97 }
98 98
diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c
index ae5c6c147c4b..f1525ad06cb3 100644
--- a/kernel/posix-cpu-timers.c
+++ b/kernel/posix-cpu-timers.c
@@ -4,8 +4,9 @@
4 4
5#include <linux/sched.h> 5#include <linux/sched.h>
6#include <linux/posix-timers.h> 6#include <linux/posix-timers.h>
7#include <asm/uaccess.h>
8#include <linux/errno.h> 7#include <linux/errno.h>
8#include <linux/math64.h>
9#include <asm/uaccess.h>
9 10
10static int check_clock(const clockid_t which_clock) 11static int check_clock(const clockid_t which_clock)
11{ 12{
@@ -47,12 +48,10 @@ static void sample_to_timespec(const clockid_t which_clock,
47 union cpu_time_count cpu, 48 union cpu_time_count cpu,
48 struct timespec *tp) 49 struct timespec *tp)
49{ 50{
50 if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED) { 51 if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED)
51 tp->tv_sec = div_long_long_rem(cpu.sched, 52 *tp = ns_to_timespec(cpu.sched);
52 NSEC_PER_SEC, &tp->tv_nsec); 53 else
53 } else {
54 cputime_to_timespec(cpu.cpu, tp); 54 cputime_to_timespec(cpu.cpu, tp);
55 }
56} 55}
57 56
58static inline int cpu_time_before(const clockid_t which_clock, 57static inline int cpu_time_before(const clockid_t which_clock,
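
The posix-cpu-timers hunk above drops the open-coded div_long_long_rem() in favour of ns_to_timespec(). The conversion itself is just a 64-bit divide and remainder; a userspace equivalent for reference (the kernel helper additionally normalizes negative values):

/* Sketch: the nanoseconds -> { seconds, nanoseconds } split performed by
 * ns_to_timespec() in the hunk above. */
#include <stdio.h>
#include <stdint.h>
#include <time.h>

#define NSEC_PER_SEC 1000000000LL

static struct timespec ns_to_ts(int64_t ns)
{
    struct timespec ts;

    ts.tv_sec  = ns / NSEC_PER_SEC;
    ts.tv_nsec = ns % NSEC_PER_SEC;
    return ts;
}

int main(void)
{
    struct timespec ts = ns_to_ts(1500000000LL);    /* 1.5 s */

    printf("%lld.%09ld\n", (long long)ts.tv_sec, ts.tv_nsec);
    return 0;
}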
diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c
index 8476956ffd92..dbd8398ddb0b 100644
--- a/kernel/posix-timers.c
+++ b/kernel/posix-timers.c
@@ -310,8 +310,7 @@ int posix_timer_event(struct k_itimer *timr,int si_private)
310 310
311 if (timr->it_sigev_notify & SIGEV_THREAD_ID) { 311 if (timr->it_sigev_notify & SIGEV_THREAD_ID) {
312 struct task_struct *leader; 312 struct task_struct *leader;
313 int ret = send_sigqueue(timr->it_sigev_signo, timr->sigq, 313 int ret = send_sigqueue(timr->sigq, timr->it_process, 0);
314 timr->it_process);
315 314
316 if (likely(ret >= 0)) 315 if (likely(ret >= 0))
317 return ret; 316 return ret;
@@ -322,8 +321,7 @@ int posix_timer_event(struct k_itimer *timr,int si_private)
322 timr->it_process = leader; 321 timr->it_process = leader;
323 } 322 }
324 323
325 return send_group_sigqueue(timr->it_sigev_signo, timr->sigq, 324 return send_sigqueue(timr->sigq, timr->it_process, 1);
326 timr->it_process);
327} 325}
328EXPORT_SYMBOL_GPL(posix_timer_event); 326EXPORT_SYMBOL_GPL(posix_timer_event);
329 327
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index 6233f3b4ae66..b45da40e8d25 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -19,16 +19,6 @@ config PM
19 will issue the hlt instruction if nothing is to be done, thereby 19 will issue the hlt instruction if nothing is to be done, thereby
20 sending the processor to sleep and saving power. 20 sending the processor to sleep and saving power.
21 21
22config PM_LEGACY
23 bool "Legacy Power Management API (DEPRECATED)"
24 depends on PM
25 default n
26 ---help---
27 Support for pm_register() and friends. This old API is obsoleted
28 by the driver model.
29
30 If unsure, say N.
31
32config PM_DEBUG 22config PM_DEBUG
33 bool "Power Management Debug Support" 23 bool "Power Management Debug Support"
34 depends on PM 24 depends on PM
diff --git a/kernel/power/Makefile b/kernel/power/Makefile
index f7dfff28ecdb..597823b5b700 100644
--- a/kernel/power/Makefile
+++ b/kernel/power/Makefile
@@ -4,7 +4,6 @@ EXTRA_CFLAGS += -DDEBUG
4endif 4endif
5 5
6obj-y := main.o 6obj-y := main.o
7obj-$(CONFIG_PM_LEGACY) += pm.o
8obj-$(CONFIG_PM_SLEEP) += process.o console.o 7obj-$(CONFIG_PM_SLEEP) += process.o console.o
9obj-$(CONFIG_HIBERNATION) += swsusp.o disk.o snapshot.o swap.o user.o 8obj-$(CONFIG_HIBERNATION) += swsusp.o disk.o snapshot.o swap.o user.o
10 9
diff --git a/kernel/power/pm.c b/kernel/power/pm.c
deleted file mode 100644
index 60c73fa670d5..000000000000
--- a/kernel/power/pm.c
+++ /dev/null
@@ -1,205 +0,0 @@
1/*
2 * pm.c - Power management interface
3 *
4 * Copyright (C) 2000 Andrew Henroid
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
19 */
20#include <linux/init.h>
21#include <linux/module.h>
22#include <linux/spinlock.h>
23#include <linux/mm.h>
24#include <linux/slab.h>
25#include <linux/pm.h>
26#include <linux/pm_legacy.h>
27#include <linux/interrupt.h>
28#include <linux/mutex.h>
29
30/*
31 * Locking notes:
32 * pm_devs_lock can be a semaphore providing pm ops are not called
33 * from an interrupt handler (already a bad idea so no change here). Each
34 * change must be protected so that an unlink of an entry doesn't clash
35 * with a pm send - which is permitted to sleep in the current architecture
36 *
37 * Module unloads clashing with pm events now work out safely, the module
38 * unload path will block until the event has been sent. It may well block
39 * until a resume but that will be fine.
40 */
41
42static DEFINE_MUTEX(pm_devs_lock);
43static LIST_HEAD(pm_devs);
44
45/**
46 * pm_register - register a device with power management
47 * @type: device type
48 * @id: device ID
49 * @callback: callback function
50 *
51 * Add a device to the list of devices that wish to be notified about
52 * power management events. A &pm_dev structure is returned on success,
53 * on failure the return is %NULL.
54 *
55 * The callback function will be called in process context and
56 * it may sleep.
57 */
58
59struct pm_dev *pm_register(pm_dev_t type,
60 unsigned long id,
61 pm_callback callback)
62{
63 struct pm_dev *dev = kzalloc(sizeof(struct pm_dev), GFP_KERNEL);
64 if (dev) {
65 dev->type = type;
66 dev->id = id;
67 dev->callback = callback;
68
69 mutex_lock(&pm_devs_lock);
70 list_add(&dev->entry, &pm_devs);
71 mutex_unlock(&pm_devs_lock);
72 }
73 return dev;
74}
75
76/**
77 * pm_send - send request to a single device
78 * @dev: device to send to
79 * @rqst: power management request
80 * @data: data for the callback
81 *
82 * Issue a power management request to a given device. The
83 * %PM_SUSPEND and %PM_RESUME events are handled specially. The
84 * data field must hold the intended next state. No call is made
85 * if the state matches.
86 *
87 * BUGS: what stops two power management requests occurring in parallel
88 * and conflicting.
89 *
90 * WARNING: Calling pm_send directly is not generally recommended, in
91 * particular there is no locking against the pm_dev going away. The
92 * caller must maintain all needed locking or have 'inside knowledge'
93 * on the safety. Also remember that this function is not locked against
94 * pm_unregister. This means that you must handle SMP races on callback
95 * execution and unload yourself.
96 */
97
98static int pm_send(struct pm_dev *dev, pm_request_t rqst, void *data)
99{
100 int status = 0;
101 unsigned long prev_state, next_state;
102
103 if (in_interrupt())
104 BUG();
105
106 switch (rqst) {
107 case PM_SUSPEND:
108 case PM_RESUME:
109 prev_state = dev->state;
110 next_state = (unsigned long) data;
111 if (prev_state != next_state) {
112 if (dev->callback)
113 status = (*dev->callback)(dev, rqst, data);
114 if (!status) {
115 dev->state = next_state;
116 dev->prev_state = prev_state;
117 }
118 }
119 else {
120 dev->prev_state = prev_state;
121 }
122 break;
123 default:
124 if (dev->callback)
125 status = (*dev->callback)(dev, rqst, data);
126 break;
127 }
128 return status;
129}
130
131/*
132 * Undo incomplete request
133 */
134static void pm_undo_all(struct pm_dev *last)
135{
136 struct list_head *entry = last->entry.prev;
137 while (entry != &pm_devs) {
138 struct pm_dev *dev = list_entry(entry, struct pm_dev, entry);
139 if (dev->state != dev->prev_state) {
140 /* previous state was zero (running) resume or
141 * previous state was non-zero (suspended) suspend
142 */
143 pm_request_t undo = (dev->prev_state
144 ? PM_SUSPEND:PM_RESUME);
145 pm_send(dev, undo, (void*) dev->prev_state);
146 }
147 entry = entry->prev;
148 }
149}
150
151/**
152 * pm_send_all - send request to all managed devices
153 * @rqst: power management request
154 * @data: data for the callback
155 *
156 * Issue a power management request to a all devices. The
157 * %PM_SUSPEND events are handled specially. Any device is
158 * permitted to fail a suspend by returning a non zero (error)
159 * value from its callback function. If any device vetoes a
160 * suspend request then all other devices that have suspended
161 * during the processing of this request are restored to their
162 * previous state.
163 *
164 * WARNING: This function takes the pm_devs_lock. The lock is not dropped until
165 * the callbacks have completed. This prevents races against pm locking
166 * functions, races against module unload pm_unregister code. It does
167 * mean however that you must not issue pm_ functions within the callback
168 * or you will deadlock and users will hate you.
169 *
170 * Zero is returned on success. If a suspend fails then the status
171 * from the device that vetoes the suspend is returned.
172 *
173 * BUGS: what stops two power management requests occurring in parallel
174 * and conflicting.
175 */
176
177int pm_send_all(pm_request_t rqst, void *data)
178{
179 struct list_head *entry;
180
181 mutex_lock(&pm_devs_lock);
182 entry = pm_devs.next;
183 while (entry != &pm_devs) {
184 struct pm_dev *dev = list_entry(entry, struct pm_dev, entry);
185 if (dev->callback) {
186 int status = pm_send(dev, rqst, data);
187 if (status) {
188 /* return devices to previous state on
189 * failed suspend request
190 */
191 if (rqst == PM_SUSPEND)
192 pm_undo_all(dev);
193 mutex_unlock(&pm_devs_lock);
194 return status;
195 }
196 }
197 entry = entry->next;
198 }
199 mutex_unlock(&pm_devs_lock);
200 return 0;
201}
202
203EXPORT_SYMBOL(pm_register);
204EXPORT_SYMBOL(pm_send_all);
205
diff --git a/kernel/printk.c b/kernel/printk.c
index bdd4ea8c3f2b..8fb01c32aa3b 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -111,6 +111,9 @@ struct console_cmdline
111 char name[8]; /* Name of the driver */ 111 char name[8]; /* Name of the driver */
112 int index; /* Minor dev. to use */ 112 int index; /* Minor dev. to use */
113 char *options; /* Options for the driver */ 113 char *options; /* Options for the driver */
114#ifdef CONFIG_A11Y_BRAILLE_CONSOLE
115 char *brl_options; /* Options for braille driver */
116#endif
114}; 117};
115 118
116#define MAX_CMDLINECONSOLES 8 119#define MAX_CMDLINECONSOLES 8
@@ -808,15 +811,60 @@ static void call_console_drivers(unsigned start, unsigned end)
808 811
809#endif 812#endif
810 813
814static int __add_preferred_console(char *name, int idx, char *options,
815 char *brl_options)
816{
817 struct console_cmdline *c;
818 int i;
819
820 /*
821 * See if this tty is not yet registered, and
822 * if we have a slot free.
823 */
824 for (i = 0; i < MAX_CMDLINECONSOLES && console_cmdline[i].name[0]; i++)
825 if (strcmp(console_cmdline[i].name, name) == 0 &&
826 console_cmdline[i].index == idx) {
827 if (!brl_options)
828 selected_console = i;
829 return 0;
830 }
831 if (i == MAX_CMDLINECONSOLES)
832 return -E2BIG;
833 if (!brl_options)
834 selected_console = i;
835 c = &console_cmdline[i];
836 strlcpy(c->name, name, sizeof(c->name));
837 c->options = options;
838#ifdef CONFIG_A11Y_BRAILLE_CONSOLE
839 c->brl_options = brl_options;
840#endif
841 c->index = idx;
842 return 0;
843}
811/* 844/*
812 * Set up a list of consoles. Called from init/main.c 845 * Set up a list of consoles. Called from init/main.c
813 */ 846 */
814static int __init console_setup(char *str) 847static int __init console_setup(char *str)
815{ 848{
816 char buf[sizeof(console_cmdline[0].name) + 4]; /* 4 for index */ 849 char buf[sizeof(console_cmdline[0].name) + 4]; /* 4 for index */
817 char *s, *options; 850 char *s, *options, *brl_options = NULL;
818 int idx; 851 int idx;
819 852
853#ifdef CONFIG_A11Y_BRAILLE_CONSOLE
854 if (!memcmp(str, "brl,", 4)) {
855 brl_options = "";
856 str += 4;
857 } else if (!memcmp(str, "brl=", 4)) {
858 brl_options = str + 4;
859 str = strchr(brl_options, ',');
860 if (!str) {
861 printk(KERN_ERR "need port name after brl=\n");
862 return 1;
863 }
864 *(str++) = 0;
865 }
866#endif
867
820 /* 868 /*
821 * Decode str into name, index, options. 869 * Decode str into name, index, options.
822 */ 870 */
@@ -841,7 +889,7 @@ static int __init console_setup(char *str)
841 idx = simple_strtoul(s, NULL, 10); 889 idx = simple_strtoul(s, NULL, 10);
842 *s = 0; 890 *s = 0;
843 891
844 add_preferred_console(buf, idx, options); 892 __add_preferred_console(buf, idx, options, brl_options);
845 return 1; 893 return 1;
846} 894}
847__setup("console=", console_setup); 895__setup("console=", console_setup);
@@ -861,28 +909,7 @@ __setup("console=", console_setup);
861 */ 909 */
862int add_preferred_console(char *name, int idx, char *options) 910int add_preferred_console(char *name, int idx, char *options)
863{ 911{
864 struct console_cmdline *c; 912 return __add_preferred_console(name, idx, options, NULL);
865 int i;
866
867 /*
868 * See if this tty is not yet registered, and
869 * if we have a slot free.
870 */
871 for (i = 0; i < MAX_CMDLINECONSOLES && console_cmdline[i].name[0]; i++)
872 if (strcmp(console_cmdline[i].name, name) == 0 &&
873 console_cmdline[i].index == idx) {
874 selected_console = i;
875 return 0;
876 }
877 if (i == MAX_CMDLINECONSOLES)
878 return -E2BIG;
879 selected_console = i;
880 c = &console_cmdline[i];
881 memcpy(c->name, name, sizeof(c->name));
882 c->name[sizeof(c->name) - 1] = 0;
883 c->options = options;
884 c->index = idx;
885 return 0;
886} 913}
887 914
888int update_console_cmdline(char *name, int idx, char *name_new, int idx_new, char *options) 915int update_console_cmdline(char *name, int idx, char *name_new, int idx_new, char *options)
@@ -894,7 +921,7 @@ int update_console_cmdline(char *name, int idx, char *name_new, int idx_new, cha
894 if (strcmp(console_cmdline[i].name, name) == 0 && 921 if (strcmp(console_cmdline[i].name, name) == 0 &&
895 console_cmdline[i].index == idx) { 922 console_cmdline[i].index == idx) {
896 c = &console_cmdline[i]; 923 c = &console_cmdline[i];
897 memcpy(c->name, name_new, sizeof(c->name)); 924 strlcpy(c->name, name_new, sizeof(c->name));
898 c->name[sizeof(c->name) - 1] = 0; 925 c->name[sizeof(c->name) - 1] = 0;
899 c->options = options; 926 c->options = options;
900 c->index = idx_new; 927 c->index = idx_new;
@@ -1163,6 +1190,16 @@ void register_console(struct console *console)
1163 continue; 1190 continue;
1164 if (console->index < 0) 1191 if (console->index < 0)
1165 console->index = console_cmdline[i].index; 1192 console->index = console_cmdline[i].index;
1193#ifdef CONFIG_A11Y_BRAILLE_CONSOLE
1194 if (console_cmdline[i].brl_options) {
1195 console->flags |= CON_BRL;
1196 braille_register_console(console,
1197 console_cmdline[i].index,
1198 console_cmdline[i].options,
1199 console_cmdline[i].brl_options);
1200 return;
1201 }
1202#endif
1166 if (console->setup && 1203 if (console->setup &&
1167 console->setup(console, console_cmdline[i].options) != 0) 1204 console->setup(console, console_cmdline[i].options) != 0)
1168 break; 1205 break;
@@ -1221,6 +1258,11 @@ int unregister_console(struct console *console)
1221 struct console *a, *b; 1258 struct console *a, *b;
1222 int res = 1; 1259 int res = 1;
1223 1260
1261#ifdef CONFIG_A11Y_BRAILLE_CONSOLE
1262 if (console->flags & CON_BRL)
1263 return braille_unregister_console(console);
1264#endif
1265
1224 acquire_console_sem(); 1266 acquire_console_sem();
1225 if (console_drivers == console) { 1267 if (console_drivers == console) {
1226 console_drivers=console->next; 1268 console_drivers=console->next;
@@ -1272,8 +1314,8 @@ late_initcall(disable_boot_consoles);
1272 */ 1314 */
1273void tty_write_message(struct tty_struct *tty, char *msg) 1315void tty_write_message(struct tty_struct *tty, char *msg)
1274{ 1316{
1275 if (tty && tty->driver->write) 1317 if (tty && tty->ops->write)
1276 tty->driver->write(tty, msg, strlen(msg)); 1318 tty->ops->write(tty, msg, strlen(msg));
1277 return; 1319 return;
1278} 1320}
1279 1321
@@ -1287,31 +1329,7 @@ void tty_write_message(struct tty_struct *tty, char *msg)
1287 */ 1329 */
1288int __printk_ratelimit(int ratelimit_jiffies, int ratelimit_burst) 1330int __printk_ratelimit(int ratelimit_jiffies, int ratelimit_burst)
1289{ 1331{
1290 static DEFINE_SPINLOCK(ratelimit_lock); 1332 return __ratelimit(ratelimit_jiffies, ratelimit_burst);
1291 static unsigned toks = 10 * 5 * HZ;
1292 static unsigned long last_msg;
1293 static int missed;
1294 unsigned long flags;
1295 unsigned long now = jiffies;
1296
1297 spin_lock_irqsave(&ratelimit_lock, flags);
1298 toks += now - last_msg;
1299 last_msg = now;
1300 if (toks > (ratelimit_burst * ratelimit_jiffies))
1301 toks = ratelimit_burst * ratelimit_jiffies;
1302 if (toks >= ratelimit_jiffies) {
1303 int lost = missed;
1304
1305 missed = 0;
1306 toks -= ratelimit_jiffies;
1307 spin_unlock_irqrestore(&ratelimit_lock, flags);
1308 if (lost)
1309 printk(KERN_WARNING "printk: %d messages suppressed.\n", lost);
1310 return 1;
1311 }
1312 missed++;
1313 spin_unlock_irqrestore(&ratelimit_lock, flags);
1314 return 0;
1315} 1333}
1316EXPORT_SYMBOL(__printk_ratelimit); 1334EXPORT_SYMBOL(__printk_ratelimit);
1317 1335
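
The final printk.c hunk replaces the open-coded token bucket in __printk_ratelimit() with a call to a shared __ratelimit() helper. The removed logic is a classic token-bucket rate limiter; a standalone userspace sketch of the same idea, using seconds in place of jiffies and with no locking:

/* Sketch: token-bucket rate limiting as __printk_ratelimit() used to
 * open-code (and now delegates).  Not the kernel API. */
#include <stdio.h>
#include <time.h>

static int ratelimit(long interval, long burst)
{
    static long toks;           /* accumulated credit, in seconds */
    static time_t last;
    static int missed, init;
    time_t now = time(NULL);

    if (!init) {                /* start with a full bucket */
        toks = burst * interval;
        last = now;
        init = 1;
    }

    toks += now - last;
    last = now;
    if (toks > burst * interval)
        toks = burst * interval;        /* cap the bucket */

    if (toks >= interval) {
        toks -= interval;
        if (missed)
            printf("%d messages suppressed\n", missed);
        missed = 0;
        return 1;                       /* caller may print */
    }
    missed++;
    return 0;                           /* suppressed */
}

int main(void)
{
    for (int i = 0; i < 10; i++)
        if (ratelimit(5, 2))            /* at most a burst of 2 per 5 s */
            printf("message %d\n", i);
    return 0;
}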
diff --git a/kernel/profile.c b/kernel/profile.c
index 606d7387265c..ae7ead82cbc9 100644
--- a/kernel/profile.c
+++ b/kernel/profile.c
@@ -587,10 +587,10 @@ static int __init create_proc_profile(void)
587 return 0; 587 return 0;
588 if (create_hash_tables()) 588 if (create_hash_tables())
589 return -1; 589 return -1;
590 entry = create_proc_entry("profile", S_IWUSR | S_IRUGO, NULL); 590 entry = proc_create("profile", S_IWUSR | S_IRUGO,
591 NULL, &proc_profile_operations);
591 if (!entry) 592 if (!entry)
592 return 0; 593 return 0;
593 entry->proc_fops = &proc_profile_operations;
594 entry->size = (1+prof_len) * sizeof(atomic_t); 594 entry->size = (1+prof_len) * sizeof(atomic_t);
595 hotcpu_notifier(profile_cpu_callback, 0); 595 hotcpu_notifier(profile_cpu_callback, 0);
596 return 0; 596 return 0;
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 67e392ed5496..6c19e94fd0a5 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -73,7 +73,7 @@ void __ptrace_unlink(struct task_struct *child)
73 BUG_ON(!child->ptrace); 73 BUG_ON(!child->ptrace);
74 74
75 child->ptrace = 0; 75 child->ptrace = 0;
76 if (!list_empty(&child->ptrace_list)) { 76 if (ptrace_reparented(child)) {
77 list_del_init(&child->ptrace_list); 77 list_del_init(&child->ptrace_list);
78 remove_parent(child); 78 remove_parent(child);
79 child->parent = child->real_parent; 79 child->parent = child->real_parent;
@@ -168,8 +168,6 @@ int ptrace_attach(struct task_struct *task)
168 audit_ptrace(task); 168 audit_ptrace(task);
169 169
170 retval = -EPERM; 170 retval = -EPERM;
171 if (task->pid <= 1)
172 goto out;
173 if (same_thread_group(task, current)) 171 if (same_thread_group(task, current))
174 goto out; 172 goto out;
175 173
@@ -208,8 +206,7 @@ repeat:
208 206
209 __ptrace_link(task, current); 207 __ptrace_link(task, current);
210 208
211 force_sig_specific(SIGSTOP, task); 209 send_sig_info(SIGSTOP, SEND_SIG_FORCED, task);
212
213bad: 210bad:
214 write_unlock_irqrestore(&tasklist_lock, flags); 211 write_unlock_irqrestore(&tasklist_lock, flags);
215 task_unlock(task); 212 task_unlock(task);
@@ -522,12 +519,6 @@ struct task_struct *ptrace_get_task_struct(pid_t pid)
522{ 519{
523 struct task_struct *child; 520 struct task_struct *child;
524 521
525 /*
526 * Tracing init is not allowed.
527 */
528 if (pid == 1)
529 return ERR_PTR(-EPERM);
530
531 read_lock(&tasklist_lock); 522 read_lock(&tasklist_lock);
532 child = find_task_by_vpid(pid); 523 child = find_task_by_vpid(pid);
533 if (child) 524 if (child)
@@ -543,7 +534,6 @@ struct task_struct *ptrace_get_task_struct(pid_t pid)
543#define arch_ptrace_attach(child) do { } while (0) 534#define arch_ptrace_attach(child) do { } while (0)
544#endif 535#endif
545 536
546#ifndef __ARCH_SYS_PTRACE
547asmlinkage long sys_ptrace(long request, long pid, long addr, long data) 537asmlinkage long sys_ptrace(long request, long pid, long addr, long data)
548{ 538{
549 struct task_struct *child; 539 struct task_struct *child;
@@ -591,7 +581,6 @@ asmlinkage long sys_ptrace(long request, long pid, long addr, long data)
591 unlock_kernel(); 581 unlock_kernel();
592 return ret; 582 return ret;
593} 583}
594#endif /* __ARCH_SYS_PTRACE */
595 584
596int generic_ptrace_peekdata(struct task_struct *tsk, long addr, long data) 585int generic_ptrace_peekdata(struct task_struct *tsk, long addr, long data)
597{ 586{
@@ -612,7 +601,7 @@ int generic_ptrace_pokedata(struct task_struct *tsk, long addr, long data)
612 return (copied == sizeof(data)) ? 0 : -EIO; 601 return (copied == sizeof(data)) ? 0 : -EIO;
613} 602}
614 603
615#ifdef CONFIG_COMPAT 604#if defined CONFIG_COMPAT && defined __ARCH_WANT_COMPAT_SYS_PTRACE
616#include <linux/compat.h> 605#include <linux/compat.h>
617 606
618int compat_ptrace_request(struct task_struct *child, compat_long_t request, 607int compat_ptrace_request(struct task_struct *child, compat_long_t request,
@@ -667,7 +656,6 @@ int compat_ptrace_request(struct task_struct *child, compat_long_t request,
667 return ret; 656 return ret;
668} 657}
669 658
670#ifdef __ARCH_WANT_COMPAT_SYS_PTRACE
671asmlinkage long compat_sys_ptrace(compat_long_t request, compat_long_t pid, 659asmlinkage long compat_sys_ptrace(compat_long_t request, compat_long_t pid,
672 compat_long_t addr, compat_long_t data) 660 compat_long_t addr, compat_long_t data)
673{ 661{
@@ -710,6 +698,4 @@ asmlinkage long compat_sys_ptrace(compat_long_t request, compat_long_t pid,
710 unlock_kernel(); 698 unlock_kernel();
711 return ret; 699 return ret;
712} 700}
713#endif /* __ARCH_WANT_COMPAT_SYS_PTRACE */ 701#endif /* CONFIG_COMPAT && __ARCH_WANT_COMPAT_SYS_PTRACE */
714
715#endif /* CONFIG_COMPAT */
diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c
index 47894f919d4e..33acc424667e 100644
--- a/kernel/rcutorture.c
+++ b/kernel/rcutorture.c
@@ -45,6 +45,7 @@
45#include <linux/byteorder/swabb.h> 45#include <linux/byteorder/swabb.h>
46#include <linux/stat.h> 46#include <linux/stat.h>
47#include <linux/srcu.h> 47#include <linux/srcu.h>
48#include <linux/slab.h>
48 49
49MODULE_LICENSE("GPL"); 50MODULE_LICENSE("GPL");
50MODULE_AUTHOR("Paul E. McKenney <paulmck@us.ibm.com> and " 51MODULE_AUTHOR("Paul E. McKenney <paulmck@us.ibm.com> and "
diff --git a/kernel/relay.c b/kernel/relay.c
index d6204a485818..7de644cdec43 100644
--- a/kernel/relay.c
+++ b/kernel/relay.c
@@ -65,6 +65,35 @@ static struct vm_operations_struct relay_file_mmap_ops = {
65 .close = relay_file_mmap_close, 65 .close = relay_file_mmap_close,
66}; 66};
67 67
68/*
69 * allocate an array of pointers of struct page
70 */
71static struct page **relay_alloc_page_array(unsigned int n_pages)
72{
73 struct page **array;
74 size_t pa_size = n_pages * sizeof(struct page *);
75
76 if (pa_size > PAGE_SIZE) {
77 array = vmalloc(pa_size);
78 if (array)
79 memset(array, 0, pa_size);
80 } else {
81 array = kzalloc(pa_size, GFP_KERNEL);
82 }
83 return array;
84}
85
86/*
87 * free an array of pointers of struct page
88 */
89static void relay_free_page_array(struct page **array)
90{
91 if (is_vmalloc_addr(array))
92 vfree(array);
93 else
94 kfree(array);
95}
96
68/** 97/**
69 * relay_mmap_buf: - mmap channel buffer to process address space 98 * relay_mmap_buf: - mmap channel buffer to process address space
70 * @buf: relay channel buffer 99 * @buf: relay channel buffer
@@ -109,7 +138,7 @@ static void *relay_alloc_buf(struct rchan_buf *buf, size_t *size)
109 *size = PAGE_ALIGN(*size); 138 *size = PAGE_ALIGN(*size);
110 n_pages = *size >> PAGE_SHIFT; 139 n_pages = *size >> PAGE_SHIFT;
111 140
112 buf->page_array = kcalloc(n_pages, sizeof(struct page *), GFP_KERNEL); 141 buf->page_array = relay_alloc_page_array(n_pages);
113 if (!buf->page_array) 142 if (!buf->page_array)
114 return NULL; 143 return NULL;
115 144
@@ -130,7 +159,7 @@ static void *relay_alloc_buf(struct rchan_buf *buf, size_t *size)
130depopulate: 159depopulate:
131 for (j = 0; j < i; j++) 160 for (j = 0; j < i; j++)
132 __free_page(buf->page_array[j]); 161 __free_page(buf->page_array[j]);
133 kfree(buf->page_array); 162 relay_free_page_array(buf->page_array);
134 return NULL; 163 return NULL;
135} 164}
136 165
@@ -189,7 +218,7 @@ static void relay_destroy_buf(struct rchan_buf *buf)
189 vunmap(buf->start); 218 vunmap(buf->start);
190 for (i = 0; i < buf->page_count; i++) 219 for (i = 0; i < buf->page_count; i++)
191 __free_page(buf->page_array[i]); 220 __free_page(buf->page_array[i]);
192 kfree(buf->page_array); 221 relay_free_page_array(buf->page_array);
193 } 222 }
194 chan->buf[buf->cpu] = NULL; 223 chan->buf[buf->cpu] = NULL;
195 kfree(buf->padding); 224 kfree(buf->padding);
@@ -1162,7 +1191,7 @@ static ssize_t relay_file_splice_read(struct file *in,
1162 ret = 0; 1191 ret = 0;
1163 spliced = 0; 1192 spliced = 0;
1164 1193
1165 while (len) { 1194 while (len && !spliced) {
1166 ret = subbuf_splice_actor(in, ppos, pipe, len, flags, &nonpad_ret); 1195 ret = subbuf_splice_actor(in, ppos, pipe, len, flags, &nonpad_ret);
1167 if (ret < 0) 1196 if (ret < 0)
1168 break; 1197 break;
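
relay_alloc_page_array() above switches to vmalloc() once the page-pointer array would exceed a page, and relay_free_page_array() picks the matching deallocator via is_vmalloc_addr(). Userspace has no such split, so this sketch only mimics the shape of the decision: choose a path by size at allocation time and make the free side honour it; the threshold and tagging are purely illustrative:

/* Sketch: pick an allocation path by size, free via the matching path. */
#include <stdio.h>
#include <stdlib.h>

#define SMALL_LIMIT 4096        /* stand-in for PAGE_SIZE */

struct tagged_buf {
    int big;                    /* which path produced the buffer */
    void *mem;
};

static struct tagged_buf alloc_array(size_t n, size_t elem)
{
    struct tagged_buf b = { 0, NULL };

    b.big = n * elem > SMALL_LIMIT;
    /* Both paths use calloc here; the tag only records which branch the
     * kernel code would have taken (vmalloc vs kzalloc). */
    b.mem = calloc(n, elem);
    return b;
}

static void free_array(struct tagged_buf *b)
{
    printf("freeing via %s path\n", b->big ? "vmalloc" : "kmalloc");
    free(b->mem);
    b->mem = NULL;
}

int main(void)
{
    struct tagged_buf small = alloc_array(16, sizeof(void *));
    struct tagged_buf big   = alloc_array(100000, sizeof(void *));

    free_array(&small);         /* "kmalloc" path */
    free_array(&big);           /* "vmalloc" path */
    return 0;
}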
diff --git a/kernel/res_counter.c b/kernel/res_counter.c
index efbfc0fc232f..d3c61b4ebef2 100644
--- a/kernel/res_counter.c
+++ b/kernel/res_counter.c
@@ -10,6 +10,7 @@
10#include <linux/types.h> 10#include <linux/types.h>
11#include <linux/parser.h> 11#include <linux/parser.h>
12#include <linux/fs.h> 12#include <linux/fs.h>
13#include <linux/slab.h>
13#include <linux/res_counter.h> 14#include <linux/res_counter.h>
14#include <linux/uaccess.h> 15#include <linux/uaccess.h>
15 16
@@ -27,6 +28,8 @@ int res_counter_charge_locked(struct res_counter *counter, unsigned long val)
27 } 28 }
28 29
29 counter->usage += val; 30 counter->usage += val;
31 if (counter->usage > counter->max_usage)
32 counter->max_usage = counter->usage;
30 return 0; 33 return 0;
31} 34}
32 35
@@ -65,6 +68,8 @@ res_counter_member(struct res_counter *counter, int member)
65 switch (member) { 68 switch (member) {
66 case RES_USAGE: 69 case RES_USAGE:
67 return &counter->usage; 70 return &counter->usage;
71 case RES_MAX_USAGE:
72 return &counter->max_usage;
68 case RES_LIMIT: 73 case RES_LIMIT:
69 return &counter->limit; 74 return &counter->limit;
70 case RES_FAILCNT: 75 case RES_FAILCNT:
@@ -92,6 +97,11 @@ ssize_t res_counter_read(struct res_counter *counter, int member,
92 pos, buf, s - buf); 97 pos, buf, s - buf);
93} 98}
94 99
100u64 res_counter_read_u64(struct res_counter *counter, int member)
101{
102 return *res_counter_member(counter, member);
103}
104
95ssize_t res_counter_write(struct res_counter *counter, int member, 105ssize_t res_counter_write(struct res_counter *counter, int member,
96 const char __user *userbuf, size_t nbytes, loff_t *pos, 106 const char __user *userbuf, size_t nbytes, loff_t *pos,
97 int (*write_strategy)(char *st_buf, unsigned long long *val)) 107 int (*write_strategy)(char *st_buf, unsigned long long *val))
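
The res_counter hunks add a max_usage high-water mark, updated in res_counter_charge_locked(), plus res_counter_read_u64() to expose it. A hedged userspace sketch of a counter with usage, peak, limit and failure count; no locking here, whereas the kernel version runs under counter->lock:

/* Sketch: a resource counter with limit, usage, high-water mark and
 * failure count, mirroring the fields touched above. */
#include <stdio.h>

struct res_counter {
    unsigned long usage;
    unsigned long max_usage;    /* high-water mark added by the patch */
    unsigned long limit;
    unsigned long failcnt;
};

static int res_counter_charge(struct res_counter *c, unsigned long val)
{
    if (c->usage + val > c->limit) {
        c->failcnt++;
        return -1;              /* the kernel returns -ENOMEM here */
    }
    c->usage += val;
    if (c->usage > c->max_usage)
        c->max_usage = c->usage;
    return 0;
}

static void res_counter_uncharge(struct res_counter *c, unsigned long val)
{
    c->usage -= val;            /* max_usage deliberately keeps the peak */
}

int main(void)
{
    struct res_counter c = { .limit = 100 };

    res_counter_charge(&c, 60);
    res_counter_charge(&c, 30);
    res_counter_uncharge(&c, 50);
    res_counter_charge(&c, 70);          /* fails: would exceed the limit */

    printf("usage=%lu max=%lu failcnt=%lu\n",
           c.usage, c.max_usage, c.failcnt);   /* usage=40 max=90 failcnt=1 */
    return 0;
}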
diff --git a/kernel/resource.c b/kernel/resource.c
index cee12cc47cab..74af2d7cb5a1 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -131,14 +131,8 @@ static const struct file_operations proc_iomem_operations = {
131 131
132static int __init ioresources_init(void) 132static int __init ioresources_init(void)
133{ 133{
134 struct proc_dir_entry *entry; 134 proc_create("ioports", 0, NULL, &proc_ioports_operations);
135 135 proc_create("iomem", 0, NULL, &proc_iomem_operations);
136 entry = create_proc_entry("ioports", 0, NULL);
137 if (entry)
138 entry->proc_fops = &proc_ioports_operations;
139 entry = create_proc_entry("iomem", 0, NULL);
140 if (entry)
141 entry->proc_fops = &proc_iomem_operations;
142 return 0; 136 return 0;
143} 137}
144__initcall(ioresources_init); 138__initcall(ioresources_init);
diff --git a/kernel/sched.c b/kernel/sched.c
index 740fb409e5bb..58fb8af15776 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -75,16 +75,6 @@
75#include <asm/irq_regs.h> 75#include <asm/irq_regs.h>
76 76
77/* 77/*
78 * Scheduler clock - returns current time in nanosec units.
79 * This is default implementation.
80 * Architectures and sub-architectures can override this.
81 */
82unsigned long long __attribute__((weak)) sched_clock(void)
83{
84 return (unsigned long long)jiffies * (NSEC_PER_SEC / HZ);
85}
86
87/*
88 * Convert user-nice values [ -20 ... 0 ... 19 ] 78 * Convert user-nice values [ -20 ... 0 ... 19 ]
89 * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ], 79 * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ],
90 * and back. 80 * and back.
@@ -242,6 +232,12 @@ static void destroy_rt_bandwidth(struct rt_bandwidth *rt_b)
242} 232}
243#endif 233#endif
244 234
235/*
236 * sched_domains_mutex serializes calls to arch_init_sched_domains,
237 * detach_destroy_domains and partition_sched_domains.
238 */
239static DEFINE_MUTEX(sched_domains_mutex);
240
245#ifdef CONFIG_GROUP_SCHED 241#ifdef CONFIG_GROUP_SCHED
246 242
247#include <linux/cgroup.h> 243#include <linux/cgroup.h>
@@ -308,9 +304,6 @@ static DEFINE_PER_CPU(struct rt_rq, init_rt_rq) ____cacheline_aligned_in_smp;
308 */ 304 */
309static DEFINE_SPINLOCK(task_group_lock); 305static DEFINE_SPINLOCK(task_group_lock);
310 306
311/* doms_cur_mutex serializes access to doms_cur[] array */
312static DEFINE_MUTEX(doms_cur_mutex);
313
314#ifdef CONFIG_FAIR_GROUP_SCHED 307#ifdef CONFIG_FAIR_GROUP_SCHED
315#ifdef CONFIG_USER_SCHED 308#ifdef CONFIG_USER_SCHED
316# define INIT_TASK_GROUP_LOAD (2*NICE_0_LOAD) 309# define INIT_TASK_GROUP_LOAD (2*NICE_0_LOAD)
@@ -318,7 +311,13 @@ static DEFINE_MUTEX(doms_cur_mutex);
318# define INIT_TASK_GROUP_LOAD NICE_0_LOAD 311# define INIT_TASK_GROUP_LOAD NICE_0_LOAD
319#endif 312#endif
320 313
314/*
315 * A weight of 0, 1 or ULONG_MAX can cause arithmetics problems.
316 * (The default weight is 1024 - so there's no practical
317 * limitation from this.)
318 */
321#define MIN_SHARES 2 319#define MIN_SHARES 2
320#define MAX_SHARES (ULONG_MAX - 1)
322 321
323static int init_task_group_load = INIT_TASK_GROUP_LOAD; 322static int init_task_group_load = INIT_TASK_GROUP_LOAD;
324#endif 323#endif
@@ -358,21 +357,9 @@ static inline void set_task_rq(struct task_struct *p, unsigned int cpu)
358#endif 357#endif
359} 358}
360 359
361static inline void lock_doms_cur(void)
362{
363 mutex_lock(&doms_cur_mutex);
364}
365
366static inline void unlock_doms_cur(void)
367{
368 mutex_unlock(&doms_cur_mutex);
369}
370
371#else 360#else
372 361
373static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { } 362static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { }
374static inline void lock_doms_cur(void) { }
375static inline void unlock_doms_cur(void) { }
376 363
377#endif /* CONFIG_GROUP_SCHED */ 364#endif /* CONFIG_GROUP_SCHED */
378 365
@@ -560,13 +547,7 @@ struct rq {
560 unsigned long next_balance; 547 unsigned long next_balance;
561 struct mm_struct *prev_mm; 548 struct mm_struct *prev_mm;
562 549
563 u64 clock, prev_clock_raw; 550 u64 clock;
564 s64 clock_max_delta;
565
566 unsigned int clock_warps, clock_overflows, clock_underflows;
567 u64 idle_clock;
568 unsigned int clock_deep_idle_events;
569 u64 tick_timestamp;
570 551
571 atomic_t nr_iowait; 552 atomic_t nr_iowait;
572 553
@@ -631,82 +612,6 @@ static inline int cpu_of(struct rq *rq)
631#endif 612#endif
632} 613}
633 614
634#ifdef CONFIG_NO_HZ
635static inline bool nohz_on(int cpu)
636{
637 return tick_get_tick_sched(cpu)->nohz_mode != NOHZ_MODE_INACTIVE;
638}
639
640static inline u64 max_skipped_ticks(struct rq *rq)
641{
642 return nohz_on(cpu_of(rq)) ? jiffies - rq->last_tick_seen + 2 : 1;
643}
644
645static inline void update_last_tick_seen(struct rq *rq)
646{
647 rq->last_tick_seen = jiffies;
648}
649#else
650static inline u64 max_skipped_ticks(struct rq *rq)
651{
652 return 1;
653}
654
655static inline void update_last_tick_seen(struct rq *rq)
656{
657}
658#endif
659
660/*
661 * Update the per-runqueue clock, as finegrained as the platform can give
662 * us, but without assuming monotonicity, etc.:
663 */
664static void __update_rq_clock(struct rq *rq)
665{
666 u64 prev_raw = rq->prev_clock_raw;
667 u64 now = sched_clock();
668 s64 delta = now - prev_raw;
669 u64 clock = rq->clock;
670
671#ifdef CONFIG_SCHED_DEBUG
672 WARN_ON_ONCE(cpu_of(rq) != smp_processor_id());
673#endif
674 /*
675 * Protect against sched_clock() occasionally going backwards:
676 */
677 if (unlikely(delta < 0)) {
678 clock++;
679 rq->clock_warps++;
680 } else {
681 /*
682 * Catch too large forward jumps too:
683 */
684 u64 max_jump = max_skipped_ticks(rq) * TICK_NSEC;
685 u64 max_time = rq->tick_timestamp + max_jump;
686
687 if (unlikely(clock + delta > max_time)) {
688 if (clock < max_time)
689 clock = max_time;
690 else
691 clock++;
692 rq->clock_overflows++;
693 } else {
694 if (unlikely(delta > rq->clock_max_delta))
695 rq->clock_max_delta = delta;
696 clock += delta;
697 }
698 }
699
700 rq->prev_clock_raw = now;
701 rq->clock = clock;
702}
703
704static void update_rq_clock(struct rq *rq)
705{
706 if (likely(smp_processor_id() == cpu_of(rq)))
707 __update_rq_clock(rq);
708}
709
710/* 615/*
711 * The domain tree (rq->sd) is protected by RCU's quiescent state transition. 616 * The domain tree (rq->sd) is protected by RCU's quiescent state transition.
712 * See detach_destroy_domains: synchronize_sched for details. 617 * See detach_destroy_domains: synchronize_sched for details.
@@ -722,6 +627,11 @@ static void update_rq_clock(struct rq *rq)
722#define task_rq(p) cpu_rq(task_cpu(p)) 627#define task_rq(p) cpu_rq(task_cpu(p))
723#define cpu_curr(cpu) (cpu_rq(cpu)->curr) 628#define cpu_curr(cpu) (cpu_rq(cpu)->curr)
724 629
630static inline void update_rq_clock(struct rq *rq)
631{
632 rq->clock = sched_clock_cpu(cpu_of(rq));
633}
634
725/* 635/*
726 * Tunables that become constants when CONFIG_SCHED_DEBUG is off: 636 * Tunables that become constants when CONFIG_SCHED_DEBUG is off:
727 */ 637 */
@@ -757,14 +667,14 @@ const_debug unsigned int sysctl_sched_features =
757#define SCHED_FEAT(name, enabled) \ 667#define SCHED_FEAT(name, enabled) \
758 #name , 668 #name ,
759 669
760__read_mostly char *sched_feat_names[] = { 670static __read_mostly char *sched_feat_names[] = {
761#include "sched_features.h" 671#include "sched_features.h"
762 NULL 672 NULL
763}; 673};
764 674
765#undef SCHED_FEAT 675#undef SCHED_FEAT
766 676
767int sched_feat_open(struct inode *inode, struct file *filp) 677static int sched_feat_open(struct inode *inode, struct file *filp)
768{ 678{
769 filp->private_data = inode->i_private; 679 filp->private_data = inode->i_private;
770 return 0; 680 return 0;
@@ -899,7 +809,7 @@ static inline u64 global_rt_runtime(void)
899 return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC; 809 return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC;
900} 810}
901 811
902static const unsigned long long time_sync_thresh = 100000; 812unsigned long long time_sync_thresh = 100000;
903 813
904static DEFINE_PER_CPU(unsigned long long, time_offset); 814static DEFINE_PER_CPU(unsigned long long, time_offset);
905static DEFINE_PER_CPU(unsigned long long, prev_cpu_time); 815static DEFINE_PER_CPU(unsigned long long, prev_cpu_time);
@@ -913,11 +823,14 @@ static DEFINE_PER_CPU(unsigned long long, prev_cpu_time);
913static DEFINE_SPINLOCK(time_sync_lock); 823static DEFINE_SPINLOCK(time_sync_lock);
914static unsigned long long prev_global_time; 824static unsigned long long prev_global_time;
915 825
916static unsigned long long __sync_cpu_clock(cycles_t time, int cpu) 826static unsigned long long __sync_cpu_clock(unsigned long long time, int cpu)
917{ 827{
918 unsigned long flags; 828 /*
919 829 * We want this inlined, to not get tracer function calls
920 spin_lock_irqsave(&time_sync_lock, flags); 830 * in this critical section:
831 */
832 spin_acquire(&time_sync_lock.dep_map, 0, 0, _THIS_IP_);
833 __raw_spin_lock(&time_sync_lock.raw_lock);
921 834
922 if (time < prev_global_time) { 835 if (time < prev_global_time) {
923 per_cpu(time_offset, cpu) += prev_global_time - time; 836 per_cpu(time_offset, cpu) += prev_global_time - time;
@@ -926,7 +839,8 @@ static unsigned long long __sync_cpu_clock(cycles_t time, int cpu)
926 prev_global_time = time; 839 prev_global_time = time;
927 } 840 }
928 841
929 spin_unlock_irqrestore(&time_sync_lock, flags); 842 __raw_spin_unlock(&time_sync_lock.raw_lock);
843 spin_release(&time_sync_lock.dep_map, 1, _THIS_IP_);
930 844
931 return time; 845 return time;
932} 846}
@@ -934,8 +848,6 @@ static unsigned long long __sync_cpu_clock(cycles_t time, int cpu)
934static unsigned long long __cpu_clock(int cpu) 848static unsigned long long __cpu_clock(int cpu)
935{ 849{
936 unsigned long long now; 850 unsigned long long now;
937 unsigned long flags;
938 struct rq *rq;
939 851
940 /* 852 /*
941 * Only call sched_clock() if the scheduler has already been 853 * Only call sched_clock() if the scheduler has already been
@@ -944,11 +856,7 @@ static unsigned long long __cpu_clock(int cpu)
944 if (unlikely(!scheduler_running)) 856 if (unlikely(!scheduler_running))
945 return 0; 857 return 0;
946 858
947 local_irq_save(flags); 859 now = sched_clock_cpu(cpu);
948 rq = cpu_rq(cpu);
949 update_rq_clock(rq);
950 now = rq->clock;
951 local_irq_restore(flags);
952 860
953 return now; 861 return now;
954} 862}
@@ -960,13 +868,18 @@ static unsigned long long __cpu_clock(int cpu)
960unsigned long long cpu_clock(int cpu) 868unsigned long long cpu_clock(int cpu)
961{ 869{
962 unsigned long long prev_cpu_time, time, delta_time; 870 unsigned long long prev_cpu_time, time, delta_time;
871 unsigned long flags;
963 872
873 local_irq_save(flags);
964 prev_cpu_time = per_cpu(prev_cpu_time, cpu); 874 prev_cpu_time = per_cpu(prev_cpu_time, cpu);
965 time = __cpu_clock(cpu) + per_cpu(time_offset, cpu); 875 time = __cpu_clock(cpu) + per_cpu(time_offset, cpu);
966 delta_time = time-prev_cpu_time; 876 delta_time = time-prev_cpu_time;
967 877
968 if (unlikely(delta_time > time_sync_thresh)) 878 if (unlikely(delta_time > time_sync_thresh)) {
969 time = __sync_cpu_clock(time, cpu); 879 time = __sync_cpu_clock(time, cpu);
880 per_cpu(prev_cpu_time, cpu) = time;
881 }
882 local_irq_restore(flags);
970 883
971 return time; 884 return time;
972} 885}
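
The two hunks above rework cpu_clock(): the whole read-adjust-sync sequence now runs with interrupts disabled, and a successful sync also refreshes prev_cpu_time so the time_sync_thresh test is measured against the last synced value. The bookkeeping in __sync_cpu_clock() is easy to lose in the diff, so here is a minimal userspace sketch of just that arithmetic; prev_global_time, cpu_offset and the numbers are stand-ins for the kernel's per-cpu variables, and the global lock is left out.

#include <stdio.h>

/* Last clock value any CPU has reported; the kernel guards this with
 * time_sync_lock, which is omitted here. */
static unsigned long long prev_global_time = 1000000;

/* cpu_offset stands in for per_cpu(time_offset, cpu). */
static unsigned long long sync_cpu_clock(unsigned long long time,
                                         unsigned long long *cpu_offset)
{
        if (time < prev_global_time)
                *cpu_offset += prev_global_time - time; /* catch this CPU up for next time */
        else
                prev_global_time = time;                /* this CPU now defines "latest"   */

        return time;                                    /* the current reading is unchanged */
}

int main(void)
{
        unsigned long long offset = 0;

        /* This CPU's view is 50us behind the last globally reported value: */
        sync_cpu_clock(950000, &offset);
        printf("time_offset is now %llu ns\n", offset);  /* prints 50000 */
        return 0;
}

Note that the value handed back is not adjusted; only later readings on that CPU see the enlarged offset.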
@@ -1117,43 +1030,6 @@ static struct rq *this_rq_lock(void)
1117 return rq; 1030 return rq;
1118} 1031}
1119 1032
1120/*
1121 * We are going deep-idle (irqs are disabled):
1122 */
1123void sched_clock_idle_sleep_event(void)
1124{
1125 struct rq *rq = cpu_rq(smp_processor_id());
1126
1127 spin_lock(&rq->lock);
1128 __update_rq_clock(rq);
1129 spin_unlock(&rq->lock);
1130 rq->clock_deep_idle_events++;
1131}
1132EXPORT_SYMBOL_GPL(sched_clock_idle_sleep_event);
1133
1134/*
1135 * We just idled delta nanoseconds (called with irqs disabled):
1136 */
1137void sched_clock_idle_wakeup_event(u64 delta_ns)
1138{
1139 struct rq *rq = cpu_rq(smp_processor_id());
1140 u64 now = sched_clock();
1141
1142 rq->idle_clock += delta_ns;
1143 /*
1144 * Override the previous timestamp and ignore all
1145 * sched_clock() deltas that occured while we idled,
1146 * and use the PM-provided delta_ns to advance the
1147 * rq clock:
1148 */
1149 spin_lock(&rq->lock);
1150 rq->prev_clock_raw = now;
1151 rq->clock += delta_ns;
1152 spin_unlock(&rq->lock);
1153 touch_softlockup_watchdog();
1154}
1155EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event);
1156
1157static void __resched_task(struct task_struct *p, int tif_bit); 1033static void __resched_task(struct task_struct *p, int tif_bit);
1158 1034
1159static inline void resched_task(struct task_struct *p) 1035static inline void resched_task(struct task_struct *p)
@@ -1189,6 +1065,7 @@ static inline void resched_rq(struct rq *rq)
1189enum { 1065enum {
1190 HRTICK_SET, /* re-program hrtick_timer */ 1066 HRTICK_SET, /* re-program hrtick_timer */
1191 HRTICK_RESET, /* not a new slice */ 1067 HRTICK_RESET, /* not a new slice */
1068 HRTICK_BLOCK, /* stop hrtick operations */
1192}; 1069};
1193 1070
1194/* 1071/*
@@ -1200,6 +1077,8 @@ static inline int hrtick_enabled(struct rq *rq)
1200{ 1077{
1201 if (!sched_feat(HRTICK)) 1078 if (!sched_feat(HRTICK))
1202 return 0; 1079 return 0;
1080 if (unlikely(test_bit(HRTICK_BLOCK, &rq->hrtick_flags)))
1081 return 0;
1203 return hrtimer_is_hres_active(&rq->hrtick_timer); 1082 return hrtimer_is_hres_active(&rq->hrtick_timer);
1204} 1083}
1205 1084
@@ -1275,14 +1154,70 @@ static enum hrtimer_restart hrtick(struct hrtimer *timer)
1275 WARN_ON_ONCE(cpu_of(rq) != smp_processor_id()); 1154 WARN_ON_ONCE(cpu_of(rq) != smp_processor_id());
1276 1155
1277 spin_lock(&rq->lock); 1156 spin_lock(&rq->lock);
1278 __update_rq_clock(rq); 1157 update_rq_clock(rq);
1279 rq->curr->sched_class->task_tick(rq, rq->curr, 1); 1158 rq->curr->sched_class->task_tick(rq, rq->curr, 1);
1280 spin_unlock(&rq->lock); 1159 spin_unlock(&rq->lock);
1281 1160
1282 return HRTIMER_NORESTART; 1161 return HRTIMER_NORESTART;
1283} 1162}
1284 1163
1285static inline void init_rq_hrtick(struct rq *rq) 1164static void hotplug_hrtick_disable(int cpu)
1165{
1166 struct rq *rq = cpu_rq(cpu);
1167 unsigned long flags;
1168
1169 spin_lock_irqsave(&rq->lock, flags);
1170 rq->hrtick_flags = 0;
1171 __set_bit(HRTICK_BLOCK, &rq->hrtick_flags);
1172 spin_unlock_irqrestore(&rq->lock, flags);
1173
1174 hrtick_clear(rq);
1175}
1176
1177static void hotplug_hrtick_enable(int cpu)
1178{
1179 struct rq *rq = cpu_rq(cpu);
1180 unsigned long flags;
1181
1182 spin_lock_irqsave(&rq->lock, flags);
1183 __clear_bit(HRTICK_BLOCK, &rq->hrtick_flags);
1184 spin_unlock_irqrestore(&rq->lock, flags);
1185}
1186
1187static int
1188hotplug_hrtick(struct notifier_block *nfb, unsigned long action, void *hcpu)
1189{
1190 int cpu = (int)(long)hcpu;
1191
1192 switch (action) {
1193 case CPU_UP_CANCELED:
1194 case CPU_UP_CANCELED_FROZEN:
1195 case CPU_DOWN_PREPARE:
1196 case CPU_DOWN_PREPARE_FROZEN:
1197 case CPU_DEAD:
1198 case CPU_DEAD_FROZEN:
1199 hotplug_hrtick_disable(cpu);
1200 return NOTIFY_OK;
1201
1202 case CPU_UP_PREPARE:
1203 case CPU_UP_PREPARE_FROZEN:
1204 case CPU_DOWN_FAILED:
1205 case CPU_DOWN_FAILED_FROZEN:
1206 case CPU_ONLINE:
1207 case CPU_ONLINE_FROZEN:
1208 hotplug_hrtick_enable(cpu);
1209 return NOTIFY_OK;
1210 }
1211
1212 return NOTIFY_DONE;
1213}
1214
1215static void init_hrtick(void)
1216{
1217 hotcpu_notifier(hotplug_hrtick, 0);
1218}
1219
1220static void init_rq_hrtick(struct rq *rq)
1286{ 1221{
1287 rq->hrtick_flags = 0; 1222 rq->hrtick_flags = 0;
1288 hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); 1223 hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
@@ -1319,6 +1254,10 @@ static inline void init_rq_hrtick(struct rq *rq)
1319void hrtick_resched(void) 1254void hrtick_resched(void)
1320{ 1255{
1321} 1256}
1257
1258static inline void init_hrtick(void)
1259{
1260}
1322#endif 1261#endif
1323 1262
1324/* 1263/*
@@ -1438,8 +1377,8 @@ calc_delta_mine(unsigned long delta_exec, unsigned long weight,
1438{ 1377{
1439 u64 tmp; 1378 u64 tmp;
1440 1379
1441 if (unlikely(!lw->inv_weight)) 1380 if (!lw->inv_weight)
1442 lw->inv_weight = (WMULT_CONST-lw->weight/2) / (lw->weight+1); 1381 lw->inv_weight = 1 + (WMULT_CONST-lw->weight/2)/(lw->weight+1);
1443 1382
1444 tmp = (u64)delta_exec * weight; 1383 tmp = (u64)delta_exec * weight;
1445 /* 1384 /*
@@ -1748,6 +1687,8 @@ __update_group_shares_cpu(struct task_group *tg, struct sched_domain *sd,
1748 1687
1749 if (shares < MIN_SHARES) 1688 if (shares < MIN_SHARES)
1750 shares = MIN_SHARES; 1689 shares = MIN_SHARES;
1690 else if (shares > MAX_SHARES)
1691 shares = MAX_SHARES;
1751 1692
1752 __set_se_shares(tg->se[tcpu], shares); 1693 __set_se_shares(tg->se[tcpu], shares);
1753} 1694}
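
Both weight-related changes above guard the same fixed-point scheme. calc_delta_mine() scales a runtime delta by weight/lw->weight using a precomputed inverse of roughly 2^32 / lw->weight, so the leading "1 +" keeps inv_weight from ever collapsing to zero, and the new MAX_SHARES of ULONG_MAX - 1 keeps the divisor lw->weight + 1 from wrapping to zero. The userspace sketch below only mimics the shape of that calculation: WMULT_CONST is approximated as 2^32 - 1 and the kernel's rounding and overflow handling are dropped, so treat it as an illustration rather than the exact kernel path.

#include <stdio.h>

int main(void)
{
        unsigned long long delta_exec = 1000000;  /* 1ms of runtime to be scaled      */
        unsigned long weight          = 1024;     /* entity weight (NICE_0_LOAD)      */
        unsigned long lw_weight       = 2048;     /* total load of the queue          */

        /* Same shape as the new initialisation: roughly 2^32 / lw_weight, and
         * the leading "1 +" guarantees the inverse never ends up as zero. */
        unsigned long inv_weight = 1 + (0xffffffffUL - lw_weight / 2) / (lw_weight + 1);

        /* calc_delta_mine(): delta * weight * inv_weight >> 32, with the
         * kernel's rounding and overflow special cases omitted. */
        unsigned long long scaled = (delta_exec * weight * inv_weight) >> 32;

        printf("%llu\n", scaled);                 /* ~500000: about half the runtime */
        return 0;
}

With the queue at twice the default weight, the task is credited with roughly half of the elapsed time, which is the proportional-share behaviour the clamping is protecting.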
@@ -4339,8 +4280,10 @@ void account_system_time(struct task_struct *p, int hardirq_offset,
4339 struct rq *rq = this_rq(); 4280 struct rq *rq = this_rq();
4340 cputime64_t tmp; 4281 cputime64_t tmp;
4341 4282
4342 if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0)) 4283 if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0)) {
4343 return account_guest_time(p, cputime); 4284 account_guest_time(p, cputime);
4285 return;
4286 }
4344 4287
4345 p->stime = cputime_add(p->stime, cputime); 4288 p->stime = cputime_add(p->stime, cputime);
4346 4289
@@ -4404,19 +4347,11 @@ void scheduler_tick(void)
4404 int cpu = smp_processor_id(); 4347 int cpu = smp_processor_id();
4405 struct rq *rq = cpu_rq(cpu); 4348 struct rq *rq = cpu_rq(cpu);
4406 struct task_struct *curr = rq->curr; 4349 struct task_struct *curr = rq->curr;
4407 u64 next_tick = rq->tick_timestamp + TICK_NSEC; 4350
4351 sched_clock_tick();
4408 4352
4409 spin_lock(&rq->lock); 4353 spin_lock(&rq->lock);
4410 __update_rq_clock(rq); 4354 update_rq_clock(rq);
4411 /*
4412 * Let rq->clock advance by at least TICK_NSEC:
4413 */
4414 if (unlikely(rq->clock < next_tick)) {
4415 rq->clock = next_tick;
4416 rq->clock_underflows++;
4417 }
4418 rq->tick_timestamp = rq->clock;
4419 update_last_tick_seen(rq);
4420 update_cpu_load(rq); 4355 update_cpu_load(rq);
4421 curr->sched_class->task_tick(rq, curr, 0); 4356 curr->sched_class->task_tick(rq, curr, 0);
4422 spin_unlock(&rq->lock); 4357 spin_unlock(&rq->lock);
@@ -4570,7 +4505,7 @@ need_resched_nonpreemptible:
4570 * Do the rq-clock update outside the rq lock: 4505 * Do the rq-clock update outside the rq lock:
4571 */ 4506 */
4572 local_irq_disable(); 4507 local_irq_disable();
4573 __update_rq_clock(rq); 4508 update_rq_clock(rq);
4574 spin_lock(&rq->lock); 4509 spin_lock(&rq->lock);
4575 clear_tsk_need_resched(prev); 4510 clear_tsk_need_resched(prev);
4576 4511
@@ -4595,9 +4530,9 @@ need_resched_nonpreemptible:
4595 prev->sched_class->put_prev_task(rq, prev); 4530 prev->sched_class->put_prev_task(rq, prev);
4596 next = pick_next_task(rq, prev); 4531 next = pick_next_task(rq, prev);
4597 4532
4598 sched_info_switch(prev, next);
4599
4600 if (likely(prev != next)) { 4533 if (likely(prev != next)) {
4534 sched_info_switch(prev, next);
4535
4601 rq->nr_switches++; 4536 rq->nr_switches++;
4602 rq->curr = next; 4537 rq->curr = next;
4603 ++*switch_count; 4538 ++*switch_count;
@@ -7755,7 +7690,7 @@ void partition_sched_domains(int ndoms_new, cpumask_t *doms_new,
7755{ 7690{
7756 int i, j; 7691 int i, j;
7757 7692
7758 lock_doms_cur(); 7693 mutex_lock(&sched_domains_mutex);
7759 7694
7760 /* always unregister in case we don't destroy any domains */ 7695 /* always unregister in case we don't destroy any domains */
7761 unregister_sched_domain_sysctl(); 7696 unregister_sched_domain_sysctl();
@@ -7804,7 +7739,7 @@ match2:
7804 7739
7805 register_sched_domain_sysctl(); 7740 register_sched_domain_sysctl();
7806 7741
7807 unlock_doms_cur(); 7742 mutex_unlock(&sched_domains_mutex);
7808} 7743}
7809 7744
7810#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) 7745#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
@@ -7813,8 +7748,10 @@ int arch_reinit_sched_domains(void)
7813 int err; 7748 int err;
7814 7749
7815 get_online_cpus(); 7750 get_online_cpus();
7751 mutex_lock(&sched_domains_mutex);
7816 detach_destroy_domains(&cpu_online_map); 7752 detach_destroy_domains(&cpu_online_map);
7817 err = arch_init_sched_domains(&cpu_online_map); 7753 err = arch_init_sched_domains(&cpu_online_map);
7754 mutex_unlock(&sched_domains_mutex);
7818 put_online_cpus(); 7755 put_online_cpus();
7819 7756
7820 return err; 7757 return err;
@@ -7932,13 +7869,16 @@ void __init sched_init_smp(void)
7932 BUG_ON(sched_group_nodes_bycpu == NULL); 7869 BUG_ON(sched_group_nodes_bycpu == NULL);
7933#endif 7870#endif
7934 get_online_cpus(); 7871 get_online_cpus();
7872 mutex_lock(&sched_domains_mutex);
7935 arch_init_sched_domains(&cpu_online_map); 7873 arch_init_sched_domains(&cpu_online_map);
7936 cpus_andnot(non_isolated_cpus, cpu_possible_map, cpu_isolated_map); 7874 cpus_andnot(non_isolated_cpus, cpu_possible_map, cpu_isolated_map);
7937 if (cpus_empty(non_isolated_cpus)) 7875 if (cpus_empty(non_isolated_cpus))
7938 cpu_set(smp_processor_id(), non_isolated_cpus); 7876 cpu_set(smp_processor_id(), non_isolated_cpus);
7877 mutex_unlock(&sched_domains_mutex);
7939 put_online_cpus(); 7878 put_online_cpus();
7940 /* XXX: Theoretical race here - CPU may be hotplugged now */ 7879 /* XXX: Theoretical race here - CPU may be hotplugged now */
7941 hotcpu_notifier(update_sched_domains, 0); 7880 hotcpu_notifier(update_sched_domains, 0);
7881 init_hrtick();
7942 7882
7943 /* Move init over to a non-isolated CPU */ 7883 /* Move init over to a non-isolated CPU */
7944 if (set_cpus_allowed_ptr(current, &non_isolated_cpus) < 0) 7884 if (set_cpus_allowed_ptr(current, &non_isolated_cpus) < 0)
@@ -8025,7 +7965,7 @@ static void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
8025 7965
8026 se->my_q = cfs_rq; 7966 se->my_q = cfs_rq;
8027 se->load.weight = tg->shares; 7967 se->load.weight = tg->shares;
8028 se->load.inv_weight = div64_64(1ULL<<32, se->load.weight); 7968 se->load.inv_weight = 0;
8029 se->parent = parent; 7969 se->parent = parent;
8030} 7970}
8031#endif 7971#endif
@@ -8149,8 +8089,6 @@ void __init sched_init(void)
8149 spin_lock_init(&rq->lock); 8089 spin_lock_init(&rq->lock);
8150 lockdep_set_class(&rq->lock, &rq->rq_lock_key); 8090 lockdep_set_class(&rq->lock, &rq->rq_lock_key);
8151 rq->nr_running = 0; 8091 rq->nr_running = 0;
8152 rq->clock = 1;
8153 update_last_tick_seen(rq);
8154 init_cfs_rq(&rq->cfs, rq); 8092 init_cfs_rq(&rq->cfs, rq);
8155 init_rt_rq(&rq->rt, rq); 8093 init_rt_rq(&rq->rt, rq);
8156#ifdef CONFIG_FAIR_GROUP_SCHED 8094#ifdef CONFIG_FAIR_GROUP_SCHED
@@ -8294,6 +8232,7 @@ EXPORT_SYMBOL(__might_sleep);
8294static void normalize_task(struct rq *rq, struct task_struct *p) 8232static void normalize_task(struct rq *rq, struct task_struct *p)
8295{ 8233{
8296 int on_rq; 8234 int on_rq;
8235
8297 update_rq_clock(rq); 8236 update_rq_clock(rq);
8298 on_rq = p->se.on_rq; 8237 on_rq = p->se.on_rq;
8299 if (on_rq) 8238 if (on_rq)
@@ -8325,7 +8264,6 @@ void normalize_rt_tasks(void)
8325 p->se.sleep_start = 0; 8264 p->se.sleep_start = 0;
8326 p->se.block_start = 0; 8265 p->se.block_start = 0;
8327#endif 8266#endif
8328 task_rq(p)->clock = 0;
8329 8267
8330 if (!rt_task(p)) { 8268 if (!rt_task(p)) {
8331 /* 8269 /*
@@ -8692,7 +8630,7 @@ static void __set_se_shares(struct sched_entity *se, unsigned long shares)
8692 dequeue_entity(cfs_rq, se, 0); 8630 dequeue_entity(cfs_rq, se, 0);
8693 8631
8694 se->load.weight = shares; 8632 se->load.weight = shares;
8695 se->load.inv_weight = div64_64((1ULL<<32), shares); 8633 se->load.inv_weight = 0;
8696 8634
8697 if (on_rq) 8635 if (on_rq)
8698 enqueue_entity(cfs_rq, se, 0); 8636 enqueue_entity(cfs_rq, se, 0);
@@ -8722,13 +8660,10 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares)
8722 if (!tg->se[0]) 8660 if (!tg->se[0])
8723 return -EINVAL; 8661 return -EINVAL;
8724 8662
8725 /*
8726 * A weight of 0 or 1 can cause arithmetics problems.
8727 * (The default weight is 1024 - so there's no practical
8728 * limitation from this.)
8729 */
8730 if (shares < MIN_SHARES) 8663 if (shares < MIN_SHARES)
8731 shares = MIN_SHARES; 8664 shares = MIN_SHARES;
8665 else if (shares > MAX_SHARES)
8666 shares = MAX_SHARES;
8732 8667
8733 mutex_lock(&shares_mutex); 8668 mutex_lock(&shares_mutex);
8734 if (tg->shares == shares) 8669 if (tg->shares == shares)
@@ -8753,7 +8688,7 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares)
8753 * force a rebalance 8688 * force a rebalance
8754 */ 8689 */
8755 cfs_rq_set_shares(tg->cfs_rq[i], 0); 8690 cfs_rq_set_shares(tg->cfs_rq[i], 0);
8756 set_se_shares(tg->se[i], shares/nr_cpu_ids); 8691 set_se_shares(tg->se[i], shares);
8757 } 8692 }
8758 8693
8759 /* 8694 /*
@@ -8787,7 +8722,7 @@ static unsigned long to_ratio(u64 period, u64 runtime)
8787 if (runtime == RUNTIME_INF) 8722 if (runtime == RUNTIME_INF)
8788 return 1ULL << 16; 8723 return 1ULL << 16;
8789 8724
8790 return div64_64(runtime << 16, period); 8725 return div64_u64(runtime << 16, period);
8791} 8726}
8792 8727
8793#ifdef CONFIG_CGROUP_SCHED 8728#ifdef CONFIG_CGROUP_SCHED
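
div64_64() has been renamed div64_u64() tree-wide; the call in to_ratio() is otherwise unchanged and still produces a 16.16 fixed-point ratio of runtime to period. A quick userspace check of that representation, with plain 64-bit division standing in for div64_u64():

#include <stdio.h>

/* Same signature as the kernel helper shown in the hunk header above. */
static unsigned long to_ratio(unsigned long long period, unsigned long long runtime)
{
        return (unsigned long)((runtime << 16) / period);
}

int main(void)
{
        /* 950ms of -rt runtime per 1000ms period, in 16.16 fixed point: */
        printf("%lu / %d\n", to_ratio(1000000000ULL, 950000000ULL), 1 << 16);
        /* prints "62259 / 65536", i.e. roughly 0.95 */
        return 0;
}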
@@ -9057,13 +8992,13 @@ cpu_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
9057} 8992}
9058 8993
9059#ifdef CONFIG_FAIR_GROUP_SCHED 8994#ifdef CONFIG_FAIR_GROUP_SCHED
9060static int cpu_shares_write_uint(struct cgroup *cgrp, struct cftype *cftype, 8995static int cpu_shares_write_u64(struct cgroup *cgrp, struct cftype *cftype,
9061 u64 shareval) 8996 u64 shareval)
9062{ 8997{
9063 return sched_group_set_shares(cgroup_tg(cgrp), shareval); 8998 return sched_group_set_shares(cgroup_tg(cgrp), shareval);
9064} 8999}
9065 9000
9066static u64 cpu_shares_read_uint(struct cgroup *cgrp, struct cftype *cft) 9001static u64 cpu_shares_read_u64(struct cgroup *cgrp, struct cftype *cft)
9067{ 9002{
9068 struct task_group *tg = cgroup_tg(cgrp); 9003 struct task_group *tg = cgroup_tg(cgrp);
9069 9004
@@ -9073,48 +9008,14 @@ static u64 cpu_shares_read_uint(struct cgroup *cgrp, struct cftype *cft)
9073 9008
9074#ifdef CONFIG_RT_GROUP_SCHED 9009#ifdef CONFIG_RT_GROUP_SCHED
9075static ssize_t cpu_rt_runtime_write(struct cgroup *cgrp, struct cftype *cft, 9010static ssize_t cpu_rt_runtime_write(struct cgroup *cgrp, struct cftype *cft,
9076 struct file *file, 9011 s64 val)
9077 const char __user *userbuf,
9078 size_t nbytes, loff_t *unused_ppos)
9079{ 9012{
9080 char buffer[64]; 9013 return sched_group_set_rt_runtime(cgroup_tg(cgrp), val);
9081 int retval = 0;
9082 s64 val;
9083 char *end;
9084
9085 if (!nbytes)
9086 return -EINVAL;
9087 if (nbytes >= sizeof(buffer))
9088 return -E2BIG;
9089 if (copy_from_user(buffer, userbuf, nbytes))
9090 return -EFAULT;
9091
9092 buffer[nbytes] = 0; /* nul-terminate */
9093
9094 /* strip newline if necessary */
9095 if (nbytes && (buffer[nbytes-1] == '\n'))
9096 buffer[nbytes-1] = 0;
9097 val = simple_strtoll(buffer, &end, 0);
9098 if (*end)
9099 return -EINVAL;
9100
9101 /* Pass to subsystem */
9102 retval = sched_group_set_rt_runtime(cgroup_tg(cgrp), val);
9103 if (!retval)
9104 retval = nbytes;
9105 return retval;
9106} 9014}
9107 9015
9108static ssize_t cpu_rt_runtime_read(struct cgroup *cgrp, struct cftype *cft, 9016static s64 cpu_rt_runtime_read(struct cgroup *cgrp, struct cftype *cft)
9109 struct file *file,
9110 char __user *buf, size_t nbytes,
9111 loff_t *ppos)
9112{ 9017{
9113 char tmp[64]; 9018 return sched_group_rt_runtime(cgroup_tg(cgrp));
9114 long val = sched_group_rt_runtime(cgroup_tg(cgrp));
9115 int len = sprintf(tmp, "%ld\n", val);
9116
9117 return simple_read_from_buffer(buf, nbytes, ppos, tmp, len);
9118} 9019}
9119 9020
9120static int cpu_rt_period_write_uint(struct cgroup *cgrp, struct cftype *cftype, 9021static int cpu_rt_period_write_uint(struct cgroup *cgrp, struct cftype *cftype,
@@ -9133,20 +9034,20 @@ static struct cftype cpu_files[] = {
9133#ifdef CONFIG_FAIR_GROUP_SCHED 9034#ifdef CONFIG_FAIR_GROUP_SCHED
9134 { 9035 {
9135 .name = "shares", 9036 .name = "shares",
9136 .read_uint = cpu_shares_read_uint, 9037 .read_u64 = cpu_shares_read_u64,
9137 .write_uint = cpu_shares_write_uint, 9038 .write_u64 = cpu_shares_write_u64,
9138 }, 9039 },
9139#endif 9040#endif
9140#ifdef CONFIG_RT_GROUP_SCHED 9041#ifdef CONFIG_RT_GROUP_SCHED
9141 { 9042 {
9142 .name = "rt_runtime_us", 9043 .name = "rt_runtime_us",
9143 .read = cpu_rt_runtime_read, 9044 .read_s64 = cpu_rt_runtime_read,
9144 .write = cpu_rt_runtime_write, 9045 .write_s64 = cpu_rt_runtime_write,
9145 }, 9046 },
9146 { 9047 {
9147 .name = "rt_period_us", 9048 .name = "rt_period_us",
9148 .read_uint = cpu_rt_period_read_uint, 9049 .read_u64 = cpu_rt_period_read_uint,
9149 .write_uint = cpu_rt_period_write_uint, 9050 .write_u64 = cpu_rt_period_write_uint,
9150 }, 9051 },
9151#endif 9052#endif
9152}; 9053};
@@ -9277,8 +9178,8 @@ out:
9277static struct cftype files[] = { 9178static struct cftype files[] = {
9278 { 9179 {
9279 .name = "usage", 9180 .name = "usage",
9280 .read_uint = cpuusage_read, 9181 .read_u64 = cpuusage_read,
9281 .write_uint = cpuusage_write, 9182 .write_u64 = cpuusage_write,
9282 }, 9183 },
9283}; 9184};
9284 9185
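
The last few hunks of sched.c drop the hand-rolled read()/write() file methods for the cgroup attributes in favour of the typed cftype callbacks (read_u64/write_u64, read_s64/write_s64), letting the cgroup core do the user-buffer copying and number parsing. For reference, a minimal sketch of what a subsystem file looks like with the typed interface; the demo_* names and the global value are made up, only the callback signatures and the .read_u64/.write_u64 members mirror what the patch itself uses:

#include <linux/cgroup.h>
#include <linux/errno.h>

/* Hypothetical state; a real controller would look this up from the cgroup. */
static u64 demo_value;

static u64 demo_read_u64(struct cgroup *cgrp, struct cftype *cft)
{
        return demo_value;
}

static int demo_write_u64(struct cgroup *cgrp, struct cftype *cftype, u64 val)
{
        if (val > 1000000)
                return -EINVAL;         /* the core hands any error back to write() */
        demo_value = val;
        return 0;
}

static struct cftype demo_files[] = {
        {
                .name      = "value",
                .read_u64  = demo_read_u64,
                .write_u64 = demo_write_u64,
        },
};

From userspace nothing changes: the file still reads back as a decimal string and accepts one on write, but the subsystem no longer re-implements the parsing and error handling.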
diff --git a/kernel/sched_clock.c b/kernel/sched_clock.c
new file mode 100644
index 000000000000..9c597e37f7de
--- /dev/null
+++ b/kernel/sched_clock.c
@@ -0,0 +1,236 @@
1/*
2 * sched_clock for unstable cpu clocks
3 *
4 * Copyright (C) 2008 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
5 *
6 * Based on code by:
7 * Ingo Molnar <mingo@redhat.com>
8 * Guillaume Chazarain <guichaz@gmail.com>
9 *
10 * Create a semi stable clock from a mixture of other events, including:
11 * - gtod
12 * - jiffies
13 * - sched_clock()
14 * - explicit idle events
15 *
16 * We use gtod as base and the unstable clock deltas. The deltas are filtered,
17 * making it monotonic and keeping it within an expected window. This window
18 * is set up using jiffies.
19 *
20 * Furthermore, explicit sleep and wakeup hooks allow us to account for time
21 * that is otherwise invisible (TSC gets stopped).
22 *
23 * The clock: sched_clock_cpu() is monotonic per cpu, and should be somewhat
24 * consistent between cpus (never more than 1 jiffies difference).
25 */
26#include <linux/sched.h>
27#include <linux/percpu.h>
28#include <linux/spinlock.h>
29#include <linux/ktime.h>
30#include <linux/module.h>
31
32
33#ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK
34
35struct sched_clock_data {
36 /*
37 * Raw spinlock - this is a special case: this might be called
38 * from within instrumentation code so we don't want to do any
39 * instrumentation ourselves.
40 */
41 raw_spinlock_t lock;
42
43 unsigned long prev_jiffies;
44 u64 prev_raw;
45 u64 tick_raw;
46 u64 tick_gtod;
47 u64 clock;
48};
49
50static DEFINE_PER_CPU_SHARED_ALIGNED(struct sched_clock_data, sched_clock_data);
51
52static inline struct sched_clock_data *this_scd(void)
53{
54 return &__get_cpu_var(sched_clock_data);
55}
56
57static inline struct sched_clock_data *cpu_sdc(int cpu)
58{
59 return &per_cpu(sched_clock_data, cpu);
60}
61
62void sched_clock_init(void)
63{
64 u64 ktime_now = ktime_to_ns(ktime_get());
65 u64 now = 0;
66 int cpu;
67
68 for_each_possible_cpu(cpu) {
69 struct sched_clock_data *scd = cpu_sdc(cpu);
70
71 scd->lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED;
72 scd->prev_jiffies = jiffies;
73 scd->prev_raw = now;
74 scd->tick_raw = now;
75 scd->tick_gtod = ktime_now;
76 scd->clock = ktime_now;
77 }
78}
79
80/*
81 * update the percpu scd from the raw @now value
82 *
83 * - filter out backward motion
84 * - use jiffies to generate a min,max window to clip the raw values
85 */
86static void __update_sched_clock(struct sched_clock_data *scd, u64 now)
87{
88 unsigned long now_jiffies = jiffies;
89 long delta_jiffies = now_jiffies - scd->prev_jiffies;
90 u64 clock = scd->clock;
91 u64 min_clock, max_clock;
92 s64 delta = now - scd->prev_raw;
93
94 WARN_ON_ONCE(!irqs_disabled());
95 min_clock = scd->tick_gtod + delta_jiffies * TICK_NSEC;
96
97 if (unlikely(delta < 0)) {
98 clock++;
99 goto out;
100 }
101
102 max_clock = min_clock + TICK_NSEC;
103
104 if (unlikely(clock + delta > max_clock)) {
105 if (clock < max_clock)
106 clock = max_clock;
107 else
108 clock++;
109 } else {
110 clock += delta;
111 }
112
113 out:
114 if (unlikely(clock < min_clock))
115 clock = min_clock;
116
117 scd->prev_raw = now;
118 scd->prev_jiffies = now_jiffies;
119 scd->clock = clock;
120}
121
122static void lock_double_clock(struct sched_clock_data *data1,
123 struct sched_clock_data *data2)
124{
125 if (data1 < data2) {
126 __raw_spin_lock(&data1->lock);
127 __raw_spin_lock(&data2->lock);
128 } else {
129 __raw_spin_lock(&data2->lock);
130 __raw_spin_lock(&data1->lock);
131 }
132}
133
134u64 sched_clock_cpu(int cpu)
135{
136 struct sched_clock_data *scd = cpu_sdc(cpu);
137 u64 now, clock;
138
139 WARN_ON_ONCE(!irqs_disabled());
140 now = sched_clock();
141
142 if (cpu != raw_smp_processor_id()) {
143 /*
144 * in order to update a remote cpu's clock based on our
145 * unstable raw time, rebase it against:
146 * tick_raw (offset between raw counters)
147 * tick_gtod (tick offset between cpus)
148 */
149 struct sched_clock_data *my_scd = this_scd();
150
151 lock_double_clock(scd, my_scd);
152
153 now -= my_scd->tick_raw;
154 now += scd->tick_raw;
155
156 now -= my_scd->tick_gtod;
157 now += scd->tick_gtod;
158
159 __raw_spin_unlock(&my_scd->lock);
160 } else {
161 __raw_spin_lock(&scd->lock);
162 }
163
164 __update_sched_clock(scd, now);
165 clock = scd->clock;
166
167 __raw_spin_unlock(&scd->lock);
168
169 return clock;
170}
171
172void sched_clock_tick(void)
173{
174 struct sched_clock_data *scd = this_scd();
175 u64 now, now_gtod;
176
177 WARN_ON_ONCE(!irqs_disabled());
178
179 now = sched_clock();
180 now_gtod = ktime_to_ns(ktime_get());
181
182 __raw_spin_lock(&scd->lock);
183 __update_sched_clock(scd, now);
184 /*
185 * update tick_gtod after __update_sched_clock() because that will
186 * already observe 1 new jiffy; adding a new tick_gtod to that would
187 * increase the clock by 2 jiffies.
188 */
189 scd->tick_raw = now;
190 scd->tick_gtod = now_gtod;
191 __raw_spin_unlock(&scd->lock);
192}
193
194/*
195 * We are going deep-idle (irqs are disabled):
196 */
197void sched_clock_idle_sleep_event(void)
198{
199 sched_clock_cpu(smp_processor_id());
200}
201EXPORT_SYMBOL_GPL(sched_clock_idle_sleep_event);
202
203/*
204 * We just idled delta nanoseconds (called with irqs disabled):
205 */
206void sched_clock_idle_wakeup_event(u64 delta_ns)
207{
208 struct sched_clock_data *scd = this_scd();
209 u64 now = sched_clock();
210
211 /*
212 * Override the previous timestamp and ignore all
213 * sched_clock() deltas that occurred while we idled,
214 * and use the PM-provided delta_ns to advance the
215 * rq clock:
216 */
217 __raw_spin_lock(&scd->lock);
218 scd->prev_raw = now;
219 scd->clock += delta_ns;
220 __raw_spin_unlock(&scd->lock);
221
222 touch_softlockup_watchdog();
223}
224EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event);
225
226#endif
227
228/*
229 * Scheduler clock - returns current time in nanosec units.
230 * This is default implementation.
231 * Architectures and sub-architectures can override this.
232 */
233unsigned long long __attribute__((weak)) sched_clock(void)
234{
235 return (unsigned long long)jiffies * (NSEC_PER_SEC / HZ);
236}
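
The heart of the new file is __update_sched_clock(): raw sched_clock() deltas are only trusted inside a window derived from gtod and jiffies, so a TSC that stalls or jumps can only move the per-cpu clock within a span that starts at the gtod/jiffies estimate of "now" and extends one tick beyond it. The standalone sketch below reproduces just that clamping; TICK_NSEC is taken as 1,000,000 ns (HZ=1000) and the per-cpu structure is flattened into plain parameters, so it is a model of the logic rather than the kernel code itself.

#include <stdio.h>

#define TICK_NSEC 1000000ULL

/* Mirror of the clamping in __update_sched_clock(), with the state passed in
 * explicitly instead of living in a struct sched_clock_data. */
static unsigned long long clip_clock(unsigned long long clock,
                                     long long delta,              /* raw sched_clock() delta */
                                     unsigned long long tick_gtod, /* gtod at the last tick   */
                                     long delta_jiffies)           /* jiffies since that tick */
{
        unsigned long long min_clock = tick_gtod + delta_jiffies * TICK_NSEC;
        unsigned long long max_clock = min_clock + TICK_NSEC;

        if (delta < 0)
                clock++;                                /* raw clock went backwards */
        else if (clock + delta > max_clock)
                clock = clock < max_clock ? max_clock : clock + 1;
        else
                clock += delta;

        if (clock < min_clock)                          /* never fall behind the window */
                clock = min_clock;

        return clock;
}

int main(void)
{
        /* One jiffy after a gtod sample of 5ms, the raw clock claims a 40ms
         * jump; the result is clipped to min_clock + TICK_NSEC = 7000000. */
        printf("%llu\n", clip_clock(5000000ULL, 40000000LL, 5000000ULL, 1));
        return 0;
}

The cross-CPU path in sched_clock_cpu() feeds the same clamp: before updating, the raw value is rebased with the remote CPU's tick_raw and tick_gtod offsets so both CPUs are measured against the same tick.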
diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c
index f3f4af4b8b0f..5f06118fbc31 100644
--- a/kernel/sched_debug.c
+++ b/kernel/sched_debug.c
@@ -204,13 +204,6 @@ static void print_cpu(struct seq_file *m, int cpu)
204 PN(next_balance); 204 PN(next_balance);
205 P(curr->pid); 205 P(curr->pid);
206 PN(clock); 206 PN(clock);
207 PN(idle_clock);
208 PN(prev_clock_raw);
209 P(clock_warps);
210 P(clock_overflows);
211 P(clock_underflows);
212 P(clock_deep_idle_events);
213 PN(clock_max_delta);
214 P(cpu_load[0]); 207 P(cpu_load[0]);
215 P(cpu_load[1]); 208 P(cpu_load[1]);
216 P(cpu_load[2]); 209 P(cpu_load[2]);
@@ -277,12 +270,9 @@ static int __init init_sched_debug_procfs(void)
277{ 270{
278 struct proc_dir_entry *pe; 271 struct proc_dir_entry *pe;
279 272
280 pe = create_proc_entry("sched_debug", 0644, NULL); 273 pe = proc_create("sched_debug", 0644, NULL, &sched_debug_fops);
281 if (!pe) 274 if (!pe)
282 return -ENOMEM; 275 return -ENOMEM;
283
284 pe->proc_fops = &sched_debug_fops;
285
286 return 0; 276 return 0;
287} 277}
288 278
@@ -360,8 +350,8 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
360 350
361 avg_per_cpu = p->se.sum_exec_runtime; 351 avg_per_cpu = p->se.sum_exec_runtime;
362 if (p->se.nr_migrations) { 352 if (p->se.nr_migrations) {
363 avg_per_cpu = div64_64(avg_per_cpu, 353 avg_per_cpu = div64_u64(avg_per_cpu,
364 p->se.nr_migrations); 354 p->se.nr_migrations);
365 } else { 355 } else {
366 avg_per_cpu = -1LL; 356 avg_per_cpu = -1LL;
367 } 357 }
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 89fa32b4edf2..c863663d204d 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -682,6 +682,7 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int wakeup)
682 * Update run-time statistics of the 'current'. 682 * Update run-time statistics of the 'current'.
683 */ 683 */
684 update_curr(cfs_rq); 684 update_curr(cfs_rq);
685 account_entity_enqueue(cfs_rq, se);
685 686
686 if (wakeup) { 687 if (wakeup) {
687 place_entity(cfs_rq, se, 0); 688 place_entity(cfs_rq, se, 0);
@@ -692,7 +693,6 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int wakeup)
692 check_spread(cfs_rq, se); 693 check_spread(cfs_rq, se);
693 if (se != cfs_rq->curr) 694 if (se != cfs_rq->curr)
694 __enqueue_entity(cfs_rq, se); 695 __enqueue_entity(cfs_rq, se);
695 account_entity_enqueue(cfs_rq, se);
696} 696}
697 697
698static void update_avg(u64 *avg, u64 sample) 698static void update_avg(u64 *avg, u64 sample)
@@ -841,8 +841,10 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
841 * queued ticks are scheduled to match the slice, so don't bother 841 * queued ticks are scheduled to match the slice, so don't bother
842 * validating it and just reschedule. 842 * validating it and just reschedule.
843 */ 843 */
844 if (queued) 844 if (queued) {
845 return resched_task(rq_of(cfs_rq)->curr); 845 resched_task(rq_of(cfs_rq)->curr);
846 return;
847 }
846 /* 848 /*
847 * don't let the period tick interfere with the hrtick preemption 849 * don't let the period tick interfere with the hrtick preemption
848 */ 850 */
@@ -957,7 +959,7 @@ static void yield_task_fair(struct rq *rq)
957 return; 959 return;
958 960
959 if (likely(!sysctl_sched_compat_yield) && curr->policy != SCHED_BATCH) { 961 if (likely(!sysctl_sched_compat_yield) && curr->policy != SCHED_BATCH) {
960 __update_rq_clock(rq); 962 update_rq_clock(rq);
961 /* 963 /*
962 * Update run-time statistics of the 'current'. 964 * Update run-time statistics of the 'current'.
963 */ 965 */
@@ -1007,7 +1009,7 @@ static int wake_idle(int cpu, struct task_struct *p)
1007 * sibling runqueue info. This will avoid the checks and cache miss 1009 * sibling runqueue info. This will avoid the checks and cache miss
1008 * penalties associated with that. 1010 * penalties associated with that.
1009 */ 1011 */
1010 if (idle_cpu(cpu) || cpu_rq(cpu)->nr_running > 1) 1012 if (idle_cpu(cpu) || cpu_rq(cpu)->cfs.nr_running > 1)
1011 return cpu; 1013 return cpu;
1012 1014
1013 for_each_domain(cpu, sd) { 1015 for_each_domain(cpu, sd) {
@@ -1611,30 +1613,6 @@ static const struct sched_class fair_sched_class = {
1611}; 1613};
1612 1614
1613#ifdef CONFIG_SCHED_DEBUG 1615#ifdef CONFIG_SCHED_DEBUG
1614static void
1615print_cfs_rq_tasks(struct seq_file *m, struct cfs_rq *cfs_rq, int depth)
1616{
1617 struct sched_entity *se;
1618
1619 if (!cfs_rq)
1620 return;
1621
1622 list_for_each_entry_rcu(se, &cfs_rq->tasks, group_node) {
1623 int i;
1624
1625 for (i = depth; i; i--)
1626 seq_puts(m, " ");
1627
1628 seq_printf(m, "%lu %s %lu\n",
1629 se->load.weight,
1630 entity_is_task(se) ? "T" : "G",
1631 calc_delta_weight(SCHED_LOAD_SCALE, se)
1632 );
1633 if (!entity_is_task(se))
1634 print_cfs_rq_tasks(m, group_cfs_rq(se), depth + 1);
1635 }
1636}
1637
1638static void print_cfs_stats(struct seq_file *m, int cpu) 1616static void print_cfs_stats(struct seq_file *m, int cpu)
1639{ 1617{
1640 struct cfs_rq *cfs_rq; 1618 struct cfs_rq *cfs_rq;
@@ -1642,9 +1620,6 @@ static void print_cfs_stats(struct seq_file *m, int cpu)
1642 rcu_read_lock(); 1620 rcu_read_lock();
1643 for_each_leaf_cfs_rq(cpu_rq(cpu), cfs_rq) 1621 for_each_leaf_cfs_rq(cpu_rq(cpu), cfs_rq)
1644 print_cfs_rq(m, cpu, cfs_rq); 1622 print_cfs_rq(m, cpu, cfs_rq);
1645
1646 seq_printf(m, "\nWeight tree:\n");
1647 print_cfs_rq_tasks(m, &cpu_rq(cpu)->cfs, 1);
1648 rcu_read_unlock(); 1623 rcu_read_unlock();
1649} 1624}
1650#endif 1625#endif
diff --git a/kernel/sched_idletask.c b/kernel/sched_idletask.c
index 2bcafa375633..3a4f92dbbe66 100644
--- a/kernel/sched_idletask.c
+++ b/kernel/sched_idletask.c
@@ -99,7 +99,7 @@ static void prio_changed_idle(struct rq *rq, struct task_struct *p,
99/* 99/*
100 * Simple, special scheduling class for the per-CPU idle tasks: 100 * Simple, special scheduling class for the per-CPU idle tasks:
101 */ 101 */
102const struct sched_class idle_sched_class = { 102static const struct sched_class idle_sched_class = {
103 /* .next is NULL */ 103 /* .next is NULL */
104 /* no enqueue/yield_task for idle tasks */ 104 /* no enqueue/yield_task for idle tasks */
105 105
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index c2730a5a4f05..060e87b0cb1c 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -1098,11 +1098,14 @@ static void post_schedule_rt(struct rq *rq)
1098 } 1098 }
1099} 1099}
1100 1100
1101 1101/*
1102 * If we are not running and we are not going to reschedule soon, we should
1103 * try to push tasks away now
1104 */
1102static void task_wake_up_rt(struct rq *rq, struct task_struct *p) 1105static void task_wake_up_rt(struct rq *rq, struct task_struct *p)
1103{ 1106{
1104 if (!task_running(rq, p) && 1107 if (!task_running(rq, p) &&
1105 (p->prio >= rq->rt.highest_prio) && 1108 !test_tsk_need_resched(rq->curr) &&
1106 rq->rt.overloaded) 1109 rq->rt.overloaded)
1107 push_rt_tasks(rq); 1110 push_rt_tasks(rq);
1108} 1111}
@@ -1309,7 +1312,7 @@ static void set_curr_task_rt(struct rq *rq)
1309 p->se.exec_start = rq->clock; 1312 p->se.exec_start = rq->clock;
1310} 1313}
1311 1314
1312const struct sched_class rt_sched_class = { 1315static const struct sched_class rt_sched_class = {
1313 .next = &fair_sched_class, 1316 .next = &fair_sched_class,
1314 .enqueue_task = enqueue_task_rt, 1317 .enqueue_task = enqueue_task_rt,
1315 .dequeue_task = dequeue_task_rt, 1318 .dequeue_task = dequeue_task_rt,
diff --git a/kernel/signal.c b/kernel/signal.c
index 64ad0ed15992..72bb4f51f963 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -39,11 +39,19 @@
39 39
40static struct kmem_cache *sigqueue_cachep; 40static struct kmem_cache *sigqueue_cachep;
41 41
42static int __sig_ignored(struct task_struct *t, int sig)
43{
44 void __user *handler;
45
46 /* Is it explicitly or implicitly ignored? */
47
48 handler = t->sighand->action[sig - 1].sa.sa_handler;
49 return handler == SIG_IGN ||
50 (handler == SIG_DFL && sig_kernel_ignore(sig));
51}
42 52
43static int sig_ignored(struct task_struct *t, int sig) 53static int sig_ignored(struct task_struct *t, int sig)
44{ 54{
45 void __user * handler;
46
47 /* 55 /*
48 * Tracers always want to know about signals.. 56 * Tracers always want to know about signals..
49 */ 57 */
@@ -58,10 +66,7 @@ static int sig_ignored(struct task_struct *t, int sig)
58 if (sigismember(&t->blocked, sig) || sigismember(&t->real_blocked, sig)) 66 if (sigismember(&t->blocked, sig) || sigismember(&t->real_blocked, sig))
59 return 0; 67 return 0;
60 68
61 /* Is it explicitly or implicitly ignored? */ 69 return __sig_ignored(t, sig);
62 handler = t->sighand->action[sig-1].sa.sa_handler;
63 return handler == SIG_IGN ||
64 (handler == SIG_DFL && sig_kernel_ignore(sig));
65} 70}
66 71
67/* 72/*
@@ -372,7 +377,7 @@ static int __dequeue_signal(struct sigpending *pending, sigset_t *mask,
372 */ 377 */
373int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info) 378int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info)
374{ 379{
375 int signr = 0; 380 int signr;
376 381
377 /* We only dequeue private signals from ourselves, we don't let 382 /* We only dequeue private signals from ourselves, we don't let
378 * signalfd steal them 383 * signalfd steal them
@@ -405,8 +410,12 @@ int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info)
405 } 410 }
406 } 411 }
407 } 412 }
413
408 recalc_sigpending(); 414 recalc_sigpending();
409 if (signr && unlikely(sig_kernel_stop(signr))) { 415 if (!signr)
416 return 0;
417
418 if (unlikely(sig_kernel_stop(signr))) {
410 /* 419 /*
411 * Set a marker that we have dequeued a stop signal. Our 420 * Set a marker that we have dequeued a stop signal. Our
412 * caller might release the siglock and then the pending 421 * caller might release the siglock and then the pending
@@ -422,9 +431,7 @@ int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info)
422 if (!(tsk->signal->flags & SIGNAL_GROUP_EXIT)) 431 if (!(tsk->signal->flags & SIGNAL_GROUP_EXIT))
423 tsk->signal->flags |= SIGNAL_STOP_DEQUEUED; 432 tsk->signal->flags |= SIGNAL_STOP_DEQUEUED;
424 } 433 }
425 if (signr && 434 if ((info->si_code & __SI_MASK) == __SI_TIMER && info->si_sys_private) {
426 ((info->si_code & __SI_MASK) == __SI_TIMER) &&
427 info->si_sys_private) {
428 /* 435 /*
429 * Release the siglock to ensure proper locking order 436 * Release the siglock to ensure proper locking order
430 * of timer locks outside of siglocks. Note, we leave 437 * of timer locks outside of siglocks. Note, we leave
@@ -526,21 +533,34 @@ static int rm_from_queue(unsigned long mask, struct sigpending *s)
526static int check_kill_permission(int sig, struct siginfo *info, 533static int check_kill_permission(int sig, struct siginfo *info,
527 struct task_struct *t) 534 struct task_struct *t)
528{ 535{
529 int error = -EINVAL; 536 struct pid *sid;
537 int error;
538
530 if (!valid_signal(sig)) 539 if (!valid_signal(sig))
531 return error; 540 return -EINVAL;
532 541
533 if (info == SEND_SIG_NOINFO || (!is_si_special(info) && SI_FROMUSER(info))) { 542 if (info != SEND_SIG_NOINFO && (is_si_special(info) || SI_FROMKERNEL(info)))
534 error = audit_signal_info(sig, t); /* Let audit system see the signal */ 543 return 0;
535 if (error) 544
536 return error; 545 error = audit_signal_info(sig, t); /* Let audit system see the signal */
537 error = -EPERM; 546 if (error)
538 if (((sig != SIGCONT) ||
539 (task_session_nr(current) != task_session_nr(t)))
540 && (current->euid ^ t->suid) && (current->euid ^ t->uid)
541 && (current->uid ^ t->suid) && (current->uid ^ t->uid)
542 && !capable(CAP_KILL))
543 return error; 547 return error;
548
549 if ((current->euid ^ t->suid) && (current->euid ^ t->uid) &&
550 (current->uid ^ t->suid) && (current->uid ^ t->uid) &&
551 !capable(CAP_KILL)) {
552 switch (sig) {
553 case SIGCONT:
554 sid = task_session(t);
555 /*
556 * We don't return the error if sid == NULL. The
557 * task was unhashed, the caller must notice this.
558 */
559 if (!sid || sid == task_session(current))
560 break;
561 default:
562 return -EPERM;
563 }
544 } 564 }
545 565
546 return security_task_kill(t, info, sig, 0); 566 return security_task_kill(t, info, sig, 0);
@@ -550,62 +570,44 @@ static int check_kill_permission(int sig, struct siginfo *info,
550static void do_notify_parent_cldstop(struct task_struct *tsk, int why); 570static void do_notify_parent_cldstop(struct task_struct *tsk, int why);
551 571
552/* 572/*
553 * Handle magic process-wide effects of stop/continue signals. 573 * Handle magic process-wide effects of stop/continue signals. Unlike
554 * Unlike the signal actions, these happen immediately at signal-generation 574 * the signal actions, these happen immediately at signal-generation
555 * time regardless of blocking, ignoring, or handling. This does the 575 * time regardless of blocking, ignoring, or handling. This does the
556 * actual continuing for SIGCONT, but not the actual stopping for stop 576 * actual continuing for SIGCONT, but not the actual stopping for stop
557 * signals. The process stop is done as a signal action for SIG_DFL. 577 * signals. The process stop is done as a signal action for SIG_DFL.
578 *
579 * Returns true if the signal should be actually delivered, otherwise
580 * it should be dropped.
558 */ 581 */
559static void handle_stop_signal(int sig, struct task_struct *p) 582static int prepare_signal(int sig, struct task_struct *p)
560{ 583{
584 struct signal_struct *signal = p->signal;
561 struct task_struct *t; 585 struct task_struct *t;
562 586
563 if (p->signal->flags & SIGNAL_GROUP_EXIT) 587 if (unlikely(signal->flags & SIGNAL_GROUP_EXIT)) {
564 /* 588 /*
565 * The process is in the middle of dying already. 589 * The process is in the middle of dying, nothing to do.
566 */ 590 */
567 return; 591 } else if (sig_kernel_stop(sig)) {
568
569 if (sig_kernel_stop(sig)) {
570 /* 592 /*
571 * This is a stop signal. Remove SIGCONT from all queues. 593 * This is a stop signal. Remove SIGCONT from all queues.
572 */ 594 */
573 rm_from_queue(sigmask(SIGCONT), &p->signal->shared_pending); 595 rm_from_queue(sigmask(SIGCONT), &signal->shared_pending);
574 t = p; 596 t = p;
575 do { 597 do {
576 rm_from_queue(sigmask(SIGCONT), &t->pending); 598 rm_from_queue(sigmask(SIGCONT), &t->pending);
577 t = next_thread(t); 599 } while_each_thread(p, t);
578 } while (t != p);
579 } else if (sig == SIGCONT) { 600 } else if (sig == SIGCONT) {
601 unsigned int why;
580 /* 602 /*
581 * Remove all stop signals from all queues, 603 * Remove all stop signals from all queues,
582 * and wake all threads. 604 * and wake all threads.
583 */ 605 */
584 if (unlikely(p->signal->group_stop_count > 0)) { 606 rm_from_queue(SIG_KERNEL_STOP_MASK, &signal->shared_pending);
585 /*
586 * There was a group stop in progress. We'll
587 * pretend it finished before we got here. We are
588 * obliged to report it to the parent: if the
589 * SIGSTOP happened "after" this SIGCONT, then it
590 * would have cleared this pending SIGCONT. If it
591 * happened "before" this SIGCONT, then the parent
592 * got the SIGCHLD about the stop finishing before
593 * the continue happened. We do the notification
594 * now, and it's as if the stop had finished and
595 * the SIGCHLD was pending on entry to this kill.
596 */
597 p->signal->group_stop_count = 0;
598 p->signal->flags = SIGNAL_STOP_CONTINUED;
599 spin_unlock(&p->sighand->siglock);
600 do_notify_parent_cldstop(p, CLD_STOPPED);
601 spin_lock(&p->sighand->siglock);
602 }
603 rm_from_queue(SIG_KERNEL_STOP_MASK, &p->signal->shared_pending);
604 t = p; 607 t = p;
605 do { 608 do {
606 unsigned int state; 609 unsigned int state;
607 rm_from_queue(SIG_KERNEL_STOP_MASK, &t->pending); 610 rm_from_queue(SIG_KERNEL_STOP_MASK, &t->pending);
608
609 /* 611 /*
610 * If there is a handler for SIGCONT, we must make 612 * If there is a handler for SIGCONT, we must make
611 * sure that no thread returns to user mode before 613 * sure that no thread returns to user mode before
@@ -615,7 +617,7 @@ static void handle_stop_signal(int sig, struct task_struct *p)
615 * running the handler. With the TIF_SIGPENDING 617 * running the handler. With the TIF_SIGPENDING
616 * flag set, the thread will pause and acquire the 618 * flag set, the thread will pause and acquire the
617 * siglock that we hold now and until we've queued 619 * siglock that we hold now and until we've queued
618 * the pending signal. 620 * the pending signal.
619 * 621 *
620 * Wake up the stopped thread _after_ setting 622 * Wake up the stopped thread _after_ setting
621 * TIF_SIGPENDING 623 * TIF_SIGPENDING
@@ -626,49 +628,163 @@ static void handle_stop_signal(int sig, struct task_struct *p)
626 state |= TASK_INTERRUPTIBLE; 628 state |= TASK_INTERRUPTIBLE;
627 } 629 }
628 wake_up_state(t, state); 630 wake_up_state(t, state);
631 } while_each_thread(p, t);
629 632
630 t = next_thread(t); 633 /*
631 } while (t != p); 634 * Notify the parent with CLD_CONTINUED if we were stopped.
635 *
636 * If we were in the middle of a group stop, we pretend it
637 * was already finished, and then continued. Since SIGCHLD
638 * doesn't queue we report only CLD_STOPPED, as if the next
639 * CLD_CONTINUED was dropped.
640 */
641 why = 0;
642 if (signal->flags & SIGNAL_STOP_STOPPED)
643 why |= SIGNAL_CLD_CONTINUED;
644 else if (signal->group_stop_count)
645 why |= SIGNAL_CLD_STOPPED;
632 646
633 if (p->signal->flags & SIGNAL_STOP_STOPPED) { 647 if (why) {
634 /* 648 /*
635 * We were in fact stopped, and are now continued. 649 * The first thread which returns from finish_stop()
636 * Notify the parent with CLD_CONTINUED. 650 * will take ->siglock, notice SIGNAL_CLD_MASK, and
651 * notify its parent. See get_signal_to_deliver().
637 */ 652 */
638 p->signal->flags = SIGNAL_STOP_CONTINUED; 653 signal->flags = why | SIGNAL_STOP_CONTINUED;
639 p->signal->group_exit_code = 0; 654 signal->group_stop_count = 0;
640 spin_unlock(&p->sighand->siglock); 655 signal->group_exit_code = 0;
641 do_notify_parent_cldstop(p, CLD_CONTINUED);
642 spin_lock(&p->sighand->siglock);
643 } else { 656 } else {
644 /* 657 /*
645 * We are not stopped, but there could be a stop 658 * We are not stopped, but there could be a stop
646 * signal in the middle of being processed after 659 * signal in the middle of being processed after
647 * being removed from the queue. Clear that too. 660 * being removed from the queue. Clear that too.
648 */ 661 */
649 p->signal->flags = 0; 662 signal->flags &= ~SIGNAL_STOP_DEQUEUED;
650 } 663 }
651 } else if (sig == SIGKILL) { 664 }
665
666 return !sig_ignored(p, sig);
667}
668
669/*
670 * Test if P wants to take SIG. After we've checked all threads with this,
671 * it's equivalent to finding no threads not blocking SIG. Any threads not
672 * blocking SIG were ruled out because they are not running and already
673 * have pending signals. Such threads will dequeue from the shared queue
674 * as soon as they're available, so putting the signal on the shared queue
675 * will be equivalent to sending it to one such thread.
676 */
677static inline int wants_signal(int sig, struct task_struct *p)
678{
679 if (sigismember(&p->blocked, sig))
680 return 0;
681 if (p->flags & PF_EXITING)
682 return 0;
683 if (sig == SIGKILL)
684 return 1;
685 if (task_is_stopped_or_traced(p))
686 return 0;
687 return task_curr(p) || !signal_pending(p);
688}
689
690static void complete_signal(int sig, struct task_struct *p, int group)
691{
692 struct signal_struct *signal = p->signal;
693 struct task_struct *t;
694
695 /*
696 * Now find a thread we can wake up to take the signal off the queue.
697 *
698 * If the main thread wants the signal, it gets first crack.
699 * Probably the least surprising to the average bear.
700 */
701 if (wants_signal(sig, p))
702 t = p;
703 else if (!group || thread_group_empty(p))
704 /*
705 * There is just one thread and it does not need to be woken.
706 * It will dequeue unblocked signals before it runs again.
707 */
708 return;
709 else {
652 /* 710 /*
653 * Make sure that any pending stop signal already dequeued 711 * Otherwise try to find a suitable thread.
654 * is undone by the wakeup for SIGKILL.
655 */ 712 */
656 p->signal->flags = 0; 713 t = signal->curr_target;
714 while (!wants_signal(sig, t)) {
715 t = next_thread(t);
716 if (t == signal->curr_target)
717 /*
718 * No thread needs to be woken.
719 * Any eligible threads will see
720 * the signal in the queue soon.
721 */
722 return;
723 }
724 signal->curr_target = t;
657 } 725 }
726
727 /*
728 * Found a killable thread. If the signal will be fatal,
729 * then start taking the whole group down immediately.
730 */
731 if (sig_fatal(p, sig) &&
732 !(signal->flags & (SIGNAL_UNKILLABLE | SIGNAL_GROUP_EXIT)) &&
733 !sigismember(&t->real_blocked, sig) &&
734 (sig == SIGKILL || !(t->ptrace & PT_PTRACED))) {
735 /*
736 * This signal will be fatal to the whole group.
737 */
738 if (!sig_kernel_coredump(sig)) {
739 /*
740 * Start a group exit and wake everybody up.
741 * This way we don't have other threads
742 * running and doing things after a slower
743 * thread has the fatal signal pending.
744 */
745 signal->flags = SIGNAL_GROUP_EXIT;
746 signal->group_exit_code = sig;
747 signal->group_stop_count = 0;
748 t = p;
749 do {
750 sigaddset(&t->pending.signal, SIGKILL);
751 signal_wake_up(t, 1);
752 } while_each_thread(p, t);
753 return;
754 }
755 }
756
757 /*
758 * The signal is already in the shared-pending queue.
759 * Tell the chosen thread to wake up and dequeue it.
760 */
761 signal_wake_up(t, sig == SIGKILL);
762 return;
763}
764
765static inline int legacy_queue(struct sigpending *signals, int sig)
766{
767 return (sig < SIGRTMIN) && sigismember(&signals->signal, sig);
658} 768}
659 769
660static int send_signal(int sig, struct siginfo *info, struct task_struct *t, 770static int send_signal(int sig, struct siginfo *info, struct task_struct *t,
661 struct sigpending *signals) 771 int group)
662{ 772{
663 struct sigqueue * q = NULL; 773 struct sigpending *pending;
664 int ret = 0; 774 struct sigqueue *q;
775
776 assert_spin_locked(&t->sighand->siglock);
777 if (!prepare_signal(sig, t))
778 return 0;
665 779
780 pending = group ? &t->signal->shared_pending : &t->pending;
666 /* 781 /*
667 * Deliver the signal to listening signalfds. This must be called 782 * Short-circuit ignored signals and support queuing
668 * with the sighand lock held. 783 * exactly one non-rt signal, so that we can get more
784 * detailed information about the cause of the signal.
669 */ 785 */
670 signalfd_notify(t, sig); 786 if (legacy_queue(pending, sig))
671 787 return 0;
672 /* 788 /*
673 * fast-pathed signals for kernel-internal things like SIGSTOP 789 * fast-pathed signals for kernel-internal things like SIGSTOP
674 * or SIGKILL. 790 * or SIGKILL.
@@ -688,7 +804,7 @@ static int send_signal(int sig, struct siginfo *info, struct task_struct *t,
688 (is_si_special(info) || 804 (is_si_special(info) ||
689 info->si_code >= 0))); 805 info->si_code >= 0)));
690 if (q) { 806 if (q) {
691 list_add_tail(&q->list, &signals->list); 807 list_add_tail(&q->list, &pending->list);
692 switch ((unsigned long) info) { 808 switch ((unsigned long) info) {
693 case (unsigned long) SEND_SIG_NOINFO: 809 case (unsigned long) SEND_SIG_NOINFO:
694 q->info.si_signo = sig; 810 q->info.si_signo = sig;
@@ -718,13 +834,12 @@ static int send_signal(int sig, struct siginfo *info, struct task_struct *t,
718 } 834 }
719 835
720out_set: 836out_set:
721 sigaddset(&signals->signal, sig); 837 signalfd_notify(t, sig);
722 return ret; 838 sigaddset(&pending->signal, sig);
839 complete_signal(sig, t, group);
840 return 0;
723} 841}
724 842
725#define LEGACY_QUEUE(sigptr, sig) \
726 (((sig) < SIGRTMIN) && sigismember(&(sigptr)->signal, (sig)))
727
728int print_fatal_signals; 843int print_fatal_signals;
729 844
730static void print_fatal_signal(struct pt_regs *regs, int signr) 845static void print_fatal_signal(struct pt_regs *regs, int signr)
@@ -757,29 +872,16 @@ static int __init setup_print_fatal_signals(char *str)
757 872
758__setup("print-fatal-signals=", setup_print_fatal_signals); 873__setup("print-fatal-signals=", setup_print_fatal_signals);
759 874
875int
876__group_send_sig_info(int sig, struct siginfo *info, struct task_struct *p)
877{
878 return send_signal(sig, info, p, 1);
879}
880
760static int 881static int
761specific_send_sig_info(int sig, struct siginfo *info, struct task_struct *t) 882specific_send_sig_info(int sig, struct siginfo *info, struct task_struct *t)
762{ 883{
763 int ret = 0; 884 return send_signal(sig, info, t, 0);
764
765 BUG_ON(!irqs_disabled());
766 assert_spin_locked(&t->sighand->siglock);
767
768 /* Short-circuit ignored signals. */
769 if (sig_ignored(t, sig))
770 goto out;
771
772 /* Support queueing exactly one non-rt signal, so that we
773 can get more detailed information about the cause of
774 the signal. */
775 if (LEGACY_QUEUE(&t->pending, sig))
776 goto out;
777
778 ret = send_signal(sig, info, t, &t->pending);
779 if (!ret && !sigismember(&t->blocked, sig))
780 signal_wake_up(t, sig == SIGKILL);
781out:
782 return ret;
783} 885}
784 886
785/* 887/*
@@ -790,7 +892,8 @@ out:
790 * since we do not want to have a signal handler that was blocked 892 * since we do not want to have a signal handler that was blocked
791 * be invoked when user space had explicitly blocked it. 893 * be invoked when user space had explicitly blocked it.
792 * 894 *
793 * We don't want to have recursive SIGSEGV's etc, for example. 895 * We don't want to have recursive SIGSEGV's etc, for example,
896 * that is why we also clear SIGNAL_UNKILLABLE.
794 */ 897 */
795int 898int
796force_sig_info(int sig, struct siginfo *info, struct task_struct *t) 899force_sig_info(int sig, struct siginfo *info, struct task_struct *t)
@@ -810,6 +913,8 @@ force_sig_info(int sig, struct siginfo *info, struct task_struct *t)
810 recalc_sigpending_and_wake(t); 913 recalc_sigpending_and_wake(t);
811 } 914 }
812 } 915 }
916 if (action->sa.sa_handler == SIG_DFL)
917 t->signal->flags &= ~SIGNAL_UNKILLABLE;
813 ret = specific_send_sig_info(sig, info, t); 918 ret = specific_send_sig_info(sig, info, t);
814 spin_unlock_irqrestore(&t->sighand->siglock, flags); 919 spin_unlock_irqrestore(&t->sighand->siglock, flags);
815 920
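
Although send_signal() now serves both the per-thread and the process-wide case, the queuing semantics earlier in this file are unchanged: legacy_queue() still lets a classic (non-realtime) signal exist at most once on a pending queue, while realtime signals are queued per instance. That rule is directly observable from userspace; the short test below blocks one signal of each kind, sends each twice, then unblocks them. It is illustrative only, and the exact delivery order is unspecified.

#include <signal.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

static volatile sig_atomic_t hits[128];

static void handler(int sig)
{
        hits[sig]++;
}

int main(void)
{
        struct sigaction sa;
        sigset_t block;

        memset(&sa, 0, sizeof(sa));
        sa.sa_handler = handler;
        sigemptyset(&sa.sa_mask);
        sigaction(SIGTERM, &sa, NULL);
        sigaction(SIGRTMIN, &sa, NULL);

        sigemptyset(&block);
        sigaddset(&block, SIGTERM);
        sigaddset(&block, SIGRTMIN);
        sigprocmask(SIG_BLOCK, &block, NULL);      /* hold both signals pending */

        kill(getpid(), SIGTERM);
        kill(getpid(), SIGTERM);                   /* coalesced with the pending one */
        kill(getpid(), SIGRTMIN);
        kill(getpid(), SIGRTMIN);                  /* queued as a second instance    */

        sigprocmask(SIG_UNBLOCK, &block, NULL);    /* deliver whatever is pending */

        printf("SIGTERM: %d, SIGRTMIN: %d\n", (int)hits[SIGTERM], (int)hits[SIGRTMIN]);
        return 0;
}

Typically this prints one SIGTERM delivery against two SIGRTMIN deliveries, which is the "exactly one non-rt signal" behaviour the comment in send_signal() refers to.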
@@ -823,134 +928,6 @@ force_sig_specific(int sig, struct task_struct *t)
823} 928}
824 929
825/* 930/*
826 * Test if P wants to take SIG. After we've checked all threads with this,
827 * it's equivalent to finding no threads not blocking SIG. Any threads not
828 * blocking SIG were ruled out because they are not running and already
829 * have pending signals. Such threads will dequeue from the shared queue
830 * as soon as they're available, so putting the signal on the shared queue
831 * will be equivalent to sending it to one such thread.
832 */
833static inline int wants_signal(int sig, struct task_struct *p)
834{
835 if (sigismember(&p->blocked, sig))
836 return 0;
837 if (p->flags & PF_EXITING)
838 return 0;
839 if (sig == SIGKILL)
840 return 1;
841 if (task_is_stopped_or_traced(p))
842 return 0;
843 return task_curr(p) || !signal_pending(p);
844}
845
846static void
847__group_complete_signal(int sig, struct task_struct *p)
848{
849 struct task_struct *t;
850
851 /*
852 * Now find a thread we can wake up to take the signal off the queue.
853 *
854 * If the main thread wants the signal, it gets first crack.
855 * Probably the least surprising to the average bear.
856 */
857 if (wants_signal(sig, p))
858 t = p;
859 else if (thread_group_empty(p))
860 /*
861 * There is just one thread and it does not need to be woken.
862 * It will dequeue unblocked signals before it runs again.
863 */
864 return;
865 else {
866 /*
867 * Otherwise try to find a suitable thread.
868 */
869 t = p->signal->curr_target;
870 if (t == NULL)
871 /* restart balancing at this thread */
872 t = p->signal->curr_target = p;
873
874 while (!wants_signal(sig, t)) {
875 t = next_thread(t);
876 if (t == p->signal->curr_target)
877 /*
878 * No thread needs to be woken.
879 * Any eligible threads will see
880 * the signal in the queue soon.
881 */
882 return;
883 }
884 p->signal->curr_target = t;
885 }
886
887 /*
888 * Found a killable thread. If the signal will be fatal,
889 * then start taking the whole group down immediately.
890 */
891 if (sig_fatal(p, sig) && !(p->signal->flags & SIGNAL_GROUP_EXIT) &&
892 !sigismember(&t->real_blocked, sig) &&
893 (sig == SIGKILL || !(t->ptrace & PT_PTRACED))) {
894 /*
895 * This signal will be fatal to the whole group.
896 */
897 if (!sig_kernel_coredump(sig)) {
898 /*
899 * Start a group exit and wake everybody up.
900 * This way we don't have other threads
901 * running and doing things after a slower
902 * thread has the fatal signal pending.
903 */
904 p->signal->flags = SIGNAL_GROUP_EXIT;
905 p->signal->group_exit_code = sig;
906 p->signal->group_stop_count = 0;
907 t = p;
908 do {
909 sigaddset(&t->pending.signal, SIGKILL);
910 signal_wake_up(t, 1);
911 } while_each_thread(p, t);
912 return;
913 }
914 }
915
916 /*
917 * The signal is already in the shared-pending queue.
918 * Tell the chosen thread to wake up and dequeue it.
919 */
920 signal_wake_up(t, sig == SIGKILL);
921 return;
922}
923
924int
925__group_send_sig_info(int sig, struct siginfo *info, struct task_struct *p)
926{
927 int ret = 0;
928
929 assert_spin_locked(&p->sighand->siglock);
930 handle_stop_signal(sig, p);
931
932 /* Short-circuit ignored signals. */
933 if (sig_ignored(p, sig))
934 return ret;
935
936 if (LEGACY_QUEUE(&p->signal->shared_pending, sig))
937 /* This is a non-RT signal and we already have one queued. */
938 return ret;
939
940 /*
941 * Put this signal on the shared-pending queue, or fail with EAGAIN.
942 * We always use the shared queue for process-wide signals,
943 * to avoid several races.
944 */
945 ret = send_signal(sig, info, p, &p->signal->shared_pending);
946 if (unlikely(ret))
947 return ret;
948
949 __group_complete_signal(sig, p);
950 return 0;
951}
952
953/*
954 * Nuke all other threads in the group. 931 * Nuke all other threads in the group.
955 */ 932 */
956void zap_other_threads(struct task_struct *p) 933void zap_other_threads(struct task_struct *p)
@@ -978,13 +955,11 @@ int __fatal_signal_pending(struct task_struct *tsk)
978} 955}
979EXPORT_SYMBOL(__fatal_signal_pending); 956EXPORT_SYMBOL(__fatal_signal_pending);
980 957
981/*
982 * Must be called under rcu_read_lock() or with tasklist_lock read-held.
983 */
984struct sighand_struct *lock_task_sighand(struct task_struct *tsk, unsigned long *flags) 958struct sighand_struct *lock_task_sighand(struct task_struct *tsk, unsigned long *flags)
985{ 959{
986 struct sighand_struct *sighand; 960 struct sighand_struct *sighand;
987 961
962 rcu_read_lock();
988 for (;;) { 963 for (;;) {
989 sighand = rcu_dereference(tsk->sighand); 964 sighand = rcu_dereference(tsk->sighand);
990 if (unlikely(sighand == NULL)) 965 if (unlikely(sighand == NULL))
@@ -995,6 +970,7 @@ struct sighand_struct *lock_task_sighand(struct task_struct *tsk, unsigned long
995 break; 970 break;
996 spin_unlock_irqrestore(&sighand->siglock, *flags); 971 spin_unlock_irqrestore(&sighand->siglock, *flags);
997 } 972 }
973 rcu_read_unlock();
998 974
999 return sighand; 975 return sighand;
1000} 976}
@@ -1043,9 +1019,6 @@ int kill_pid_info(int sig, struct siginfo *info, struct pid *pid)
1043 struct task_struct *p; 1019 struct task_struct *p;
1044 1020
1045 rcu_read_lock(); 1021 rcu_read_lock();
1046 if (unlikely(sig_needs_tasklist(sig)))
1047 read_lock(&tasklist_lock);
1048
1049retry: 1022retry:
1050 p = pid_task(pid, PIDTYPE_PID); 1023 p = pid_task(pid, PIDTYPE_PID);
1051 if (p) { 1024 if (p) {
@@ -1059,10 +1032,8 @@ retry:
1059 */ 1032 */
1060 goto retry; 1033 goto retry;
1061 } 1034 }
1062
1063 if (unlikely(sig_needs_tasklist(sig)))
1064 read_unlock(&tasklist_lock);
1065 rcu_read_unlock(); 1035 rcu_read_unlock();
1036
1066 return error; 1037 return error;
1067} 1038}
1068 1039
@@ -1159,8 +1130,7 @@ static int kill_something_info(int sig, struct siginfo *info, int pid)
1159 */ 1130 */
1160 1131
1161/* 1132/*
1162 * These two are the most common entry points. They send a signal 1133 * The caller must ensure the task can't exit.
1163 * just to the specific thread.
1164 */ 1134 */
1165int 1135int
1166send_sig_info(int sig, struct siginfo *info, struct task_struct *p) 1136send_sig_info(int sig, struct siginfo *info, struct task_struct *p)
@@ -1175,17 +1145,9 @@ send_sig_info(int sig, struct siginfo *info, struct task_struct *p)
1175 if (!valid_signal(sig)) 1145 if (!valid_signal(sig))
1176 return -EINVAL; 1146 return -EINVAL;
1177 1147
1178 /*
1179 * We need the tasklist lock even for the specific
1180 * thread case (when we don't need to follow the group
1181 * lists) in order to avoid races with "p->sighand"
1182 * going away or changing from under us.
1183 */
1184 read_lock(&tasklist_lock);
1185 spin_lock_irqsave(&p->sighand->siglock, flags); 1148 spin_lock_irqsave(&p->sighand->siglock, flags);
1186 ret = specific_send_sig_info(sig, info, p); 1149 ret = specific_send_sig_info(sig, info, p);
1187 spin_unlock_irqrestore(&p->sighand->siglock, flags); 1150 spin_unlock_irqrestore(&p->sighand->siglock, flags);
1188 read_unlock(&tasklist_lock);
1189 return ret; 1151 return ret;
1190} 1152}
1191 1153
@@ -1291,28 +1253,24 @@ void sigqueue_free(struct sigqueue *q)
1291 __sigqueue_free(q); 1253 __sigqueue_free(q);
1292} 1254}
1293 1255
1294int send_sigqueue(int sig, struct sigqueue *q, struct task_struct *p) 1256int send_sigqueue(struct sigqueue *q, struct task_struct *t, int group)
1295{ 1257{
1258 int sig = q->info.si_signo;
1259 struct sigpending *pending;
1296 unsigned long flags; 1260 unsigned long flags;
1297 int ret = 0; 1261 int ret;
1298 1262
1299 BUG_ON(!(q->flags & SIGQUEUE_PREALLOC)); 1263 BUG_ON(!(q->flags & SIGQUEUE_PREALLOC));
1300 1264
1301 /* 1265 ret = -1;
1302 * The rcu based delayed sighand destroy makes it possible to 1266 if (!likely(lock_task_sighand(t, &flags)))
1303 * run this without tasklist lock held. The task struct itself 1267 goto ret;
1304 * cannot go away as create_timer did get_task_struct().
1305 *
1306 * We return -1, when the task is marked exiting, so
1307 * posix_timer_event can redirect it to the group leader
1308 */
1309 rcu_read_lock();
1310 1268
1311 if (!likely(lock_task_sighand(p, &flags))) { 1269 ret = 1; /* the signal is ignored */
1312 ret = -1; 1270 if (!prepare_signal(sig, t))
1313 goto out_err; 1271 goto out;
1314 }
1315 1272
1273 ret = 0;
1316 if (unlikely(!list_empty(&q->list))) { 1274 if (unlikely(!list_empty(&q->list))) {
1317 /* 1275 /*
1318 * If an SI_TIMER entry is already queue just increment 1276 * If an SI_TIMER entry is already queue just increment
@@ -1322,77 +1280,15 @@ int send_sigqueue(int sig, struct sigqueue *q, struct task_struct *p)
1322 q->info.si_overrun++; 1280 q->info.si_overrun++;
1323 goto out; 1281 goto out;
1324 } 1282 }
1325 /* Short-circuit ignored signals. */
1326 if (sig_ignored(p, sig)) {
1327 ret = 1;
1328 goto out;
1329 }
1330 /*
1331 * Deliver the signal to listening signalfds. This must be called
1332 * with the sighand lock held.
1333 */
1334 signalfd_notify(p, sig);
1335
1336 list_add_tail(&q->list, &p->pending.list);
1337 sigaddset(&p->pending.signal, sig);
1338 if (!sigismember(&p->blocked, sig))
1339 signal_wake_up(p, sig == SIGKILL);
1340
1341out:
1342 unlock_task_sighand(p, &flags);
1343out_err:
1344 rcu_read_unlock();
1345
1346 return ret;
1347}
1348
1349int
1350send_group_sigqueue(int sig, struct sigqueue *q, struct task_struct *p)
1351{
1352 unsigned long flags;
1353 int ret = 0;
1354
1355 BUG_ON(!(q->flags & SIGQUEUE_PREALLOC));
1356
1357 read_lock(&tasklist_lock);
1358 /* Since it_lock is held, p->sighand cannot be NULL. */
1359 spin_lock_irqsave(&p->sighand->siglock, flags);
1360 handle_stop_signal(sig, p);
1361
1362 /* Short-circuit ignored signals. */
1363 if (sig_ignored(p, sig)) {
1364 ret = 1;
1365 goto out;
1366 }
1367 1283
1368 if (unlikely(!list_empty(&q->list))) { 1284 signalfd_notify(t, sig);
1369 /* 1285 pending = group ? &t->signal->shared_pending : &t->pending;
1370 * If an SI_TIMER entry is already queue just increment 1286 list_add_tail(&q->list, &pending->list);
1371 * the overrun count. Other uses should not try to 1287 sigaddset(&pending->signal, sig);
1372 * send the signal multiple times. 1288 complete_signal(sig, t, group);
1373 */
1374 BUG_ON(q->info.si_code != SI_TIMER);
1375 q->info.si_overrun++;
1376 goto out;
1377 }
1378 /*
1379 * Deliver the signal to listening signalfds. This must be called
1380 * with the sighand lock held.
1381 */
1382 signalfd_notify(p, sig);
1383
1384 /*
1385 * Put this signal on the shared-pending queue.
1386 * We always use the shared queue for process-wide signals,
1387 * to avoid several races.
1388 */
1389 list_add_tail(&q->list, &p->signal->shared_pending.list);
1390 sigaddset(&p->signal->shared_pending.signal, sig);
1391
1392 __group_complete_signal(sig, p);
1393out: 1289out:
1394 spin_unlock_irqrestore(&p->sighand->siglock, flags); 1290 unlock_task_sighand(t, &flags);
1395 read_unlock(&tasklist_lock); 1291ret:
1396 return ret; 1292 return ret;
1397} 1293}
1398 1294
@@ -1723,8 +1619,9 @@ static int do_signal_stop(int signr)
1723 } else { 1619 } else {
1724 struct task_struct *t; 1620 struct task_struct *t;
1725 1621
1726 if (!likely(sig->flags & SIGNAL_STOP_DEQUEUED) || 1622 if (unlikely((sig->flags & (SIGNAL_STOP_DEQUEUED | SIGNAL_UNKILLABLE))
1727 unlikely(sig->group_exit_task)) 1623 != SIGNAL_STOP_DEQUEUED) ||
1624 unlikely(signal_group_exit(sig)))
1728 return 0; 1625 return 0;
1729 /* 1626 /*
1730 * There is no group stop already in progress. 1627 * There is no group stop already in progress.
@@ -1799,8 +1696,9 @@ static int ptrace_signal(int signr, siginfo_t *info,
1799int get_signal_to_deliver(siginfo_t *info, struct k_sigaction *return_ka, 1696int get_signal_to_deliver(siginfo_t *info, struct k_sigaction *return_ka,
1800 struct pt_regs *regs, void *cookie) 1697 struct pt_regs *regs, void *cookie)
1801{ 1698{
1802 sigset_t *mask = &current->blocked; 1699 struct sighand_struct *sighand = current->sighand;
1803 int signr = 0; 1700 struct signal_struct *signal = current->signal;
1701 int signr;
1804 1702
1805relock: 1703relock:
1806 /* 1704 /*
@@ -1811,16 +1709,32 @@ relock:
1811 */ 1709 */
1812 try_to_freeze(); 1710 try_to_freeze();
1813 1711
1814 spin_lock_irq(&current->sighand->siglock); 1712 spin_lock_irq(&sighand->siglock);
1713 /*
1714 * Every stopped thread goes here after wakeup. Check to see if
1715 * we should notify the parent, prepare_signal(SIGCONT) encodes
1716 * the CLD_ si_code into SIGNAL_CLD_MASK bits.
1717 */
1718 if (unlikely(signal->flags & SIGNAL_CLD_MASK)) {
1719 int why = (signal->flags & SIGNAL_STOP_CONTINUED)
1720 ? CLD_CONTINUED : CLD_STOPPED;
1721 signal->flags &= ~SIGNAL_CLD_MASK;
1722 spin_unlock_irq(&sighand->siglock);
1723
1724 read_lock(&tasklist_lock);
1725 do_notify_parent_cldstop(current->group_leader, why);
1726 read_unlock(&tasklist_lock);
1727 goto relock;
1728 }
1729
1815 for (;;) { 1730 for (;;) {
1816 struct k_sigaction *ka; 1731 struct k_sigaction *ka;
1817 1732
1818 if (unlikely(current->signal->group_stop_count > 0) && 1733 if (unlikely(signal->group_stop_count > 0) &&
1819 do_signal_stop(0)) 1734 do_signal_stop(0))
1820 goto relock; 1735 goto relock;
1821 1736
1822 signr = dequeue_signal(current, mask, info); 1737 signr = dequeue_signal(current, &current->blocked, info);
1823
1824 if (!signr) 1738 if (!signr)
1825 break; /* will return 0 */ 1739 break; /* will return 0 */
1826 1740
@@ -1830,7 +1744,7 @@ relock:
1830 continue; 1744 continue;
1831 } 1745 }
1832 1746
1833 ka = &current->sighand->action[signr-1]; 1747 ka = &sighand->action[signr-1];
1834 if (ka->sa.sa_handler == SIG_IGN) /* Do nothing. */ 1748 if (ka->sa.sa_handler == SIG_IGN) /* Do nothing. */
1835 continue; 1749 continue;
1836 if (ka->sa.sa_handler != SIG_DFL) { 1750 if (ka->sa.sa_handler != SIG_DFL) {
@@ -1852,7 +1766,8 @@ relock:
1852 /* 1766 /*
1853 * Global init gets no signals it doesn't want. 1767 * Global init gets no signals it doesn't want.
1854 */ 1768 */
1855 if (is_global_init(current)) 1769 if (unlikely(signal->flags & SIGNAL_UNKILLABLE) &&
1770 !signal_group_exit(signal))
1856 continue; 1771 continue;
1857 1772
1858 if (sig_kernel_stop(signr)) { 1773 if (sig_kernel_stop(signr)) {
@@ -1867,14 +1782,14 @@ relock:
1867 * We need to check for that and bail out if necessary. 1782 * We need to check for that and bail out if necessary.
1868 */ 1783 */
1869 if (signr != SIGSTOP) { 1784 if (signr != SIGSTOP) {
1870 spin_unlock_irq(&current->sighand->siglock); 1785 spin_unlock_irq(&sighand->siglock);
1871 1786
1872 /* signals can be posted during this window */ 1787 /* signals can be posted during this window */
1873 1788
1874 if (is_current_pgrp_orphaned()) 1789 if (is_current_pgrp_orphaned())
1875 goto relock; 1790 goto relock;
1876 1791
1877 spin_lock_irq(&current->sighand->siglock); 1792 spin_lock_irq(&sighand->siglock);
1878 } 1793 }
1879 1794
1880 if (likely(do_signal_stop(signr))) { 1795 if (likely(do_signal_stop(signr))) {
@@ -1889,15 +1804,16 @@ relock:
1889 continue; 1804 continue;
1890 } 1805 }
1891 1806
1892 spin_unlock_irq(&current->sighand->siglock); 1807 spin_unlock_irq(&sighand->siglock);
1893 1808
1894 /* 1809 /*
1895 * Anything else is fatal, maybe with a core dump. 1810 * Anything else is fatal, maybe with a core dump.
1896 */ 1811 */
1897 current->flags |= PF_SIGNALED; 1812 current->flags |= PF_SIGNALED;
1898 if ((signr != SIGKILL) && print_fatal_signals) 1813
1899 print_fatal_signal(regs, signr);
1900 if (sig_kernel_coredump(signr)) { 1814 if (sig_kernel_coredump(signr)) {
1815 if (print_fatal_signals)
1816 print_fatal_signal(regs, signr);
1901 /* 1817 /*
1902 * If it was able to dump core, this kills all 1818 * If it was able to dump core, this kills all
1903 * other threads in the group and synchronizes with 1819 * other threads in the group and synchronizes with
@@ -1915,7 +1831,7 @@ relock:
1915 do_group_exit(signr); 1831 do_group_exit(signr);
1916 /* NOTREACHED */ 1832 /* NOTREACHED */
1917 } 1833 }
1918 spin_unlock_irq(&current->sighand->siglock); 1834 spin_unlock_irq(&sighand->siglock);
1919 return signr; 1835 return signr;
1920} 1836}
1921 1837
@@ -2259,6 +2175,7 @@ static int do_tkill(int tgid, int pid, int sig)
2259 int error; 2175 int error;
2260 struct siginfo info; 2176 struct siginfo info;
2261 struct task_struct *p; 2177 struct task_struct *p;
2178 unsigned long flags;
2262 2179
2263 error = -ESRCH; 2180 error = -ESRCH;
2264 info.si_signo = sig; 2181 info.si_signo = sig;
@@ -2267,22 +2184,24 @@ static int do_tkill(int tgid, int pid, int sig)
2267 info.si_pid = task_tgid_vnr(current); 2184 info.si_pid = task_tgid_vnr(current);
2268 info.si_uid = current->uid; 2185 info.si_uid = current->uid;
2269 2186
2270 read_lock(&tasklist_lock); 2187 rcu_read_lock();
2271 p = find_task_by_vpid(pid); 2188 p = find_task_by_vpid(pid);
2272 if (p && (tgid <= 0 || task_tgid_vnr(p) == tgid)) { 2189 if (p && (tgid <= 0 || task_tgid_vnr(p) == tgid)) {
2273 error = check_kill_permission(sig, &info, p); 2190 error = check_kill_permission(sig, &info, p);
2274 /* 2191 /*
2275 * The null signal is a permissions and process existence 2192 * The null signal is a permissions and process existence
2276 * probe. No signal is actually delivered. 2193 * probe. No signal is actually delivered.
2194 *
2195 * If lock_task_sighand() fails we pretend the task dies
2196 * after receiving the signal. The window is tiny, and the
2197 * signal is private anyway.
2277 */ 2198 */
2278 if (!error && sig && p->sighand) { 2199 if (!error && sig && lock_task_sighand(p, &flags)) {
2279 spin_lock_irq(&p->sighand->siglock);
2280 handle_stop_signal(sig, p);
2281 error = specific_send_sig_info(sig, &info, p); 2200 error = specific_send_sig_info(sig, &info, p);
2282 spin_unlock_irq(&p->sighand->siglock); 2201 unlock_task_sighand(p, &flags);
2283 } 2202 }
2284 } 2203 }
2285 read_unlock(&tasklist_lock); 2204 rcu_read_unlock();
2286 2205
2287 return error; 2206 return error;
2288} 2207}
@@ -2339,13 +2258,14 @@ sys_rt_sigqueueinfo(int pid, int sig, siginfo_t __user *uinfo)
2339 2258
2340int do_sigaction(int sig, struct k_sigaction *act, struct k_sigaction *oact) 2259int do_sigaction(int sig, struct k_sigaction *act, struct k_sigaction *oact)
2341{ 2260{
2261 struct task_struct *t = current;
2342 struct k_sigaction *k; 2262 struct k_sigaction *k;
2343 sigset_t mask; 2263 sigset_t mask;
2344 2264
2345 if (!valid_signal(sig) || sig < 1 || (act && sig_kernel_only(sig))) 2265 if (!valid_signal(sig) || sig < 1 || (act && sig_kernel_only(sig)))
2346 return -EINVAL; 2266 return -EINVAL;
2347 2267
2348 k = &current->sighand->action[sig-1]; 2268 k = &t->sighand->action[sig-1];
2349 2269
2350 spin_lock_irq(&current->sighand->siglock); 2270 spin_lock_irq(&current->sighand->siglock);
2351 if (oact) 2271 if (oact)
@@ -2366,9 +2286,7 @@ int do_sigaction(int sig, struct k_sigaction *act, struct k_sigaction *oact)
2366 * (for example, SIGCHLD), shall cause the pending signal to 2286 * (for example, SIGCHLD), shall cause the pending signal to
2367 * be discarded, whether or not it is blocked" 2287 * be discarded, whether or not it is blocked"
2368 */ 2288 */
2369 if (act->sa.sa_handler == SIG_IGN || 2289 if (__sig_ignored(t, sig)) {
2370 (act->sa.sa_handler == SIG_DFL && sig_kernel_ignore(sig))) {
2371 struct task_struct *t = current;
2372 sigemptyset(&mask); 2290 sigemptyset(&mask);
2373 sigaddset(&mask, sig); 2291 sigaddset(&mask, sig);
2374 rm_from_queue_full(&mask, &t->signal->shared_pending); 2292 rm_from_queue_full(&mask, &t->signal->shared_pending);
@@ -2623,7 +2541,7 @@ asmlinkage long sys_rt_sigsuspend(sigset_t __user *unewset, size_t sigsetsize)
2623 2541
2624 current->state = TASK_INTERRUPTIBLE; 2542 current->state = TASK_INTERRUPTIBLE;
2625 schedule(); 2543 schedule();
2626 set_thread_flag(TIF_RESTORE_SIGMASK); 2544 set_restore_sigmask();
2627 return -ERESTARTNOHAND; 2545 return -ERESTARTNOHAND;
2628} 2546}
2629#endif /* __ARCH_WANT_SYS_RT_SIGSUSPEND */ 2547#endif /* __ARCH_WANT_SYS_RT_SIGSUSPEND */
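The kernel/signal.c hunks above fold the private and group delivery paths into a single send_signal()/complete_signal() pair, drop tasklist_lock from the sending side, and make lock_task_sighand() take rcu_read_lock()/rcu_read_unlock() itself. A minimal sketch of the caller pattern this converges on, modeled on the do_tkill() change above (sketch_send_private_signal is a hypothetical name; specific_send_sig_info() is static to signal.c, so this only illustrates in-file use):

static int sketch_send_private_signal(pid_t vpid, int sig, struct siginfo *info)
{
	struct task_struct *p;
	unsigned long flags;
	int error = -ESRCH;

	rcu_read_lock();			/* keeps the looked-up task valid */
	p = find_task_by_vpid(vpid);
	if (p && lock_task_sighand(p, &flags)) {
		error = specific_send_sig_info(sig, info, p);
		unlock_task_sighand(p, &flags);
	}
	rcu_read_unlock();
	return error;
}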
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 3c44956ee7e2..36e061740047 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -589,16 +589,20 @@ static void takeover_tasklets(unsigned int cpu)
589 local_irq_disable(); 589 local_irq_disable();
590 590
591 /* Find end, append list for that CPU. */ 591 /* Find end, append list for that CPU. */
592 *__get_cpu_var(tasklet_vec).tail = per_cpu(tasklet_vec, cpu).head; 592 if (&per_cpu(tasklet_vec, cpu).head != per_cpu(tasklet_vec, cpu).tail) {
593 __get_cpu_var(tasklet_vec).tail = per_cpu(tasklet_vec, cpu).tail; 593 *(__get_cpu_var(tasklet_vec).tail) = per_cpu(tasklet_vec, cpu).head;
594 per_cpu(tasklet_vec, cpu).head = NULL; 594 __get_cpu_var(tasklet_vec).tail = per_cpu(tasklet_vec, cpu).tail;
595 per_cpu(tasklet_vec, cpu).tail = &per_cpu(tasklet_vec, cpu).head; 595 per_cpu(tasklet_vec, cpu).head = NULL;
596 per_cpu(tasklet_vec, cpu).tail = &per_cpu(tasklet_vec, cpu).head;
597 }
596 raise_softirq_irqoff(TASKLET_SOFTIRQ); 598 raise_softirq_irqoff(TASKLET_SOFTIRQ);
597 599
598 *__get_cpu_var(tasklet_hi_vec).tail = per_cpu(tasklet_hi_vec, cpu).head; 600 if (&per_cpu(tasklet_hi_vec, cpu).head != per_cpu(tasklet_hi_vec, cpu).tail) {
599 __get_cpu_var(tasklet_hi_vec).tail = per_cpu(tasklet_hi_vec, cpu).tail; 601 *__get_cpu_var(tasklet_hi_vec).tail = per_cpu(tasklet_hi_vec, cpu).head;
600 per_cpu(tasklet_hi_vec, cpu).head = NULL; 602 __get_cpu_var(tasklet_hi_vec).tail = per_cpu(tasklet_hi_vec, cpu).tail;
601 per_cpu(tasklet_hi_vec, cpu).tail = &per_cpu(tasklet_hi_vec, cpu).head; 603 per_cpu(tasklet_hi_vec, cpu).head = NULL;
604 per_cpu(tasklet_hi_vec, cpu).tail = &per_cpu(tasklet_hi_vec, cpu).head;
605 }
602 raise_softirq_irqoff(HI_SOFTIRQ); 606 raise_softirq_irqoff(HI_SOFTIRQ);
603 607
604 local_irq_enable(); 608 local_irq_enable();
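The takeover_tasklets() change guards each splice with an emptiness check. The point is the tail pointer: an empty per-CPU list has tail == &head, so splicing it unconditionally would leave the local tail pointing into the dead CPU's per-cpu area, and later tasklet_schedule() calls would append to the wrong list. A stripped-down sketch of the same head/tail-pointer splice (sketch_* names are illustrative only):

struct sketch_item {
	struct sketch_item *next;
};

struct sketch_list {
	struct sketch_item *head;
	struct sketch_item **tail;	/* &head when empty, else &last->next */
};

static void sketch_splice(struct sketch_list *dst, struct sketch_list *src)
{
	if (src->tail == &src->head)	/* empty source: nothing to take over */
		return;

	*dst->tail = src->head;		/* hook src's chain onto dst's last ->next */
	dst->tail = src->tail;		/* dst now ends where src ended */
	src->head = NULL;		/* put src back into the empty state */
	src->tail = &src->head;
}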
diff --git a/kernel/sys.c b/kernel/sys.c
index f2a451366953..895d2d4c9493 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -978,8 +978,7 @@ asmlinkage long sys_setpgid(pid_t pid, pid_t pgid)
978 goto out; 978 goto out;
979 979
980 if (task_pgrp(p) != pgrp) { 980 if (task_pgrp(p) != pgrp) {
981 detach_pid(p, PIDTYPE_PGID); 981 change_pid(p, PIDTYPE_PGID, pgrp);
982 attach_pid(p, PIDTYPE_PGID, pgrp);
983 set_task_pgrp(p, pid_nr(pgrp)); 982 set_task_pgrp(p, pid_nr(pgrp));
984 } 983 }
985 984
@@ -992,54 +991,67 @@ out:
992 991
993asmlinkage long sys_getpgid(pid_t pid) 992asmlinkage long sys_getpgid(pid_t pid)
994{ 993{
994 struct task_struct *p;
995 struct pid *grp;
996 int retval;
997
998 rcu_read_lock();
995 if (!pid) 999 if (!pid)
996 return task_pgrp_vnr(current); 1000 grp = task_pgrp(current);
997 else { 1001 else {
998 int retval;
999 struct task_struct *p;
1000
1001 read_lock(&tasklist_lock);
1002 p = find_task_by_vpid(pid);
1003 retval = -ESRCH; 1002 retval = -ESRCH;
1004 if (p) { 1003 p = find_task_by_vpid(pid);
1005 retval = security_task_getpgid(p); 1004 if (!p)
1006 if (!retval) 1005 goto out;
1007 retval = task_pgrp_vnr(p); 1006 grp = task_pgrp(p);
1008 } 1007 if (!grp)
1009 read_unlock(&tasklist_lock); 1008 goto out;
1010 return retval; 1009
1010 retval = security_task_getpgid(p);
1011 if (retval)
1012 goto out;
1011 } 1013 }
1014 retval = pid_vnr(grp);
1015out:
1016 rcu_read_unlock();
1017 return retval;
1012} 1018}
1013 1019
1014#ifdef __ARCH_WANT_SYS_GETPGRP 1020#ifdef __ARCH_WANT_SYS_GETPGRP
1015 1021
1016asmlinkage long sys_getpgrp(void) 1022asmlinkage long sys_getpgrp(void)
1017{ 1023{
1018 /* SMP - assuming writes are word atomic this is fine */ 1024 return sys_getpgid(0);
1019 return task_pgrp_vnr(current);
1020} 1025}
1021 1026
1022#endif 1027#endif
1023 1028
1024asmlinkage long sys_getsid(pid_t pid) 1029asmlinkage long sys_getsid(pid_t pid)
1025{ 1030{
1031 struct task_struct *p;
1032 struct pid *sid;
1033 int retval;
1034
1035 rcu_read_lock();
1026 if (!pid) 1036 if (!pid)
1027 return task_session_vnr(current); 1037 sid = task_session(current);
1028 else { 1038 else {
1029 int retval;
1030 struct task_struct *p;
1031
1032 rcu_read_lock();
1033 p = find_task_by_vpid(pid);
1034 retval = -ESRCH; 1039 retval = -ESRCH;
1035 if (p) { 1040 p = find_task_by_vpid(pid);
1036 retval = security_task_getsid(p); 1041 if (!p)
1037 if (!retval) 1042 goto out;
1038 retval = task_session_vnr(p); 1043 sid = task_session(p);
1039 } 1044 if (!sid)
1040 rcu_read_unlock(); 1045 goto out;
1041 return retval; 1046
1047 retval = security_task_getsid(p);
1048 if (retval)
1049 goto out;
1042 } 1050 }
1051 retval = pid_vnr(sid);
1052out:
1053 rcu_read_unlock();
1054 return retval;
1043} 1055}
1044 1056
1045asmlinkage long sys_setsid(void) 1057asmlinkage long sys_setsid(void)
@@ -1545,6 +1557,19 @@ out:
1545 * 1557 *
1546 */ 1558 */
1547 1559
1560static void accumulate_thread_rusage(struct task_struct *t, struct rusage *r,
1561 cputime_t *utimep, cputime_t *stimep)
1562{
1563 *utimep = cputime_add(*utimep, t->utime);
1564 *stimep = cputime_add(*stimep, t->stime);
1565 r->ru_nvcsw += t->nvcsw;
1566 r->ru_nivcsw += t->nivcsw;
1567 r->ru_minflt += t->min_flt;
1568 r->ru_majflt += t->maj_flt;
1569 r->ru_inblock += task_io_get_inblock(t);
1570 r->ru_oublock += task_io_get_oublock(t);
1571}
1572
1548static void k_getrusage(struct task_struct *p, int who, struct rusage *r) 1573static void k_getrusage(struct task_struct *p, int who, struct rusage *r)
1549{ 1574{
1550 struct task_struct *t; 1575 struct task_struct *t;
@@ -1554,12 +1579,14 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r)
1554 memset((char *) r, 0, sizeof *r); 1579 memset((char *) r, 0, sizeof *r);
1555 utime = stime = cputime_zero; 1580 utime = stime = cputime_zero;
1556 1581
1557 rcu_read_lock(); 1582 if (who == RUSAGE_THREAD) {
1558 if (!lock_task_sighand(p, &flags)) { 1583 accumulate_thread_rusage(p, r, &utime, &stime);
1559 rcu_read_unlock(); 1584 goto out;
1560 return;
1561 } 1585 }
1562 1586
1587 if (!lock_task_sighand(p, &flags))
1588 return;
1589
1563 switch (who) { 1590 switch (who) {
1564 case RUSAGE_BOTH: 1591 case RUSAGE_BOTH:
1565 case RUSAGE_CHILDREN: 1592 case RUSAGE_CHILDREN:
@@ -1586,14 +1613,7 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r)
1586 r->ru_oublock += p->signal->oublock; 1613 r->ru_oublock += p->signal->oublock;
1587 t = p; 1614 t = p;
1588 do { 1615 do {
1589 utime = cputime_add(utime, t->utime); 1616 accumulate_thread_rusage(t, r, &utime, &stime);
1590 stime = cputime_add(stime, t->stime);
1591 r->ru_nvcsw += t->nvcsw;
1592 r->ru_nivcsw += t->nivcsw;
1593 r->ru_minflt += t->min_flt;
1594 r->ru_majflt += t->maj_flt;
1595 r->ru_inblock += task_io_get_inblock(t);
1596 r->ru_oublock += task_io_get_oublock(t);
1597 t = next_thread(t); 1617 t = next_thread(t);
1598 } while (t != p); 1618 } while (t != p);
1599 break; 1619 break;
@@ -1601,10 +1621,9 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r)
1601 default: 1621 default:
1602 BUG(); 1622 BUG();
1603 } 1623 }
1604
1605 unlock_task_sighand(p, &flags); 1624 unlock_task_sighand(p, &flags);
1606 rcu_read_unlock();
1607 1625
1626out:
1608 cputime_to_timeval(utime, &r->ru_utime); 1627 cputime_to_timeval(utime, &r->ru_utime);
1609 cputime_to_timeval(stime, &r->ru_stime); 1628 cputime_to_timeval(stime, &r->ru_stime);
1610} 1629}
@@ -1618,7 +1637,8 @@ int getrusage(struct task_struct *p, int who, struct rusage __user *ru)
1618 1637
1619asmlinkage long sys_getrusage(int who, struct rusage __user *ru) 1638asmlinkage long sys_getrusage(int who, struct rusage __user *ru)
1620{ 1639{
1621 if (who != RUSAGE_SELF && who != RUSAGE_CHILDREN) 1640 if (who != RUSAGE_SELF && who != RUSAGE_CHILDREN &&
1641 who != RUSAGE_THREAD)
1622 return -EINVAL; 1642 return -EINVAL;
1623 return getrusage(current, who, ru); 1643 return getrusage(current, who, ru);
1624} 1644}
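The getrusage() hunks factor the per-thread totals into accumulate_thread_rusage() and accept a new RUSAGE_THREAD selector that reports only the calling thread. A hypothetical user-space example follows; the fallback RUSAGE_THREAD value of 1 is an assumption for libc headers that predate this feature:

#include <stdio.h>
#include <sys/resource.h>

#ifndef RUSAGE_THREAD
#define RUSAGE_THREAD	1	/* assumed ABI value; not in older headers */
#endif

int main(void)
{
	struct rusage ru;

	/* Per-thread usage, not the whole process as with RUSAGE_SELF. */
	if (getrusage(RUSAGE_THREAD, &ru) == 0)
		printf("thread: %ld minor faults, %ld voluntary switches\n",
		       ru.ru_minflt, ru.ru_nvcsw);
	return 0;
}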
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index fd3364827ccf..d7ffdc59816a 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -38,6 +38,7 @@
38#include <linux/writeback.h> 38#include <linux/writeback.h>
39#include <linux/hugetlb.h> 39#include <linux/hugetlb.h>
40#include <linux/initrd.h> 40#include <linux/initrd.h>
41#include <linux/key.h>
41#include <linux/times.h> 42#include <linux/times.h>
42#include <linux/limits.h> 43#include <linux/limits.h>
43#include <linux/dcache.h> 44#include <linux/dcache.h>
@@ -144,12 +145,6 @@ extern int no_unaligned_warning;
144extern int max_lock_depth; 145extern int max_lock_depth;
145#endif 146#endif
146 147
147#ifdef CONFIG_SYSCTL_SYSCALL
148static int parse_table(int __user *, int, void __user *, size_t __user *,
149 void __user *, size_t, struct ctl_table *);
150#endif
151
152
153#ifdef CONFIG_PROC_SYSCTL 148#ifdef CONFIG_PROC_SYSCTL
154static int proc_do_cad_pid(struct ctl_table *table, int write, struct file *filp, 149static int proc_do_cad_pid(struct ctl_table *table, int write, struct file *filp,
155 void __user *buffer, size_t *lenp, loff_t *ppos); 150 void __user *buffer, size_t *lenp, loff_t *ppos);
@@ -809,6 +804,14 @@ static struct ctl_table kern_table[] = {
809 .proc_handler = &proc_dostring, 804 .proc_handler = &proc_dostring,
810 .strategy = &sysctl_string, 805 .strategy = &sysctl_string,
811 }, 806 },
807#ifdef CONFIG_KEYS
808 {
809 .ctl_name = CTL_UNNUMBERED,
810 .procname = "keys",
811 .mode = 0555,
812 .child = key_sysctls,
813 },
814#endif
812/* 815/*
813 * NOTE: do not add new entries to this table unless you have read 816 * NOTE: do not add new entries to this table unless you have read
814 * Documentation/sysctl/ctl_unnumbered.txt 817 * Documentation/sysctl/ctl_unnumbered.txt
@@ -1430,6 +1433,76 @@ void register_sysctl_root(struct ctl_table_root *root)
1430} 1433}
1431 1434
1432#ifdef CONFIG_SYSCTL_SYSCALL 1435#ifdef CONFIG_SYSCTL_SYSCALL
1436/* Perform the actual read/write of a sysctl table entry. */
1437static int do_sysctl_strategy(struct ctl_table_root *root,
1438 struct ctl_table *table,
1439 int __user *name, int nlen,
1440 void __user *oldval, size_t __user *oldlenp,
1441 void __user *newval, size_t newlen)
1442{
1443 int op = 0, rc;
1444
1445 if (oldval)
1446 op |= 004;
1447 if (newval)
1448 op |= 002;
1449 if (sysctl_perm(root, table, op))
1450 return -EPERM;
1451
1452 if (table->strategy) {
1453 rc = table->strategy(table, name, nlen, oldval, oldlenp,
1454 newval, newlen);
1455 if (rc < 0)
1456 return rc;
1457 if (rc > 0)
1458 return 0;
1459 }
1460
1461 /* If there is no strategy routine, or if the strategy returns
1462 * zero, proceed with automatic r/w */
1463 if (table->data && table->maxlen) {
1464 rc = sysctl_data(table, name, nlen, oldval, oldlenp,
1465 newval, newlen);
1466 if (rc < 0)
1467 return rc;
1468 }
1469 return 0;
1470}
1471
1472static int parse_table(int __user *name, int nlen,
1473 void __user *oldval, size_t __user *oldlenp,
1474 void __user *newval, size_t newlen,
1475 struct ctl_table_root *root,
1476 struct ctl_table *table)
1477{
1478 int n;
1479repeat:
1480 if (!nlen)
1481 return -ENOTDIR;
1482 if (get_user(n, name))
1483 return -EFAULT;
1484 for ( ; table->ctl_name || table->procname; table++) {
1485 if (!table->ctl_name)
1486 continue;
1487 if (n == table->ctl_name) {
1488 int error;
1489 if (table->child) {
1490 if (sysctl_perm(root, table, 001))
1491 return -EPERM;
1492 name++;
1493 nlen--;
1494 table = table->child;
1495 goto repeat;
1496 }
1497 error = do_sysctl_strategy(root, table, name, nlen,
1498 oldval, oldlenp,
1499 newval, newlen);
1500 return error;
1501 }
1502 }
1503 return -ENOTDIR;
1504}
1505
1433int do_sysctl(int __user *name, int nlen, void __user *oldval, size_t __user *oldlenp, 1506int do_sysctl(int __user *name, int nlen, void __user *oldval, size_t __user *oldlenp,
1434 void __user *newval, size_t newlen) 1507 void __user *newval, size_t newlen)
1435{ 1508{
@@ -1447,7 +1520,8 @@ int do_sysctl(int __user *name, int nlen, void __user *oldval, size_t __user *ol
1447 for (head = sysctl_head_next(NULL); head; 1520 for (head = sysctl_head_next(NULL); head;
1448 head = sysctl_head_next(head)) { 1521 head = sysctl_head_next(head)) {
1449 error = parse_table(name, nlen, oldval, oldlenp, 1522 error = parse_table(name, nlen, oldval, oldlenp,
1450 newval, newlen, head->ctl_table); 1523 newval, newlen,
1524 head->root, head->ctl_table);
1451 if (error != -ENOTDIR) { 1525 if (error != -ENOTDIR) {
1452 sysctl_head_finish(head); 1526 sysctl_head_finish(head);
1453 break; 1527 break;
@@ -1493,84 +1567,22 @@ static int test_perm(int mode, int op)
1493 return -EACCES; 1567 return -EACCES;
1494} 1568}
1495 1569
1496int sysctl_perm(struct ctl_table *table, int op) 1570int sysctl_perm(struct ctl_table_root *root, struct ctl_table *table, int op)
1497{ 1571{
1498 int error; 1572 int error;
1573 int mode;
1574
1499 error = security_sysctl(table, op); 1575 error = security_sysctl(table, op);
1500 if (error) 1576 if (error)
1501 return error; 1577 return error;
1502 return test_perm(table->mode, op);
1503}
1504
1505#ifdef CONFIG_SYSCTL_SYSCALL
1506static int parse_table(int __user *name, int nlen,
1507 void __user *oldval, size_t __user *oldlenp,
1508 void __user *newval, size_t newlen,
1509 struct ctl_table *table)
1510{
1511 int n;
1512repeat:
1513 if (!nlen)
1514 return -ENOTDIR;
1515 if (get_user(n, name))
1516 return -EFAULT;
1517 for ( ; table->ctl_name || table->procname; table++) {
1518 if (!table->ctl_name)
1519 continue;
1520 if (n == table->ctl_name) {
1521 int error;
1522 if (table->child) {
1523 if (sysctl_perm(table, 001))
1524 return -EPERM;
1525 name++;
1526 nlen--;
1527 table = table->child;
1528 goto repeat;
1529 }
1530 error = do_sysctl_strategy(table, name, nlen,
1531 oldval, oldlenp,
1532 newval, newlen);
1533 return error;
1534 }
1535 }
1536 return -ENOTDIR;
1537}
1538 1578
1539/* Perform the actual read/write of a sysctl table entry. */ 1579 if (root->permissions)
1540int do_sysctl_strategy (struct ctl_table *table, 1580 mode = root->permissions(root, current->nsproxy, table);
1541 int __user *name, int nlen, 1581 else
1542 void __user *oldval, size_t __user *oldlenp, 1582 mode = table->mode;
1543 void __user *newval, size_t newlen)
1544{
1545 int op = 0, rc;
1546
1547 if (oldval)
1548 op |= 004;
1549 if (newval)
1550 op |= 002;
1551 if (sysctl_perm(table, op))
1552 return -EPERM;
1553 1583
1554 if (table->strategy) { 1584 return test_perm(mode, op);
1555 rc = table->strategy(table, name, nlen, oldval, oldlenp,
1556 newval, newlen);
1557 if (rc < 0)
1558 return rc;
1559 if (rc > 0)
1560 return 0;
1561 }
1562
1563 /* If there is no strategy routine, or if the strategy returns
1564 * zero, proceed with automatic r/w */
1565 if (table->data && table->maxlen) {
1566 rc = sysctl_data(table, name, nlen, oldval, oldlenp,
1567 newval, newlen);
1568 if (rc < 0)
1569 return rc;
1570 }
1571 return 0;
1572} 1585}
1573#endif /* CONFIG_SYSCTL_SYSCALL */
1574 1586
1575static void sysctl_set_parent(struct ctl_table *parent, struct ctl_table *table) 1587static void sysctl_set_parent(struct ctl_table *parent, struct ctl_table *table)
1576{ 1588{
@@ -1583,9 +1595,13 @@ static void sysctl_set_parent(struct ctl_table *parent, struct ctl_table *table)
1583 1595
1584static __init int sysctl_init(void) 1596static __init int sysctl_init(void)
1585{ 1597{
1586 int err;
1587 sysctl_set_parent(NULL, root_table); 1598 sysctl_set_parent(NULL, root_table);
1588 err = sysctl_check_table(current->nsproxy, root_table); 1599#ifdef CONFIG_SYSCTL_SYSCALL_CHECK
1600 {
1601 int err;
1602 err = sysctl_check_table(current->nsproxy, root_table);
1603 }
1604#endif
1589 return 0; 1605 return 0;
1590} 1606}
1591 1607
@@ -1712,10 +1728,12 @@ struct ctl_table_header *__register_sysctl_paths(
1712 header->unregistering = NULL; 1728 header->unregistering = NULL;
1713 header->root = root; 1729 header->root = root;
1714 sysctl_set_parent(NULL, header->ctl_table); 1730 sysctl_set_parent(NULL, header->ctl_table);
1731#ifdef CONFIG_SYSCTL_SYSCALL_CHECK
1715 if (sysctl_check_table(namespaces, header->ctl_table)) { 1732 if (sysctl_check_table(namespaces, header->ctl_table)) {
1716 kfree(header); 1733 kfree(header);
1717 return NULL; 1734 return NULL;
1718 } 1735 }
1736#endif
1719 spin_lock(&sysctl_lock); 1737 spin_lock(&sysctl_lock);
1720 header_list = lookup_header_list(root, namespaces); 1738 header_list = lookup_header_list(root, namespaces);
1721 list_add_tail(&header->ctl_entry, header_list); 1739 list_add_tail(&header->ctl_entry, header_list);
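Beyond moving parse_table()/do_sysctl_strategy() fully under CONFIG_SYSCTL_SYSCALL, the sysctl hunks thread the ctl_table_root through the binary-sysctl path and let a root override per-table permissions: sysctl_perm() now calls root->permissions(root, current->nsproxy, table) when the hook is set and tests the returned mode instead of table->mode. A hypothetical root using the hook might look like the sketch below (demo_* names are illustrative and the other ctl_table_root fields are omitted); it would be plugged in with register_sysctl_root(&demo_root):

static int demo_permissions(struct ctl_table_root *root,
			    struct nsproxy *namespaces,
			    struct ctl_table *table)
{
	/* e.g. drop write permission outside the initial namespace */
	if (namespaces != &init_nsproxy)
		return table->mode & ~0222;
	return table->mode;
}

static struct ctl_table_root demo_root = {
	.permissions	= demo_permissions,
};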
diff --git a/kernel/taskstats.c b/kernel/taskstats.c
index 07e86a828073..4a23517169a6 100644
--- a/kernel/taskstats.c
+++ b/kernel/taskstats.c
@@ -183,7 +183,7 @@ static int fill_pid(pid_t pid, struct task_struct *tsk,
183 183
184 if (!tsk) { 184 if (!tsk) {
185 rcu_read_lock(); 185 rcu_read_lock();
186 tsk = find_task_by_pid(pid); 186 tsk = find_task_by_vpid(pid);
187 if (tsk) 187 if (tsk)
188 get_task_struct(tsk); 188 get_task_struct(tsk);
189 rcu_read_unlock(); 189 rcu_read_unlock();
@@ -230,7 +230,7 @@ static int fill_tgid(pid_t tgid, struct task_struct *first,
230 */ 230 */
231 rcu_read_lock(); 231 rcu_read_lock();
232 if (!first) 232 if (!first)
233 first = find_task_by_pid(tgid); 233 first = find_task_by_vpid(tgid);
234 234
235 if (!first || !lock_task_sighand(first, &flags)) 235 if (!first || !lock_task_sighand(first, &flags))
236 goto out; 236 goto out;
@@ -547,7 +547,7 @@ void taskstats_exit(struct task_struct *tsk, int group_dead)
547 if (!stats) 547 if (!stats)
548 goto err; 548 goto err;
549 549
550 rc = fill_pid(tsk->pid, tsk, stats); 550 rc = fill_pid(-1, tsk, stats);
551 if (rc < 0) 551 if (rc < 0)
552 goto err; 552 goto err;
553 553
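Switching taskstats from find_task_by_pid() to find_task_by_vpid() makes the lookup relative to the caller's pid namespace, and fill_pid(-1, tsk, stats) in taskstats_exit() makes it explicit that the pid argument is unused when the task is already known. The lookup pattern this relies on, shown standalone (sketch_get_task is a hypothetical name):

static struct task_struct *sketch_get_task(pid_t vpid)
{
	struct task_struct *tsk;

	rcu_read_lock();
	tsk = find_task_by_vpid(vpid);	/* resolved in current's pid namespace */
	if (tsk)
		get_task_struct(tsk);	/* pin it before dropping RCU */
	rcu_read_unlock();

	return tsk;			/* caller does put_task_struct() when done */
}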
diff --git a/kernel/time.c b/kernel/time.c
index 35d373a98782..6a08660b4fac 100644
--- a/kernel/time.c
+++ b/kernel/time.c
@@ -35,6 +35,8 @@
35#include <linux/syscalls.h> 35#include <linux/syscalls.h>
36#include <linux/security.h> 36#include <linux/security.h>
37#include <linux/fs.h> 37#include <linux/fs.h>
38#include <linux/slab.h>
39#include <linux/math64.h>
38 40
39#include <asm/uaccess.h> 41#include <asm/uaccess.h>
40#include <asm/unistd.h> 42#include <asm/unistd.h>
@@ -244,7 +246,7 @@ unsigned int inline jiffies_to_msecs(const unsigned long j)
244 return (j + (HZ / MSEC_PER_SEC) - 1)/(HZ / MSEC_PER_SEC); 246 return (j + (HZ / MSEC_PER_SEC) - 1)/(HZ / MSEC_PER_SEC);
245#else 247#else
246# if BITS_PER_LONG == 32 248# if BITS_PER_LONG == 32
247 return ((u64)HZ_TO_MSEC_MUL32 * j) >> HZ_TO_MSEC_SHR32; 249 return (HZ_TO_MSEC_MUL32 * j) >> HZ_TO_MSEC_SHR32;
248# else 250# else
249 return (j * HZ_TO_MSEC_NUM) / HZ_TO_MSEC_DEN; 251 return (j * HZ_TO_MSEC_NUM) / HZ_TO_MSEC_DEN;
250# endif 252# endif
@@ -260,7 +262,7 @@ unsigned int inline jiffies_to_usecs(const unsigned long j)
260 return (j + (HZ / USEC_PER_SEC) - 1)/(HZ / USEC_PER_SEC); 262 return (j + (HZ / USEC_PER_SEC) - 1)/(HZ / USEC_PER_SEC);
261#else 263#else
262# if BITS_PER_LONG == 32 264# if BITS_PER_LONG == 32
263 return ((u64)HZ_TO_USEC_MUL32 * j) >> HZ_TO_USEC_SHR32; 265 return (HZ_TO_USEC_MUL32 * j) >> HZ_TO_USEC_SHR32;
264# else 266# else
265 return (j * HZ_TO_USEC_NUM) / HZ_TO_USEC_DEN; 267 return (j * HZ_TO_USEC_NUM) / HZ_TO_USEC_DEN;
266# endif 268# endif
@@ -390,13 +392,17 @@ EXPORT_SYMBOL(set_normalized_timespec);
390struct timespec ns_to_timespec(const s64 nsec) 392struct timespec ns_to_timespec(const s64 nsec)
391{ 393{
392 struct timespec ts; 394 struct timespec ts;
395 s32 rem;
393 396
394 if (!nsec) 397 if (!nsec)
395 return (struct timespec) {0, 0}; 398 return (struct timespec) {0, 0};
396 399
397 ts.tv_sec = div_long_long_rem_signed(nsec, NSEC_PER_SEC, &ts.tv_nsec); 400 ts.tv_sec = div_s64_rem(nsec, NSEC_PER_SEC, &rem);
398 if (unlikely(nsec < 0)) 401 if (unlikely(rem < 0)) {
399 set_normalized_timespec(&ts, ts.tv_sec, ts.tv_nsec); 402 ts.tv_sec--;
403 rem += NSEC_PER_SEC;
404 }
405 ts.tv_nsec = rem;
400 406
401 return ts; 407 return ts;
402} 408}
@@ -470,7 +476,7 @@ unsigned long msecs_to_jiffies(const unsigned int m)
470 if (HZ > MSEC_PER_SEC && m > jiffies_to_msecs(MAX_JIFFY_OFFSET)) 476 if (HZ > MSEC_PER_SEC && m > jiffies_to_msecs(MAX_JIFFY_OFFSET))
471 return MAX_JIFFY_OFFSET; 477 return MAX_JIFFY_OFFSET;
472 478
473 return ((u64)MSEC_TO_HZ_MUL32 * m + MSEC_TO_HZ_ADJ32) 479 return (MSEC_TO_HZ_MUL32 * m + MSEC_TO_HZ_ADJ32)
474 >> MSEC_TO_HZ_SHR32; 480 >> MSEC_TO_HZ_SHR32;
475#endif 481#endif
476} 482}
@@ -485,7 +491,7 @@ unsigned long usecs_to_jiffies(const unsigned int u)
485#elif HZ > USEC_PER_SEC && !(HZ % USEC_PER_SEC) 491#elif HZ > USEC_PER_SEC && !(HZ % USEC_PER_SEC)
486 return u * (HZ / USEC_PER_SEC); 492 return u * (HZ / USEC_PER_SEC);
487#else 493#else
488 return ((u64)USEC_TO_HZ_MUL32 * u + USEC_TO_HZ_ADJ32) 494 return (USEC_TO_HZ_MUL32 * u + USEC_TO_HZ_ADJ32)
489 >> USEC_TO_HZ_SHR32; 495 >> USEC_TO_HZ_SHR32;
490#endif 496#endif
491} 497}
@@ -526,8 +532,10 @@ jiffies_to_timespec(const unsigned long jiffies, struct timespec *value)
526 * Convert jiffies to nanoseconds and separate with 532 * Convert jiffies to nanoseconds and separate with
527 * one divide. 533 * one divide.
528 */ 534 */
529 u64 nsec = (u64)jiffies * TICK_NSEC; 535 u32 rem;
530 value->tv_sec = div_long_long_rem(nsec, NSEC_PER_SEC, &value->tv_nsec); 536 value->tv_sec = div_u64_rem((u64)jiffies * TICK_NSEC,
537 NSEC_PER_SEC, &rem);
538 value->tv_nsec = rem;
531} 539}
532EXPORT_SYMBOL(jiffies_to_timespec); 540EXPORT_SYMBOL(jiffies_to_timespec);
533 541
@@ -565,12 +573,11 @@ void jiffies_to_timeval(const unsigned long jiffies, struct timeval *value)
565 * Convert jiffies to nanoseconds and separate with 573 * Convert jiffies to nanoseconds and separate with
566 * one divide. 574 * one divide.
567 */ 575 */
568 u64 nsec = (u64)jiffies * TICK_NSEC; 576 u32 rem;
569 long tv_usec;
570 577
571 value->tv_sec = div_long_long_rem(nsec, NSEC_PER_SEC, &tv_usec); 578 value->tv_sec = div_u64_rem((u64)jiffies * TICK_NSEC,
572 tv_usec /= NSEC_PER_USEC; 579 NSEC_PER_SEC, &rem);
573 value->tv_usec = tv_usec; 580 value->tv_usec = rem / NSEC_PER_USEC;
574} 581}
575EXPORT_SYMBOL(jiffies_to_timeval); 582EXPORT_SYMBOL(jiffies_to_timeval);
576 583
@@ -586,9 +593,7 @@ clock_t jiffies_to_clock_t(long x)
586 return x / (HZ / USER_HZ); 593 return x / (HZ / USER_HZ);
587# endif 594# endif
588#else 595#else
589 u64 tmp = (u64)x * TICK_NSEC; 596 return div_u64((u64)x * TICK_NSEC, NSEC_PER_SEC / USER_HZ);
590 do_div(tmp, (NSEC_PER_SEC / USER_HZ));
591 return (long)tmp;
592#endif 597#endif
593} 598}
594EXPORT_SYMBOL(jiffies_to_clock_t); 599EXPORT_SYMBOL(jiffies_to_clock_t);
@@ -600,16 +605,12 @@ unsigned long clock_t_to_jiffies(unsigned long x)
600 return ~0UL; 605 return ~0UL;
601 return x * (HZ / USER_HZ); 606 return x * (HZ / USER_HZ);
602#else 607#else
603 u64 jif;
604
605 /* Don't worry about loss of precision here .. */ 608 /* Don't worry about loss of precision here .. */
606 if (x >= ~0UL / HZ * USER_HZ) 609 if (x >= ~0UL / HZ * USER_HZ)
607 return ~0UL; 610 return ~0UL;
608 611
609 /* .. but do try to contain it here */ 612 /* .. but do try to contain it here */
610 jif = x * (u64) HZ; 613 return div_u64((u64)x * HZ, USER_HZ);
611 do_div(jif, USER_HZ);
612 return jif;
613#endif 614#endif
614} 615}
615EXPORT_SYMBOL(clock_t_to_jiffies); 616EXPORT_SYMBOL(clock_t_to_jiffies);
@@ -618,10 +619,9 @@ u64 jiffies_64_to_clock_t(u64 x)
618{ 619{
619#if (TICK_NSEC % (NSEC_PER_SEC / USER_HZ)) == 0 620#if (TICK_NSEC % (NSEC_PER_SEC / USER_HZ)) == 0
620# if HZ < USER_HZ 621# if HZ < USER_HZ
621 x *= USER_HZ; 622 x = div_u64(x * USER_HZ, HZ);
622 do_div(x, HZ);
623# elif HZ > USER_HZ 623# elif HZ > USER_HZ
624 do_div(x, HZ / USER_HZ); 624 x = div_u64(x, HZ / USER_HZ);
625# else 625# else
626 /* Nothing to do */ 626 /* Nothing to do */
627# endif 627# endif
@@ -631,8 +631,7 @@ u64 jiffies_64_to_clock_t(u64 x)
631 * but even this doesn't overflow in hundreds of years 631 * but even this doesn't overflow in hundreds of years
632 * in 64 bits, so.. 632 * in 64 bits, so..
633 */ 633 */
634 x *= TICK_NSEC; 634 x = div_u64(x * TICK_NSEC, (NSEC_PER_SEC / USER_HZ));
635 do_div(x, (NSEC_PER_SEC / USER_HZ));
636#endif 635#endif
637 return x; 636 return x;
638} 637}
@@ -641,21 +640,17 @@ EXPORT_SYMBOL(jiffies_64_to_clock_t);
641u64 nsec_to_clock_t(u64 x) 640u64 nsec_to_clock_t(u64 x)
642{ 641{
643#if (NSEC_PER_SEC % USER_HZ) == 0 642#if (NSEC_PER_SEC % USER_HZ) == 0
644 do_div(x, (NSEC_PER_SEC / USER_HZ)); 643 return div_u64(x, NSEC_PER_SEC / USER_HZ);
645#elif (USER_HZ % 512) == 0 644#elif (USER_HZ % 512) == 0
646 x *= USER_HZ/512; 645 return div_u64(x * USER_HZ / 512, NSEC_PER_SEC / 512);
647 do_div(x, (NSEC_PER_SEC / 512));
648#else 646#else
649 /* 647 /*
650 * max relative error 5.7e-8 (1.8s per year) for USER_HZ <= 1024, 648 * max relative error 5.7e-8 (1.8s per year) for USER_HZ <= 1024,
651 * overflow after 64.99 years. 649 * overflow after 64.99 years.
652 * exact for HZ=60, 72, 90, 120, 144, 180, 300, 600, 900, ... 650 * exact for HZ=60, 72, 90, 120, 144, 180, 300, 600, 900, ...
653 */ 651 */
654 x *= 9; 652 return div_u64(x * 9, (9ull * NSEC_PER_SEC + (USER_HZ / 2)) / USER_HZ);
655 do_div(x, (unsigned long)((9ull * NSEC_PER_SEC + (USER_HZ/2)) /
656 USER_HZ));
657#endif 653#endif
658 return x;
659} 654}
660 655
661#if (BITS_PER_LONG < 64) 656#if (BITS_PER_LONG < 64)
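The kernel/time.c hunks replace the open-coded do_div()/div_long_long_rem() arithmetic with the div_u64()/div_s64_rem() helpers from the new <linux/math64.h>, and drop the explicit (u64) casts around the timeconst multipliers (presumably redundant once timeconst.pl generates 64-bit constants). The ns_to_timespec() change is the representative pattern; a self-contained sketch of it:

#include <linux/math64.h>
#include <linux/time.h>

static struct timespec sketch_ns_to_timespec(s64 nsec)
{
	struct timespec ts;
	s32 rem;

	ts.tv_sec = div_s64_rem(nsec, NSEC_PER_SEC, &rem);
	if (rem < 0) {
		/* div_s64_rem() truncates toward zero, so a negative input
		 * leaves a negative remainder; normalize into [0, 1s). */
		ts.tv_sec--;
		rem += NSEC_PER_SEC;
	}
	ts.tv_nsec = rem;

	return ts;
}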
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index 73961f35fdc8..dadde5361f32 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -471,10 +471,10 @@ sysfs_show_available_clocksources(struct sys_device *dev, char *buf)
471/* 471/*
472 * Sysfs setup bits: 472 * Sysfs setup bits:
473 */ 473 */
474static SYSDEV_ATTR(current_clocksource, 0600, sysfs_show_current_clocksources, 474static SYSDEV_ATTR(current_clocksource, 0644, sysfs_show_current_clocksources,
475 sysfs_override_clocksource); 475 sysfs_override_clocksource);
476 476
477static SYSDEV_ATTR(available_clocksource, 0600, 477static SYSDEV_ATTR(available_clocksource, 0444,
478 sysfs_show_available_clocksources, NULL); 478 sysfs_show_available_clocksources, NULL);
479 479
480static struct sysdev_class clocksource_sysclass = { 480static struct sysdev_class clocksource_sysclass = {
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
index 5fd9b9469770..5125ddd8196b 100644
--- a/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@ -15,7 +15,8 @@
15#include <linux/jiffies.h> 15#include <linux/jiffies.h>
16#include <linux/hrtimer.h> 16#include <linux/hrtimer.h>
17#include <linux/capability.h> 17#include <linux/capability.h>
18#include <asm/div64.h> 18#include <linux/math64.h>
19#include <linux/clocksource.h>
19#include <asm/timex.h> 20#include <asm/timex.h>
20 21
21/* 22/*
@@ -23,11 +24,14 @@
23 */ 24 */
24unsigned long tick_usec = TICK_USEC; /* USER_HZ period (usec) */ 25unsigned long tick_usec = TICK_USEC; /* USER_HZ period (usec) */
25unsigned long tick_nsec; /* ACTHZ period (nsec) */ 26unsigned long tick_nsec; /* ACTHZ period (nsec) */
26static u64 tick_length, tick_length_base; 27u64 tick_length;
28static u64 tick_length_base;
29
30static struct hrtimer leap_timer;
27 31
28#define MAX_TICKADJ 500 /* microsecs */ 32#define MAX_TICKADJ 500 /* microsecs */
29#define MAX_TICKADJ_SCALED (((u64)(MAX_TICKADJ * NSEC_PER_USEC) << \ 33#define MAX_TICKADJ_SCALED (((u64)(MAX_TICKADJ * NSEC_PER_USEC) << \
30 TICK_LENGTH_SHIFT) / NTP_INTERVAL_FREQ) 34 NTP_SCALE_SHIFT) / NTP_INTERVAL_FREQ)
31 35
32/* 36/*
33 * phase-lock loop variables 37 * phase-lock loop variables
@@ -35,11 +39,12 @@ static u64 tick_length, tick_length_base;
35/* TIME_ERROR prevents overwriting the CMOS clock */ 39/* TIME_ERROR prevents overwriting the CMOS clock */
36static int time_state = TIME_OK; /* clock synchronization status */ 40static int time_state = TIME_OK; /* clock synchronization status */
37int time_status = STA_UNSYNC; /* clock status bits */ 41int time_status = STA_UNSYNC; /* clock status bits */
38static s64 time_offset; /* time adjustment (ns) */ 42static long time_tai; /* TAI offset (s) */
43static s64 time_offset; /* time adjustment (ns) */
39static long time_constant = 2; /* pll time constant */ 44static long time_constant = 2; /* pll time constant */
40long time_maxerror = NTP_PHASE_LIMIT; /* maximum error (us) */ 45long time_maxerror = NTP_PHASE_LIMIT; /* maximum error (us) */
41long time_esterror = NTP_PHASE_LIMIT; /* estimated error (us) */ 46long time_esterror = NTP_PHASE_LIMIT; /* estimated error (us) */
42long time_freq; /* frequency offset (scaled ppm)*/ 47static s64 time_freq; /* frequency offset (scaled ns/s)*/
43static long time_reftime; /* time at last adjustment (s) */ 48static long time_reftime; /* time at last adjustment (s) */
44long time_adjust; 49long time_adjust;
45static long ntp_tick_adj; 50static long ntp_tick_adj;
@@ -47,16 +52,56 @@ static long ntp_tick_adj;
47static void ntp_update_frequency(void) 52static void ntp_update_frequency(void)
48{ 53{
49 u64 second_length = (u64)(tick_usec * NSEC_PER_USEC * USER_HZ) 54 u64 second_length = (u64)(tick_usec * NSEC_PER_USEC * USER_HZ)
50 << TICK_LENGTH_SHIFT; 55 << NTP_SCALE_SHIFT;
51 second_length += (s64)ntp_tick_adj << TICK_LENGTH_SHIFT; 56 second_length += (s64)ntp_tick_adj << NTP_SCALE_SHIFT;
52 second_length += (s64)time_freq << (TICK_LENGTH_SHIFT - SHIFT_NSEC); 57 second_length += time_freq;
53 58
54 tick_length_base = second_length; 59 tick_length_base = second_length;
55 60
56 do_div(second_length, HZ); 61 tick_nsec = div_u64(second_length, HZ) >> NTP_SCALE_SHIFT;
57 tick_nsec = second_length >> TICK_LENGTH_SHIFT; 62 tick_length_base = div_u64(tick_length_base, NTP_INTERVAL_FREQ);
63}
64
65static void ntp_update_offset(long offset)
66{
67 long mtemp;
68 s64 freq_adj;
69
70 if (!(time_status & STA_PLL))
71 return;
58 72
59 do_div(tick_length_base, NTP_INTERVAL_FREQ); 73 if (!(time_status & STA_NANO))
74 offset *= NSEC_PER_USEC;
75
76 /*
77 * Scale the phase adjustment and
78 * clamp to the operating range.
79 */
80 offset = min(offset, MAXPHASE);
81 offset = max(offset, -MAXPHASE);
82
83 /*
84 * Select how the frequency is to be controlled
85 * and in which mode (PLL or FLL).
86 */
87 if (time_status & STA_FREQHOLD || time_reftime == 0)
88 time_reftime = xtime.tv_sec;
89 mtemp = xtime.tv_sec - time_reftime;
90 time_reftime = xtime.tv_sec;
91
92 freq_adj = (s64)offset * mtemp;
93 freq_adj <<= NTP_SCALE_SHIFT - 2 * (SHIFT_PLL + 2 + time_constant);
94 time_status &= ~STA_MODE;
95 if (mtemp >= MINSEC && (time_status & STA_FLL || mtemp > MAXSEC)) {
96 freq_adj += div_s64((s64)offset << (NTP_SCALE_SHIFT - SHIFT_FLL),
97 mtemp);
98 time_status |= STA_MODE;
99 }
100 freq_adj += time_freq;
101 freq_adj = min(freq_adj, MAXFREQ_SCALED);
102 time_freq = max(freq_adj, -MAXFREQ_SCALED);
103
104 time_offset = div_s64((s64)offset << NTP_SCALE_SHIFT, NTP_INTERVAL_FREQ);
60} 105}
61 106
62/** 107/**
@@ -78,62 +123,70 @@ void ntp_clear(void)
78} 123}
79 124
80/* 125/*
81 * this routine handles the overflow of the microsecond field 126 * Leap second processing. If in leap-insert state at the end of the
82 * 127 * day, the system clock is set back one second; if in leap-delete
83 * The tricky bits of code to handle the accurate clock support 128 * state, the system clock is set ahead one second.
84 * were provided by Dave Mills (Mills@UDEL.EDU) of NTP fame.
85 * They were originally developed for SUN and DEC kernels.
86 * All the kudos should go to Dave for this stuff.
87 */ 129 */
88void second_overflow(void) 130static enum hrtimer_restart ntp_leap_second(struct hrtimer *timer)
89{ 131{
90 long time_adj; 132 enum hrtimer_restart res = HRTIMER_NORESTART;
91 133
92 /* Bump the maxerror field */ 134 write_seqlock_irq(&xtime_lock);
93 time_maxerror += MAXFREQ >> SHIFT_USEC;
94 if (time_maxerror > NTP_PHASE_LIMIT) {
95 time_maxerror = NTP_PHASE_LIMIT;
96 time_status |= STA_UNSYNC;
97 }
98 135
99 /*
100 * Leap second processing. If in leap-insert state at the end of the
101 * day, the system clock is set back one second; if in leap-delete
102 * state, the system clock is set ahead one second. The microtime()
103 * routine or external clock driver will insure that reported time is
104 * always monotonic. The ugly divides should be replaced.
105 */
106 switch (time_state) { 136 switch (time_state) {
107 case TIME_OK: 137 case TIME_OK:
108 if (time_status & STA_INS)
109 time_state = TIME_INS;
110 else if (time_status & STA_DEL)
111 time_state = TIME_DEL;
112 break; 138 break;
113 case TIME_INS: 139 case TIME_INS:
114 if (xtime.tv_sec % 86400 == 0) { 140 xtime.tv_sec--;
115 xtime.tv_sec--; 141 wall_to_monotonic.tv_sec++;
116 wall_to_monotonic.tv_sec++; 142 time_state = TIME_OOP;
117 time_state = TIME_OOP; 143 printk(KERN_NOTICE "Clock: "
118 printk(KERN_NOTICE "Clock: inserting leap second " 144 "inserting leap second 23:59:60 UTC\n");
119 "23:59:60 UTC\n"); 145 leap_timer.expires = ktime_add_ns(leap_timer.expires,
120 } 146 NSEC_PER_SEC);
147 res = HRTIMER_RESTART;
121 break; 148 break;
122 case TIME_DEL: 149 case TIME_DEL:
123 if ((xtime.tv_sec + 1) % 86400 == 0) { 150 xtime.tv_sec++;
124 xtime.tv_sec++; 151 time_tai--;
125 wall_to_monotonic.tv_sec--; 152 wall_to_monotonic.tv_sec--;
126 time_state = TIME_WAIT; 153 time_state = TIME_WAIT;
127 printk(KERN_NOTICE "Clock: deleting leap second " 154 printk(KERN_NOTICE "Clock: "
128 "23:59:59 UTC\n"); 155 "deleting leap second 23:59:59 UTC\n");
129 }
130 break; 156 break;
131 case TIME_OOP: 157 case TIME_OOP:
158 time_tai++;
132 time_state = TIME_WAIT; 159 time_state = TIME_WAIT;
133 break; 160 /* fall through */
134 case TIME_WAIT: 161 case TIME_WAIT:
135 if (!(time_status & (STA_INS | STA_DEL))) 162 if (!(time_status & (STA_INS | STA_DEL)))
136 time_state = TIME_OK; 163 time_state = TIME_OK;
164 break;
165 }
166 update_vsyscall(&xtime, clock);
167
168 write_sequnlock_irq(&xtime_lock);
169
170 return res;
171}
172
173/*
174 * this routine handles the overflow of the microsecond field
175 *
176 * The tricky bits of code to handle the accurate clock support
177 * were provided by Dave Mills (Mills@UDEL.EDU) of NTP fame.
178 * They were originally developed for SUN and DEC kernels.
179 * All the kudos should go to Dave for this stuff.
180 */
181void second_overflow(void)
182{
183 s64 time_adj;
184
185 /* Bump the maxerror field */
186 time_maxerror += MAXFREQ / NSEC_PER_USEC;
187 if (time_maxerror > NTP_PHASE_LIMIT) {
188 time_maxerror = NTP_PHASE_LIMIT;
189 time_status |= STA_UNSYNC;
137 } 190 }
138 191
139 /* 192 /*
@@ -143,7 +196,7 @@ void second_overflow(void)
143 tick_length = tick_length_base; 196 tick_length = tick_length_base;
144 time_adj = shift_right(time_offset, SHIFT_PLL + time_constant); 197 time_adj = shift_right(time_offset, SHIFT_PLL + time_constant);
145 time_offset -= time_adj; 198 time_offset -= time_adj;
146 tick_length += (s64)time_adj << (TICK_LENGTH_SHIFT - SHIFT_UPDATE); 199 tick_length += time_adj;
147 200
148 if (unlikely(time_adjust)) { 201 if (unlikely(time_adjust)) {
149 if (time_adjust > MAX_TICKADJ) { 202 if (time_adjust > MAX_TICKADJ) {
@@ -154,25 +207,12 @@ void second_overflow(void)
154 tick_length -= MAX_TICKADJ_SCALED; 207 tick_length -= MAX_TICKADJ_SCALED;
155 } else { 208 } else {
156 tick_length += (s64)(time_adjust * NSEC_PER_USEC / 209 tick_length += (s64)(time_adjust * NSEC_PER_USEC /
157 NTP_INTERVAL_FREQ) << TICK_LENGTH_SHIFT; 210 NTP_INTERVAL_FREQ) << NTP_SCALE_SHIFT;
158 time_adjust = 0; 211 time_adjust = 0;
159 } 212 }
160 } 213 }
161} 214}
162 215
163/*
164 * Return how long ticks are at the moment, that is, how much time
165 * update_wall_time_one_tick will add to xtime next time we call it
166 * (assuming no calls to do_adjtimex in the meantime).
167 * The return value is in fixed-point nanoseconds shifted by the
168 * specified number of bits to the right of the binary point.
169 * This function has no side-effects.
170 */
171u64 current_tick_length(void)
172{
173 return tick_length;
174}
175
176#ifdef CONFIG_GENERIC_CMOS_UPDATE 216#ifdef CONFIG_GENERIC_CMOS_UPDATE
177 217
178/* Disable the cmos update - used by virtualization and embedded */ 218/* Disable the cmos update - used by virtualization and embedded */
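The ntp.c hunks above replace the per-tick "xtime.tv_sec % 86400" polling in second_overflow() with an hrtimer: ntp_leap_second() applies the leap step under xtime_lock and returns HRTIMER_RESTART only in the insert case, so it fires once more a second later to finish the TIME_OOP bookkeeping. The do_adjtimex() hunk further down arms that timer at the next UTC midnight; a hedged sketch of that arithmetic (sketch_arm_leap_timer is a hypothetical helper):

static void sketch_arm_leap_timer(struct hrtimer *timer, time_t now_sec)
{
	/* 86400 seconds per UTC day; round up to the next midnight. */
	time_t expires = now_sec + (86400 - now_sec % 86400);

	hrtimer_start(timer, ktime_set(expires, 0), HRTIMER_MODE_ABS);
}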
@@ -236,8 +276,8 @@ static inline void notify_cmos_timer(void) { }
236 */ 276 */
237int do_adjtimex(struct timex *txc) 277int do_adjtimex(struct timex *txc)
238{ 278{
239 long mtemp, save_adjust, rem; 279 struct timespec ts;
240 s64 freq_adj, temp64; 280 long save_adjust, sec;
241 int result; 281 int result;
242 282
243 /* In order to modify anything, you gotta be super-user! */ 283 /* In order to modify anything, you gotta be super-user! */
@@ -247,147 +287,132 @@ int do_adjtimex(struct timex *txc)
247 /* Now we validate the data before disabling interrupts */ 287 /* Now we validate the data before disabling interrupts */
248 288
249 if ((txc->modes & ADJ_OFFSET_SINGLESHOT) == ADJ_OFFSET_SINGLESHOT) { 289 if ((txc->modes & ADJ_OFFSET_SINGLESHOT) == ADJ_OFFSET_SINGLESHOT) {
250 /* singleshot must not be used with any other mode bits */ 290 /* singleshot must not be used with any other mode bits */
251 if (txc->modes != ADJ_OFFSET_SINGLESHOT && 291 if (txc->modes & ~ADJ_OFFSET_SS_READ)
252 txc->modes != ADJ_OFFSET_SS_READ)
253 return -EINVAL; 292 return -EINVAL;
254 } 293 }
255 294
256 if (txc->modes != ADJ_OFFSET_SINGLESHOT && (txc->modes & ADJ_OFFSET))
257 /* adjustment Offset limited to +- .512 seconds */
258 if (txc->offset <= - MAXPHASE || txc->offset >= MAXPHASE )
259 return -EINVAL;
260
261 /* if the quartz is off by more than 10% something is VERY wrong ! */ 295 /* if the quartz is off by more than 10% something is VERY wrong ! */
262 if (txc->modes & ADJ_TICK) 296 if (txc->modes & ADJ_TICK)
263 if (txc->tick < 900000/USER_HZ || 297 if (txc->tick < 900000/USER_HZ ||
264 txc->tick > 1100000/USER_HZ) 298 txc->tick > 1100000/USER_HZ)
265 return -EINVAL; 299 return -EINVAL;
266 300
301 if (time_state != TIME_OK && txc->modes & ADJ_STATUS)
302 hrtimer_cancel(&leap_timer);
303 getnstimeofday(&ts);
304
267 write_seqlock_irq(&xtime_lock); 305 write_seqlock_irq(&xtime_lock);
268 result = time_state; /* mostly `TIME_OK' */
269 306
270 /* Save for later - semantics of adjtime is to return old value */ 307 /* Save for later - semantics of adjtime is to return old value */
271 save_adjust = time_adjust; 308 save_adjust = time_adjust;
272 309
273#if 0 /* STA_CLOCKERR is never set yet */
274 time_status &= ~STA_CLOCKERR; /* reset STA_CLOCKERR */
275#endif
276 /* If there are input parameters, then process them */ 310 /* If there are input parameters, then process them */
277 if (txc->modes) 311 if (txc->modes) {
278 { 312 if (txc->modes & ADJ_STATUS) {
279 if (txc->modes & ADJ_STATUS) /* only set allowed bits */ 313 if ((time_status & STA_PLL) &&
280 time_status = (txc->status & ~STA_RONLY) | 314 !(txc->status & STA_PLL)) {
281 (time_status & STA_RONLY); 315 time_state = TIME_OK;
282 316 time_status = STA_UNSYNC;
283 if (txc->modes & ADJ_FREQUENCY) { /* p. 22 */ 317 }
284 if (txc->freq > MAXFREQ || txc->freq < -MAXFREQ) { 318 /* only set allowed bits */
285 result = -EINVAL; 319 time_status &= STA_RONLY;
286 goto leave; 320 time_status |= txc->status & ~STA_RONLY;
287 } 321
288 time_freq = ((s64)txc->freq * NSEC_PER_USEC) 322 switch (time_state) {
289 >> (SHIFT_USEC - SHIFT_NSEC); 323 case TIME_OK:
290 } 324 start_timer:
291 325 sec = ts.tv_sec;
292 if (txc->modes & ADJ_MAXERROR) { 326 if (time_status & STA_INS) {
293 if (txc->maxerror < 0 || txc->maxerror >= NTP_PHASE_LIMIT) { 327 time_state = TIME_INS;
294 result = -EINVAL; 328 sec += 86400 - sec % 86400;
295 goto leave; 329 hrtimer_start(&leap_timer, ktime_set(sec, 0), HRTIMER_MODE_ABS);
330 } else if (time_status & STA_DEL) {
331 time_state = TIME_DEL;
332 sec += 86400 - (sec + 1) % 86400;
333 hrtimer_start(&leap_timer, ktime_set(sec, 0), HRTIMER_MODE_ABS);
334 }
335 break;
336 case TIME_INS:
337 case TIME_DEL:
338 time_state = TIME_OK;
339 goto start_timer;
340 break;
341 case TIME_WAIT:
342 if (!(time_status & (STA_INS | STA_DEL)))
343 time_state = TIME_OK;
344 break;
345 case TIME_OOP:
346 hrtimer_restart(&leap_timer);
347 break;
348 }
296 } 349 }
297 time_maxerror = txc->maxerror;
298 }
299 350
300 if (txc->modes & ADJ_ESTERROR) { 351 if (txc->modes & ADJ_NANO)
301 if (txc->esterror < 0 || txc->esterror >= NTP_PHASE_LIMIT) { 352 time_status |= STA_NANO;
302 result = -EINVAL; 353 if (txc->modes & ADJ_MICRO)
303 goto leave; 354 time_status &= ~STA_NANO;
355
356 if (txc->modes & ADJ_FREQUENCY) {
357 time_freq = (s64)txc->freq * PPM_SCALE;
358 time_freq = min(time_freq, MAXFREQ_SCALED);
359 time_freq = max(time_freq, -MAXFREQ_SCALED);
304 } 360 }
305 time_esterror = txc->esterror;
306 }
307 361
308 if (txc->modes & ADJ_TIMECONST) { /* p. 24 */ 362 if (txc->modes & ADJ_MAXERROR)
309 if (txc->constant < 0) { /* NTP v4 uses values > 6 */ 363 time_maxerror = txc->maxerror;
310 result = -EINVAL; 364 if (txc->modes & ADJ_ESTERROR)
311 goto leave; 365 time_esterror = txc->esterror;
366
367 if (txc->modes & ADJ_TIMECONST) {
368 time_constant = txc->constant;
369 if (!(time_status & STA_NANO))
370 time_constant += 4;
371 time_constant = min(time_constant, (long)MAXTC);
372 time_constant = max(time_constant, 0l);
312 } 373 }
313 time_constant = min(txc->constant + 4, (long)MAXTC);
314 }
315 374
316 if (txc->modes & ADJ_OFFSET) { /* values checked earlier */ 375 if (txc->modes & ADJ_TAI && txc->constant > 0)
317 if (txc->modes == ADJ_OFFSET_SINGLESHOT) { 376 time_tai = txc->constant;
318 /* adjtime() is independent from ntp_adjtime() */ 377
319 time_adjust = txc->offset; 378 if (txc->modes & ADJ_OFFSET) {
379 if (txc->modes == ADJ_OFFSET_SINGLESHOT)
380 /* adjtime() is independent from ntp_adjtime() */
381 time_adjust = txc->offset;
382 else
383 ntp_update_offset(txc->offset);
320 } 384 }
321 else if (time_status & STA_PLL) { 385 if (txc->modes & ADJ_TICK)
322 time_offset = txc->offset * NSEC_PER_USEC; 386 tick_usec = txc->tick;
323 387
324 /* 388 if (txc->modes & (ADJ_TICK|ADJ_FREQUENCY|ADJ_OFFSET))
325 * Scale the phase adjustment and 389 ntp_update_frequency();
326 * clamp to the operating range. 390 }
327 */ 391
328 time_offset = min(time_offset, (s64)MAXPHASE * NSEC_PER_USEC); 392 result = time_state; /* mostly `TIME_OK' */
329 time_offset = max(time_offset, (s64)-MAXPHASE * NSEC_PER_USEC); 393 if (time_status & (STA_UNSYNC|STA_CLOCKERR))
330
331 /*
332 * Select whether the frequency is to be controlled
333 * and in which mode (PLL or FLL). Clamp to the operating
334 * range. Ugly multiply/divide should be replaced someday.
335 */
336
337 if (time_status & STA_FREQHOLD || time_reftime == 0)
338 time_reftime = xtime.tv_sec;
339 mtemp = xtime.tv_sec - time_reftime;
340 time_reftime = xtime.tv_sec;
341
342 freq_adj = time_offset * mtemp;
343 freq_adj = shift_right(freq_adj, time_constant * 2 +
344 (SHIFT_PLL + 2) * 2 - SHIFT_NSEC);
345 if (mtemp >= MINSEC && (time_status & STA_FLL || mtemp > MAXSEC)) {
346 u64 utemp64;
347 temp64 = time_offset << (SHIFT_NSEC - SHIFT_FLL);
348 if (time_offset < 0) {
349 utemp64 = -temp64;
350 do_div(utemp64, mtemp);
351 freq_adj -= utemp64;
352 } else {
353 utemp64 = temp64;
354 do_div(utemp64, mtemp);
355 freq_adj += utemp64;
356 }
357 }
358 freq_adj += time_freq;
359 freq_adj = min(freq_adj, (s64)MAXFREQ_NSEC);
360 time_freq = max(freq_adj, (s64)-MAXFREQ_NSEC);
361 time_offset = div_long_long_rem_signed(time_offset,
362 NTP_INTERVAL_FREQ,
363 &rem);
364 time_offset <<= SHIFT_UPDATE;
365 } /* STA_PLL */
366 } /* txc->modes & ADJ_OFFSET */
367 if (txc->modes & ADJ_TICK)
368 tick_usec = txc->tick;
369
370 if (txc->modes & (ADJ_TICK|ADJ_FREQUENCY|ADJ_OFFSET))
371 ntp_update_frequency();
372 } /* txc->modes */
373leave: if ((time_status & (STA_UNSYNC|STA_CLOCKERR)) != 0)
374 result = TIME_ERROR; 394 result = TIME_ERROR;
375 395
376 if ((txc->modes == ADJ_OFFSET_SINGLESHOT) || 396 if ((txc->modes == ADJ_OFFSET_SINGLESHOT) ||
377 (txc->modes == ADJ_OFFSET_SS_READ)) 397 (txc->modes == ADJ_OFFSET_SS_READ))
378 txc->offset = save_adjust; 398 txc->offset = save_adjust;
379 else 399 else {
380 txc->offset = ((long)shift_right(time_offset, SHIFT_UPDATE)) * 400 txc->offset = shift_right(time_offset * NTP_INTERVAL_FREQ,
381 NTP_INTERVAL_FREQ / 1000; 401 NTP_SCALE_SHIFT);
382 txc->freq = (time_freq / NSEC_PER_USEC) << 402 if (!(time_status & STA_NANO))
383 (SHIFT_USEC - SHIFT_NSEC); 403 txc->offset /= NSEC_PER_USEC;
404 }
405 txc->freq = shift_right((s32)(time_freq >> PPM_SCALE_INV_SHIFT) *
406 (s64)PPM_SCALE_INV,
407 NTP_SCALE_SHIFT);
384 txc->maxerror = time_maxerror; 408 txc->maxerror = time_maxerror;
385 txc->esterror = time_esterror; 409 txc->esterror = time_esterror;
386 txc->status = time_status; 410 txc->status = time_status;
387 txc->constant = time_constant; 411 txc->constant = time_constant;
388 txc->precision = 1; 412 txc->precision = 1;
389 txc->tolerance = MAXFREQ; 413 txc->tolerance = MAXFREQ_SCALED / PPM_SCALE;
390 txc->tick = tick_usec; 414 txc->tick = tick_usec;
415 txc->tai = time_tai;
391 416
392 /* PPS is not implemented, so these are zero */ 417 /* PPS is not implemented, so these are zero */
393 txc->ppsfreq = 0; 418 txc->ppsfreq = 0;
@@ -399,9 +424,15 @@ leave: if ((time_status & (STA_UNSYNC|STA_CLOCKERR)) != 0)
399 txc->errcnt = 0; 424 txc->errcnt = 0;
400 txc->stbcnt = 0; 425 txc->stbcnt = 0;
401 write_sequnlock_irq(&xtime_lock); 426 write_sequnlock_irq(&xtime_lock);
402 do_gettimeofday(&txc->time); 427
428 txc->time.tv_sec = ts.tv_sec;
429 txc->time.tv_usec = ts.tv_nsec;
430 if (!(time_status & STA_NANO))
431 txc->time.tv_usec /= NSEC_PER_USEC;
432
403 notify_cmos_timer(); 433 notify_cmos_timer();
404 return(result); 434
435 return result;
405} 436}
406 437
407static int __init ntp_tick_adj_setup(char *str) 438static int __init ntp_tick_adj_setup(char *str)
@@ -411,3 +442,10 @@ static int __init ntp_tick_adj_setup(char *str)
411} 442}
412 443
413__setup("ntp_tick_adj=", ntp_tick_adj_setup); 444__setup("ntp_tick_adj=", ntp_tick_adj_setup);
445
446void __init ntp_init(void)
447{
448 ntp_clear();
449 hrtimer_init(&leap_timer, CLOCK_REALTIME, HRTIMER_MODE_ABS);
450 leap_timer.function = ntp_leap_second;
451}
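
(For context on the do_adjtimex() rework above: the user-visible struct timex still carries frequency in 16-bit scaled parts-per-million; only the in-kernel representation behind time_freq and NTP_SCALE_SHIFT changed. A minimal user-space sketch, assuming the glibc adjtimex() wrapper and not part of this patch, that reads the resulting clock state back out:

        #include <stdio.h>
        #include <sys/timex.h>

        int main(void)
        {
                struct timex tx = { .modes = 0 };       /* read-only query */
                int state = adjtimex(&tx);

                if (state == -1) {
                        perror("adjtimex");
                        return 1;
                }
                /* tx.freq is parts-per-million with a 16-bit fractional part */
                printf("state=%d freq=%.3f ppm maxerror=%ld us status=0x%x\n",
                       state, tx.freq / 65536.0, tx.maxerror, tx.status);
                return 0;
        }

On a machine disciplined by ntpd this typically reports state 0 (TIME_OK) and a small fractional ppm value.)
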
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 2d6087c7cf98..e91c29f961c9 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -53,7 +53,7 @@ void update_xtime_cache(u64 nsec)
53 timespec_add_ns(&xtime_cache, nsec); 53 timespec_add_ns(&xtime_cache, nsec);
54} 54}
55 55
56static struct clocksource *clock; /* pointer to current clocksource */ 56struct clocksource *clock;
57 57
58 58
59#ifdef CONFIG_GENERIC_TIME 59#ifdef CONFIG_GENERIC_TIME
@@ -246,7 +246,7 @@ void __init timekeeping_init(void)
246 246
247 write_seqlock_irqsave(&xtime_lock, flags); 247 write_seqlock_irqsave(&xtime_lock, flags);
248 248
249 ntp_clear(); 249 ntp_init();
250 250
251 clock = clocksource_get_next(); 251 clock = clocksource_get_next();
252 clocksource_calculate_interval(clock, NTP_INTERVAL_LENGTH); 252 clocksource_calculate_interval(clock, NTP_INTERVAL_LENGTH);
@@ -371,7 +371,7 @@ static __always_inline int clocksource_bigadjust(s64 error, s64 *interval,
371 * here. This is tuned so that an error of about 1 msec is adjusted 371 * here. This is tuned so that an error of about 1 msec is adjusted
372 * within about 1 sec (or 2^20 nsec in 2^SHIFT_HZ ticks). 372 * within about 1 sec (or 2^20 nsec in 2^SHIFT_HZ ticks).
373 */ 373 */
374 error2 = clock->error >> (TICK_LENGTH_SHIFT + 22 - 2 * SHIFT_HZ); 374 error2 = clock->error >> (NTP_SCALE_SHIFT + 22 - 2 * SHIFT_HZ);
375 error2 = abs(error2); 375 error2 = abs(error2);
376 for (look_ahead = 0; error2 > 0; look_ahead++) 376 for (look_ahead = 0; error2 > 0; look_ahead++)
377 error2 >>= 2; 377 error2 >>= 2;
@@ -380,8 +380,7 @@ static __always_inline int clocksource_bigadjust(s64 error, s64 *interval,
380 * Now calculate the error in (1 << look_ahead) ticks, but first 380 * Now calculate the error in (1 << look_ahead) ticks, but first
381 * remove the single look ahead already included in the error. 381 * remove the single look ahead already included in the error.
382 */ 382 */
383 tick_error = current_tick_length() >> 383 tick_error = tick_length >> (NTP_SCALE_SHIFT - clock->shift + 1);
384 (TICK_LENGTH_SHIFT - clock->shift + 1);
385 tick_error -= clock->xtime_interval >> 1; 384 tick_error -= clock->xtime_interval >> 1;
386 error = ((error - tick_error) >> look_ahead) + tick_error; 385 error = ((error - tick_error) >> look_ahead) + tick_error;
387 386
@@ -412,7 +411,7 @@ static void clocksource_adjust(s64 offset)
412 s64 error, interval = clock->cycle_interval; 411 s64 error, interval = clock->cycle_interval;
413 int adj; 412 int adj;
414 413
415 error = clock->error >> (TICK_LENGTH_SHIFT - clock->shift - 1); 414 error = clock->error >> (NTP_SCALE_SHIFT - clock->shift - 1);
416 if (error > interval) { 415 if (error > interval) {
417 error >>= 2; 416 error >>= 2;
418 if (likely(error <= interval)) 417 if (likely(error <= interval))
@@ -434,7 +433,7 @@ static void clocksource_adjust(s64 offset)
434 clock->xtime_interval += interval; 433 clock->xtime_interval += interval;
435 clock->xtime_nsec -= offset; 434 clock->xtime_nsec -= offset;
436 clock->error -= (interval - offset) << 435 clock->error -= (interval - offset) <<
437 (TICK_LENGTH_SHIFT - clock->shift); 436 (NTP_SCALE_SHIFT - clock->shift);
438} 437}
439 438
440/** 439/**
@@ -473,8 +472,8 @@ void update_wall_time(void)
473 } 472 }
474 473
475 /* accumulate error between NTP and clock interval */ 474 /* accumulate error between NTP and clock interval */
476 clock->error += current_tick_length(); 475 clock->error += tick_length;
477 clock->error -= clock->xtime_interval << (TICK_LENGTH_SHIFT - clock->shift); 476 clock->error -= clock->xtime_interval << (NTP_SCALE_SHIFT - clock->shift);
478 } 477 }
479 478
480 /* correct the clock when NTP error is too big */ 479 /* correct the clock when NTP error is too big */
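
(The update_wall_time() hunk above accumulates the NTP/clocksource mismatch in a fixed-point error term scaled by NTP_SCALE_SHIFT. A stand-alone sketch with made-up constants -- the DEMO_* names and values are illustrative, not the kernel's -- of how that accumulation behaves over one simulated second of 4 ms ticks:

        #include <stdint.h>
        #include <stdio.h>

        /* Illustrative stand-ins for NTP_SCALE_SHIFT and a clocksource shift. */
        #define DEMO_NTP_SCALE_SHIFT    32
        #define DEMO_CLOCK_SHIFT        22

        int main(void)
        {
                /* One NTP tick in nanoseconds, scaled up by DEMO_NTP_SCALE_SHIFT. */
                int64_t tick_length    = (int64_t)4000000 << DEMO_NTP_SCALE_SHIFT;
                /* What the clocksource actually accumulates per interval. */
                int64_t xtime_interval = (int64_t)3999990 << DEMO_CLOCK_SHIFT;
                int64_t error = 0;

                for (int i = 0; i < 250; i++) {
                        error += tick_length;
                        error -= xtime_interval <<
                                 (DEMO_NTP_SCALE_SHIFT - DEMO_CLOCK_SHIFT);
                }
                /* Remaining error after one second, back in plain nanoseconds. */
                printf("accumulated error: %lld ns\n",
                       (long long)(error >> DEMO_NTP_SCALE_SHIFT));
                return 0;
        }

With a 10 ns per-tick mismatch over 250 ticks the demo prints roughly 2500 ns, the kind of residual that clocksource_adjust() then works against.)
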
diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c
index 67fe8fc21fb1..a40e20fd0001 100644
--- a/kernel/time/timer_list.c
+++ b/kernel/time/timer_list.c
@@ -278,12 +278,9 @@ static int __init init_timer_list_procfs(void)
278{ 278{
279 struct proc_dir_entry *pe; 279 struct proc_dir_entry *pe;
280 280
281 pe = create_proc_entry("timer_list", 0644, NULL); 281 pe = proc_create("timer_list", 0644, NULL, &timer_list_fops);
282 if (!pe) 282 if (!pe)
283 return -ENOMEM; 283 return -ENOMEM;
284
285 pe->proc_fops = &timer_list_fops;
286
287 return 0; 284 return 0;
288} 285}
289__initcall(init_timer_list_procfs); 286__initcall(init_timer_list_procfs);
diff --git a/kernel/time/timer_stats.c b/kernel/time/timer_stats.c
index 417da8c5bc72..c994530d166d 100644
--- a/kernel/time/timer_stats.c
+++ b/kernel/time/timer_stats.c
@@ -415,12 +415,9 @@ static int __init init_tstats_procfs(void)
415{ 415{
416 struct proc_dir_entry *pe; 416 struct proc_dir_entry *pe;
417 417
418 pe = create_proc_entry("timer_stats", 0644, NULL); 418 pe = proc_create("timer_stats", 0644, NULL, &tstats_fops);
419 if (!pe) 419 if (!pe)
420 return -ENOMEM; 420 return -ENOMEM;
421
422 pe->proc_fops = &tstats_fops;
423
424 return 0; 421 return 0;
425} 422}
426__initcall(init_tstats_procfs); 423__initcall(init_tstats_procfs);
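
(Both timer_list.c and timer_stats.c above switch from create_proc_entry() plus a separate ->proc_fops assignment to a single proc_create() call, which closes the window where the entry is visible without its file_operations. A self-contained module sketch of the same pattern -- the demo_* names are hypothetical, only the API usage mirrors the hunks:

        #include <linux/module.h>
        #include <linux/init.h>
        #include <linux/proc_fs.h>
        #include <linux/seq_file.h>

        static int demo_show(struct seq_file *m, void *v)
        {
                seq_printf(m, "hello from demo\n");
                return 0;
        }

        static int demo_open(struct inode *inode, struct file *file)
        {
                return single_open(file, demo_show, NULL);
        }

        static const struct file_operations demo_fops = {
                .owner   = THIS_MODULE,
                .open    = demo_open,
                .read    = seq_read,
                .llseek  = seq_lseek,
                .release = single_release,
        };

        static int __init demo_init(void)
        {
                /* proc_create() registers the entry together with its
                 * file_operations, so it is never visible half-initialized. */
                if (!proc_create("demo_entry", 0444, NULL, &demo_fops))
                        return -ENOMEM;
                return 0;
        }

        static void __exit demo_exit(void)
        {
                remove_proc_entry("demo_entry", NULL);
        }

        module_init(demo_init);
        module_exit(demo_exit);
        MODULE_LICENSE("GPL");
)
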
diff --git a/kernel/timeconst.pl b/kernel/timeconst.pl
index 41468035473c..eb51d76e058a 100644
--- a/kernel/timeconst.pl
+++ b/kernel/timeconst.pl
@@ -1,7 +1,7 @@
1#!/usr/bin/perl 1#!/usr/bin/perl
2# ----------------------------------------------------------------------- 2# -----------------------------------------------------------------------
3# 3#
4# Copyright 2007 rPath, Inc. - All Rights Reserved 4# Copyright 2007-2008 rPath, Inc. - All Rights Reserved
5# 5#
6# This file is part of the Linux kernel, and is made available under 6# This file is part of the Linux kernel, and is made available under
7# the terms of the GNU General Public License version 2 or (at your 7# the terms of the GNU General Public License version 2 or (at your
@@ -20,198 +20,138 @@
20%canned_values = ( 20%canned_values = (
21 24 => [ 21 24 => [
22 '0xa6aaaaab','0x2aaaaaa',26, 22 '0xa6aaaaab','0x2aaaaaa',26,
23 '0xa6aaaaaaaaaaaaab','0x2aaaaaaaaaaaaaa',58,
24 125,3, 23 125,3,
25 '0xc49ba5e4','0x1fbe76c8b4',37, 24 '0xc49ba5e4','0x1fbe76c8b4',37,
26 '0xc49ba5e353f7ceda','0x1fbe76c8b439581062',69,
27 3,125, 25 3,125,
28 '0xa2c2aaab','0xaaaa',16, 26 '0xa2c2aaab','0xaaaa',16,
29 '0xa2c2aaaaaaaaaaab','0xaaaaaaaaaaaa',48,
30 125000,3, 27 125000,3,
31 '0xc9539b89','0x7fffbce4217d',47, 28 '0xc9539b89','0x7fffbce4217d',47,
32 '0xc9539b8887229e91','0x7fffbce4217d2849cb25',79,
33 3,125000, 29 3,125000,
34 ], 32 => [ 30 ], 32 => [
35 '0xfa000000','0x6000000',27, 31 '0xfa000000','0x6000000',27,
36 '0xfa00000000000000','0x600000000000000',59,
37 125,4, 32 125,4,
38 '0x83126e98','0xfdf3b645a',36, 33 '0x83126e98','0xfdf3b645a',36,
39 '0x83126e978d4fdf3c','0xfdf3b645a1cac0831',68,
40 4,125, 34 4,125,
41 '0xf4240000','0x0',17, 35 '0xf4240000','0x0',17,
42 '0xf424000000000000','0x0',49,
43 31250,1, 36 31250,1,
44 '0x8637bd06','0x3fff79c842fa',46, 37 '0x8637bd06','0x3fff79c842fa',46,
45 '0x8637bd05af6c69b6','0x3fff79c842fa5093964a',78,
46 1,31250, 38 1,31250,
47 ], 48 => [ 39 ], 48 => [
48 '0xa6aaaaab','0x6aaaaaa',27, 40 '0xa6aaaaab','0x6aaaaaa',27,
49 '0xa6aaaaaaaaaaaaab','0x6aaaaaaaaaaaaaa',59,
50 125,6, 41 125,6,
51 '0xc49ba5e4','0xfdf3b645a',36, 42 '0xc49ba5e4','0xfdf3b645a',36,
52 '0xc49ba5e353f7ceda','0xfdf3b645a1cac0831',68,
53 6,125, 43 6,125,
54 '0xa2c2aaab','0x15555',17, 44 '0xa2c2aaab','0x15555',17,
55 '0xa2c2aaaaaaaaaaab','0x1555555555555',49,
56 62500,3, 45 62500,3,
57 '0xc9539b89','0x3fffbce4217d',46, 46 '0xc9539b89','0x3fffbce4217d',46,
58 '0xc9539b8887229e91','0x3fffbce4217d2849cb25',78,
59 3,62500, 47 3,62500,
60 ], 64 => [ 48 ], 64 => [
61 '0xfa000000','0xe000000',28, 49 '0xfa000000','0xe000000',28,
62 '0xfa00000000000000','0xe00000000000000',60,
63 125,8, 50 125,8,
64 '0x83126e98','0x7ef9db22d',35, 51 '0x83126e98','0x7ef9db22d',35,
65 '0x83126e978d4fdf3c','0x7ef9db22d0e560418',67,
66 8,125, 52 8,125,
67 '0xf4240000','0x0',18, 53 '0xf4240000','0x0',18,
68 '0xf424000000000000','0x0',50,
69 15625,1, 54 15625,1,
70 '0x8637bd06','0x1fff79c842fa',45, 55 '0x8637bd06','0x1fff79c842fa',45,
71 '0x8637bd05af6c69b6','0x1fff79c842fa5093964a',77,
72 1,15625, 56 1,15625,
73 ], 100 => [ 57 ], 100 => [
74 '0xa0000000','0x0',28, 58 '0xa0000000','0x0',28,
75 '0xa000000000000000','0x0',60,
76 10,1, 59 10,1,
77 '0xcccccccd','0x733333333',35, 60 '0xcccccccd','0x733333333',35,
78 '0xcccccccccccccccd','0x73333333333333333',67,
79 1,10, 61 1,10,
80 '0x9c400000','0x0',18, 62 '0x9c400000','0x0',18,
81 '0x9c40000000000000','0x0',50,
82 10000,1, 63 10000,1,
83 '0xd1b71759','0x1fff2e48e8a7',45, 64 '0xd1b71759','0x1fff2e48e8a7',45,
84 '0xd1b71758e219652c','0x1fff2e48e8a71de69ad4',77,
85 1,10000, 65 1,10000,
86 ], 122 => [ 66 ], 122 => [
87 '0x8325c53f','0xfbcda3a',28, 67 '0x8325c53f','0xfbcda3a',28,
88 '0x8325c53ef368eb05','0xfbcda3ac10c9714',60,
89 500,61, 68 500,61,
90 '0xf9db22d1','0x7fbe76c8b',35, 69 '0xf9db22d1','0x7fbe76c8b',35,
91 '0xf9db22d0e560418a','0x7fbe76c8b43958106',67,
92 61,500, 70 61,500,
93 '0x8012e2a0','0x3ef36',18, 71 '0x8012e2a0','0x3ef36',18,
94 '0x8012e29f79b47583','0x3ef368eb04325',50,
95 500000,61, 72 500000,61,
96 '0xffda4053','0x1ffffbce4217',45, 73 '0xffda4053','0x1ffffbce4217',45,
97 '0xffda4052d666a983','0x1ffffbce4217d2849cb2',77,
98 61,500000, 74 61,500000,
99 ], 128 => [ 75 ], 128 => [
100 '0xfa000000','0x1e000000',29, 76 '0xfa000000','0x1e000000',29,
101 '0xfa00000000000000','0x1e00000000000000',61,
102 125,16, 77 125,16,
103 '0x83126e98','0x3f7ced916',34, 78 '0x83126e98','0x3f7ced916',34,
104 '0x83126e978d4fdf3c','0x3f7ced916872b020c',66,
105 16,125, 79 16,125,
106 '0xf4240000','0x40000',19, 80 '0xf4240000','0x40000',19,
107 '0xf424000000000000','0x4000000000000',51,
108 15625,2, 81 15625,2,
109 '0x8637bd06','0xfffbce4217d',44, 82 '0x8637bd06','0xfffbce4217d',44,
110 '0x8637bd05af6c69b6','0xfffbce4217d2849cb25',76,
111 2,15625, 83 2,15625,
112 ], 200 => [ 84 ], 200 => [
113 '0xa0000000','0x0',29, 85 '0xa0000000','0x0',29,
114 '0xa000000000000000','0x0',61,
115 5,1, 86 5,1,
116 '0xcccccccd','0x333333333',34, 87 '0xcccccccd','0x333333333',34,
117 '0xcccccccccccccccd','0x33333333333333333',66,
118 1,5, 88 1,5,
119 '0x9c400000','0x0',19, 89 '0x9c400000','0x0',19,
120 '0x9c40000000000000','0x0',51,
121 5000,1, 90 5000,1,
122 '0xd1b71759','0xfff2e48e8a7',44, 91 '0xd1b71759','0xfff2e48e8a7',44,
123 '0xd1b71758e219652c','0xfff2e48e8a71de69ad4',76,
124 1,5000, 92 1,5000,
125 ], 250 => [ 93 ], 250 => [
126 '0x80000000','0x0',29, 94 '0x80000000','0x0',29,
127 '0x8000000000000000','0x0',61,
128 4,1, 95 4,1,
129 '0x80000000','0x180000000',33, 96 '0x80000000','0x180000000',33,
130 '0x8000000000000000','0x18000000000000000',65,
131 1,4, 97 1,4,
132 '0xfa000000','0x0',20, 98 '0xfa000000','0x0',20,
133 '0xfa00000000000000','0x0',52,
134 4000,1, 99 4000,1,
135 '0x83126e98','0x7ff7ced9168',43, 100 '0x83126e98','0x7ff7ced9168',43,
136 '0x83126e978d4fdf3c','0x7ff7ced916872b020c4',75,
137 1,4000, 101 1,4000,
138 ], 256 => [ 102 ], 256 => [
139 '0xfa000000','0x3e000000',30, 103 '0xfa000000','0x3e000000',30,
140 '0xfa00000000000000','0x3e00000000000000',62,
141 125,32, 104 125,32,
142 '0x83126e98','0x1fbe76c8b',33, 105 '0x83126e98','0x1fbe76c8b',33,
143 '0x83126e978d4fdf3c','0x1fbe76c8b43958106',65,
144 32,125, 106 32,125,
145 '0xf4240000','0xc0000',20, 107 '0xf4240000','0xc0000',20,
146 '0xf424000000000000','0xc000000000000',52,
147 15625,4, 108 15625,4,
148 '0x8637bd06','0x7ffde7210be',43, 109 '0x8637bd06','0x7ffde7210be',43,
149 '0x8637bd05af6c69b6','0x7ffde7210be9424e592',75,
150 4,15625, 110 4,15625,
151 ], 300 => [ 111 ], 300 => [
152 '0xd5555556','0x2aaaaaaa',30, 112 '0xd5555556','0x2aaaaaaa',30,
153 '0xd555555555555556','0x2aaaaaaaaaaaaaaa',62,
154 10,3, 113 10,3,
155 '0x9999999a','0x1cccccccc',33, 114 '0x9999999a','0x1cccccccc',33,
156 '0x999999999999999a','0x1cccccccccccccccc',65,
157 3,10, 115 3,10,
158 '0xd0555556','0xaaaaa',20, 116 '0xd0555556','0xaaaaa',20,
159 '0xd055555555555556','0xaaaaaaaaaaaaa',52,
160 10000,3, 117 10000,3,
161 '0x9d495183','0x7ffcb923a29',43, 118 '0x9d495183','0x7ffcb923a29',43,
162 '0x9d495182a9930be1','0x7ffcb923a29c779a6b5',75,
163 3,10000, 119 3,10000,
164 ], 512 => [ 120 ], 512 => [
165 '0xfa000000','0x7e000000',31, 121 '0xfa000000','0x7e000000',31,
166 '0xfa00000000000000','0x7e00000000000000',63,
167 125,64, 122 125,64,
168 '0x83126e98','0xfdf3b645',32, 123 '0x83126e98','0xfdf3b645',32,
169 '0x83126e978d4fdf3c','0xfdf3b645a1cac083',64,
170 64,125, 124 64,125,
171 '0xf4240000','0x1c0000',21, 125 '0xf4240000','0x1c0000',21,
172 '0xf424000000000000','0x1c000000000000',53,
173 15625,8, 126 15625,8,
174 '0x8637bd06','0x3ffef39085f',42, 127 '0x8637bd06','0x3ffef39085f',42,
175 '0x8637bd05af6c69b6','0x3ffef39085f4a1272c9',74,
176 8,15625, 128 8,15625,
177 ], 1000 => [ 129 ], 1000 => [
178 '0x80000000','0x0',31, 130 '0x80000000','0x0',31,
179 '0x8000000000000000','0x0',63,
180 1,1, 131 1,1,
181 '0x80000000','0x0',31, 132 '0x80000000','0x0',31,
182 '0x8000000000000000','0x0',63,
183 1,1, 133 1,1,
184 '0xfa000000','0x0',22, 134 '0xfa000000','0x0',22,
185 '0xfa00000000000000','0x0',54,
186 1000,1, 135 1000,1,
187 '0x83126e98','0x1ff7ced9168',41, 136 '0x83126e98','0x1ff7ced9168',41,
188 '0x83126e978d4fdf3c','0x1ff7ced916872b020c4',73,
189 1,1000, 137 1,1000,
190 ], 1024 => [ 138 ], 1024 => [
191 '0xfa000000','0xfe000000',32, 139 '0xfa000000','0xfe000000',32,
192 '0xfa00000000000000','0xfe00000000000000',64,
193 125,128, 140 125,128,
194 '0x83126e98','0x7ef9db22',31, 141 '0x83126e98','0x7ef9db22',31,
195 '0x83126e978d4fdf3c','0x7ef9db22d0e56041',63,
196 128,125, 142 128,125,
197 '0xf4240000','0x3c0000',22, 143 '0xf4240000','0x3c0000',22,
198 '0xf424000000000000','0x3c000000000000',54,
199 15625,16, 144 15625,16,
200 '0x8637bd06','0x1fff79c842f',41, 145 '0x8637bd06','0x1fff79c842f',41,
201 '0x8637bd05af6c69b6','0x1fff79c842fa5093964',73,
202 16,15625, 146 16,15625,
203 ], 1200 => [ 147 ], 1200 => [
204 '0xd5555556','0xd5555555',32, 148 '0xd5555556','0xd5555555',32,
205 '0xd555555555555556','0xd555555555555555',64,
206 5,6, 149 5,6,
207 '0x9999999a','0x66666666',31, 150 '0x9999999a','0x66666666',31,
208 '0x999999999999999a','0x6666666666666666',63,
209 6,5, 151 6,5,
210 '0xd0555556','0x2aaaaa',22, 152 '0xd0555556','0x2aaaaa',22,
211 '0xd055555555555556','0x2aaaaaaaaaaaaa',54,
212 2500,3, 153 2500,3,
213 '0x9d495183','0x1ffcb923a29',41, 154 '0x9d495183','0x1ffcb923a29',41,
214 '0x9d495182a9930be1','0x1ffcb923a29c779a6b5',73,
215 3,2500, 155 3,2500,
216 ] 156 ]
217); 157);
@@ -264,6 +204,15 @@ sub fmuls($$$) {
264 return 0; 204 return 0;
265} 205}
266 206
207# Generate a hex value if the result fits in 64 bits;
208# otherwise skip.
209sub bignum_hex($) {
210 my($x) = @_;
211 my $s = $x->as_hex();
212
213 return (length($s) > 18) ? undef : $s;
214}
215
267# Provides mul, adj, and shr factors for a specific 216# Provides mul, adj, and shr factors for a specific
268# (bit, time, hz) combination 217# (bit, time, hz) combination
269sub muladj($$$) { 218sub muladj($$$) {
@@ -271,7 +220,7 @@ sub muladj($$$) {
271 my $s = fmuls($b, $t, $hz); 220 my $s = fmuls($b, $t, $hz);
272 my $m = fmul($s, $t, $hz); 221 my $m = fmul($s, $t, $hz);
273 my $a = fadj($s, $t, $hz); 222 my $a = fadj($s, $t, $hz);
274 return ($m->as_hex(), $a->as_hex(), $s); 223 return (bignum_hex($m), bignum_hex($a), $s);
275} 224}
276 225
277# Provides numerator, denominator values 226# Provides numerator, denominator values
@@ -288,12 +237,10 @@ sub conversions($$) {
288 237
289 # HZ_TO_xx 238 # HZ_TO_xx
290 push(@val, muladj(32, $t, $hz)); 239 push(@val, muladj(32, $t, $hz));
291 push(@val, muladj(64, $t, $hz));
292 push(@val, numden($t, $hz)); 240 push(@val, numden($t, $hz));
293 241
294 # xx_TO_HZ 242 # xx_TO_HZ
295 push(@val, muladj(32, $hz, $t)); 243 push(@val, muladj(32, $hz, $t));
296 push(@val, muladj(64, $hz, $t));
297 push(@val, numden($hz, $t)); 244 push(@val, numden($hz, $t));
298 245
299 return @val; 246 return @val;
@@ -318,6 +265,19 @@ sub compute_values($) {
318 return @val; 265 return @val;
319} 266}
320 267
268sub outputval($$)
269{
270 my($name, $val) = @_;
271 my $csuf;
272
273 if (defined($val)) {
274 if ($name !~ /SHR/) {
275 $val = "U64_C($val)";
276 }
277 printf "#define %-23s %s\n", $name.$csuf, $val.$csuf;
278 }
279}
280
321sub output($@) 281sub output($@)
322{ 282{
323 my($hz, @val) = @_; 283 my($hz, @val) = @_;
@@ -331,6 +291,7 @@ sub output($@)
331 print "\n"; 291 print "\n";
332 292
333 print "#include <linux/param.h>\n"; 293 print "#include <linux/param.h>\n";
294 print "#include <linux/types.h>\n";
334 295
335 print "\n"; 296 print "\n";
336 print "#if HZ != $hz\n"; 297 print "#if HZ != $hz\n";
@@ -340,15 +301,13 @@ sub output($@)
340 301
341 foreach $pfx ('HZ_TO_MSEC','MSEC_TO_HZ', 302 foreach $pfx ('HZ_TO_MSEC','MSEC_TO_HZ',
342 'HZ_TO_USEC','USEC_TO_HZ') { 303 'HZ_TO_USEC','USEC_TO_HZ') {
343 foreach $bit (32, 64) { 304 foreach $bit (32) {
344 foreach $suf ('MUL', 'ADJ', 'SHR') { 305 foreach $suf ('MUL', 'ADJ', 'SHR') {
345 printf "#define %-23s %s\n", 306 outputval("${pfx}_$suf$bit", shift(@val));
346 "${pfx}_$suf$bit", shift(@val);
347 } 307 }
348 } 308 }
349 foreach $suf ('NUM', 'DEN') { 309 foreach $suf ('NUM', 'DEN') {
350 printf "#define %-23s %s\n", 310 outputval("${pfx}_$suf", shift(@val));
351 "${pfx}_$suf", shift(@val);
352 } 311 }
353 } 312 }
354 313
@@ -356,6 +315,23 @@ sub output($@)
356 print "#endif /* KERNEL_TIMECONST_H */\n"; 315 print "#endif /* KERNEL_TIMECONST_H */\n";
357} 316}
358 317
318# Pretty-print Perl values
319sub perlvals(@) {
320 my $v;
321 my @l = ();
322
323 foreach $v (@_) {
324 if (!defined($v)) {
325 push(@l, 'undef');
326 } elsif ($v =~ /^0x/) {
327 push(@l, "\'".$v."\'");
328 } else {
329 push(@l, $v.'');
330 }
331 }
332 return join(',', @l);
333}
334
359($hz) = @ARGV; 335($hz) = @ARGV;
360 336
361# Use this to generate the %canned_values structure 337# Use this to generate the %canned_values structure
@@ -373,15 +349,15 @@ if ($hz eq '--can') {
373 print "$pf$hz => [\n"; 349 print "$pf$hz => [\n";
374 while (scalar(@values)) { 350 while (scalar(@values)) {
375 my $bit; 351 my $bit;
376 foreach $bit (32, 64) { 352 foreach $bit (32) {
377 my $m = shift(@values); 353 my $m = shift(@values);
378 my $a = shift(@values); 354 my $a = shift(@values);
379 my $s = shift(@values); 355 my $s = shift(@values);
380 print "\t\t\'",$m,"\',\'",$a,"\',",$s,",\n"; 356 print "\t\t", perlvals($m,$a,$s), ",\n";
381 } 357 }
382 my $n = shift(@values); 358 my $n = shift(@values);
383 my $d = shift(@values); 359 my $d = shift(@values);
384 print "\t\t",$n,',',$d,",\n"; 360 print "\t\t", perlvals($n,$d), ",\n";
385 } 361 }
386 print "\t]"; 362 print "\t]";
387 $pf = ', '; 363 $pf = ', ';
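
(After this change timeconst.pl emits only the 32-bit multiply/adjust/shift triples plus the NUM/DEN pairs, wraps the constants in U64_C(), and skips any define whose value would not fit in 64 bits. Using the HZ=250 row from the canned table above, a small user-space sketch of how such a triple turns a division into a multiply-add-shift; the function name is illustrative, the real helpers live in kernel/time.c:

        #include <stdint.h>
        #include <stdio.h>

        /* MSEC_TO_HZ constants for HZ=250, taken from the table above. */
        #define MSEC_TO_HZ_MUL32        UINT64_C(0x80000000)
        #define MSEC_TO_HZ_ADJ32        UINT64_C(0x180000000)
        #define MSEC_TO_HZ_SHR32        33

        static uint32_t msec_to_jiffies_demo(uint32_t msec)
        {
                /* Multiply-add-shift replaces a runtime division by 1000/HZ. */
                return (uint32_t)((MSEC_TO_HZ_MUL32 * msec + MSEC_TO_HZ_ADJ32)
                                  >> MSEC_TO_HZ_SHR32);
        }

        int main(void)
        {
                for (uint32_t m = 1; m <= 12; m += 3)
                        printf("%u ms -> %u jiffies at HZ=250\n",
                               m, msec_to_jiffies_demo(m));
                return 0;
        }

The ADJ term biases the result upward, so for example 1 ms still maps to at least one jiffy even though a jiffy is 4 ms at HZ=250.)
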
diff --git a/kernel/timer.c b/kernel/timer.c
index f3d35d4ea42e..ceacc6626572 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -320,14 +320,130 @@ static void timer_stats_account_timer(struct timer_list *timer)
320static void timer_stats_account_timer(struct timer_list *timer) {} 320static void timer_stats_account_timer(struct timer_list *timer) {}
321#endif 321#endif
322 322
323/** 323#ifdef CONFIG_DEBUG_OBJECTS_TIMERS
324 * init_timer - initialize a timer. 324
325 * @timer: the timer to be initialized 325static struct debug_obj_descr timer_debug_descr;
326 * 326
327 * init_timer() must be done to a timer prior calling *any* of the 327/*
328 * other timer functions. 328 * fixup_init is called when:
329 * - an active object is initialized
329 */ 330 */
330void init_timer(struct timer_list *timer) 331static int timer_fixup_init(void *addr, enum debug_obj_state state)
332{
333 struct timer_list *timer = addr;
334
335 switch (state) {
336 case ODEBUG_STATE_ACTIVE:
337 del_timer_sync(timer);
338 debug_object_init(timer, &timer_debug_descr);
339 return 1;
340 default:
341 return 0;
342 }
343}
344
345/*
346 * fixup_activate is called when:
347 * - an active object is activated
348 * - an unknown object is activated (might be a statically initialized object)
349 */
350static int timer_fixup_activate(void *addr, enum debug_obj_state state)
351{
352 struct timer_list *timer = addr;
353
354 switch (state) {
355
356 case ODEBUG_STATE_NOTAVAILABLE:
357 /*
358 * This is not really a fixup. The timer was
359 * statically initialized. We just make sure that it
360 * is tracked in the object tracker.
361 */
362 if (timer->entry.next == NULL &&
363 timer->entry.prev == TIMER_ENTRY_STATIC) {
364 debug_object_init(timer, &timer_debug_descr);
365 debug_object_activate(timer, &timer_debug_descr);
366 return 0;
367 } else {
368 WARN_ON_ONCE(1);
369 }
370 return 0;
371
372 case ODEBUG_STATE_ACTIVE:
373 WARN_ON(1);
374
375 default:
376 return 0;
377 }
378}
379
380/*
381 * fixup_free is called when:
382 * - an active object is freed
383 */
384static int timer_fixup_free(void *addr, enum debug_obj_state state)
385{
386 struct timer_list *timer = addr;
387
388 switch (state) {
389 case ODEBUG_STATE_ACTIVE:
390 del_timer_sync(timer);
391 debug_object_free(timer, &timer_debug_descr);
392 return 1;
393 default:
394 return 0;
395 }
396}
397
398static struct debug_obj_descr timer_debug_descr = {
399 .name = "timer_list",
400 .fixup_init = timer_fixup_init,
401 .fixup_activate = timer_fixup_activate,
402 .fixup_free = timer_fixup_free,
403};
404
405static inline void debug_timer_init(struct timer_list *timer)
406{
407 debug_object_init(timer, &timer_debug_descr);
408}
409
410static inline void debug_timer_activate(struct timer_list *timer)
411{
412 debug_object_activate(timer, &timer_debug_descr);
413}
414
415static inline void debug_timer_deactivate(struct timer_list *timer)
416{
417 debug_object_deactivate(timer, &timer_debug_descr);
418}
419
420static inline void debug_timer_free(struct timer_list *timer)
421{
422 debug_object_free(timer, &timer_debug_descr);
423}
424
425static void __init_timer(struct timer_list *timer);
426
427void init_timer_on_stack(struct timer_list *timer)
428{
429 debug_object_init_on_stack(timer, &timer_debug_descr);
430 __init_timer(timer);
431}
432EXPORT_SYMBOL_GPL(init_timer_on_stack);
433
434void destroy_timer_on_stack(struct timer_list *timer)
435{
436 debug_object_free(timer, &timer_debug_descr);
437}
438EXPORT_SYMBOL_GPL(destroy_timer_on_stack);
439
440#else
441static inline void debug_timer_init(struct timer_list *timer) { }
442static inline void debug_timer_activate(struct timer_list *timer) { }
443static inline void debug_timer_deactivate(struct timer_list *timer) { }
444#endif
445
446static void __init_timer(struct timer_list *timer)
331{ 447{
332 timer->entry.next = NULL; 448 timer->entry.next = NULL;
333 timer->base = __raw_get_cpu_var(tvec_bases); 449 timer->base = __raw_get_cpu_var(tvec_bases);
@@ -337,6 +453,19 @@ void init_timer(struct timer_list *timer)
337 memset(timer->start_comm, 0, TASK_COMM_LEN); 453 memset(timer->start_comm, 0, TASK_COMM_LEN);
338#endif 454#endif
339} 455}
456
457/**
458 * init_timer - initialize a timer.
459 * @timer: the timer to be initialized
460 *
461 * init_timer() must be done to a timer prior calling *any* of the
462 * other timer functions.
463 */
464void init_timer(struct timer_list *timer)
465{
466 debug_timer_init(timer);
467 __init_timer(timer);
468}
340EXPORT_SYMBOL(init_timer); 469EXPORT_SYMBOL(init_timer);
341 470
342void init_timer_deferrable(struct timer_list *timer) 471void init_timer_deferrable(struct timer_list *timer)
@@ -351,6 +480,8 @@ static inline void detach_timer(struct timer_list *timer,
351{ 480{
352 struct list_head *entry = &timer->entry; 481 struct list_head *entry = &timer->entry;
353 482
483 debug_timer_deactivate(timer);
484
354 __list_del(entry->prev, entry->next); 485 __list_del(entry->prev, entry->next);
355 if (clear_pending) 486 if (clear_pending)
356 entry->next = NULL; 487 entry->next = NULL;
@@ -405,6 +536,8 @@ int __mod_timer(struct timer_list *timer, unsigned long expires)
405 ret = 1; 536 ret = 1;
406 } 537 }
407 538
539 debug_timer_activate(timer);
540
408 new_base = __get_cpu_var(tvec_bases); 541 new_base = __get_cpu_var(tvec_bases);
409 542
410 if (base != new_base) { 543 if (base != new_base) {
@@ -450,6 +583,7 @@ void add_timer_on(struct timer_list *timer, int cpu)
450 BUG_ON(timer_pending(timer) || !timer->function); 583 BUG_ON(timer_pending(timer) || !timer->function);
451 spin_lock_irqsave(&base->lock, flags); 584 spin_lock_irqsave(&base->lock, flags);
452 timer_set_base(timer, base); 585 timer_set_base(timer, base);
586 debug_timer_activate(timer);
453 internal_add_timer(base, timer); 587 internal_add_timer(base, timer);
454 /* 588 /*
455 * Check whether the other CPU is idle and needs to be 589 * Check whether the other CPU is idle and needs to be
@@ -1086,11 +1220,14 @@ signed long __sched schedule_timeout(signed long timeout)
1086 1220
1087 expire = timeout + jiffies; 1221 expire = timeout + jiffies;
1088 1222
1089 setup_timer(&timer, process_timeout, (unsigned long)current); 1223 setup_timer_on_stack(&timer, process_timeout, (unsigned long)current);
1090 __mod_timer(&timer, expire); 1224 __mod_timer(&timer, expire);
1091 schedule(); 1225 schedule();
1092 del_singleshot_timer_sync(&timer); 1226 del_singleshot_timer_sync(&timer);
1093 1227
1228 /* Remove the timer from the object tracker */
1229 destroy_timer_on_stack(&timer);
1230
1094 timeout = expire - jiffies; 1231 timeout = expire - jiffies;
1095 1232
1096 out: 1233 out:
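
(The schedule_timeout() hunk above shows the intended life cycle for on-stack timers under CONFIG_DEBUG_OBJECTS_TIMERS: setup_timer_on_stack() or init_timer_on_stack() registers the object, and destroy_timer_on_stack() must run before the stack frame disappears. A rough in-kernel sketch of the same pattern -- the demo_* names are hypothetical, not from this patch:

        #include <linux/timer.h>
        #include <linux/sched.h>

        static void demo_timer_fn(unsigned long data)
        {
                wake_up_process((struct task_struct *)data);
        }

        /* Sleep until @expires using a timer that lives on this stack frame. */
        static void demo_sleep_until(unsigned long expires)
        {
                struct timer_list t;

                /* Registers the timer with the object tracker as stack-based. */
                setup_timer_on_stack(&t, demo_timer_fn, (unsigned long)current);

                set_current_state(TASK_UNINTERRUPTIBLE);
                mod_timer(&t, expires);
                schedule();

                del_singleshot_timer_sync(&t);
                /* Must pair with the _on_stack init, otherwise the tracker
                 * later sees an object freed without being destroyed. */
                destroy_timer_on_stack(&t);
        }
)
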
diff --git a/kernel/user.c b/kernel/user.c
index debce602bfdd..865ecf57a096 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -53,10 +53,6 @@ struct user_struct root_user = {
53 .files = ATOMIC_INIT(0), 53 .files = ATOMIC_INIT(0),
54 .sigpending = ATOMIC_INIT(0), 54 .sigpending = ATOMIC_INIT(0),
55 .locked_shm = 0, 55 .locked_shm = 0,
56#ifdef CONFIG_KEYS
57 .uid_keyring = &root_user_keyring,
58 .session_keyring = &root_session_keyring,
59#endif
60#ifdef CONFIG_USER_SCHED 56#ifdef CONFIG_USER_SCHED
61 .tg = &init_task_group, 57 .tg = &init_task_group,
62#endif 58#endif
@@ -388,7 +384,7 @@ void free_uid(struct user_struct *up)
388 local_irq_restore(flags); 384 local_irq_restore(flags);
389} 385}
390 386
391struct user_struct * alloc_uid(struct user_namespace *ns, uid_t uid) 387struct user_struct *alloc_uid(struct user_namespace *ns, uid_t uid)
392{ 388{
393 struct hlist_head *hashent = uidhashentry(ns, uid); 389 struct hlist_head *hashent = uidhashentry(ns, uid);
394 struct user_struct *up, *new; 390 struct user_struct *up, *new;
@@ -403,29 +399,15 @@ struct user_struct * alloc_uid(struct user_namespace *ns, uid_t uid)
403 spin_unlock_irq(&uidhash_lock); 399 spin_unlock_irq(&uidhash_lock);
404 400
405 if (!up) { 401 if (!up) {
406 new = kmem_cache_alloc(uid_cachep, GFP_KERNEL); 402 new = kmem_cache_zalloc(uid_cachep, GFP_KERNEL);
407 if (!new) 403 if (!new)
408 goto out_unlock; 404 goto out_unlock;
409 405
410 new->uid = uid; 406 new->uid = uid;
411 atomic_set(&new->__count, 1); 407 atomic_set(&new->__count, 1);
412 atomic_set(&new->processes, 0);
413 atomic_set(&new->files, 0);
414 atomic_set(&new->sigpending, 0);
415#ifdef CONFIG_INOTIFY_USER
416 atomic_set(&new->inotify_watches, 0);
417 atomic_set(&new->inotify_devs, 0);
418#endif
419#ifdef CONFIG_POSIX_MQUEUE
420 new->mq_bytes = 0;
421#endif
422 new->locked_shm = 0;
423
424 if (alloc_uid_keyring(new, current) < 0)
425 goto out_free_user;
426 408
427 if (sched_create_user(new) < 0) 409 if (sched_create_user(new) < 0)
428 goto out_put_keys; 410 goto out_free_user;
429 411
430 if (uids_user_create(new)) 412 if (uids_user_create(new))
431 goto out_destoy_sched; 413 goto out_destoy_sched;
@@ -459,9 +441,6 @@ struct user_struct * alloc_uid(struct user_namespace *ns, uid_t uid)
459 441
460out_destoy_sched: 442out_destoy_sched:
461 sched_destroy_user(new); 443 sched_destroy_user(new);
462out_put_keys:
463 key_put(new->uid_keyring);
464 key_put(new->session_keyring);
465out_free_user: 444out_free_user:
466 kmem_cache_free(uid_cachep, new); 445 kmem_cache_free(uid_cachep, new);
467out_unlock: 446out_unlock:
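
(The alloc_uid() change above relies on kmem_cache_zalloc() handing back a zeroed object, which is what lets the long run of atomic_set(..., 0) and "= 0" initializations disappear. A tiny illustrative fragment of the same idiom; the struct and names are made up and the cache is assumed to exist:

        #include <linux/slab.h>

        struct demo_user {
                int refs;
                unsigned long locked_shm;
                unsigned long mq_bytes;
        };

        static struct kmem_cache *demo_cachep;

        static struct demo_user *demo_alloc_user(void)
        {
                /* Zeroed allocation: every field starts at 0/NULL, so only
                 * non-zero initial values need explicit assignment. */
                struct demo_user *new = kmem_cache_zalloc(demo_cachep, GFP_KERNEL);

                if (new)
                        new->refs = 1;
                return new;
        }
)
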
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index 4c9006275df7..a9ab0596de44 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -8,6 +8,7 @@
8#include <linux/module.h> 8#include <linux/module.h>
9#include <linux/version.h> 9#include <linux/version.h>
10#include <linux/nsproxy.h> 10#include <linux/nsproxy.h>
11#include <linux/slab.h>
11#include <linux/user_namespace.h> 12#include <linux/user_namespace.h>
12 13
13/* 14/*
@@ -73,3 +74,4 @@ void free_user_ns(struct kref *kref)
73 release_uids(ns); 74 release_uids(ns);
74 kfree(ns); 75 kfree(ns);
75} 76}
77EXPORT_SYMBOL(free_user_ns);
diff --git a/kernel/utsname.c b/kernel/utsname.c
index 816d7b24fa03..64d398f12444 100644
--- a/kernel/utsname.c
+++ b/kernel/utsname.c
@@ -14,6 +14,7 @@
14#include <linux/utsname.h> 14#include <linux/utsname.h>
15#include <linux/version.h> 15#include <linux/version.h>
16#include <linux/err.h> 16#include <linux/err.h>
17#include <linux/slab.h>
17 18
18/* 19/*
19 * Clone a new ns copying an original utsname, setting refcount to 1 20 * Clone a new ns copying an original utsname, setting refcount to 1
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 00ff4d08e370..29fc39f1029c 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -158,8 +158,8 @@ static void __queue_work(struct cpu_workqueue_struct *cwq,
158 * 158 *
159 * Returns 0 if @work was already on a queue, non-zero otherwise. 159 * Returns 0 if @work was already on a queue, non-zero otherwise.
160 * 160 *
161 * We queue the work to the CPU it was submitted, but there is no 161 * We queue the work to the CPU on which it was submitted, but if the CPU dies
162 * guarantee that it will be processed by that CPU. 162 * it can be processed by another CPU.
163 */ 163 */
164int queue_work(struct workqueue_struct *wq, struct work_struct *work) 164int queue_work(struct workqueue_struct *wq, struct work_struct *work)
165{ 165{
@@ -195,7 +195,6 @@ static void delayed_work_timer_fn(unsigned long __data)
195int queue_delayed_work(struct workqueue_struct *wq, 195int queue_delayed_work(struct workqueue_struct *wq,
196 struct delayed_work *dwork, unsigned long delay) 196 struct delayed_work *dwork, unsigned long delay)
197{ 197{
198 timer_stats_timer_set_start_info(&dwork->timer);
199 if (delay == 0) 198 if (delay == 0)
200 return queue_work(wq, &dwork->work); 199 return queue_work(wq, &dwork->work);
201 200
@@ -219,11 +218,12 @@ int queue_delayed_work_on(int cpu, struct workqueue_struct *wq,
219 struct timer_list *timer = &dwork->timer; 218 struct timer_list *timer = &dwork->timer;
220 struct work_struct *work = &dwork->work; 219 struct work_struct *work = &dwork->work;
221 220
222 timer_stats_timer_set_start_info(&dwork->timer);
223 if (!test_and_set_bit(WORK_STRUCT_PENDING, work_data_bits(work))) { 221 if (!test_and_set_bit(WORK_STRUCT_PENDING, work_data_bits(work))) {
224 BUG_ON(timer_pending(timer)); 222 BUG_ON(timer_pending(timer));
225 BUG_ON(!list_empty(&work->entry)); 223 BUG_ON(!list_empty(&work->entry));
226 224
225 timer_stats_timer_set_start_info(&dwork->timer);
226
227 /* This stores cwq for the moment, for the timer_fn */ 227 /* This stores cwq for the moment, for the timer_fn */
228 set_wq_data(work, wq_per_cpu(wq, raw_smp_processor_id())); 228 set_wq_data(work, wq_per_cpu(wq, raw_smp_processor_id()));
229 timer->expires = jiffies + delay; 229 timer->expires = jiffies + delay;
@@ -247,7 +247,7 @@ static void run_workqueue(struct cpu_workqueue_struct *cwq)
247 if (cwq->run_depth > 3) { 247 if (cwq->run_depth > 3) {
248 /* morton gets to eat his hat */ 248 /* morton gets to eat his hat */
249 printk("%s: recursion depth exceeded: %d\n", 249 printk("%s: recursion depth exceeded: %d\n",
250 __FUNCTION__, cwq->run_depth); 250 __func__, cwq->run_depth);
251 dump_stack(); 251 dump_stack();
252 } 252 }
253 while (!list_empty(&cwq->worklist)) { 253 while (!list_empty(&cwq->worklist)) {
@@ -564,7 +564,6 @@ EXPORT_SYMBOL(schedule_work);
564int schedule_delayed_work(struct delayed_work *dwork, 564int schedule_delayed_work(struct delayed_work *dwork,
565 unsigned long delay) 565 unsigned long delay)
566{ 566{
567 timer_stats_timer_set_start_info(&dwork->timer);
568 return queue_delayed_work(keventd_wq, dwork, delay); 567 return queue_delayed_work(keventd_wq, dwork, delay);
569} 568}
570EXPORT_SYMBOL(schedule_delayed_work); 569EXPORT_SYMBOL(schedule_delayed_work);
@@ -581,7 +580,6 @@ EXPORT_SYMBOL(schedule_delayed_work);
581int schedule_delayed_work_on(int cpu, 580int schedule_delayed_work_on(int cpu,
582 struct delayed_work *dwork, unsigned long delay) 581 struct delayed_work *dwork, unsigned long delay)
583{ 582{
584 timer_stats_timer_set_start_info(&dwork->timer);
585 return queue_delayed_work_on(cpu, keventd_wq, dwork, delay); 583 return queue_delayed_work_on(cpu, keventd_wq, dwork, delay);
586} 584}
587EXPORT_SYMBOL(schedule_delayed_work_on); 585EXPORT_SYMBOL(schedule_delayed_work_on);
@@ -772,7 +770,7 @@ struct workqueue_struct *__create_workqueue_key(const char *name,
772} 770}
773EXPORT_SYMBOL_GPL(__create_workqueue_key); 771EXPORT_SYMBOL_GPL(__create_workqueue_key);
774 772
775static void cleanup_workqueue_thread(struct cpu_workqueue_struct *cwq, int cpu) 773static void cleanup_workqueue_thread(struct cpu_workqueue_struct *cwq)
776{ 774{
777 /* 775 /*
778 * Our caller is either destroy_workqueue() or CPU_DEAD, 776 * Our caller is either destroy_workqueue() or CPU_DEAD,
@@ -808,19 +806,16 @@ static void cleanup_workqueue_thread(struct cpu_workqueue_struct *cwq, int cpu)
808void destroy_workqueue(struct workqueue_struct *wq) 806void destroy_workqueue(struct workqueue_struct *wq)
809{ 807{
810 const cpumask_t *cpu_map = wq_cpu_map(wq); 808 const cpumask_t *cpu_map = wq_cpu_map(wq);
811 struct cpu_workqueue_struct *cwq;
812 int cpu; 809 int cpu;
813 810
814 get_online_cpus(); 811 get_online_cpus();
815 spin_lock(&workqueue_lock); 812 spin_lock(&workqueue_lock);
816 list_del(&wq->list); 813 list_del(&wq->list);
817 spin_unlock(&workqueue_lock); 814 spin_unlock(&workqueue_lock);
818 put_online_cpus();
819 815
820 for_each_cpu_mask(cpu, *cpu_map) { 816 for_each_cpu_mask(cpu, *cpu_map)
821 cwq = per_cpu_ptr(wq->cpu_wq, cpu); 817 cleanup_workqueue_thread(per_cpu_ptr(wq->cpu_wq, cpu));
822 cleanup_workqueue_thread(cwq, cpu); 818 put_online_cpus();
823 }
824 819
825 free_percpu(wq->cpu_wq); 820 free_percpu(wq->cpu_wq);
826 kfree(wq); 821 kfree(wq);
@@ -838,7 +833,6 @@ static int __devinit workqueue_cpu_callback(struct notifier_block *nfb,
838 action &= ~CPU_TASKS_FROZEN; 833 action &= ~CPU_TASKS_FROZEN;
839 834
840 switch (action) { 835 switch (action) {
841
842 case CPU_UP_PREPARE: 836 case CPU_UP_PREPARE:
843 cpu_set(cpu, cpu_populated_map); 837 cpu_set(cpu, cpu_populated_map);
844 } 838 }
@@ -861,11 +855,17 @@ static int __devinit workqueue_cpu_callback(struct notifier_block *nfb,
861 case CPU_UP_CANCELED: 855 case CPU_UP_CANCELED:
862 start_workqueue_thread(cwq, -1); 856 start_workqueue_thread(cwq, -1);
863 case CPU_DEAD: 857 case CPU_DEAD:
864 cleanup_workqueue_thread(cwq, cpu); 858 cleanup_workqueue_thread(cwq);
865 break; 859 break;
866 } 860 }
867 } 861 }
868 862
863 switch (action) {
864 case CPU_UP_CANCELED:
865 case CPU_DEAD:
866 cpu_clear(cpu, cpu_populated_map);
867 }
868
869 return NOTIFY_OK; 869 return NOTIFY_OK;
870} 870}
871 871
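
(To tie the workqueue hunks together: the timer_stats bookkeeping for delayed work now happens once, inside queue_delayed_work_on() after the PENDING bit has been won, and the updated queue_work() comment states explicitly that work queued on one CPU may run on another if that CPU goes down. A small hedged module sketch of the delayed-work API these hunks touch -- the demo_* names are hypothetical:

        #include <linux/module.h>
        #include <linux/init.h>
        #include <linux/workqueue.h>

        static void demo_work_fn(struct work_struct *work)
        {
                printk(KERN_INFO "demo delayed work executed\n");
        }

        static DECLARE_DELAYED_WORK(demo_dwork, demo_work_fn);

        static int __init demo_init(void)
        {
                /* Queued on the submitting CPU, but it may be processed by
                 * another CPU if the original one goes offline. */
                schedule_delayed_work(&demo_dwork, HZ / 2);
                return 0;
        }

        static void __exit demo_exit(void)
        {
                cancel_delayed_work_sync(&demo_dwork);
        }

        module_init(demo_init);
        module_exit(demo_exit);
        MODULE_LICENSE("GPL");
)
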