path: root/kernel
Diffstat (limited to 'kernel')
-rw-r--r--  kernel/audit.c | 216
-rw-r--r--  kernel/auditfilter.c | 9
-rw-r--r--  kernel/auditsc.c | 40
-rw-r--r--  kernel/fork.c | 2
-rw-r--r--  kernel/futex.c | 2
-rw-r--r--  kernel/hrtimer.c | 824
-rw-r--r--  kernel/irq/chip.c | 25
-rw-r--r--  kernel/irq/manage.c | 44
-rw-r--r--  kernel/irq/proc.c | 24
-rw-r--r--  kernel/itimer.c | 18
-rw-r--r--  kernel/kmod.c | 164
-rw-r--r--  kernel/kprobes.c | 113
-rw-r--r--  kernel/lockdep.c | 4
-rw-r--r--  kernel/module.c | 47
-rw-r--r--  kernel/params.c | 29
-rw-r--r--  kernel/posix-cpu-timers.c | 15
-rw-r--r--  kernel/posix-timers.c | 15
-rw-r--r--  kernel/printk.c | 2
-rw-r--r--  kernel/rtmutex.c | 2
-rw-r--r--  kernel/sched.c | 7
-rw-r--r--  kernel/signal.c | 58
-rw-r--r--  kernel/softirq.c | 19
-rw-r--r--  kernel/time.c | 254
-rw-r--r--  kernel/time/Kconfig | 25
-rw-r--r--  kernel/time/Makefile | 9
-rw-r--r--  kernel/time/clockevents.c | 345
-rw-r--r--  kernel/time/clocksource.c | 246
-rw-r--r--  kernel/time/jiffies.c | 1
-rw-r--r--  kernel/time/ntp.c | 30
-rw-r--r--  kernel/time/tick-broadcast.c | 480
-rw-r--r--  kernel/time/tick-common.c | 346
-rw-r--r--  kernel/time/tick-internal.h | 110
-rw-r--r--  kernel/time/tick-oneshot.c | 84
-rw-r--r--  kernel/time/tick-sched.c | 565
-rw-r--r--  kernel/time/timer_list.c | 287
-rw-r--r--  kernel/time/timer_stats.c | 411
-rw-r--r--  kernel/timer.c | 290
-rw-r--r--  kernel/tsacct.c | 2
-rw-r--r--  kernel/workqueue.c | 7
39 files changed, 4658 insertions, 513 deletions
diff --git a/kernel/audit.c b/kernel/audit.c
index d9b690ac684b..76c9a11b72d6 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -2,7 +2,7 @@
2 * Gateway between the kernel (e.g., selinux) and the user-space audit daemon. 2 * Gateway between the kernel (e.g., selinux) and the user-space audit daemon.
3 * System-call specific features have moved to auditsc.c 3 * System-call specific features have moved to auditsc.c
4 * 4 *
5 * Copyright 2003-2004 Red Hat Inc., Durham, North Carolina. 5 * Copyright 2003-2007 Red Hat Inc., Durham, North Carolina.
6 * All Rights Reserved. 6 * All Rights Reserved.
7 * 7 *
8 * This program is free software; you can redistribute it and/or modify 8 * This program is free software; you can redistribute it and/or modify
@@ -65,7 +65,9 @@
65 * (Initialization happens after skb_init is called.) */ 65 * (Initialization happens after skb_init is called.) */
66static int audit_initialized; 66static int audit_initialized;
67 67
68/* No syscall auditing will take place unless audit_enabled != 0. */ 68/* 0 - no auditing
69 * 1 - auditing enabled
70 * 2 - auditing enabled and configuration is locked/unchangeable. */
69int audit_enabled; 71int audit_enabled;
70 72
71/* Default state when kernel boots without any parameters. */ 73/* Default state when kernel boots without any parameters. */
@@ -239,102 +241,150 @@ void audit_log_lost(const char *message)
239 241
240static int audit_set_rate_limit(int limit, uid_t loginuid, u32 sid) 242static int audit_set_rate_limit(int limit, uid_t loginuid, u32 sid)
241{ 243{
242 int old = audit_rate_limit; 244 int res, rc = 0, old = audit_rate_limit;
245
246 /* check if we are locked */
247 if (audit_enabled == 2)
248 res = 0;
249 else
250 res = 1;
243 251
244 if (sid) { 252 if (sid) {
245 char *ctx = NULL; 253 char *ctx = NULL;
246 u32 len; 254 u32 len;
247 int rc; 255 if ((rc = selinux_sid_to_string(sid, &ctx, &len)) == 0) {
248 if ((rc = selinux_sid_to_string(sid, &ctx, &len)))
249 return rc;
250 else
251 audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE, 256 audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE,
252 "audit_rate_limit=%d old=%d by auid=%u subj=%s", 257 "audit_rate_limit=%d old=%d by auid=%u"
253 limit, old, loginuid, ctx); 258 " subj=%s res=%d",
254 kfree(ctx); 259 limit, old, loginuid, ctx, res);
255 } else 260 kfree(ctx);
256 audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE, 261 } else
257 "audit_rate_limit=%d old=%d by auid=%u", 262 res = 0; /* Something weird, deny request */
258 limit, old, loginuid); 263 }
259 audit_rate_limit = limit; 264 audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE,
260 return 0; 265 "audit_rate_limit=%d old=%d by auid=%u res=%d",
266 limit, old, loginuid, res);
267
268 /* If we are allowed, make the change */
269 if (res == 1)
270 audit_rate_limit = limit;
271 /* Not allowed, update reason */
272 else if (rc == 0)
273 rc = -EPERM;
274 return rc;
261} 275}
262 276
263static int audit_set_backlog_limit(int limit, uid_t loginuid, u32 sid) 277static int audit_set_backlog_limit(int limit, uid_t loginuid, u32 sid)
264{ 278{
265 int old = audit_backlog_limit; 279 int res, rc = 0, old = audit_backlog_limit;
280
281 /* check if we are locked */
282 if (audit_enabled == 2)
283 res = 0;
284 else
285 res = 1;
266 286
267 if (sid) { 287 if (sid) {
268 char *ctx = NULL; 288 char *ctx = NULL;
269 u32 len; 289 u32 len;
270 int rc; 290 if ((rc = selinux_sid_to_string(sid, &ctx, &len)) == 0) {
271 if ((rc = selinux_sid_to_string(sid, &ctx, &len)))
272 return rc;
273 else
274 audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE, 291 audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE,
275 "audit_backlog_limit=%d old=%d by auid=%u subj=%s", 292 "audit_backlog_limit=%d old=%d by auid=%u"
276 limit, old, loginuid, ctx); 293 " subj=%s res=%d",
277 kfree(ctx); 294 limit, old, loginuid, ctx, res);
278 } else 295 kfree(ctx);
279 audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE, 296 } else
280 "audit_backlog_limit=%d old=%d by auid=%u", 297 res = 0; /* Something weird, deny request */
281 limit, old, loginuid); 298 }
282 audit_backlog_limit = limit; 299 audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE,
283 return 0; 300 "audit_backlog_limit=%d old=%d by auid=%u res=%d",
301 limit, old, loginuid, res);
302
303 /* If we are allowed, make the change */
304 if (res == 1)
305 audit_backlog_limit = limit;
306 /* Not allowed, update reason */
307 else if (rc == 0)
308 rc = -EPERM;
309 return rc;
284} 310}
285 311
286static int audit_set_enabled(int state, uid_t loginuid, u32 sid) 312static int audit_set_enabled(int state, uid_t loginuid, u32 sid)
287{ 313{
288 int old = audit_enabled; 314 int res, rc = 0, old = audit_enabled;
289 315
290 if (state != 0 && state != 1) 316 if (state < 0 || state > 2)
291 return -EINVAL; 317 return -EINVAL;
292 318
319 /* check if we are locked */
320 if (audit_enabled == 2)
321 res = 0;
322 else
323 res = 1;
324
293 if (sid) { 325 if (sid) {
294 char *ctx = NULL; 326 char *ctx = NULL;
295 u32 len; 327 u32 len;
296 int rc; 328 if ((rc = selinux_sid_to_string(sid, &ctx, &len)) == 0) {
297 if ((rc = selinux_sid_to_string(sid, &ctx, &len)))
298 return rc;
299 else
300 audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE, 329 audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE,
301 "audit_enabled=%d old=%d by auid=%u subj=%s", 330 "audit_enabled=%d old=%d by auid=%u"
302 state, old, loginuid, ctx); 331 " subj=%s res=%d",
303 kfree(ctx); 332 state, old, loginuid, ctx, res);
304 } else 333 kfree(ctx);
305 audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE, 334 } else
306 "audit_enabled=%d old=%d by auid=%u", 335 res = 0; /* Something weird, deny request */
307 state, old, loginuid); 336 }
308 audit_enabled = state; 337 audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE,
309 return 0; 338 "audit_enabled=%d old=%d by auid=%u res=%d",
339 state, old, loginuid, res);
340
341 /* If we are allowed, make the change */
342 if (res == 1)
343 audit_enabled = state;
344 /* Not allowed, update reason */
345 else if (rc == 0)
346 rc = -EPERM;
347 return rc;
310} 348}
311 349
312static int audit_set_failure(int state, uid_t loginuid, u32 sid) 350static int audit_set_failure(int state, uid_t loginuid, u32 sid)
313{ 351{
314 int old = audit_failure; 352 int res, rc = 0, old = audit_failure;
315 353
316 if (state != AUDIT_FAIL_SILENT 354 if (state != AUDIT_FAIL_SILENT
317 && state != AUDIT_FAIL_PRINTK 355 && state != AUDIT_FAIL_PRINTK
318 && state != AUDIT_FAIL_PANIC) 356 && state != AUDIT_FAIL_PANIC)
319 return -EINVAL; 357 return -EINVAL;
320 358
359 /* check if we are locked */
360 if (audit_enabled == 2)
361 res = 0;
362 else
363 res = 1;
364
321 if (sid) { 365 if (sid) {
322 char *ctx = NULL; 366 char *ctx = NULL;
323 u32 len; 367 u32 len;
324 int rc; 368 if ((rc = selinux_sid_to_string(sid, &ctx, &len)) == 0) {
325 if ((rc = selinux_sid_to_string(sid, &ctx, &len)))
326 return rc;
327 else
328 audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE, 369 audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE,
329 "audit_failure=%d old=%d by auid=%u subj=%s", 370 "audit_failure=%d old=%d by auid=%u"
330 state, old, loginuid, ctx); 371 " subj=%s res=%d",
331 kfree(ctx); 372 state, old, loginuid, ctx, res);
332 } else 373 kfree(ctx);
333 audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE, 374 } else
334 "audit_failure=%d old=%d by auid=%u", 375 res = 0; /* Something weird, deny request */
335 state, old, loginuid); 376 }
336 audit_failure = state; 377 audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE,
337 return 0; 378 "audit_failure=%d old=%d by auid=%u res=%d",
379 state, old, loginuid, res);
380
381 /* If we are allowed, make the change */
382 if (res == 1)
383 audit_failure = state;
384 /* Not allowed, update reason */
385 else if (rc == 0)
386 rc = -EPERM;
387 return rc;
338} 388}
339 389
340static int kauditd_thread(void *dummy) 390static int kauditd_thread(void *dummy)
@@ -599,6 +649,30 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
599 case AUDIT_DEL: 649 case AUDIT_DEL:
600 if (nlmsg_len(nlh) < sizeof(struct audit_rule)) 650 if (nlmsg_len(nlh) < sizeof(struct audit_rule))
601 return -EINVAL; 651 return -EINVAL;
652 if (audit_enabled == 2) {
653 ab = audit_log_start(NULL, GFP_KERNEL,
654 AUDIT_CONFIG_CHANGE);
655 if (ab) {
656 audit_log_format(ab,
657 "pid=%d uid=%u auid=%u",
658 pid, uid, loginuid);
659 if (sid) {
660 if (selinux_sid_to_string(
661 sid, &ctx, &len)) {
662 audit_log_format(ab,
663 " ssid=%u", sid);
664 /* Maybe call audit_panic? */
665 } else
666 audit_log_format(ab,
667 " subj=%s", ctx);
668 kfree(ctx);
669 }
670 audit_log_format(ab, " audit_enabled=%d res=0",
671 audit_enabled);
672 audit_log_end(ab);
673 }
674 return -EPERM;
675 }
602 /* fallthrough */ 676 /* fallthrough */
603 case AUDIT_LIST: 677 case AUDIT_LIST:
604 err = audit_receive_filter(nlh->nlmsg_type, NETLINK_CB(skb).pid, 678 err = audit_receive_filter(nlh->nlmsg_type, NETLINK_CB(skb).pid,
@@ -609,6 +683,30 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
609 case AUDIT_DEL_RULE: 683 case AUDIT_DEL_RULE:
610 if (nlmsg_len(nlh) < sizeof(struct audit_rule_data)) 684 if (nlmsg_len(nlh) < sizeof(struct audit_rule_data))
611 return -EINVAL; 685 return -EINVAL;
686 if (audit_enabled == 2) {
687 ab = audit_log_start(NULL, GFP_KERNEL,
688 AUDIT_CONFIG_CHANGE);
689 if (ab) {
690 audit_log_format(ab,
691 "pid=%d uid=%u auid=%u",
692 pid, uid, loginuid);
693 if (sid) {
694 if (selinux_sid_to_string(
695 sid, &ctx, &len)) {
696 audit_log_format(ab,
697 " ssid=%u", sid);
698 /* Maybe call audit_panic? */
699 } else
700 audit_log_format(ab,
701 " subj=%s", ctx);
702 kfree(ctx);
703 }
704 audit_log_format(ab, " audit_enabled=%d res=0",
705 audit_enabled);
706 audit_log_end(ab);
707 }
708 return -EPERM;
709 }
612 /* fallthrough */ 710 /* fallthrough */
613 case AUDIT_LIST_RULES: 711 case AUDIT_LIST_RULES:
614 err = audit_receive_filter(nlh->nlmsg_type, NETLINK_CB(skb).pid, 712 err = audit_receive_filter(nlh->nlmsg_type, NETLINK_CB(skb).pid,
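
The audit.c hunks above introduce an "immutable" audit mode: audit_enabled == 2 locks the configuration, every set-request is logged with a res= outcome, and requests against a locked configuration fail with -EPERM. The following is a condensed, hypothetical sketch of that pattern, not the kernel code itself; audit_set_example() and its parameter pointer are illustrative only:

static int audit_set_example(int *param, int new_value)
{
        /* res records whether the request is permitted; 2 == config locked */
        int res = (audit_enabled == 2) ? 0 : 1;

        /* the real setters emit an AUDIT_CONFIG_CHANGE record here, carrying
         * the old and new value, auid, optional subj= and the res= outcome */

        if (res == 1) {
                *param = new_value;     /* allowed: apply the change */
                return 0;
        }
        return -EPERM;                  /* locked: refuse the request */
}
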
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index 87865f8b4ce3..3749193aed8c 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -937,9 +937,10 @@ static void audit_update_watch(struct audit_parent *parent,
937 } 937 }
938 938
939 ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE); 939 ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE);
940 audit_log_format(ab, "audit updated rules specifying path="); 940 audit_log_format(ab, "op=updated rules specifying path=");
941 audit_log_untrustedstring(ab, owatch->path); 941 audit_log_untrustedstring(ab, owatch->path);
942 audit_log_format(ab, " with dev=%u ino=%lu\n", dev, ino); 942 audit_log_format(ab, " with dev=%u ino=%lu\n", dev, ino);
943 audit_log_format(ab, " list=%d res=1", r->listnr);
943 audit_log_end(ab); 944 audit_log_end(ab);
944 945
945 audit_remove_watch(owatch); 946 audit_remove_watch(owatch);
@@ -969,14 +970,14 @@ static void audit_remove_parent_watches(struct audit_parent *parent)
969 e = container_of(r, struct audit_entry, rule); 970 e = container_of(r, struct audit_entry, rule);
970 971
971 ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE); 972 ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE);
972 audit_log_format(ab, "audit implicitly removed rule path="); 973 audit_log_format(ab, "op=remove rule path=");
973 audit_log_untrustedstring(ab, w->path); 974 audit_log_untrustedstring(ab, w->path);
974 if (r->filterkey) { 975 if (r->filterkey) {
975 audit_log_format(ab, " key="); 976 audit_log_format(ab, " key=");
976 audit_log_untrustedstring(ab, r->filterkey); 977 audit_log_untrustedstring(ab, r->filterkey);
977 } else 978 } else
978 audit_log_format(ab, " key=(null)"); 979 audit_log_format(ab, " key=(null)");
979 audit_log_format(ab, " list=%d", r->listnr); 980 audit_log_format(ab, " list=%d res=1", r->listnr);
980 audit_log_end(ab); 981 audit_log_end(ab);
981 982
982 list_del(&r->rlist); 983 list_del(&r->rlist);
@@ -1410,7 +1411,7 @@ static void audit_log_rule_change(uid_t loginuid, u32 sid, char *action,
1410 audit_log_format(ab, " subj=%s", ctx); 1411 audit_log_format(ab, " subj=%s", ctx);
1411 kfree(ctx); 1412 kfree(ctx);
1412 } 1413 }
1413 audit_log_format(ab, " %s rule key=", action); 1414 audit_log_format(ab, " op=%s rule key=", action);
1414 if (rule->filterkey) 1415 if (rule->filterkey)
1415 audit_log_untrustedstring(ab, rule->filterkey); 1416 audit_log_untrustedstring(ab, rule->filterkey);
1416 else 1417 else
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 298897559ca4..359955800dd2 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -170,6 +170,11 @@ struct audit_aux_data_sockaddr {
170 char a[0]; 170 char a[0];
171}; 171};
172 172
173struct audit_aux_data_fd_pair {
174 struct audit_aux_data d;
175 int fd[2];
176};
177
173struct audit_aux_data_path { 178struct audit_aux_data_path {
174 struct audit_aux_data d; 179 struct audit_aux_data d;
175 struct dentry *dentry; 180 struct dentry *dentry;
@@ -961,6 +966,11 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
961 audit_log_d_path(ab, "path=", axi->dentry, axi->mnt); 966 audit_log_d_path(ab, "path=", axi->dentry, axi->mnt);
962 break; } 967 break; }
963 968
969 case AUDIT_FD_PAIR: {
970 struct audit_aux_data_fd_pair *axs = (void *)aux;
971 audit_log_format(ab, "fd0=%d fd1=%d", axs->fd[0], axs->fd[1]);
972 break; }
973
964 } 974 }
965 audit_log_end(ab); 975 audit_log_end(ab);
966 } 976 }
@@ -1815,6 +1825,36 @@ int audit_socketcall(int nargs, unsigned long *args)
1815} 1825}
1816 1826
1817/** 1827/**
1828 * __audit_fd_pair - record audit data for pipe and socketpair
1829 * @fd1: the first file descriptor
1830 * @fd2: the second file descriptor
1831 *
1832 * Returns 0 for success or NULL context or < 0 on error.
1833 */
1834int __audit_fd_pair(int fd1, int fd2)
1835{
1836 struct audit_context *context = current->audit_context;
1837 struct audit_aux_data_fd_pair *ax;
1838
1839 if (likely(!context)) {
1840 return 0;
1841 }
1842
1843 ax = kmalloc(sizeof(*ax), GFP_KERNEL);
1844 if (!ax) {
1845 return -ENOMEM;
1846 }
1847
1848 ax->fd[0] = fd1;
1849 ax->fd[1] = fd2;
1850
1851 ax->d.type = AUDIT_FD_PAIR;
1852 ax->d.next = context->aux;
1853 context->aux = (void *)ax;
1854 return 0;
1855}
1856
1857/**
1818 * audit_sockaddr - record audit data for sys_bind, sys_connect, sys_sendto 1858 * audit_sockaddr - record audit data for sys_bind, sys_connect, sys_sendto
1819 * @len: data length in user space 1859 * @len: data length in user space
1820 * @a: data address in kernel space 1860 * @a: data address in kernel space
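
The new AUDIT_FD_PAIR aux record above is filled in by __audit_fd_pair(). A hedged sketch of how a descriptor-pair syscall path might use it; the wrapper name and surrounding flow are illustrative and not taken from this patch:

/* Illustrative only: after a pipe()/socketpair()-style call has allocated
 * both descriptors, record them in the current task's audit context. */
static long example_report_fd_pair(int fd[2])
{
        int err = __audit_fd_pair(fd[0], fd[1]);

        if (err)        /* 0 on success or when there is no audit context */
                return err;
        return 0;
}
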
diff --git a/kernel/fork.c b/kernel/fork.c
index 0b6293d94d96..d154cc786489 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -858,7 +858,7 @@ static inline int copy_signal(unsigned long clone_flags, struct task_struct * ts
858 init_sigpending(&sig->shared_pending); 858 init_sigpending(&sig->shared_pending);
859 INIT_LIST_HEAD(&sig->posix_timers); 859 INIT_LIST_HEAD(&sig->posix_timers);
860 860
861 hrtimer_init(&sig->real_timer, CLOCK_MONOTONIC, HRTIMER_REL); 861 hrtimer_init(&sig->real_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
862 sig->it_real_incr.tv64 = 0; 862 sig->it_real_incr.tv64 = 0;
863 sig->real_timer.function = it_real_fn; 863 sig->real_timer.function = it_real_fn;
864 sig->tsk = tsk; 864 sig->tsk = tsk;
diff --git a/kernel/futex.c b/kernel/futex.c
index 5a737de857d3..e749e7df14b1 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -1134,7 +1134,7 @@ static int futex_lock_pi(u32 __user *uaddr, int detect, unsigned long sec,
1134 1134
1135 if (sec != MAX_SCHEDULE_TIMEOUT) { 1135 if (sec != MAX_SCHEDULE_TIMEOUT) {
1136 to = &timeout; 1136 to = &timeout;
1137 hrtimer_init(&to->timer, CLOCK_REALTIME, HRTIMER_ABS); 1137 hrtimer_init(&to->timer, CLOCK_REALTIME, HRTIMER_MODE_ABS);
1138 hrtimer_init_sleeper(to, current); 1138 hrtimer_init_sleeper(to, current);
1139 to->timer.expires = ktime_set(sec, nsec); 1139 to->timer.expires = ktime_set(sec, nsec);
1140 } 1140 }
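
The fork.c and futex.c hunks above, together with the hrtimer.c changes that follow, rename the hrtimer modes (HRTIMER_REL/HRTIMER_ABS become HRTIMER_MODE_REL/HRTIMER_MODE_ABS) and make timer callbacks return enum hrtimer_restart. A hedged usage sketch of the API after this series; the example_* names are illustrative:

static enum hrtimer_restart example_timeout(struct hrtimer *t)
{
        /* ... handle expiry ... */
        return HRTIMER_NORESTART;       /* or HRTIMER_RESTART after forwarding */
}

static void example_arm_timer(struct hrtimer *t)
{
        hrtimer_init(t, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
        t->function = example_timeout;
        /* expire 100 ms from now, relative to CLOCK_MONOTONIC */
        hrtimer_start(t, ktime_set(0, 100 * 1000 * 1000), HRTIMER_MODE_REL);
}
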
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index f44e499e8fca..476cb0c0b4a4 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -1,8 +1,9 @@
1/* 1/*
2 * linux/kernel/hrtimer.c 2 * linux/kernel/hrtimer.c
3 * 3 *
4 * Copyright(C) 2005, Thomas Gleixner <tglx@linutronix.de> 4 * Copyright(C) 2005-2006, Thomas Gleixner <tglx@linutronix.de>
5 * Copyright(C) 2005, Red Hat, Inc., Ingo Molnar 5 * Copyright(C) 2005-2007, Red Hat, Inc., Ingo Molnar
6 * Copyright(C) 2006-2007 Timesys Corp., Thomas Gleixner
6 * 7 *
7 * High-resolution kernel timers 8 * High-resolution kernel timers
8 * 9 *
@@ -31,12 +32,17 @@
31 */ 32 */
32 33
33#include <linux/cpu.h> 34#include <linux/cpu.h>
35#include <linux/irq.h>
34#include <linux/module.h> 36#include <linux/module.h>
35#include <linux/percpu.h> 37#include <linux/percpu.h>
36#include <linux/hrtimer.h> 38#include <linux/hrtimer.h>
37#include <linux/notifier.h> 39#include <linux/notifier.h>
38#include <linux/syscalls.h> 40#include <linux/syscalls.h>
41#include <linux/kallsyms.h>
39#include <linux/interrupt.h> 42#include <linux/interrupt.h>
43#include <linux/tick.h>
44#include <linux/seq_file.h>
45#include <linux/err.h>
40 46
41#include <asm/uaccess.h> 47#include <asm/uaccess.h>
42 48
@@ -45,7 +51,7 @@
45 * 51 *
46 * returns the time in ktime_t format 52 * returns the time in ktime_t format
47 */ 53 */
48static ktime_t ktime_get(void) 54ktime_t ktime_get(void)
49{ 55{
50 struct timespec now; 56 struct timespec now;
51 57
@@ -59,7 +65,7 @@ static ktime_t ktime_get(void)
59 * 65 *
60 * returns the time in ktime_t format 66 * returns the time in ktime_t format
61 */ 67 */
62static ktime_t ktime_get_real(void) 68ktime_t ktime_get_real(void)
63{ 69{
64 struct timespec now; 70 struct timespec now;
65 71
@@ -79,21 +85,22 @@ EXPORT_SYMBOL_GPL(ktime_get_real);
79 * This ensures that we capture erroneous accesses to these clock ids 85 * This ensures that we capture erroneous accesses to these clock ids
80 * rather than moving them into the range of valid clock id's. 86 * rather than moving them into the range of valid clock id's.
81 */ 87 */
82 88DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) =
83#define MAX_HRTIMER_BASES 2
84
85static DEFINE_PER_CPU(struct hrtimer_base, hrtimer_bases[MAX_HRTIMER_BASES]) =
86{ 89{
90
91 .clock_base =
87 { 92 {
88 .index = CLOCK_REALTIME, 93 {
89 .get_time = &ktime_get_real, 94 .index = CLOCK_REALTIME,
90 .resolution = KTIME_REALTIME_RES, 95 .get_time = &ktime_get_real,
91 }, 96 .resolution = KTIME_LOW_RES,
92 { 97 },
93 .index = CLOCK_MONOTONIC, 98 {
94 .get_time = &ktime_get, 99 .index = CLOCK_MONOTONIC,
95 .resolution = KTIME_MONOTONIC_RES, 100 .get_time = &ktime_get,
96 }, 101 .resolution = KTIME_LOW_RES,
102 },
103 }
97}; 104};
98 105
99/** 106/**
@@ -125,20 +132,35 @@ EXPORT_SYMBOL_GPL(ktime_get_ts);
125 * Get the coarse grained time at the softirq based on xtime and 132 * Get the coarse grained time at the softirq based on xtime and
126 * wall_to_monotonic. 133 * wall_to_monotonic.
127 */ 134 */
128static void hrtimer_get_softirq_time(struct hrtimer_base *base) 135static void hrtimer_get_softirq_time(struct hrtimer_cpu_base *base)
129{ 136{
130 ktime_t xtim, tomono; 137 ktime_t xtim, tomono;
138 struct timespec xts;
131 unsigned long seq; 139 unsigned long seq;
132 140
133 do { 141 do {
134 seq = read_seqbegin(&xtime_lock); 142 seq = read_seqbegin(&xtime_lock);
135 xtim = timespec_to_ktime(xtime); 143#ifdef CONFIG_NO_HZ
136 tomono = timespec_to_ktime(wall_to_monotonic); 144 getnstimeofday(&xts);
137 145#else
146 xts = xtime;
147#endif
138 } while (read_seqretry(&xtime_lock, seq)); 148 } while (read_seqretry(&xtime_lock, seq));
139 149
140 base[CLOCK_REALTIME].softirq_time = xtim; 150 xtim = timespec_to_ktime(xts);
141 base[CLOCK_MONOTONIC].softirq_time = ktime_add(xtim, tomono); 151 tomono = timespec_to_ktime(wall_to_monotonic);
152 base->clock_base[CLOCK_REALTIME].softirq_time = xtim;
153 base->clock_base[CLOCK_MONOTONIC].softirq_time =
154 ktime_add(xtim, tomono);
155}
156
157/*
158 * Helper function to check, whether the timer is running the callback
159 * function
160 */
161static inline int hrtimer_callback_running(struct hrtimer *timer)
162{
163 return timer->state & HRTIMER_STATE_CALLBACK;
142} 164}
143 165
144/* 166/*
@@ -147,8 +169,6 @@ static void hrtimer_get_softirq_time(struct hrtimer_base *base)
147 */ 169 */
148#ifdef CONFIG_SMP 170#ifdef CONFIG_SMP
149 171
150#define set_curr_timer(b, t) do { (b)->curr_timer = (t); } while (0)
151
152/* 172/*
153 * We are using hashed locking: holding per_cpu(hrtimer_bases)[n].lock 173 * We are using hashed locking: holding per_cpu(hrtimer_bases)[n].lock
154 * means that all timers which are tied to this base via timer->base are 174 * means that all timers which are tied to this base via timer->base are
@@ -161,19 +181,20 @@ static void hrtimer_get_softirq_time(struct hrtimer_base *base)
161 * possible to set timer->base = NULL and drop the lock: the timer remains 181 * possible to set timer->base = NULL and drop the lock: the timer remains
162 * locked. 182 * locked.
163 */ 183 */
164static struct hrtimer_base *lock_hrtimer_base(const struct hrtimer *timer, 184static
165 unsigned long *flags) 185struct hrtimer_clock_base *lock_hrtimer_base(const struct hrtimer *timer,
186 unsigned long *flags)
166{ 187{
167 struct hrtimer_base *base; 188 struct hrtimer_clock_base *base;
168 189
169 for (;;) { 190 for (;;) {
170 base = timer->base; 191 base = timer->base;
171 if (likely(base != NULL)) { 192 if (likely(base != NULL)) {
172 spin_lock_irqsave(&base->lock, *flags); 193 spin_lock_irqsave(&base->cpu_base->lock, *flags);
173 if (likely(base == timer->base)) 194 if (likely(base == timer->base))
174 return base; 195 return base;
175 /* The timer has migrated to another CPU: */ 196 /* The timer has migrated to another CPU: */
176 spin_unlock_irqrestore(&base->lock, *flags); 197 spin_unlock_irqrestore(&base->cpu_base->lock, *flags);
177 } 198 }
178 cpu_relax(); 199 cpu_relax();
179 } 200 }
@@ -182,12 +203,14 @@ static struct hrtimer_base *lock_hrtimer_base(const struct hrtimer *timer,
182/* 203/*
183 * Switch the timer base to the current CPU when possible. 204 * Switch the timer base to the current CPU when possible.
184 */ 205 */
185static inline struct hrtimer_base * 206static inline struct hrtimer_clock_base *
186switch_hrtimer_base(struct hrtimer *timer, struct hrtimer_base *base) 207switch_hrtimer_base(struct hrtimer *timer, struct hrtimer_clock_base *base)
187{ 208{
188 struct hrtimer_base *new_base; 209 struct hrtimer_clock_base *new_base;
210 struct hrtimer_cpu_base *new_cpu_base;
189 211
190 new_base = &__get_cpu_var(hrtimer_bases)[base->index]; 212 new_cpu_base = &__get_cpu_var(hrtimer_bases);
213 new_base = &new_cpu_base->clock_base[base->index];
191 214
192 if (base != new_base) { 215 if (base != new_base) {
193 /* 216 /*
@@ -199,13 +222,13 @@ switch_hrtimer_base(struct hrtimer *timer, struct hrtimer_base *base)
199 * completed. There is no conflict as we hold the lock until 222 * completed. There is no conflict as we hold the lock until
200 * the timer is enqueued. 223 * the timer is enqueued.
201 */ 224 */
202 if (unlikely(base->curr_timer == timer)) 225 if (unlikely(hrtimer_callback_running(timer)))
203 return base; 226 return base;
204 227
205 /* See the comment in lock_timer_base() */ 228 /* See the comment in lock_timer_base() */
206 timer->base = NULL; 229 timer->base = NULL;
207 spin_unlock(&base->lock); 230 spin_unlock(&base->cpu_base->lock);
208 spin_lock(&new_base->lock); 231 spin_lock(&new_base->cpu_base->lock);
209 timer->base = new_base; 232 timer->base = new_base;
210 } 233 }
211 return new_base; 234 return new_base;
@@ -213,19 +236,17 @@ switch_hrtimer_base(struct hrtimer *timer, struct hrtimer_base *base)
213 236
214#else /* CONFIG_SMP */ 237#else /* CONFIG_SMP */
215 238
216#define set_curr_timer(b, t) do { } while (0) 239static inline struct hrtimer_clock_base *
217
218static inline struct hrtimer_base *
219lock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags) 240lock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags)
220{ 241{
221 struct hrtimer_base *base = timer->base; 242 struct hrtimer_clock_base *base = timer->base;
222 243
223 spin_lock_irqsave(&base->lock, *flags); 244 spin_lock_irqsave(&base->cpu_base->lock, *flags);
224 245
225 return base; 246 return base;
226} 247}
227 248
228#define switch_hrtimer_base(t, b) (b) 249# define switch_hrtimer_base(t, b) (b)
229 250
230#endif /* !CONFIG_SMP */ 251#endif /* !CONFIG_SMP */
231 252
@@ -256,15 +277,12 @@ ktime_t ktime_add_ns(const ktime_t kt, u64 nsec)
256 277
257 return ktime_add(kt, tmp); 278 return ktime_add(kt, tmp);
258} 279}
259
260#else /* CONFIG_KTIME_SCALAR */
261
262# endif /* !CONFIG_KTIME_SCALAR */ 280# endif /* !CONFIG_KTIME_SCALAR */
263 281
264/* 282/*
265 * Divide a ktime value by a nanosecond value 283 * Divide a ktime value by a nanosecond value
266 */ 284 */
267static unsigned long ktime_divns(const ktime_t kt, s64 div) 285unsigned long ktime_divns(const ktime_t kt, s64 div)
268{ 286{
269 u64 dclc, inc, dns; 287 u64 dclc, inc, dns;
270 int sft = 0; 288 int sft = 0;
@@ -281,18 +299,311 @@ static unsigned long ktime_divns(const ktime_t kt, s64 div)
281 299
282 return (unsigned long) dclc; 300 return (unsigned long) dclc;
283} 301}
284
285#else /* BITS_PER_LONG < 64 */
286# define ktime_divns(kt, div) (unsigned long)((kt).tv64 / (div))
287#endif /* BITS_PER_LONG >= 64 */ 302#endif /* BITS_PER_LONG >= 64 */
288 303
304/* High resolution timer related functions */
305#ifdef CONFIG_HIGH_RES_TIMERS
306
307/*
308 * High resolution timer enabled ?
309 */
310static int hrtimer_hres_enabled __read_mostly = 1;
311
312/*
313 * Enable / Disable high resolution mode
314 */
315static int __init setup_hrtimer_hres(char *str)
316{
317 if (!strcmp(str, "off"))
318 hrtimer_hres_enabled = 0;
319 else if (!strcmp(str, "on"))
320 hrtimer_hres_enabled = 1;
321 else
322 return 0;
323 return 1;
324}
325
326__setup("highres=", setup_hrtimer_hres);
327
328/*
329 * hrtimer_high_res_enabled - query, if the highres mode is enabled
330 */
331static inline int hrtimer_is_hres_enabled(void)
332{
333 return hrtimer_hres_enabled;
334}
335
336/*
337 * Is the high resolution mode active ?
338 */
339static inline int hrtimer_hres_active(void)
340{
341 return __get_cpu_var(hrtimer_bases).hres_active;
342}
343
344/*
345 * Reprogram the event source with checking both queues for the
346 * next event
347 * Called with interrupts disabled and base->lock held
348 */
349static void hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base)
350{
351 int i;
352 struct hrtimer_clock_base *base = cpu_base->clock_base;
353 ktime_t expires;
354
355 cpu_base->expires_next.tv64 = KTIME_MAX;
356
357 for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++, base++) {
358 struct hrtimer *timer;
359
360 if (!base->first)
361 continue;
362 timer = rb_entry(base->first, struct hrtimer, node);
363 expires = ktime_sub(timer->expires, base->offset);
364 if (expires.tv64 < cpu_base->expires_next.tv64)
365 cpu_base->expires_next = expires;
366 }
367
368 if (cpu_base->expires_next.tv64 != KTIME_MAX)
369 tick_program_event(cpu_base->expires_next, 1);
370}
371
372/*
373 * Shared reprogramming for clock_realtime and clock_monotonic
374 *
375 * When a timer is enqueued and expires earlier than the already enqueued
376 * timers, we have to check, whether it expires earlier than the timer for
377 * which the clock event device was armed.
378 *
379 * Called with interrupts disabled and base->cpu_base.lock held
380 */
381static int hrtimer_reprogram(struct hrtimer *timer,
382 struct hrtimer_clock_base *base)
383{
384 ktime_t *expires_next = &__get_cpu_var(hrtimer_bases).expires_next;
385 ktime_t expires = ktime_sub(timer->expires, base->offset);
386 int res;
387
388 /*
389 * When the callback is running, we do not reprogram the clock event
390 * device. The timer callback is either running on a different CPU or
 391 * the callback is executed in the hrtimer_interrupt context. The
392 * reprogramming is handled either by the softirq, which called the
393 * callback or at the end of the hrtimer_interrupt.
394 */
395 if (hrtimer_callback_running(timer))
396 return 0;
397
398 if (expires.tv64 >= expires_next->tv64)
399 return 0;
400
401 /*
402 * Clockevents returns -ETIME, when the event was in the past.
403 */
404 res = tick_program_event(expires, 0);
405 if (!IS_ERR_VALUE(res))
406 *expires_next = expires;
407 return res;
408}
409
410
411/*
412 * Retrigger next event is called after clock was set
413 *
414 * Called with interrupts disabled via on_each_cpu()
415 */
416static void retrigger_next_event(void *arg)
417{
418 struct hrtimer_cpu_base *base;
419 struct timespec realtime_offset;
420 unsigned long seq;
421
422 if (!hrtimer_hres_active())
423 return;
424
425 do {
426 seq = read_seqbegin(&xtime_lock);
427 set_normalized_timespec(&realtime_offset,
428 -wall_to_monotonic.tv_sec,
429 -wall_to_monotonic.tv_nsec);
430 } while (read_seqretry(&xtime_lock, seq));
431
432 base = &__get_cpu_var(hrtimer_bases);
433
434 /* Adjust CLOCK_REALTIME offset */
435 spin_lock(&base->lock);
436 base->clock_base[CLOCK_REALTIME].offset =
437 timespec_to_ktime(realtime_offset);
438
439 hrtimer_force_reprogram(base);
440 spin_unlock(&base->lock);
441}
442
443/*
444 * Clock realtime was set
445 *
446 * Change the offset of the realtime clock vs. the monotonic
447 * clock.
448 *
449 * We might have to reprogram the high resolution timer interrupt. On
450 * SMP we call the architecture specific code to retrigger _all_ high
451 * resolution timer interrupts. On UP we just disable interrupts and
452 * call the high resolution interrupt code.
453 */
454void clock_was_set(void)
455{
456 /* Retrigger the CPU local events everywhere */
457 on_each_cpu(retrigger_next_event, NULL, 0, 1);
458}
459
460/*
461 * Check, whether the timer is on the callback pending list
462 */
463static inline int hrtimer_cb_pending(const struct hrtimer *timer)
464{
465 return timer->state & HRTIMER_STATE_PENDING;
466}
467
468/*
469 * Remove a timer from the callback pending list
470 */
471static inline void hrtimer_remove_cb_pending(struct hrtimer *timer)
472{
473 list_del_init(&timer->cb_entry);
474}
475
476/*
477 * Initialize the high resolution related parts of cpu_base
478 */
479static inline void hrtimer_init_hres(struct hrtimer_cpu_base *base)
480{
481 base->expires_next.tv64 = KTIME_MAX;
482 base->hres_active = 0;
483 INIT_LIST_HEAD(&base->cb_pending);
484}
485
486/*
487 * Initialize the high resolution related parts of a hrtimer
488 */
489static inline void hrtimer_init_timer_hres(struct hrtimer *timer)
490{
491 INIT_LIST_HEAD(&timer->cb_entry);
492}
493
494/*
495 * When High resolution timers are active, try to reprogram. Note, that in case
496 * the state has HRTIMER_STATE_CALLBACK set, no reprogramming and no expiry
497 * check happens. The timer gets enqueued into the rbtree. The reprogramming
498 * and expiry check is done in the hrtimer_interrupt or in the softirq.
499 */
500static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer,
501 struct hrtimer_clock_base *base)
502{
503 if (base->cpu_base->hres_active && hrtimer_reprogram(timer, base)) {
504
505 /* Timer is expired, act upon the callback mode */
506 switch(timer->cb_mode) {
507 case HRTIMER_CB_IRQSAFE_NO_RESTART:
508 /*
509 * We can call the callback from here. No restart
510 * happens, so no danger of recursion
511 */
512 BUG_ON(timer->function(timer) != HRTIMER_NORESTART);
513 return 1;
514 case HRTIMER_CB_IRQSAFE_NO_SOFTIRQ:
515 /*
516 * This is solely for the sched tick emulation with
517 * dynamic tick support to ensure that we do not
518 * restart the tick right on the edge and end up with
519 * the tick timer in the softirq ! The calling site
520 * takes care of this.
521 */
522 return 1;
523 case HRTIMER_CB_IRQSAFE:
524 case HRTIMER_CB_SOFTIRQ:
525 /*
526 * Move everything else into the softirq pending list !
527 */
528 list_add_tail(&timer->cb_entry,
529 &base->cpu_base->cb_pending);
530 timer->state = HRTIMER_STATE_PENDING;
531 raise_softirq(HRTIMER_SOFTIRQ);
532 return 1;
533 default:
534 BUG();
535 }
536 }
537 return 0;
538}
539
540/*
541 * Switch to high resolution mode
542 */
543static void hrtimer_switch_to_hres(void)
544{
545 struct hrtimer_cpu_base *base = &__get_cpu_var(hrtimer_bases);
546 unsigned long flags;
547
548 if (base->hres_active)
549 return;
550
551 local_irq_save(flags);
552
553 if (tick_init_highres()) {
554 local_irq_restore(flags);
555 return;
556 }
557 base->hres_active = 1;
558 base->clock_base[CLOCK_REALTIME].resolution = KTIME_HIGH_RES;
559 base->clock_base[CLOCK_MONOTONIC].resolution = KTIME_HIGH_RES;
560
561 tick_setup_sched_timer();
562
563 /* "Retrigger" the interrupt to get things going */
564 retrigger_next_event(NULL);
565 local_irq_restore(flags);
566 printk(KERN_INFO "Switched to high resolution mode on CPU %d\n",
567 smp_processor_id());
568}
569
570#else
571
572static inline int hrtimer_hres_active(void) { return 0; }
573static inline int hrtimer_is_hres_enabled(void) { return 0; }
574static inline void hrtimer_switch_to_hres(void) { }
575static inline void hrtimer_force_reprogram(struct hrtimer_cpu_base *base) { }
576static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer,
577 struct hrtimer_clock_base *base)
578{
579 return 0;
580}
581static inline int hrtimer_cb_pending(struct hrtimer *timer) { return 0; }
582static inline void hrtimer_remove_cb_pending(struct hrtimer *timer) { }
583static inline void hrtimer_init_hres(struct hrtimer_cpu_base *base) { }
584static inline void hrtimer_init_timer_hres(struct hrtimer *timer) { }
585
586#endif /* CONFIG_HIGH_RES_TIMERS */
587
588#ifdef CONFIG_TIMER_STATS
589void __timer_stats_hrtimer_set_start_info(struct hrtimer *timer, void *addr)
590{
591 if (timer->start_site)
592 return;
593
594 timer->start_site = addr;
595 memcpy(timer->start_comm, current->comm, TASK_COMM_LEN);
596 timer->start_pid = current->pid;
597}
598#endif
599
289/* 600/*
290 * Counterpart to lock_timer_base above: 601 * Counterpart to lock_timer_base above:
291 */ 602 */
292static inline 603static inline
293void unlock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags) 604void unlock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags)
294{ 605{
295 spin_unlock_irqrestore(&timer->base->lock, *flags); 606 spin_unlock_irqrestore(&timer->base->cpu_base->lock, *flags);
296} 607}
297 608
298/** 609/**
@@ -342,7 +653,8 @@ hrtimer_forward(struct hrtimer *timer, ktime_t now, ktime_t interval)
342 * The timer is inserted in expiry order. Insertion into the 653 * The timer is inserted in expiry order. Insertion into the
343 * red black tree is O(log(n)). Must hold the base lock. 654 * red black tree is O(log(n)). Must hold the base lock.
344 */ 655 */
345static void enqueue_hrtimer(struct hrtimer *timer, struct hrtimer_base *base) 656static void enqueue_hrtimer(struct hrtimer *timer,
657 struct hrtimer_clock_base *base, int reprogram)
346{ 658{
347 struct rb_node **link = &base->active.rb_node; 659 struct rb_node **link = &base->active.rb_node;
348 struct rb_node *parent = NULL; 660 struct rb_node *parent = NULL;
@@ -368,39 +680,85 @@ static void enqueue_hrtimer(struct hrtimer *timer, struct hrtimer_base *base)
368 * Insert the timer to the rbtree and check whether it 680 * Insert the timer to the rbtree and check whether it
369 * replaces the first pending timer 681 * replaces the first pending timer
370 */ 682 */
371 rb_link_node(&timer->node, parent, link);
372 rb_insert_color(&timer->node, &base->active);
373
374 if (!base->first || timer->expires.tv64 < 683 if (!base->first || timer->expires.tv64 <
375 rb_entry(base->first, struct hrtimer, node)->expires.tv64) 684 rb_entry(base->first, struct hrtimer, node)->expires.tv64) {
685 /*
686 * Reprogram the clock event device. When the timer is already
687 * expired hrtimer_enqueue_reprogram has either called the
688 * callback or added it to the pending list and raised the
689 * softirq.
690 *
691 * This is a NOP for !HIGHRES
692 */
693 if (reprogram && hrtimer_enqueue_reprogram(timer, base))
694 return;
695
376 base->first = &timer->node; 696 base->first = &timer->node;
697 }
698
699 rb_link_node(&timer->node, parent, link);
700 rb_insert_color(&timer->node, &base->active);
701 /*
702 * HRTIMER_STATE_ENQUEUED is or'ed to the current state to preserve the
703 * state of a possibly running callback.
704 */
705 timer->state |= HRTIMER_STATE_ENQUEUED;
377} 706}
378 707
379/* 708/*
380 * __remove_hrtimer - internal function to remove a timer 709 * __remove_hrtimer - internal function to remove a timer
381 * 710 *
382 * Caller must hold the base lock. 711 * Caller must hold the base lock.
712 *
713 * High resolution timer mode reprograms the clock event device when the
714 * timer is the one which expires next. The caller can disable this by setting
715 * reprogram to zero. This is useful, when the context does a reprogramming
716 * anyway (e.g. timer interrupt)
383 */ 717 */
384static void __remove_hrtimer(struct hrtimer *timer, struct hrtimer_base *base) 718static void __remove_hrtimer(struct hrtimer *timer,
719 struct hrtimer_clock_base *base,
720 unsigned long newstate, int reprogram)
385{ 721{
386 /* 722 /* High res. callback list. NOP for !HIGHRES */
387 * Remove the timer from the rbtree and replace the 723 if (hrtimer_cb_pending(timer))
388 * first entry pointer if necessary. 724 hrtimer_remove_cb_pending(timer);
389 */ 725 else {
390 if (base->first == &timer->node) 726 /*
391 base->first = rb_next(&timer->node); 727 * Remove the timer from the rbtree and replace the
392 rb_erase(&timer->node, &base->active); 728 * first entry pointer if necessary.
393 rb_set_parent(&timer->node, &timer->node); 729 */
730 if (base->first == &timer->node) {
731 base->first = rb_next(&timer->node);
732 /* Reprogram the clock event device. if enabled */
733 if (reprogram && hrtimer_hres_active())
734 hrtimer_force_reprogram(base->cpu_base);
735 }
736 rb_erase(&timer->node, &base->active);
737 }
738 timer->state = newstate;
394} 739}
395 740
396/* 741/*
397 * remove hrtimer, called with base lock held 742 * remove hrtimer, called with base lock held
398 */ 743 */
399static inline int 744static inline int
400remove_hrtimer(struct hrtimer *timer, struct hrtimer_base *base) 745remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base)
401{ 746{
402 if (hrtimer_active(timer)) { 747 if (hrtimer_is_queued(timer)) {
403 __remove_hrtimer(timer, base); 748 int reprogram;
749
750 /*
751 * Remove the timer and force reprogramming when high
752 * resolution mode is active and the timer is on the current
753 * CPU. If we remove a timer on another CPU, reprogramming is
754 * skipped. The interrupt event on this CPU is fired and
755 * reprogramming happens in the interrupt handler. This is a
756 * rare case and less expensive than a smp call.
757 */
758 timer_stats_hrtimer_clear_start_info(timer);
759 reprogram = base->cpu_base == &__get_cpu_var(hrtimer_bases);
760 __remove_hrtimer(timer, base, HRTIMER_STATE_INACTIVE,
761 reprogram);
404 return 1; 762 return 1;
405 } 763 }
406 return 0; 764 return 0;
@@ -419,7 +777,7 @@ remove_hrtimer(struct hrtimer *timer, struct hrtimer_base *base)
419int 777int
420hrtimer_start(struct hrtimer *timer, ktime_t tim, const enum hrtimer_mode mode) 778hrtimer_start(struct hrtimer *timer, ktime_t tim, const enum hrtimer_mode mode)
421{ 779{
422 struct hrtimer_base *base, *new_base; 780 struct hrtimer_clock_base *base, *new_base;
423 unsigned long flags; 781 unsigned long flags;
424 int ret; 782 int ret;
425 783
@@ -431,7 +789,7 @@ hrtimer_start(struct hrtimer *timer, ktime_t tim, const enum hrtimer_mode mode)
431 /* Switch the timer base, if necessary: */ 789 /* Switch the timer base, if necessary: */
432 new_base = switch_hrtimer_base(timer, base); 790 new_base = switch_hrtimer_base(timer, base);
433 791
434 if (mode == HRTIMER_REL) { 792 if (mode == HRTIMER_MODE_REL) {
435 tim = ktime_add(tim, new_base->get_time()); 793 tim = ktime_add(tim, new_base->get_time());
436 /* 794 /*
437 * CONFIG_TIME_LOW_RES is a temporary way for architectures 795 * CONFIG_TIME_LOW_RES is a temporary way for architectures
@@ -446,7 +804,9 @@ hrtimer_start(struct hrtimer *timer, ktime_t tim, const enum hrtimer_mode mode)
446 } 804 }
447 timer->expires = tim; 805 timer->expires = tim;
448 806
449 enqueue_hrtimer(timer, new_base); 807 timer_stats_hrtimer_set_start_info(timer);
808
809 enqueue_hrtimer(timer, new_base, base == new_base);
450 810
451 unlock_hrtimer_base(timer, &flags); 811 unlock_hrtimer_base(timer, &flags);
452 812
@@ -466,13 +826,13 @@ EXPORT_SYMBOL_GPL(hrtimer_start);
466 */ 826 */
467int hrtimer_try_to_cancel(struct hrtimer *timer) 827int hrtimer_try_to_cancel(struct hrtimer *timer)
468{ 828{
469 struct hrtimer_base *base; 829 struct hrtimer_clock_base *base;
470 unsigned long flags; 830 unsigned long flags;
471 int ret = -1; 831 int ret = -1;
472 832
473 base = lock_hrtimer_base(timer, &flags); 833 base = lock_hrtimer_base(timer, &flags);
474 834
475 if (base->curr_timer != timer) 835 if (!hrtimer_callback_running(timer))
476 ret = remove_hrtimer(timer, base); 836 ret = remove_hrtimer(timer, base);
477 837
478 unlock_hrtimer_base(timer, &flags); 838 unlock_hrtimer_base(timer, &flags);
@@ -508,19 +868,19 @@ EXPORT_SYMBOL_GPL(hrtimer_cancel);
508 */ 868 */
509ktime_t hrtimer_get_remaining(const struct hrtimer *timer) 869ktime_t hrtimer_get_remaining(const struct hrtimer *timer)
510{ 870{
511 struct hrtimer_base *base; 871 struct hrtimer_clock_base *base;
512 unsigned long flags; 872 unsigned long flags;
513 ktime_t rem; 873 ktime_t rem;
514 874
515 base = lock_hrtimer_base(timer, &flags); 875 base = lock_hrtimer_base(timer, &flags);
516 rem = ktime_sub(timer->expires, timer->base->get_time()); 876 rem = ktime_sub(timer->expires, base->get_time());
517 unlock_hrtimer_base(timer, &flags); 877 unlock_hrtimer_base(timer, &flags);
518 878
519 return rem; 879 return rem;
520} 880}
521EXPORT_SYMBOL_GPL(hrtimer_get_remaining); 881EXPORT_SYMBOL_GPL(hrtimer_get_remaining);
522 882
523#ifdef CONFIG_NO_IDLE_HZ 883#if defined(CONFIG_NO_IDLE_HZ) || defined(CONFIG_NO_HZ)
524/** 884/**
525 * hrtimer_get_next_event - get the time until next expiry event 885 * hrtimer_get_next_event - get the time until next expiry event
526 * 886 *
@@ -529,26 +889,31 @@ EXPORT_SYMBOL_GPL(hrtimer_get_remaining);
529 */ 889 */
530ktime_t hrtimer_get_next_event(void) 890ktime_t hrtimer_get_next_event(void)
531{ 891{
532 struct hrtimer_base *base = __get_cpu_var(hrtimer_bases); 892 struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases);
893 struct hrtimer_clock_base *base = cpu_base->clock_base;
533 ktime_t delta, mindelta = { .tv64 = KTIME_MAX }; 894 ktime_t delta, mindelta = { .tv64 = KTIME_MAX };
534 unsigned long flags; 895 unsigned long flags;
535 int i; 896 int i;
536 897
537 for (i = 0; i < MAX_HRTIMER_BASES; i++, base++) { 898 spin_lock_irqsave(&cpu_base->lock, flags);
538 struct hrtimer *timer;
539 899
540 spin_lock_irqsave(&base->lock, flags); 900 if (!hrtimer_hres_active()) {
541 if (!base->first) { 901 for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++, base++) {
542 spin_unlock_irqrestore(&base->lock, flags); 902 struct hrtimer *timer;
543 continue; 903
904 if (!base->first)
905 continue;
906
907 timer = rb_entry(base->first, struct hrtimer, node);
908 delta.tv64 = timer->expires.tv64;
909 delta = ktime_sub(delta, base->get_time());
910 if (delta.tv64 < mindelta.tv64)
911 mindelta.tv64 = delta.tv64;
544 } 912 }
545 timer = rb_entry(base->first, struct hrtimer, node);
546 delta.tv64 = timer->expires.tv64;
547 spin_unlock_irqrestore(&base->lock, flags);
548 delta = ktime_sub(delta, base->get_time());
549 if (delta.tv64 < mindelta.tv64)
550 mindelta.tv64 = delta.tv64;
551 } 913 }
914
915 spin_unlock_irqrestore(&cpu_base->lock, flags);
916
552 if (mindelta.tv64 < 0) 917 if (mindelta.tv64 < 0)
553 mindelta.tv64 = 0; 918 mindelta.tv64 = 0;
554 return mindelta; 919 return mindelta;
@@ -564,17 +929,23 @@ ktime_t hrtimer_get_next_event(void)
564void hrtimer_init(struct hrtimer *timer, clockid_t clock_id, 929void hrtimer_init(struct hrtimer *timer, clockid_t clock_id,
565 enum hrtimer_mode mode) 930 enum hrtimer_mode mode)
566{ 931{
567 struct hrtimer_base *bases; 932 struct hrtimer_cpu_base *cpu_base;
568 933
569 memset(timer, 0, sizeof(struct hrtimer)); 934 memset(timer, 0, sizeof(struct hrtimer));
570 935
571 bases = __raw_get_cpu_var(hrtimer_bases); 936 cpu_base = &__raw_get_cpu_var(hrtimer_bases);
572 937
573 if (clock_id == CLOCK_REALTIME && mode != HRTIMER_ABS) 938 if (clock_id == CLOCK_REALTIME && mode != HRTIMER_MODE_ABS)
574 clock_id = CLOCK_MONOTONIC; 939 clock_id = CLOCK_MONOTONIC;
575 940
576 timer->base = &bases[clock_id]; 941 timer->base = &cpu_base->clock_base[clock_id];
577 rb_set_parent(&timer->node, &timer->node); 942 hrtimer_init_timer_hres(timer);
943
944#ifdef CONFIG_TIMER_STATS
945 timer->start_site = NULL;
946 timer->start_pid = -1;
947 memset(timer->start_comm, 0, TASK_COMM_LEN);
948#endif
578} 949}
579EXPORT_SYMBOL_GPL(hrtimer_init); 950EXPORT_SYMBOL_GPL(hrtimer_init);
580 951
@@ -588,21 +959,159 @@ EXPORT_SYMBOL_GPL(hrtimer_init);
588 */ 959 */
589int hrtimer_get_res(const clockid_t which_clock, struct timespec *tp) 960int hrtimer_get_res(const clockid_t which_clock, struct timespec *tp)
590{ 961{
591 struct hrtimer_base *bases; 962 struct hrtimer_cpu_base *cpu_base;
592 963
593 bases = __raw_get_cpu_var(hrtimer_bases); 964 cpu_base = &__raw_get_cpu_var(hrtimer_bases);
594 *tp = ktime_to_timespec(bases[which_clock].resolution); 965 *tp = ktime_to_timespec(cpu_base->clock_base[which_clock].resolution);
595 966
596 return 0; 967 return 0;
597} 968}
598EXPORT_SYMBOL_GPL(hrtimer_get_res); 969EXPORT_SYMBOL_GPL(hrtimer_get_res);
599 970
971#ifdef CONFIG_HIGH_RES_TIMERS
972
973/*
974 * High resolution timer interrupt
975 * Called with interrupts disabled
976 */
977void hrtimer_interrupt(struct clock_event_device *dev)
978{
979 struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases);
980 struct hrtimer_clock_base *base;
981 ktime_t expires_next, now;
982 int i, raise = 0;
983
984 BUG_ON(!cpu_base->hres_active);
985 cpu_base->nr_events++;
986 dev->next_event.tv64 = KTIME_MAX;
987
988 retry:
989 now = ktime_get();
990
991 expires_next.tv64 = KTIME_MAX;
992
993 base = cpu_base->clock_base;
994
995 for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) {
996 ktime_t basenow;
997 struct rb_node *node;
998
999 spin_lock(&cpu_base->lock);
1000
1001 basenow = ktime_add(now, base->offset);
1002
1003 while ((node = base->first)) {
1004 struct hrtimer *timer;
1005
1006 timer = rb_entry(node, struct hrtimer, node);
1007
1008 if (basenow.tv64 < timer->expires.tv64) {
1009 ktime_t expires;
1010
1011 expires = ktime_sub(timer->expires,
1012 base->offset);
1013 if (expires.tv64 < expires_next.tv64)
1014 expires_next = expires;
1015 break;
1016 }
1017
1018 /* Move softirq callbacks to the pending list */
1019 if (timer->cb_mode == HRTIMER_CB_SOFTIRQ) {
1020 __remove_hrtimer(timer, base,
1021 HRTIMER_STATE_PENDING, 0);
1022 list_add_tail(&timer->cb_entry,
1023 &base->cpu_base->cb_pending);
1024 raise = 1;
1025 continue;
1026 }
1027
1028 __remove_hrtimer(timer, base,
1029 HRTIMER_STATE_CALLBACK, 0);
1030 timer_stats_account_hrtimer(timer);
1031
1032 /*
1033 * Note: We clear the CALLBACK bit after
1034 * enqueue_hrtimer to avoid reprogramming of
1035 * the event hardware. This happens at the end
1036 * of this function anyway.
1037 */
1038 if (timer->function(timer) != HRTIMER_NORESTART) {
1039 BUG_ON(timer->state != HRTIMER_STATE_CALLBACK);
1040 enqueue_hrtimer(timer, base, 0);
1041 }
1042 timer->state &= ~HRTIMER_STATE_CALLBACK;
1043 }
1044 spin_unlock(&cpu_base->lock);
1045 base++;
1046 }
1047
1048 cpu_base->expires_next = expires_next;
1049
1050 /* Reprogramming necessary ? */
1051 if (expires_next.tv64 != KTIME_MAX) {
1052 if (tick_program_event(expires_next, 0))
1053 goto retry;
1054 }
1055
1056 /* Raise softirq ? */
1057 if (raise)
1058 raise_softirq(HRTIMER_SOFTIRQ);
1059}
1060
1061static void run_hrtimer_softirq(struct softirq_action *h)
1062{
1063 struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases);
1064
1065 spin_lock_irq(&cpu_base->lock);
1066
1067 while (!list_empty(&cpu_base->cb_pending)) {
1068 enum hrtimer_restart (*fn)(struct hrtimer *);
1069 struct hrtimer *timer;
1070 int restart;
1071
1072 timer = list_entry(cpu_base->cb_pending.next,
1073 struct hrtimer, cb_entry);
1074
1075 timer_stats_account_hrtimer(timer);
1076
1077 fn = timer->function;
1078 __remove_hrtimer(timer, timer->base, HRTIMER_STATE_CALLBACK, 0);
1079 spin_unlock_irq(&cpu_base->lock);
1080
1081 restart = fn(timer);
1082
1083 spin_lock_irq(&cpu_base->lock);
1084
1085 timer->state &= ~HRTIMER_STATE_CALLBACK;
1086 if (restart == HRTIMER_RESTART) {
1087 BUG_ON(hrtimer_active(timer));
1088 /*
1089 * Enqueue the timer, allow reprogramming of the event
1090 * device
1091 */
1092 enqueue_hrtimer(timer, timer->base, 1);
1093 } else if (hrtimer_active(timer)) {
1094 /*
1095 * If the timer was rearmed on another CPU, reprogram
1096 * the event device.
1097 */
1098 if (timer->base->first == &timer->node)
1099 hrtimer_reprogram(timer, timer->base);
1100 }
1101 }
1102 spin_unlock_irq(&cpu_base->lock);
1103}
1104
1105#endif /* CONFIG_HIGH_RES_TIMERS */
1106
600/* 1107/*
601 * Expire the per base hrtimer-queue: 1108 * Expire the per base hrtimer-queue:
602 */ 1109 */
603static inline void run_hrtimer_queue(struct hrtimer_base *base) 1110static inline void run_hrtimer_queue(struct hrtimer_cpu_base *cpu_base,
1111 int index)
604{ 1112{
605 struct rb_node *node; 1113 struct rb_node *node;
1114 struct hrtimer_clock_base *base = &cpu_base->clock_base[index];
606 1115
607 if (!base->first) 1116 if (!base->first)
608 return; 1117 return;
@@ -610,53 +1119,72 @@ static inline void run_hrtimer_queue(struct hrtimer_base *base)
610 if (base->get_softirq_time) 1119 if (base->get_softirq_time)
611 base->softirq_time = base->get_softirq_time(); 1120 base->softirq_time = base->get_softirq_time();
612 1121
613 spin_lock_irq(&base->lock); 1122 spin_lock_irq(&cpu_base->lock);
614 1123
615 while ((node = base->first)) { 1124 while ((node = base->first)) {
616 struct hrtimer *timer; 1125 struct hrtimer *timer;
617 int (*fn)(struct hrtimer *); 1126 enum hrtimer_restart (*fn)(struct hrtimer *);
618 int restart; 1127 int restart;
619 1128
620 timer = rb_entry(node, struct hrtimer, node); 1129 timer = rb_entry(node, struct hrtimer, node);
621 if (base->softirq_time.tv64 <= timer->expires.tv64) 1130 if (base->softirq_time.tv64 <= timer->expires.tv64)
622 break; 1131 break;
623 1132
1133 timer_stats_account_hrtimer(timer);
1134
624 fn = timer->function; 1135 fn = timer->function;
625 set_curr_timer(base, timer); 1136 __remove_hrtimer(timer, base, HRTIMER_STATE_CALLBACK, 0);
626 __remove_hrtimer(timer, base); 1137 spin_unlock_irq(&cpu_base->lock);
627 spin_unlock_irq(&base->lock);
628 1138
629 restart = fn(timer); 1139 restart = fn(timer);
630 1140
631 spin_lock_irq(&base->lock); 1141 spin_lock_irq(&cpu_base->lock);
632 1142
1143 timer->state &= ~HRTIMER_STATE_CALLBACK;
633 if (restart != HRTIMER_NORESTART) { 1144 if (restart != HRTIMER_NORESTART) {
634 BUG_ON(hrtimer_active(timer)); 1145 BUG_ON(hrtimer_active(timer));
635 enqueue_hrtimer(timer, base); 1146 enqueue_hrtimer(timer, base, 0);
636 } 1147 }
637 } 1148 }
638 set_curr_timer(base, NULL); 1149 spin_unlock_irq(&cpu_base->lock);
639 spin_unlock_irq(&base->lock);
640} 1150}
641 1151
642/* 1152/*
643 * Called from timer softirq every jiffy, expire hrtimers: 1153 * Called from timer softirq every jiffy, expire hrtimers:
1154 *
1155 * For HRT its the fall back code to run the softirq in the timer
1156 * softirq context in case the hrtimer initialization failed or has
1157 * not been done yet.
644 */ 1158 */
645void hrtimer_run_queues(void) 1159void hrtimer_run_queues(void)
646{ 1160{
647 struct hrtimer_base *base = __get_cpu_var(hrtimer_bases); 1161 struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases);
648 int i; 1162 int i;
649 1163
650 hrtimer_get_softirq_time(base); 1164 if (hrtimer_hres_active())
1165 return;
1166
1167 /*
1168 * This _is_ ugly: We have to check in the softirq context,
1169 * whether we can switch to highres and / or nohz mode. The
1170 * clocksource switch happens in the timer interrupt with
1171 * xtime_lock held. Notification from there only sets the
1172 * check bit in the tick_oneshot code, otherwise we might
1173 * deadlock vs. xtime_lock.
1174 */
1175 if (tick_check_oneshot_change(!hrtimer_is_hres_enabled()))
1176 hrtimer_switch_to_hres();
651 1177
652 for (i = 0; i < MAX_HRTIMER_BASES; i++) 1178 hrtimer_get_softirq_time(cpu_base);
653 run_hrtimer_queue(&base[i]); 1179
1180 for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++)
1181 run_hrtimer_queue(cpu_base, i);
654} 1182}
655 1183
656/* 1184/*
657 * Sleep related functions: 1185 * Sleep related functions:
658 */ 1186 */
659static int hrtimer_wakeup(struct hrtimer *timer) 1187static enum hrtimer_restart hrtimer_wakeup(struct hrtimer *timer)
660{ 1188{
661 struct hrtimer_sleeper *t = 1189 struct hrtimer_sleeper *t =
662 container_of(timer, struct hrtimer_sleeper, timer); 1190 container_of(timer, struct hrtimer_sleeper, timer);
@@ -673,6 +1201,9 @@ void hrtimer_init_sleeper(struct hrtimer_sleeper *sl, struct task_struct *task)
673{ 1201{
674 sl->timer.function = hrtimer_wakeup; 1202 sl->timer.function = hrtimer_wakeup;
675 sl->task = task; 1203 sl->task = task;
1204#ifdef CONFIG_HIGH_RES_TIMERS
1205 sl->timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_RESTART;
1206#endif
676} 1207}
677 1208
678static int __sched do_nanosleep(struct hrtimer_sleeper *t, enum hrtimer_mode mode) 1209static int __sched do_nanosleep(struct hrtimer_sleeper *t, enum hrtimer_mode mode)
@@ -683,10 +1214,11 @@ static int __sched do_nanosleep(struct hrtimer_sleeper *t, enum hrtimer_mode mod
683 set_current_state(TASK_INTERRUPTIBLE); 1214 set_current_state(TASK_INTERRUPTIBLE);
684 hrtimer_start(&t->timer, t->timer.expires, mode); 1215 hrtimer_start(&t->timer, t->timer.expires, mode);
685 1216
686 schedule(); 1217 if (likely(t->task))
1218 schedule();
687 1219
688 hrtimer_cancel(&t->timer); 1220 hrtimer_cancel(&t->timer);
689 mode = HRTIMER_ABS; 1221 mode = HRTIMER_MODE_ABS;
690 1222
691 } while (t->task && !signal_pending(current)); 1223 } while (t->task && !signal_pending(current));
692 1224
@@ -702,10 +1234,10 @@ long __sched hrtimer_nanosleep_restart(struct restart_block *restart)
702 1234
703 restart->fn = do_no_restart_syscall; 1235 restart->fn = do_no_restart_syscall;
704 1236
705 hrtimer_init(&t.timer, restart->arg0, HRTIMER_ABS); 1237 hrtimer_init(&t.timer, restart->arg0, HRTIMER_MODE_ABS);
706 t.timer.expires.tv64 = ((u64)restart->arg3 << 32) | (u64) restart->arg2; 1238 t.timer.expires.tv64 = ((u64)restart->arg3 << 32) | (u64) restart->arg2;
707 1239
708 if (do_nanosleep(&t, HRTIMER_ABS)) 1240 if (do_nanosleep(&t, HRTIMER_MODE_ABS))
709 return 0; 1241 return 0;
710 1242
711 rmtp = (struct timespec __user *) restart->arg1; 1243 rmtp = (struct timespec __user *) restart->arg1;
@@ -738,7 +1270,7 @@ long hrtimer_nanosleep(struct timespec *rqtp, struct timespec __user *rmtp,
738 return 0; 1270 return 0;
739 1271
740 /* Absolute timers do not update the rmtp value and restart: */ 1272 /* Absolute timers do not update the rmtp value and restart: */
741 if (mode == HRTIMER_ABS) 1273 if (mode == HRTIMER_MODE_ABS)
742 return -ERESTARTNOHAND; 1274 return -ERESTARTNOHAND;
743 1275
744 if (rmtp) { 1276 if (rmtp) {
@@ -771,7 +1303,7 @@ sys_nanosleep(struct timespec __user *rqtp, struct timespec __user *rmtp)
771 if (!timespec_valid(&tu)) 1303 if (!timespec_valid(&tu))
772 return -EINVAL; 1304 return -EINVAL;
773 1305
774 return hrtimer_nanosleep(&tu, rmtp, HRTIMER_REL, CLOCK_MONOTONIC); 1306 return hrtimer_nanosleep(&tu, rmtp, HRTIMER_MODE_REL, CLOCK_MONOTONIC);
775} 1307}
776 1308
777/* 1309/*
@@ -779,56 +1311,60 @@ sys_nanosleep(struct timespec __user *rqtp, struct timespec __user *rmtp)
779 */ 1311 */
780static void __devinit init_hrtimers_cpu(int cpu) 1312static void __devinit init_hrtimers_cpu(int cpu)
781{ 1313{
782 struct hrtimer_base *base = per_cpu(hrtimer_bases, cpu); 1314 struct hrtimer_cpu_base *cpu_base = &per_cpu(hrtimer_bases, cpu);
783 int i; 1315 int i;
784 1316
785 for (i = 0; i < MAX_HRTIMER_BASES; i++, base++) { 1317 spin_lock_init(&cpu_base->lock);
786 spin_lock_init(&base->lock); 1318 lockdep_set_class(&cpu_base->lock, &cpu_base->lock_key);
787 lockdep_set_class(&base->lock, &base->lock_key); 1319
788 } 1320 for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++)
1321 cpu_base->clock_base[i].cpu_base = cpu_base;
1322
1323 hrtimer_init_hres(cpu_base);
789} 1324}
790 1325
791#ifdef CONFIG_HOTPLUG_CPU 1326#ifdef CONFIG_HOTPLUG_CPU
792 1327
793static void migrate_hrtimer_list(struct hrtimer_base *old_base, 1328static void migrate_hrtimer_list(struct hrtimer_clock_base *old_base,
794 struct hrtimer_base *new_base) 1329 struct hrtimer_clock_base *new_base)
795{ 1330{
796 struct hrtimer *timer; 1331 struct hrtimer *timer;
797 struct rb_node *node; 1332 struct rb_node *node;
798 1333
799 while ((node = rb_first(&old_base->active))) { 1334 while ((node = rb_first(&old_base->active))) {
800 timer = rb_entry(node, struct hrtimer, node); 1335 timer = rb_entry(node, struct hrtimer, node);
801 __remove_hrtimer(timer, old_base); 1336 BUG_ON(hrtimer_callback_running(timer));
1337 __remove_hrtimer(timer, old_base, HRTIMER_STATE_INACTIVE, 0);
802 timer->base = new_base; 1338 timer->base = new_base;
803 enqueue_hrtimer(timer, new_base); 1339 /*
1340 * Enqueue the timer. Allow reprogramming of the event device
1341 */
1342 enqueue_hrtimer(timer, new_base, 1);
804 } 1343 }
805} 1344}
806 1345
807static void migrate_hrtimers(int cpu) 1346static void migrate_hrtimers(int cpu)
808{ 1347{
809 struct hrtimer_base *old_base, *new_base; 1348 struct hrtimer_cpu_base *old_base, *new_base;
810 int i; 1349 int i;
811 1350
812 BUG_ON(cpu_online(cpu)); 1351 BUG_ON(cpu_online(cpu));
813 old_base = per_cpu(hrtimer_bases, cpu); 1352 old_base = &per_cpu(hrtimer_bases, cpu);
814 new_base = get_cpu_var(hrtimer_bases); 1353 new_base = &get_cpu_var(hrtimer_bases);
815
816 local_irq_disable();
817 1354
818 for (i = 0; i < MAX_HRTIMER_BASES; i++) { 1355 tick_cancel_sched_timer(cpu);
819 1356
820 spin_lock(&new_base->lock); 1357 local_irq_disable();
821 spin_lock(&old_base->lock);
822
823 BUG_ON(old_base->curr_timer);
824 1358
825 migrate_hrtimer_list(old_base, new_base); 1359 spin_lock(&new_base->lock);
1360 spin_lock(&old_base->lock);
826 1361
827 spin_unlock(&old_base->lock); 1362 for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) {
828 spin_unlock(&new_base->lock); 1363 migrate_hrtimer_list(&old_base->clock_base[i],
829 old_base++; 1364 &new_base->clock_base[i]);
830 new_base++;
831 } 1365 }
1366 spin_unlock(&old_base->lock);
1367 spin_unlock(&new_base->lock);
832 1368
833 local_irq_enable(); 1369 local_irq_enable();
834 put_cpu_var(hrtimer_bases); 1370 put_cpu_var(hrtimer_bases);
@@ -848,6 +1384,7 @@ static int __cpuinit hrtimer_cpu_notify(struct notifier_block *self,
848 1384
849#ifdef CONFIG_HOTPLUG_CPU 1385#ifdef CONFIG_HOTPLUG_CPU
850 case CPU_DEAD: 1386 case CPU_DEAD:
1387 clockevents_notify(CLOCK_EVT_NOTIFY_CPU_DEAD, &cpu);
851 migrate_hrtimers(cpu); 1388 migrate_hrtimers(cpu);
852 break; 1389 break;
853#endif 1390#endif
@@ -868,5 +1405,8 @@ void __init hrtimers_init(void)
868 hrtimer_cpu_notify(&hrtimers_nb, (unsigned long)CPU_UP_PREPARE, 1405 hrtimer_cpu_notify(&hrtimers_nb, (unsigned long)CPU_UP_PREPARE,
869 (void *)(long)smp_processor_id()); 1406 (void *)(long)smp_processor_id());
870 register_cpu_notifier(&hrtimers_nb); 1407 register_cpu_notifier(&hrtimers_nb);
1408#ifdef CONFIG_HIGH_RES_TIMERS
1409 open_softirq(HRTIMER_SOFTIRQ, run_hrtimer_softirq, NULL);
1410#endif
871} 1411}
872 1412
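The hunks above change the hrtimer callback contract: callbacks now return enum hrtimer_restart, and the mode constants become HRTIMER_MODE_ABS / HRTIMER_MODE_REL. A minimal sketch of a timer user against the reworked API, using only calls visible in this patch; the timer name and the 100 ms period are illustrative, not taken from the patch:

	/* Sketch only: a self-rearming hrtimer under the reworked API. */
	#include <linux/hrtimer.h>
	#include <linux/ktime.h>

	static struct hrtimer example_timer;
	static ktime_t example_period;

	static enum hrtimer_restart example_timer_fn(struct hrtimer *timer)
	{
		/* Re-arm relative to the callback time and keep the timer running. */
		hrtimer_forward(timer, hrtimer_cb_get_time(timer), example_period);
		return HRTIMER_RESTART;
	}

	static void example_timer_start(void)
	{
		example_period = ktime_set(0, 100 * 1000 * 1000);	/* 100 ms */
		hrtimer_init(&example_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
		example_timer.function = example_timer_fn;
		hrtimer_start(&example_timer, example_period, HRTIMER_MODE_REL);
	}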
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index 475e8a71bcdc..0133f4f9e9f0 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -168,7 +168,7 @@ EXPORT_SYMBOL(set_irq_data);
168/** 168/**
169 * set_irq_data - set irq type data for an irq 169 * set_irq_data - set irq type data for an irq
170 * @irq: Interrupt number 170 * @irq: Interrupt number
171 * @data: Pointer to interrupt specific data 171 * @entry: Pointer to MSI descriptor data
172 * 172 *
173 * Set the hardware irq controller data for an irq 173 * Set the hardware irq controller data for an irq
174 */ 174 */
@@ -230,10 +230,6 @@ static void default_enable(unsigned int irq)
230 */ 230 */
231static void default_disable(unsigned int irq) 231static void default_disable(unsigned int irq)
232{ 232{
233 struct irq_desc *desc = irq_desc + irq;
234
235 if (!(desc->status & IRQ_DELAYED_DISABLE))
236 desc->chip->mask(irq);
237} 233}
238 234
239/* 235/*
@@ -298,13 +294,18 @@ handle_simple_irq(unsigned int irq, struct irq_desc *desc)
298 294
299 if (unlikely(desc->status & IRQ_INPROGRESS)) 295 if (unlikely(desc->status & IRQ_INPROGRESS))
300 goto out_unlock; 296 goto out_unlock;
301 desc->status &= ~(IRQ_REPLAY | IRQ_WAITING);
302 kstat_cpu(cpu).irqs[irq]++; 297 kstat_cpu(cpu).irqs[irq]++;
303 298
304 action = desc->action; 299 action = desc->action;
305 if (unlikely(!action || (desc->status & IRQ_DISABLED))) 300 if (unlikely(!action || (desc->status & IRQ_DISABLED))) {
301 if (desc->chip->mask)
302 desc->chip->mask(irq);
303 desc->status &= ~(IRQ_REPLAY | IRQ_WAITING);
304 desc->status |= IRQ_PENDING;
306 goto out_unlock; 305 goto out_unlock;
306 }
307 307
308 desc->status &= ~(IRQ_REPLAY | IRQ_WAITING | IRQ_PENDING);
308 desc->status |= IRQ_INPROGRESS; 309 desc->status |= IRQ_INPROGRESS;
309 spin_unlock(&desc->lock); 310 spin_unlock(&desc->lock);
310 311
@@ -396,11 +397,13 @@ handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc)
396 397
397 /* 398 /*
398 * If its disabled or no action available 399 * If its disabled or no action available
399 * keep it masked and get out of here 400 * then mask it and get out of here:
400 */ 401 */
401 action = desc->action; 402 action = desc->action;
402 if (unlikely(!action || (desc->status & IRQ_DISABLED))) { 403 if (unlikely(!action || (desc->status & IRQ_DISABLED))) {
403 desc->status |= IRQ_PENDING; 404 desc->status |= IRQ_PENDING;
405 if (desc->chip->mask)
406 desc->chip->mask(irq);
404 goto out; 407 goto out;
405 } 408 }
406 409
@@ -562,10 +565,8 @@ __set_irq_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained,
562 565
563 /* Uninstall? */ 566 /* Uninstall? */
564 if (handle == handle_bad_irq) { 567 if (handle == handle_bad_irq) {
565 if (desc->chip != &no_irq_chip) { 568 if (desc->chip != &no_irq_chip)
566 desc->chip->mask(irq); 569 mask_ack_irq(desc, irq);
567 desc->chip->ack(irq);
568 }
569 desc->status |= IRQ_DISABLED; 570 desc->status |= IRQ_DISABLED;
570 desc->depth = 1; 571 desc->depth = 1;
571 } 572 }
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index acc5d9fe462b..5597c157442a 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -38,6 +38,46 @@ void synchronize_irq(unsigned int irq)
38} 38}
39EXPORT_SYMBOL(synchronize_irq); 39EXPORT_SYMBOL(synchronize_irq);
40 40
41/**
42 * irq_can_set_affinity - Check if the affinity of a given irq can be set
43 * @irq: Interrupt to check
44 *
45 */
46int irq_can_set_affinity(unsigned int irq)
47{
48 struct irq_desc *desc = irq_desc + irq;
49
50 if (CHECK_IRQ_PER_CPU(desc->status) || !desc->chip ||
51 !desc->chip->set_affinity)
52 return 0;
53
54 return 1;
55}
56
57/**
58 * irq_set_affinity - Set the irq affinity of a given irq
59 * @irq: Interrupt to set affinity
60 * @cpumask: cpumask
61 *
62 */
63int irq_set_affinity(unsigned int irq, cpumask_t cpumask)
64{
65 struct irq_desc *desc = irq_desc + irq;
66
67 if (!desc->chip->set_affinity)
68 return -EINVAL;
69
70 set_balance_irq_affinity(irq, cpumask);
71
72#ifdef CONFIG_GENERIC_PENDING_IRQ
73 set_pending_irq(irq, cpumask);
74#else
75 desc->affinity = cpumask;
76 desc->chip->set_affinity(irq, cpumask);
77#endif
78 return 0;
79}
80
41#endif 81#endif
42 82
43/** 83/**
@@ -281,6 +321,10 @@ int setup_irq(unsigned int irq, struct irqaction *new)
281 if (new->flags & IRQF_PERCPU) 321 if (new->flags & IRQF_PERCPU)
282 desc->status |= IRQ_PER_CPU; 322 desc->status |= IRQ_PER_CPU;
283#endif 323#endif
324 /* Exclude IRQ from balancing */
325 if (new->flags & IRQF_NOBALANCING)
326 desc->status |= IRQ_NO_BALANCING;
327
284 if (!shared) { 328 if (!shared) {
285 irq_chip_set_defaults(desc->chip); 329 irq_chip_set_defaults(desc->chip);
286 330
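The new irq_can_set_affinity()/irq_set_affinity() pair above replaces the proc-private helper removed in the kernel/irq/proc.c hunk below. A minimal usage sketch, assuming SMP and that the declarations are visible through <linux/interrupt.h>; the wrapper name and the -ENOSYS choice are illustrative:

	#include <linux/interrupt.h>
	#include <linux/cpumask.h>
	#include <linux/errno.h>

	/* Sketch only: pin an interrupt to a single CPU via the new helpers. */
	static int example_pin_irq(unsigned int irq, int cpu)
	{
		cpumask_t mask = CPU_MASK_NONE;

		if (!irq_can_set_affinity(irq))
			return -ENOSYS;

		cpu_set(cpu, mask);
		return irq_set_affinity(irq, mask);
	}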
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c
index 6d3be06e8ce6..2db91eb54ad8 100644
--- a/kernel/irq/proc.c
+++ b/kernel/irq/proc.c
@@ -16,26 +16,6 @@ static struct proc_dir_entry *root_irq_dir;
16 16
17#ifdef CONFIG_SMP 17#ifdef CONFIG_SMP
18 18
19#ifdef CONFIG_GENERIC_PENDING_IRQ
20void proc_set_irq_affinity(unsigned int irq, cpumask_t mask_val)
21{
22 set_balance_irq_affinity(irq, mask_val);
23
24 /*
25 * Save these away for later use. Re-progam when the
26 * interrupt is pending
27 */
28 set_pending_irq(irq, mask_val);
29}
30#else
31void proc_set_irq_affinity(unsigned int irq, cpumask_t mask_val)
32{
33 set_balance_irq_affinity(irq, mask_val);
34 irq_desc[irq].affinity = mask_val;
35 irq_desc[irq].chip->set_affinity(irq, mask_val);
36}
37#endif
38
39static int irq_affinity_read_proc(char *page, char **start, off_t off, 19static int irq_affinity_read_proc(char *page, char **start, off_t off,
40 int count, int *eof, void *data) 20 int count, int *eof, void *data)
41{ 21{
@@ -55,7 +35,7 @@ static int irq_affinity_write_proc(struct file *file, const char __user *buffer,
55 cpumask_t new_value, tmp; 35 cpumask_t new_value, tmp;
56 36
57 if (!irq_desc[irq].chip->set_affinity || no_irq_affinity || 37 if (!irq_desc[irq].chip->set_affinity || no_irq_affinity ||
58 CHECK_IRQ_PER_CPU(irq_desc[irq].status)) 38 irq_balancing_disabled(irq))
59 return -EIO; 39 return -EIO;
60 40
61 err = cpumask_parse_user(buffer, count, new_value); 41 err = cpumask_parse_user(buffer, count, new_value);
@@ -73,7 +53,7 @@ static int irq_affinity_write_proc(struct file *file, const char __user *buffer,
73 code to set default SMP affinity. */ 53 code to set default SMP affinity. */
74 return select_smp_affinity(irq) ? -EINVAL : full_count; 54 return select_smp_affinity(irq) ? -EINVAL : full_count;
75 55
76 proc_set_irq_affinity(irq, new_value); 56 irq_set_affinity(irq, new_value);
77 57
78 return full_count; 58 return full_count;
79} 59}
diff --git a/kernel/itimer.c b/kernel/itimer.c
index 204ed7939e75..307c6a632ef6 100644
--- a/kernel/itimer.c
+++ b/kernel/itimer.c
@@ -128,18 +128,13 @@ asmlinkage long sys_getitimer(int which, struct itimerval __user *value)
128/* 128/*
129 * The timer is automagically restarted, when interval != 0 129 * The timer is automagically restarted, when interval != 0
130 */ 130 */
131int it_real_fn(struct hrtimer *timer) 131enum hrtimer_restart it_real_fn(struct hrtimer *timer)
132{ 132{
133 struct signal_struct *sig = 133 struct signal_struct *sig =
134 container_of(timer, struct signal_struct, real_timer); 134 container_of(timer, struct signal_struct, real_timer);
135 135
136 send_group_sig_info(SIGALRM, SEND_SIG_PRIV, sig->tsk); 136 send_group_sig_info(SIGALRM, SEND_SIG_PRIV, sig->tsk);
137 137
138 if (sig->it_real_incr.tv64 != 0) {
139 hrtimer_forward(timer, timer->base->softirq_time,
140 sig->it_real_incr);
141 return HRTIMER_RESTART;
142 }
143 return HRTIMER_NORESTART; 138 return HRTIMER_NORESTART;
144} 139}
145 140
@@ -231,11 +226,14 @@ again:
231 spin_unlock_irq(&tsk->sighand->siglock); 226 spin_unlock_irq(&tsk->sighand->siglock);
232 goto again; 227 goto again;
233 } 228 }
234 tsk->signal->it_real_incr =
235 timeval_to_ktime(value->it_interval);
236 expires = timeval_to_ktime(value->it_value); 229 expires = timeval_to_ktime(value->it_value);
237 if (expires.tv64 != 0) 230 if (expires.tv64 != 0) {
238 hrtimer_start(timer, expires, HRTIMER_REL); 231 tsk->signal->it_real_incr =
232 timeval_to_ktime(value->it_interval);
233 hrtimer_start(timer, expires, HRTIMER_MODE_REL);
234 } else
235 tsk->signal->it_real_incr.tv64 = 0;
236
239 spin_unlock_irq(&tsk->sighand->siglock); 237 spin_unlock_irq(&tsk->sighand->siglock);
240 break; 238 break;
241 case ITIMER_VIRTUAL: 239 case ITIMER_VIRTUAL:
diff --git a/kernel/kmod.c b/kernel/kmod.c
index 3a7379aa31ca..9f923f8ce6a0 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -36,6 +36,8 @@
36#include <linux/resource.h> 36#include <linux/resource.h>
37#include <asm/uaccess.h> 37#include <asm/uaccess.h>
38 38
39extern int delete_module(const char *name, unsigned int flags);
40
39extern int max_threads; 41extern int max_threads;
40 42
41static struct workqueue_struct *khelper_wq; 43static struct workqueue_struct *khelper_wq;
@@ -46,6 +48,7 @@ static struct workqueue_struct *khelper_wq;
46 modprobe_path is set via /proc/sys. 48 modprobe_path is set via /proc/sys.
47*/ 49*/
48char modprobe_path[KMOD_PATH_LEN] = "/sbin/modprobe"; 50char modprobe_path[KMOD_PATH_LEN] = "/sbin/modprobe";
51struct module_kobject kmod_mk;
49 52
50/** 53/**
51 * request_module - try to load a kernel module 54 * request_module - try to load a kernel module
@@ -75,6 +78,11 @@ int request_module(const char *fmt, ...)
75 static atomic_t kmod_concurrent = ATOMIC_INIT(0); 78 static atomic_t kmod_concurrent = ATOMIC_INIT(0);
76#define MAX_KMOD_CONCURRENT 50 /* Completely arbitrary value - KAO */ 79#define MAX_KMOD_CONCURRENT 50 /* Completely arbitrary value - KAO */
77 static int kmod_loop_msg; 80 static int kmod_loop_msg;
81 char modalias[16 + MODULE_NAME_LEN] = "MODALIAS=";
82 char *uevent_envp[2] = {
83 modalias,
84 NULL
85 };
78 86
79 va_start(args, fmt); 87 va_start(args, fmt);
80 ret = vsnprintf(module_name, MODULE_NAME_LEN, fmt, args); 88 ret = vsnprintf(module_name, MODULE_NAME_LEN, fmt, args);
@@ -82,6 +90,12 @@ int request_module(const char *fmt, ...)
82 if (ret >= MODULE_NAME_LEN) 90 if (ret >= MODULE_NAME_LEN)
83 return -ENAMETOOLONG; 91 return -ENAMETOOLONG;
84 92
93 strcpy(&modalias[strlen("MODALIAS=")], module_name);
94 kobject_uevent_env(&kmod_mk.kobj, KOBJ_CHANGE, uevent_envp);
95
96 if (modprobe_path[0] == '\0')
97 goto out;
98
85 /* If modprobe needs a service that is in a module, we get a recursive 99 /* If modprobe needs a service that is in a module, we get a recursive
86 * loop. Limit the number of running kmod threads to max_threads/2 or 100 * loop. Limit the number of running kmod threads to max_threads/2 or
87 * MAX_KMOD_CONCURRENT, whichever is the smaller. A cleaner method 101 * MAX_KMOD_CONCURRENT, whichever is the smaller. A cleaner method
@@ -108,9 +122,115 @@ int request_module(const char *fmt, ...)
108 122
109 ret = call_usermodehelper(modprobe_path, argv, envp, 1); 123 ret = call_usermodehelper(modprobe_path, argv, envp, 1);
110 atomic_dec(&kmod_concurrent); 124 atomic_dec(&kmod_concurrent);
125out:
111 return ret; 126 return ret;
112} 127}
113EXPORT_SYMBOL(request_module); 128EXPORT_SYMBOL(request_module);
129
130static ssize_t store_mod_request(struct module_attribute *mattr,
131 struct module *mod,
132 const char *buffer, size_t count)
133{
134 char name[MODULE_NAME_LEN];
135 int ret;
136
137 if (count < 1 || count+1 > MODULE_NAME_LEN)
138 return -EINVAL;
139 memcpy(name, buffer, count);
140 name[count] = '\0';
141 if (name[count-1] == '\n')
142 name[count-1] = '\0';
143
144 ret = request_module(name);
145 if (ret < 0)
146 return ret;
147 return count;
148}
149
150static struct module_attribute mod_request = {
151 .attr = { .name = "mod_request", .mode = S_IWUSR, .owner = THIS_MODULE },
152 .store = store_mod_request,
153};
154
155#ifdef CONFIG_MODULE_UNLOAD
156static ssize_t store_mod_unload(struct module_attribute *mattr,
157 struct module *mod,
158 const char *buffer, size_t count)
159{
160 char name[MODULE_NAME_LEN];
161 int ret;
162
163 if (count < 1 || count+1 > MODULE_NAME_LEN)
164 return -EINVAL;
165 memcpy(name, buffer, count);
166 name[count] = '\0';
167 if (name[count-1] == '\n')
168 name[count-1] = '\0';
169
170 ret = delete_module(name, O_NONBLOCK);
171 if (ret < 0)
172 return ret;
173 return count;
174}
175
176static struct module_attribute mod_unload = {
177 .attr = { .name = "mod_unload", .mode = S_IWUSR, .owner = THIS_MODULE },
178 .store = store_mod_unload,
179};
180#endif
181
182static ssize_t show_mod_request_helper(struct module_attribute *mattr,
183 struct module *mod,
184 char *buffer)
185{
186 return sprintf(buffer, "%s\n", modprobe_path);
187}
188
189static ssize_t store_mod_request_helper(struct module_attribute *mattr,
190 struct module *mod,
191 const char *buffer, size_t count)
192{
193 if (count < 1 || count+1 > KMOD_PATH_LEN)
194 return -EINVAL;
195 memcpy(modprobe_path, buffer, count);
196 modprobe_path[count] = '\0';
197 if (modprobe_path[count-1] == '\n')
198 modprobe_path[count-1] = '\0';
199 return count;
200}
201
202static struct module_attribute mod_request_helper = {
203 .attr = {
204 .name = "mod_request_helper",
205 .mode = S_IWUSR | S_IRUGO,
206 .owner = THIS_MODULE
207 },
208 .show = show_mod_request_helper,
209 .store = store_mod_request_helper,
210};
211
212void __init kmod_sysfs_init(void)
213{
214 int ret;
215
216 kmod_mk.mod = THIS_MODULE;
217 kobj_set_kset_s(&kmod_mk, module_subsys);
218 kobject_set_name(&kmod_mk.kobj, "kmod");
219 kobject_init(&kmod_mk.kobj);
220 ret = kobject_add(&kmod_mk.kobj);
221 if (ret < 0)
222 goto out;
223
224 ret = sysfs_create_file(&kmod_mk.kobj, &mod_request_helper.attr);
225 ret = sysfs_create_file(&kmod_mk.kobj, &mod_request.attr);
226#ifdef CONFIG_MODULE_UNLOAD
227 ret = sysfs_create_file(&kmod_mk.kobj, &mod_unload.attr);
228#endif
229
230 kobject_uevent(&kmod_mk.kobj, KOBJ_ADD);
231out:
232 return;
233}
114#endif /* CONFIG_KMOD */ 234#endif /* CONFIG_KMOD */
115 235
116struct subprocess_info { 236struct subprocess_info {
@@ -217,7 +337,10 @@ static int wait_for_helper(void *data)
217 sub_info->retval = ret; 337 sub_info->retval = ret;
218 } 338 }
219 339
220 complete(sub_info->complete); 340 if (sub_info->wait < 0)
341 kfree(sub_info);
342 else
343 complete(sub_info->complete);
221 return 0; 344 return 0;
222} 345}
223 346
@@ -239,6 +362,9 @@ static void __call_usermodehelper(struct work_struct *work)
239 pid = kernel_thread(____call_usermodehelper, sub_info, 362 pid = kernel_thread(____call_usermodehelper, sub_info,
240 CLONE_VFORK | SIGCHLD); 363 CLONE_VFORK | SIGCHLD);
241 364
365 if (wait < 0)
366 return;
367
242 if (pid < 0) { 368 if (pid < 0) {
243 sub_info->retval = pid; 369 sub_info->retval = pid;
244 complete(sub_info->complete); 370 complete(sub_info->complete);
@@ -253,6 +379,9 @@ static void __call_usermodehelper(struct work_struct *work)
253 * @envp: null-terminated environment list 379 * @envp: null-terminated environment list
254 * @session_keyring: session keyring for process (NULL for an empty keyring) 380 * @session_keyring: session keyring for process (NULL for an empty keyring)
255 * @wait: wait for the application to finish and return status. 381 * @wait: wait for the application to finish and return status.
382 * when -1 don't wait at all, but you get no useful error back when
383 * the program couldn't be exec'ed. This makes it safe to call
384 * from interrupt context.
256 * 385 *
257 * Runs a user-space application. The application is started 386 * Runs a user-space application. The application is started
258 * asynchronously if wait is not set, and runs as a child of keventd. 387 * asynchronously if wait is not set, and runs as a child of keventd.
@@ -265,17 +394,8 @@ int call_usermodehelper_keys(char *path, char **argv, char **envp,
265 struct key *session_keyring, int wait) 394 struct key *session_keyring, int wait)
266{ 395{
267 DECLARE_COMPLETION_ONSTACK(done); 396 DECLARE_COMPLETION_ONSTACK(done);
268 struct subprocess_info sub_info = { 397 struct subprocess_info *sub_info;
269 .work = __WORK_INITIALIZER(sub_info.work, 398 int retval;
270 __call_usermodehelper),
271 .complete = &done,
272 .path = path,
273 .argv = argv,
274 .envp = envp,
275 .ring = session_keyring,
276 .wait = wait,
277 .retval = 0,
278 };
279 399
280 if (!khelper_wq) 400 if (!khelper_wq)
281 return -EBUSY; 401 return -EBUSY;
@@ -283,9 +403,25 @@ int call_usermodehelper_keys(char *path, char **argv, char **envp,
283 if (path[0] == '\0') 403 if (path[0] == '\0')
284 return 0; 404 return 0;
285 405
286 queue_work(khelper_wq, &sub_info.work); 406 sub_info = kzalloc(sizeof(struct subprocess_info), GFP_ATOMIC);
407 if (!sub_info)
408 return -ENOMEM;
409
410 INIT_WORK(&sub_info->work, __call_usermodehelper);
411 sub_info->complete = &done;
412 sub_info->path = path;
413 sub_info->argv = argv;
414 sub_info->envp = envp;
415 sub_info->ring = session_keyring;
416 sub_info->wait = wait;
417
418 queue_work(khelper_wq, &sub_info->work);
419 if (wait < 0) /* task has freed sub_info */
420 return 0;
287 wait_for_completion(&done); 421 wait_for_completion(&done);
288 return sub_info.retval; 422 retval = sub_info->retval;
423 kfree(sub_info);
424 return retval;
289} 425}
290EXPORT_SYMBOL(call_usermodehelper_keys); 426EXPORT_SYMBOL(call_usermodehelper_keys);
291 427
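With the heap-allocated subprocess_info above, call_usermodehelper() (the non-keyring wrapper in <linux/kmod.h>) gains a fire-and-forget mode. A short sketch of the wait argument; the helper path, arguments and environment are made up for illustration:

	#include <linux/kmod.h>

	static char *example_argv[] = { "/sbin/example-helper", "start", NULL };
	static char *example_envp[] = { "HOME=/", "PATH=/sbin:/bin", NULL };

	/* Sketch only: the three wait modes after this change. */
	static void example_run_helpers(void)
	{
		/* wait == 1: block until the helper exits and return its status. */
		call_usermodehelper(example_argv[0], example_argv, example_envp, 1);

		/* wait == 0: start the helper asynchronously. */
		call_usermodehelper(example_argv[0], example_argv, example_envp, 0);

		/*
		 * wait == -1: queue the work and return immediately. No useful
		 * error comes back if the exec fails, but sub_info is freed by
		 * the worker, so per the updated kernel-doc above this is safe
		 * to call from interrupt context.
		 */
		call_usermodehelper(example_argv[0], example_argv, example_envp, -1);
	}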
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 6fcf8dd148d0..d25a9ada3f8e 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -39,6 +39,8 @@
39#include <linux/moduleloader.h> 39#include <linux/moduleloader.h>
40#include <linux/kallsyms.h> 40#include <linux/kallsyms.h>
41#include <linux/freezer.h> 41#include <linux/freezer.h>
42#include <linux/seq_file.h>
43#include <linux/debugfs.h>
42#include <asm-generic/sections.h> 44#include <asm-generic/sections.h>
43#include <asm/cacheflush.h> 45#include <asm/cacheflush.h>
44#include <asm/errno.h> 46#include <asm/errno.h>
@@ -778,6 +780,12 @@ int __kprobes register_kretprobe(struct kretprobe *rp)
778 return -ENOSYS; 780 return -ENOSYS;
779} 781}
780 782
783static int __kprobes pre_handler_kretprobe(struct kprobe *p,
784 struct pt_regs *regs)
785{
786 return 0;
787}
788
781#endif /* ARCH_SUPPORTS_KRETPROBES */ 789#endif /* ARCH_SUPPORTS_KRETPROBES */
782 790
783void __kprobes unregister_kretprobe(struct kretprobe *rp) 791void __kprobes unregister_kretprobe(struct kretprobe *rp)
@@ -815,7 +823,109 @@ static int __init init_kprobes(void)
815 return err; 823 return err;
816} 824}
817 825
818__initcall(init_kprobes); 826#ifdef CONFIG_DEBUG_FS
827static void __kprobes report_probe(struct seq_file *pi, struct kprobe *p,
828 const char *sym, int offset,char *modname)
829{
830 char *kprobe_type;
831
832 if (p->pre_handler == pre_handler_kretprobe)
833 kprobe_type = "r";
834 else if (p->pre_handler == setjmp_pre_handler)
835 kprobe_type = "j";
836 else
837 kprobe_type = "k";
838 if (sym)
839 seq_printf(pi, "%p %s %s+0x%x %s\n", p->addr, kprobe_type,
840 sym, offset, (modname ? modname : " "));
841 else
842 seq_printf(pi, "%p %s %p\n", p->addr, kprobe_type, p->addr);
843}
844
845static void __kprobes *kprobe_seq_start(struct seq_file *f, loff_t *pos)
846{
847 return (*pos < KPROBE_TABLE_SIZE) ? pos : NULL;
848}
849
850static void __kprobes *kprobe_seq_next(struct seq_file *f, void *v, loff_t *pos)
851{
852 (*pos)++;
853 if (*pos >= KPROBE_TABLE_SIZE)
854 return NULL;
855 return pos;
856}
857
858static void __kprobes kprobe_seq_stop(struct seq_file *f, void *v)
859{
860 /* Nothing to do */
861}
862
863static int __kprobes show_kprobe_addr(struct seq_file *pi, void *v)
864{
865 struct hlist_head *head;
866 struct hlist_node *node;
867 struct kprobe *p, *kp;
868 const char *sym = NULL;
869 unsigned int i = *(loff_t *) v;
870 unsigned long size, offset = 0;
871 char *modname, namebuf[128];
872
873 head = &kprobe_table[i];
874 preempt_disable();
875 hlist_for_each_entry_rcu(p, node, head, hlist) {
876 sym = kallsyms_lookup((unsigned long)p->addr, &size,
877 &offset, &modname, namebuf);
878 if (p->pre_handler == aggr_pre_handler) {
879 list_for_each_entry_rcu(kp, &p->list, list)
880 report_probe(pi, kp, sym, offset, modname);
881 } else
882 report_probe(pi, p, sym, offset, modname);
883 }
884 preempt_enable();
885 return 0;
886}
887
888static struct seq_operations kprobes_seq_ops = {
889 .start = kprobe_seq_start,
890 .next = kprobe_seq_next,
891 .stop = kprobe_seq_stop,
892 .show = show_kprobe_addr
893};
894
895static int __kprobes kprobes_open(struct inode *inode, struct file *filp)
896{
897 return seq_open(filp, &kprobes_seq_ops);
898}
899
900static struct file_operations debugfs_kprobes_operations = {
901 .open = kprobes_open,
902 .read = seq_read,
903 .llseek = seq_lseek,
904 .release = seq_release,
905};
906
907static int __kprobes debugfs_kprobe_init(void)
908{
909 struct dentry *dir, *file;
910
911 dir = debugfs_create_dir("kprobes", NULL);
912 if (!dir)
913 return -ENOMEM;
914
915 file = debugfs_create_file("list", 0444, dir , 0 ,
916 &debugfs_kprobes_operations);
917 if (!file) {
918 debugfs_remove(dir);
919 return -ENOMEM;
920 }
921
922 return 0;
923}
924
925late_initcall(debugfs_kprobe_init);
926#endif /* CONFIG_DEBUG_FS */
927
928module_init(init_kprobes);
819 929
820EXPORT_SYMBOL_GPL(register_kprobe); 930EXPORT_SYMBOL_GPL(register_kprobe);
821EXPORT_SYMBOL_GPL(unregister_kprobe); 931EXPORT_SYMBOL_GPL(unregister_kprobe);
@@ -824,4 +934,3 @@ EXPORT_SYMBOL_GPL(unregister_jprobe);
824EXPORT_SYMBOL_GPL(jprobe_return); 934EXPORT_SYMBOL_GPL(jprobe_return);
825EXPORT_SYMBOL_GPL(register_kretprobe); 935EXPORT_SYMBOL_GPL(register_kretprobe);
826EXPORT_SYMBOL_GPL(unregister_kretprobe); 936EXPORT_SYMBOL_GPL(unregister_kretprobe);
827
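The debugfs interface added above lists every registered probe under <debugfs>/kprobes/list. Going by the seq_printf() format in report_probe(), each line reads "<address> <type> <symbol>+<offset> <module>", where the type is 'k' for kprobes, 'r' for kretprobes and 'j' for jprobes. Purely for illustration (addresses and the module name are invented):

	c01a2b3c k do_fork+0x0
	f8d0e124 r example_fn+0x4 example_mod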
diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index 592c576d77a7..a08a17218dfa 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -2228,6 +2228,10 @@ out_calc_hash:
2228 2228
2229 curr->lockdep_depth++; 2229 curr->lockdep_depth++;
2230 check_chain_key(curr); 2230 check_chain_key(curr);
2231#ifdef CONFIG_DEBUG_LOCKDEP
2232 if (unlikely(!debug_locks))
2233 return 0;
2234#endif
2231 if (unlikely(curr->lockdep_depth >= MAX_LOCK_DEPTH)) { 2235 if (unlikely(curr->lockdep_depth >= MAX_LOCK_DEPTH)) {
2232 debug_locks_off(); 2236 debug_locks_off();
2233 printk("BUG: MAX_LOCK_DEPTH too low!\n"); 2237 printk("BUG: MAX_LOCK_DEPTH too low!\n");
diff --git a/kernel/module.c b/kernel/module.c
index 8a94e054230c..8c25b1a04fa6 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -653,20 +653,11 @@ static void wait_for_zero_refcount(struct module *mod)
653 mutex_lock(&module_mutex); 653 mutex_lock(&module_mutex);
654} 654}
655 655
656asmlinkage long 656int delete_module(const char *name, unsigned int flags)
657sys_delete_module(const char __user *name_user, unsigned int flags)
658{ 657{
659 struct module *mod; 658 struct module *mod;
660 char name[MODULE_NAME_LEN];
661 int ret, forced = 0; 659 int ret, forced = 0;
662 660
663 if (!capable(CAP_SYS_MODULE))
664 return -EPERM;
665
666 if (strncpy_from_user(name, name_user, MODULE_NAME_LEN-1) < 0)
667 return -EFAULT;
668 name[MODULE_NAME_LEN-1] = '\0';
669
670 if (mutex_lock_interruptible(&module_mutex) != 0) 661 if (mutex_lock_interruptible(&module_mutex) != 0)
671 return -EINTR; 662 return -EINTR;
672 663
@@ -727,6 +718,21 @@ sys_delete_module(const char __user *name_user, unsigned int flags)
727 return ret; 718 return ret;
728} 719}
729 720
721asmlinkage long
722sys_delete_module(const char __user *name_user, unsigned int flags)
723{
724 char name[MODULE_NAME_LEN];
725
726 if (!capable(CAP_SYS_MODULE))
727 return -EPERM;
728
729 if (strncpy_from_user(name, name_user, MODULE_NAME_LEN-1) < 0)
730 return -EFAULT;
731 name[MODULE_NAME_LEN-1] = '\0';
732
733 return delete_module(name, flags);
734}
735
730static void print_unload_info(struct seq_file *m, struct module *mod) 736static void print_unload_info(struct seq_file *m, struct module *mod)
731{ 737{
732 struct module_use *use; 738 struct module_use *use;
@@ -1068,7 +1074,8 @@ static inline void remove_sect_attrs(struct module *mod)
1068} 1074}
1069#endif /* CONFIG_KALLSYMS */ 1075#endif /* CONFIG_KALLSYMS */
1070 1076
1071static int module_add_modinfo_attrs(struct module *mod) 1077#ifdef CONFIG_SYSFS
1078int module_add_modinfo_attrs(struct module *mod)
1072{ 1079{
1073 struct module_attribute *attr; 1080 struct module_attribute *attr;
1074 struct module_attribute *temp_attr; 1081 struct module_attribute *temp_attr;
@@ -1094,7 +1101,7 @@ static int module_add_modinfo_attrs(struct module *mod)
1094 return error; 1101 return error;
1095} 1102}
1096 1103
1097static void module_remove_modinfo_attrs(struct module *mod) 1104void module_remove_modinfo_attrs(struct module *mod)
1098{ 1105{
1099 struct module_attribute *attr; 1106 struct module_attribute *attr;
1100 int i; 1107 int i;
@@ -1109,8 +1116,10 @@ static void module_remove_modinfo_attrs(struct module *mod)
1109 } 1116 }
1110 kfree(mod->modinfo_attrs); 1117 kfree(mod->modinfo_attrs);
1111} 1118}
1119#endif
1112 1120
1113static int mod_sysfs_init(struct module *mod) 1121#ifdef CONFIG_SYSFS
1122int mod_sysfs_init(struct module *mod)
1114{ 1123{
1115 int err; 1124 int err;
1116 1125
@@ -1133,7 +1142,7 @@ out:
1133 return err; 1142 return err;
1134} 1143}
1135 1144
1136static int mod_sysfs_setup(struct module *mod, 1145int mod_sysfs_setup(struct module *mod,
1137 struct kernel_param *kparam, 1146 struct kernel_param *kparam,
1138 unsigned int num_params) 1147 unsigned int num_params)
1139{ 1148{
@@ -1169,16 +1178,14 @@ out_unreg:
1169out: 1178out:
1170 return err; 1179 return err;
1171} 1180}
1181#endif
1172 1182
1173static void mod_kobject_remove(struct module *mod) 1183static void mod_kobject_remove(struct module *mod)
1174{ 1184{
1175 module_remove_modinfo_attrs(mod); 1185 module_remove_modinfo_attrs(mod);
1176 module_param_sysfs_remove(mod); 1186 module_param_sysfs_remove(mod);
1177 if (mod->mkobj.drivers_dir) 1187 kobject_unregister(mod->mkobj.drivers_dir);
1178 kobject_unregister(mod->mkobj.drivers_dir); 1188 kobject_unregister(mod->holders_dir);
1179 if (mod->holders_dir)
1180 kobject_unregister(mod->holders_dir);
1181
1182 kobject_unregister(&mod->mkobj.kobj); 1189 kobject_unregister(&mod->mkobj.kobj);
1183} 1190}
1184 1191
@@ -2345,6 +2352,7 @@ void print_modules(void)
2345 printk("\n"); 2352 printk("\n");
2346} 2353}
2347 2354
2355#ifdef CONFIG_SYSFS
2348static char *make_driver_name(struct device_driver *drv) 2356static char *make_driver_name(struct device_driver *drv)
2349{ 2357{
2350 char *driver_name; 2358 char *driver_name;
@@ -2419,6 +2427,7 @@ void module_remove_driver(struct device_driver *drv)
2419 } 2427 }
2420} 2428}
2421EXPORT_SYMBOL(module_remove_driver); 2429EXPORT_SYMBOL(module_remove_driver);
2430#endif
2422 2431
2423#ifdef CONFIG_MODVERSIONS 2432#ifdef CONFIG_MODVERSIONS
2424/* Generate the signature for struct module here, too, for modversions. */ 2433/* Generate the signature for struct module here, too, for modversions. */
diff --git a/kernel/params.c b/kernel/params.c
index 553cf7d6a4be..7a751570b56d 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -30,8 +30,6 @@
30#define DEBUGP(fmt, a...) 30#define DEBUGP(fmt, a...)
31#endif 31#endif
32 32
33static struct kobj_type module_ktype;
34
35static inline char dash2underscore(char c) 33static inline char dash2underscore(char c)
36{ 34{
37 if (c == '-') 35 if (c == '-')
@@ -391,6 +389,7 @@ struct module_param_attrs
391 struct param_attribute attrs[0]; 389 struct param_attribute attrs[0];
392}; 390};
393 391
392#ifdef CONFIG_SYSFS
394#define to_param_attr(n) container_of(n, struct param_attribute, mattr); 393#define to_param_attr(n) container_of(n, struct param_attribute, mattr);
395 394
396static ssize_t param_attr_show(struct module_attribute *mattr, 395static ssize_t param_attr_show(struct module_attribute *mattr,
@@ -426,6 +425,7 @@ static ssize_t param_attr_store(struct module_attribute *mattr,
426 return len; 425 return len;
427 return err; 426 return err;
428} 427}
428#endif
429 429
430#ifdef CONFIG_MODULES 430#ifdef CONFIG_MODULES
431#define __modinit 431#define __modinit
@@ -433,6 +433,7 @@ static ssize_t param_attr_store(struct module_attribute *mattr,
433#define __modinit __init 433#define __modinit __init
434#endif 434#endif
435 435
436#ifdef CONFIG_SYSFS
436/* 437/*
437 * param_sysfs_setup - setup sysfs support for one module or KBUILD_MODNAME 438 * param_sysfs_setup - setup sysfs support for one module or KBUILD_MODNAME
438 * @mk: struct module_kobject (contains parent kobject) 439 * @mk: struct module_kobject (contains parent kobject)
@@ -500,9 +501,7 @@ param_sysfs_setup(struct module_kobject *mk,
500 return mp; 501 return mp;
501} 502}
502 503
503
504#ifdef CONFIG_MODULES 504#ifdef CONFIG_MODULES
505
506/* 505/*
507 * module_param_sysfs_setup - setup sysfs support for one module 506 * module_param_sysfs_setup - setup sysfs support for one module
508 * @mod: module 507 * @mod: module
@@ -625,7 +624,6 @@ static void __init param_sysfs_builtin(void)
625 624
626 625
627/* module-related sysfs stuff */ 626/* module-related sysfs stuff */
628#ifdef CONFIG_SYSFS
629 627
630#define to_module_attr(n) container_of(n, struct module_attribute, attr); 628#define to_module_attr(n) container_of(n, struct module_attribute, attr);
631#define to_module_kobject(n) container_of(n, struct module_kobject, kobj); 629#define to_module_kobject(n) container_of(n, struct module_kobject, kobj);
@@ -673,6 +671,8 @@ static struct sysfs_ops module_sysfs_ops = {
673 .store = module_attr_store, 671 .store = module_attr_store,
674}; 672};
675 673
674static struct kobj_type module_ktype;
675
676static int uevent_filter(struct kset *kset, struct kobject *kobj) 676static int uevent_filter(struct kset *kset, struct kobject *kobj)
677{ 677{
678 struct kobj_type *ktype = get_ktype(kobj); 678 struct kobj_type *ktype = get_ktype(kobj);
@@ -686,19 +686,12 @@ static struct kset_uevent_ops module_uevent_ops = {
686 .filter = uevent_filter, 686 .filter = uevent_filter,
687}; 687};
688 688
689#else 689decl_subsys(module, &module_ktype, &module_uevent_ops);
690static struct sysfs_ops module_sysfs_ops = {
691 .show = NULL,
692 .store = NULL,
693};
694#endif
695 690
696static struct kobj_type module_ktype = { 691static struct kobj_type module_ktype = {
697 .sysfs_ops = &module_sysfs_ops, 692 .sysfs_ops = &module_sysfs_ops,
698}; 693};
699 694
700decl_subsys(module, &module_ktype, &module_uevent_ops);
701
702/* 695/*
703 * param_sysfs_init - wrapper for built-in params support 696 * param_sysfs_init - wrapper for built-in params support
704 */ 697 */
@@ -714,11 +707,21 @@ static int __init param_sysfs_init(void)
714 } 707 }
715 708
716 param_sysfs_builtin(); 709 param_sysfs_builtin();
710 kmod_sysfs_init();
717 711
718 return 0; 712 return 0;
719} 713}
720subsys_initcall(param_sysfs_init); 714subsys_initcall(param_sysfs_init);
721 715
716#else
717#if 0
718static struct sysfs_ops module_sysfs_ops = {
719 .show = NULL,
720 .store = NULL,
721};
722#endif
723#endif
724
722EXPORT_SYMBOL(param_set_byte); 725EXPORT_SYMBOL(param_set_byte);
723EXPORT_SYMBOL(param_get_byte); 726EXPORT_SYMBOL(param_get_byte);
724EXPORT_SYMBOL(param_set_short); 727EXPORT_SYMBOL(param_set_short);
diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c
index 7c3e1e6dfb5b..657f77697415 100644
--- a/kernel/posix-cpu-timers.c
+++ b/kernel/posix-cpu-timers.c
@@ -304,7 +304,7 @@ int posix_cpu_clock_get(const clockid_t which_clock, struct timespec *tp)
304 * should be able to see it. 304 * should be able to see it.
305 */ 305 */
306 struct task_struct *p; 306 struct task_struct *p;
307 read_lock(&tasklist_lock); 307 rcu_read_lock();
308 p = find_task_by_pid(pid); 308 p = find_task_by_pid(pid);
309 if (p) { 309 if (p) {
310 if (CPUCLOCK_PERTHREAD(which_clock)) { 310 if (CPUCLOCK_PERTHREAD(which_clock)) {
@@ -312,12 +312,17 @@ int posix_cpu_clock_get(const clockid_t which_clock, struct timespec *tp)
312 error = cpu_clock_sample(which_clock, 312 error = cpu_clock_sample(which_clock,
313 p, &rtn); 313 p, &rtn);
314 } 314 }
315 } else if (p->tgid == pid && p->signal) { 315 } else {
316 error = cpu_clock_sample_group(which_clock, 316 read_lock(&tasklist_lock);
317 p, &rtn); 317 if (p->tgid == pid && p->signal) {
318 error =
319 cpu_clock_sample_group(which_clock,
320 p, &rtn);
321 }
322 read_unlock(&tasklist_lock);
318 } 323 }
319 } 324 }
320 read_unlock(&tasklist_lock); 325 rcu_read_unlock();
321 } 326 }
322 327
323 if (error) 328 if (error)
diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c
index a1bf61617839..44318ca71978 100644
--- a/kernel/posix-timers.c
+++ b/kernel/posix-timers.c
@@ -145,7 +145,7 @@ static int common_timer_set(struct k_itimer *, int,
145 struct itimerspec *, struct itimerspec *); 145 struct itimerspec *, struct itimerspec *);
146static int common_timer_del(struct k_itimer *timer); 146static int common_timer_del(struct k_itimer *timer);
147 147
148static int posix_timer_fn(struct hrtimer *data); 148static enum hrtimer_restart posix_timer_fn(struct hrtimer *data);
149 149
150static struct k_itimer *lock_timer(timer_t timer_id, unsigned long *flags); 150static struct k_itimer *lock_timer(timer_t timer_id, unsigned long *flags);
151 151
@@ -334,12 +334,12 @@ EXPORT_SYMBOL_GPL(posix_timer_event);
334 334
335 * This code is for CLOCK_REALTIME* and CLOCK_MONOTONIC* timers. 335 * This code is for CLOCK_REALTIME* and CLOCK_MONOTONIC* timers.
336 */ 336 */
337static int posix_timer_fn(struct hrtimer *timer) 337static enum hrtimer_restart posix_timer_fn(struct hrtimer *timer)
338{ 338{
339 struct k_itimer *timr; 339 struct k_itimer *timr;
340 unsigned long flags; 340 unsigned long flags;
341 int si_private = 0; 341 int si_private = 0;
342 int ret = HRTIMER_NORESTART; 342 enum hrtimer_restart ret = HRTIMER_NORESTART;
343 343
344 timr = container_of(timer, struct k_itimer, it.real.timer); 344 timr = container_of(timer, struct k_itimer, it.real.timer);
345 spin_lock_irqsave(&timr->it_lock, flags); 345 spin_lock_irqsave(&timr->it_lock, flags);
@@ -356,7 +356,7 @@ static int posix_timer_fn(struct hrtimer *timer)
356 if (timr->it.real.interval.tv64 != 0) { 356 if (timr->it.real.interval.tv64 != 0) {
357 timr->it_overrun += 357 timr->it_overrun +=
358 hrtimer_forward(timer, 358 hrtimer_forward(timer,
359 timer->base->softirq_time, 359 hrtimer_cb_get_time(timer),
360 timr->it.real.interval); 360 timr->it.real.interval);
361 ret = HRTIMER_RESTART; 361 ret = HRTIMER_RESTART;
362 ++timr->it_requeue_pending; 362 ++timr->it_requeue_pending;
@@ -722,7 +722,7 @@ common_timer_set(struct k_itimer *timr, int flags,
722 if (!new_setting->it_value.tv_sec && !new_setting->it_value.tv_nsec) 722 if (!new_setting->it_value.tv_sec && !new_setting->it_value.tv_nsec)
723 return 0; 723 return 0;
724 724
725 mode = flags & TIMER_ABSTIME ? HRTIMER_ABS : HRTIMER_REL; 725 mode = flags & TIMER_ABSTIME ? HRTIMER_MODE_ABS : HRTIMER_MODE_REL;
726 hrtimer_init(&timr->it.real.timer, timr->it_clock, mode); 726 hrtimer_init(&timr->it.real.timer, timr->it_clock, mode);
727 timr->it.real.timer.function = posix_timer_fn; 727 timr->it.real.timer.function = posix_timer_fn;
728 728
@@ -734,7 +734,7 @@ common_timer_set(struct k_itimer *timr, int flags,
734 /* SIGEV_NONE timers are not queued ! See common_timer_get */ 734 /* SIGEV_NONE timers are not queued ! See common_timer_get */
735 if (((timr->it_sigev_notify & ~SIGEV_THREAD_ID) == SIGEV_NONE)) { 735 if (((timr->it_sigev_notify & ~SIGEV_THREAD_ID) == SIGEV_NONE)) {
736 /* Setup correct expiry time for relative timers */ 736 /* Setup correct expiry time for relative timers */
737 if (mode == HRTIMER_REL) 737 if (mode == HRTIMER_MODE_REL)
738 timer->expires = ktime_add(timer->expires, 738 timer->expires = ktime_add(timer->expires,
739 timer->base->get_time()); 739 timer->base->get_time());
740 return 0; 740 return 0;
@@ -950,7 +950,8 @@ static int common_nsleep(const clockid_t which_clock, int flags,
950 struct timespec *tsave, struct timespec __user *rmtp) 950 struct timespec *tsave, struct timespec __user *rmtp)
951{ 951{
952 return hrtimer_nanosleep(tsave, rmtp, flags & TIMER_ABSTIME ? 952 return hrtimer_nanosleep(tsave, rmtp, flags & TIMER_ABSTIME ?
953 HRTIMER_ABS : HRTIMER_REL, which_clock); 953 HRTIMER_MODE_ABS : HRTIMER_MODE_REL,
954 which_clock);
954} 955}
955 956
956asmlinkage long 957asmlinkage long
diff --git a/kernel/printk.c b/kernel/printk.c
index 0c151877ff71..4b47e59248df 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -54,7 +54,7 @@ int console_printk[4] = {
54}; 54};
55 55
56/* 56/*
57 * Low lever drivers may need that to know if they can schedule in 57 * Low level drivers may need that to know if they can schedule in
58 * their unblank() callback or not. So let's export it. 58 * their unblank() callback or not. So let's export it.
59 */ 59 */
60int oops_in_progress; 60int oops_in_progress;
diff --git a/kernel/rtmutex.c b/kernel/rtmutex.c
index 4ab17da46fd8..180978cb2f75 100644
--- a/kernel/rtmutex.c
+++ b/kernel/rtmutex.c
@@ -625,7 +625,7 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state,
625 /* Setup the timer, when timeout != NULL */ 625 /* Setup the timer, when timeout != NULL */
626 if (unlikely(timeout)) 626 if (unlikely(timeout))
627 hrtimer_start(&timeout->timer, timeout->timer.expires, 627 hrtimer_start(&timeout->timer, timeout->timer.expires,
628 HRTIMER_ABS); 628 HRTIMER_MODE_ABS);
629 629
630 for (;;) { 630 for (;;) {
631 /* Try to acquire the lock: */ 631 /* Try to acquire the lock: */
diff --git a/kernel/sched.c b/kernel/sched.c
index 08f86178aa34..0dc757246d89 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -1853,6 +1853,13 @@ context_switch(struct rq *rq, struct task_struct *prev,
1853 struct mm_struct *mm = next->mm; 1853 struct mm_struct *mm = next->mm;
1854 struct mm_struct *oldmm = prev->active_mm; 1854 struct mm_struct *oldmm = prev->active_mm;
1855 1855
1856 /*
1857 * For paravirt, this is coupled with an exit in switch_to to
1858 * combine the page table reload and the switch backend into
1859 * one hypercall.
1860 */
1861 arch_enter_lazy_cpu_mode();
1862
1856 if (!mm) { 1863 if (!mm) {
1857 next->active_mm = oldmm; 1864 next->active_mm = oldmm;
1858 atomic_inc(&oldmm->mm_count); 1865 atomic_inc(&oldmm->mm_count);
diff --git a/kernel/signal.c b/kernel/signal.c
index 8072e568bbe0..e2a7d4bf7d57 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -456,26 +456,50 @@ static int __dequeue_signal(struct sigpending *pending, sigset_t *mask,
456int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info) 456int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info)
457{ 457{
458 int signr = __dequeue_signal(&tsk->pending, mask, info); 458 int signr = __dequeue_signal(&tsk->pending, mask, info);
459 if (!signr) 459 if (!signr) {
460 signr = __dequeue_signal(&tsk->signal->shared_pending, 460 signr = __dequeue_signal(&tsk->signal->shared_pending,
461 mask, info); 461 mask, info);
462 /*
463 * itimer signal ?
464 *
465 * itimers are process shared and we restart periodic
466 * itimers in the signal delivery path to prevent DoS
467 * attacks in the high resolution timer case. This is
468 * compliant with the old way of self restarting
469 * itimers, as the SIGALRM is a legacy signal and only
470 * queued once. Changing the restart behaviour to
471 * restart the timer in the signal dequeue path is
 472 * reducing the timer noise on heavily loaded !highres

473 * systems too.
474 */
475 if (unlikely(signr == SIGALRM)) {
476 struct hrtimer *tmr = &tsk->signal->real_timer;
477
478 if (!hrtimer_is_queued(tmr) &&
479 tsk->signal->it_real_incr.tv64 != 0) {
480 hrtimer_forward(tmr, tmr->base->get_time(),
481 tsk->signal->it_real_incr);
482 hrtimer_restart(tmr);
483 }
484 }
485 }
462 recalc_sigpending_tsk(tsk); 486 recalc_sigpending_tsk(tsk);
463 if (signr && unlikely(sig_kernel_stop(signr))) { 487 if (signr && unlikely(sig_kernel_stop(signr))) {
464 /* 488 /*
465 * Set a marker that we have dequeued a stop signal. Our 489 * Set a marker that we have dequeued a stop signal. Our
466 * caller might release the siglock and then the pending 490 * caller might release the siglock and then the pending
467 * stop signal it is about to process is no longer in the 491 * stop signal it is about to process is no longer in the
468 * pending bitmasks, but must still be cleared by a SIGCONT 492 * pending bitmasks, but must still be cleared by a SIGCONT
469 * (and overruled by a SIGKILL). So those cases clear this 493 * (and overruled by a SIGKILL). So those cases clear this
470 * shared flag after we've set it. Note that this flag may 494 * shared flag after we've set it. Note that this flag may
471 * remain set after the signal we return is ignored or 495 * remain set after the signal we return is ignored or
472 * handled. That doesn't matter because its only purpose 496 * handled. That doesn't matter because its only purpose
473 * is to alert stop-signal processing code when another 497 * is to alert stop-signal processing code when another
474 * processor has come along and cleared the flag. 498 * processor has come along and cleared the flag.
475 */ 499 */
476 if (!(tsk->signal->flags & SIGNAL_GROUP_EXIT)) 500 if (!(tsk->signal->flags & SIGNAL_GROUP_EXIT))
477 tsk->signal->flags |= SIGNAL_STOP_DEQUEUED; 501 tsk->signal->flags |= SIGNAL_STOP_DEQUEUED;
478 } 502 }
479 if ( signr && 503 if ( signr &&
480 ((info->si_code & __SI_MASK) == __SI_TIMER) && 504 ((info->si_code & __SI_MASK) == __SI_TIMER) &&
481 info->si_sys_private){ 505 info->si_sys_private){
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 918e52df090e..8b75008e2bd8 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -17,6 +17,7 @@
17#include <linux/kthread.h> 17#include <linux/kthread.h>
18#include <linux/rcupdate.h> 18#include <linux/rcupdate.h>
19#include <linux/smp.h> 19#include <linux/smp.h>
20#include <linux/tick.h>
20 21
21#include <asm/irq.h> 22#include <asm/irq.h>
22/* 23/*
@@ -273,6 +274,18 @@ EXPORT_SYMBOL(do_softirq);
273 274
274#endif 275#endif
275 276
277/*
278 * Enter an interrupt context.
279 */
280void irq_enter(void)
281{
282 __irq_enter();
283#ifdef CONFIG_NO_HZ
284 if (idle_cpu(smp_processor_id()))
285 tick_nohz_update_jiffies();
286#endif
287}
288
276#ifdef __ARCH_IRQ_EXIT_IRQS_DISABLED 289#ifdef __ARCH_IRQ_EXIT_IRQS_DISABLED
277# define invoke_softirq() __do_softirq() 290# define invoke_softirq() __do_softirq()
278#else 291#else
@@ -289,6 +302,12 @@ void irq_exit(void)
289 sub_preempt_count(IRQ_EXIT_OFFSET); 302 sub_preempt_count(IRQ_EXIT_OFFSET);
290 if (!in_interrupt() && local_softirq_pending()) 303 if (!in_interrupt() && local_softirq_pending())
291 invoke_softirq(); 304 invoke_softirq();
305
306#ifdef CONFIG_NO_HZ
307 /* Make sure that timer wheel updates are propagated */
308 if (!in_interrupt() && idle_cpu(smp_processor_id()) && !need_resched())
309 tick_nohz_stop_sched_tick();
310#endif
292 preempt_enable_no_resched(); 311 preempt_enable_no_resched();
293} 312}
294 313
diff --git a/kernel/time.c b/kernel/time.c
index 0e017bff4c19..c6c80ea5d0ea 100644
--- a/kernel/time.c
+++ b/kernel/time.c
@@ -470,6 +470,260 @@ struct timeval ns_to_timeval(const s64 nsec)
470 return tv; 470 return tv;
471} 471}
472 472
473/*
474 * Convert jiffies to milliseconds and back.
475 *
476 * Avoid unnecessary multiplications/divisions in the
477 * two most common HZ cases:
478 */
479unsigned int jiffies_to_msecs(const unsigned long j)
480{
481#if HZ <= MSEC_PER_SEC && !(MSEC_PER_SEC % HZ)
482 return (MSEC_PER_SEC / HZ) * j;
483#elif HZ > MSEC_PER_SEC && !(HZ % MSEC_PER_SEC)
484 return (j + (HZ / MSEC_PER_SEC) - 1)/(HZ / MSEC_PER_SEC);
485#else
486 return (j * MSEC_PER_SEC) / HZ;
487#endif
488}
489EXPORT_SYMBOL(jiffies_to_msecs);
490
491unsigned int jiffies_to_usecs(const unsigned long j)
492{
493#if HZ <= USEC_PER_SEC && !(USEC_PER_SEC % HZ)
494 return (USEC_PER_SEC / HZ) * j;
495#elif HZ > USEC_PER_SEC && !(HZ % USEC_PER_SEC)
496 return (j + (HZ / USEC_PER_SEC) - 1)/(HZ / USEC_PER_SEC);
497#else
498 return (j * USEC_PER_SEC) / HZ;
499#endif
500}
501EXPORT_SYMBOL(jiffies_to_usecs);
502
503/*
 504 * When we convert to jiffies we interpret incoming values
505 * the following way:
506 *
507 * - negative values mean 'infinite timeout' (MAX_JIFFY_OFFSET)
508 *
509 * - 'too large' values [that would result in larger than
510 * MAX_JIFFY_OFFSET values] mean 'infinite timeout' too.
511 *
512 * - all other values are converted to jiffies by either multiplying
513 * the input value by a factor or dividing it with a factor
514 *
515 * We must also be careful about 32-bit overflows.
516 */
517unsigned long msecs_to_jiffies(const unsigned int m)
518{
519 /*
520 * Negative value, means infinite timeout:
521 */
522 if ((int)m < 0)
523 return MAX_JIFFY_OFFSET;
524
525#if HZ <= MSEC_PER_SEC && !(MSEC_PER_SEC % HZ)
526 /*
527 * HZ is equal to or smaller than 1000, and 1000 is a nice
528 * round multiple of HZ, divide with the factor between them,
529 * but round upwards:
530 */
531 return (m + (MSEC_PER_SEC / HZ) - 1) / (MSEC_PER_SEC / HZ);
532#elif HZ > MSEC_PER_SEC && !(HZ % MSEC_PER_SEC)
533 /*
534 * HZ is larger than 1000, and HZ is a nice round multiple of
535 * 1000 - simply multiply with the factor between them.
536 *
537 * But first make sure the multiplication result cannot
538 * overflow:
539 */
540 if (m > jiffies_to_msecs(MAX_JIFFY_OFFSET))
541 return MAX_JIFFY_OFFSET;
542
543 return m * (HZ / MSEC_PER_SEC);
544#else
545 /*
546 * Generic case - multiply, round and divide. But first
547 * check that if we are doing a net multiplication, that
 548 * we wouldn't overflow:
549 */
550 if (HZ > MSEC_PER_SEC && m > jiffies_to_msecs(MAX_JIFFY_OFFSET))
551 return MAX_JIFFY_OFFSET;
552
553 return (m * HZ + MSEC_PER_SEC - 1) / MSEC_PER_SEC;
554#endif
555}
556EXPORT_SYMBOL(msecs_to_jiffies);
557
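As a worked example of the rounding rules above, assume HZ = 250, so MSEC_PER_SEC / HZ = 4: jiffies_to_msecs(1) = 4, msecs_to_jiffies(3) = (3 + 4 - 1) / 4 = 1, i.e. a sub-tick timeout is rounded up to one jiffy rather than truncated to zero, and msecs_to_jiffies(-5) takes the negative-value branch and returns MAX_JIFFY_OFFSET, an effectively infinite timeout.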
558unsigned long usecs_to_jiffies(const unsigned int u)
559{
560 if (u > jiffies_to_usecs(MAX_JIFFY_OFFSET))
561 return MAX_JIFFY_OFFSET;
562#if HZ <= USEC_PER_SEC && !(USEC_PER_SEC % HZ)
563 return (u + (USEC_PER_SEC / HZ) - 1) / (USEC_PER_SEC / HZ);
564#elif HZ > USEC_PER_SEC && !(HZ % USEC_PER_SEC)
565 return u * (HZ / USEC_PER_SEC);
566#else
567 return (u * HZ + USEC_PER_SEC - 1) / USEC_PER_SEC;
568#endif
569}
570EXPORT_SYMBOL(usecs_to_jiffies);
571
572/*
573 * The TICK_NSEC - 1 rounds up the value to the next resolution. Note
574 * that a remainder subtract here would not do the right thing as the
 575 * resolution values don't fall on second boundaries. I.e. the line:
576 * nsec -= nsec % TICK_NSEC; is NOT a correct resolution rounding.
577 *
578 * Rather, we just shift the bits off the right.
579 *
580 * The >> (NSEC_JIFFIE_SC - SEC_JIFFIE_SC) converts the scaled nsec
581 * value to a scaled second value.
582 */
583unsigned long
584timespec_to_jiffies(const struct timespec *value)
585{
586 unsigned long sec = value->tv_sec;
587 long nsec = value->tv_nsec + TICK_NSEC - 1;
588
589 if (sec >= MAX_SEC_IN_JIFFIES){
590 sec = MAX_SEC_IN_JIFFIES;
591 nsec = 0;
592 }
593 return (((u64)sec * SEC_CONVERSION) +
594 (((u64)nsec * NSEC_CONVERSION) >>
595 (NSEC_JIFFIE_SC - SEC_JIFFIE_SC))) >> SEC_JIFFIE_SC;
596
597}
598EXPORT_SYMBOL(timespec_to_jiffies);
599
600void
601jiffies_to_timespec(const unsigned long jiffies, struct timespec *value)
602{
603 /*
604 * Convert jiffies to nanoseconds and separate with
605 * one divide.
606 */
607 u64 nsec = (u64)jiffies * TICK_NSEC;
608 value->tv_sec = div_long_long_rem(nsec, NSEC_PER_SEC, &value->tv_nsec);
609}
610EXPORT_SYMBOL(jiffies_to_timespec);
611
612/* Same for "timeval"
613 *
614 * Well, almost. The problem here is that the real system resolution is
 615 * in nanoseconds and the value being converted is in microseconds.
 616 * Also for some machines (those that use HZ = 1024, in particular),
617 * there is a LARGE error in the tick size in microseconds.
618
619 * The solution we use is to do the rounding AFTER we convert the
620 * microsecond part. Thus the USEC_ROUND, the bits to be shifted off.
621 * Instruction wise, this should cost only an additional add with carry
622 * instruction above the way it was done above.
623 */
624unsigned long
625timeval_to_jiffies(const struct timeval *value)
626{
627 unsigned long sec = value->tv_sec;
628 long usec = value->tv_usec;
629
630 if (sec >= MAX_SEC_IN_JIFFIES){
631 sec = MAX_SEC_IN_JIFFIES;
632 usec = 0;
633 }
634 return (((u64)sec * SEC_CONVERSION) +
635 (((u64)usec * USEC_CONVERSION + USEC_ROUND) >>
636 (USEC_JIFFIE_SC - SEC_JIFFIE_SC))) >> SEC_JIFFIE_SC;
637}
638
639void jiffies_to_timeval(const unsigned long jiffies, struct timeval *value)
640{
641 /*
642 * Convert jiffies to nanoseconds and separate with
643 * one divide.
644 */
645 u64 nsec = (u64)jiffies * TICK_NSEC;
646 long tv_usec;
647
648 value->tv_sec = div_long_long_rem(nsec, NSEC_PER_SEC, &tv_usec);
649 tv_usec /= NSEC_PER_USEC;
650 value->tv_usec = tv_usec;
651}
652
653/*
654 * Convert jiffies/jiffies_64 to clock_t and back.
655 */
656clock_t jiffies_to_clock_t(long x)
657{
658#if (TICK_NSEC % (NSEC_PER_SEC / USER_HZ)) == 0
659 return x / (HZ / USER_HZ);
660#else
661 u64 tmp = (u64)x * TICK_NSEC;
662 do_div(tmp, (NSEC_PER_SEC / USER_HZ));
663 return (long)tmp;
664#endif
665}
666EXPORT_SYMBOL(jiffies_to_clock_t);
667
668unsigned long clock_t_to_jiffies(unsigned long x)
669{
670#if (HZ % USER_HZ)==0
671 if (x >= ~0UL / (HZ / USER_HZ))
672 return ~0UL;
673 return x * (HZ / USER_HZ);
674#else
675 u64 jif;
676
677 /* Don't worry about loss of precision here .. */
678 if (x >= ~0UL / HZ * USER_HZ)
679 return ~0UL;
680
681 /* .. but do try to contain it here */
682 jif = x * (u64) HZ;
683 do_div(jif, USER_HZ);
684 return jif;
685#endif
686}
687EXPORT_SYMBOL(clock_t_to_jiffies);
688
689u64 jiffies_64_to_clock_t(u64 x)
690{
691#if (TICK_NSEC % (NSEC_PER_SEC / USER_HZ)) == 0
692 do_div(x, HZ / USER_HZ);
693#else
694 /*
695 * There are better ways that don't overflow early,
696 * but even this doesn't overflow in hundreds of years
697 * in 64 bits, so..
698 */
699 x *= TICK_NSEC;
700 do_div(x, (NSEC_PER_SEC / USER_HZ));
701#endif
702 return x;
703}
704
705EXPORT_SYMBOL(jiffies_64_to_clock_t);
706
707u64 nsec_to_clock_t(u64 x)
708{
709#if (NSEC_PER_SEC % USER_HZ) == 0
710 do_div(x, (NSEC_PER_SEC / USER_HZ));
711#elif (USER_HZ % 512) == 0
712 x *= USER_HZ/512;
713 do_div(x, (NSEC_PER_SEC / 512));
714#else
715 /*
716 * max relative error 5.7e-8 (1.8s per year) for USER_HZ <= 1024,
717 * overflow after 64.99 years.
718 * exact for HZ=60, 72, 90, 120, 144, 180, 300, 600, 900, ...
719 */
720 x *= 9;
721 do_div(x, (unsigned long)((9ull * NSEC_PER_SEC + (USER_HZ/2)) /
722 USER_HZ));
723#endif
724 return x;
725}
726
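A quick check on the fallback branch of nsec_to_clock_t() above: the exact conversion is x * USER_HZ / NSEC_PER_SEC, and the code computes x * 9 divided by the rounded constant (9 * NSEC_PER_SEC + USER_HZ/2) / USER_HZ. For USER_HZ = 60 that divisor is exactly 150000000, so x * 9 / 150000000 equals x * 60 / 10^9 with no error at all, matching the "exact for HZ=60, 72, 90, ..." note; for other rates the only inaccuracy is the rounding of that divisor, which is where the quoted 5.7e-8 relative-error bound comes from.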
473#if (BITS_PER_LONG < 64) 727#if (BITS_PER_LONG < 64)
474u64 get_jiffies_64(void) 728u64 get_jiffies_64(void)
475{ 729{
diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig
new file mode 100644
index 000000000000..f66351126544
--- /dev/null
+++ b/kernel/time/Kconfig
@@ -0,0 +1,25 @@
1#
2# Timer subsystem related configuration options
3#
4config TICK_ONESHOT
5 bool
6 default n
7
8config NO_HZ
9 bool "Tickless System (Dynamic Ticks)"
10 depends on GENERIC_TIME && GENERIC_CLOCKEVENTS
11 select TICK_ONESHOT
12 help
13 This option enables a tickless system: timer interrupts will
14 only trigger on an as-needed basis both when the system is
15 busy and when the system is idle.
16
17config HIGH_RES_TIMERS
18 bool "High Resolution Timer Support"
19 depends on GENERIC_TIME && GENERIC_CLOCKEVENTS
20 select TICK_ONESHOT
21 help
22 This option enables high resolution timer support. If your
23 hardware is not capable then this option only increases
24 the size of the kernel image.
25
diff --git a/kernel/time/Makefile b/kernel/time/Makefile
index 61a3907d16fb..93bccba1f265 100644
--- a/kernel/time/Makefile
+++ b/kernel/time/Makefile
@@ -1 +1,8 @@
1obj-y += ntp.o clocksource.o jiffies.o 1obj-y += ntp.o clocksource.o jiffies.o timer_list.o
2
3obj-$(CONFIG_GENERIC_CLOCKEVENTS) += clockevents.o
4obj-$(CONFIG_GENERIC_CLOCKEVENTS) += tick-common.o
5obj-$(CONFIG_GENERIC_CLOCKEVENTS_BROADCAST) += tick-broadcast.o
6obj-$(CONFIG_TICK_ONESHOT) += tick-oneshot.o
7obj-$(CONFIG_TICK_ONESHOT) += tick-sched.o
8obj-$(CONFIG_TIMER_STATS) += timer_stats.o
diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c
new file mode 100644
index 000000000000..67932ea78c17
--- /dev/null
+++ b/kernel/time/clockevents.c
@@ -0,0 +1,345 @@
1/*
2 * linux/kernel/time/clockevents.c
3 *
4 * This file contains functions which manage clock event devices.
5 *
6 * Copyright(C) 2005-2006, Thomas Gleixner <tglx@linutronix.de>
7 * Copyright(C) 2005-2007, Red Hat, Inc., Ingo Molnar
8 * Copyright(C) 2006-2007, Timesys Corp., Thomas Gleixner
9 *
10 * This code is licenced under the GPL version 2. For details see
11 * kernel-base/COPYING.
12 */
13
14#include <linux/clockchips.h>
15#include <linux/hrtimer.h>
16#include <linux/init.h>
17#include <linux/module.h>
18#include <linux/notifier.h>
19#include <linux/smp.h>
20#include <linux/sysdev.h>
21
22/* The registered clock event devices */
23static LIST_HEAD(clockevent_devices);
24static LIST_HEAD(clockevents_released);
25
26/* Notification for clock events */
27static RAW_NOTIFIER_HEAD(clockevents_chain);
28
29/* Protection for the above */
30static DEFINE_SPINLOCK(clockevents_lock);
31
32/**
 33 * clockevent_delta2ns - Convert a latch value (device ticks) to nanoseconds
34 * @latch: value to convert
35 * @evt: pointer to clock event device descriptor
36 *
37 * Math helper, returns latch value converted to nanoseconds (bound checked)
38 */
39unsigned long clockevent_delta2ns(unsigned long latch,
40 struct clock_event_device *evt)
41{
42 u64 clc = ((u64) latch << evt->shift);
43
44 do_div(clc, evt->mult);
45 if (clc < 1000)
46 clc = 1000;
47 if (clc > LONG_MAX)
48 clc = LONG_MAX;
49
50 return (unsigned long) clc;
51}
52
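The conversion above is ns = (latch << shift) / mult, with the result clamped to a sane range. The following userspace sketch repeats that arithmetic outside the kernel; the mult/shift pair is invented for illustration (chosen to resemble a 1.19 MHz PIT-style timer with shift = 32) and does not come from this patch.

    /* Userspace illustration of the latch -> ns conversion above; the
     * mult/shift values are an assumed example, not data from this patch. */
    #include <limits.h>
    #include <stdint.h>
    #include <stdio.h>

    static unsigned long delta2ns(unsigned long latch, uint32_t mult, uint32_t shift)
    {
        uint64_t clc = (uint64_t)latch << shift;

        clc /= mult;                    /* userspace stand-in for do_div() */
        if (clc < 1000)                 /* same lower bound as the code above */
            clc = 1000;
        if (clc > LONG_MAX)             /* and the same upper bound */
            clc = LONG_MAX;
        return (unsigned long)clc;
    }

    int main(void)
    {
        uint32_t mult = 5124677, shift = 32;    /* assumed example values */

        printf("0x10000 ticks -> %lu ns\n", delta2ns(0x10000, mult, shift));
        return 0;
    }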
53/**
54 * clockevents_set_mode - set the operating mode of a clock event device
55 * @dev: device to modify
56 * @mode: new mode
57 *
58 * Must be called with interrupts disabled !
59 */
60void clockevents_set_mode(struct clock_event_device *dev,
61 enum clock_event_mode mode)
62{
63 if (dev->mode != mode) {
64 dev->set_mode(mode, dev);
65 dev->mode = mode;
66 }
67}
68
69/**
70 * clockevents_program_event - Reprogram the clock event device.
71 * @expires: absolute expiry time (monotonic clock)
72 *
73 * Returns 0 on success, -ETIME when the event is in the past.
74 */
75int clockevents_program_event(struct clock_event_device *dev, ktime_t expires,
76 ktime_t now)
77{
78 unsigned long long clc;
79 int64_t delta;
80
81 delta = ktime_to_ns(ktime_sub(expires, now));
82
83 if (delta <= 0)
84 return -ETIME;
85
86 dev->next_event = expires;
87
88 if (dev->mode == CLOCK_EVT_MODE_SHUTDOWN)
89 return 0;
90
91 if (delta > dev->max_delta_ns)
92 delta = dev->max_delta_ns;
93 if (delta < dev->min_delta_ns)
94 delta = dev->min_delta_ns;
95
96 clc = delta * dev->mult;
97 clc >>= dev->shift;
98
99 return dev->set_next_event((unsigned long) clc, dev);
100}
101
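clockevents_program_event() goes the other way: the requested delta is clamped to the device limits and converted to device ticks via (delta * mult) >> shift. A hedged userspace sketch of just that clamping and conversion, with made-up device parameters:

    /* Userspace sketch of the ns -> device-ticks path above; the device
     * limits and mult/shift are assumed example numbers only. */
    #include <stdio.h>
    #include <stdint.h>

    int main(void)
    {
        int64_t delta_ns  = 3000000000LL;     /* request: 3 s into the future */
        int64_t max_delta = 54925000;         /* assumed max delta (~54.9 ms) */
        int64_t min_delta = 12571;            /* assumed min programmable delta */
        uint32_t mult = 5124677, shift = 32;  /* assumed conversion factors */

        if (delta_ns > max_delta)             /* same clamping as the code above */
            delta_ns = max_delta;
        if (delta_ns < min_delta)
            delta_ns = min_delta;

        unsigned long long clc = ((unsigned long long)delta_ns * mult) >> shift;
        printf("programming %lld ns -> %llu device ticks\n",
               (long long)delta_ns, clc);
        return 0;
    }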
102/**
103 * clockevents_register_notifier - register a clock events change listener
104 */
105int clockevents_register_notifier(struct notifier_block *nb)
106{
107 int ret;
108
109 spin_lock(&clockevents_lock);
110 ret = raw_notifier_chain_register(&clockevents_chain, nb);
111 spin_unlock(&clockevents_lock);
112
113 return ret;
114}
115
116/**
117 * clockevents_unregister_notifier - unregister a clock events change listener
118 */
119void clockevents_unregister_notifier(struct notifier_block *nb)
120{
121 spin_lock(&clockevents_lock);
122 raw_notifier_chain_unregister(&clockevents_chain, nb);
123 spin_unlock(&clockevents_lock);
124}
125
126/*
127 * Notify about a clock event change. Called with clockevents_lock
128 * held.
129 */
130static void clockevents_do_notify(unsigned long reason, void *dev)
131{
132 raw_notifier_call_chain(&clockevents_chain, reason, dev);
133}
134
135/*
 136 * Called after a notify add to make devices available which were
137 * released from the notifier call.
138 */
139static void clockevents_notify_released(void)
140{
141 struct clock_event_device *dev;
142
143 while (!list_empty(&clockevents_released)) {
144 dev = list_entry(clockevents_released.next,
145 struct clock_event_device, list);
146 list_del(&dev->list);
147 list_add(&dev->list, &clockevent_devices);
148 clockevents_do_notify(CLOCK_EVT_NOTIFY_ADD, dev);
149 }
150}
151
152/**
153 * clockevents_register_device - register a clock event device
154 * @dev: device to register
155 */
156void clockevents_register_device(struct clock_event_device *dev)
157{
158 BUG_ON(dev->mode != CLOCK_EVT_MODE_UNUSED);
159
160 spin_lock(&clockevents_lock);
161
162 list_add(&dev->list, &clockevent_devices);
163 clockevents_do_notify(CLOCK_EVT_NOTIFY_ADD, dev);
164 clockevents_notify_released();
165
166 spin_unlock(&clockevents_lock);
167}
168
169/*
170 * Noop handler when we shut down an event device
171 */
172static void clockevents_handle_noop(struct clock_event_device *dev)
173{
174}
175
176/**
177 * clockevents_exchange_device - release and request clock devices
178 * @old: device to release (can be NULL)
179 * @new: device to request (can be NULL)
180 *
181 * Called from the notifier chain. clockevents_lock is held already
182 */
183void clockevents_exchange_device(struct clock_event_device *old,
184 struct clock_event_device *new)
185{
186 unsigned long flags;
187
188 local_irq_save(flags);
189 /*
190 * Caller releases a clock event device. We queue it into the
191 * released list and do a notify add later.
192 */
193 if (old) {
194 old->event_handler = clockevents_handle_noop;
195 clockevents_set_mode(old, CLOCK_EVT_MODE_UNUSED);
196 list_del(&old->list);
197 list_add(&old->list, &clockevents_released);
198 }
199
200 if (new) {
201 BUG_ON(new->mode != CLOCK_EVT_MODE_UNUSED);
202 clockevents_set_mode(new, CLOCK_EVT_MODE_SHUTDOWN);
203 }
204 local_irq_restore(flags);
205}
206
207/**
 208 * clockevents_request_device - request the best matching clock event device
209 */
210struct clock_event_device *clockevents_request_device(unsigned int features,
211 cpumask_t cpumask)
212{
213 struct clock_event_device *cur, *dev = NULL;
214 struct list_head *tmp;
215
216 spin_lock(&clockevents_lock);
217
218 list_for_each(tmp, &clockevent_devices) {
219 cur = list_entry(tmp, struct clock_event_device, list);
220
221 if ((cur->features & features) == features &&
222 cpus_equal(cpumask, cur->cpumask)) {
223 if (!dev || dev->rating < cur->rating)
224 dev = cur;
225 }
226 }
227
228 clockevents_exchange_device(NULL, dev);
229
230 spin_unlock(&clockevents_lock);
231
232 return dev;
233}
234
235/**
 237 * clockevents_release_device - release a requested clock event device
237 */
238void clockevents_release_device(struct clock_event_device *dev)
239{
240 spin_lock(&clockevents_lock);
241
242 clockevents_exchange_device(dev, NULL);
243 clockevents_notify_released();
244
245 spin_unlock(&clockevents_lock);
246}
247
248/**
249 * clockevents_notify - notification about relevant events
250 */
251void clockevents_notify(unsigned long reason, void *arg)
252{
253 spin_lock(&clockevents_lock);
254 clockevents_do_notify(reason, arg);
255
256 switch (reason) {
257 case CLOCK_EVT_NOTIFY_CPU_DEAD:
258 /*
259 * Unregister the clock event devices which were
260 * released from the users in the notify chain.
261 */
262 while (!list_empty(&clockevents_released)) {
263 struct clock_event_device *dev;
264
265 dev = list_entry(clockevents_released.next,
266 struct clock_event_device, list);
267 list_del(&dev->list);
268 }
269 break;
270 default:
271 break;
272 }
273 spin_unlock(&clockevents_lock);
274}
275EXPORT_SYMBOL_GPL(clockevents_notify);
276
277#ifdef CONFIG_SYSFS
278
279/**
280 * clockevents_show_registered - sysfs interface for listing clockevents
281 * @dev: unused
282 * @buf: char buffer to be filled with clock events list
283 *
284 * Provides sysfs interface for listing registered clock event devices
285 */
286static ssize_t clockevents_show_registered(struct sys_device *dev, char *buf)
287{
288 struct list_head *tmp;
289 char *p = buf;
290 int cpu;
291
292 spin_lock(&clockevents_lock);
293
294 list_for_each(tmp, &clockevent_devices) {
295 struct clock_event_device *ce;
296
297 ce = list_entry(tmp, struct clock_event_device, list);
298 p += sprintf(p, "%-20s F:%04x M:%d", ce->name,
299 ce->features, ce->mode);
300 p += sprintf(p, " C:");
301 if (!cpus_equal(ce->cpumask, cpu_possible_map)) {
302 for_each_cpu_mask(cpu, ce->cpumask)
303 p += sprintf(p, " %d", cpu);
304 } else {
305 /*
306 * FIXME: Add the cpu which is handling this sucker
307 */
308 }
309 p += sprintf(p, "\n");
310 }
311
312 spin_unlock(&clockevents_lock);
313
314 return p - buf;
315}
316
317/*
318 * Sysfs setup bits:
319 */
320static SYSDEV_ATTR(registered, 0600,
321 clockevents_show_registered, NULL);
322
323static struct sysdev_class clockevents_sysclass = {
324 set_kset_name("clockevents"),
325};
326
327static struct sys_device clockevents_sys_device = {
328 .id = 0,
329 .cls = &clockevents_sysclass,
330};
331
332static int __init clockevents_sysfs_init(void)
333{
334 int error = sysdev_class_register(&clockevents_sysclass);
335
336 if (!error)
337 error = sysdev_register(&clockevents_sys_device);
338 if (!error)
339 error = sysdev_create_file(
340 &clockevents_sys_device,
341 &attr_registered);
342 return error;
343}
344device_initcall(clockevents_sysfs_init);
345#endif
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index d9ef176c4e09..193a0793af95 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -29,6 +29,7 @@
29#include <linux/init.h> 29#include <linux/init.h>
30#include <linux/module.h> 30#include <linux/module.h>
31#include <linux/sched.h> /* for spin_unlock_irq() using preempt_count() m68k */ 31#include <linux/sched.h> /* for spin_unlock_irq() using preempt_count() m68k */
32#include <linux/tick.h>
32 33
33/* XXX - Would like a better way for initializing curr_clocksource */ 34/* XXX - Would like a better way for initializing curr_clocksource */
34extern struct clocksource clocksource_jiffies; 35extern struct clocksource clocksource_jiffies;
@@ -48,6 +49,7 @@ extern struct clocksource clocksource_jiffies;
48 */ 49 */
49static struct clocksource *curr_clocksource = &clocksource_jiffies; 50static struct clocksource *curr_clocksource = &clocksource_jiffies;
50static struct clocksource *next_clocksource; 51static struct clocksource *next_clocksource;
52static struct clocksource *clocksource_override;
51static LIST_HEAD(clocksource_list); 53static LIST_HEAD(clocksource_list);
52static DEFINE_SPINLOCK(clocksource_lock); 54static DEFINE_SPINLOCK(clocksource_lock);
53static char override_name[32]; 55static char override_name[32];
@@ -62,9 +64,123 @@ static int __init clocksource_done_booting(void)
62 finished_booting = 1; 64 finished_booting = 1;
63 return 0; 65 return 0;
64} 66}
65
66late_initcall(clocksource_done_booting); 67late_initcall(clocksource_done_booting);
67 68
69#ifdef CONFIG_CLOCKSOURCE_WATCHDOG
70static LIST_HEAD(watchdog_list);
71static struct clocksource *watchdog;
72static struct timer_list watchdog_timer;
73static DEFINE_SPINLOCK(watchdog_lock);
74static cycle_t watchdog_last;
75/*
 76 * Interval: 0.5sec Threshold: 0.0625s
77 */
78#define WATCHDOG_INTERVAL (HZ >> 1)
79#define WATCHDOG_TRESHOLD (NSEC_PER_SEC >> 4)
80
81static void clocksource_ratewd(struct clocksource *cs, int64_t delta)
82{
83 if (delta > -WATCHDOG_TRESHOLD && delta < WATCHDOG_TRESHOLD)
84 return;
85
86 printk(KERN_WARNING "Clocksource %s unstable (delta = %Ld ns)\n",
87 cs->name, delta);
88 cs->flags &= ~(CLOCK_SOURCE_VALID_FOR_HRES | CLOCK_SOURCE_WATCHDOG);
89 clocksource_change_rating(cs, 0);
90 cs->flags &= ~CLOCK_SOURCE_WATCHDOG;
91 list_del(&cs->wd_list);
92}
93
94static void clocksource_watchdog(unsigned long data)
95{
96 struct clocksource *cs, *tmp;
97 cycle_t csnow, wdnow;
98 int64_t wd_nsec, cs_nsec;
99
100 spin_lock(&watchdog_lock);
101
102 wdnow = watchdog->read();
103 wd_nsec = cyc2ns(watchdog, (wdnow - watchdog_last) & watchdog->mask);
104 watchdog_last = wdnow;
105
106 list_for_each_entry_safe(cs, tmp, &watchdog_list, wd_list) {
107 csnow = cs->read();
108 /* Initialized ? */
109 if (!(cs->flags & CLOCK_SOURCE_WATCHDOG)) {
110 if ((cs->flags & CLOCK_SOURCE_IS_CONTINUOUS) &&
111 (watchdog->flags & CLOCK_SOURCE_IS_CONTINUOUS)) {
112 cs->flags |= CLOCK_SOURCE_VALID_FOR_HRES;
113 /*
114 * We just marked the clocksource as
115 * highres-capable, notify the rest of the
116 * system as well so that we transition
117 * into high-res mode:
118 */
119 tick_clock_notify();
120 }
121 cs->flags |= CLOCK_SOURCE_WATCHDOG;
122 cs->wd_last = csnow;
123 } else {
124 cs_nsec = cyc2ns(cs, (csnow - cs->wd_last) & cs->mask);
125 cs->wd_last = csnow;
126 /* Check the delta. Might remove from the list ! */
127 clocksource_ratewd(cs, cs_nsec - wd_nsec);
128 }
129 }
130
131 if (!list_empty(&watchdog_list)) {
132 __mod_timer(&watchdog_timer,
133 watchdog_timer.expires + WATCHDOG_INTERVAL);
134 }
135 spin_unlock(&watchdog_lock);
136}
137static void clocksource_check_watchdog(struct clocksource *cs)
138{
139 struct clocksource *cse;
140 unsigned long flags;
141
142 spin_lock_irqsave(&watchdog_lock, flags);
143 if (cs->flags & CLOCK_SOURCE_MUST_VERIFY) {
144 int started = !list_empty(&watchdog_list);
145
146 list_add(&cs->wd_list, &watchdog_list);
147 if (!started && watchdog) {
148 watchdog_last = watchdog->read();
149 watchdog_timer.expires = jiffies + WATCHDOG_INTERVAL;
150 add_timer(&watchdog_timer);
151 }
152 } else if (cs->flags & CLOCK_SOURCE_IS_CONTINUOUS) {
153 cs->flags |= CLOCK_SOURCE_VALID_FOR_HRES;
154
155 if (!watchdog || cs->rating > watchdog->rating) {
156 if (watchdog)
157 del_timer(&watchdog_timer);
158 watchdog = cs;
159 init_timer(&watchdog_timer);
160 watchdog_timer.function = clocksource_watchdog;
161
162 /* Reset watchdog cycles */
163 list_for_each_entry(cse, &watchdog_list, wd_list)
164 cse->flags &= ~CLOCK_SOURCE_WATCHDOG;
165 /* Start if list is not empty */
166 if (!list_empty(&watchdog_list)) {
167 watchdog_last = watchdog->read();
168 watchdog_timer.expires =
169 jiffies + WATCHDOG_INTERVAL;
170 add_timer(&watchdog_timer);
171 }
172 }
173 }
174 spin_unlock_irqrestore(&watchdog_lock, flags);
175}
176#else
177static void clocksource_check_watchdog(struct clocksource *cs)
178{
179 if (cs->flags & CLOCK_SOURCE_IS_CONTINUOUS)
180 cs->flags |= CLOCK_SOURCE_VALID_FOR_HRES;
181}
182#endif
183
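The watchdog above measures the same elapsed interval with both clocksources and downgrades the watched one when the two measurements disagree by more than the threshold (1/16 of a second). A standalone sketch of that comparison, using invented interval readings:

    /* Userspace illustration of the watchdog comparison above; the two
     * interval measurements are invented numbers, not real readings. */
    #include <stdio.h>
    #include <stdint.h>

    #define NSEC_PER_SEC       1000000000LL
    #define WATCHDOG_THRESHOLD (NSEC_PER_SEC >> 4)   /* 62.5 ms, same value as above */

    int main(void)
    {
        int64_t wd_nsec = 500000000;   /* watchdog saw 0.5 s elapse */
        int64_t cs_nsec = 430000000;   /* watched clocksource saw 0.43 s (assumed) */
        int64_t delta   = cs_nsec - wd_nsec;

        if (delta <= -WATCHDOG_THRESHOLD || delta >= WATCHDOG_THRESHOLD)
            printf("clocksource unstable (delta = %lld ns), would be downgraded\n",
                   (long long)delta);
        else
            printf("clocksource agrees with watchdog (delta = %lld ns)\n",
                   (long long)delta);
        return 0;
    }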
68/** 184/**
69 * clocksource_get_next - Returns the selected clocksource 185 * clocksource_get_next - Returns the selected clocksource
70 * 186 *
@@ -84,60 +200,54 @@ struct clocksource *clocksource_get_next(void)
84} 200}
85 201
86/** 202/**
87 * select_clocksource - Finds the best registered clocksource. 203 * select_clocksource - Selects the best registered clocksource.
88 * 204 *
89 * Private function. Must hold clocksource_lock when called. 205 * Private function. Must hold clocksource_lock when called.
90 * 206 *
91 * Looks through the list of registered clocksources, returning 207 * Select the clocksource with the best rating, or the clocksource,
92 * the one with the highest rating value. If there is a clocksource 208 * which is selected by userspace override.
93 * name that matches the override string, it returns that clocksource.
94 */ 209 */
95static struct clocksource *select_clocksource(void) 210static struct clocksource *select_clocksource(void)
96{ 211{
97 struct clocksource *best = NULL; 212 struct clocksource *next;
98 struct list_head *tmp;
99 213
100 list_for_each(tmp, &clocksource_list) { 214 if (list_empty(&clocksource_list))
101 struct clocksource *src; 215 return NULL;
102 216
103 src = list_entry(tmp, struct clocksource, list); 217 if (clocksource_override)
104 if (!best) 218 next = clocksource_override;
105 best = src; 219 else
106 220 next = list_entry(clocksource_list.next, struct clocksource,
107 /* check for override: */ 221 list);
108 if (strlen(src->name) == strlen(override_name) && 222
109 !strcmp(src->name, override_name)) { 223 if (next == curr_clocksource)
110 best = src; 224 return NULL;
111 break;
112 }
113 /* pick the highest rating: */
114 if (src->rating > best->rating)
115 best = src;
116 }
117 225
118 return best; 226 return next;
119} 227}
120 228
121/** 229/*
122 * is_registered_source - Checks if clocksource is registered 230 * Enqueue the clocksource sorted by rating
123 * @c: pointer to a clocksource
124 *
125 * Private helper function. Must hold clocksource_lock when called.
126 *
127 * Returns one if the clocksource is already registered, zero otherwise.
128 */ 231 */
129static int is_registered_source(struct clocksource *c) 232static int clocksource_enqueue(struct clocksource *c)
130{ 233{
131 int len = strlen(c->name); 234 struct list_head *tmp, *entry = &clocksource_list;
132 struct list_head *tmp;
133 235
134 list_for_each(tmp, &clocksource_list) { 236 list_for_each(tmp, &clocksource_list) {
135 struct clocksource *src; 237 struct clocksource *cs;
136 238
137 src = list_entry(tmp, struct clocksource, list); 239 cs = list_entry(tmp, struct clocksource, list);
138 if (strlen(src->name) == len && !strcmp(src->name, c->name)) 240 if (cs == c)
139 return 1; 241 return -EBUSY;
242 /* Keep track of the place, where to insert */
243 if (cs->rating >= c->rating)
244 entry = tmp;
140 } 245 }
246 list_add(&c->list, entry);
247
248 if (strlen(c->name) == strlen(override_name) &&
249 !strcmp(c->name, override_name))
250 clocksource_override = c;
141 251
142 return 0; 252 return 0;
143} 253}
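clocksource_enqueue() keeps the list sorted by descending rating: the new entry is inserted after the last element whose rating is greater than or equal to its own. A standalone sketch of the same insertion rule, using a plain array instead of the kernel's list_head (names and ratings are examples only):

    /* Userspace sketch of the rating-sorted enqueue above; the clocksource
     * names and ratings are invented examples. */
    #include <stdio.h>
    #include <string.h>

    struct cs { const char *name; int rating; };

    int main(void)
    {
        struct cs list[8] = { {"tsc", 300}, {"acpi_pm", 200}, {"jiffies", 1} };
        int n = 3;
        struct cs new = { "hpet", 250 };   /* assumed new registration */

        /* find the last entry whose rating is >= the new one, insert after it */
        int pos = 0;
        for (int i = 0; i < n; i++)
            if (list[i].rating >= new.rating)
                pos = i + 1;

        memmove(&list[pos + 1], &list[pos], (n - pos) * sizeof(list[0]));
        list[pos] = new;
        n++;

        for (int i = 0; i < n; i++)
            printf("%-10s %d\n", list[i].name, list[i].rating);
        return 0;
    }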
@@ -150,42 +260,35 @@ static int is_registered_source(struct clocksource *c)
150 */ 260 */
151int clocksource_register(struct clocksource *c) 261int clocksource_register(struct clocksource *c)
152{ 262{
153 int ret = 0;
154 unsigned long flags; 263 unsigned long flags;
264 int ret;
155 265
156 spin_lock_irqsave(&clocksource_lock, flags); 266 spin_lock_irqsave(&clocksource_lock, flags);
157 /* check if clocksource is already registered */ 267 ret = clocksource_enqueue(c);
158 if (is_registered_source(c)) { 268 if (!ret)
159 printk("register_clocksource: Cannot register %s. "
160 "Already registered!", c->name);
161 ret = -EBUSY;
162 } else {
163 /* register it */
164 list_add(&c->list, &clocksource_list);
165 /* scan the registered clocksources, and pick the best one */
166 next_clocksource = select_clocksource(); 269 next_clocksource = select_clocksource();
167 }
168 spin_unlock_irqrestore(&clocksource_lock, flags); 270 spin_unlock_irqrestore(&clocksource_lock, flags);
271 if (!ret)
272 clocksource_check_watchdog(c);
169 return ret; 273 return ret;
170} 274}
171EXPORT_SYMBOL(clocksource_register); 275EXPORT_SYMBOL(clocksource_register);
172 276
173/** 277/**
174 * clocksource_reselect - Rescan list for next clocksource 278 * clocksource_change_rating - Change the rating of a registered clocksource
175 * 279 *
176 * A quick helper function to be used if a clocksource changes its
177 * rating. Forces the clocksource list to be re-scanned for the best
178 * clocksource.
179 */ 280 */
180void clocksource_reselect(void) 281void clocksource_change_rating(struct clocksource *cs, int rating)
181{ 282{
182 unsigned long flags; 283 unsigned long flags;
183 284
184 spin_lock_irqsave(&clocksource_lock, flags); 285 spin_lock_irqsave(&clocksource_lock, flags);
286 list_del(&cs->list);
287 cs->rating = rating;
288 clocksource_enqueue(cs);
185 next_clocksource = select_clocksource(); 289 next_clocksource = select_clocksource();
186 spin_unlock_irqrestore(&clocksource_lock, flags); 290 spin_unlock_irqrestore(&clocksource_lock, flags);
187} 291}
188EXPORT_SYMBOL(clocksource_reselect);
189 292
190#ifdef CONFIG_SYSFS 293#ifdef CONFIG_SYSFS
191/** 294/**
@@ -221,7 +324,11 @@ sysfs_show_current_clocksources(struct sys_device *dev, char *buf)
221static ssize_t sysfs_override_clocksource(struct sys_device *dev, 324static ssize_t sysfs_override_clocksource(struct sys_device *dev,
222 const char *buf, size_t count) 325 const char *buf, size_t count)
223{ 326{
327 struct clocksource *ovr = NULL;
328 struct list_head *tmp;
224 size_t ret = count; 329 size_t ret = count;
330 int len;
331
225 /* strings from sysfs write are not 0 terminated! */ 332 /* strings from sysfs write are not 0 terminated! */
226 if (count >= sizeof(override_name)) 333 if (count >= sizeof(override_name))
227 return -EINVAL; 334 return -EINVAL;
@@ -229,17 +336,32 @@ static ssize_t sysfs_override_clocksource(struct sys_device *dev,
 229 /* strip off \n: */ 336 /* strip off \n: */
230 if (buf[count-1] == '\n') 337 if (buf[count-1] == '\n')
231 count--; 338 count--;
232 if (count < 1)
233 return -EINVAL;
234 339
235 spin_lock_irq(&clocksource_lock); 340 spin_lock_irq(&clocksource_lock);
236 341
237 /* copy the name given: */ 342 if (count > 0)
238 memcpy(override_name, buf, count); 343 memcpy(override_name, buf, count);
239 override_name[count] = 0; 344 override_name[count] = 0;
240 345
241 /* try to select it: */ 346 len = strlen(override_name);
242 next_clocksource = select_clocksource(); 347 if (len) {
348 ovr = clocksource_override;
349 /* try to select it: */
350 list_for_each(tmp, &clocksource_list) {
351 struct clocksource *cs;
352
353 cs = list_entry(tmp, struct clocksource, list);
354 if (strlen(cs->name) == len &&
355 !strcmp(cs->name, override_name))
356 ovr = cs;
357 }
358 }
359
360 /* Reselect, when the override name has changed */
361 if (ovr != clocksource_override) {
362 clocksource_override = ovr;
363 next_clocksource = select_clocksource();
364 }
243 365
244 spin_unlock_irq(&clocksource_lock); 366 spin_unlock_irq(&clocksource_lock);
245 367
diff --git a/kernel/time/jiffies.c b/kernel/time/jiffies.c
index a99b2a6e6a07..3be8da8fed7e 100644
--- a/kernel/time/jiffies.c
+++ b/kernel/time/jiffies.c
@@ -62,7 +62,6 @@ struct clocksource clocksource_jiffies = {
62 .mask = 0xffffffff, /*32bits*/ 62 .mask = 0xffffffff, /*32bits*/
63 .mult = NSEC_PER_JIFFY << JIFFIES_SHIFT, /* details above */ 63 .mult = NSEC_PER_JIFFY << JIFFIES_SHIFT, /* details above */
64 .shift = JIFFIES_SHIFT, 64 .shift = JIFFIES_SHIFT,
65 .is_continuous = 0, /* tick based, not free running */
66}; 65};
67 66
68static int __init init_jiffies_clocksource(void) 67static int __init init_jiffies_clocksource(void)
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
index 3afeaa3a73f9..eb12509e00bd 100644
--- a/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@ -24,7 +24,7 @@ static u64 tick_length, tick_length_base;
24 24
25#define MAX_TICKADJ 500 /* microsecs */ 25#define MAX_TICKADJ 500 /* microsecs */
26#define MAX_TICKADJ_SCALED (((u64)(MAX_TICKADJ * NSEC_PER_USEC) << \ 26#define MAX_TICKADJ_SCALED (((u64)(MAX_TICKADJ * NSEC_PER_USEC) << \
27 TICK_LENGTH_SHIFT) / HZ) 27 TICK_LENGTH_SHIFT) / NTP_INTERVAL_FREQ)
28 28
29/* 29/*
30 * phase-lock loop variables 30 * phase-lock loop variables
@@ -46,13 +46,17 @@ long time_adjust;
46 46
47static void ntp_update_frequency(void) 47static void ntp_update_frequency(void)
48{ 48{
49 tick_length_base = (u64)(tick_usec * NSEC_PER_USEC * USER_HZ) << TICK_LENGTH_SHIFT; 49 u64 second_length = (u64)(tick_usec * NSEC_PER_USEC * USER_HZ)
50 tick_length_base += (s64)CLOCK_TICK_ADJUST << TICK_LENGTH_SHIFT; 50 << TICK_LENGTH_SHIFT;
51 tick_length_base += (s64)time_freq << (TICK_LENGTH_SHIFT - SHIFT_NSEC); 51 second_length += (s64)CLOCK_TICK_ADJUST << TICK_LENGTH_SHIFT;
52 second_length += (s64)time_freq << (TICK_LENGTH_SHIFT - SHIFT_NSEC);
52 53
53 do_div(tick_length_base, HZ); 54 tick_length_base = second_length;
54 55
55 tick_nsec = tick_length_base >> TICK_LENGTH_SHIFT; 56 do_div(second_length, HZ);
57 tick_nsec = second_length >> TICK_LENGTH_SHIFT;
58
59 do_div(tick_length_base, NTP_INTERVAL_FREQ);
56} 60}
57 61
58/** 62/**
@@ -162,7 +166,7 @@ void second_overflow(void)
162 tick_length -= MAX_TICKADJ_SCALED; 166 tick_length -= MAX_TICKADJ_SCALED;
163 } else { 167 } else {
164 tick_length += (s64)(time_adjust * NSEC_PER_USEC / 168 tick_length += (s64)(time_adjust * NSEC_PER_USEC /
165 HZ) << TICK_LENGTH_SHIFT; 169 NTP_INTERVAL_FREQ) << TICK_LENGTH_SHIFT;
166 time_adjust = 0; 170 time_adjust = 0;
167 } 171 }
168 } 172 }
@@ -239,7 +243,8 @@ int do_adjtimex(struct timex *txc)
239 result = -EINVAL; 243 result = -EINVAL;
240 goto leave; 244 goto leave;
241 } 245 }
242 time_freq = ((s64)txc->freq * NSEC_PER_USEC) >> (SHIFT_USEC - SHIFT_NSEC); 246 time_freq = ((s64)txc->freq * NSEC_PER_USEC)
247 >> (SHIFT_USEC - SHIFT_NSEC);
243 } 248 }
244 249
245 if (txc->modes & ADJ_MAXERROR) { 250 if (txc->modes & ADJ_MAXERROR) {
@@ -309,7 +314,8 @@ int do_adjtimex(struct timex *txc)
309 freq_adj += time_freq; 314 freq_adj += time_freq;
310 freq_adj = min(freq_adj, (s64)MAXFREQ_NSEC); 315 freq_adj = min(freq_adj, (s64)MAXFREQ_NSEC);
311 time_freq = max(freq_adj, (s64)-MAXFREQ_NSEC); 316 time_freq = max(freq_adj, (s64)-MAXFREQ_NSEC);
312 time_offset = (time_offset / HZ) << SHIFT_UPDATE; 317 time_offset = (time_offset / NTP_INTERVAL_FREQ)
318 << SHIFT_UPDATE;
313 } /* STA_PLL */ 319 } /* STA_PLL */
314 } /* txc->modes & ADJ_OFFSET */ 320 } /* txc->modes & ADJ_OFFSET */
315 if (txc->modes & ADJ_TICK) 321 if (txc->modes & ADJ_TICK)
@@ -324,8 +330,10 @@ leave: if ((time_status & (STA_UNSYNC|STA_CLOCKERR)) != 0)
324 if ((txc->modes & ADJ_OFFSET_SINGLESHOT) == ADJ_OFFSET_SINGLESHOT) 330 if ((txc->modes & ADJ_OFFSET_SINGLESHOT) == ADJ_OFFSET_SINGLESHOT)
325 txc->offset = save_adjust; 331 txc->offset = save_adjust;
326 else 332 else
327 txc->offset = shift_right(time_offset, SHIFT_UPDATE) * HZ / 1000; 333 txc->offset = shift_right(time_offset, SHIFT_UPDATE)
328 txc->freq = (time_freq / NSEC_PER_USEC) << (SHIFT_USEC - SHIFT_NSEC); 334 * NTP_INTERVAL_FREQ / 1000;
335 txc->freq = (time_freq / NSEC_PER_USEC)
336 << (SHIFT_USEC - SHIFT_NSEC);
329 txc->maxerror = time_maxerror; 337 txc->maxerror = time_maxerror;
330 txc->esterror = time_esterror; 338 txc->esterror = time_esterror;
331 txc->status = time_status; 339 txc->status = time_status;
diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c
new file mode 100644
index 000000000000..12b3efeb9f6f
--- /dev/null
+++ b/kernel/time/tick-broadcast.c
@@ -0,0 +1,480 @@
1/*
2 * linux/kernel/time/tick-broadcast.c
3 *
4 * This file contains functions which emulate a local clock-event
5 * device via a broadcast event source.
6 *
7 * Copyright(C) 2005-2006, Thomas Gleixner <tglx@linutronix.de>
8 * Copyright(C) 2005-2007, Red Hat, Inc., Ingo Molnar
9 * Copyright(C) 2006-2007, Timesys Corp., Thomas Gleixner
10 *
11 * This code is licenced under the GPL version 2. For details see
12 * kernel-base/COPYING.
13 */
14#include <linux/cpu.h>
15#include <linux/err.h>
16#include <linux/hrtimer.h>
17#include <linux/irq.h>
18#include <linux/percpu.h>
19#include <linux/profile.h>
20#include <linux/sched.h>
21#include <linux/tick.h>
22
23#include "tick-internal.h"
24
25/*
26 * Broadcast support for broken x86 hardware, where the local apic
27 * timer stops in C3 state.
28 */
29
30struct tick_device tick_broadcast_device;
31static cpumask_t tick_broadcast_mask;
32static DEFINE_SPINLOCK(tick_broadcast_lock);
33
34/*
35 * Debugging: see timer_list.c
36 */
37struct tick_device *tick_get_broadcast_device(void)
38{
39 return &tick_broadcast_device;
40}
41
42cpumask_t *tick_get_broadcast_mask(void)
43{
44 return &tick_broadcast_mask;
45}
46
47/*
48 * Start the device in periodic mode
49 */
50static void tick_broadcast_start_periodic(struct clock_event_device *bc)
51{
52 if (bc && bc->mode == CLOCK_EVT_MODE_SHUTDOWN)
53 tick_setup_periodic(bc, 1);
54}
55
56/*
57 * Check, if the device can be utilized as broadcast device:
58 */
59int tick_check_broadcast_device(struct clock_event_device *dev)
60{
61 if (tick_broadcast_device.evtdev ||
62 (dev->features & CLOCK_EVT_FEAT_C3STOP))
63 return 0;
64
65 clockevents_exchange_device(NULL, dev);
66 tick_broadcast_device.evtdev = dev;
67 if (!cpus_empty(tick_broadcast_mask))
68 tick_broadcast_start_periodic(dev);
69 return 1;
70}
71
72/*
73 * Check, if the device is the broadcast device
74 */
75int tick_is_broadcast_device(struct clock_event_device *dev)
76{
77 return (dev && tick_broadcast_device.evtdev == dev);
78}
79
80/*
 81 * Check, if the device is dysfunctional and a placeholder, which
82 * needs to be handled by the broadcast device.
83 */
84int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu)
85{
86 unsigned long flags;
87 int ret = 0;
88
89 spin_lock_irqsave(&tick_broadcast_lock, flags);
90
91 /*
92 * Devices might be registered with both periodic and oneshot
93 * mode disabled. This signals, that the device needs to be
94 * operated from the broadcast device and is a placeholder for
95 * the cpu local device.
96 */
97 if (!tick_device_is_functional(dev)) {
98 dev->event_handler = tick_handle_periodic;
99 cpu_set(cpu, tick_broadcast_mask);
100 tick_broadcast_start_periodic(tick_broadcast_device.evtdev);
101 ret = 1;
102 }
103
104 spin_unlock_irqrestore(&tick_broadcast_lock, flags);
105 return ret;
106}
107
108/*
109 * Broadcast the event to the cpus, which are set in the mask
110 */
111int tick_do_broadcast(cpumask_t mask)
112{
113 int ret = 0, cpu = smp_processor_id();
114 struct tick_device *td;
115
116 /*
117 * Check, if the current cpu is in the mask
118 */
119 if (cpu_isset(cpu, mask)) {
120 cpu_clear(cpu, mask);
121 td = &per_cpu(tick_cpu_device, cpu);
122 td->evtdev->event_handler(td->evtdev);
123 ret = 1;
124 }
125
126 if (!cpus_empty(mask)) {
127 /*
128 * It might be necessary to actually check whether the devices
129 * have different broadcast functions. For now, just use the
130 * one of the first device. This works as long as we have this
131 * misfeature only on x86 (lapic)
132 */
133 cpu = first_cpu(mask);
134 td = &per_cpu(tick_cpu_device, cpu);
135 td->evtdev->broadcast(mask);
136 ret = 1;
137 }
138 return ret;
139}
140
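tick_do_broadcast() services the current CPU directly and hands the rest of the mask to one broadcast call. A userspace sketch of that mask handling (CPU numbers are invented; the bit-scan uses a GCC/Clang builtin):

    /* Userspace sketch of the mask handling above: the local CPU is
     * serviced directly, the remaining CPUs via one broadcast call.
     * The mask and CPU numbers are invented for illustration. */
    #include <stdio.h>

    int main(void)
    {
        unsigned long mask = (1UL << 0) | (1UL << 2) | (1UL << 3); /* cpus 0,2,3 */
        int this_cpu = 2;                                          /* assumed */

        if (mask & (1UL << this_cpu)) {
            mask &= ~(1UL << this_cpu);
            printf("cpu %d: run local event handler directly\n", this_cpu);
        }

        if (mask) {
            /* the kernel uses the broadcast() method of the first remaining cpu */
            int first = __builtin_ctzl(mask);     /* GCC/Clang builtin */
            printf("cpu %d sends broadcast IPI for mask 0x%lx\n", first, mask);
        }
        return 0;
    }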
141/*
142 * Periodic broadcast:
143 * - invoke the broadcast handlers
144 */
145static void tick_do_periodic_broadcast(void)
146{
147 cpumask_t mask;
148
149 spin_lock(&tick_broadcast_lock);
150
151 cpus_and(mask, cpu_online_map, tick_broadcast_mask);
152 tick_do_broadcast(mask);
153
154 spin_unlock(&tick_broadcast_lock);
155}
156
157/*
158 * Event handler for periodic broadcast ticks
159 */
160static void tick_handle_periodic_broadcast(struct clock_event_device *dev)
161{
162 dev->next_event.tv64 = KTIME_MAX;
163
164 tick_do_periodic_broadcast();
165
166 /*
167 * The device is in periodic mode. No reprogramming necessary:
168 */
169 if (dev->mode == CLOCK_EVT_MODE_PERIODIC)
170 return;
171
172 /*
173 * Setup the next period for devices, which do not have
174 * periodic mode:
175 */
176 for (;;) {
177 ktime_t next = ktime_add(dev->next_event, tick_period);
178
179 if (!clockevents_program_event(dev, next, ktime_get()))
180 return;
181 tick_do_periodic_broadcast();
182 }
183}
184
185/*
186 * Powerstate information: The system enters/leaves a state, where
187 * affected devices might stop
188 */
189static void tick_do_broadcast_on_off(void *why)
190{
191 struct clock_event_device *bc, *dev;
192 struct tick_device *td;
193 unsigned long flags, *reason = why;
194 int cpu;
195
196 spin_lock_irqsave(&tick_broadcast_lock, flags);
197
198 cpu = smp_processor_id();
199 td = &per_cpu(tick_cpu_device, cpu);
200 dev = td->evtdev;
201 bc = tick_broadcast_device.evtdev;
202
203 /*
204 * Is the device in broadcast mode forever or is it not
205 * affected by the powerstate ?
206 */
207 if (!dev || !tick_device_is_functional(dev) ||
208 !(dev->features & CLOCK_EVT_FEAT_C3STOP))
209 goto out;
210
211 if (*reason == CLOCK_EVT_NOTIFY_BROADCAST_ON) {
212 if (!cpu_isset(cpu, tick_broadcast_mask)) {
213 cpu_set(cpu, tick_broadcast_mask);
214 if (td->mode == TICKDEV_MODE_PERIODIC)
215 clockevents_set_mode(dev,
216 CLOCK_EVT_MODE_SHUTDOWN);
217 }
218 } else {
219 if (cpu_isset(cpu, tick_broadcast_mask)) {
220 cpu_clear(cpu, tick_broadcast_mask);
221 if (td->mode == TICKDEV_MODE_PERIODIC)
222 tick_setup_periodic(dev, 0);
223 }
224 }
225
226 if (cpus_empty(tick_broadcast_mask))
227 clockevents_set_mode(bc, CLOCK_EVT_MODE_SHUTDOWN);
228 else {
229 if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC)
230 tick_broadcast_start_periodic(bc);
231 else
232 tick_broadcast_setup_oneshot(bc);
233 }
234out:
235 spin_unlock_irqrestore(&tick_broadcast_lock, flags);
236}
237
238/*
239 * Powerstate information: The system enters/leaves a state, where
240 * affected devices might stop.
241 */
242void tick_broadcast_on_off(unsigned long reason, int *oncpu)
243{
244 int cpu = get_cpu();
245
246 if (cpu == *oncpu)
247 tick_do_broadcast_on_off(&reason);
248 else
249 smp_call_function_single(*oncpu, tick_do_broadcast_on_off,
250 &reason, 1, 1);
251 put_cpu();
252}
253
254/*
255 * Set the periodic handler depending on broadcast on/off
256 */
257void tick_set_periodic_handler(struct clock_event_device *dev, int broadcast)
258{
259 if (!broadcast)
260 dev->event_handler = tick_handle_periodic;
261 else
262 dev->event_handler = tick_handle_periodic_broadcast;
263}
264
265/*
266 * Remove a CPU from broadcasting
267 */
268void tick_shutdown_broadcast(unsigned int *cpup)
269{
270 struct clock_event_device *bc;
271 unsigned long flags;
272 unsigned int cpu = *cpup;
273
274 spin_lock_irqsave(&tick_broadcast_lock, flags);
275
276 bc = tick_broadcast_device.evtdev;
277 cpu_clear(cpu, tick_broadcast_mask);
278
279 if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC) {
280 if (bc && cpus_empty(tick_broadcast_mask))
281 clockevents_set_mode(bc, CLOCK_EVT_MODE_SHUTDOWN);
282 }
283
284 spin_unlock_irqrestore(&tick_broadcast_lock, flags);
285}
286
287#ifdef CONFIG_TICK_ONESHOT
288
289static cpumask_t tick_broadcast_oneshot_mask;
290
291/*
292 * Debugging: see timer_list.c
293 */
294cpumask_t *tick_get_broadcast_oneshot_mask(void)
295{
296 return &tick_broadcast_oneshot_mask;
297}
298
299static int tick_broadcast_set_event(ktime_t expires, int force)
300{
301 struct clock_event_device *bc = tick_broadcast_device.evtdev;
302 ktime_t now = ktime_get();
303 int res;
304
305 for(;;) {
306 res = clockevents_program_event(bc, expires, now);
307 if (!res || !force)
308 return res;
309 now = ktime_get();
310 expires = ktime_add(now, ktime_set(0, bc->min_delta_ns));
311 }
312}
313
314/*
315 * Reprogram the broadcast device:
316 *
317 * Called with tick_broadcast_lock held and interrupts disabled.
318 */
319static int tick_broadcast_reprogram(void)
320{
321 ktime_t expires = { .tv64 = KTIME_MAX };
322 struct tick_device *td;
323 int cpu;
324
325 /*
326 * Find the event which expires next:
327 */
328 for (cpu = first_cpu(tick_broadcast_oneshot_mask); cpu != NR_CPUS;
329 cpu = next_cpu(cpu, tick_broadcast_oneshot_mask)) {
330 td = &per_cpu(tick_cpu_device, cpu);
331 if (td->evtdev->next_event.tv64 < expires.tv64)
332 expires = td->evtdev->next_event;
333 }
334
335 if (expires.tv64 == KTIME_MAX)
336 return 0;
337
338 return tick_broadcast_set_event(expires, 0);
339}
340
341/*
342 * Handle oneshot mode broadcasting
343 */
344static void tick_handle_oneshot_broadcast(struct clock_event_device *dev)
345{
346 struct tick_device *td;
347 cpumask_t mask;
348 ktime_t now;
349 int cpu;
350
351 spin_lock(&tick_broadcast_lock);
352again:
353 dev->next_event.tv64 = KTIME_MAX;
354 mask = CPU_MASK_NONE;
355 now = ktime_get();
356 /* Find all expired events */
357 for (cpu = first_cpu(tick_broadcast_oneshot_mask); cpu != NR_CPUS;
358 cpu = next_cpu(cpu, tick_broadcast_oneshot_mask)) {
359 td = &per_cpu(tick_cpu_device, cpu);
360 if (td->evtdev->next_event.tv64 <= now.tv64)
361 cpu_set(cpu, mask);
362 }
363
364 /*
365 * Wakeup the cpus which have an expired event. The broadcast
366 * device is reprogrammed in the return from idle code.
367 */
368 if (!tick_do_broadcast(mask)) {
369 /*
370 * The global event did not expire any CPU local
371 * events. This happens in dyntick mode, as the
372 * maximum PIT delta is quite small.
373 */
374 if (tick_broadcast_reprogram())
375 goto again;
376 }
377 spin_unlock(&tick_broadcast_lock);
378}
379
380/*
381 * Powerstate information: The system enters/leaves a state, where
382 * affected devices might stop
383 */
384void tick_broadcast_oneshot_control(unsigned long reason)
385{
386 struct clock_event_device *bc, *dev;
387 struct tick_device *td;
388 unsigned long flags;
389 int cpu;
390
391 spin_lock_irqsave(&tick_broadcast_lock, flags);
392
393 /*
394 * Periodic mode does not care about the enter/exit of power
395 * states
396 */
397 if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC)
398 goto out;
399
400 bc = tick_broadcast_device.evtdev;
401 cpu = smp_processor_id();
402 td = &per_cpu(tick_cpu_device, cpu);
403 dev = td->evtdev;
404
405 if (!(dev->features & CLOCK_EVT_FEAT_C3STOP))
406 goto out;
407
408 if (reason == CLOCK_EVT_NOTIFY_BROADCAST_ENTER) {
409 if (!cpu_isset(cpu, tick_broadcast_oneshot_mask)) {
410 cpu_set(cpu, tick_broadcast_oneshot_mask);
411 clockevents_set_mode(dev, CLOCK_EVT_MODE_SHUTDOWN);
412 if (dev->next_event.tv64 < bc->next_event.tv64)
413 tick_broadcast_set_event(dev->next_event, 1);
414 }
415 } else {
416 if (cpu_isset(cpu, tick_broadcast_oneshot_mask)) {
417 cpu_clear(cpu, tick_broadcast_oneshot_mask);
418 clockevents_set_mode(dev, CLOCK_EVT_MODE_ONESHOT);
419 if (dev->next_event.tv64 != KTIME_MAX)
420 tick_program_event(dev->next_event, 1);
421 }
422 }
423
424out:
425 spin_unlock_irqrestore(&tick_broadcast_lock, flags);
426}
427
428/**
429 * tick_broadcast_setup_highres - setup the broadcast device for highres
430 */
431void tick_broadcast_setup_oneshot(struct clock_event_device *bc)
432{
433 if (bc->mode != CLOCK_EVT_MODE_ONESHOT) {
434 bc->event_handler = tick_handle_oneshot_broadcast;
435 clockevents_set_mode(bc, CLOCK_EVT_MODE_ONESHOT);
436 bc->next_event.tv64 = KTIME_MAX;
437 }
438}
439
440/*
441 * Select oneshot operating mode for the broadcast device
442 */
443void tick_broadcast_switch_to_oneshot(void)
444{
445 struct clock_event_device *bc;
446 unsigned long flags;
447
448 spin_lock_irqsave(&tick_broadcast_lock, flags);
449
450 tick_broadcast_device.mode = TICKDEV_MODE_ONESHOT;
451 bc = tick_broadcast_device.evtdev;
452 if (bc)
453 tick_broadcast_setup_oneshot(bc);
454 spin_unlock_irqrestore(&tick_broadcast_lock, flags);
455}
456
457
458/*
459 * Remove a dead CPU from broadcasting
460 */
461void tick_shutdown_broadcast_oneshot(unsigned int *cpup)
462{
463 struct clock_event_device *bc;
464 unsigned long flags;
465 unsigned int cpu = *cpup;
466
467 spin_lock_irqsave(&tick_broadcast_lock, flags);
468
469 bc = tick_broadcast_device.evtdev;
470 cpu_clear(cpu, tick_broadcast_oneshot_mask);
471
472 if (tick_broadcast_device.mode == TICKDEV_MODE_ONESHOT) {
473 if (bc && cpus_empty(tick_broadcast_oneshot_mask))
474 clockevents_set_mode(bc, CLOCK_EVT_MODE_SHUTDOWN);
475 }
476
477 spin_unlock_irqrestore(&tick_broadcast_lock, flags);
478}
479
480#endif
diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c
new file mode 100644
index 000000000000..4500e347f1bb
--- /dev/null
+++ b/kernel/time/tick-common.c
@@ -0,0 +1,346 @@
1/*
2 * linux/kernel/time/tick-common.c
3 *
4 * This file contains the base functions to manage periodic tick
5 * related events.
6 *
7 * Copyright(C) 2005-2006, Thomas Gleixner <tglx@linutronix.de>
8 * Copyright(C) 2005-2007, Red Hat, Inc., Ingo Molnar
9 * Copyright(C) 2006-2007, Timesys Corp., Thomas Gleixner
10 *
11 * This code is licenced under the GPL version 2. For details see
12 * kernel-base/COPYING.
13 */
14#include <linux/cpu.h>
15#include <linux/err.h>
16#include <linux/hrtimer.h>
17#include <linux/irq.h>
18#include <linux/percpu.h>
19#include <linux/profile.h>
20#include <linux/sched.h>
21#include <linux/tick.h>
22
23#include "tick-internal.h"
24
25/*
26 * Tick devices
27 */
28DEFINE_PER_CPU(struct tick_device, tick_cpu_device);
29/*
30 * Tick next event: keeps track of the tick time
31 */
32ktime_t tick_next_period;
33ktime_t tick_period;
34static int tick_do_timer_cpu = -1;
35DEFINE_SPINLOCK(tick_device_lock);
36
37/*
38 * Debugging: see timer_list.c
39 */
40struct tick_device *tick_get_device(int cpu)
41{
42 return &per_cpu(tick_cpu_device, cpu);
43}
44
45/**
46 * tick_is_oneshot_available - check for a oneshot capable event device
47 */
48int tick_is_oneshot_available(void)
49{
50 struct clock_event_device *dev = __get_cpu_var(tick_cpu_device).evtdev;
51
52 return dev && (dev->features & CLOCK_EVT_FEAT_ONESHOT);
53}
54
55/*
56 * Periodic tick
57 */
58static void tick_periodic(int cpu)
59{
60 if (tick_do_timer_cpu == cpu) {
61 write_seqlock(&xtime_lock);
62
63 /* Keep track of the next tick event */
64 tick_next_period = ktime_add(tick_next_period, tick_period);
65
66 do_timer(1);
67 write_sequnlock(&xtime_lock);
68 }
69
70 update_process_times(user_mode(get_irq_regs()));
71 profile_tick(CPU_PROFILING);
72}
73
74/*
75 * Event handler for periodic ticks
76 */
77void tick_handle_periodic(struct clock_event_device *dev)
78{
79 int cpu = smp_processor_id();
80
81 tick_periodic(cpu);
82
83 if (dev->mode != CLOCK_EVT_MODE_ONESHOT)
84 return;
85 /*
86 * Setup the next period for devices, which do not have
87 * periodic mode:
88 */
89 for (;;) {
90 ktime_t next = ktime_add(dev->next_event, tick_period);
91
92 if (!clockevents_program_event(dev, next, ktime_get()))
93 return;
94 tick_periodic(cpu);
95 }
96}
97
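For devices without a true periodic mode, the loop above keeps adding one tick period to the expiry until programming succeeds, calling tick_periodic() for every period that was missed. A standalone sketch of that catch-up logic with invented timestamps:

    /* Userspace sketch of the catch-up loop above; all times are invented. */
    #include <stdio.h>
    #include <stdint.h>

    int main(void)
    {
        int64_t period_ns = 4000000;        /* 4 ms tick, i.e. HZ=250 (assumed) */
        int64_t now_ns    = 1000000000;     /* "current" time, assumed */
        int64_t next_ns   = 990000000;      /* last programmed event, in the past */
        int missed = 0;

        for (;;) {
            next_ns += period_ns;
            if (next_ns > now_ns) {         /* programming succeeds: event is ahead */
                printf("programmed next tick at %lld (caught up %d missed ticks)\n",
                       (long long)next_ns, missed);
                break;
            }
            missed++;                       /* corresponds to the extra tick_periodic() */
        }
        return 0;
    }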
98/*
99 * Setup the device for a periodic tick
100 */
101void tick_setup_periodic(struct clock_event_device *dev, int broadcast)
102{
103 tick_set_periodic_handler(dev, broadcast);
104
105 /* Broadcast setup ? */
106 if (!tick_device_is_functional(dev))
107 return;
108
109 if (dev->features & CLOCK_EVT_FEAT_PERIODIC) {
110 clockevents_set_mode(dev, CLOCK_EVT_MODE_PERIODIC);
111 } else {
112 unsigned long seq;
113 ktime_t next;
114
115 do {
116 seq = read_seqbegin(&xtime_lock);
117 next = tick_next_period;
118 } while (read_seqretry(&xtime_lock, seq));
119
120 clockevents_set_mode(dev, CLOCK_EVT_MODE_ONESHOT);
121
122 for (;;) {
123 if (!clockevents_program_event(dev, next, ktime_get()))
124 return;
125 next = ktime_add(next, tick_period);
126 }
127 }
128}
129
130/*
131 * Setup the tick device
132 */
133static void tick_setup_device(struct tick_device *td,
134 struct clock_event_device *newdev, int cpu,
135 cpumask_t cpumask)
136{
137 ktime_t next_event;
138 void (*handler)(struct clock_event_device *) = NULL;
139
140 /*
141 * First device setup ?
142 */
143 if (!td->evtdev) {
144 /*
145 * If no cpu took the do_timer update, assign it to
146 * this cpu:
147 */
148 if (tick_do_timer_cpu == -1) {
149 tick_do_timer_cpu = cpu;
150 tick_next_period = ktime_get();
151 tick_period = ktime_set(0, NSEC_PER_SEC / HZ);
152 }
153
154 /*
155 * Startup in periodic mode first.
156 */
157 td->mode = TICKDEV_MODE_PERIODIC;
158 } else {
159 handler = td->evtdev->event_handler;
160 next_event = td->evtdev->next_event;
161 }
162
163 td->evtdev = newdev;
164
165 /*
166 * When the device is not per cpu, pin the interrupt to the
167 * current cpu:
168 */
169 if (!cpus_equal(newdev->cpumask, cpumask))
170 irq_set_affinity(newdev->irq, cpumask);
171
172 /*
173 * When global broadcasting is active, check if the current
174 * device is registered as a placeholder for broadcast mode.
175 * This allows us to handle this x86 misfeature in a generic
176 * way.
177 */
178 if (tick_device_uses_broadcast(newdev, cpu))
179 return;
180
181 if (td->mode == TICKDEV_MODE_PERIODIC)
182 tick_setup_periodic(newdev, 0);
183 else
184 tick_setup_oneshot(newdev, handler, next_event);
185}
186
187/*
188 * Check, if the new registered device should be used.
189 */
190static int tick_check_new_device(struct clock_event_device *newdev)
191{
192 struct clock_event_device *curdev;
193 struct tick_device *td;
194 int cpu, ret = NOTIFY_OK;
195 unsigned long flags;
196 cpumask_t cpumask;
197
198 spin_lock_irqsave(&tick_device_lock, flags);
199
200 cpu = smp_processor_id();
201 if (!cpu_isset(cpu, newdev->cpumask))
202 goto out;
203
204 td = &per_cpu(tick_cpu_device, cpu);
205 curdev = td->evtdev;
206 cpumask = cpumask_of_cpu(cpu);
207
208 /* cpu local device ? */
209 if (!cpus_equal(newdev->cpumask, cpumask)) {
210
211 /*
212 * If the cpu affinity of the device interrupt can not
213 * be set, ignore it.
214 */
215 if (!irq_can_set_affinity(newdev->irq))
216 goto out_bc;
217
218 /*
219 * If we have a cpu local device already, do not replace it
220 * by a non cpu local device
221 */
222 if (curdev && cpus_equal(curdev->cpumask, cpumask))
223 goto out_bc;
224 }
225
226 /*
227 * If we have an active device, then check the rating and the oneshot
228 * feature.
229 */
230 if (curdev) {
231 /*
232 * Prefer one shot capable devices !
233 */
234 if ((curdev->features & CLOCK_EVT_FEAT_ONESHOT) &&
235 !(newdev->features & CLOCK_EVT_FEAT_ONESHOT))
236 goto out_bc;
237 /*
238 * Check the rating
239 */
240 if (curdev->rating >= newdev->rating)
241 goto out_bc;
242 }
243
244 /*
 245 * Replace the existing device, if any, by the new
246 * device. If the current device is the broadcast device, do
247 * not give it back to the clockevents layer !
248 */
249 if (tick_is_broadcast_device(curdev)) {
250 clockevents_set_mode(curdev, CLOCK_EVT_MODE_SHUTDOWN);
251 curdev = NULL;
252 }
253 clockevents_exchange_device(curdev, newdev);
254 tick_setup_device(td, newdev, cpu, cpumask);
255 if (newdev->features & CLOCK_EVT_FEAT_ONESHOT)
256 tick_oneshot_notify();
257
258 spin_unlock_irqrestore(&tick_device_lock, flags);
259 return NOTIFY_STOP;
260
261out_bc:
262 /*
263 * Can the new device be used as a broadcast device ?
264 */
265 if (tick_check_broadcast_device(newdev))
266 ret = NOTIFY_STOP;
267out:
268 spin_unlock_irqrestore(&tick_device_lock, flags);
269
270 return ret;
271}
272
273/*
274 * Shutdown an event device on a given cpu:
275 *
 276 * This is called on a live CPU, when a CPU is dead. So we cannot
277 * access the hardware device itself.
278 * We just set the mode and remove it from the lists.
279 */
280static void tick_shutdown(unsigned int *cpup)
281{
282 struct tick_device *td = &per_cpu(tick_cpu_device, *cpup);
283 struct clock_event_device *dev = td->evtdev;
284 unsigned long flags;
285
286 spin_lock_irqsave(&tick_device_lock, flags);
287 td->mode = TICKDEV_MODE_PERIODIC;
288 if (dev) {
289 /*
 290 * Prevent the clock events layer from calling the
 291 * set mode function!
292 */
293 dev->mode = CLOCK_EVT_MODE_UNUSED;
294 clockevents_exchange_device(dev, NULL);
295 td->evtdev = NULL;
296 }
297 spin_unlock_irqrestore(&tick_device_lock, flags);
298}
299
300/*
301 * Notification about clock event devices
302 */
303static int tick_notify(struct notifier_block *nb, unsigned long reason,
304 void *dev)
305{
306 switch (reason) {
307
308 case CLOCK_EVT_NOTIFY_ADD:
309 return tick_check_new_device(dev);
310
311 case CLOCK_EVT_NOTIFY_BROADCAST_ON:
312 case CLOCK_EVT_NOTIFY_BROADCAST_OFF:
313 tick_broadcast_on_off(reason, dev);
314 break;
315
316 case CLOCK_EVT_NOTIFY_BROADCAST_ENTER:
317 case CLOCK_EVT_NOTIFY_BROADCAST_EXIT:
318 tick_broadcast_oneshot_control(reason);
319 break;
320
321 case CLOCK_EVT_NOTIFY_CPU_DEAD:
322 tick_shutdown_broadcast_oneshot(dev);
323 tick_shutdown_broadcast(dev);
324 tick_shutdown(dev);
325 break;
326
327 default:
328 break;
329 }
330
331 return NOTIFY_OK;
332}
333
334static struct notifier_block tick_notifier = {
335 .notifier_call = tick_notify,
336};
337
338/**
339 * tick_init - initialize the tick control
340 *
341 * Register the notifier with the clockevents framework
342 */
343void __init tick_init(void)
344{
345 clockevents_register_notifier(&tick_notifier);
346}
diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h
new file mode 100644
index 000000000000..54861a0f29ff
--- /dev/null
+++ b/kernel/time/tick-internal.h
@@ -0,0 +1,110 @@
1/*
2 * tick internal variable and functions used by low/high res code
3 */
4DECLARE_PER_CPU(struct tick_device, tick_cpu_device);
5extern spinlock_t tick_device_lock;
6extern ktime_t tick_next_period;
7extern ktime_t tick_period;
8
9extern void tick_setup_periodic(struct clock_event_device *dev, int broadcast);
10extern void tick_handle_periodic(struct clock_event_device *dev);
11
12/*
13 * NO_HZ / high resolution timer shared code
14 */
15#ifdef CONFIG_TICK_ONESHOT
16extern void tick_setup_oneshot(struct clock_event_device *newdev,
17 void (*handler)(struct clock_event_device *),
18 ktime_t nextevt);
19extern int tick_program_event(ktime_t expires, int force);
20extern void tick_oneshot_notify(void);
21extern int tick_switch_to_oneshot(void (*handler)(struct clock_event_device *));
22
23# ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST
24extern void tick_broadcast_setup_oneshot(struct clock_event_device *bc);
25extern void tick_broadcast_oneshot_control(unsigned long reason);
26extern void tick_broadcast_switch_to_oneshot(void);
27extern void tick_shutdown_broadcast_oneshot(unsigned int *cpup);
28# else /* BROADCAST */
29static inline void tick_broadcast_setup_oneshot(struct clock_event_device *bc)
30{
31 BUG();
32}
33static inline void tick_broadcast_oneshot_control(unsigned long reason) { }
34static inline void tick_broadcast_switch_to_oneshot(void) { }
35static inline void tick_shutdown_broadcast_oneshot(unsigned int *cpup) { }
36# endif /* !BROADCAST */
37
38#else /* !ONESHOT */
39static inline
40void tick_setup_oneshot(struct clock_event_device *newdev,
41 void (*handler)(struct clock_event_device *),
42 ktime_t nextevt)
43{
44 BUG();
45}
46static inline int tick_program_event(ktime_t expires, int force)
47{
48 return 0;
49}
50static inline void tick_oneshot_notify(void) { }
51static inline void tick_broadcast_setup_oneshot(struct clock_event_device *bc)
52{
53 BUG();
54}
55static inline void tick_broadcast_oneshot_control(unsigned long reason) { }
56static inline void tick_shutdown_broadcast_oneshot(unsigned int *cpup) { }
57#endif /* !TICK_ONESHOT */
58
59/*
60 * Broadcasting support
61 */
62#ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST
63extern int tick_do_broadcast(cpumask_t mask);
64
65extern int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu);
66extern int tick_check_broadcast_device(struct clock_event_device *dev);
67extern int tick_is_broadcast_device(struct clock_event_device *dev);
68extern void tick_broadcast_on_off(unsigned long reason, int *oncpu);
69extern void tick_shutdown_broadcast(unsigned int *cpup);
70
71extern void
72tick_set_periodic_handler(struct clock_event_device *dev, int broadcast);
73
74#else /* !BROADCAST */
75
76static inline int tick_check_broadcast_device(struct clock_event_device *dev)
77{
78 return 0;
79}
80
81static inline int tick_is_broadcast_device(struct clock_event_device *dev)
82{
83 return 0;
84}
85static inline int tick_device_uses_broadcast(struct clock_event_device *dev,
86 int cpu)
87{
88 return 0;
89}
90static inline void tick_do_periodic_broadcast(struct clock_event_device *d) { }
91static inline void tick_broadcast_on_off(unsigned long reason, int *oncpu) { }
92static inline void tick_shutdown_broadcast(unsigned int *cpup) { }
93
94/*
95 * Set the periodic handler in non broadcast mode
96 */
97static inline void tick_set_periodic_handler(struct clock_event_device *dev,
98 int broadcast)
99{
100 dev->event_handler = tick_handle_periodic;
101}
102#endif /* !BROADCAST */
103
104/*
105 * Check, if the device is functional or a dummy for broadcast
106 */
107static inline int tick_device_is_functional(struct clock_event_device *dev)
108{
109 return !(dev->features & CLOCK_EVT_FEAT_DUMMY);
110}
diff --git a/kernel/time/tick-oneshot.c b/kernel/time/tick-oneshot.c
new file mode 100644
index 000000000000..2e8b7ff863cc
--- /dev/null
+++ b/kernel/time/tick-oneshot.c
@@ -0,0 +1,84 @@
1/*
2 * linux/kernel/time/tick-oneshot.c
3 *
4 * This file contains functions which manage high resolution tick
5 * related events.
6 *
7 * Copyright(C) 2005-2006, Thomas Gleixner <tglx@linutronix.de>
8 * Copyright(C) 2005-2007, Red Hat, Inc., Ingo Molnar
9 * Copyright(C) 2006-2007, Timesys Corp., Thomas Gleixner
10 *
11 * This code is licenced under the GPL version 2. For details see
12 * kernel-base/COPYING.
13 */
14#include <linux/cpu.h>
15#include <linux/err.h>
16#include <linux/hrtimer.h>
17#include <linux/irq.h>
18#include <linux/percpu.h>
19#include <linux/profile.h>
20#include <linux/sched.h>
21#include <linux/tick.h>
22
23#include "tick-internal.h"
24
25/**
 26 * tick_program_event - program the per-cpu tick device for the next event
27 */
28int tick_program_event(ktime_t expires, int force)
29{
30 struct clock_event_device *dev = __get_cpu_var(tick_cpu_device).evtdev;
31 ktime_t now = ktime_get();
32
33 while (1) {
34 int ret = clockevents_program_event(dev, expires, now);
35
36 if (!ret || !force)
37 return ret;
38 now = ktime_get();
39 expires = ktime_add(now, ktime_set(0, dev->min_delta_ns));
40 }
41}
42
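The force path above retries with now + min_delta_ns whenever the requested expiry has already passed. A small userspace sketch of the same retry loop, with a stubbed-out programming function and assumed values:

    /* Userspace sketch of the "force" retry loop above; values are invented
     * and program_event() is a stand-in, not a real kernel call. */
    #include <stdio.h>
    #include <stdint.h>

    static int program_event(int64_t expires, int64_t now)
    {
        return (expires <= now) ? -1 : 0;   /* -ETIME stand-in */
    }

    int main(void)
    {
        int64_t now = 1000000000, expires = 999000000;  /* 1 ms in the past (assumed) */
        int64_t min_delta_ns = 12571;                   /* assumed device minimum */
        int force = 1;

        for (;;) {
            int ret = program_event(expires, now);
            if (!ret || !force) {
                printf("programmed for %lld (ret=%d)\n", (long long)expires, ret);
                break;
            }
            now += 2000;                    /* pretend a little time passes */
            expires = now + min_delta_ns;   /* retry just beyond the minimum */
        }
        return 0;
    }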
43/**
44 * tick_setup_oneshot - setup the event device for oneshot mode (hres or nohz)
45 */
46void tick_setup_oneshot(struct clock_event_device *newdev,
47 void (*handler)(struct clock_event_device *),
48 ktime_t next_event)
49{
50 newdev->event_handler = handler;
51 clockevents_set_mode(newdev, CLOCK_EVT_MODE_ONESHOT);
52 clockevents_program_event(newdev, next_event, ktime_get());
53}
54
55/**
56 * tick_switch_to_oneshot - switch to oneshot mode
57 */
58int tick_switch_to_oneshot(void (*handler)(struct clock_event_device *))
59{
60 struct tick_device *td = &__get_cpu_var(tick_cpu_device);
61 struct clock_event_device *dev = td->evtdev;
62
63 if (!dev || !(dev->features & CLOCK_EVT_FEAT_ONESHOT) ||
64 !tick_device_is_functional(dev))
65 return -EINVAL;
66
67 td->mode = TICKDEV_MODE_ONESHOT;
68 dev->event_handler = handler;
69 clockevents_set_mode(dev, CLOCK_EVT_MODE_ONESHOT);
70 tick_broadcast_switch_to_oneshot();
71 return 0;
72}
73
74#ifdef CONFIG_HIGH_RES_TIMERS
75/**
76 * tick_init_highres - switch to high resolution mode
77 *
78 * Called with interrupts disabled.
79 */
80int tick_init_highres(void)
81{
82 return tick_switch_to_oneshot(hrtimer_interrupt);
83}
84#endif
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
new file mode 100644
index 000000000000..512a4a906467
--- /dev/null
+++ b/kernel/time/tick-sched.c
@@ -0,0 +1,565 @@
1/*
2 * linux/kernel/time/tick-sched.c
3 *
4 * Copyright(C) 2005-2006, Thomas Gleixner <tglx@linutronix.de>
5 * Copyright(C) 2005-2007, Red Hat, Inc., Ingo Molnar
6 * Copyright(C) 2006-2007 Timesys Corp., Thomas Gleixner
7 *
8 * No idle tick implementation for low and high resolution timers
9 *
10 * Started by: Thomas Gleixner and Ingo Molnar
11 *
12 * For licencing details see kernel-base/COPYING
13 */
14#include <linux/cpu.h>
15#include <linux/err.h>
16#include <linux/hrtimer.h>
17#include <linux/interrupt.h>
18#include <linux/kernel_stat.h>
19#include <linux/percpu.h>
20#include <linux/profile.h>
21#include <linux/sched.h>
22#include <linux/tick.h>
23
24#include "tick-internal.h"
25
26/*
27 * Per cpu nohz control structure
28 */
29static DEFINE_PER_CPU(struct tick_sched, tick_cpu_sched);
30
31/*
32 * The time, when the last jiffy update happened. Protected by xtime_lock.
33 */
34static ktime_t last_jiffies_update;
35
36struct tick_sched *tick_get_tick_sched(int cpu)
37{
38 return &per_cpu(tick_cpu_sched, cpu);
39}
40
41/*
42 * Must be called with interrupts disabled !
43 */
44static void tick_do_update_jiffies64(ktime_t now)
45{
46 unsigned long ticks = 0;
47 ktime_t delta;
48
 49 /* Reevaluate with xtime_lock held */
50 write_seqlock(&xtime_lock);
51
52 delta = ktime_sub(now, last_jiffies_update);
53 if (delta.tv64 >= tick_period.tv64) {
54
55 delta = ktime_sub(delta, tick_period);
56 last_jiffies_update = ktime_add(last_jiffies_update,
57 tick_period);
58
59 /* Slow path for long timeouts */
60 if (unlikely(delta.tv64 >= tick_period.tv64)) {
61 s64 incr = ktime_to_ns(tick_period);
62
63 ticks = ktime_divns(delta, incr);
64
65 last_jiffies_update = ktime_add_ns(last_jiffies_update,
66 incr * ticks);
67 }
68 do_timer(++ticks);
69 }
70 write_sequnlock(&xtime_lock);
71}
72
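The slow path above computes how many whole tick periods elapsed since the last jiffies update and advances jiffies by that amount in one go. A userspace sketch of the same arithmetic with assumed numbers:

    /* Userspace sketch of the jiffies catch-up arithmetic above; the tick
     * period and elapsed time are assumed example values. */
    #include <stdio.h>
    #include <stdint.h>

    int main(void)
    {
        int64_t period = 4000000;            /* 4 ms tick period (assumed) */
        int64_t delta  = 25500000;           /* 25.5 ms since last update (assumed) */
        uint64_t ticks = 0;

        if (delta >= period) {
            delta -= period;                 /* fast path: one tick */
            if (delta >= period)             /* slow path: long idle sleep */
                ticks = delta / period;
            /* jiffies advances by ticks + 1, mirroring do_timer(++ticks) */
            printf("advance jiffies by %llu\n", (unsigned long long)(ticks + 1));
        }
        return 0;
    }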
73/*
 74 * Initialize and return the jiffies update.
75 */
76static ktime_t tick_init_jiffy_update(void)
77{
78 ktime_t period;
79
80 write_seqlock(&xtime_lock);
81 /* Did we start the jiffies update yet ? */
82 if (last_jiffies_update.tv64 == 0)
83 last_jiffies_update = tick_next_period;
84 period = last_jiffies_update;
85 write_sequnlock(&xtime_lock);
86 return period;
87}
88
89/*
90 * NOHZ - aka dynamic tick functionality
91 */
92#ifdef CONFIG_NO_HZ
93/*
94 * NO HZ enabled ?
95 */
96static int tick_nohz_enabled __read_mostly = 1;
97
98/*
99 * Enable / Disable tickless mode
100 */
101static int __init setup_tick_nohz(char *str)
102{
103 if (!strcmp(str, "off"))
104 tick_nohz_enabled = 0;
105 else if (!strcmp(str, "on"))
106 tick_nohz_enabled = 1;
107 else
108 return 0;
109 return 1;
110}
111
112__setup("nohz=", setup_tick_nohz);
113
114/**
115 * tick_nohz_update_jiffies - update jiffies when idle was interrupted
116 *
117 * Called from interrupt entry when the CPU was idle
118 *
119 * In case the sched_tick was stopped on this CPU, we have to check if jiffies
120 * must be updated. Otherwise an interrupt handler could use a stale jiffy
121 * value. We do this unconditionally on any cpu, as we don't know whether the
122 * cpu, which has the update task assigned is in a long sleep.
123 */
124void tick_nohz_update_jiffies(void)
125{
126 int cpu = smp_processor_id();
127 struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
128 unsigned long flags;
129 ktime_t now;
130
131 if (!ts->tick_stopped)
132 return;
133
134 cpu_clear(cpu, nohz_cpu_mask);
135 now = ktime_get();
136
137 local_irq_save(flags);
138 tick_do_update_jiffies64(now);
139 local_irq_restore(flags);
140}
141
142/**
143 * tick_nohz_stop_sched_tick - stop the idle tick from the idle task
144 *
 145 * When the next event is more than a tick into the future, stop the idle tick.
146 * Called either from the idle loop or from irq_exit() when an idle period was
147 * just interrupted by an interrupt which did not cause a reschedule.
148 */
149void tick_nohz_stop_sched_tick(void)
150{
151 unsigned long seq, last_jiffies, next_jiffies, delta_jiffies, flags;
152 struct tick_sched *ts;
153 ktime_t last_update, expires, now, delta;
154 int cpu;
155
156 local_irq_save(flags);
157
158 cpu = smp_processor_id();
159 ts = &per_cpu(tick_cpu_sched, cpu);
160
161 if (unlikely(ts->nohz_mode == NOHZ_MODE_INACTIVE))
162 goto end;
163
164 if (need_resched())
165 goto end;
166
167 cpu = smp_processor_id();
168 if (unlikely(local_softirq_pending()))
169 printk(KERN_ERR "NOHZ: local_softirq_pending %02x\n",
170 local_softirq_pending());
171
172 now = ktime_get();
173 /*
174 * When called from irq_exit we need to account the idle sleep time
175 * correctly.
176 */
177 if (ts->tick_stopped) {
178 delta = ktime_sub(now, ts->idle_entrytime);
179 ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta);
180 }
181
182 ts->idle_entrytime = now;
183 ts->idle_calls++;
184
185 /* Read jiffies and the time when jiffies were updated last */
186 do {
187 seq = read_seqbegin(&xtime_lock);
188 last_update = last_jiffies_update;
189 last_jiffies = jiffies;
190 } while (read_seqretry(&xtime_lock, seq));
191
192 /* Get the next timer wheel timer */
193 next_jiffies = get_next_timer_interrupt(last_jiffies);
194 delta_jiffies = next_jiffies - last_jiffies;
195
196 if (rcu_needs_cpu(cpu))
197 delta_jiffies = 1;
198 /*
 199 * Do not stop the tick if we are only one jiffy off
 200 * or if the cpu is required for RCU
201 */
202 if (!ts->tick_stopped && delta_jiffies == 1)
203 goto out;
204
 205 /* Schedule the tick, if we are at least one jiffy off */
206 if ((long)delta_jiffies >= 1) {
207
208 if (delta_jiffies > 1)
209 cpu_set(cpu, nohz_cpu_mask);
210 /*
 211 * tick_nohz_stop_sched_tick can be called several times before
 212 * tick_nohz_restart_sched_tick is called. This happens when
213 * interrupts arrive which do not cause a reschedule. In the
214 * first call we save the current tick time, so we can restart
215 * the scheduler tick in nohz_restart_sched_tick.
216 */
217 if (!ts->tick_stopped) {
218 ts->idle_tick = ts->sched_timer.expires;
219 ts->tick_stopped = 1;
220 ts->idle_jiffies = last_jiffies;
221 }
222 /*
223 * calculate the expiry time for the next timer wheel
224 * timer
225 */
226 expires = ktime_add_ns(last_update, tick_period.tv64 *
227 delta_jiffies);
228 ts->idle_expires = expires;
229 ts->idle_sleeps++;
230
231 if (ts->nohz_mode == NOHZ_MODE_HIGHRES) {
232 hrtimer_start(&ts->sched_timer, expires,
233 HRTIMER_MODE_ABS);
234 /* Check, if the timer was already in the past */
235 if (hrtimer_active(&ts->sched_timer))
236 goto out;
 237 } else if (!tick_program_event(expires, 0))
238 goto out;
239 /*
240 * We are past the event already. So we crossed a
 241 * jiffy boundary. Update jiffies and raise the
242 * softirq.
243 */
244 tick_do_update_jiffies64(ktime_get());
245 cpu_clear(cpu, nohz_cpu_mask);
246 }
247 raise_softirq_irqoff(TIMER_SOFTIRQ);
248out:
249 ts->next_jiffies = next_jiffies;
250 ts->last_jiffies = last_jiffies;
251end:
252 local_irq_restore(flags);
253}
254
255/**
 256 * tick_nohz_restart_sched_tick - restart the idle tick from the idle task
257 *
258 * Restart the idle tick when the CPU is woken up from idle
259 */
260void tick_nohz_restart_sched_tick(void)
261{
262 int cpu = smp_processor_id();
263 struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
264 unsigned long ticks;
265 ktime_t now, delta;
266
267 if (!ts->tick_stopped)
268 return;
269
270 /* Update jiffies first */
271 now = ktime_get();
272
273 local_irq_disable();
274 tick_do_update_jiffies64(now);
275 cpu_clear(cpu, nohz_cpu_mask);
276
277 /* Account the idle time */
278 delta = ktime_sub(now, ts->idle_entrytime);
279 ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta);
280
281 /*
 282 * We stopped the tick in idle. update_process_times() would miss the
 283 * time we slept, as it only does a single tick of accounting.
 284 * Make sure that this time is accounted to idle!
285 */
286 ticks = jiffies - ts->idle_jiffies;
287 /*
288 * We might be one off. Do not randomly account a huge number of ticks!
289 */
290 if (ticks && ticks < LONG_MAX) {
291 add_preempt_count(HARDIRQ_OFFSET);
292 account_system_time(current, HARDIRQ_OFFSET,
293 jiffies_to_cputime(ticks));
294 sub_preempt_count(HARDIRQ_OFFSET);
295 }
296
297 /*
298 * Cancel the scheduled timer and restore the tick
299 */
300 ts->tick_stopped = 0;
301 hrtimer_cancel(&ts->sched_timer);
302 ts->sched_timer.expires = ts->idle_tick;
303
304 while (1) {
305 /* Forward the time to expire in the future */
306 hrtimer_forward(&ts->sched_timer, now, tick_period);
307
308 if (ts->nohz_mode == NOHZ_MODE_HIGHRES) {
309 hrtimer_start(&ts->sched_timer,
310 ts->sched_timer.expires,
311 HRTIMER_MODE_ABS);
312 /* Check, if the timer was already in the past */
313 if (hrtimer_active(&ts->sched_timer))
314 break;
315 } else {
316 if (!tick_program_event(ts->sched_timer.expires, 0))
317 break;
318 }
319 /* Update jiffies and reread time */
320 tick_do_update_jiffies64(now);
321 now = ktime_get();
322 }
323 local_irq_enable();
324}
325
326static int tick_nohz_reprogram(struct tick_sched *ts, ktime_t now)
327{
328 hrtimer_forward(&ts->sched_timer, now, tick_period);
329 return tick_program_event(ts->sched_timer.expires, 0);
330}
331
332/*
333 * The nohz low res interrupt handler
334 */
335static void tick_nohz_handler(struct clock_event_device *dev)
336{
337 struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched);
338 struct pt_regs *regs = get_irq_regs();
339 ktime_t now = ktime_get();
340
341 dev->next_event.tv64 = KTIME_MAX;
342
343 /* Check, if the jiffies need an update */
344 tick_do_update_jiffies64(now);
345
346 /*
347 * When we are idle and the tick is stopped, we have to touch
348 * the watchdog as we might not schedule for a really long
349 * time. This happens on complete idle SMP systems while
350 * waiting on the login prompt. We also increment the "start
351 * of idle" jiffy stamp so the idle accounting adjustment we
 352 * do when we go busy again does not account too many ticks.
353 */
354 if (ts->tick_stopped) {
355 touch_softlockup_watchdog();
356 ts->idle_jiffies++;
357 }
358
359 update_process_times(user_mode(regs));
360 profile_tick(CPU_PROFILING);
361
362 /* Do not restart, when we are in the idle loop */
363 if (ts->tick_stopped)
364 return;
365
366 while (tick_nohz_reprogram(ts, now)) {
367 now = ktime_get();
368 tick_do_update_jiffies64(now);
369 }
370}
371
372/**
373 * tick_nohz_switch_to_nohz - switch to nohz mode
374 */
375static void tick_nohz_switch_to_nohz(void)
376{
377 struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched);
378 ktime_t next;
379
380 if (!tick_nohz_enabled)
381 return;
382
383 local_irq_disable();
384 if (tick_switch_to_oneshot(tick_nohz_handler)) {
385 local_irq_enable();
386 return;
387 }
388
389 ts->nohz_mode = NOHZ_MODE_LOWRES;
390
391 /*
392 * Recycle the hrtimer in ts, so we can share the
393 * hrtimer_forward with the highres code.
394 */
395 hrtimer_init(&ts->sched_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
396 /* Get the next period */
397 next = tick_init_jiffy_update();
398
399 for (;;) {
400 ts->sched_timer.expires = next;
401 if (!tick_program_event(next, 0))
402 break;
403 next = ktime_add(next, tick_period);
404 }
405 local_irq_enable();
406
 407 printk(KERN_INFO "Switched to NOHZ mode on CPU #%d\n",
408 smp_processor_id());
409}
410
411#else
412
413static inline void tick_nohz_switch_to_nohz(void) { }
414
415#endif /* NO_HZ */
416
417/*
418 * High resolution timer specific code
419 */
420#ifdef CONFIG_HIGH_RES_TIMERS
421/*
422 * We rearm the timer until we get disabled by the idle code
423 * Called with interrupts disabled and timer->base->cpu_base->lock held.
424 */
425static enum hrtimer_restart tick_sched_timer(struct hrtimer *timer)
426{
427 struct tick_sched *ts =
428 container_of(timer, struct tick_sched, sched_timer);
429 struct hrtimer_cpu_base *base = timer->base->cpu_base;
430 struct pt_regs *regs = get_irq_regs();
431 ktime_t now = ktime_get();
432
433 /* Check, if the jiffies need an update */
434 tick_do_update_jiffies64(now);
435
436 /*
437 * Do not call, when we are not in irq context and have
438 * no valid regs pointer
439 */
440 if (regs) {
441 /*
442 * When we are idle and the tick is stopped, we have to touch
443 * the watchdog as we might not schedule for a really long
444 * time. This happens on complete idle SMP systems while
445 * waiting on the login prompt. We also increment the "start of
446 * idle" jiffy stamp so the idle accounting adjustment we do
 447 * when we go busy again does not account too many ticks.
448 */
449 if (ts->tick_stopped) {
450 touch_softlockup_watchdog();
451 ts->idle_jiffies++;
452 }
453 /*
454 * update_process_times() might take tasklist_lock, hence
455 * drop the base lock. sched-tick hrtimers are per-CPU and
456 * never accessible by userspace APIs, so this is safe to do.
457 */
458 spin_unlock(&base->lock);
459 update_process_times(user_mode(regs));
460 profile_tick(CPU_PROFILING);
461 spin_lock(&base->lock);
462 }
463
464 /* Do not restart, when we are in the idle loop */
465 if (ts->tick_stopped)
466 return HRTIMER_NORESTART;
467
468 hrtimer_forward(timer, now, tick_period);
469
470 return HRTIMER_RESTART;
471}
472
473/**
474 * tick_setup_sched_timer - setup the tick emulation timer
475 */
476void tick_setup_sched_timer(void)
477{
478 struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched);
479 ktime_t now = ktime_get();
480
481 /*
482 * Emulate tick processing via per-CPU hrtimers:
483 */
484 hrtimer_init(&ts->sched_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
485 ts->sched_timer.function = tick_sched_timer;
486 ts->sched_timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ;
487
488 /* Get the next period */
489 ts->sched_timer.expires = tick_init_jiffy_update();
490
491 for (;;) {
492 hrtimer_forward(&ts->sched_timer, now, tick_period);
493 hrtimer_start(&ts->sched_timer, ts->sched_timer.expires,
494 HRTIMER_MODE_ABS);
495 /* Check, if the timer was already in the past */
496 if (hrtimer_active(&ts->sched_timer))
497 break;
498 now = ktime_get();
499 }
500
501#ifdef CONFIG_NO_HZ
502 if (tick_nohz_enabled)
503 ts->nohz_mode = NOHZ_MODE_HIGHRES;
504#endif
505}
506
507void tick_cancel_sched_timer(int cpu)
508{
509 struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
510
511 if (ts->sched_timer.base)
512 hrtimer_cancel(&ts->sched_timer);
513 ts->tick_stopped = 0;
514 ts->nohz_mode = NOHZ_MODE_INACTIVE;
515}
516#endif /* HIGH_RES_TIMERS */
517
518/**
519 * Async notification about clocksource changes
520 */
521void tick_clock_notify(void)
522{
523 int cpu;
524
525 for_each_possible_cpu(cpu)
526 set_bit(0, &per_cpu(tick_cpu_sched, cpu).check_clocks);
527}
528
529/*
530 * Async notification about clock event changes
531 */
532void tick_oneshot_notify(void)
533{
534 struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched);
535
536 set_bit(0, &ts->check_clocks);
537}
538
539/**
 540 * Check whether a change happened which makes oneshot mode possible.
 541 *
 542 * Called cyclically from the hrtimer softirq (driven by the timer
 543 * softirq). allow_nohz signals that we can switch into low-res nohz
 544 * mode, because high resolution timers are disabled (either at
 545 * compile time or at runtime).
546 */
547int tick_check_oneshot_change(int allow_nohz)
548{
549 struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched);
550
551 if (!test_and_clear_bit(0, &ts->check_clocks))
552 return 0;
553
554 if (ts->nohz_mode != NOHZ_MODE_INACTIVE)
555 return 0;
556
557 if (!timekeeping_is_continuous() || !tick_is_oneshot_available())
558 return 0;
559
560 if (!allow_nohz)
561 return 1;
562
563 tick_nohz_switch_to_nohz();
564 return 0;
565}
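
The two public entry points above are meant to be driven from the architecture idle loop: tick_nohz_stop_sched_tick() runs before the CPU halts, tick_nohz_restart_sched_tick() runs when the CPU goes busy again, and tick_nohz_update_jiffies() is called on interrupt entry while the tick is stopped. A minimal sketch of such an idle loop follows; it is illustrative only and not part of this patch, and arch_safe_halt() is a placeholder for the architecture's real "sleep until the next interrupt" primitive.

#include <linux/sched.h>
#include <linux/tick.h>

/* Hypothetical arch helper: halt the CPU with interrupts enabled. */
extern void arch_safe_halt(void);

void cpu_idle(void)
{
	for (;;) {
		/* May switch off the periodic tick while we stay idle */
		tick_nohz_stop_sched_tick();

		while (!need_resched())
			arch_safe_halt();

		/* Account the time we slept and rearm the tick */
		tick_nohz_restart_sched_tick();

		preempt_enable_no_resched();
		schedule();
		preempt_disable();
	}
}
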
diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c
new file mode 100644
index 000000000000..f82c635c3d5c
--- /dev/null
+++ b/kernel/time/timer_list.c
@@ -0,0 +1,287 @@
1/*
2 * kernel/time/timer_list.c
3 *
4 * List pending timers
5 *
6 * Copyright(C) 2006, Red Hat, Inc., Ingo Molnar
7 *
8 * This program is free software; you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License version 2 as
10 * published by the Free Software Foundation.
11 */
12
13#include <linux/proc_fs.h>
14#include <linux/module.h>
15#include <linux/spinlock.h>
16#include <linux/sched.h>
17#include <linux/seq_file.h>
18#include <linux/kallsyms.h>
19#include <linux/tick.h>
20
21#include <asm/uaccess.h>
22
23typedef void (*print_fn_t)(struct seq_file *m, unsigned int *classes);
24
25DECLARE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases);
26
27/*
28 * This allows printing both to /proc/timer_list and
29 * to the console (on SysRq-Q):
30 */
31#define SEQ_printf(m, x...) \
32 do { \
33 if (m) \
34 seq_printf(m, x); \
35 else \
36 printk(x); \
37 } while (0)
38
39static void print_name_offset(struct seq_file *m, void *sym)
40{
41 unsigned long addr = (unsigned long)sym;
42 char namebuf[KSYM_NAME_LEN+1];
43 unsigned long size, offset;
44 const char *sym_name;
45 char *modname;
46
47 sym_name = kallsyms_lookup(addr, &size, &offset, &modname, namebuf);
48 if (sym_name)
49 SEQ_printf(m, "%s", sym_name);
50 else
51 SEQ_printf(m, "<%p>", sym);
52}
53
54static void
55print_timer(struct seq_file *m, struct hrtimer *timer, int idx, u64 now)
56{
57#ifdef CONFIG_TIMER_STATS
58 char tmp[TASK_COMM_LEN + 1];
59#endif
60 SEQ_printf(m, " #%d: ", idx);
61 print_name_offset(m, timer);
62 SEQ_printf(m, ", ");
63 print_name_offset(m, timer->function);
64 SEQ_printf(m, ", S:%02lx", timer->state);
65#ifdef CONFIG_TIMER_STATS
66 SEQ_printf(m, ", ");
67 print_name_offset(m, timer->start_site);
68 memcpy(tmp, timer->start_comm, TASK_COMM_LEN);
69 tmp[TASK_COMM_LEN] = 0;
70 SEQ_printf(m, ", %s/%d", tmp, timer->start_pid);
71#endif
72 SEQ_printf(m, "\n");
73 SEQ_printf(m, " # expires at %Ld nsecs [in %Ld nsecs]\n",
74 (unsigned long long)ktime_to_ns(timer->expires),
75 (unsigned long long)(ktime_to_ns(timer->expires) - now));
76}
77
78static void
79print_active_timers(struct seq_file *m, struct hrtimer_clock_base *base,
80 u64 now)
81{
82 struct hrtimer *timer, tmp;
83 unsigned long next = 0, i;
84 struct rb_node *curr;
85 unsigned long flags;
86
87next_one:
88 i = 0;
89 spin_lock_irqsave(&base->cpu_base->lock, flags);
90
91 curr = base->first;
92 /*
93 * Crude but we have to do this O(N*N) thing, because
94 * we have to unlock the base when printing:
95 */
96 while (curr && i < next) {
97 curr = rb_next(curr);
98 i++;
99 }
100
101 if (curr) {
102
103 timer = rb_entry(curr, struct hrtimer, node);
104 tmp = *timer;
105 spin_unlock_irqrestore(&base->cpu_base->lock, flags);
106
107 print_timer(m, &tmp, i, now);
108 next++;
109 goto next_one;
110 }
111 spin_unlock_irqrestore(&base->cpu_base->lock, flags);
112}
113
114static void
115print_base(struct seq_file *m, struct hrtimer_clock_base *base, u64 now)
116{
117 SEQ_printf(m, " .index: %d\n",
118 base->index);
119 SEQ_printf(m, " .resolution: %Ld nsecs\n",
120 (unsigned long long)ktime_to_ns(base->resolution));
121 SEQ_printf(m, " .get_time: ");
122 print_name_offset(m, base->get_time);
123 SEQ_printf(m, "\n");
124#ifdef CONFIG_HIGH_RES_TIMERS
125 SEQ_printf(m, " .offset: %Ld nsecs\n",
126 ktime_to_ns(base->offset));
127#endif
128 SEQ_printf(m, "active timers:\n");
129 print_active_timers(m, base, now);
130}
131
132static void print_cpu(struct seq_file *m, int cpu, u64 now)
133{
134 struct hrtimer_cpu_base *cpu_base = &per_cpu(hrtimer_bases, cpu);
135 int i;
136
137 SEQ_printf(m, "\ncpu: %d\n", cpu);
138 for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) {
139 SEQ_printf(m, " clock %d:\n", i);
140 print_base(m, cpu_base->clock_base + i, now);
141 }
142#define P(x) \
143 SEQ_printf(m, " .%-15s: %Ld\n", #x, (u64)(cpu_base->x))
144#define P_ns(x) \
145 SEQ_printf(m, " .%-15s: %Ld nsecs\n", #x, \
146 (u64)(ktime_to_ns(cpu_base->x)))
147
148#ifdef CONFIG_HIGH_RES_TIMERS
149 P_ns(expires_next);
150 P(hres_active);
151 P(nr_events);
152#endif
153#undef P
154#undef P_ns
155
156#ifdef CONFIG_TICK_ONESHOT
157# define P(x) \
158 SEQ_printf(m, " .%-15s: %Ld\n", #x, (u64)(ts->x))
159# define P_ns(x) \
160 SEQ_printf(m, " .%-15s: %Ld nsecs\n", #x, \
161 (u64)(ktime_to_ns(ts->x)))
162 {
163 struct tick_sched *ts = tick_get_tick_sched(cpu);
164 P(nohz_mode);
165 P_ns(idle_tick);
166 P(tick_stopped);
167 P(idle_jiffies);
168 P(idle_calls);
169 P(idle_sleeps);
170 P_ns(idle_entrytime);
171 P_ns(idle_sleeptime);
172 P(last_jiffies);
173 P(next_jiffies);
174 P_ns(idle_expires);
175 SEQ_printf(m, "jiffies: %Ld\n", (u64)jiffies);
176 }
177#endif
178
179#undef P
180#undef P_ns
181}
182
183#ifdef CONFIG_GENERIC_CLOCKEVENTS
184static void
185print_tickdevice(struct seq_file *m, struct tick_device *td)
186{
187 struct clock_event_device *dev = td->evtdev;
188
189 SEQ_printf(m, "\nTick Device: mode: %d\n", td->mode);
190
191 SEQ_printf(m, "Clock Event Device: ");
192 if (!dev) {
193 SEQ_printf(m, "<NULL>\n");
194 return;
195 }
196 SEQ_printf(m, "%s\n", dev->name);
197 SEQ_printf(m, " max_delta_ns: %ld\n", dev->max_delta_ns);
198 SEQ_printf(m, " min_delta_ns: %ld\n", dev->min_delta_ns);
199 SEQ_printf(m, " mult: %ld\n", dev->mult);
200 SEQ_printf(m, " shift: %d\n", dev->shift);
201 SEQ_printf(m, " mode: %d\n", dev->mode);
202 SEQ_printf(m, " next_event: %Ld nsecs\n",
203 (unsigned long long) ktime_to_ns(dev->next_event));
204
205 SEQ_printf(m, " set_next_event: ");
206 print_name_offset(m, dev->set_next_event);
207 SEQ_printf(m, "\n");
208
209 SEQ_printf(m, " set_mode: ");
210 print_name_offset(m, dev->set_mode);
211 SEQ_printf(m, "\n");
212
213 SEQ_printf(m, " event_handler: ");
214 print_name_offset(m, dev->event_handler);
215 SEQ_printf(m, "\n");
216}
217
218static void timer_list_show_tickdevices(struct seq_file *m)
219{
220 int cpu;
221
222#ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST
223 print_tickdevice(m, tick_get_broadcast_device());
224 SEQ_printf(m, "tick_broadcast_mask: %08lx\n",
225 tick_get_broadcast_mask()->bits[0]);
226#ifdef CONFIG_TICK_ONESHOT
227 SEQ_printf(m, "tick_broadcast_oneshot_mask: %08lx\n",
228 tick_get_broadcast_oneshot_mask()->bits[0]);
229#endif
230 SEQ_printf(m, "\n");
231#endif
232 for_each_online_cpu(cpu)
233 print_tickdevice(m, tick_get_device(cpu));
234 SEQ_printf(m, "\n");
235}
236#else
237static void timer_list_show_tickdevices(struct seq_file *m) { }
238#endif
239
240static int timer_list_show(struct seq_file *m, void *v)
241{
242 u64 now = ktime_to_ns(ktime_get());
243 int cpu;
244
245 SEQ_printf(m, "Timer List Version: v0.3\n");
246 SEQ_printf(m, "HRTIMER_MAX_CLOCK_BASES: %d\n", HRTIMER_MAX_CLOCK_BASES);
247 SEQ_printf(m, "now at %Ld nsecs\n", (unsigned long long)now);
248
249 for_each_online_cpu(cpu)
250 print_cpu(m, cpu, now);
251
252 SEQ_printf(m, "\n");
253 timer_list_show_tickdevices(m);
254
255 return 0;
256}
257
258void sysrq_timer_list_show(void)
259{
260 timer_list_show(NULL, NULL);
261}
262
263static int timer_list_open(struct inode *inode, struct file *filp)
264{
265 return single_open(filp, timer_list_show, NULL);
266}
267
268static struct file_operations timer_list_fops = {
269 .open = timer_list_open,
270 .read = seq_read,
271 .llseek = seq_lseek,
272 .release = seq_release,
273};
274
275static int __init init_timer_list_procfs(void)
276{
277 struct proc_dir_entry *pe;
278
279 pe = create_proc_entry("timer_list", 0644, NULL);
280 if (!pe)
281 return -ENOMEM;
282
283 pe->proc_fops = &timer_list_fops;
284
285 return 0;
286}
287__initcall(init_timer_list_procfs);
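
timer_list_show() backs both /proc/timer_list and the SysRq-Q dump, so the same output can be consumed interactively or from scripts. As a rough illustration of how userspace might parse it, the sketch below counts the per-timer lines that print_timer() emits with a " #<idx>: " prefix, skipping the " # expires at" continuation lines; it assumes nothing beyond the format strings visible above.

#include <ctype.h>
#include <stdio.h>
#include <string.h>

int main(void)
{
	FILE *f = fopen("/proc/timer_list", "r");
	char line[512];
	unsigned long timers = 0;

	if (!f) {
		perror("/proc/timer_list");
		return 1;
	}
	while (fgets(line, sizeof(line), f)) {
		/* print_timer() emits " #<idx>: <timer>, <function>, ..." */
		if (!strncmp(line, " #", 2) && isdigit((unsigned char)line[2]))
			timers++;
	}
	fclose(f);
	printf("%lu pending hrtimers listed\n", timers);
	return 0;
}
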
diff --git a/kernel/time/timer_stats.c b/kernel/time/timer_stats.c
new file mode 100644
index 000000000000..1bc4882e28e0
--- /dev/null
+++ b/kernel/time/timer_stats.c
@@ -0,0 +1,411 @@
1/*
2 * kernel/time/timer_stats.c
3 *
4 * Collect timer usage statistics.
5 *
6 * Copyright(C) 2006, Red Hat, Inc., Ingo Molnar
7 * Copyright(C) 2006 Timesys Corp., Thomas Gleixner <tglx@timesys.com>
8 *
 9 * timer_stats is based on timer_top, a similar facility which was part of
 10 * Con Kolivas' dyntick patch set. It was developed by Daniel Petrini at the
11 * Instituto Nokia de Tecnologia - INdT - Manaus. timer_top's design was based
12 * on dynamic allocation of the statistics entries and linear search based
13 * lookup combined with a global lock, rather than the static array, hash
14 * and per-CPU locking which is used by timer_stats. It was written for the
 15 * pre-hrtimer kernel code and therefore did not take hrtimers into account.
16 * Nevertheless it provided the base for the timer_stats implementation and
17 * was a helpful source of inspiration. Kudos to Daniel and the Nokia folks
18 * for this effort.
19 *
20 * timer_top.c is
21 * Copyright (C) 2005 Instituto Nokia de Tecnologia - INdT - Manaus
22 * Written by Daniel Petrini <d.pensator@gmail.com>
23 * timer_top.c was released under the GNU General Public License version 2
24 *
25 * We export the addresses and counting of timer functions being called,
26 * the pid and cmdline from the owner process if applicable.
27 *
28 * Start/stop data collection:
 29 * # echo 1 >/proc/timer_stats        (echo 0 to stop)
30 *
31 * Display the information collected so far:
32 * # cat /proc/timer_stats
33 *
34 * This program is free software; you can redistribute it and/or modify
35 * it under the terms of the GNU General Public License version 2 as
36 * published by the Free Software Foundation.
37 */
38
39#include <linux/proc_fs.h>
40#include <linux/module.h>
41#include <linux/spinlock.h>
42#include <linux/sched.h>
43#include <linux/seq_file.h>
44#include <linux/kallsyms.h>
45
46#include <asm/uaccess.h>
47
48/*
49 * This is our basic unit of interest: a timer expiry event identified
50 * by the timer, its start/expire functions and the PID of the task that
51 * started the timer. We count the number of times an event happens:
52 */
53struct entry {
54 /*
55 * Hash list:
56 */
57 struct entry *next;
58
59 /*
60 * Hash keys:
61 */
62 void *timer;
63 void *start_func;
64 void *expire_func;
65 pid_t pid;
66
67 /*
68 * Number of timeout events:
69 */
70 unsigned long count;
71
72 /*
73 * We save the command-line string to preserve
74 * this information past task exit:
75 */
76 char comm[TASK_COMM_LEN + 1];
77
78} ____cacheline_aligned_in_smp;
79
80/*
81 * Spinlock protecting the tables - not taken during lookup:
82 */
83static DEFINE_SPINLOCK(table_lock);
84
85/*
86 * Per-CPU lookup locks for fast hash lookup:
87 */
88static DEFINE_PER_CPU(spinlock_t, lookup_lock);
89
90/*
91 * Mutex to serialize state changes with show-stats activities:
92 */
93static DEFINE_MUTEX(show_mutex);
94
95/*
96 * Collection status, active/inactive:
97 */
98static int __read_mostly active;
99
100/*
101 * Beginning/end timestamps of measurement:
102 */
103static ktime_t time_start, time_stop;
104
105/*
106 * tstat entry structs only get allocated while collection is
107 * active and never freed during that time - this simplifies
108 * things quite a bit.
109 *
110 * They get freed when a new collection period is started.
111 */
112#define MAX_ENTRIES_BITS 10
113#define MAX_ENTRIES (1UL << MAX_ENTRIES_BITS)
114
115static unsigned long nr_entries;
116static struct entry entries[MAX_ENTRIES];
117
118static atomic_t overflow_count;
119
120static void reset_entries(void)
121{
122 nr_entries = 0;
123 memset(entries, 0, sizeof(entries));
124 atomic_set(&overflow_count, 0);
125}
126
127static struct entry *alloc_entry(void)
128{
129 if (nr_entries >= MAX_ENTRIES)
130 return NULL;
131
132 return entries + nr_entries++;
133}
134
135/*
136 * The entries are in a hash-table, for fast lookup:
137 */
138#define TSTAT_HASH_BITS (MAX_ENTRIES_BITS - 1)
139#define TSTAT_HASH_SIZE (1UL << TSTAT_HASH_BITS)
140#define TSTAT_HASH_MASK (TSTAT_HASH_SIZE - 1)
141
142#define __tstat_hashfn(entry) \
143 (((unsigned long)(entry)->timer ^ \
144 (unsigned long)(entry)->start_func ^ \
145 (unsigned long)(entry)->expire_func ^ \
146 (unsigned long)(entry)->pid ) & TSTAT_HASH_MASK)
147
148#define tstat_hashentry(entry) (tstat_hash_table + __tstat_hashfn(entry))
149
150static struct entry *tstat_hash_table[TSTAT_HASH_SIZE] __read_mostly;
151
152static int match_entries(struct entry *entry1, struct entry *entry2)
153{
154 return entry1->timer == entry2->timer &&
155 entry1->start_func == entry2->start_func &&
156 entry1->expire_func == entry2->expire_func &&
157 entry1->pid == entry2->pid;
158}
159
160/*
161 * Look up whether an entry matching this item is present
162 * in the hash already. Must be called with irqs off and the
163 * lookup lock held:
164 */
165static struct entry *tstat_lookup(struct entry *entry, char *comm)
166{
167 struct entry **head, *curr, *prev;
168
169 head = tstat_hashentry(entry);
170 curr = *head;
171
172 /*
173 * The fastpath is when the entry is already hashed,
174 * we do this with the lookup lock held, but with the
175 * table lock not held:
176 */
177 while (curr) {
178 if (match_entries(curr, entry))
179 return curr;
180
181 curr = curr->next;
182 }
183 /*
184 * Slowpath: allocate, set up and link a new hash entry:
185 */
186 prev = NULL;
187 curr = *head;
188
189 spin_lock(&table_lock);
190 /*
191 * Make sure we have not raced with another CPU:
192 */
193 while (curr) {
194 if (match_entries(curr, entry))
195 goto out_unlock;
196
197 prev = curr;
198 curr = curr->next;
199 }
200
201 curr = alloc_entry();
202 if (curr) {
203 *curr = *entry;
204 curr->count = 0;
205 memcpy(curr->comm, comm, TASK_COMM_LEN);
206 if (prev)
207 prev->next = curr;
208 else
209 *head = curr;
210 curr->next = NULL;
211 }
212 out_unlock:
213 spin_unlock(&table_lock);
214
215 return curr;
216}
217
218/**
219 * timer_stats_update_stats - Update the statistics for a timer.
220 * @timer: pointer to either a timer_list or a hrtimer
221 * @pid: the pid of the task which set up the timer
222 * @startf: pointer to the function which did the timer setup
223 * @timerf: pointer to the timer callback function of the timer
224 * @comm: name of the process which set up the timer
225 *
226 * When the timer is already registered, then the event counter is
227 * incremented. Otherwise the timer is registered in a free slot.
228 */
229void timer_stats_update_stats(void *timer, pid_t pid, void *startf,
230 void *timerf, char * comm)
231{
232 /*
 233 * It doesn't matter which lock we take:
234 */
235 spinlock_t *lock = &per_cpu(lookup_lock, raw_smp_processor_id());
236 struct entry *entry, input;
237 unsigned long flags;
238
239 input.timer = timer;
240 input.start_func = startf;
241 input.expire_func = timerf;
242 input.pid = pid;
243
244 spin_lock_irqsave(lock, flags);
245 if (!active)
246 goto out_unlock;
247
248 entry = tstat_lookup(&input, comm);
249 if (likely(entry))
250 entry->count++;
251 else
252 atomic_inc(&overflow_count);
253
254 out_unlock:
255 spin_unlock_irqrestore(lock, flags);
256}
257
258static void print_name_offset(struct seq_file *m, unsigned long addr)
259{
260 char namebuf[KSYM_NAME_LEN+1];
261 unsigned long size, offset;
262 const char *sym_name;
263 char *modname;
264
265 sym_name = kallsyms_lookup(addr, &size, &offset, &modname, namebuf);
266 if (sym_name)
267 seq_printf(m, "%s", sym_name);
268 else
269 seq_printf(m, "<%p>", (void *)addr);
270}
271
272static int tstats_show(struct seq_file *m, void *v)
273{
274 struct timespec period;
275 struct entry *entry;
276 unsigned long ms;
277 long events = 0;
278 ktime_t time;
279 int i;
280
281 mutex_lock(&show_mutex);
282 /*
283 * If still active then calculate up to now:
284 */
285 if (active)
286 time_stop = ktime_get();
287
288 time = ktime_sub(time_stop, time_start);
289
290 period = ktime_to_timespec(time);
291 ms = period.tv_nsec / 1000000;
292
293 seq_puts(m, "Timer Stats Version: v0.1\n");
294 seq_printf(m, "Sample period: %ld.%03ld s\n", period.tv_sec, ms);
295 if (atomic_read(&overflow_count))
296 seq_printf(m, "Overflow: %d entries\n",
297 atomic_read(&overflow_count));
298
299 for (i = 0; i < nr_entries; i++) {
300 entry = entries + i;
301 seq_printf(m, "%4lu, %5d %-16s ",
302 entry->count, entry->pid, entry->comm);
303
304 print_name_offset(m, (unsigned long)entry->start_func);
305 seq_puts(m, " (");
306 print_name_offset(m, (unsigned long)entry->expire_func);
307 seq_puts(m, ")\n");
308
309 events += entry->count;
310 }
311
312 ms += period.tv_sec * 1000;
313 if (!ms)
314 ms = 1;
315
316 if (events && period.tv_sec)
317 seq_printf(m, "%ld total events, %ld.%ld events/sec\n", events,
318 events / period.tv_sec, events * 1000 / ms);
319 else
320 seq_printf(m, "%ld total events\n", events);
321
322 mutex_unlock(&show_mutex);
323
324 return 0;
325}
326
327/*
328 * After a state change, make sure all concurrent lookup/update
329 * activities have stopped:
330 */
331static void sync_access(void)
332{
333 unsigned long flags;
334 int cpu;
335
336 for_each_online_cpu(cpu) {
337 spin_lock_irqsave(&per_cpu(lookup_lock, cpu), flags);
338 /* nothing */
339 spin_unlock_irqrestore(&per_cpu(lookup_lock, cpu), flags);
340 }
341}
342
343static ssize_t tstats_write(struct file *file, const char __user *buf,
344 size_t count, loff_t *offs)
345{
346 char ctl[2];
347
348 if (count != 2 || *offs)
349 return -EINVAL;
350
351 if (copy_from_user(ctl, buf, count))
352 return -EFAULT;
353
354 mutex_lock(&show_mutex);
355 switch (ctl[0]) {
356 case '0':
357 if (active) {
358 active = 0;
359 time_stop = ktime_get();
360 sync_access();
361 }
362 break;
363 case '1':
364 if (!active) {
365 reset_entries();
366 time_start = ktime_get();
367 active = 1;
368 }
369 break;
370 default:
371 count = -EINVAL;
372 }
373 mutex_unlock(&show_mutex);
374
375 return count;
376}
377
378static int tstats_open(struct inode *inode, struct file *filp)
379{
380 return single_open(filp, tstats_show, NULL);
381}
382
383static struct file_operations tstats_fops = {
384 .open = tstats_open,
385 .read = seq_read,
386 .write = tstats_write,
387 .llseek = seq_lseek,
388 .release = seq_release,
389};
390
391void __init init_timer_stats(void)
392{
393 int cpu;
394
395 for_each_possible_cpu(cpu)
396 spin_lock_init(&per_cpu(lookup_lock, cpu));
397}
398
399static int __init init_tstats_procfs(void)
400{
401 struct proc_dir_entry *pe;
402
403 pe = create_proc_entry("timer_stats", 0644, NULL);
404 if (!pe)
405 return -ENOMEM;
406
407 pe->proc_fops = &tstats_fops;
408
409 return 0;
410}
411__initcall(init_tstats_procfs);
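
tstats_write() accepts exactly two bytes: '1' starts a fresh sample period (reset_entries() plus a new time_start), '0' stops it, and reading the file afterwards prints the collected table. A userspace sketch of that protocol, shown only as an illustration of the interface added above:

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

/* Write the two-byte control string that tstats_write() requires. */
static int tstats_ctl(const char *cmd)
{
	int fd = open("/proc/timer_stats", O_WRONLY);
	ssize_t ret;

	if (fd < 0)
		return -1;
	ret = write(fd, cmd, 2);
	close(fd);
	return ret == 2 ? 0 : -1;
}

int main(void)
{
	char line[256];
	FILE *f;

	if (tstats_ctl("1\n"))		/* start collection */
		return 1;
	sleep(10);			/* sample period */
	tstats_ctl("0\n");		/* stop collection */

	f = fopen("/proc/timer_stats", "r");
	if (!f)
		return 1;
	while (fgets(line, sizeof(line), f))
		fputs(line, stdout);	/* dump the table */
	fclose(f);
	return 0;
}
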
diff --git a/kernel/timer.c b/kernel/timer.c
index 8533c3796082..cb1b86a9c52f 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -34,6 +34,8 @@
34#include <linux/cpu.h> 34#include <linux/cpu.h>
35#include <linux/syscalls.h> 35#include <linux/syscalls.h>
36#include <linux/delay.h> 36#include <linux/delay.h>
37#include <linux/tick.h>
38#include <linux/kallsyms.h>
37 39
38#include <asm/uaccess.h> 40#include <asm/uaccess.h>
39#include <asm/unistd.h> 41#include <asm/unistd.h>
@@ -262,6 +264,18 @@ static void internal_add_timer(tvec_base_t *base, struct timer_list *timer)
262 list_add_tail(&timer->entry, vec); 264 list_add_tail(&timer->entry, vec);
263} 265}
264 266
267#ifdef CONFIG_TIMER_STATS
268void __timer_stats_timer_set_start_info(struct timer_list *timer, void *addr)
269{
270 if (timer->start_site)
271 return;
272
273 timer->start_site = addr;
274 memcpy(timer->start_comm, current->comm, TASK_COMM_LEN);
275 timer->start_pid = current->pid;
276}
277#endif
278
265/** 279/**
266 * init_timer - initialize a timer. 280 * init_timer - initialize a timer.
267 * @timer: the timer to be initialized 281 * @timer: the timer to be initialized
@@ -273,11 +287,16 @@ void fastcall init_timer(struct timer_list *timer)
273{ 287{
274 timer->entry.next = NULL; 288 timer->entry.next = NULL;
275 timer->base = __raw_get_cpu_var(tvec_bases); 289 timer->base = __raw_get_cpu_var(tvec_bases);
290#ifdef CONFIG_TIMER_STATS
291 timer->start_site = NULL;
292 timer->start_pid = -1;
293 memset(timer->start_comm, 0, TASK_COMM_LEN);
294#endif
276} 295}
277EXPORT_SYMBOL(init_timer); 296EXPORT_SYMBOL(init_timer);
278 297
279static inline void detach_timer(struct timer_list *timer, 298static inline void detach_timer(struct timer_list *timer,
280 int clear_pending) 299 int clear_pending)
281{ 300{
282 struct list_head *entry = &timer->entry; 301 struct list_head *entry = &timer->entry;
283 302
@@ -324,6 +343,7 @@ int __mod_timer(struct timer_list *timer, unsigned long expires)
324 unsigned long flags; 343 unsigned long flags;
325 int ret = 0; 344 int ret = 0;
326 345
346 timer_stats_timer_set_start_info(timer);
327 BUG_ON(!timer->function); 347 BUG_ON(!timer->function);
328 348
329 base = lock_timer_base(timer, &flags); 349 base = lock_timer_base(timer, &flags);
@@ -374,6 +394,7 @@ void add_timer_on(struct timer_list *timer, int cpu)
374 tvec_base_t *base = per_cpu(tvec_bases, cpu); 394 tvec_base_t *base = per_cpu(tvec_bases, cpu);
375 unsigned long flags; 395 unsigned long flags;
376 396
397 timer_stats_timer_set_start_info(timer);
377 BUG_ON(timer_pending(timer) || !timer->function); 398 BUG_ON(timer_pending(timer) || !timer->function);
378 spin_lock_irqsave(&base->lock, flags); 399 spin_lock_irqsave(&base->lock, flags);
379 timer->base = base; 400 timer->base = base;
@@ -406,6 +427,7 @@ int mod_timer(struct timer_list *timer, unsigned long expires)
406{ 427{
407 BUG_ON(!timer->function); 428 BUG_ON(!timer->function);
408 429
430 timer_stats_timer_set_start_info(timer);
409 /* 431 /*
410 * This is a common optimization triggered by the 432 * This is a common optimization triggered by the
411 * networking code - if the timer is re-modified 433 * networking code - if the timer is re-modified
@@ -436,6 +458,7 @@ int del_timer(struct timer_list *timer)
436 unsigned long flags; 458 unsigned long flags;
437 int ret = 0; 459 int ret = 0;
438 460
461 timer_stats_timer_clear_start_info(timer);
439 if (timer_pending(timer)) { 462 if (timer_pending(timer)) {
440 base = lock_timer_base(timer, &flags); 463 base = lock_timer_base(timer, &flags);
441 if (timer_pending(timer)) { 464 if (timer_pending(timer)) {
@@ -569,6 +592,8 @@ static inline void __run_timers(tvec_base_t *base)
569 fn = timer->function; 592 fn = timer->function;
570 data = timer->data; 593 data = timer->data;
571 594
595 timer_stats_account_timer(timer);
596
572 set_running_timer(base, timer); 597 set_running_timer(base, timer);
573 detach_timer(timer, 1); 598 detach_timer(timer, 1);
574 spin_unlock_irq(&base->lock); 599 spin_unlock_irq(&base->lock);
@@ -591,105 +616,124 @@ static inline void __run_timers(tvec_base_t *base)
591 spin_unlock_irq(&base->lock); 616 spin_unlock_irq(&base->lock);
592} 617}
593 618
594#ifdef CONFIG_NO_IDLE_HZ 619#if defined(CONFIG_NO_IDLE_HZ) || defined(CONFIG_NO_HZ)
595/* 620/*
596 * Find out when the next timer event is due to happen. This 621 * Find out when the next timer event is due to happen. This
 597 * is used on S/390 to stop all activity when a cpu is idle. 622
 598 * This function needs to be called with interrupts disabled. 623
599 */ 624 */
600unsigned long next_timer_interrupt(void) 625static unsigned long __next_timer_interrupt(tvec_base_t *base)
601{ 626{
602 tvec_base_t *base; 627 unsigned long timer_jiffies = base->timer_jiffies;
603 struct list_head *list; 628 unsigned long expires = timer_jiffies + (LONG_MAX >> 1);
629 int index, slot, array, found = 0;
604 struct timer_list *nte; 630 struct timer_list *nte;
605 unsigned long expires;
606 unsigned long hr_expires = MAX_JIFFY_OFFSET;
607 ktime_t hr_delta;
608 tvec_t *varray[4]; 631 tvec_t *varray[4];
609 int i, j;
610
611 hr_delta = hrtimer_get_next_event();
612 if (hr_delta.tv64 != KTIME_MAX) {
613 struct timespec tsdelta;
614 tsdelta = ktime_to_timespec(hr_delta);
615 hr_expires = timespec_to_jiffies(&tsdelta);
616 if (hr_expires < 3)
617 return hr_expires + jiffies;
618 }
619 hr_expires += jiffies;
620
621 base = __get_cpu_var(tvec_bases);
622 spin_lock(&base->lock);
623 expires = base->timer_jiffies + (LONG_MAX >> 1);
624 list = NULL;
625 632
626 /* Look for timer events in tv1. */ 633 /* Look for timer events in tv1. */
627 j = base->timer_jiffies & TVR_MASK; 634 index = slot = timer_jiffies & TVR_MASK;
628 do { 635 do {
629 list_for_each_entry(nte, base->tv1.vec + j, entry) { 636 list_for_each_entry(nte, base->tv1.vec + slot, entry) {
637 found = 1;
630 expires = nte->expires; 638 expires = nte->expires;
631 if (j < (base->timer_jiffies & TVR_MASK)) 639 /* Look at the cascade bucket(s)? */
632 list = base->tv2.vec + (INDEX(0)); 640 if (!index || slot < index)
633 goto found; 641 goto cascade;
642 return expires;
634 } 643 }
635 j = (j + 1) & TVR_MASK; 644 slot = (slot + 1) & TVR_MASK;
636 } while (j != (base->timer_jiffies & TVR_MASK)); 645 } while (slot != index);
646
647cascade:
648 /* Calculate the next cascade event */
649 if (index)
650 timer_jiffies += TVR_SIZE - index;
651 timer_jiffies >>= TVR_BITS;
637 652
638 /* Check tv2-tv5. */ 653 /* Check tv2-tv5. */
639 varray[0] = &base->tv2; 654 varray[0] = &base->tv2;
640 varray[1] = &base->tv3; 655 varray[1] = &base->tv3;
641 varray[2] = &base->tv4; 656 varray[2] = &base->tv4;
642 varray[3] = &base->tv5; 657 varray[3] = &base->tv5;
643 for (i = 0; i < 4; i++) { 658
644 j = INDEX(i); 659 for (array = 0; array < 4; array++) {
660 tvec_t *varp = varray[array];
661
662 index = slot = timer_jiffies & TVN_MASK;
645 do { 663 do {
646 if (list_empty(varray[i]->vec + j)) { 664 list_for_each_entry(nte, varp->vec + slot, entry) {
647 j = (j + 1) & TVN_MASK; 665 found = 1;
648 continue;
649 }
650 list_for_each_entry(nte, varray[i]->vec + j, entry)
651 if (time_before(nte->expires, expires)) 666 if (time_before(nte->expires, expires))
652 expires = nte->expires; 667 expires = nte->expires;
653 if (j < (INDEX(i)) && i < 3) 668 }
654 list = varray[i + 1]->vec + (INDEX(i + 1)); 669 /*
655 goto found; 670 * Do we still search for the first timer or are
656 } while (j != (INDEX(i))); 671 * we looking up the cascade buckets ?
657 } 672 */
658found: 673 if (found) {
659 if (list) { 674 /* Look at the cascade bucket(s)? */
660 /* 675 if (!index || slot < index)
661 * The search wrapped. We need to look at the next list 676 break;
662 * from next tv element that would cascade into tv element 677 return expires;
663 * where we found the timer element. 678 }
664 */ 679 slot = (slot + 1) & TVN_MASK;
665 list_for_each_entry(nte, list, entry) { 680 } while (slot != index);
666 if (time_before(nte->expires, expires)) 681
667 expires = nte->expires; 682 if (index)
668 } 683 timer_jiffies += TVN_SIZE - index;
684 timer_jiffies >>= TVN_BITS;
669 } 685 }
670 spin_unlock(&base->lock); 686 return expires;
687}
671 688
672 /* 689/*
673 * It can happen that other CPUs service timer IRQs and increment 690 * Check, if the next hrtimer event is before the next timer wheel
674 * jiffies, but we have not yet got a local timer tick to process 691 * event:
675 * the timer wheels. In that case, the expiry time can be before 692 */
676 * jiffies, but since the high-resolution timer here is relative to 693static unsigned long cmp_next_hrtimer_event(unsigned long now,
677 * jiffies, the default expression when high-resolution timers are 694 unsigned long expires)
678 * not active, 695{
679 * 696 ktime_t hr_delta = hrtimer_get_next_event();
680 * time_before(MAX_JIFFY_OFFSET + jiffies, expires) 697 struct timespec tsdelta;
681 * 698
682 * would falsely evaluate to true. If that is the case, just 699 if (hr_delta.tv64 == KTIME_MAX)
683 * return jiffies so that we can immediately fire the local timer 700 return expires;
684 */
685 if (time_before(expires, jiffies))
686 return jiffies;
687 701
688 if (time_before(hr_expires, expires)) 702 if (hr_delta.tv64 <= TICK_NSEC)
689 return hr_expires; 703 return now;
690 704
705 tsdelta = ktime_to_timespec(hr_delta);
706 now += timespec_to_jiffies(&tsdelta);
707 if (time_before(now, expires))
708 return now;
691 return expires; 709 return expires;
692} 710}
711
712/**
 713 * get_next_timer_interrupt - return the jiffy of the next pending timer
714 */
715unsigned long get_next_timer_interrupt(unsigned long now)
716{
717 tvec_base_t *base = __get_cpu_var(tvec_bases);
718 unsigned long expires;
719
720 spin_lock(&base->lock);
721 expires = __next_timer_interrupt(base);
722 spin_unlock(&base->lock);
723
724 if (time_before_eq(expires, now))
725 return now;
726
727 return cmp_next_hrtimer_event(now, expires);
728}
729
730#ifdef CONFIG_NO_IDLE_HZ
731unsigned long next_timer_interrupt(void)
732{
733 return get_next_timer_interrupt(jiffies);
734}
735#endif
736
693#endif 737#endif
694 738
695/******************************************************************/ 739/******************************************************************/
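
The subtle part of the rework above is the slot arithmetic with which __next_timer_interrupt() decides when to stop searching the current wheel level and move on to the cascade bucket of the next one. The standalone model below reproduces only that arithmetic; it assumes the default (non-CONFIG_BASE_SMALL) values TVR_BITS=8 and TVN_BITS=6, and unlike the real code it does not walk the timer lists hanging off each slot.

#include <stdio.h>

#define TVR_BITS 8			/* default, !CONFIG_BASE_SMALL */
#define TVN_BITS 6
#define TVR_SIZE (1 << TVR_BITS)
#define TVN_SIZE (1 << TVN_BITS)
#define TVR_MASK (TVR_SIZE - 1)
#define TVN_MASK (TVN_SIZE - 1)

int main(void)
{
	unsigned long timer_jiffies = 0x12345;	/* arbitrary example */
	unsigned long index;
	int array;

	index = timer_jiffies & TVR_MASK;
	printf("tv1 search starts at slot %lu\n", index);

	/* Advance to the jiffy at which tv2 next cascades into tv1 */
	if (index)
		timer_jiffies += TVR_SIZE - index;
	timer_jiffies >>= TVR_BITS;

	for (array = 0; array < 4; array++) {
		index = timer_jiffies & TVN_MASK;
		printf("tv%d search starts at slot %lu\n", array + 2, index);
		if (index)
			timer_jiffies += TVN_SIZE - index;
		timer_jiffies >>= TVN_BITS;
	}
	return 0;
}
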
@@ -832,32 +876,35 @@ EXPORT_SYMBOL(do_settimeofday);
832 * 876 *
833 * Accumulates current time interval and initializes new clocksource 877 * Accumulates current time interval and initializes new clocksource
834 */ 878 */
835static int change_clocksource(void) 879static void change_clocksource(void)
836{ 880{
837 struct clocksource *new; 881 struct clocksource *new;
838 cycle_t now; 882 cycle_t now;
839 u64 nsec; 883 u64 nsec;
884
840 new = clocksource_get_next(); 885 new = clocksource_get_next();
841 if (clock != new) { 886
842 now = clocksource_read(new); 887 if (clock == new)
843 nsec = __get_nsec_offset(); 888 return;
844 timespec_add_ns(&xtime, nsec); 889
845 890 now = clocksource_read(new);
846 clock = new; 891 nsec = __get_nsec_offset();
847 clock->cycle_last = now; 892 timespec_add_ns(&xtime, nsec);
848 printk(KERN_INFO "Time: %s clocksource has been installed.\n", 893
849 clock->name); 894 clock = new;
850 return 1; 895 clock->cycle_last = now;
851 } else if (clock->update_callback) { 896
852 return clock->update_callback(); 897 clock->error = 0;
853 } 898 clock->xtime_nsec = 0;
854 return 0; 899 clocksource_calculate_interval(clock, NTP_INTERVAL_LENGTH);
900
901 tick_clock_notify();
902
903 printk(KERN_INFO "Time: %s clocksource has been installed.\n",
904 clock->name);
855} 905}
856#else 906#else
857static inline int change_clocksource(void) 907static inline void change_clocksource(void) { }
858{
859 return 0;
860}
861#endif 908#endif
862 909
863/** 910/**
@@ -871,33 +918,56 @@ int timekeeping_is_continuous(void)
871 do { 918 do {
872 seq = read_seqbegin(&xtime_lock); 919 seq = read_seqbegin(&xtime_lock);
873 920
874 ret = clock->is_continuous; 921 ret = clock->flags & CLOCK_SOURCE_VALID_FOR_HRES;
875 922
876 } while (read_seqretry(&xtime_lock, seq)); 923 } while (read_seqretry(&xtime_lock, seq));
877 924
878 return ret; 925 return ret;
879} 926}
880 927
928/**
929 * read_persistent_clock - Return time in seconds from the persistent clock.
930 *
931 * Weak dummy function for arches that do not yet support it.
932 * Returns seconds from epoch using the battery backed persistent clock.
933 * Returns zero if unsupported.
934 *
935 * XXX - Do be sure to remove it once all arches implement it.
936 */
937unsigned long __attribute__((weak)) read_persistent_clock(void)
938{
939 return 0;
940}
941
881/* 942/*
882 * timekeeping_init - Initializes the clocksource and common timekeeping values 943 * timekeeping_init - Initializes the clocksource and common timekeeping values
883 */ 944 */
884void __init timekeeping_init(void) 945void __init timekeeping_init(void)
885{ 946{
886 unsigned long flags; 947 unsigned long flags;
948 unsigned long sec = read_persistent_clock();
887 949
888 write_seqlock_irqsave(&xtime_lock, flags); 950 write_seqlock_irqsave(&xtime_lock, flags);
889 951
890 ntp_clear(); 952 ntp_clear();
891 953
892 clock = clocksource_get_next(); 954 clock = clocksource_get_next();
893 clocksource_calculate_interval(clock, tick_nsec); 955 clocksource_calculate_interval(clock, NTP_INTERVAL_LENGTH);
894 clock->cycle_last = clocksource_read(clock); 956 clock->cycle_last = clocksource_read(clock);
895 957
958 xtime.tv_sec = sec;
959 xtime.tv_nsec = 0;
960 set_normalized_timespec(&wall_to_monotonic,
961 -xtime.tv_sec, -xtime.tv_nsec);
962
896 write_sequnlock_irqrestore(&xtime_lock, flags); 963 write_sequnlock_irqrestore(&xtime_lock, flags);
897} 964}
898 965
899 966/* flag for if timekeeping is suspended */
900static int timekeeping_suspended; 967static int timekeeping_suspended;
968/* time in seconds when suspend began */
969static unsigned long timekeeping_suspend_time;
970
901/** 971/**
902 * timekeeping_resume - Resumes the generic timekeeping subsystem. 972 * timekeeping_resume - Resumes the generic timekeeping subsystem.
903 * @dev: unused 973 * @dev: unused
@@ -909,13 +979,26 @@ static int timekeeping_suspended;
909static int timekeeping_resume(struct sys_device *dev) 979static int timekeeping_resume(struct sys_device *dev)
910{ 980{
911 unsigned long flags; 981 unsigned long flags;
982 unsigned long now = read_persistent_clock();
912 983
913 write_seqlock_irqsave(&xtime_lock, flags); 984 write_seqlock_irqsave(&xtime_lock, flags);
914 /* restart the last cycle value */ 985
986 if (now && (now > timekeeping_suspend_time)) {
987 unsigned long sleep_length = now - timekeeping_suspend_time;
988
989 xtime.tv_sec += sleep_length;
990 wall_to_monotonic.tv_sec -= sleep_length;
991 }
992 /* re-base the last cycle value */
915 clock->cycle_last = clocksource_read(clock); 993 clock->cycle_last = clocksource_read(clock);
916 clock->error = 0; 994 clock->error = 0;
917 timekeeping_suspended = 0; 995 timekeeping_suspended = 0;
918 write_sequnlock_irqrestore(&xtime_lock, flags); 996 write_sequnlock_irqrestore(&xtime_lock, flags);
997
998 touch_softlockup_watchdog();
999 /* Resume hrtimers */
1000 clock_was_set();
1001
919 return 0; 1002 return 0;
920} 1003}
921 1004
@@ -925,6 +1008,7 @@ static int timekeeping_suspend(struct sys_device *dev, pm_message_t state)
925 1008
926 write_seqlock_irqsave(&xtime_lock, flags); 1009 write_seqlock_irqsave(&xtime_lock, flags);
927 timekeeping_suspended = 1; 1010 timekeeping_suspended = 1;
1011 timekeeping_suspend_time = read_persistent_clock();
928 write_sequnlock_irqrestore(&xtime_lock, flags); 1012 write_sequnlock_irqrestore(&xtime_lock, flags);
929 return 0; 1013 return 0;
930} 1014}
@@ -1089,11 +1173,8 @@ static void update_wall_time(void)
1089 clock->xtime_nsec -= (s64)xtime.tv_nsec << clock->shift; 1173 clock->xtime_nsec -= (s64)xtime.tv_nsec << clock->shift;
1090 1174
1091 /* check to see if there is a new clocksource to use */ 1175 /* check to see if there is a new clocksource to use */
1092 if (change_clocksource()) { 1176 change_clocksource();
1093 clock->error = 0; 1177 update_vsyscall(&xtime, clock);
1094 clock->xtime_nsec = 0;
1095 clocksource_calculate_interval(clock, tick_nsec);
1096 }
1097} 1178}
1098 1179
1099/* 1180/*
@@ -1162,11 +1243,9 @@ static inline void calc_load(unsigned long ticks)
1162 * This read-write spinlock protects us from races in SMP while 1243 * This read-write spinlock protects us from races in SMP while
1163 * playing with xtime and avenrun. 1244 * playing with xtime and avenrun.
1164 */ 1245 */
1165#ifndef ARCH_HAVE_XTIME_LOCK 1246__attribute__((weak)) __cacheline_aligned_in_smp DEFINE_SEQLOCK(xtime_lock);
1166__cacheline_aligned_in_smp DEFINE_SEQLOCK(xtime_lock);
1167 1247
1168EXPORT_SYMBOL(xtime_lock); 1248EXPORT_SYMBOL(xtime_lock);
1169#endif
1170 1249
1171/* 1250/*
1172 * This function runs timers and the timer-tq in bottom half context. 1251 * This function runs timers and the timer-tq in bottom half context.
@@ -1175,7 +1254,8 @@ static void run_timer_softirq(struct softirq_action *h)
1175{ 1254{
1176 tvec_base_t *base = __get_cpu_var(tvec_bases); 1255 tvec_base_t *base = __get_cpu_var(tvec_bases);
1177 1256
1178 hrtimer_run_queues(); 1257 hrtimer_run_queues();
1258
1179 if (time_after_eq(jiffies, base->timer_jiffies)) 1259 if (time_after_eq(jiffies, base->timer_jiffies))
1180 __run_timers(base); 1260 __run_timers(base);
1181} 1261}
@@ -1621,6 +1701,8 @@ void __init init_timers(void)
1621 int err = timer_cpu_notify(&timers_nb, (unsigned long)CPU_UP_PREPARE, 1701 int err = timer_cpu_notify(&timers_nb, (unsigned long)CPU_UP_PREPARE,
1622 (void *)(long)smp_processor_id()); 1702 (void *)(long)smp_processor_id());
1623 1703
1704 init_timer_stats();
1705
1624 BUG_ON(err == NOTIFY_BAD); 1706 BUG_ON(err == NOTIFY_BAD);
1625 register_cpu_notifier(&timers_nb); 1707 register_cpu_notifier(&timers_nb);
1626 open_softirq(TIMER_SOFTIRQ, run_timer_softirq, NULL); 1708 open_softirq(TIMER_SOFTIRQ, run_timer_softirq, NULL);
diff --git a/kernel/tsacct.c b/kernel/tsacct.c
index baacc3691415..658f638c402c 100644
--- a/kernel/tsacct.c
+++ b/kernel/tsacct.c
@@ -22,8 +22,6 @@
22#include <linux/acct.h> 22#include <linux/acct.h>
23#include <linux/jiffies.h> 23#include <linux/jiffies.h>
24 24
25
26#define USEC_PER_TICK (USEC_PER_SEC/HZ)
27/* 25/*
28 * fill in basic accounting fields 26 * fill in basic accounting fields
29 */ 27 */
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 020d1fff57dc..b6fa5e63085d 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -218,7 +218,7 @@ int fastcall queue_work(struct workqueue_struct *wq, struct work_struct *work)
218} 218}
219EXPORT_SYMBOL_GPL(queue_work); 219EXPORT_SYMBOL_GPL(queue_work);
220 220
221static void delayed_work_timer_fn(unsigned long __data) 221void delayed_work_timer_fn(unsigned long __data)
222{ 222{
223 struct delayed_work *dwork = (struct delayed_work *)__data; 223 struct delayed_work *dwork = (struct delayed_work *)__data;
224 struct workqueue_struct *wq = get_wq_data(&dwork->work); 224 struct workqueue_struct *wq = get_wq_data(&dwork->work);
@@ -245,6 +245,7 @@ int fastcall queue_delayed_work(struct workqueue_struct *wq,
245 struct timer_list *timer = &dwork->timer; 245 struct timer_list *timer = &dwork->timer;
246 struct work_struct *work = &dwork->work; 246 struct work_struct *work = &dwork->work;
247 247
248 timer_stats_timer_set_start_info(timer);
248 if (delay == 0) 249 if (delay == 0)
249 return queue_work(wq, work); 250 return queue_work(wq, work);
250 251
@@ -593,8 +594,10 @@ EXPORT_SYMBOL(schedule_work);
593 * After waiting for a given time this puts a job in the kernel-global 594 * After waiting for a given time this puts a job in the kernel-global
594 * workqueue. 595 * workqueue.
595 */ 596 */
596int fastcall schedule_delayed_work(struct delayed_work *dwork, unsigned long delay) 597int fastcall schedule_delayed_work(struct delayed_work *dwork,
598 unsigned long delay)
597{ 599{
600 timer_stats_timer_set_start_info(&dwork->timer);
598 return queue_delayed_work(keventd_wq, dwork, delay); 601 return queue_delayed_work(keventd_wq, dwork, delay);
599} 602}
600EXPORT_SYMBOL(schedule_delayed_work); 603EXPORT_SYMBOL(schedule_delayed_work);