Merge branch 'master' of /home/davem/src/GIT/linux-2.6/

Conflicts: include/net/tcp.h
author: David S. Miller <davem@davemloft.net> 2009-12-11 20:12:17 -0500
committer: David S. Miller <davem@davemloft.net> 2009-12-11 20:12:17 -0500
commit: 501706565b2d4d2d40d0d301d5411ede099b8a6f (patch)
tree: 142a18bf1f1e74a09dbfa27540b893ade0fd797d /kernel
parent: e93737b0f0159a61772894943199fd3b6f315641 (diff)
parent: 2fe77b81c77eed92c4c0439f74c8148a295b4a86 (diff)
21 files changed, 492 insertions, 184 deletions
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 6ba0f1ecb212..7c4e2713df0a 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -392,10 +392,9 @@ int disable_nonboot_cpus(void)
                if (cpu == first_cpu)
                        continue;
                error = _cpu_down(cpu, 1);
-                if (!error) {
+                if (!error)
                        cpumask_set_cpu(cpu, frozen_cpus);
-                        printk("CPU%d is down\n", cpu);
+                else {
-                } else {
                        printk(KERN_ERR "Error taking CPU%d down: %d\n",
                                cpu, error);
                        break;
diff --git a/kernel/exit.c b/kernel/exit.c
index 1143012951e9..6f50ef55a6f3 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -971,7 +971,7 @@ NORET_TYPE void do_exit(long code)
        exit_thread();
        cgroup_exit(tsk, 1);
-        if (group_dead && tsk->signal->leader)
+        if (group_dead)
                disassociate_ctty(1);
        module_put(task_thread_info(tsk)->exec_domain->module);
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index 3e1c36e7998f..ede527708123 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -1238,7 +1238,8 @@ hrtimer_interrupt_hanging(struct clock_event_device *dev,
        force_clock_reprogram = 1;
        dev->min_delta_ns = (unsigned long)try_time.tv64 * 3;
        printk(KERN_WARNING "hrtimer: interrupt too slow, "
-                "forcing clock min delta to %lu ns\n", dev->min_delta_ns);
+               "forcing clock min delta to %llu ns\n",
+               (unsigned long long) dev->min_delta_ns);
 }
 /*
 * High resolution timer interrupt
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index bde4c667d24d..7305b297d1eb 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -1067,7 +1067,7 @@ int request_threaded_irq(unsigned int irq, irq_handler_t handler,
                kfree(action);
 #ifdef CONFIG_DEBUG_SHIRQ
-        if (irqflags & IRQF_SHARED) {
+        if (!retval && (irqflags & IRQF_SHARED)) {
                /*
                 * It's a shared IRQ -- the driver ought to be prepared for it
                 * to happen immediately, so let's make sure....
diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c
index 22b0a6eedf24..e49ea1c5232d 100644
--- a/kernel/irq/spurious.c
+++ b/kernel/irq/spurious.c
@@ -220,7 +220,7 @@ void note_interrupt(unsigned int irq, struct irq_desc *desc,
                /*
                 * If we are seeing only the odd spurious IRQ caused by
                 * bus asynchronicity then don't eventually trigger an error,
-                 * otherwise the couter becomes a doomsday timer for otherwise
+                 * otherwise the counter becomes a doomsday timer for otherwise
                 * working systems
                 */
                if (time_after(jiffies, desc->last_unhandled + HZ/10))
diff --git a/kernel/itimer.c b/kernel/itimer.c
index b03451ede528..d802883153da 100644
--- a/kernel/itimer.c
+++ b/kernel/itimer.c
@@ -146,6 +146,7 @@ static void set_cpu_itimer(struct task_struct *tsk, unsigned int clock_id,
 {
        cputime_t cval, nval, cinterval, ninterval;
        s64 ns_ninterval, ns_nval;
+        u32 error, incr_error;
        struct cpu_itimer *it = &tsk->signal->it[clock_id];
        nval = timeval_to_cputime(&value->it_value);
@@ -153,8 +154,8 @@ static void set_cpu_itimer(struct task_struct *tsk, unsigned int clock_id,
        ninterval = timeval_to_cputime(&value->it_interval);
        ns_ninterval = timeval_to_ns(&value->it_interval);
-        it->incr_error = cputime_sub_ns(ninterval, ns_ninterval);
+        error = cputime_sub_ns(nval, ns_nval);
-        it->error = cputime_sub_ns(nval, ns_nval);
+        incr_error = cputime_sub_ns(ninterval, ns_ninterval);
        spin_lock_irq(&tsk->sighand->siglock);
@@ -168,6 +169,8 @@ static void set_cpu_itimer(struct task_struct *tsk, unsigned int clock_id,
        }
        it->expires = nval;
        it->incr = ninterval;
+        it->error = error;
+        it->incr_error = incr_error;
        trace_itimer_state(clock_id == CPUCLOCK_VIRT ?
                           ITIMER_VIRTUAL : ITIMER_PROF, value, nval);
diff --git a/kernel/kgdb.c b/kernel/kgdb.c
index 7d7014634022..2eb517e23514 100644
--- a/kernel/kgdb.c
+++ b/kernel/kgdb.c
@@ -129,6 +129,7 @@ struct task_struct		*kgdb_usethread;
 struct task_struct              *kgdb_contthread;
 int                             kgdb_single_step;
+pid_t                           kgdb_sstep_pid;
 /* Our I/O buffers. */
 static char                     remcom_in_buffer[BUFMAX];
@@ -541,12 +542,17 @@ static struct task_struct *getthread(struct pt_regs *regs, int tid)
         */
        if (tid == 0 || tid == -1)
                tid = -atomic_read(&kgdb_active) - 2;
-        if (tid < 0) {
+        if (tid < -1 && tid > -NR_CPUS - 2) {
                if (kgdb_info[-tid - 2].task)
                        return kgdb_info[-tid - 2].task;
                else
                        return idle_task(-tid - 2);
        }
+        if (tid <= 0) {
+                printk(KERN_ERR "KGDB: Internal thread select error\n");
+                dump_stack();
+                return NULL;
+        }
        /*
         * find_task_by_pid_ns() does not take the tasklist lock anymore
@@ -619,7 +625,8 @@ static void kgdb_flush_swbreak_addr(unsigned long addr)
 static int kgdb_activate_sw_breakpoints(void)
 {
        unsigned long addr;
-        int error = 0;
+        int error;
+        int ret = 0;
        int i;
        for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) {
@@ -629,13 +636,16 @@ static int kgdb_activate_sw_breakpoints(void)
                addr = kgdb_break[i].bpt_addr;
                error = kgdb_arch_set_breakpoint(addr,
                                kgdb_break[i].saved_instr);
-                if (error)
+                if (error) {
-                        return error;
+                        ret = error;
+                        printk(KERN_INFO "KGDB: BP install failed: %lx", addr);
+                        continue;
+                }
                kgdb_flush_swbreak_addr(addr);
                kgdb_break[i].state = BP_ACTIVE;
        }
-        return 0;
+        return ret;
 }
 static int kgdb_set_sw_break(unsigned long addr)
@@ -682,7 +692,8 @@ static int kgdb_set_sw_break(unsigned long addr)
 static int kgdb_deactivate_sw_breakpoints(void)
 {
        unsigned long addr;
-        int error = 0;
+        int error;
+        int ret = 0;
        int i;
        for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) {
@@ -691,13 +702,15 @@ static int kgdb_deactivate_sw_breakpoints(void)
                addr = kgdb_break[i].bpt_addr;
                error = kgdb_arch_remove_breakpoint(addr,
                                        kgdb_break[i].saved_instr);
-                if (error)
+                if (error) {
-                        return error;
+                        printk(KERN_INFO "KGDB: BP remove failed: %lx\n", addr);
+                        ret = error;
+                }
                kgdb_flush_swbreak_addr(addr);
                kgdb_break[i].state = BP_SET;
        }
-        return 0;
+        return ret;
 }
 static int kgdb_remove_sw_break(unsigned long addr)
@@ -1204,8 +1217,10 @@ static int gdb_cmd_exception_pass(struct kgdb_state *ks)
                return 1;
        } else {
-                error_packet(remcom_out_buffer, -EINVAL);
+                kgdb_msg_write("KGDB only knows signal 9 (pass)"
-                return 0;
+                        " and 15 (pass and disconnect)\n"
+                        "Executing a continue without signal passing\n", 0);
+                remcom_in_buffer[0] = 'c';
        }
        /* Indicate fall through */
@@ -1395,6 +1410,7 @@ kgdb_handle_exception(int evector, int signo, int ecode, struct pt_regs *regs)
        struct kgdb_state kgdb_var;
        struct kgdb_state *ks = &kgdb_var;
        unsigned long flags;
+        int sstep_tries = 100;
        int error = 0;
        int i, cpu;
@@ -1425,13 +1441,14 @@ acquirelock:
                cpu_relax();
        /*
-         * Do not start the debugger connection on this CPU if the last
+         * For single stepping, try to only enter on the processor
-         * instance of the exception handler wanted to come into the
+         * that was single stepping.  To guard against a deadlock, the
-         * debugger on a different CPU via a single step
+         * kernel will only try for the value of sstep_tries before
+         * giving up and continuing on.
         */
        if (atomic_read(&kgdb_cpu_doing_single_step) != -1 &&
-            atomic_read(&kgdb_cpu_doing_single_step) != cpu) {
+            (kgdb_info[cpu].task &&
+             kgdb_info[cpu].task->pid != kgdb_sstep_pid) && --sstep_tries) {
                atomic_set(&kgdb_active, -1);
                touch_softlockup_watchdog();
                clocksource_touch_watchdog();
@@ -1524,6 +1541,13 @@ acquirelock:
        }
 kgdb_restore:
+        if (atomic_read(&kgdb_cpu_doing_single_step) != -1) {
+                int sstep_cpu = atomic_read(&kgdb_cpu_doing_single_step);
+                if (kgdb_info[sstep_cpu].task)
+                        kgdb_sstep_pid = kgdb_info[sstep_cpu].task->pid;
+                else
+                        kgdb_sstep_pid = 0;
+        }
        /* Free kgdb_active */
        atomic_set(&kgdb_active, -1);
        touch_softlockup_watchdog();
diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index 6b7ddba1dd64..40a996ec39fa 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -476,7 +476,7 @@ static void perf_event_remove_from_context(struct perf_event *event)
        if (!task) {
                /*
                 * Per cpu events are removed via an smp call and
-                 * the removal is always sucessful.
+                 * the removal is always successful.
                 */
                smp_call_function_single(event->cpu,
                                         __perf_event_remove_from_context,
@@ -845,7 +845,7 @@ perf_install_in_context(struct perf_event_context *ctx,
        if (!task) {
                /*
                 * Per cpu events are installed via an smp call and
-                 * the install is always sucessful.
+                 * the install is always successful.
                 */
                smp_call_function_single(cpu, __perf_install_in_context,
                                         event, 1);
diff --git a/kernel/pm_qos_params.c b/kernel/pm_qos_params.c
index dfdec524d1b7..3db49b9ca374 100644
--- a/kernel/pm_qos_params.c
+++ b/kernel/pm_qos_params.c
@@ -29,7 +29,6 @@
 #include <linux/pm_qos_params.h>
 #include <linux/sched.h>
-#include <linux/smp_lock.h>
 #include <linux/spinlock.h>
 #include <linux/slab.h>
 #include <linux/time.h>
@@ -344,37 +343,33 @@ int pm_qos_remove_notifier(int pm_qos_class, struct notifier_block *notifier)
 }
 EXPORT_SYMBOL_GPL(pm_qos_remove_notifier);
-#define PID_NAME_LEN sizeof("process_1234567890")
+#define PID_NAME_LEN 32
-static char name[PID_NAME_LEN];
 static int pm_qos_power_open(struct inode *inode, struct file *filp)
 {
        int ret;
        long pm_qos_class;
+        char name[PID_NAME_LEN];
-        lock_kernel();
        pm_qos_class = find_pm_qos_object_by_minor(iminor(inode));
        if (pm_qos_class >= 0) {
                filp->private_data = (void *)pm_qos_class;
-                sprintf(name, "process_%d", current->pid);
+                snprintf(name, PID_NAME_LEN, "process_%d", current->pid);
                ret = pm_qos_add_requirement(pm_qos_class, name,
                                        PM_QOS_DEFAULT_VALUE);
-                if (ret >= 0) {
+                if (ret >= 0)
-                        unlock_kernel();
                        return 0;
-                }
        }
-        unlock_kernel();
        return -EPERM;
 }
 static int pm_qos_power_release(struct inode *inode, struct file *filp)
 {
        int pm_qos_class;
+        char name[PID_NAME_LEN];
        pm_qos_class = (long)filp->private_data;
-        sprintf(name, "process_%d", current->pid);
+        snprintf(name, PID_NAME_LEN, "process_%d", current->pid);
        pm_qos_remove_requirement(pm_qos_class, name);
        return 0;
@@ -385,13 +380,14 @@ static ssize_t pm_qos_power_write(struct file *filp, const char __user *buf,
 {
        s32 value;
        int pm_qos_class;
+        char name[PID_NAME_LEN];
        pm_qos_class = (long)filp->private_data;
        if (count != sizeof(s32))
                return -EINVAL;
        if (copy_from_user(&value, buf, sizeof(s32)))
                return -EFAULT;
-        sprintf(name, "process_%d", current->pid);
+        snprintf(name, PID_NAME_LEN, "process_%d", current->pid);
        pm_qos_update_requirement(pm_qos_class, name, value);
        return  sizeof(s32);
diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c
index 5c9dc228747b..438ff4523513 100644
--- a/kernel/posix-cpu-timers.c
+++ b/kernel/posix-cpu-timers.c
@@ -384,7 +384,8 @@ int posix_cpu_clock_get(const clockid_t which_clock, struct timespec *tp)
 /*
 * Validate the clockid_t for a new CPU-clock timer, and initialize the timer.
- * This is called from sys_timer_create with the new timer already locked.
+ * This is called from sys_timer_create() and do_cpu_nanosleep() with the
+ * new timer already all-zeros initialized.
 */
 int posix_cpu_timer_create(struct k_itimer *new_timer)
 {
@@ -396,8 +397,6 @@ int posix_cpu_timer_create(struct k_itimer *new_timer)
                return -EINVAL;
        INIT_LIST_HEAD(&new_timer->it.cpu.entry);
-        new_timer->it.cpu.incr.sched = 0;
-        new_timer->it.cpu.expires.sched = 0;
        read_lock(&tasklist_lock);
        if (CPUCLOCK_PERTHREAD(new_timer->it_clock)) {
diff --git a/kernel/resource.c b/kernel/resource.c
index fb11a58b9594..dc15686b7a77 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -308,35 +308,37 @@ static int find_resource(struct resource *root, struct resource *new,
                         void *alignf_data)
 {
        struct resource *this = root->child;
+        resource_size_t start, end;
-        new->start = root->start;
+        start = root->start;
        /*
         * Skip past an allocated resource that starts at 0, since the assignment
         * of this->start - 1 to new->end below would cause an underflow.
         */
        if (this && this->start == 0) {
-                new->start = this->end + 1;
+                start = this->end + 1;
                this = this->sibling;
        }
        for(;;) {
                if (this)
-                        new->end = this->start - 1;
+                        end = this->start - 1;
                else
-                        new->end = root->end;
+                        end = root->end;
-                if (new->start < min)
+                if (start < min)
-                        new->start = min;
+                        start = min;
-                if (new->end > max)
+                if (end > max)
-                        new->end = max;
+                        end = max;
-                new->start = ALIGN(new->start, align);
+                start = ALIGN(start, align);
                if (alignf)
                        alignf(alignf_data, new, size, align);
-                if (new->start < new->end && new->end - new->start >= size - 1) {
+                if (start < end && end - start >= size - 1) {
-                        new->end = new->start + size - 1;
+                        new->start = start;
+                        new->end = start + size - 1;
                        return 0;
                }
                if (!this)
                        break;
-                new->start = this->end + 1;
+                start = this->end + 1;
                this = this->sibling;
        }
        return -EBUSY;
diff --git a/kernel/sys.c b/kernel/sys.c
index 9968c5fb55b9..585d6cd10040 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -8,7 +8,6 @@
 #include <linux/mm.h>
 #include <linux/utsname.h>
 #include <linux/mman.h>
-#include <linux/smp_lock.h>
 #include <linux/notifier.h>
 #include <linux/reboot.h>
 #include <linux/prctl.h>
@@ -349,6 +348,9 @@ void kernel_power_off(void)
        machine_power_off();
 }
 EXPORT_SYMBOL_GPL(kernel_power_off);
+static DEFINE_MUTEX(reboot_mutex);
 /*
 * Reboot system call: for obvious reasons only root may call it,
 * and even root needs to set up some magic numbers in the registers
@@ -381,7 +383,7 @@ SYSCALL_DEFINE4(reboot, int, magic1, int, magic2, unsigned int, cmd,
        if ((cmd == LINUX_REBOOT_CMD_POWER_OFF) && !pm_power_off)
                cmd = LINUX_REBOOT_CMD_HALT;
-        lock_kernel();
+        mutex_lock(&reboot_mutex);
        switch (cmd) {
        case LINUX_REBOOT_CMD_RESTART:
                kernel_restart(NULL);
@@ -397,20 +399,18 @@ SYSCALL_DEFINE4(reboot, int, magic1, int, magic2, unsigned int, cmd,
        case LINUX_REBOOT_CMD_HALT:
                kernel_halt();
-                unlock_kernel();
                do_exit(0);
                panic("cannot halt");
        case LINUX_REBOOT_CMD_POWER_OFF:
                kernel_power_off();
-                unlock_kernel();
                do_exit(0);
                break;
        case LINUX_REBOOT_CMD_RESTART2:
                if (strncpy_from_user(&buffer[0], arg, sizeof(buffer) - 1) < 0) {
-                        unlock_kernel();
+                        ret = -EFAULT;
-                        return -EFAULT;
+                        break;
                }
                buffer[sizeof(buffer) - 1] = '\0';
@@ -433,7 +433,7 @@ SYSCALL_DEFINE4(reboot, int, magic1, int, magic2, unsigned int, cmd,
                ret = -EINVAL;
                break;
        }
-        unlock_kernel();
+        mutex_unlock(&reboot_mutex);
        return ret;
 }
diff --git a/kernel/time.c b/kernel/time.c
index 804798005d19..c6324d96009e 100644
--- a/kernel/time.c
+++ b/kernel/time.c
@@ -136,7 +136,6 @@ static inline void warp_clock(void)
        write_seqlock_irq(&xtime_lock);
        wall_to_monotonic.tv_sec -= sys_tz.tz_minuteswest * 60;
        xtime.tv_sec += sys_tz.tz_minuteswest * 60;
-        update_xtime_cache(0);
        write_sequnlock_irq(&xtime_lock);
        clock_was_set();
 }
diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c
index 620b58abdc32..20a8920029ee 100644
--- a/kernel/time/clockevents.c
+++ b/kernel/time/clockevents.c
@@ -20,6 +20,8 @@
 #include <linux/sysdev.h>
 #include <linux/tick.h>
+#include "tick-internal.h"
 /* The registered clock event devices */
 static LIST_HEAD(clockevent_devices);
 static LIST_HEAD(clockevents_released);
@@ -37,10 +39,9 @@ static DEFINE_SPINLOCK(clockevents_lock);
 *
 * Math helper, returns latch value converted to nanoseconds (bound checked)
 */
-unsigned long clockevent_delta2ns(unsigned long latch,
+u64 clockevent_delta2ns(unsigned long latch, struct clock_event_device *evt)
-                                  struct clock_event_device *evt)
 {
-        u64 clc = ((u64) latch << evt->shift);
+        u64 clc = (u64) latch << evt->shift;
        if (unlikely(!evt->mult)) {
                evt->mult = 1;
@@ -50,10 +51,10 @@ unsigned long clockevent_delta2ns(unsigned long latch,
        do_div(clc, evt->mult);
        if (clc < 1000)
                clc = 1000;
-        if (clc > LONG_MAX)
+        if (clc > KTIME_MAX)
-                clc = LONG_MAX;
+                clc = KTIME_MAX;
-        return (unsigned long) clc;
+        return clc;
 }
 EXPORT_SYMBOL_GPL(clockevent_delta2ns);
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index 4a310906b3e8..e85c23404d34 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -107,6 +107,59 @@ u64 timecounter_cyc2time(struct timecounter *tc,
 }
 EXPORT_SYMBOL_GPL(timecounter_cyc2time);
+/**
+ * clocks_calc_mult_shift - calculate mult/shift factors for scaled math of clocks
+ * @mult:       pointer to mult variable
+ * @shift:      pointer to shift variable
+ * @from:       frequency to convert from
+ * @to:         frequency to convert to
+ * @minsec:     guaranteed runtime conversion range in seconds
+ *
+ * The function evaluates the shift/mult pair for the scaled math
+ * operations of clocksources and clockevents.
+ *
+ * @to and @from are frequency values in HZ. For clock sources @to is
+ * NSEC_PER_SEC == 1GHz and @from is the counter frequency. For clock
+ * event @to is the counter frequency and @from is NSEC_PER_SEC.
+ *
+ * The @minsec conversion range argument controls the time frame in
+ * seconds which must be covered by the runtime conversion with the
+ * calculated mult and shift factors. This guarantees that no 64bit
+ * overflow happens when the input value of the conversion is
+ * multiplied with the calculated mult factor. Larger ranges may
+ * reduce the conversion accuracy by choosing smaller mult and shift
+ * factors.
+ *
+ * The results are stored through @mult and @shift; nothing is returned.
+ * @from is used as a divisor, so it must not be zero.
+ */
+void
+clocks_calc_mult_shift(u32 *mult, u32 *shift, u32 from, u32 to, u32 minsec)
+{
+        u64 tmp;
+        u32 sft, sftacc= 32;
+        /*
+         * Calculate the shift factor which is limiting the conversion
+         * range: sftacc starts at 32 and loses one bit for every
+         * significant bit found in the upper 32 bits of minsec * from.
+         */
+        tmp = ((u64)minsec * from) >> 32;
+        while (tmp) {
+                tmp >>=1;
+                sftacc--;
+        }
+        /*
+         * Find the conversion shift/mult pair which has the best
+         * accuracy and fits the minsec conversion range: try the
+         * largest shift first and stop at the first one whose
+         * quotient (to << sft) / from has no bits at or above sftacc.
+         */
+        for (sft = 32; sft > 0; sft--) {
+                tmp = (u64) to << sft;
+                do_div(tmp, from);
+                if ((tmp >> sftacc) == 0)
+                        break;
+        }
+        /*
+         * NOTE(review): if no shift satisfied the check, the loop ends
+         * with sft == 0 while tmp still holds the quotient computed for
+         * sft == 1, and the store below truncates tmp to 32 bits.
+         * Confirm callers never hit this case.
+         */
+        *mult = tmp;
+        *shift = sft;
+}
 /*[Clocksource internal variables]---------
 * curr_clocksource:
 *      currently selected clocksource.
@@ -413,6 +466,47 @@ void clocksource_touch_watchdog(void)
        clocksource_resume_watchdog();
 }
+/**
+ * clocksource_max_deferment - Returns max time the clocksource can be deferred
+ * @cs:         Pointer to clocksource
+ *
+ * Returns the result of converting the largest safe cycle count with
+ * clocksource_cyc2ns(), reduced by a 12.5% safety margin.
+ */
+static u64 clocksource_max_deferment(struct clocksource *cs)
+{
+        u64 max_nsecs, max_cycles;
+        /*
+         * Calculate the maximum number of cycles that we can pass to the
+         * cyc2ns function without overflowing a 64-bit signed result. The
+         * maximum number of cycles is equal to ULLONG_MAX/cs->mult which
+         * is equivalent to the below.
+         * max_cycles < (2^63)/cs->mult
+         * max_cycles < 2^(log2((2^63)/cs->mult))
+         * max_cycles < 2^(log2(2^63) - log2(cs->mult))
+         * max_cycles < 2^(63 - log2(cs->mult))
+         * max_cycles < 1 << (63 - log2(cs->mult))
+         * Please note that we add 1 to the result of the log2 to account for
+         * any rounding errors, ensure the above inequality is satisfied and
+         * no overflow will occur.
+         */
+        max_cycles = 1ULL << (63 - (ilog2(cs->mult) + 1));
+        /*
+         * The actual maximum number of cycles we can defer the clocksource is
+         * determined by the minimum of max_cycles and cs->mask.
+         */
+        max_cycles = min_t(u64, max_cycles, (u64) cs->mask);
+        max_nsecs = clocksource_cyc2ns(max_cycles, cs->mult, cs->shift);
+        /*
+         * To ensure that the clocksource does not wrap whilst we are idle,
+         * limit the time the clocksource can be deferred by 12.5%. Please
+         * note a margin of 12.5% is used because this can be computed with
+         * a shift, versus say 10% which would require division.
+         * 12.5% is 1/8 of the value, i.e. a right shift by 3 (a shift by
+         * 5 would only subtract 1/32 = 3.125%).
+         */
+        return max_nsecs - (max_nsecs >> 3);
+}
 #ifdef CONFIG_GENERIC_TIME
 /**
@@ -511,6 +605,9 @@ static void clocksource_enqueue(struct clocksource *cs)
 */
 int clocksource_register(struct clocksource *cs)
 {
+        /* calculate max idle time permitted for this clocksource */
+        cs->max_idle_ns = clocksource_max_deferment(cs);
        mutex_lock(&clocksource_mutex);
        clocksource_enqueue(cs);
        clocksource_select();
@@ -580,7 +677,7 @@ sysfs_show_current_clocksources(struct sys_device *dev,
 * @count:      length of buffer
 *
 * Takes input from sysfs interface for manually overriding the default
- * clocksource selction.
+ * clocksource selection.
 */
 static ssize_t sysfs_override_clocksource(struct sys_device *dev,
                                          struct sysdev_attribute *attr,
diff --git a/kernel/time/tick-oneshot.c b/kernel/time/tick-oneshot.c
index a96c0e2b89cf..0a8a213016f0 100644
--- a/kernel/time/tick-oneshot.c
+++ b/kernel/time/tick-oneshot.c
@@ -50,9 +50,9 @@ int tick_dev_program_event(struct clock_event_device *dev, ktime_t expires,
                                dev->min_delta_ns += dev->min_delta_ns >> 1;
                        printk(KERN_WARNING
-                               "CE: %s increasing min_delta_ns to %lu nsec\n",
+                               "CE: %s increasing min_delta_ns to %llu nsec\n",
                               dev->name ? dev->name : "?",
-                               dev->min_delta_ns << 1);
+                               (unsigned long long) dev->min_delta_ns << 1);
                        i = 0;
                }
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 89aed5933ed4..f992762d7f51 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -134,18 +134,13 @@ __setup("nohz=", setup_tick_nohz);
 * value. We do this unconditionally on any cpu, as we don't know whether the
 * cpu, which has the update task assigned is in a long sleep.
 */
-static void tick_nohz_update_jiffies(void)
+static void tick_nohz_update_jiffies(ktime_t now)
 {
        int cpu = smp_processor_id();
        struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
        unsigned long flags;
-        ktime_t now;
-        if (!ts->tick_stopped)
-                return;
        cpumask_clear_cpu(cpu, nohz_cpu_mask);
-        now = ktime_get();
        ts->idle_waketime = now;
        local_irq_save(flags);
@@ -155,20 +150,17 @@ static void tick_nohz_update_jiffies(void)
        touch_softlockup_watchdog();
 }
-static void tick_nohz_stop_idle(int cpu)
+static void tick_nohz_stop_idle(int cpu, ktime_t now)
 {
        struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
+        ktime_t delta;
-        if (ts->idle_active) {
+        delta = ktime_sub(now, ts->idle_entrytime);
-                ktime_t now, delta;
+        ts->idle_lastupdate = now;
-                now = ktime_get();
+        ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta);
-                delta = ktime_sub(now, ts->idle_entrytime);
+        ts->idle_active = 0;
-                ts->idle_lastupdate = now;
-                ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta);
-                ts->idle_active = 0;
-                sched_clock_idle_wakeup_event(0);
+        sched_clock_idle_wakeup_event(0);
-        }
 }
 static ktime_t tick_nohz_start_idle(struct tick_sched *ts)
@@ -216,6 +208,7 @@ void tick_nohz_stop_sched_tick(int inidle)
        struct tick_sched *ts;
        ktime_t last_update, expires, now;
        struct clock_event_device *dev = __get_cpu_var(tick_cpu_device).evtdev;
+        u64 time_delta;
        int cpu;
        local_irq_save(flags);
@@ -263,7 +256,7 @@ void tick_nohz_stop_sched_tick(int inidle)
                if (ratelimit < 10) {
                        printk(KERN_ERR "NOHZ: local_softirq_pending %02x\n",
-                               local_softirq_pending());
+                               (unsigned int) local_softirq_pending());
                        ratelimit++;
                }
                goto end;
@@ -275,14 +268,18 @@ void tick_nohz_stop_sched_tick(int inidle)
                seq = read_seqbegin(&xtime_lock);
                last_update = last_jiffies_update;
                last_jiffies = jiffies;
+                time_delta = timekeeping_max_deferment();
        } while (read_seqretry(&xtime_lock, seq));
-        /* Get the next timer wheel timer */
+        if (rcu_needs_cpu(cpu) || printk_needs_cpu(cpu) ||
-        next_jiffies = get_next_timer_interrupt(last_jiffies);
+            arch_needs_cpu(cpu)) {
-        delta_jiffies = next_jiffies - last_jiffies;
+                next_jiffies = last_jiffies + 1;
-        if (rcu_needs_cpu(cpu) || printk_needs_cpu(cpu))
                delta_jiffies = 1;
+        } else {
+                /* Get the next timer wheel timer */
+                next_jiffies = get_next_timer_interrupt(last_jiffies);
+                delta_jiffies = next_jiffies - last_jiffies;
+        }
        /*
         * Do not stop the tick, if we are only one off
         * or if the cpu is required for rcu
@@ -294,22 +291,51 @@ void tick_nohz_stop_sched_tick(int inidle)
        if ((long)delta_jiffies >= 1) {
                /*
-                * calculate the expiry time for the next timer wheel
-                * timer
-                */
-                expires = ktime_add_ns(last_update, tick_period.tv64 *
-                                   delta_jiffies);
-                /*
                 * If this cpu is the one which updates jiffies, then
                 * give up the assignment and let it be taken by the
                 * cpu which runs the tick timer next, which might be
                 * this cpu as well. If we don't drop this here the
                 * jiffies might be stale and do_timer() never
-                 * invoked.
+                 * invoked. Keep track of the fact that it was the one
+                 * which had the do_timer() duty last. If this cpu is
+                 * the one which had the do_timer() duty last, we
+                 * limit the sleep time to the timekeeping
+                 * max_deferment value which we retrieved
+                 * above. Otherwise we can sleep as long as we want.
                 */
-                if (cpu == tick_do_timer_cpu)
+                if (cpu == tick_do_timer_cpu) {
                        tick_do_timer_cpu = TICK_DO_TIMER_NONE;
+                        ts->do_timer_last = 1;
+                } else if (tick_do_timer_cpu != TICK_DO_TIMER_NONE) {
+                        time_delta = KTIME_MAX;
+                        ts->do_timer_last = 0;
+                } else if (!ts->do_timer_last) {
+                        time_delta = KTIME_MAX;
+                }
+                /*
+                 * calculate the expiry time for the next timer wheel
+                 * timer. delta_jiffies >= NEXT_TIMER_MAX_DELTA signals
+                 * that there is no timer pending or at least extremely
+                 * far into the future (12 days for HZ=1000). In this
+                 * case we set the expiry to the end of time.
+                 */
+                if (likely(delta_jiffies < NEXT_TIMER_MAX_DELTA)) {
+                        /*
+                         * Calculate the time delta for the next timer event.
+                         * If the time delta exceeds the maximum time delta
+                         * permitted by the current clocksource then adjust
+                         * the time delta accordingly to ensure the
+                         * clocksource does not wrap.
+                         */
+                        time_delta = min_t(u64, time_delta,
+                                           tick_period.tv64 * delta_jiffies);
+                }
+                if (time_delta < KTIME_MAX)
+                        expires = ktime_add_ns(last_update, time_delta);
+                else
+                        expires.tv64 = KTIME_MAX;
                if (delta_jiffies > 1)
                        cpumask_set_cpu(cpu, nohz_cpu_mask);
@@ -342,22 +368,19 @@ void tick_nohz_stop_sched_tick(int inidle)
                ts->idle_sleeps++;
+                /* Mark expires */
+                ts->idle_expires = expires;
                /*
-                 * delta_jiffies >= NEXT_TIMER_MAX_DELTA signals that
+                 * If the expiration time == KTIME_MAX, then
-                 * there is no timer pending or at least extremly far
+                 * in this case we simply stop the tick timer.
-                 * into the future (12 days for HZ=1000). In this case
-                 * we simply stop the tick timer:
                 */
-                if (unlikely(delta_jiffies >= NEXT_TIMER_MAX_DELTA)) {
+                 if (unlikely(expires.tv64 == KTIME_MAX)) {
-                        ts->idle_expires.tv64 = KTIME_MAX;
                        if (ts->nohz_mode == NOHZ_MODE_HIGHRES)
                                hrtimer_cancel(&ts->sched_timer);
                        goto out;
                }
-                /* Mark expiries */
-                ts->idle_expires = expires;
                if (ts->nohz_mode == NOHZ_MODE_HIGHRES) {
                        hrtimer_start(&ts->sched_timer, expires,
                                      HRTIMER_MODE_ABS_PINNED);
@@ -436,7 +459,11 @@ void tick_nohz_restart_sched_tick(void)
        ktime_t now;
        local_irq_disable();
-        tick_nohz_stop_idle(cpu);
+        if (ts->idle_active || (ts->inidle && ts->tick_stopped))
+                now = ktime_get();
+        if (ts->idle_active)
+                tick_nohz_stop_idle(cpu, now);
        if (!ts->inidle || !ts->tick_stopped) {
                ts->inidle = 0;
@@ -450,7 +477,6 @@ void tick_nohz_restart_sched_tick(void)
        /* Update jiffies first */
        select_nohz_load_balancer(0);
-        now = ktime_get();
        tick_do_update_jiffies64(now);
        cpumask_clear_cpu(cpu, nohz_cpu_mask);
@@ -584,22 +610,18 @@ static void tick_nohz_switch_to_nohz(void)
 * timer and do not touch the other magic bits which need to be done
 * when idle is left.
 */
-static void tick_nohz_kick_tick(int cpu)
+static void tick_nohz_kick_tick(int cpu, ktime_t now)
 {
 #if 0
        /* Switch back to 2.6.27 behaviour */
        struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
-        ktime_t delta, now;
+        ktime_t delta;
-        if (!ts->tick_stopped)
-                return;
        /*
         * Do not touch the tick device, when the next expiry is either
         * already reached or less/equal than the tick period.
         */
-        now = ktime_get();
        delta = ktime_sub(hrtimer_get_expires(&ts->sched_timer), now);
        if (delta.tv64 <= tick_period.tv64)
                return;
@@ -608,9 +630,26 @@ static void tick_nohz_kick_tick(int cpu)
 #endif
 }
+static inline void tick_check_nohz(int cpu)
+{
+        struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
+        ktime_t now;
+        if (!ts->idle_active && !ts->tick_stopped)
+                return;
+        now = ktime_get();
+        if (ts->idle_active)
+                tick_nohz_stop_idle(cpu, now);
+        if (ts->tick_stopped) {
+                tick_nohz_update_jiffies(now);
+                tick_nohz_kick_tick(cpu, now);
+        }
+}
 #else
 static inline void tick_nohz_switch_to_nohz(void) { }
+static inline void tick_check_nohz(int cpu) { }
 #endif /* NO_HZ */
@@ -620,11 +659,7 @@ static inline void tick_nohz_switch_to_nohz(void) { }
 void tick_check_idle(int cpu)
 {
        tick_check_oneshot_broadcast(cpu);
-#ifdef CONFIG_NO_HZ
+        tick_check_nohz(cpu);
-        tick_nohz_stop_idle(cpu);
-        tick_nohz_update_jiffies();
-        tick_nohz_kick_tick(cpu);
-#endif
 }
 /*
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index c3a4e2907eaa..af4135f05825 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -165,19 +165,12 @@ struct timespec raw_time;
 /* flag for if timekeeping is suspended */
 int __read_mostly timekeeping_suspended;
-static struct timespec xtime_cache __attribute__ ((aligned (16)));
-void update_xtime_cache(u64 nsec)
-{
-        xtime_cache = xtime;
-        timespec_add_ns(&xtime_cache, nsec);
-}
 /* must hold xtime_lock */
 void timekeeping_leap_insert(int leapsecond)
 {
        xtime.tv_sec += leapsecond;
        wall_to_monotonic.tv_sec -= leapsecond;
-        update_vsyscall(&xtime, timekeeper.clock);
+        update_vsyscall(&xtime, timekeeper.clock, timekeeper.mult);
 }
 #ifdef CONFIG_GENERIC_TIME
@@ -332,12 +325,10 @@ int do_settimeofday(struct timespec *tv)
        xtime = *tv;
-        update_xtime_cache(0);
        timekeeper.ntp_error = 0;
        ntp_clear();
-        update_vsyscall(&xtime, timekeeper.clock);
+        update_vsyscall(&xtime, timekeeper.clock, timekeeper.mult);
        write_sequnlock_irqrestore(&xtime_lock, flags);
@@ -488,6 +479,17 @@ int timekeeping_valid_for_hres(void)
 }
 /**
+ * timekeeping_max_deferment - Returns max time the clocksource can be deferred
+ *
+ * Caller must observe xtime_lock via read_seqbegin/read_seqretry to
+ * ensure that the clocksource does not change!
+ */
+u64 timekeeping_max_deferment(void)
+{
+        return timekeeper.clock->max_idle_ns;
+}
+/**
 * read_persistent_clock -  Return time from the persistent clock.
 *
 * Weak dummy function for arches that do not yet support it.
@@ -548,7 +550,6 @@ void __init timekeeping_init(void)
        }
        set_normalized_timespec(&wall_to_monotonic,
                                -boot.tv_sec, -boot.tv_nsec);
-        update_xtime_cache(0);
        total_sleep_time.tv_sec = 0;
        total_sleep_time.tv_nsec = 0;
        write_sequnlock_irqrestore(&xtime_lock, flags);
@@ -582,7 +583,6 @@ static int timekeeping_resume(struct sys_device *dev)
                wall_to_monotonic = timespec_sub(wall_to_monotonic, ts);
                total_sleep_time = timespec_add_safe(total_sleep_time, ts);
        }
-        update_xtime_cache(0);
        /* re-base the last cycle value */
        timekeeper.clock->cycle_last = timekeeper.clock->read(timekeeper.clock);
        timekeeper.ntp_error = 0;
@@ -723,6 +723,49 @@ static void timekeeping_adjust(s64 offset)
 }
 /**
+ * logarithmic_accumulation - shifted accumulation of cycles
+ *
+ * This function accumulates a shifted interval of cycles into
+ * a shifted interval of nanoseconds. Allows for an O(log) accumulation
+ * loop.
+ *
+ * Returns the unconsumed cycles.
+ */
+static cycle_t logarithmic_accumulation(cycle_t offset, int shift)
+{
+        u64 nsecps = (u64)NSEC_PER_SEC << timekeeper.shift;
+        /* If the offset is smaller than a shifted interval, do nothing */
+        if (offset < timekeeper.cycle_interval<<shift)
+                return offset;
+        /* Accumulate one shifted interval */
+        offset -= timekeeper.cycle_interval << shift;
+        timekeeper.clock->cycle_last += timekeeper.cycle_interval << shift;
+        timekeeper.xtime_nsec += timekeeper.xtime_interval << shift;
+        while (timekeeper.xtime_nsec >= nsecps) {
+                timekeeper.xtime_nsec -= nsecps;
+                xtime.tv_sec++;
+                second_overflow();
+        }
+        /* Accumulate into raw time */
+        raw_time.tv_nsec += timekeeper.raw_interval << shift;;
+        while (raw_time.tv_nsec >= NSEC_PER_SEC) {
+                raw_time.tv_nsec -= NSEC_PER_SEC;
+                raw_time.tv_sec++;
+        }
+        /* Accumulate error between NTP and clock interval */
+        timekeeper.ntp_error += tick_length << shift;
+        timekeeper.ntp_error -= timekeeper.xtime_interval <<
+                                (timekeeper.ntp_error_shift + shift);
+        return offset;
+}
+/**
 * update_wall_time - Uses the current clocksource to increment the wall time
 *
 * Called from the timer interrupt, must hold a write on xtime_lock.
@@ -731,7 +774,7 @@ void update_wall_time(void)
 {
        struct clocksource *clock;
        cycle_t offset;
-        u64 nsecs;
+        int shift = 0, maxshift;
        /* Make sure we're fully resumed: */
        if (unlikely(timekeeping_suspended))
@@ -745,33 +788,22 @@ void update_wall_time(void)
 #endif
        timekeeper.xtime_nsec = (s64)xtime.tv_nsec << timekeeper.shift;
-        /* normally this loop will run just once, however in the
+        /*
-         * case of lost or late ticks, it will accumulate correctly.
+         * With NO_HZ we may have to accumulate many cycle_intervals
+         * (think "ticks") worth of time at once. To do this efficiently,
+         * we calculate the largest doubling multiple of cycle_intervals
+         * that is smaller than the offset. We then accumulate that
+         * chunk in one go, and then try to consume the next smaller
+         * doubled multiple.
         */
+        shift = ilog2(offset) - ilog2(timekeeper.cycle_interval);
+        shift = max(0, shift);
+        /* Bound shift to one less than what overflows tick_length */
+        maxshift = (8*sizeof(tick_length) - (ilog2(tick_length)+1)) - 1;
+        shift = min(shift, maxshift);
        while (offset >= timekeeper.cycle_interval) {
-                u64 nsecps = (u64)NSEC_PER_SEC << timekeeper.shift;
+                offset = logarithmic_accumulation(offset, shift);
+                shift--;
-                /* accumulate one interval */
-                offset -= timekeeper.cycle_interval;
-                clock->cycle_last += timekeeper.cycle_interval;
-                timekeeper.xtime_nsec += timekeeper.xtime_interval;
-                if (timekeeper.xtime_nsec >= nsecps) {
-                        timekeeper.xtime_nsec -= nsecps;
-                        xtime.tv_sec++;
-                        second_overflow();
-                }
-                raw_time.tv_nsec += timekeeper.raw_interval;
-                if (raw_time.tv_nsec >= NSEC_PER_SEC) {
-                        raw_time.tv_nsec -= NSEC_PER_SEC;
-                        raw_time.tv_sec++;
-                }
-                /* accumulate error between NTP and clock interval */
-                timekeeper.ntp_error += tick_length;
-                timekeeper.ntp_error -= timekeeper.xtime_interval <<
-                                        timekeeper.ntp_error_shift;
        }
        /* correct the clock when NTP error is too big */
@@ -807,11 +839,8 @@ void update_wall_time(void)
        timekeeper.ntp_error += timekeeper.xtime_nsec <<
                                timekeeper.ntp_error_shift;
-        nsecs = clocksource_cyc2ns(offset, timekeeper.mult, timekeeper.shift);
-        update_xtime_cache(nsecs);
        /* check to see if there is a new clocksource to use */
-        update_vsyscall(&xtime, timekeeper.clock);
+        update_vsyscall(&xtime, timekeeper.clock, timekeeper.mult);
 }
 /**
@@ -846,13 +875,13 @@ void monotonic_to_bootbased(struct timespec *ts)
 unsigned long get_seconds(void)
 {
-        return xtime_cache.tv_sec;
+        return xtime.tv_sec;
 }
 EXPORT_SYMBOL(get_seconds);
 struct timespec __current_kernel_time(void)
 {
-        return xtime_cache;
+        return xtime;
 }
 struct timespec current_kernel_time(void)
@@ -862,8 +891,7 @@ struct timespec current_kernel_time(void)
        do {
                seq = read_seqbegin(&xtime_lock);
+                now = xtime;
-                now = xtime_cache;
        } while (read_seqretry(&xtime_lock, seq));
        return now;
@@ -877,8 +905,7 @@ struct timespec get_monotonic_coarse(void)
        do {
                seq = read_seqbegin(&xtime_lock);
+                now = xtime;
-                now = xtime_cache;
                mono = wall_to_monotonic;
        } while (read_seqretry(&xtime_lock, seq));
diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c
index 1b5b7aa2fdfd..665c76edbf17 100644
--- a/kernel/time/timer_list.c
+++ b/kernel/time/timer_list.c
@@ -204,10 +204,12 @@ print_tickdevice(struct seq_file *m, struct tick_device *td, int cpu)
                return;
        }
        SEQ_printf(m, "%s\n", dev->name);
-        SEQ_printf(m, " max_delta_ns:   %lu\n", dev->max_delta_ns);
+        SEQ_printf(m, " max_delta_ns:   %llu\n",
-        SEQ_printf(m, " min_delta_ns:   %lu\n", dev->min_delta_ns);
+                   (unsigned long long) dev->max_delta_ns);
-        SEQ_printf(m, " mult:           %lu\n", dev->mult);
+        SEQ_printf(m, " min_delta_ns:   %llu\n",
-        SEQ_printf(m, " shift:          %d\n", dev->shift);
+                   (unsigned long long) dev->min_delta_ns);
+        SEQ_printf(m, " mult:           %u\n", dev->mult);
+        SEQ_printf(m, " shift:          %u\n", dev->shift);
        SEQ_printf(m, " mode:           %d\n", dev->mode);
        SEQ_printf(m, " next_event:     %Ld nsecs\n",
                   (unsigned long long) ktime_to_ns(dev->next_event));
diff --git a/kernel/user-return-notifier.c b/kernel/user-return-notifier.c
index 03e2d6fd9b18..eb27fd3430a2 100644
--- a/kernel/user-return-notifier.c
+++ b/kernel/user-return-notifier.c
@@ -6,8 +6,6 @@
 static DEFINE_PER_CPU(struct hlist_head, return_notifier_list);
-#define URN_LIST_HEAD per_cpu(return_notifier_list, raw_smp_processor_id())
 /*
 * Request a notification when the current cpu returns to userspace.  Must be
 * called in atomic context.  The notifier will also be called in atomic
@@ -16,7 +14,7 @@ static DEFINE_PER_CPU(struct hlist_head, return_notifier_list);
 void user_return_notifier_register(struct user_return_notifier *urn)
 {
        set_tsk_thread_flag(current, TIF_USER_RETURN_NOTIFY);
-        hlist_add_head(&urn->link, &URN_LIST_HEAD);
+        hlist_add_head(&urn->link, &__get_cpu_var(return_notifier_list));
 }
 EXPORT_SYMBOL_GPL(user_return_notifier_register);
@@ -27,7 +25,7 @@ EXPORT_SYMBOL_GPL(user_return_notifier_register);
 void user_return_notifier_unregister(struct user_return_notifier *urn)
 {
        hlist_del(&urn->link);
-        if (hlist_empty(&URN_LIST_HEAD))
+        if (hlist_empty(&__get_cpu_var(return_notifier_list)))
                clear_tsk_thread_flag(current, TIF_USER_RETURN_NOTIFY);
 }
 EXPORT_SYMBOL_GPL(user_return_notifier_unregister);
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 67e526b6ae81..dee48658805c 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -68,6 +68,116 @@ struct workqueue_struct {
 #endif
 };
+#ifdef CONFIG_DEBUG_OBJECTS_WORK
+static struct debug_obj_descr work_debug_descr;
+/*
+ * fixup_init is called when:
+ * - an active object is initialized
+ */
+static int work_fixup_init(void *addr, enum debug_obj_state state)
+{
+        struct work_struct *work = addr;
+        switch (state) {
+        case ODEBUG_STATE_ACTIVE:
+                cancel_work_sync(work);
+                debug_object_init(work, &work_debug_descr);
+                return 1;
+        default:
+                return 0;
+        }
+}
+/*
+ * fixup_activate is called when:
+ * - an active object is activated
+ * - an unknown object is activated (might be a statically initialized object)
+ */
+static int work_fixup_activate(void *addr, enum debug_obj_state state)
+{
+        struct work_struct *work = addr;
+        switch (state) {
+        case ODEBUG_STATE_NOTAVAILABLE:
+                /*
+                 * This is not really a fixup. The work struct was
+                 * statically initialized. We just make sure that it
+                 * is tracked in the object tracker.
+                 */
+                if (test_bit(WORK_STRUCT_STATIC, work_data_bits(work))) {
+                        debug_object_init(work, &work_debug_descr);
+                        debug_object_activate(work, &work_debug_descr);
+                        return 0;
+                }
+                WARN_ON_ONCE(1);
+                return 0;
+        case ODEBUG_STATE_ACTIVE:
+                WARN_ON(1);
+        default:
+                return 0;
+        }
+}
+/*
+ * fixup_free is called when:
+ * - an active object is freed
+ */
+static int work_fixup_free(void *addr, enum debug_obj_state state)
+{
+        struct work_struct *work = addr;
+        switch (state) {
+        case ODEBUG_STATE_ACTIVE:
+                cancel_work_sync(work);
+                debug_object_free(work, &work_debug_descr);
+                return 1;
+        default:
+                return 0;
+        }
+}
+static struct debug_obj_descr work_debug_descr = {
+        .name           = "work_struct",
+        .fixup_init     = work_fixup_init,
+        .fixup_activate = work_fixup_activate,
+        .fixup_free     = work_fixup_free,
+};
+static inline void debug_work_activate(struct work_struct *work)
+{
+        debug_object_activate(work, &work_debug_descr);
+}
+static inline void debug_work_deactivate(struct work_struct *work)
+{
+        debug_object_deactivate(work, &work_debug_descr);
+}
+void __init_work(struct work_struct *work, int onstack)
+{
+        if (onstack)
+                debug_object_init_on_stack(work, &work_debug_descr);
+        else
+                debug_object_init(work, &work_debug_descr);
+}
+EXPORT_SYMBOL_GPL(__init_work);
+void destroy_work_on_stack(struct work_struct *work)
+{
+        debug_object_free(work, &work_debug_descr);
+}
+EXPORT_SYMBOL_GPL(destroy_work_on_stack);
+#else
+static inline void debug_work_activate(struct work_struct *work) { }
+static inline void debug_work_deactivate(struct work_struct *work) { }
+#endif
 /* Serializes the accesses to the list of workqueues. */
 static DEFINE_SPINLOCK(workqueue_lock);
 static LIST_HEAD(workqueues);
@@ -145,6 +255,7 @@ static void __queue_work(struct cpu_workqueue_struct *cwq,
 {
        unsigned long flags;
+        debug_work_activate(work);
        spin_lock_irqsave(&cwq->lock, flags);
        insert_work(cwq, work, &cwq->worklist);
        spin_unlock_irqrestore(&cwq->lock, flags);
@@ -280,6 +391,7 @@ static void run_workqueue(struct cpu_workqueue_struct *cwq)
                struct lockdep_map lockdep_map = work->lockdep_map;
 #endif
                trace_workqueue_execution(cwq->thread, work);
+                debug_work_deactivate(work);
                cwq->current_work = work;
                list_del_init(cwq->worklist.next);
                spin_unlock_irq(&cwq->lock);
@@ -350,11 +462,18 @@ static void wq_barrier_func(struct work_struct *work)
 static void insert_wq_barrier(struct cpu_workqueue_struct *cwq,
                        struct wq_barrier *barr, struct list_head *head)
 {
-        INIT_WORK(&barr->work, wq_barrier_func);
+        /*
+         * debugobject calls are safe here even with cwq->lock locked
+         * as we know for sure that this will not trigger any of the
+         * checks and call back into the fixup functions where we
+         * might deadlock.
+         */
+        INIT_WORK_ON_STACK(&barr->work, wq_barrier_func);
        __set_bit(WORK_STRUCT_PENDING, work_data_bits(&barr->work));
        init_completion(&barr->done);
+        debug_work_activate(&barr->work);
        insert_work(cwq, &barr->work, head);
 }
@@ -372,8 +491,10 @@ static int flush_cpu_workqueue(struct cpu_workqueue_struct *cwq)
        }
        spin_unlock_irq(&cwq->lock);
-        if (active)
+        if (active) {
                wait_for_completion(&barr.done);
+                destroy_work_on_stack(&barr.work);
+        }
        return active;
 }
@@ -451,6 +572,7 @@ out:
                return 0;
        wait_for_completion(&barr.done);
+        destroy_work_on_stack(&barr.work);
        return 1;
 }
 EXPORT_SYMBOL_GPL(flush_work);
@@ -485,6 +607,7 @@ static int try_to_grab_pending(struct work_struct *work)
                 */
                smp_rmb();
                if (cwq == get_wq_data(work)) {
+                        debug_work_deactivate(work);
                        list_del_init(&work->entry);
                        ret = 1;
                }
@@ -507,8 +630,10 @@ static void wait_on_cpu_work(struct cpu_workqueue_struct *cwq,
        }
        spin_unlock_irq(&cwq->lock);
-        if (unlikely(running))
+        if (unlikely(running)) {
                wait_for_completion(&barr.done);
+                destroy_work_on_stack(&barr.work);
+        }
 }
 static void wait_on_work(struct work_struct *work)
author	David S. Miller <davem@davemloft.net>	2009-12-11 20:12:17 -0500
committer	David S. Miller <davem@davemloft.net>	2009-12-11 20:12:17 -0500
commit	501706565b2d4d2d40d0d301d5411ede099b8a6f (patch)
tree	142a18bf1f1e74a09dbfa27540b893ade0fd797d /kernel
parent	e93737b0f0159a61772894943199fd3b6f315641 (diff)
parent	2fe77b81c77eed92c4c0439f74c8148a295b4a86 (diff)