5 files changed, 533 insertions, 286 deletions
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 9737a76e106f..7663e5df0e6f 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -354,13 +354,20 @@ static inline int kprobe_aggrprobe(struct kprobe *p)
        return p->pre_handler == aggr_pre_handler;
 }
+/* Return true(!0) if the kprobe is unused */
+static inline int kprobe_unused(struct kprobe *p)
+{
+        return kprobe_aggrprobe(p) && kprobe_disabled(p) &&
+               list_empty(&p->list);
+}
 /*
 * Keep all fields in the kprobe consistent
 */
-static inline void copy_kprobe(struct kprobe *old_p, struct kprobe *p)
+static inline void copy_kprobe(struct kprobe *ap, struct kprobe *p)
 {
-        memcpy(&p->opcode, &old_p->opcode, sizeof(kprobe_opcode_t));
+        memcpy(&p->opcode, &ap->opcode, sizeof(kprobe_opcode_t));
-        memcpy(&p->ainsn, &old_p->ainsn, sizeof(struct arch_specific_insn));
+        memcpy(&p->ainsn, &ap->ainsn, sizeof(struct arch_specific_insn));
 }
 #ifdef CONFIG_OPTPROBES
@@ -384,6 +391,17 @@ void __kprobes opt_pre_handler(struct kprobe *p, struct pt_regs *regs)
        }
 }
+/* Free optimized instructions and optimized_kprobe */
+static __kprobes void free_aggr_kprobe(struct kprobe *p)
+{
+        struct optimized_kprobe *op;
+        op = container_of(p, struct optimized_kprobe, kp);
+        arch_remove_optimized_kprobe(op);
+        arch_remove_kprobe(p);
+        kfree(op);
+}
 /* Return true(!0) if the kprobe is ready for optimization. */
 static inline int kprobe_optready(struct kprobe *p)
 {
@@ -397,6 +415,33 @@ static inline int kprobe_optready(struct kprobe *p)
        return 0;
 }
+/* Return true(!0) if the kprobe is disarmed. Note: p must be on hash list */
+static inline int kprobe_disarmed(struct kprobe *p)
+{
+        struct optimized_kprobe *op;
+        /* Not an aggr/opt probe: just report whether it is disabled */
+        if (!kprobe_aggrprobe(p))
+                return kprobe_disabled(p);
+        op = container_of(p, struct optimized_kprobe, kp);
+        return kprobe_disabled(p) && list_empty(&op->list);
+}
+/* Return true(!0) if the probe is queued on (un)optimizing lists */
+static int __kprobes kprobe_queued(struct kprobe *p)
+{
+        struct optimized_kprobe *op;
+        if (kprobe_aggrprobe(p)) {
+                op = container_of(p, struct optimized_kprobe, kp);
+                if (!list_empty(&op->list))
+                        return 1;
+        }
+        return 0;
+}
 /*
 * Return an optimized kprobe whose optimizing code replaces
 * instructions including addr (exclude breakpoint).
@@ -422,30 +467,23 @@ static struct kprobe *__kprobes get_optimized_kprobe(unsigned long addr)
 /* Optimization staging list, protected by kprobe_mutex */
 static LIST_HEAD(optimizing_list);
+static LIST_HEAD(unoptimizing_list);
 static void kprobe_optimizer(struct work_struct *work);
 static DECLARE_DELAYED_WORK(optimizing_work, kprobe_optimizer);
+static DECLARE_COMPLETION(optimizer_comp);
 #define OPTIMIZE_DELAY 5
-/* Kprobe jump optimizer */
+/*
-static __kprobes void kprobe_optimizer(struct work_struct *work)
+ * Optimize (replace a breakpoint with a jump) kprobes listed on
+ * optimizing_list.
+ */
+static __kprobes void do_optimize_kprobes(void)
 {
-        struct optimized_kprobe *op, *tmp;
+        /* Optimization is never done when disarmed */
+        if (kprobes_all_disarmed || !kprobes_allow_optimization ||
-        /* Lock modules while optimizing kprobes */
+            list_empty(&optimizing_list))
-        mutex_lock(&module_mutex);
+                return;
-        mutex_lock(&kprobe_mutex);
-        if (kprobes_all_disarmed || !kprobes_allow_optimization)
-                goto end;
-        /*
-         * Wait for quiesence period to ensure all running interrupts
-         * are done. Because optprobe may modify multiple instructions
-         * there is a chance that Nth instruction is interrupted. In that
-         * case, running interrupt can return to 2nd-Nth byte of jump
-         * instruction. This wait is for avoiding it.
-         */
-        synchronize_sched();
        /*
         * The optimization/unoptimization refers online_cpus via
@@ -459,17 +497,111 @@ static __kprobes void kprobe_optimizer(struct work_struct *work)
         */
        get_online_cpus();
        mutex_lock(&text_mutex);
-        list_for_each_entry_safe(op, tmp, &optimizing_list, list) {
+        arch_optimize_kprobes(&optimizing_list);
-                WARN_ON(kprobe_disabled(&op->kp));
+        mutex_unlock(&text_mutex);
-                if (arch_optimize_kprobe(op) < 0)
+        put_online_cpus();
-                        op->kp.flags &= ~KPROBE_FLAG_OPTIMIZED;
+}
-                list_del_init(&op->list);
+/*
+ * Unoptimize (replace a jump with a breakpoint and remove the breakpoint
+ * if needed) kprobes listed on unoptimizing_list.
+ */
+static __kprobes void do_unoptimize_kprobes(struct list_head *free_list)
+{
+        struct optimized_kprobe *op, *tmp;
+        /* Unoptimization must be done at any time (even when disarmed) */
+        if (list_empty(&unoptimizing_list))
+                return;
+        /* Same locking order as do_optimize_kprobes */
+        get_online_cpus();
+        mutex_lock(&text_mutex);
+        arch_unoptimize_kprobes(&unoptimizing_list, free_list);
+        /* Loop free_list for disarming */
+        list_for_each_entry_safe(op, tmp, free_list, list) {
+                /* Disarm probes if marked disabled */
+                if (kprobe_disabled(&op->kp))
+                        arch_disarm_kprobe(&op->kp);
+                if (kprobe_unused(&op->kp)) {
+                        /*
+                         * Remove unused probes from hash list. After waiting
+                         * for synchronization, these probes are reclaimed.
+                         * (reclaiming is done by do_free_cleaned_kprobes.)
+                         */
+                        hlist_del_rcu(&op->kp.hlist);
+                } else
+                        list_del_init(&op->list);
        }
        mutex_unlock(&text_mutex);
        put_online_cpus();
-end:
+}
+/* Reclaim all kprobes on the free_list */
+static __kprobes void do_free_cleaned_kprobes(struct list_head *free_list)
+{
+        struct optimized_kprobe *op, *tmp;
+        list_for_each_entry_safe(op, tmp, free_list, list) {
+                BUG_ON(!kprobe_unused(&op->kp));
+                list_del_init(&op->list);
+                free_aggr_kprobe(&op->kp);
+        }
+}
+/* Start optimizer after OPTIMIZE_DELAY passed */
+static __kprobes void kick_kprobe_optimizer(void)
+{
+        if (!delayed_work_pending(&optimizing_work))
+                schedule_delayed_work(&optimizing_work, OPTIMIZE_DELAY);
+}
+/* Kprobe jump optimizer */
+static __kprobes void kprobe_optimizer(struct work_struct *work)
+{
+        LIST_HEAD(free_list);
+        /* Lock modules while optimizing kprobes */
+        mutex_lock(&module_mutex);
+        mutex_lock(&kprobe_mutex);
+        /*
+         * Step 1: Unoptimize kprobes and collect cleaned (unused and disarmed)
+         * kprobes before waiting for quiescence period.
+         */
+        do_unoptimize_kprobes(&free_list);
+        /*
+         * Step 2: Wait for quiescence period to ensure all running interrupts
+         * are done. Because optprobe may modify multiple instructions
+         * there is a chance that Nth instruction is interrupted. In that
+         * case, running interrupt can return to 2nd-Nth byte of jump
+         * instruction. This wait is for avoiding it.
+         */
+        synchronize_sched();
+        /* Step 3: Optimize kprobes after quiescence period */
+        do_optimize_kprobes();
+        /* Step 4: Free cleaned kprobes after quiescence period */
+        do_free_cleaned_kprobes(&free_list);
        mutex_unlock(&kprobe_mutex);
        mutex_unlock(&module_mutex);
+        /* Step 5: Kick optimizer again if needed */
+        if (!list_empty(&optimizing_list) || !list_empty(&unoptimizing_list))
+                kick_kprobe_optimizer();
+        else
+                /* Wake up all waiters */
+                complete_all(&optimizer_comp);
+}
+/* Wait for completing optimization and unoptimization */
+static __kprobes void wait_for_kprobe_optimizer(void)
+{
+        if (delayed_work_pending(&optimizing_work))
+                wait_for_completion(&optimizer_comp);
 }
 /* Optimize kprobe if p is ready to be optimized */
@@ -495,42 +627,99 @@ static __kprobes void optimize_kprobe(struct kprobe *p)
        /* Check if it is already optimized. */
        if (op->kp.flags & KPROBE_FLAG_OPTIMIZED)
                return;
        op->kp.flags |= KPROBE_FLAG_OPTIMIZED;
-        list_add(&op->list, &optimizing_list);
-        if (!delayed_work_pending(&optimizing_work))
+        if (!list_empty(&op->list))
-                schedule_delayed_work(&optimizing_work, OPTIMIZE_DELAY);
+                /* This is under unoptimizing. Just dequeue the probe */
+                list_del_init(&op->list);
+        else {
+                list_add(&op->list, &optimizing_list);
+                kick_kprobe_optimizer();
+        }
+}
+/* Shortcut: unoptimize directly, bypassing the delayed optimizer */
+static __kprobes void force_unoptimize_kprobe(struct optimized_kprobe *op)
+{
+        get_online_cpus();
+        arch_unoptimize_kprobe(op);
+        put_online_cpus();
+        if (kprobe_disabled(&op->kp))
+                arch_disarm_kprobe(&op->kp);
 }
 /* Unoptimize a kprobe if p is optimized */
-static __kprobes void unoptimize_kprobe(struct kprobe *p)
+static __kprobes void unoptimize_kprobe(struct kprobe *p, bool force)
 {
        struct optimized_kprobe *op;
-        if ((p->flags & KPROBE_FLAG_OPTIMIZED) && kprobe_aggrprobe(p)) {
+        if (!kprobe_aggrprobe(p) || kprobe_disarmed(p))
-                op = container_of(p, struct optimized_kprobe, kp);
+                return; /* This is not an optprobe nor optimized */
-                if (!list_empty(&op->list))
-                        /* Dequeue from the optimization queue */
+        op = container_of(p, struct optimized_kprobe, kp);
+        if (!kprobe_optimized(p)) {
+                /* Unoptimized or unoptimizing case */
+                if (force && !list_empty(&op->list)) {
+                        /*
+                         * Only if this is unoptimizing kprobe and forced,
+                         * forcibly unoptimize it. (No need to unoptimize
+                         * unoptimized kprobe again :)
+                         */
                        list_del_init(&op->list);
-                else
+                        force_unoptimize_kprobe(op);
-                        /* Replace jump with break */
+                }
-                        arch_unoptimize_kprobe(op);
+                return;
-                op->kp.flags &= ~KPROBE_FLAG_OPTIMIZED;
+        }
+        op->kp.flags &= ~KPROBE_FLAG_OPTIMIZED;
+        if (!list_empty(&op->list)) {
+                /* Dequeue from the optimization queue */
+                list_del_init(&op->list);
+                return;
+        }
+        /* Optimized kprobe case */
+        if (force)
+                /* Forcibly update the code: this is a special case */
+                force_unoptimize_kprobe(op);
+        else {
+                list_add(&op->list, &unoptimizing_list);
+                kick_kprobe_optimizer();
        }
 }
+/* Cancel unoptimizing for reusing */
+static void reuse_unused_kprobe(struct kprobe *ap)
+{
+        struct optimized_kprobe *op;
+        BUG_ON(!kprobe_unused(ap));
+        /*
+         * An unused kprobe MUST be in the middle of delayed unoptimizing
+         * (meaning there is still a relative jump) and disabled.
+         */
+        op = container_of(ap, struct optimized_kprobe, kp);
+        if (unlikely(list_empty(&op->list)))
+                printk(KERN_WARNING "Warning: found a stray unused "
+                        "aggrprobe@%p\n", ap->addr);
+        /* Enable the probe again */
+        ap->flags &= ~KPROBE_FLAG_DISABLED;
+        /* Optimize it again (remove from op->list) */
+        BUG_ON(!kprobe_optready(ap));
+        optimize_kprobe(ap);
+}
 /* Remove optimized instructions */
 static void __kprobes kill_optimized_kprobe(struct kprobe *p)
 {
        struct optimized_kprobe *op;
        op = container_of(p, struct optimized_kprobe, kp);
-        if (!list_empty(&op->list)) {
+        if (!list_empty(&op->list))
-                /* Dequeue from the optimization queue */
+                /* Dequeue from the (un)optimization queue */
                list_del_init(&op->list);
-                op->kp.flags &= ~KPROBE_FLAG_OPTIMIZED;
-        }
+        op->kp.flags &= ~KPROBE_FLAG_OPTIMIZED;
-        /* Don't unoptimize, because the target code will be freed. */
+        /* Don't touch the code, because it is already freed. */
        arch_remove_optimized_kprobe(op);
 }
@@ -543,16 +732,6 @@ static __kprobes void prepare_optimized_kprobe(struct kprobe *p)
        arch_prepare_optimized_kprobe(op);
 }
-/* Free optimized instructions and optimized_kprobe */
-static __kprobes void free_aggr_kprobe(struct kprobe *p)
-{
-        struct optimized_kprobe *op;
-        op = container_of(p, struct optimized_kprobe, kp);
-        arch_remove_optimized_kprobe(op);
-        kfree(op);
-}
 /* Allocate new optimized_kprobe and try to prepare optimized instructions */
 static __kprobes struct kprobe *alloc_aggr_kprobe(struct kprobe *p)
 {
@@ -587,7 +766,8 @@ static __kprobes void try_to_optimize_kprobe(struct kprobe *p)
        op = container_of(ap, struct optimized_kprobe, kp);
        if (!arch_prepared_optinsn(&op->optinsn)) {
                /* If failed to setup optimizing, fallback to kprobe */
-                free_aggr_kprobe(ap);
+                arch_remove_optimized_kprobe(op);
+                kfree(op);
                return;
        }
@@ -631,21 +811,16 @@ static void __kprobes unoptimize_all_kprobes(void)
                return;
        kprobes_allow_optimization = false;
-        printk(KERN_INFO "Kprobes globally unoptimized\n");
-        get_online_cpus();      /* For avoiding text_mutex deadlock */
-        mutex_lock(&text_mutex);
        for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
                head = &kprobe_table[i];
                hlist_for_each_entry_rcu(p, node, head, hlist) {
                        if (!kprobe_disabled(p))
-                                unoptimize_kprobe(p);
+                                unoptimize_kprobe(p, false);
                }
        }
+        /* Wait for unoptimizing completion */
-        mutex_unlock(&text_mutex);
+        wait_for_kprobe_optimizer();
-        put_online_cpus();
+        printk(KERN_INFO "Kprobes globally unoptimized\n");
-        /* Allow all currently running kprobes to complete */
-        synchronize_sched();
 }
 int sysctl_kprobes_optimization;
@@ -669,44 +844,60 @@ int proc_kprobes_optimization_handler(struct ctl_table *table, int write,
 }
 #endif /* CONFIG_SYSCTL */
+/* Put a breakpoint for a probe. Must be called with text_mutex locked */
 static void __kprobes __arm_kprobe(struct kprobe *p)
 {
-        struct kprobe *old_p;
+        struct kprobe *_p;
        /* Check collision with other optimized kprobes */
-        old_p = get_optimized_kprobe((unsigned long)p->addr);
+        _p = get_optimized_kprobe((unsigned long)p->addr);
-        if (unlikely(old_p))
+        if (unlikely(_p))
-                unoptimize_kprobe(old_p); /* Fallback to unoptimized kprobe */
+                /* Fallback to unoptimized kprobe */
+                unoptimize_kprobe(_p, true);
        arch_arm_kprobe(p);
        optimize_kprobe(p);     /* Try to optimize (add kprobe to a list) */
 }
-static void __kprobes __disarm_kprobe(struct kprobe *p)
+/* Remove the breakpoint of a probe. Must be called with text_mutex locked */
+static void __kprobes __disarm_kprobe(struct kprobe *p, bool reopt)
 {
-        struct kprobe *old_p;
+        struct kprobe *_p;
-        unoptimize_kprobe(p);   /* Try to unoptimize */
+        unoptimize_kprobe(p, false);    /* Try to unoptimize */
-        arch_disarm_kprobe(p);
-        /* If another kprobe was blocked, optimize it. */
+        if (!kprobe_queued(p)) {
-        old_p = get_optimized_kprobe((unsigned long)p->addr);
+                arch_disarm_kprobe(p);
-        if (unlikely(old_p))
+                /* If another kprobe was blocked, optimize it. */
-                optimize_kprobe(old_p);
+                _p = get_optimized_kprobe((unsigned long)p->addr);
+                if (unlikely(_p) && reopt)
+                        optimize_kprobe(_p);
+        }
+        /* TODO: reoptimize others after unoptimized this probe */
 }
 #else /* !CONFIG_OPTPROBES */
 #define optimize_kprobe(p)                      do {} while (0)
-#define unoptimize_kprobe(p)                    do {} while (0)
+#define unoptimize_kprobe(p, f)                 do {} while (0)
 #define kill_optimized_kprobe(p)                do {} while (0)
 #define prepare_optimized_kprobe(p)             do {} while (0)
 #define try_to_optimize_kprobe(p)               do {} while (0)
 #define __arm_kprobe(p)                         arch_arm_kprobe(p)
-#define __disarm_kprobe(p)                      arch_disarm_kprobe(p)
+#define __disarm_kprobe(p, o)                   arch_disarm_kprobe(p)
+#define kprobe_disarmed(p)                      kprobe_disabled(p)
+#define wait_for_kprobe_optimizer()             do {} while (0)
+/* Without optimization, there should be no unused kprobes to reuse */
+static void reuse_unused_kprobe(struct kprobe *ap)
+{
+        printk(KERN_ERR "Error: There should be no unused kprobe here.\n");
+        BUG_ON(kprobe_unused(ap));
+}
 static __kprobes void free_aggr_kprobe(struct kprobe *p)
 {
+        arch_remove_kprobe(p);
        kfree(p);
 }
@@ -732,11 +923,10 @@ static void __kprobes arm_kprobe(struct kprobe *kp)
 /* Disarm a kprobe with text_mutex */
 static void __kprobes disarm_kprobe(struct kprobe *kp)
 {
-        get_online_cpus();      /* For avoiding text_mutex deadlock */
+        /* Ditto */
        mutex_lock(&text_mutex);
-        __disarm_kprobe(kp);
+        __disarm_kprobe(kp, true);
        mutex_unlock(&text_mutex);
-        put_online_cpus();
 }
 /*
@@ -942,7 +1132,7 @@ static int __kprobes add_new_kprobe(struct kprobe *ap, struct kprobe *p)
        BUG_ON(kprobe_gone(ap) || kprobe_gone(p));
        if (p->break_handler || p->post_handler)
-                unoptimize_kprobe(ap);  /* Fall back to normal kprobe */
+                unoptimize_kprobe(ap, true);    /* Fall back to normal kprobe */
        if (p->break_handler) {
                if (ap->break_handler)
@@ -993,19 +1183,21 @@ static void __kprobes init_aggr_kprobe(struct kprobe *ap, struct kprobe *p)
 * This is the second or subsequent kprobe at the address - handle
 * the intricacies
 */
-static int __kprobes register_aggr_kprobe(struct kprobe *old_p,
+static int __kprobes register_aggr_kprobe(struct kprobe *orig_p,
                                          struct kprobe *p)
 {
        int ret = 0;
-        struct kprobe *ap = old_p;
+        struct kprobe *ap = orig_p;
-        if (!kprobe_aggrprobe(old_p)) {
+        if (!kprobe_aggrprobe(orig_p)) {
-                /* If old_p is not an aggr_kprobe, create new aggr_kprobe. */
+                /* If orig_p is not an aggr_kprobe, create new aggr_kprobe. */
-                ap = alloc_aggr_kprobe(old_p);
+                ap = alloc_aggr_kprobe(orig_p);
                if (!ap)
                        return -ENOMEM;
-                init_aggr_kprobe(ap, old_p);
+                init_aggr_kprobe(ap, orig_p);
-        }
+        } else if (kprobe_unused(ap))
+                /* This probe is going to die. Rescue it */
+                reuse_unused_kprobe(ap);
        if (kprobe_gone(ap)) {
                /*
@@ -1039,23 +1231,6 @@ static int __kprobes register_aggr_kprobe(struct kprobe *old_p,
        return add_new_kprobe(ap, p);
 }
-/* Try to disable aggr_kprobe, and return 1 if succeeded.*/
-static int __kprobes try_to_disable_aggr_kprobe(struct kprobe *p)
-{
-        struct kprobe *kp;
-        list_for_each_entry_rcu(kp, &p->list, list) {
-                if (!kprobe_disabled(kp))
-                        /*
-                         * There is an active probe on the list.
-                         * We can't disable aggr_kprobe.
-                         */
-                        return 0;
-        }
-        p->flags |= KPROBE_FLAG_DISABLED;
-        return 1;
-}
 static int __kprobes in_kprobes_functions(unsigned long addr)
 {
        struct kprobe_blackpoint *kb;
@@ -1098,34 +1273,33 @@ static kprobe_opcode_t __kprobes *kprobe_addr(struct kprobe *p)
 /* Check passed kprobe is valid and return kprobe in kprobe_table. */
 static struct kprobe * __kprobes __get_valid_kprobe(struct kprobe *p)
 {
-        struct kprobe *old_p, *list_p;
+        struct kprobe *ap, *list_p;
-        old_p = get_kprobe(p->addr);
+        ap = get_kprobe(p->addr);
-        if (unlikely(!old_p))
+        if (unlikely(!ap))
                return NULL;
-        if (p != old_p) {
+        if (p != ap) {
-                list_for_each_entry_rcu(list_p, &old_p->list, list)
+                list_for_each_entry_rcu(list_p, &ap->list, list)
                        if (list_p == p)
                        /* kprobe p is a valid probe */
                                goto valid;
                return NULL;
        }
 valid:
-        return old_p;
+        return ap;
 }
 /* Return error if the kprobe is being re-registered */
 static inline int check_kprobe_rereg(struct kprobe *p)
 {
        int ret = 0;
-        struct kprobe *old_p;
        mutex_lock(&kprobe_mutex);
-        old_p = __get_valid_kprobe(p);
+        if (__get_valid_kprobe(p))
-        if (old_p)
                ret = -EINVAL;
        mutex_unlock(&kprobe_mutex);
        return ret;
 }
@@ -1229,67 +1403,121 @@ fail_with_jump_label:
 }
 EXPORT_SYMBOL_GPL(register_kprobe);
+/* Check if all probes on the aggrprobe are disabled */
+static int __kprobes aggr_kprobe_disabled(struct kprobe *ap)
+{
+        struct kprobe *kp;
+        list_for_each_entry_rcu(kp, &ap->list, list)
+                if (!kprobe_disabled(kp))
+                        /*
+                         * There is an active probe on the list.
+                         * We can't disable this ap.
+                         */
+                        return 0;
+        return 1;
+}
+/* Disable one kprobe: must be called with kprobe_mutex locked */
+static struct kprobe *__kprobes __disable_kprobe(struct kprobe *p)
+{
+        struct kprobe *orig_p;
+        /* Get an original kprobe for return */
+        orig_p = __get_valid_kprobe(p);
+        if (unlikely(orig_p == NULL))
+                return NULL;
+        if (!kprobe_disabled(p)) {
+                /* Disable probe if it is a child probe */
+                if (p != orig_p)
+                        p->flags |= KPROBE_FLAG_DISABLED;
+                /* Try to disarm and disable this/parent probe */
+                if (p == orig_p || aggr_kprobe_disabled(orig_p)) {
+                        disarm_kprobe(orig_p);
+                        orig_p->flags |= KPROBE_FLAG_DISABLED;
+                }
+        }
+        return orig_p;
+}
 /*
 * Unregister a kprobe without a scheduler synchronization.
 */
 static int __kprobes __unregister_kprobe_top(struct kprobe *p)
 {
-        struct kprobe *old_p, *list_p;
+        struct kprobe *ap, *list_p;
-        old_p = __get_valid_kprobe(p);
+        /* Disable kprobe. This will disarm it if needed. */
-        if (old_p == NULL)
+        ap = __disable_kprobe(p);
+        if (ap == NULL)
                return -EINVAL;
-        if (old_p == p ||
+        if (ap == p)
-            (kprobe_aggrprobe(old_p) &&
-             list_is_singular(&old_p->list))) {
                /*
-                 * Only probe on the hash list. Disarm only if kprobes are
+                 * This probe is an independent(and non-optimized) kprobe
-                 * enabled and not gone - otherwise, the breakpoint would
+                 * (not an aggrprobe). Remove from the hash list.
-                 * already have been removed. We save on flushing icache.
                 */
-                if (!kprobes_all_disarmed && !kprobe_disabled(old_p))
+                goto disarmed;
-                        disarm_kprobe(old_p);
-                hlist_del_rcu(&old_p->hlist);
+        /* The following processing expects this probe to be an aggrprobe */
-        } else {
+        WARN_ON(!kprobe_aggrprobe(ap));
+        if (list_is_singular(&ap->list) && kprobe_disarmed(ap))
+                /*
+                 * !disarmed can happen if the probe is under delayed
+                 * unoptimizing.
+                 */
+                goto disarmed;
+        else {
+                /* If disabling probe has special handlers, update aggrprobe */
                if (p->break_handler && !kprobe_gone(p))
-                        old_p->break_handler = NULL;
+                        ap->break_handler = NULL;
                if (p->post_handler && !kprobe_gone(p)) {
-                        list_for_each_entry_rcu(list_p, &old_p->list, list) {
+                        list_for_each_entry_rcu(list_p, &ap->list, list) {
                                if ((list_p != p) && (list_p->post_handler))
                                        goto noclean;
                        }
-                        old_p->post_handler = NULL;
+                        ap->post_handler = NULL;
                }
 noclean:
+                /*
+                 * Remove from the aggrprobe: this path will do nothing in
+                 * __unregister_kprobe_bottom().
+                 */
                list_del_rcu(&p->list);
-                if (!kprobe_disabled(old_p)) {
+                if (!kprobe_disabled(ap) && !kprobes_all_disarmed)
-                        try_to_disable_aggr_kprobe(old_p);
+                        /*
-                        if (!kprobes_all_disarmed) {
+                         * Try to optimize this probe again, because post
-                                if (kprobe_disabled(old_p))
+                         * handler may have been changed.
-                                        disarm_kprobe(old_p);
+                         */
-                                else
+                        optimize_kprobe(ap);
-                                        /* Try to optimize this probe again */
-                                        optimize_kprobe(old_p);
-                        }
-                }
        }
        return 0;
+disarmed:
+        BUG_ON(!kprobe_disarmed(ap));
+        hlist_del_rcu(&ap->hlist);
+        return 0;
 }
 static void __kprobes __unregister_kprobe_bottom(struct kprobe *p)
 {
-        struct kprobe *old_p;
+        struct kprobe *ap;
        if (list_empty(&p->list))
+                /* This is an independent kprobe */
                arch_remove_kprobe(p);
        else if (list_is_singular(&p->list)) {
-                /* "p" is the last child of an aggr_kprobe */
+                /* This is the last child of an aggrprobe */
-                old_p = list_entry(p->list.next, struct kprobe, list);
+                ap = list_entry(p->list.next, struct kprobe, list);
                list_del(&p->list);
-                arch_remove_kprobe(old_p);
+                free_aggr_kprobe(ap);
-                free_aggr_kprobe(old_p);
        }
+        /* Otherwise, do nothing. */
 }
 int __kprobes register_kprobes(struct kprobe **kps, int num)
@@ -1607,29 +1835,13 @@ static void __kprobes kill_kprobe(struct kprobe *p)
 int __kprobes disable_kprobe(struct kprobe *kp)
 {
        int ret = 0;
-        struct kprobe *p;
        mutex_lock(&kprobe_mutex);
-        /* Check whether specified probe is valid. */
+        /* Disable this kprobe */
-        p = __get_valid_kprobe(kp);
+        if (__disable_kprobe(kp) == NULL)
-        if (unlikely(p == NULL)) {
                ret = -EINVAL;
-                goto out;
-        }
-        /* If the probe is already disabled (or gone), just return */
-        if (kprobe_disabled(kp))
-                goto out;
-        kp->flags |= KPROBE_FLAG_DISABLED;
-        if (p != kp)
-                /* When kp != p, p is always enabled. */
-                try_to_disable_aggr_kprobe(p);
-        if (!kprobes_all_disarmed && kprobe_disabled(p))
-                disarm_kprobe(p);
-out:
        mutex_unlock(&kprobe_mutex);
        return ret;
 }
@@ -1927,36 +2139,27 @@ static void __kprobes disarm_all_kprobes(void)
        mutex_lock(&kprobe_mutex);
        /* If kprobes are already disarmed, just return */
-        if (kprobes_all_disarmed)
+        if (kprobes_all_disarmed) {
-                goto already_disabled;
+                mutex_unlock(&kprobe_mutex);
+                return;
+        }
        kprobes_all_disarmed = true;
        printk(KERN_INFO "Kprobes globally disabled\n");
-        /*
-         * Here we call get_online_cpus() for avoiding text_mutex deadlock,
-         * because disarming may also unoptimize kprobes.
-         */
-        get_online_cpus();
        mutex_lock(&text_mutex);
        for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
                head = &kprobe_table[i];
                hlist_for_each_entry_rcu(p, node, head, hlist) {
                        if (!arch_trampoline_kprobe(p) && !kprobe_disabled(p))
-                                __disarm_kprobe(p);
+                                __disarm_kprobe(p, false);
                }
        }
        mutex_unlock(&text_mutex);
-        put_online_cpus();
        mutex_unlock(&kprobe_mutex);
-        /* Allow all currently running kprobes to complete */
-        synchronize_sched();
-        return;
-already_disabled:
+        /* Wait for disarming all kprobes by optimizer */
-        mutex_unlock(&kprobe_mutex);
+        wait_for_kprobe_optimizer();
-        return;
 }
 /*
diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index eac7e3364335..aede71245e9f 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -312,9 +312,75 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx)
                ctx->nr_stat++;
 }
+/*
+ * Called at perf_event creation and when events are attached/detached from a
+ * group.
+ */
+static void perf_event__read_size(struct perf_event *event)
+{
+        int entry = sizeof(u64); /* value */
+        int size = 0;
+        int nr = 1;
+        if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
+                size += sizeof(u64);
+        if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
+                size += sizeof(u64);
+        if (event->attr.read_format & PERF_FORMAT_ID)
+                entry += sizeof(u64);
+        if (event->attr.read_format & PERF_FORMAT_GROUP) {
+                nr += event->group_leader->nr_siblings;
+                size += sizeof(u64);
+        }
+        size += entry * nr;
+        event->read_size = size;
+}
+static void perf_event__header_size(struct perf_event *event)
+{
+        struct perf_sample_data *data;
+        u64 sample_type = event->attr.sample_type;
+        u16 size = 0;
+        perf_event__read_size(event);
+        if (sample_type & PERF_SAMPLE_IP)
+                size += sizeof(data->ip);
+        if (sample_type & PERF_SAMPLE_TID)
+                size += sizeof(data->tid_entry);
+        if (sample_type & PERF_SAMPLE_TIME)
+                size += sizeof(data->time);
+        if (sample_type & PERF_SAMPLE_ADDR)
+                size += sizeof(data->addr);
+        if (sample_type & PERF_SAMPLE_ID)
+                size += sizeof(data->id);
+        if (sample_type & PERF_SAMPLE_STREAM_ID)
+                size += sizeof(data->stream_id);
+        if (sample_type & PERF_SAMPLE_CPU)
+                size += sizeof(data->cpu_entry);
+        if (sample_type & PERF_SAMPLE_PERIOD)
+                size += sizeof(data->period);
+        if (sample_type & PERF_SAMPLE_READ)
+                size += event->read_size;
+        event->header_size = size;
+}
 static void perf_group_attach(struct perf_event *event)
 {
-        struct perf_event *group_leader = event->group_leader;
+        struct perf_event *group_leader = event->group_leader, *pos;
        /*
         * We can have double attach due to group movement in perf_event_open.
@@ -333,6 +399,11 @@ static void perf_group_attach(struct perf_event *event)
        list_add_tail(&event->group_entry, &group_leader->sibling_list);
        group_leader->nr_siblings++;
+        perf_event__header_size(group_leader);
+        list_for_each_entry(pos, &group_leader->sibling_list, group_entry)
+                perf_event__header_size(pos);
 }
 /*
@@ -391,7 +462,7 @@ static void perf_group_detach(struct perf_event *event)
        if (event->group_leader != event) {
                list_del_init(&event->group_entry);
                event->group_leader->nr_siblings--;
-                return;
+                goto out;
        }
        if (!list_empty(&event->group_entry))
@@ -410,6 +481,12 @@ static void perf_group_detach(struct perf_event *event)
                /* Inherit group flags from the previous leader */
                sibling->group_flags = event->group_flags;
        }
+out:
+        perf_event__header_size(event->group_leader);
+        list_for_each_entry(tmp, &event->group_leader->sibling_list, group_entry)
+                perf_event__header_size(tmp);
 }
 static inline int
@@ -1073,7 +1150,7 @@ static int perf_event_refresh(struct perf_event *event, int refresh)
        /*
         * not supported on inherited events
         */
-        if (event->attr.inherit)
+        if (event->attr.inherit || !is_sampling_event(event))
                return -EINVAL;
        atomic_add(refresh, &event->event_limit);
@@ -2289,31 +2366,6 @@ static int perf_release(struct inode *inode, struct file *file)
        return perf_event_release_kernel(event);
 }
-static int perf_event_read_size(struct perf_event *event)
-{
-        int entry = sizeof(u64); /* value */
-        int size = 0;
-        int nr = 1;
-        if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
-                size += sizeof(u64);
-        if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
-                size += sizeof(u64);
-        if (event->attr.read_format & PERF_FORMAT_ID)
-                entry += sizeof(u64);
-        if (event->attr.read_format & PERF_FORMAT_GROUP) {
-                nr += event->group_leader->nr_siblings;
-                size += sizeof(u64);
-        }
-        size += entry * nr;
-        return size;
-}
 u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running)
 {
        struct perf_event *child;
@@ -2428,7 +2480,7 @@ perf_read_hw(struct perf_event *event, char __user *buf, size_t count)
        if (event->state == PERF_EVENT_STATE_ERROR)
                return 0;
-        if (count < perf_event_read_size(event))
+        if (count < event->read_size)
                return -ENOSPC;
        WARN_ON_ONCE(event->ctx->parent_ctx);
@@ -2514,7 +2566,7 @@ static int perf_event_period(struct perf_event *event, u64 __user *arg)
        int ret = 0;
        u64 value;
-        if (!event->attr.sample_period)
+        if (!is_sampling_event(event))
                return -EINVAL;
        if (copy_from_user(&value, arg, sizeof(value)))
@@ -3606,59 +3658,34 @@ void perf_prepare_sample(struct perf_event_header *header,
        data->type = sample_type;
        header->type = PERF_RECORD_SAMPLE;
-        header->size = sizeof(*header);
+        header->size = sizeof(*header) + event->header_size;
        header->misc = 0;
        header->misc |= perf_misc_flags(regs);
-        if (sample_type & PERF_SAMPLE_IP) {
+        if (sample_type & PERF_SAMPLE_IP)
                data->ip = perf_instruction_pointer(regs);
-                header->size += sizeof(data->ip);
-        }
        if (sample_type & PERF_SAMPLE_TID) {
                /* namespace issues */
                data->tid_entry.pid = perf_event_pid(event, current);
                data->tid_entry.tid = perf_event_tid(event, current);
-                header->size += sizeof(data->tid_entry);
        }
-        if (sample_type & PERF_SAMPLE_TIME) {
+        if (sample_type & PERF_SAMPLE_TIME)
                data->time = perf_clock();
-                header->size += sizeof(data->time);
+        if (sample_type & PERF_SAMPLE_ID)
-        }
-        if (sample_type & PERF_SAMPLE_ADDR)
-                header->size += sizeof(data->addr);
-        if (sample_type & PERF_SAMPLE_ID) {
                data->id = primary_event_id(event);
-                header->size += sizeof(data->id);
+        if (sample_type & PERF_SAMPLE_STREAM_ID)
-        }
-        if (sample_type & PERF_SAMPLE_STREAM_ID) {
                data->stream_id = event->id;
-                header->size += sizeof(data->stream_id);
-        }
        if (sample_type & PERF_SAMPLE_CPU) {
                data->cpu_entry.cpu             = raw_smp_processor_id();
                data->cpu_entry.reserved        = 0;
-                header->size += sizeof(data->cpu_entry);
        }
-        if (sample_type & PERF_SAMPLE_PERIOD)
-                header->size += sizeof(data->period);
-        if (sample_type & PERF_SAMPLE_READ)
-                header->size += perf_event_read_size(event);
        if (sample_type & PERF_SAMPLE_CALLCHAIN) {
                int size = 1;
@@ -3726,7 +3753,7 @@ perf_event_read_event(struct perf_event *event,
                .header = {
                        .type = PERF_RECORD_READ,
                        .misc = 0,
-                        .size = sizeof(read_event) + perf_event_read_size(event),
+                        .size = sizeof(read_event) + event->read_size,
                },
                .pid = perf_event_pid(event, task),
                .tid = perf_event_tid(event, task),
@@ -4240,6 +4267,13 @@ static int __perf_event_overflow(struct perf_event *event, int nmi,
        struct hw_perf_event *hwc = &event->hw;
        int ret = 0;
+        /*
+         * Non-sampling counters might still use the PMI to fold short
+         * hardware counters, ignore those.
+         */
+        if (unlikely(!is_sampling_event(event)))
+                return 0;
        if (!throttle) {
                hwc->interrupts++;
        } else {
@@ -4385,7 +4419,7 @@ static void perf_swevent_event(struct perf_event *event, u64 nr,
        if (!regs)
                return;
-        if (!hwc->sample_period)
+        if (!is_sampling_event(event))
                return;
        if (nr == 1 && hwc->sample_period == 1 && !event->attr.freq)
@@ -4548,7 +4582,7 @@ static int perf_swevent_add(struct perf_event *event, int flags)
        struct hw_perf_event *hwc = &event->hw;
        struct hlist_head *head;
-        if (hwc->sample_period) {
+        if (is_sampling_event(event)) {
                hwc->last_period = hwc->sample_period;
                perf_swevent_set_period(event);
        }
@@ -4805,15 +4839,6 @@ static int perf_tp_event_init(struct perf_event *event)
        if (event->attr.type != PERF_TYPE_TRACEPOINT)
                return -ENOENT;
-        /*
-         * Raw tracepoint data is a severe data leak, only allow root to
-         * have these.
-         */
-        if ((event->attr.sample_type & PERF_SAMPLE_RAW) &&
-                        perf_paranoid_tracepoint_raw() &&
-                        !capable(CAP_SYS_ADMIN))
-                return -EPERM;
        err = perf_trace_init(event);
        if (err)
                return err;
@@ -4926,31 +4951,33 @@ static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer)
 static void perf_swevent_start_hrtimer(struct perf_event *event)
 {
        struct hw_perf_event *hwc = &event->hw;
+        s64 period; /* delay handed to ns_to_ktime() when the timer is armed below */
+        if (!is_sampling_event(event)) /* non-sampling events return before the hrtimer is even initialised */
+                return;
        hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
        hwc->hrtimer.function = perf_swevent_hrtimer;
-        if (hwc->sample_period) {
-                s64 period = local64_read(&hwc->period_left);
-                if (period) {
+        period = local64_read(&hwc->period_left); /* period_left is what perf_swevent_cancel_hrtimer() stores as the remaining time */
-                        if (period < 0)
+        if (period) {
-                                period = 10000;
+                if (period < 0)
+                        period = 10000; /* negative leftover: fall back to 10000 (ns, given ns_to_ktime() below) */
-                        local64_set(&hwc->period_left, 0);
+                local64_set(&hwc->period_left, 0); /* leftover consumed; clear it so it is not reused */
-                } else {
+        } else {
-                        period = max_t(u64, 10000, hwc->sample_period);
+                period = max_t(u64, 10000, hwc->sample_period); /* no leftover: use sample_period, but never less than 10000 */
-                }
+        }
-                __hrtimer_start_range_ns(&hwc->hrtimer,
+        __hrtimer_start_range_ns(&hwc->hrtimer,
                                ns_to_ktime(period), 0,
                                HRTIMER_MODE_REL_PINNED, 0);
-        }
 }
 static void perf_swevent_cancel_hrtimer(struct perf_event *event)
 {
        struct hw_perf_event *hwc = &event->hw;
-        if (hwc->sample_period) {
+        if (is_sampling_event(event)) {
                ktime_t remaining = hrtimer_get_remaining(&hwc->hrtimer);
                local64_set(&hwc->period_left, ktime_to_ns(remaining));
@@ -5715,6 +5742,11 @@ SYSCALL_DEFINE5(perf_event_open,
        mutex_unlock(&current->perf_event_mutex);
        /*
+         * Precalculate sample_data sizes
+         */
+        perf_event__header_size(event);
+        /*
         * Drop the reference on the group_event after placing the
         * new event on the sibling_list. This ensures destruction
         * of the group leader will find the pointer to itself in
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 5abfa1518554..cbd97da7a613 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -746,22 +746,6 @@ static struct ctl_table kern_table[] = {
                .extra2         = &one,
        },
 #endif
-#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86) && !defined(CONFIG_LOCKUP_DETECTOR)
-        {
-                .procname       = "unknown_nmi_panic",
-                .data           = &unknown_nmi_panic,
-                .maxlen         = sizeof (int),
-                .mode           = 0644,
-                .proc_handler   = proc_dointvec,
-        },
-        {
-                .procname       = "nmi_watchdog",
-                .data           = &nmi_watchdog_enabled,
-                .maxlen         = sizeof (int),
-                .mode           = 0644,
-                .proc_handler   = proc_nmi_enabled,
-        },
-#endif
 #if defined(CONFIG_X86)
        {
                .procname       = "panic_on_unrecovered_nmi",
diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c
index 39c059ca670e..19a359d5e6d5 100644
--- a/kernel/trace/trace_event_perf.c
+++ b/kernel/trace/trace_event_perf.c
@@ -21,17 +21,46 @@ typedef typeof(unsigned long [PERF_MAX_TRACE_SIZE / sizeof(unsigned long)])
 /* Count the events in use (per event id, not per instance) */
 static int      total_ref_count;
+static int perf_trace_event_perm(struct ftrace_event_call *tp_event,
+                                 struct perf_event *p_event) /* returns 0 when the event is permitted, -EPERM otherwise */
+{
+        /* No tracing, just counting, so no obvious leak */
+        if (!(p_event->attr.sample_type & PERF_SAMPLE_RAW))
+                return 0;
+        /* Some events are ok to be traced by non-root users... */
+        if (p_event->attach_state == PERF_ATTACH_TASK) { /* NOTE(review): '==' rather than '&', so any additional attach_state bit skips this exemption - confirm intended */
+                if (tp_event->flags & TRACE_EVENT_FL_CAP_ANY)
+                        return 0;
+        }
+        /*
+         * ...otherwise raw tracepoint data can be a severe data leak,
+         * only allow root to have these.
+         */
+        if (perf_paranoid_tracepoint_raw() && !capable(CAP_SYS_ADMIN)) /* denied only when both hold: the paranoid helper returns true and the caller lacks CAP_SYS_ADMIN */
+                return -EPERM;
+        return 0;
+}
 static int perf_trace_event_init(struct ftrace_event_call *tp_event,
                                 struct perf_event *p_event)
 {
        struct hlist_head __percpu *list;
-        int ret = -ENOMEM;
+        int ret;
        int cpu;
+        ret = perf_trace_event_perm(tp_event, p_event);
+        if (ret)
+                return ret;
        p_event->tp_event = tp_event;
        if (tp_event->perf_refcount++ > 0)
                return 0;
+        ret = -ENOMEM;
        list = alloc_percpu(struct hlist_head);
        if (!list)
                goto fail;
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 6e3c41a4024c..cad4e42060a9 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -547,13 +547,13 @@ static struct notifier_block __cpuinitdata cpu_nfb = {
        .notifier_call = cpu_callback
 };
-static int __init spawn_watchdog_task(void)
+void __init lockup_detector_init(void)
 {
        void *cpu = (void *)(long)smp_processor_id();
        int err;
        if (no_watchdog)
-                return 0;
+                return;
        err = cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu);
        WARN_ON(notifier_to_errno(err));
@@ -561,6 +561,5 @@ static int __init spawn_watchdog_task(void)
        cpu_callback(&cpu_nfb, CPU_ONLINE, cpu);
        register_cpu_notifier(&cpu_nfb);
-        return 0;
+        return;
 }
-early_initcall(spawn_watchdog_task);