x86 mmiotrace: move files into arch/x86/mm/.

Signed-off-by: Pekka Paalanen <pq@iki.fi> Signed-off-by: Ingo Molnar <mingo@elte.hu> Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
author: Pekka Paalanen <pq@iki.fi> 2008-05-12 15:20:59 -0400
committer: Thomas Gleixner <tglx@linutronix.de> 2008-05-24 05:25:37 -0400
commit: ff3a3e9ba5e4273a8bc10570adab4a390fb90757 (patch)
tree: 63fd9b1c69ba53c514b9b2eb59ee17f10d6511de /arch/x86/mm/kmmio.c
parent: 49023168261a7f9a2fd4a1ca1adbfea922556015 (diff)
1 files changed, 499 insertions, 0 deletions
diff --git a/arch/x86/mm/kmmio.c b/arch/x86/mm/kmmio.c
new file mode 100644
index 000000000000..3ad27b8504a5
--- /dev/null
+++ b/arch/x86/mm/kmmio.c
@@ -0,0 +1,499 @@
+/* Support for MMIO probes.
+ * Benefits from much of the kprobes code
+ * (C) 2002 Louis Zhuang <louis.zhuang@intel.com>.
+ *     2007 Alexander Eichner
+ *     2008 Pekka Paalanen <pq@iki.fi>
+ */
+#include <linux/list.h>
+#include <linux/spinlock.h>
+#include <linux/hash.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/uaccess.h>
+#include <linux/ptrace.h>
+#include <linux/preempt.h>
+#include <linux/percpu.h>
+#include <linux/kdebug.h>
+#include <linux/mutex.h>
+#include <asm/io.h>
+#include <asm/cacheflush.h>
+#include <asm/tlbflush.h>
+#include <asm/errno.h>
+#include <asm/debugreg.h>
+#include <linux/mmiotrace.h>
+/*
+ * Fault pages are kept in a hash table with 2^KMMIO_PAGE_HASH_BITS buckets;
+ * the bucket index comes from hash_long(page, KMMIO_PAGE_HASH_BITS).
+ */
+#define KMMIO_PAGE_HASH_BITS 4
+#define KMMIO_PAGE_TABLE_SIZE (1 << KMMIO_PAGE_HASH_BITS)
+/*
+ * Bookkeeping for one page that has been registered as part of at least
+ * one probe. Entries are hashed by page address into kmmio_page_table.
+ */
+struct kmmio_fault_page {
+        struct list_head list; /* link in a kmmio_page_table bucket */
+        struct kmmio_fault_page *release_next; /* link in a pending release list */
+        unsigned long page; /* location of the fault page */
+        /*
+         * Number of times this page has been registered as a part
+         * of a probe. If zero, page is disarmed and this may be freed.
+         * Used only by writers (RCU).
+         */
+        int count;
+};
+/*
+ * Carries a list of fault pages through the deferred release performed by
+ * the call_rcu() callbacks in this file.
+ */
+struct kmmio_delayed_release {
+        struct rcu_head rcu; /* handed to call_rcu() */
+        struct kmmio_fault_page *release_list; /* chained via release_next */
+};
+/*
+ * State recorded by kmmio_handler() when it starts a single step and
+ * consumed by post_kmmio_handler() when the step completes.
+ */
+struct kmmio_context {
+        struct kmmio_fault_page *fpage; /* page disarmed for the single step */
+        struct kmmio_probe *probe; /* result of get_kmmio_probe(addr), may be NULL */
+        unsigned long saved_flags; /* TF and IF bits of regs->flags at fault time */
+        unsigned long addr; /* faulting address */
+        int active; /* non-zero while a single step is in flight */
+};
+/* Taken by the register/unregister paths while they modify the lists below. */
+static DEFINE_SPINLOCK(kmmio_lock);
+/* Number of currently registered probes. Protected by kmmio_lock */
+unsigned int kmmio_count;
+/* Read-protected by RCU, write-protected by kmmio_lock. */
+static struct list_head kmmio_page_table[KMMIO_PAGE_TABLE_SIZE];
+/* All registered probes; readers walk it with list_for_each_entry_rcu(). */
+static LIST_HEAD(kmmio_probes);
+/* Map a page address to the kmmio_page_table bucket it hashes into. */
+static struct list_head *kmmio_page_list(unsigned long page)
+{
+        unsigned long bucket = hash_long(page, KMMIO_PAGE_HASH_BITS);
+        return kmmio_page_table + bucket;
+}
+/* One context per CPU; accessed through get_cpu_var()/put_cpu_var(). */
+static DEFINE_PER_CPU(struct kmmio_context, kmmio_ctx);
+/*
+ * this is basically a dynamic stabbing problem:
+ * Could use the existing prio tree code or
+ * Possible better implementations:
+ * The Interval Skip List: A Data Structure for Finding All Intervals That
+ * Overlap a Point (might be simple)
+ * Space Efficient Dynamic Stabbing with Fast Queries - Mikkel Thorup
+ */
+/*
+ * Get the kmmio probe covering this addr (if any), or NULL.
+ *
+ * A probe covers the half-open range [p->addr, p->addr + p->len): the first
+ * byte past the end belongs to whatever follows, not to this probe.
+ *
+ * You must be holding RCU read lock (register_kmmio_probe() calls this
+ * under kmmio_lock instead).
+ */
+static struct kmmio_probe *get_kmmio_probe(unsigned long addr)
+{
+        struct kmmio_probe *p;
+        list_for_each_entry_rcu(p, &kmmio_probes, list) {
+                if (addr >= p->addr && addr < (p->addr + p->len))
+                        return p;
+        }
+        return NULL;
+}
+/*
+ * Look up the kmmio_fault_page entry for the page containing the given
+ * address. Returns NULL when no entry exists.
+ * You must be holding RCU read lock.
+ */
+static struct kmmio_fault_page *get_kmmio_fault_page(unsigned long page)
+{
+        const unsigned long aligned = page & PAGE_MASK;
+        struct list_head *bucket = kmmio_page_list(aligned);
+        struct kmmio_fault_page *entry;
+        list_for_each_entry_rcu(entry, bucket, list) {
+                if (entry->page != aligned)
+                        continue;
+                return entry;
+        }
+        return NULL;
+}
+/*
+ * Set or clear _PAGE_PRESENT in the page table entry that maps addr, then
+ * call __flush_tlb_one() for that address. Only 4K and 2M levels are
+ * handled; any other level is logged and left untouched. If pglevel is
+ * non-NULL, the level reported by lookup_address() is stored through it.
+ */
+static void set_page_present(unsigned long addr, bool present, int *pglevel)
+{
+        int level;
+        pte_t *pte = lookup_address(addr, &level);
+        if (!pte) {
+                pr_err("kmmio: no pte for page 0x%08lx\n", addr);
+                return;
+        }
+        if (pglevel)
+                *pglevel = level;
+        if (level == PG_LEVEL_2M) {
+                /* At this level the returned entry is treated as a pmd. */
+                pmd_t *pmd = (pmd_t *)pte;
+                pmdval_t pmdval = pmd_val(*pmd);
+                if (present)
+                        pmdval |= _PAGE_PRESENT;
+                else
+                        pmdval &= ~_PAGE_PRESENT;
+                set_pmd(pmd, __pmd(pmdval));
+        } else if (level == PG_LEVEL_4K) {
+                pteval_t pteval = pte_val(*pte);
+                if (present)
+                        pteval |= _PAGE_PRESENT;
+                else
+                        pteval &= ~_PAGE_PRESENT;
+                set_pte_atomic(pte, __pte(pteval));
+        } else {
+                pr_err("kmmio: unexpected page level 0x%x.\n", level);
+                return;
+        }
+        __flush_tlb_one(addr);
+}
+/*
+ * Arm a page: clear its present bit so that any access to it faults.
+ * page_level is passed straight through to set_page_present().
+ */
+static void arm_kmmio_fault_page(unsigned long page, int *page_level)
+{
+        unsigned long page_start = page & PAGE_MASK;
+        set_page_present(page_start, false, page_level);
+}
+/*
+ * Disarm a page: set its present bit again so that accesses go through.
+ * page_level is passed straight through to set_page_present().
+ */
+static void disarm_kmmio_fault_page(unsigned long page, int *page_level)
+{
+        unsigned long page_start = page & PAGE_MASK;
+        set_page_present(page_start, true, page_level);
+}
+/*
+ * This is being called from do_page_fault().
+ *
+ * We may be in an interrupt or a critical section. Also prefetching may
+ * trigger a page fault. We may be in the middle of process switch.
+ * We cannot take any locks, because we could be executing especially
+ * within a kmmio critical section.
+ *
+ * Local interrupts are disabled, so preemption cannot happen.
+ * Do not enable interrupts, do not sleep, and watch out for other CPUs.
+ *
+ * Returns 1 if the fault was handled here, 0 otherwise.
+ *
+ * When a new single step is started (the "fault handled" return at the
+ * end of the main path), the preempt count and the RCU read lock taken at
+ * the top are deliberately left held; post_kmmio_handler() drops them.
+ * Every other exit releases both before returning.
+ */
+/*
+ * Interrupts are disabled on entry as trap3 is an interrupt gate
+ * and they remain disabled throughout this function.
+ */
+int kmmio_handler(struct pt_regs *regs, unsigned long addr)
+{
+        struct kmmio_context *ctx;
+        struct kmmio_fault_page *faultpage;
+        int ret = 0; /* default to fault not handled */
+        /*
+         * Preemption is now disabled to prevent process switch during
+         * single stepping. We can only handle one active kmmio trace
+         * per cpu, so ensure that we finish it before something else
+         * gets to run. We also hold the RCU read lock over single
+         * stepping to avoid looking up the probe and kmmio_fault_page
+         * again.
+         */
+        preempt_disable();
+        rcu_read_lock();
+        faultpage = get_kmmio_fault_page(addr);
+        if (!faultpage) {
+                /*
+                 * Either this page fault is not caused by kmmio, or
+                 * another CPU just pulled the kmmio probe from under
+                 * our feet. The latter case should not be possible.
+                 */
+                goto no_kmmio;
+        }
+        ctx = &get_cpu_var(kmmio_ctx);
+        if (ctx->active) {
+                /* A step is already in flight on this CPU: let the access through. */
+                disarm_kmmio_fault_page(faultpage->page, NULL);
+                if (addr == ctx->addr) {
+                        /*
+                         * On SMP we sometimes get recursive probe hits on the
+                         * same address. Context is already saved, fall out.
+                         */
+                        pr_debug("kmmio: duplicate probe hit on CPU %d, for "
+                                                "address 0x%08lx.\n",
+                                                smp_processor_id(), addr);
+                        ret = 1;
+                        goto no_kmmio_ctx;
+                }
+                /*
+                 * Prevent overwriting already in-flight context.
+                 * This should not happen, let's hope disarming at least
+                 * prevents a panic.
+                 */
+                pr_emerg("kmmio: recursive probe hit on CPU %d, "
+                                        "for address 0x%08lx. Ignoring.\n",
+                                        smp_processor_id(), addr);
+                pr_emerg("kmmio: previous hit was at 0x%08lx.\n",
+                                        ctx->addr);
+                goto no_kmmio_ctx;
+        }
+        ctx->active++;
+        /* Record everything post_kmmio_handler() needs to finish the step. */
+        ctx->fpage = faultpage;
+        ctx->probe = get_kmmio_probe(addr);
+        ctx->saved_flags = (regs->flags & (X86_EFLAGS_TF | X86_EFLAGS_IF));
+        ctx->addr = addr;
+        if (ctx->probe && ctx->probe->pre_handler)
+                ctx->probe->pre_handler(ctx->probe, regs, addr);
+        /*
+         * Enable single-stepping and disable interrupts for the faulting
+         * context. Local interrupts must not get enabled during stepping.
+         */
+        regs->flags |= X86_EFLAGS_TF;
+        regs->flags &= ~X86_EFLAGS_IF;
+        /* Now we set present bit in PTE and single step. */
+        disarm_kmmio_fault_page(ctx->fpage->page, NULL);
+        /*
+         * If another cpu accesses the same page while we are stepping,
+         * the access will not be caught. It will simply succeed and the
+         * only downside is we lose the event. If this becomes a problem,
+         * the user should drop to single cpu before tracing.
+         */
+        put_cpu_var(kmmio_ctx);
+        /* RCU read lock and preempt count stay held until post_kmmio_handler(). */
+        return 1; /* fault handled */
+no_kmmio_ctx:
+        put_cpu_var(kmmio_ctx);
+no_kmmio:
+        rcu_read_unlock();
+        preempt_enable_no_resched();
+        return ret;
+}
+/*
+ * Finish the single step started by kmmio_handler(): run the probe's
+ * post_handler (if any), re-arm the page, restore the saved TF/IF flag
+ * bits, and drop the RCU read lock and preempt count that
+ * kmmio_handler() left held.
+ *
+ * Interrupts are disabled on entry as trap1 is an interrupt gate
+ * and they remain disabled throughout this function.
+ * This must always get called as the pair to kmmio_handler().
+ *
+ * Returns 1 if the debug trap was consumed here; 0 if no kmmio step was
+ * active on this CPU, or if TF is still set after restoring the saved
+ * flags.
+ */
+static int post_kmmio_handler(unsigned long condition, struct pt_regs *regs)
+{
+        int ret = 0;
+        struct kmmio_context *ctx = &get_cpu_var(kmmio_ctx);
+        if (!ctx->active) {
+                pr_debug("kmmio: spurious debug trap on CPU %d.\n",
+                                                        smp_processor_id());
+                goto out;
+        }
+        if (ctx->probe && ctx->probe->post_handler)
+                ctx->probe->post_handler(ctx->probe, condition, regs);
+        /* The stepped access is done; make the page fault again. */
+        arm_kmmio_fault_page(ctx->fpage->page, NULL);
+        /* Drop the TF we forced, then put back whatever TF/IF the context had. */
+        regs->flags &= ~X86_EFLAGS_TF;
+        regs->flags |= ctx->saved_flags;
+        /* These were acquired in kmmio_handler(). */
+        ctx->active--;
+        BUG_ON(ctx->active);
+        rcu_read_unlock();
+        preempt_enable_no_resched();
+        /*
+         * if somebody else is singlestepping across a probe point, flags
+         * will have TF set, in which case, continue the remaining processing
+         * of do_debug, as if this is not a probe hit.
+         */
+        if (!(regs->flags & X86_EFLAGS_TF))
+                ret = 1;
+out:
+        put_cpu_var(kmmio_ctx);
+        return ret;
+}
+/*
+ * Take one reference on the fault page entry for the given page, creating
+ * and arming a new entry if none exists yet. An existing entry whose count
+ * had dropped to zero is re-armed.
+ * Returns 0 on success, -1 if the entry could not be allocated.
+ * You must be holding kmmio_lock.
+ */
+static int add_kmmio_fault_page(unsigned long page)
+{
+        struct kmmio_fault_page *entry;
+        page &= PAGE_MASK;
+        entry = get_kmmio_fault_page(page);
+        if (!entry) {
+                entry = kmalloc(sizeof(*entry), GFP_ATOMIC);
+                if (!entry)
+                        return -1;
+                entry->count = 1;
+                entry->page = page;
+                list_add_rcu(&entry->list, kmmio_page_list(entry->page));
+                arm_kmmio_fault_page(entry->page, NULL);
+                return 0;
+        }
+        /* Known page: only a zero count means it is currently disarmed. */
+        if (entry->count == 0)
+                arm_kmmio_fault_page(entry->page, NULL);
+        entry->count++;
+        return 0;
+}
+/*
+ * Drop one reference on the fault page entry for the given page. When the
+ * count reaches zero the page is disarmed and the entry is pushed onto the
+ * front of *release_list. Pages without an entry are ignored.
+ * You must be holding kmmio_lock.
+ */
+static void release_kmmio_fault_page(unsigned long page,
+                                struct kmmio_fault_page **release_list)
+{
+        struct kmmio_fault_page *entry = get_kmmio_fault_page(page & PAGE_MASK);
+        if (!entry)
+                return;
+        entry->count--;
+        BUG_ON(entry->count < 0);
+        if (entry->count)
+                return;
+        /* Last user is gone. */
+        disarm_kmmio_fault_page(entry->page, NULL);
+        entry->release_next = *release_list;
+        *release_list = entry;
+}
+/*
+ * Register a probe and arm every page in [p->addr, p->addr + p->len),
+ * stepping one PAGE_SIZE at a time.
+ * Returns 0 on success, or -EEXIST if a probe already covers p->addr.
+ * A failure to set up an individual page is logged but not reported.
+ */
+int register_kmmio_probe(struct kmmio_probe *p)
+{
+        unsigned long irq_flags;
+        unsigned long offset;
+        int err = 0;
+        spin_lock_irqsave(&kmmio_lock, irq_flags);
+        if (get_kmmio_probe(p->addr)) {
+                err = -EEXIST;
+        } else {
+                kmmio_count++;
+                list_add_rcu(&p->list, &kmmio_probes);
+                for (offset = 0; offset < p->len; offset += PAGE_SIZE) {
+                        if (add_kmmio_fault_page(p->addr + offset))
+                                pr_err("kmmio: Unable to set page fault.\n");
+                }
+        }
+        spin_unlock_irqrestore(&kmmio_lock, irq_flags);
+        /*
+         * XXX: What should I do here?
+         * Here was a call to global_flush_tlb(), but it does not exist
+         * anymore. It seems it's not needed after all.
+         */
+        return err;
+}
+EXPORT_SYMBOL(register_kmmio_probe);
+/*
+ * RCU callback: free every fault page on the delayed-release list, then
+ * the kmmio_delayed_release holder itself. Every page is expected to have
+ * a zero count at this point.
+ */
+static void rcu_free_kmmio_fault_pages(struct rcu_head *head)
+{
+        struct kmmio_delayed_release *dr =
+                container_of(head, struct kmmio_delayed_release, rcu);
+        struct kmmio_fault_page *cur;
+        struct kmmio_fault_page *next;
+        for (cur = dr->release_list; cur; cur = next) {
+                /* Grab the link before the node is freed. */
+                next = cur->release_next;
+                BUG_ON(cur->count);
+                kfree(cur);
+        }
+        kfree(dr);
+}
+/*
+ * RCU callback: second stage of unregistering fault pages.
+ *
+ * Walks the release list under kmmio_lock. Pages whose count is still zero
+ * are removed from kmmio_page_table and stay on the release list. Pages
+ * that gained a user again in the meantime are dropped from the release
+ * list so that they are not freed. The remaining list is then handed to
+ * rcu_free_kmmio_fault_pages() via another call_rcu().
+ */
+static void remove_kmmio_fault_pages(struct rcu_head *head)
+{
+        struct kmmio_delayed_release *dr = container_of(
+                                                head,
+                                                struct kmmio_delayed_release,
+                                                rcu);
+        struct kmmio_fault_page *p = dr->release_list;
+        /* Always points at the link that leads to p from the kept list. */
+        struct kmmio_fault_page **prevp = &dr->release_list;
+        unsigned long flags;
+        spin_lock_irqsave(&kmmio_lock, flags);
+        while (p) {
+                if (!p->count) {
+                        list_del_rcu(&p->list);
+                        /* p stays on the release list; step past it. */
+                        prevp = &p->release_next;
+                } else {
+                        /*
+                         * p is in use again: unlink it. prevp must NOT
+                         * advance into the unlinked node, otherwise a
+                         * following in-use page would be unlinked from
+                         * the wrong place and still get freed.
+                         */
+                        *prevp = p->release_next;
+                }
+                p = *prevp;
+        }
+        spin_unlock_irqrestore(&kmmio_lock, flags);
+        /* This is the real RCU destroy call. */
+        call_rcu(&dr->rcu, rcu_free_kmmio_fault_pages);
+}
+/*
+ * Remove a kmmio probe. You have to synchronize_rcu() before you can be
+ * sure that the callbacks will not be called anymore. Only after that
+ * you may actually release your struct kmmio_probe.
+ *
+ * Unregistering a kmmio fault page has three steps:
+ * 1. release_kmmio_fault_page()
+ *    Disarm the page, wait a grace period to let all faults finish.
+ * 2. remove_kmmio_fault_pages()
+ *    Remove the pages from kmmio_page_table.
+ * 3. rcu_free_kmmio_fault_pages()
+ *    Actually free the kmmio_fault_page structs as with RCU.
+ */
+void unregister_kmmio_probe(struct kmmio_probe *p)
+{
+        unsigned long flags;
+        unsigned long size = 0;
+        struct kmmio_fault_page *release_list = NULL;
+        struct kmmio_delayed_release *drelease;
+        spin_lock_irqsave(&kmmio_lock, flags);
+        while (size < p->len) {
+                release_kmmio_fault_page(p->addr + size, &release_list);
+                size += PAGE_SIZE;
+        }
+        list_del_rcu(&p->list);
+        kmmio_count--;
+        spin_unlock_irqrestore(&kmmio_lock, flags);
+        /*
+         * No page dropped to zero users, so there is nothing to remove or
+         * free: skip the allocation and the two RCU callbacks entirely.
+         */
+        if (!release_list)
+                return;
+        drelease = kmalloc(sizeof(*drelease), GFP_ATOMIC);
+        if (!drelease) {
+                pr_crit("kmmio: leaking kmmio_fault_page objects.\n");
+                return;
+        }
+        drelease->release_list = release_list;
+        /*
+         * This is not really RCU here. We have just disarmed a set of
+         * pages so that they cannot trigger page faults anymore. However,
+         * we cannot remove the pages from kmmio_page_table,
+         * because a probe hit might be in flight on another CPU. The
+         * pages are collected into a list, and they will be removed from
+         * kmmio_page_table when it is certain that no probe hit related to
+         * these pages can be in flight. RCU grace period sounds like a
+         * good choice.
+         *
+         * If we removed the pages too early, kmmio page fault handler might
+         * not find the respective kmmio_fault_page and determine it's not
+         * a kmmio fault, when it actually is. This would lead to madness.
+         */
+        call_rcu(&drelease->rcu, remove_kmmio_fault_pages);
+}
+EXPORT_SYMBOL(unregister_kmmio_probe);
+/*
+ * Die-chain callback. Forwards single-step debug traps (DIE_DEBUG with
+ * DR_STEP set in the error code) to post_kmmio_handler() and stops the
+ * notifier chain only when that handler reports the trap as consumed.
+ */
+static int kmmio_die_notifier(struct notifier_block *nb, unsigned long val,
+                                                                void *args)
+{
+        struct die_args *die = args;
+        if (val != DIE_DEBUG)
+                return NOTIFY_DONE;
+        if (!(die->err & DR_STEP))
+                return NOTIFY_DONE;
+        if (post_kmmio_handler(die->err, die->regs) != 1)
+                return NOTIFY_DONE;
+        return NOTIFY_STOP;
+}
+/* Notifier block for kmmio_die_notifier(); registered in init_kmmio(). */
+static struct notifier_block nb_die = {
+        .notifier_call = kmmio_die_notifier
+};
+/*
+ * Initialize every bucket of kmmio_page_table and hook kmmio into the die
+ * notifier chain. Returns the result of register_die_notifier().
+ */
+static int __init init_kmmio(void)
+{
+        struct list_head *bucket;
+        struct list_head *end = kmmio_page_table + KMMIO_PAGE_TABLE_SIZE;
+        for (bucket = kmmio_page_table; bucket < end; bucket++)
+                INIT_LIST_HEAD(bucket);
+        return register_die_notifier(&nb_die);
+}
+fs_initcall(init_kmmio); /* should be before device_initcall() */
author	Pekka Paalanen <pq@iki.fi>	2008-05-12 15:20:59 -0400
committer	Thomas Gleixner <tglx@linutronix.de>	2008-05-24 05:25:37 -0400
commit	ff3a3e9ba5e4273a8bc10570adab4a390fb90757 (patch)
tree	63fd9b1c69ba53c514b9b2eb59ee17f10d6511de /arch/x86/mm/kmmio.c
parent	49023168261a7f9a2fd4a1ca1adbfea922556015 (diff)

diff --git a/arch/x86/mm/kmmio.c b/arch/x86/mm/kmmio.c new file mode 100644 index 000000000000..3ad27b8504a5 --- /dev/null +++ b/arch/x86/mm/kmmio.c
@@ -0,0 +1,499 @@
	1	/* Support for MMIO probes.
	2	* Benfit many code from kprobes
	3	* (C) 2002 Louis Zhuang <louis.zhuang@intel.com>.
	4	* 2007 Alexander Eichner
	5	* 2008 Pekka Paalanen <pq@iki.fi>
	6	*/
	7
	8	#include <linux/list.h>
	9	#include <linux/spinlock.h>
	10	#include <linux/hash.h>
	11	#include <linux/init.h>
	12	#include <linux/module.h>
	13	#include <linux/kernel.h>
	14	#include <linux/uaccess.h>
	15	#include <linux/ptrace.h>
	16	#include <linux/preempt.h>
	17	#include <linux/percpu.h>
	18	#include <linux/kdebug.h>
	19	#include <linux/mutex.h>
	20	#include <asm/io.h>
	21	#include <asm/cacheflush.h>
	22	#include <asm/tlbflush.h>
	23	#include <asm/errno.h>
	24	#include <asm/debugreg.h>
	25	#include <linux/mmiotrace.h>
	26
	27	#define KMMIO_PAGE_HASH_BITS 4
	28	#define KMMIO_PAGE_TABLE_SIZE (1 << KMMIO_PAGE_HASH_BITS)
	29
	30	struct kmmio_fault_page {
	31	struct list_head list;
	32	struct kmmio_fault_page *release_next;
	33	unsigned long page; /* location of the fault page */
	34
	35	/*
	36	* Number of times this page has been registered as a part
	37	* of a probe. If zero, page is disarmed and this may be freed.
	38	* Used only by writers (RCU).
	39	*/
	40	int count;
	41	};
	42
	43	struct kmmio_delayed_release {
	44	struct rcu_head rcu;
	45	struct kmmio_fault_page *release_list;
	46	};
	47
	48	struct kmmio_context {
	49	struct kmmio_fault_page *fpage;
	50	struct kmmio_probe *probe;
	51	unsigned long saved_flags;
	52	unsigned long addr;
	53	int active;
	54	};
	55
	56	static DEFINE_SPINLOCK(kmmio_lock);
	57
	58	/* Protected by kmmio_lock */
	59	unsigned int kmmio_count;
	60
	61	/* Read-protected by RCU, write-protected by kmmio_lock. */
	62	static struct list_head kmmio_page_table[KMMIO_PAGE_TABLE_SIZE];
	63	static LIST_HEAD(kmmio_probes);
	64
	65	static struct list_head *kmmio_page_list(unsigned long page)
	66	{
	67	return &kmmio_page_table[hash_long(page, KMMIO_PAGE_HASH_BITS)];
	68	}
	69
	70	/* Accessed per-cpu */
	71	static DEFINE_PER_CPU(struct kmmio_context, kmmio_ctx);
	72
	73	/*
	74	* this is basically a dynamic stabbing problem:
	75	* Could use the existing prio tree code or
	76	* Possible better implementations:
	77	* The Interval Skip List: A Data Structure for Finding All Intervals That
	78	* Overlap a Point (might be simple)
	79	* Space Efficient Dynamic Stabbing with Fast Queries - Mikkel Thorup
	80	*/
	81	/* Get the kmmio at this addr (if any). You must be holding RCU read lock. */
	82	static struct kmmio_probe *get_kmmio_probe(unsigned long addr)
	83	{
	84	struct kmmio_probe *p;
	85	list_for_each_entry_rcu(p, &kmmio_probes, list) {
	86	if (addr >= p->addr && addr <= (p->addr + p->len))
	87	return p;
	88	}
	89	return NULL;
	90	}
	91
	92	/* You must be holding RCU read lock. */
	93	static struct kmmio_fault_page *get_kmmio_fault_page(unsigned long page)
	94	{
	95	struct list_head *head;
	96	struct kmmio_fault_page *p;
	97
	98	page &= PAGE_MASK;
	99	head = kmmio_page_list(page);
	100	list_for_each_entry_rcu(p, head, list) {
	101	if (p->page == page)
	102	return p;
	103	}
	104	return NULL;
	105	}
	106
	107	static void set_page_present(unsigned long addr, bool present, int *pglevel)
	108	{
	109	pteval_t pteval;
	110	pmdval_t pmdval;
	111	int level;
	112	pmd_t *pmd;
	113	pte_t *pte = lookup_address(addr, &level);
	114
	115	if (!pte) {
	116	pr_err("kmmio: no pte for page 0x%08lx\n", addr);
	117	return;
	118	}
	119
	120	if (pglevel)
	121	*pglevel = level;
	122
	123	switch (level) {
	124	case PG_LEVEL_2M:
	125	pmd = (pmd_t *)pte;
	126	pmdval = pmd_val(*pmd) & ~_PAGE_PRESENT;
	127	if (present)
	128	pmdval |= _PAGE_PRESENT;
	129	set_pmd(pmd, __pmd(pmdval));
	130	break;
	131
	132	case PG_LEVEL_4K:
	133	pteval = pte_val(*pte) & ~_PAGE_PRESENT;
	134	if (present)
	135	pteval |= _PAGE_PRESENT;
	136	set_pte_atomic(pte, __pte(pteval));
	137	break;
	138
	139	default:
	140	pr_err("kmmio: unexpected page level 0x%x.\n", level);
	141	return;
	142	}
	143
	144	__flush_tlb_one(addr);
	145	}
	146
	147	/** Mark the given page as not present. Access to it will trigger a fault. */
	148	static void arm_kmmio_fault_page(unsigned long page, int *page_level)
	149	{
	150	set_page_present(page & PAGE_MASK, false, page_level);
	151	}
	152
	153	/** Mark the given page as present. */
	154	static void disarm_kmmio_fault_page(unsigned long page, int *page_level)
	155	{
	156	set_page_present(page & PAGE_MASK, true, page_level);
	157	}
	158
	159	/*
	160	* This is being called from do_page_fault().
	161	*
	162	* We may be in an interrupt or a critical section. Also prefecthing may
	163	* trigger a page fault. We may be in the middle of process switch.
	164	* We cannot take any locks, because we could be executing especially
	165	* within a kmmio critical section.
	166	*
	167	* Local interrupts are disabled, so preemption cannot happen.
	168	* Do not enable interrupts, do not sleep, and watch out for other CPUs.
	169	*/
	170	/*
	171	* Interrupts are disabled on entry as trap3 is an interrupt gate
	172	* and they remain disabled thorough out this function.
	173	*/
	174	int kmmio_handler(struct pt_regs *regs, unsigned long addr)
	175	{
	176	struct kmmio_context *ctx;
	177	struct kmmio_fault_page *faultpage;
	178	int ret = 0; /* default to fault not handled */
	179
	180	/*
	181	* Preemption is now disabled to prevent process switch during
	182	* single stepping. We can only handle one active kmmio trace
	183	* per cpu, so ensure that we finish it before something else
	184	* gets to run. We also hold the RCU read lock over single
	185	* stepping to avoid looking up the probe and kmmio_fault_page
	186	* again.
	187	*/
	188	preempt_disable();
	189	rcu_read_lock();
	190
	191	faultpage = get_kmmio_fault_page(addr);
	192	if (!faultpage) {
	193	/*
	194	* Either this page fault is not caused by kmmio, or
	195	* another CPU just pulled the kmmio probe from under
	196	* our feet. The latter case should not be possible.
	197	*/
	198	goto no_kmmio;
	199	}
	200
	201	ctx = &get_cpu_var(kmmio_ctx);
	202	if (ctx->active) {
	203	disarm_kmmio_fault_page(faultpage->page, NULL);
	204	if (addr == ctx->addr) {
	205	/*
	206	* On SMP we sometimes get recursive probe hits on the
	207	* same address. Context is already saved, fall out.
	208	*/
	209	pr_debug("kmmio: duplicate probe hit on CPU %d, for "
	210	"address 0x%08lx.\n",
	211	smp_processor_id(), addr);
	212	ret = 1;
	213	goto no_kmmio_ctx;
	214	}
	215	/*
	216	* Prevent overwriting already in-flight context.
	217	* This should not happen, let's hope disarming at least
	218	* prevents a panic.
	219	*/
	220	pr_emerg("kmmio: recursive probe hit on CPU %d, "
	221	"for address 0x%08lx. Ignoring.\n",
	222	smp_processor_id(), addr);
	223	pr_emerg("kmmio: previous hit was at 0x%08lx.\n",
	224	ctx->addr);
	225	goto no_kmmio_ctx;
	226	}
	227	ctx->active++;
	228
	229	ctx->fpage = faultpage;
	230	ctx->probe = get_kmmio_probe(addr);
	231	ctx->saved_flags = (regs->flags & (X86_EFLAGS_TF | X86_EFLAGS_IF));
	232	ctx->addr = addr;
	233
	234	if (ctx->probe && ctx->probe->pre_handler)
	235	ctx->probe->pre_handler(ctx->probe, regs, addr);
	236
	237	/*
	238	* Enable single-stepping and disable interrupts for the faulting
	239	* context. Local interrupts must not get enabled during stepping.
	240	*/
	241	regs->flags |= X86_EFLAGS_TF;
	242	regs->flags &= ~X86_EFLAGS_IF;
	243
	244	/* Now we set present bit in PTE and single step. */
	245	disarm_kmmio_fault_page(ctx->fpage->page, NULL);
	246
	247	/*
	248	* If another cpu accesses the same page while we are stepping,
	249	* the access will not be caught. It will simply succeed and the
	250	* only downside is we lose the event. If this becomes a problem,
	251	* the user should drop to single cpu before tracing.
	252	*/
	253
	254	put_cpu_var(kmmio_ctx);
	255	return 1; /* fault handled */
	256
	257	no_kmmio_ctx:
	258	put_cpu_var(kmmio_ctx);
	259	no_kmmio:
	260	rcu_read_unlock();
	261	preempt_enable_no_resched();
	262	return ret;
	263	}
	264
	265	/*
	266	* Interrupts are disabled on entry as trap1 is an interrupt gate
	267	* and they remain disabled thorough out this function.
	268	* This must always get called as the pair to kmmio_handler().
	269	*/
	270	static int post_kmmio_handler(unsigned long condition, struct pt_regs *regs)
	271	{
	272	int ret = 0;
	273	struct kmmio_context *ctx = &get_cpu_var(kmmio_ctx);
	274
	275	if (!ctx->active) {
	276	pr_debug("kmmio: spurious debug trap on CPU %d.\n",
	277	smp_processor_id());
	278	goto out;
	279	}
	280
	281	if (ctx->probe && ctx->probe->post_handler)
	282	ctx->probe->post_handler(ctx->probe, condition, regs);
	283
	284	arm_kmmio_fault_page(ctx->fpage->page, NULL);
	285
	286	regs->flags &= ~X86_EFLAGS_TF;
	287	regs->flags |= ctx->saved_flags;
	288
	289	/* These were acquired in kmmio_handler(). */
	290	ctx->active--;
	291	BUG_ON(ctx->active);
	292	rcu_read_unlock();
	293	preempt_enable_no_resched();
	294
	295	/*
	296	* if somebody else is singlestepping across a probe point, flags
	297	* will have TF set, in which case, continue the remaining processing
	298	* of do_debug, as if this is not a probe hit.
	299	*/
	300	if (!(regs->flags & X86_EFLAGS_TF))
	301	ret = 1;
	302	out:
	303	put_cpu_var(kmmio_ctx);
	304	return ret;
	305	}
	306
	307	/* You must be holding kmmio_lock. */
	308	static int add_kmmio_fault_page(unsigned long page)
	309	{
	310	struct kmmio_fault_page *f;
	311
	312	page &= PAGE_MASK;
	313	f = get_kmmio_fault_page(page);
	314	if (f) {
	315	if (!f->count)
	316	arm_kmmio_fault_page(f->page, NULL);
	317	f->count++;
	318	return 0;
	319	}
	320
	321	f = kmalloc(sizeof(*f), GFP_ATOMIC);
	322	if (!f)
	323	return -1;
	324
	325	f->count = 1;
	326	f->page = page;
	327	list_add_rcu(&f->list, kmmio_page_list(f->page));
	328
	329	arm_kmmio_fault_page(f->page, NULL);
	330
	331	return 0;
	332	}
	333
	334	/* You must be holding kmmio_lock. */
	335	static void release_kmmio_fault_page(unsigned long page,
	336	struct kmmio_fault_page **release_list)
	337	{
	338	struct kmmio_fault_page *f;
	339
	340	page &= PAGE_MASK;
	341	f = get_kmmio_fault_page(page);
	342	if (!f)
	343	return;
	344
	345	f->count--;
	346	BUG_ON(f->count < 0);
	347	if (!f->count) {
	348	disarm_kmmio_fault_page(f->page, NULL);
	349	f->release_next = *release_list;
	350	*release_list = f;
	351	}
	352	}
	353
	354	int register_kmmio_probe(struct kmmio_probe *p)
	355	{
	356	unsigned long flags;
	357	int ret = 0;
	358	unsigned long size = 0;
	359
	360	spin_lock_irqsave(&kmmio_lock, flags);
	361	if (get_kmmio_probe(p->addr)) {
	362	ret = -EEXIST;
	363	goto out;
	364	}
	365	kmmio_count++;
	366	list_add_rcu(&p->list, &kmmio_probes);
	367	while (size < p->len) {
	368	if (add_kmmio_fault_page(p->addr + size))
	369	pr_err("kmmio: Unable to set page fault.\n");
	370	size += PAGE_SIZE;
	371	}
	372	out:
	373	spin_unlock_irqrestore(&kmmio_lock, flags);
	374	/*
	375	* XXX: What should I do here?
	376	* Here was a call to global_flush_tlb(), but it does not exist
	377	* anymore. It seems it's not needed after all.
	378	*/
	379	return ret;
	380	}
	381	EXPORT_SYMBOL(register_kmmio_probe);
	382
	383	static void rcu_free_kmmio_fault_pages(struct rcu_head *head)
	384	{
	385	struct kmmio_delayed_release *dr = container_of(
	386	head,
	387	struct kmmio_delayed_release,
	388	rcu);
	389	struct kmmio_fault_page *p = dr->release_list;
	390	while (p) {
	391	struct kmmio_fault_page *next = p->release_next;
	392	BUG_ON(p->count);
	393	kfree(p);
	394	p = next;
	395	}
	396	kfree(dr);
	397	}
	398
	399	static void remove_kmmio_fault_pages(struct rcu_head *head)
	400	{
	401	struct kmmio_delayed_release *dr = container_of(
	402	head,
	403	struct kmmio_delayed_release,
	404	rcu);
	405	struct kmmio_fault_page *p = dr->release_list;
	406	struct kmmio_fault_page **prevp = &dr->release_list;
	407	unsigned long flags;
	408	spin_lock_irqsave(&kmmio_lock, flags);
	409	while (p) {
	410	if (!p->count)
	411	list_del_rcu(&p->list);
	412	else
	413	*prevp = p->release_next;
	414	prevp = &p->release_next;
	415	p = p->release_next;
	416	}
	417	spin_unlock_irqrestore(&kmmio_lock, flags);
	418	/* This is the real RCU destroy call. */
	419	call_rcu(&dr->rcu, rcu_free_kmmio_fault_pages);
	420	}
	421
	422	/*
	423	* Remove a kmmio probe. You have to synchronize_rcu() before you can be
	424	* sure that the callbacks will not be called anymore. Only after that
	425	* you may actually release your struct kmmio_probe.
	426	*
	427	* Unregistering a kmmio fault page has three steps:
	428	* 1. release_kmmio_fault_page()
	429	* Disarm the page, wait a grace period to let all faults finish.
	430	* 2. remove_kmmio_fault_pages()
	431	* Remove the pages from kmmio_page_table.
	432	* 3. rcu_free_kmmio_fault_pages()
	433	* Actally free the kmmio_fault_page structs as with RCU.
	434	*/
	435	void unregister_kmmio_probe(struct kmmio_probe *p)
	436	{
	437	unsigned long flags;
	438	unsigned long size = 0;
	439	struct kmmio_fault_page *release_list = NULL;
	440	struct kmmio_delayed_release *drelease;
	441
	442	spin_lock_irqsave(&kmmio_lock, flags);
	443	while (size < p->len) {
	444	release_kmmio_fault_page(p->addr + size, &release_list);
	445	size += PAGE_SIZE;
	446	}
	447	list_del_rcu(&p->list);
	448	kmmio_count--;
	449	spin_unlock_irqrestore(&kmmio_lock, flags);
	450
	451	drelease = kmalloc(sizeof(*drelease), GFP_ATOMIC);
	452	if (!drelease) {
	453	pr_crit("kmmio: leaking kmmio_fault_page objects.\n");
	454	return;
	455	}
	456	drelease->release_list = release_list;
	457
	458	/*
	459	* This is not really RCU here. We have just disarmed a set of
	460	* pages so that they cannot trigger page faults anymore. However,
	461	* we cannot remove the pages from kmmio_page_table,
	462	* because a probe hit might be in flight on another CPU. The
	463	* pages are collected into a list, and they will be removed from
	464	* kmmio_page_table when it is certain that no probe hit related to
	465	* these pages can be in flight. RCU grace period sounds like a
	466	* good choice.
	467	*
	468	* If we removed the pages too early, kmmio page fault handler might
	469	* not find the respective kmmio_fault_page and determine it's not
	470	* a kmmio fault, when it actually is. This would lead to madness.
	471	*/
	472	call_rcu(&drelease->rcu, remove_kmmio_fault_pages);
	473	}
	474	EXPORT_SYMBOL(unregister_kmmio_probe);
	475
	476	static int kmmio_die_notifier(struct notifier_block *nb, unsigned long val,
	477	void *args)
	478	{
	479	struct die_args *arg = args;
	480
	481	if (val == DIE_DEBUG && (arg->err & DR_STEP))
	482	if (post_kmmio_handler(arg->err, arg->regs) == 1)
	483	return NOTIFY_STOP;
	484
	485	return NOTIFY_DONE;
	486	}
	487
	488	static struct notifier_block nb_die = {
	489	.notifier_call = kmmio_die_notifier
	490	};
	491
	492	static int __init init_kmmio(void)
	493	{
	494	int i;
	495	for (i = 0; i < KMMIO_PAGE_TABLE_SIZE; i++)
	496	INIT_LIST_HEAD(&kmmio_page_table[i]);
	497	return register_die_notifier(&nb_die);
	498	}
	499	fs_initcall(init_kmmio); /* should be before device_initcall() */