author    | Pekka Paalanen <pq@iki.fi>           | 2008-05-12 15:20:57 -0400
committer | Thomas Gleixner <tglx@linutronix.de> | 2008-05-24 05:22:12 -0400
commit    | 0fd0e3da4557c479b820b9a4a7afa25b4637ddf2
tree      | 5f34b3673202303f394c6dd180a15751f50014e9 /arch/x86/kernel/mmiotrace/kmmio.c
parent    | f513638030ca384b0bace4df64f0b82f6ae1e4c6
x86: mmiotrace full patch, preview 1
kmmio.c handles the list of mmio probes with callbacks and the list of traced
pages, and hooks into the page fault handler and die notifier. It arms,
traps, and disarms the given pages; this is the core of mmiotrace.
mmio-mod.c is the user interface: it hooks into the ioremap functions and
registers the mmio probes. It also decodes the required information from
trapped mmio accesses via the pre and post callbacks in each probe.
Currently, hooking into ioremap functions works by redefining the symbols
of the target (binary) kernel module, so that it calls the traced
versions of the functions.
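
For illustration, a minimal sketch of how a client such as mmio-mod.c might
drive this API. reference_kmmio(), register_kmmio_probe(),
unregister_kmmio_probe() and the handler signatures are taken from the patch
below; the kmmio_probe field layout lives in <linux/mmiotrace.h>, which is
not shown here, and my_pre/my_post/my_probe are hypothetical names, so treat
this as a sketch rather than actual mmio-mod.c code:

#include <linux/module.h>
#include <linux/mmiotrace.h>

static void my_pre(struct kmmio_probe *p, struct pt_regs *regs,
						unsigned long addr)
{
	/* Runs with the page disarmed, before the access is replayed. */
}

static void my_post(struct kmmio_probe *p, unsigned long condition,
						struct pt_regs *regs)
{
	/* Runs after the single step over the access has completed. */
}

static struct kmmio_probe my_probe = {
	.addr = 0,		/* set to the ioremap()'d address to trace */
	.len = PAGE_SIZE,
	.pre_handler = my_pre,
	.post_handler = my_post,
};

static int __init my_init(void)
{
	reference_kmmio();	/* must precede any other kmmio call */
	return register_kmmio_probe(&my_probe);
}

static void __exit my_exit(void)
{
	unregister_kmmio_probe(&my_probe);
	synchronize_rcu();	/* wait until callbacks cannot fire anymore */
	unreference_kmmio();
}

module_init(my_init);
module_exit(my_exit);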
The most notable changes since the last discussion are:
- kmmio.c is a built-in, not part of the module
- direct call from fault.c to kmmio.c, removing all dynamic hooks
- prepare for unregistering probes at any time
- make kmmio re-initializable and accessible to more than one user
- rewrite kmmio locking to remove all spinlocks from the page fault path
  (see the sketch after this list)
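
A generic sketch of the list-RCU pattern behind the last item (entry,
publish_entry and find_entry are hypothetical names, not code from the
patch): writers still serialize on a spinlock, while the page fault path
runs under rcu_read_lock() only and never spins:

#include <linux/list.h>
#include <linux/rcupdate.h>
#include <linux/spinlock.h>

struct entry {
	struct list_head list;
	unsigned long addr;
};

static DEFINE_SPINLOCK(writer_lock);
static LIST_HEAD(entries);

/* Writer side: may be slow, serialized by the spinlock. */
static void publish_entry(struct entry *e)
{
	spin_lock_irq(&writer_lock);
	list_add_rcu(&e->list, &entries);	/* publish to readers */
	spin_unlock_irq(&writer_lock);
}

/* Fault path: caller holds rcu_read_lock(); no spinlock taken. */
static struct entry *find_entry(unsigned long addr)
{
	struct entry *e;
	list_for_each_entry_rcu(e, &entries, list)
		if (e->addr == addr)
			return e;
	return NULL;
}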
Can I abuse call_rcu() the way I do in kmmio.c:unregister_kmmio_probe(),
or is there a better way? The function called via call_rcu() itself calls
call_rcu() again; will this work or break? There I need a second RCU grace
period after the first grace period for page faults.
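
For reference, the chained pattern in question reduced to a skeleton.
stage1_unlink and stage2_free are hypothetical stand-ins for
remove_kmmio_fault_pages() and rcu_free_kmmio_fault_pages() in the diff
below, and whether this is a legitimate use of call_rcu() is exactly the
open question above:

/* Relies on struct kmmio_delayed_release as defined in the patch. */
static void stage2_free(struct rcu_head *head)
{
	struct kmmio_delayed_release *dr =
		container_of(head, struct kmmio_delayed_release, rcu);
	/* Second grace period over: no reader can still see the pages. */
	kfree(dr);
}

static void stage1_unlink(struct rcu_head *head)
{
	struct kmmio_delayed_release *dr =
		container_of(head, struct kmmio_delayed_release, rcu);
	/* First grace period over: unlink entries from the hash table. */
	/*
	 * The rcu_head belongs to us again once this callback runs, so
	 * it is reused to wait one more grace period before freeing.
	 */
	call_rcu(&dr->rcu, stage2_free);
}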
Mmiotrace itself (mmio-mod.c) is still a module; I am going to attack that
next. At some point I will start looking into how to make mmiotrace a
tracer component of ftrace (thanks for the hint, Ingo). Ftrace should make
the user-space part of mmiotracing as simple as
'cat /debug/trace/mmio > dump.txt'.
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Diffstat (limited to 'arch/x86/kernel/mmiotrace/kmmio.c')
-rw-r--r-- | arch/x86/kernel/mmiotrace/kmmio.c | 349
1 file changed, 240 insertions(+), 109 deletions(-)
diff --git a/arch/x86/kernel/mmiotrace/kmmio.c b/arch/x86/kernel/mmiotrace/kmmio.c
index 5e239d0b8467..539a9b19588f 100644
--- a/arch/x86/kernel/mmiotrace/kmmio.c
+++ b/arch/x86/kernel/mmiotrace/kmmio.c
@@ -6,6 +6,7 @@
  */
 
 #include <linux/version.h>
+#include <linux/list.h>
 #include <linux/spinlock.h>
 #include <linux/hash.h>
 #include <linux/init.h>
@@ -17,70 +18,119 @@
 #include <linux/ptrace.h>
 #include <linux/preempt.h>
 #include <linux/percpu.h>
+#include <linux/kdebug.h>
 #include <asm/io.h>
 #include <asm/cacheflush.h>
 #include <asm/errno.h>
 #include <asm/tlbflush.h>
 #include <asm/pgtable.h>
 
-#include "kmmio.h"
+#include <linux/mmiotrace.h>
 
-#define KMMIO_HASH_BITS 6
-#define KMMIO_TABLE_SIZE (1 << KMMIO_HASH_BITS)
 #define KMMIO_PAGE_HASH_BITS 4
 #define KMMIO_PAGE_TABLE_SIZE (1 << KMMIO_PAGE_HASH_BITS)
 
+struct kmmio_fault_page {
+	struct list_head list;
+	struct kmmio_fault_page *release_next;
+	unsigned long page; /* location of the fault page */
+
+	/*
+	 * Number of times this page has been registered as a part
+	 * of a probe. If zero, page is disarmed and this may be freed.
+	 * Used only by writers (RCU).
+	 */
+	int count;
+};
+
+struct kmmio_delayed_release {
+	struct rcu_head rcu;
+	struct kmmio_fault_page *release_list;
+};
+
 struct kmmio_context {
 	struct kmmio_fault_page *fpage;
 	struct kmmio_probe *probe;
 	unsigned long saved_flags;
+	unsigned long addr;
 	int active;
 };
 
-static int kmmio_page_fault(struct pt_regs *regs, unsigned long error_code,
-						unsigned long address);
 static int kmmio_die_notifier(struct notifier_block *nb, unsigned long val,
 								void *args);
 
+static DECLARE_MUTEX(kmmio_init_mutex);
 static DEFINE_SPINLOCK(kmmio_lock);
 
 /* These are protected by kmmio_lock */
+static int kmmio_initialized;
 unsigned int kmmio_count;
-static unsigned int handler_registered;
+
+/* Read-protected by RCU, write-protected by kmmio_lock. */
 static struct list_head kmmio_page_table[KMMIO_PAGE_TABLE_SIZE];
 static LIST_HEAD(kmmio_probes);
 
+static struct list_head *kmmio_page_list(unsigned long page)
+{
+	return &kmmio_page_table[hash_long(page, KMMIO_PAGE_HASH_BITS)];
+}
+
 /* Accessed per-cpu */
 static DEFINE_PER_CPU(struct kmmio_context, kmmio_ctx);
 
+/* protected by kmmio_init_mutex */
 static struct notifier_block nb_die = {
 	.notifier_call = kmmio_die_notifier
 };
 
-int init_kmmio(void)
+/**
+ * Makes sure kmmio is initialized and usable.
+ * This must be called before any other kmmio function defined here.
+ * May sleep.
+ */
+void reference_kmmio(void)
 {
-	int i;
-	for (i = 0; i < KMMIO_PAGE_TABLE_SIZE; i++)
-		INIT_LIST_HEAD(&kmmio_page_table[i]);
-
-	register_die_notifier(&nb_die);
-	return 0;
+	down(&kmmio_init_mutex);
+	spin_lock_irq(&kmmio_lock);
+	if (!kmmio_initialized) {
+		int i;
+		for (i = 0; i < KMMIO_PAGE_TABLE_SIZE; i++)
+			INIT_LIST_HEAD(&kmmio_page_table[i]);
+		if (register_die_notifier(&nb_die))
+			BUG();
+	}
+	kmmio_initialized++;
+	spin_unlock_irq(&kmmio_lock);
+	up(&kmmio_init_mutex);
 }
+EXPORT_SYMBOL_GPL(reference_kmmio);
 
-void cleanup_kmmio(void)
+/**
+ * Clean up kmmio after use. This must be called for every call to
+ * reference_kmmio(). All probes registered after the corresponding
+ * reference_kmmio() must have been unregistered when calling this.
+ * May sleep.
+ */
+void unreference_kmmio(void)
 {
-	/*
-	 * Assume the following have been already cleaned by calling
-	 * unregister_kmmio_probe() appropriately:
-	 * kmmio_page_table, kmmio_probes
-	 */
-	if (handler_registered) {
-		if (mmiotrace_unregister_pf(&kmmio_page_fault))
-			BUG();
-		synchronize_rcu();
+	bool unreg = false;
+
+	down(&kmmio_init_mutex);
+	spin_lock_irq(&kmmio_lock);
+
+	if (kmmio_initialized == 1) {
+		BUG_ON(is_kmmio_active());
+		unreg = true;
 	}
-	unregister_die_notifier(&nb_die);
+	kmmio_initialized--;
+	BUG_ON(kmmio_initialized < 0);
+	spin_unlock_irq(&kmmio_lock);
+
+	if (unreg)
+		unregister_die_notifier(&nb_die); /* calls sync_rcu() */
+	up(&kmmio_init_mutex);
 }
+EXPORT_SYMBOL(unreference_kmmio);
 
 /*
  * this is basically a dynamic stabbing problem:
@@ -90,33 +140,33 @@ void cleanup_kmmio(void)
  * Overlap a Point (might be simple)
  * Space Efficient Dynamic Stabbing with Fast Queries - Mikkel Thorup
  */
-/* Get the kmmio at this addr (if any). You must be holding kmmio_lock. */
+/* Get the kmmio at this addr (if any). You must be holding RCU read lock. */
 static struct kmmio_probe *get_kmmio_probe(unsigned long addr)
 {
 	struct kmmio_probe *p;
-	list_for_each_entry(p, &kmmio_probes, list) {
+	list_for_each_entry_rcu(p, &kmmio_probes, list) {
 		if (addr >= p->addr && addr <= (p->addr + p->len))
 			return p;
 	}
 	return NULL;
 }
 
+/* You must be holding RCU read lock. */
 static struct kmmio_fault_page *get_kmmio_fault_page(unsigned long page)
 {
-	struct list_head *head, *tmp;
+	struct list_head *head;
+	struct kmmio_fault_page *p;
 
 	page &= PAGE_MASK;
-	head = &kmmio_page_table[hash_long(page, KMMIO_PAGE_HASH_BITS)];
-	list_for_each(tmp, head) {
-		struct kmmio_fault_page *p
-			= list_entry(tmp, struct kmmio_fault_page, list);
+	head = kmmio_page_list(page);
+	list_for_each_entry_rcu(p, head, list) {
 		if (p->page == page)
 			return p;
 	}
-
 	return NULL;
 }
 
+/** Mark the given page as not present. Access to it will trigger a fault. */
 static void arm_kmmio_fault_page(unsigned long page, int *page_level)
 {
 	unsigned long address = page & PAGE_MASK;
@@ -124,8 +174,8 @@ static void arm_kmmio_fault_page(unsigned long page, int *page_level)
 	pte_t *pte = lookup_address(address, &level);
 
 	if (!pte) {
-		printk(KERN_ERR "Error in %s: no pte for page 0x%08lx\n",
-						__FUNCTION__, page);
+		pr_err("kmmio: Error in %s: no pte for page 0x%08lx\n",
+							__func__, page);
 		return;
 	}
 
@@ -143,6 +193,7 @@ static void arm_kmmio_fault_page(unsigned long page, int *page_level)
 	__flush_tlb_one(page);
 }
 
+/** Mark the given page as present. */
 static void disarm_kmmio_fault_page(unsigned long page, int *page_level)
 {
 	unsigned long address = page & PAGE_MASK;
@@ -150,8 +201,8 @@ static void disarm_kmmio_fault_page(unsigned long page, int *page_level)
 	pte_t *pte = lookup_address(address, &level);
 
 	if (!pte) {
-		printk(KERN_ERR "Error in %s: no pte for page 0x%08lx\n",
-						__FUNCTION__, page);
+		pr_err("kmmio: Error in %s: no pte for page 0x%08lx\n",
+							__func__, page);
 		return;
 	}
 
@@ -170,12 +221,24 @@ static void disarm_kmmio_fault_page(unsigned long page, int *page_level)
 }
 
 /*
+ * This is being called from do_page_fault().
+ *
+ * We may be in an interrupt or a critical section. Also prefecthing may
+ * trigger a page fault. We may be in the middle of process switch.
+ * We cannot take any locks, because we could be executing especially
+ * within a kmmio critical section.
+ *
+ * Local interrupts are disabled, so preemption cannot happen.
+ * Do not enable interrupts, do not sleep, and watch out for other CPUs.
+ */
+/*
  * Interrupts are disabled on entry as trap3 is an interrupt gate
  * and they remain disabled thorough out this function.
  */
-static int kmmio_handler(struct pt_regs *regs, unsigned long addr)
+int kmmio_handler(struct pt_regs *regs, unsigned long addr)
 {
-	struct kmmio_context *ctx = &get_cpu_var(kmmio_ctx);
+	struct kmmio_context *ctx;
+	struct kmmio_fault_page *faultpage;
 
 	/*
 	 * Preemption is now disabled to prevent process switch during
@@ -186,40 +249,40 @@ static int kmmio_handler(struct pt_regs *regs, unsigned long addr)
 	 * XXX what if an interrupt occurs between returning from
 	 * do_page_fault() and entering the single-step exception handler?
 	 * And that interrupt triggers a kmmio trap?
+	 * XXX If we tracing an interrupt service routine or whatever, is
+	 * this enough to keep it on the current cpu?
 	 */
 	preempt_disable();
 
-	/* interrupts disabled and CPU-local data => atomicity guaranteed. */
+	rcu_read_lock();
+	faultpage = get_kmmio_fault_page(addr);
+	if (!faultpage) {
+		/*
+		 * Either this page fault is not caused by kmmio, or
+		 * another CPU just pulled the kmmio probe from under
+		 * our feet. In the latter case all hell breaks loose.
+		 */
+		goto no_kmmio;
+	}
+
+	ctx = &get_cpu_var(kmmio_ctx);
 	if (ctx->active) {
 		/*
-		 * This avoids a deadlock with kmmio_lock.
+		 * Prevent overwriting already in-flight context.
 		 * If this page fault really was due to kmmio trap,
 		 * all hell breaks loose.
 		 */
-		printk(KERN_EMERG "mmiotrace: recursive probe hit on CPU %d, "
-				"for address %lu. Ignoring.\n",
+		pr_emerg("kmmio: recursive probe hit on CPU %d, "
+				"for address 0x%08lx. Ignoring.\n",
 			smp_processor_id(), addr);
-		goto no_kmmio;
+		goto no_kmmio_ctx;
 	}
 	ctx->active++;
 
-	/*
-	 * Acquire the kmmio lock to prevent changes affecting
-	 * get_kmmio_fault_page() and get_kmmio_probe(), since we save their
-	 * returned pointers.
-	 * The lock is released in post_kmmio_handler().
-	 * XXX: could/should get_kmmio_*() be using RCU instead of spinlock?
-	 */
-	spin_lock(&kmmio_lock);
-
-	ctx->fpage = get_kmmio_fault_page(addr);
-	if (!ctx->fpage) {
-		/* this page fault is not caused by kmmio */
-		goto no_kmmio_locked;
-	}
-
+	ctx->fpage = faultpage;
 	ctx->probe = get_kmmio_probe(addr);
 	ctx->saved_flags = (regs->flags & (TF_MASK|IF_MASK));
+	ctx->addr = addr;
 
 	if (ctx->probe && ctx->probe->pre_handler)
 		ctx->probe->pre_handler(ctx->probe, regs, addr);
@@ -227,46 +290,62 @@ static int kmmio_handler(struct pt_regs *regs, unsigned long addr)
 	regs->flags |= TF_MASK;
 	regs->flags &= ~IF_MASK;
 
-	/* We hold lock, now we set present bit in PTE and single step. */
+	/* Now we set present bit in PTE and single step. */
 	disarm_kmmio_fault_page(ctx->fpage->page, NULL);
 
 	put_cpu_var(kmmio_ctx);
+	rcu_read_unlock();
 	return 1;
 
-no_kmmio_locked:
-	spin_unlock(&kmmio_lock);
-	ctx->active--;
+no_kmmio_ctx:
+	put_cpu_var(kmmio_ctx);
 no_kmmio:
+	rcu_read_unlock();
 	preempt_enable_no_resched();
-	put_cpu_var(kmmio_ctx);
-	/* page fault not handled by kmmio */
-	return 0;
+	return 0;	/* page fault not handled by kmmio */
 }
 
 /*
  * Interrupts are disabled on entry as trap1 is an interrupt gate
  * and they remain disabled thorough out this function.
- * And we hold kmmio lock.
+ * This must always get called as the pair to kmmio_handler().
  */
 static int post_kmmio_handler(unsigned long condition, struct pt_regs *regs)
 {
 	int ret = 0;
+	struct kmmio_probe *probe;
+	struct kmmio_fault_page *faultpage;
 	struct kmmio_context *ctx = &get_cpu_var(kmmio_ctx);
 
 	if (!ctx->active)
 		goto out;
 
+	rcu_read_lock();
+
+	faultpage = get_kmmio_fault_page(ctx->addr);
+	probe = get_kmmio_probe(ctx->addr);
+	if (faultpage != ctx->fpage || probe != ctx->probe) {
+		/*
+		 * The trace setup changed after kmmio_handler() and before
+		 * running this respective post handler. User does not want
+		 * the result anymore.
+		 */
+		ctx->probe = NULL;
+		ctx->fpage = NULL;
+	}
+
 	if (ctx->probe && ctx->probe->post_handler)
 		ctx->probe->post_handler(ctx->probe, condition, regs);
 
-	arm_kmmio_fault_page(ctx->fpage->page, NULL);
+	if (ctx->fpage)
+		arm_kmmio_fault_page(ctx->fpage->page, NULL);
 
 	regs->flags &= ~TF_MASK;
 	regs->flags |= ctx->saved_flags;
 
 	/* These were acquired in kmmio_handler(). */
 	ctx->active--;
-	spin_unlock(&kmmio_lock);
+	BUG_ON(ctx->active);
 	preempt_enable_no_resched();
 
 	/*
@@ -277,11 +356,13 @@ static int post_kmmio_handler(unsigned long condition, struct pt_regs *regs)
 	if (!(regs->flags & TF_MASK))
 		ret = 1;
 
+	rcu_read_unlock();
 out:
 	put_cpu_var(kmmio_ctx);
 	return ret;
 }
 
+/* You must be holding kmmio_lock. */
 static int add_kmmio_fault_page(unsigned long page)
 {
 	struct kmmio_fault_page *f;
@@ -289,6 +370,8 @@ static int add_kmmio_fault_page(unsigned long page)
 	page &= PAGE_MASK;
 	f = get_kmmio_fault_page(page);
 	if (f) {
+		if (!f->count)
+			arm_kmmio_fault_page(f->page, NULL);
 		f->count++;
 		return 0;
 	}
@@ -299,15 +382,16 @@ static int add_kmmio_fault_page(unsigned long page)
 
 	f->count = 1;
 	f->page = page;
-	list_add(&f->list,
-		&kmmio_page_table[hash_long(f->page, KMMIO_PAGE_HASH_BITS)]);
+	list_add_rcu(&f->list, kmmio_page_list(f->page));
 
 	arm_kmmio_fault_page(f->page, NULL);
 
 	return 0;
 }
 
-static void release_kmmio_fault_page(unsigned long page)
+/* You must be holding kmmio_lock. */
+static void release_kmmio_fault_page(unsigned long page,
+				struct kmmio_fault_page **release_list)
 {
 	struct kmmio_fault_page *f;
 
@@ -317,9 +401,11 @@ static void release_kmmio_fault_page(unsigned long page)
 		return;
 
 	f->count--;
+	BUG_ON(f->count < 0);
 	if (!f->count) {
 		disarm_kmmio_fault_page(f->page, NULL);
-		list_del(&f->list);
+		f->release_next = *release_list;
+		*release_list = f;
 	}
 }
 
@@ -334,68 +420,113 @@ int register_kmmio_probe(struct kmmio_probe *p)
 		ret = -EEXIST;
 		goto out;
 	}
-	list_add(&p->list, &kmmio_probes);
-	/*printk("adding fault pages...\n");*/
+	list_add_rcu(&p->list, &kmmio_probes);
 	while (size < p->len) {
 		if (add_kmmio_fault_page(p->addr + size))
-			printk(KERN_ERR "mmio: Unable to set page fault.\n");
+			pr_err("kmmio: Unable to set page fault.\n");
 		size += PAGE_SIZE;
 	}
-
-	if (!handler_registered) {
-		if (mmiotrace_register_pf(&kmmio_page_fault))
-			printk(KERN_ERR "mmiotrace: Cannot register page "
-					"fault handler.\n");
-		else
-			handler_registered++;
-	}
-
 out:
 	spin_unlock_irq(&kmmio_lock);
 	/*
 	 * XXX: What should I do here?
 	 * Here was a call to global_flush_tlb(), but it does not exist
-	 * anymore.
+	 * anymore. It seems it's not needed after all.
	 */
 	return ret;
 }
+EXPORT_SYMBOL(register_kmmio_probe);
 
+static void rcu_free_kmmio_fault_pages(struct rcu_head *head)
+{
+	struct kmmio_delayed_release *dr = container_of(
+						head,
+						struct kmmio_delayed_release,
+						rcu);
+	struct kmmio_fault_page *p = dr->release_list;
+	while (p) {
+		struct kmmio_fault_page *next = p->release_next;
+		BUG_ON(p->count);
+		kfree(p);
+		p = next;
+	}
+	kfree(dr);
+}
+
+static void remove_kmmio_fault_pages(struct rcu_head *head)
+{
+	struct kmmio_delayed_release *dr = container_of(
+						head,
+						struct kmmio_delayed_release,
+						rcu);
+	struct kmmio_fault_page *p = dr->release_list;
+	struct kmmio_fault_page **prevp = &dr->release_list;
+	unsigned long flags;
+	spin_lock_irqsave(&kmmio_lock, flags);
+	while (p) {
+		if (!p->count)
+			list_del_rcu(&p->list);
+		else
+			*prevp = p->release_next;
+		prevp = &p->release_next;
+		p = p->release_next;
+	}
+	spin_unlock_irqrestore(&kmmio_lock, flags);
+	/* This is the real RCU destroy call. */
+	call_rcu(&dr->rcu, rcu_free_kmmio_fault_pages);
+}
+
+/*
+ * Remove a kmmio probe. You have to synchronize_rcu() before you can be
+ * sure that the callbacks will not be called anymore.
+ *
+ * Unregistering a kmmio fault page has three steps:
+ * 1. release_kmmio_fault_page()
+ *    Disarm the page, wait a grace period to let all faults finish.
+ * 2. remove_kmmio_fault_pages()
+ *    Remove the pages from kmmio_page_table.
+ * 3. rcu_free_kmmio_fault_pages()
+ *    Actally free the kmmio_fault_page structs as with RCU.
+ */
 void unregister_kmmio_probe(struct kmmio_probe *p)
 {
 	unsigned long size = 0;
+	struct kmmio_fault_page *release_list = NULL;
+	struct kmmio_delayed_release *drelease;
 
 	spin_lock_irq(&kmmio_lock);
 	while (size < p->len) {
-		release_kmmio_fault_page(p->addr + size);
+		release_kmmio_fault_page(p->addr + size, &release_list);
 		size += PAGE_SIZE;
 	}
-	list_del(&p->list);
+	list_del_rcu(&p->list);
 	kmmio_count--;
 	spin_unlock_irq(&kmmio_lock);
-}
 
-/*
- * According to 2.6.20, mainly x86_64 arch:
- * This is being called from do_page_fault(), via the page fault notifier
- * chain. The chain is called for both user space faults and kernel space
- * faults (address >= TASK_SIZE64), except not on faults serviced by
- * vmalloc_fault().
- *
- * We may be in an interrupt or a critical section. Also prefecthing may
- * trigger a page fault. We may be in the middle of process switch.
- * The page fault hook functionality has put us inside RCU read lock.
- *
- * Local interrupts are disabled, so preemption cannot happen.
- * Do not enable interrupts, do not sleep, and watch out for other CPUs.
- */
-static int kmmio_page_fault(struct pt_regs *regs, unsigned long error_code,
-						unsigned long address)
-{
-	if (is_kmmio_active())
-		if (kmmio_handler(regs, address) == 1)
-			return -1;
-	return 0;
+	drelease = kmalloc(sizeof(*drelease), GFP_ATOMIC);
+	if (!drelease) {
+		pr_crit("kmmio: leaking kmmio_fault_page objects.\n");
+		return;
+	}
+	drelease->release_list = release_list;
+
+	/*
+	 * This is not really RCU here. We have just disarmed a set of
+	 * pages so that they cannot trigger page faults anymore. However,
+	 * we cannot remove the pages from kmmio_page_table,
+	 * because a probe hit might be in flight on another CPU. The
	 * pages are collected into a list, and they will be removed from
+	 * kmmio_page_table when it is certain that no probe hit related to
+	 * these pages can be in flight. RCU grace period sounds like a
+	 * good choice.
+	 *
+	 * If we removed the pages too early, kmmio page fault handler might
+	 * not find the respective kmmio_fault_page and determine it's not
+	 * a kmmio fault, when it actually is. This would lead to madness.
+	 */
+	call_rcu(&drelease->rcu, remove_kmmio_fault_pages);
 }
+EXPORT_SYMBOL(unregister_kmmio_probe);
 
 static int kmmio_die_notifier(struct notifier_block *nb, unsigned long val,
 								void *args)