dm raid1: add userspace log

This patch contains a device-mapper mirror log module that forwards requests to userspace for processing. The structures used for communication between kernel and userspace are located in include/linux/dm-log-userspace.h. Due to the frequency, diversity, and 2-way communication nature of the exchanges between kernel and userspace, 'connector' was chosen as the interface for communication. The first log implementations written in userspace - "clustered-disk" and "clustered-core" - support clustered shared storage. A userspace daemon (in the LVM2 source code repository) uses openAIS/corosync to process requests in an ordered fashion with the rest of the nodes in the cluster so as to prevent log state corruption. Other implementations with no association to LVM or openAIS/corosync are certainly possible. (Imagine if two machines are writing to the same region of a mirror. They would both mark the region dirty, but you need a cluster-aware entity that can handle properly marking the region clean when they are done. Otherwise, you might clear the region when the first machine is done, not the second.) Signed-off-by: Jonathan Brassow <jbrassow@redhat.com> Cc: Evgeniy Polyakov <johnpol@2ka.mipt.ru> Signed-off-by: Alasdair G Kergon <agk@redhat.com>
author: Jonthan Brassow <jbrassow@redhat.com> 2009-06-22 05:12:35 -0400
committer: Alasdair G Kergon <agk@redhat.com> 2009-06-22 05:12:35 -0400
commit: f5db4af466e2dca0fe822019812d586ca910b00c (patch)
tree: 1bbaaa36509df9f7eecc19ccffa434048cf4b555 /drivers/md/dm-log-userspace-base.c
parent: 754c5fc7ebb417b23601a6222a6005cc2e7f2913 (diff)
1 files changed, 696 insertions, 0 deletions
diff --git a/drivers/md/dm-log-userspace-base.c b/drivers/md/dm-log-userspace-base.c
new file mode 100644
index 000000000000..e69b96560997
--- /dev/null
+++ b/drivers/md/dm-log-userspace-base.c
@@ -0,0 +1,696 @@
+/*
+ * Copyright (C) 2006-2009 Red Hat, Inc.
+ *
+ * This file is released under the LGPL.
+ */
+#include <linux/bio.h>
+#include <linux/jiffies.h>
+#include <linux/dm-dirty-log.h>
+#include <linux/device-mapper.h>
+#include <linux/dm-log-userspace.h>
+#include "dm-log-userspace-transfer.h"
+/*
+ * One queued mark/clear request.  Entries are added to log_c.flush_list
+ * by userspace_mark_region()/userspace_clear_region() and sent to the
+ * server, then freed, by userspace_flush().
+ */
+struct flush_entry {
+        /* Request type sent to the server: DM_ULOG_MARK_REGION or DM_ULOG_CLEAR_REGION */
+        int type;
+        /* Region the request applies to */
+        region_t region;
+        /* Link in log_c.flush_list */
+        struct list_head list;
+};
+/*
+ * Per-log context, allocated by userspace_ctr() and stored in
+ * dm_dirty_log.context.
+ */
+struct log_c {
+        /* Target that owns this log; used for ti->len and table events */
+        struct dm_target *ti;
+        /* Region size as reported by the server (DM_ULOG_GET_REGION_SIZE) */
+        uint32_t region_size;
+        /* ti->len divided by region_size, rounded up */
+        region_t region_count;
+        /* Identifies this log in every request sent to the server */
+        char uuid[DM_UUID_LEN];
+        /* Constructor string; re-sent when the server has to be re-contacted */
+        char *usr_argv_str;
+        /* Argument count handed to userspace_ctr(); reported in table status */
+        uint32_t usr_argc;
+        /*
+         * in_sync_hint gets set when doing is_remote_recovering.  It
+         * represents the first region that needs recovery.  IOW, the
+         * first zero bit of sync_bits.  This can be useful to limit
+         * traffic for calls like is_remote_recovering and get_resync_work,
+         * but take care in its use for anything else.
+         */
+        uint64_t in_sync_hint;
+        /* Protects flush_list */
+        spinlock_t flush_lock;
+        struct list_head flush_list;  /* only for clear and mark requests */
+};
+/* Pool that flush_entry objects are allocated from; created at module init. */
+static mempool_t *flush_entry_pool;
+/*
+ * mempool allocation callback: hand back one kmalloc'ed flush_entry
+ * (NULL if kmalloc fails).  'pool_data' is not used.
+ */
+static void *flush_entry_alloc(gfp_t gfp_mask, void *pool_data)
+{
+        struct flush_entry *entry = kmalloc(sizeof(*entry), gfp_mask);
+
+        return entry;
+}
+/* mempool release callback: kfree one flush_entry.  'pool_data' is not used. */
+static void flush_entry_free(void *element, void *pool_data)
+{
+        struct flush_entry *entry = element;
+
+        kfree(entry);
+}
+/*
+ * userspace_do_request
+ *
+ * Send one request to the userspace log server.  Any result other than
+ * -ESRCH is handed straight back to the caller.  -ESRCH means the server
+ * isn't there: in that case the constructor string is re-sent (sleeping
+ * for up to two seconds before each attempt) until the server accepts
+ * it, the log is resumed, and the original request is sent again.
+ *
+ * Returns: result of the request, or -ESRCH if the log could not be
+ *          resumed after the server was re-contacted
+ */
+static int userspace_do_request(struct log_c *lc, const char *uuid,
+                                int request_type, char *data, size_t data_size,
+                                char *rdata, size_t *rdata_size)
+{
+        int rc;
+
+        for (;;) {
+                rc = dm_consult_userspace(uuid, request_type, data,
+                                          data_size, rdata, rdata_size);
+                if (rc != -ESRCH)
+                        return rc;
+
+                DMERR(" Userspace log server not found.");
+
+                /* Keep re-issuing the constructor until it is accepted */
+                do {
+                        set_current_state(TASK_INTERRUPTIBLE);
+                        schedule_timeout(2*HZ);
+                        DMWARN("Attempting to contact userspace log server...");
+                        rc = dm_consult_userspace(uuid, DM_ULOG_CTR,
+                                                  lc->usr_argv_str,
+                                                  strlen(lc->usr_argv_str) + 1,
+                                                  NULL, NULL);
+                } while (rc);
+
+                DMINFO("Reconnected to userspace log server... DM_ULOG_CTR complete");
+
+                rc = dm_consult_userspace(uuid, DM_ULOG_RESUME, NULL,
+                                          0, NULL, NULL);
+                if (rc)
+                        break;
+                /* Resumed: go round again and replay the original request */
+        }
+
+        DMERR("Error trying to resume userspace log: %d", rc);
+        return -ESRCH;
+}
+/*
+ * build_constructor_string
+ *
+ * Build the string that is sent to the server with DM_ULOG_CTR: every
+ * argument followed by a space, then the target length (ti->len)
+ * printed in decimal.
+ *
+ * On success *ctr_str points to a kzalloc'ed, NUL-terminated string
+ * that the caller must kfree().  On failure *ctr_str is NULL.
+ *
+ * Returns: length of the string (not counting the trailing NUL),
+ *          -ENOMEM if the allocation failed
+ */
+static int build_constructor_string(struct dm_target *ti,
+                                    unsigned argc, char **argv,
+                                    char **ctr_str)
+{
+        unsigned i;
+        int str_size = 0;
+        char *str;
+
+        *ctr_str = NULL;
+
+        for (i = 0; i < argc; i++)
+                str_size += strlen(argv[i]) + 1; /* +1 for space between args */
+
+        /*
+         * A printed u64 takes up to 20 characters, and sprintf() writes
+         * a terminating NUL after it that has to fit in the buffer too.
+         */
+        str_size += 20 + 1;
+
+        str = kzalloc(str_size, GFP_KERNEL);
+        if (!str) {
+                DMWARN("Unable to allocate memory for constructor string");
+                return -ENOMEM;
+        }
+
+        /* From here on str_size tracks the number of characters written */
+        str_size = 0;
+        for (i = 0; i < argc; i++)
+                str_size += sprintf(str + str_size, "%s ", argv[i]);
+        str_size += sprintf(str + str_size, "%llu",
+                            (unsigned long long)ti->len);
+
+        *ctr_str = str;
+        return str_size;
+}
+/*
+ * userspace_ctr
+ *
+ * argv contains:
+ *      <UUID> <other args>
+ * Where 'other args' is the userspace implementation specific log
+ * arguments.  An example might be:
+ *      <UUID> clustered_disk <arg count> <log dev> <region_size> [[no]sync]
+ *
+ * So, this module will strip off the <UUID> for identification purposes
+ * when communicating with userspace about a log; but will pass on everything
+ * else.
+ *
+ * On success the new log context is stored in log->context.
+ *
+ * Returns: 0 on success
+ *          -EINVAL if there are too few arguments, the UUID is too long,
+ *                  or the server reports an unusable region size
+ *          -ENOMEM if an allocation fails
+ *          otherwise the error reported when consulting the server
+ */
+static int userspace_ctr(struct dm_dirty_log *log, struct dm_target *ti,
+                         unsigned argc, char **argv)
+{
+        int r;
+        int str_size;
+        char *ctr_str = NULL;
+        struct log_c *lc;
+        uint64_t rdata;
+        size_t rdata_size;
+
+        if (argc < 3) {
+                DMWARN("Too few arguments to userspace dirty log");
+                return -EINVAL;
+        }
+
+        if (strlen(argv[0]) > (DM_UUID_LEN - 1)) {
+                DMWARN("UUID argument too long.");
+                return -EINVAL;
+        }
+
+        /* Zeroed, so in_sync_hint and usr_argv_str start out well-defined */
+        lc = kzalloc(sizeof(*lc), GFP_KERNEL);
+        if (!lc) {
+                DMWARN("Unable to allocate userspace log context.");
+                return -ENOMEM;
+        }
+
+        lc->ti = ti;
+        /* Length was checked above, so the copy is always NUL-terminated */
+        strncpy(lc->uuid, argv[0], DM_UUID_LEN);
+        spin_lock_init(&lc->flush_lock);
+        INIT_LIST_HEAD(&lc->flush_list);
+
+        str_size = build_constructor_string(ti, argc - 1, argv + 1, &ctr_str);
+        if (str_size < 0) {
+                r = str_size;
+                goto out;
+        }
+
+        /* Send table string */
+        r = dm_consult_userspace(lc->uuid, DM_ULOG_CTR,
+                                 ctr_str, str_size, NULL, NULL);
+        if (r < 0) {
+                /* Any failure here means the server has no log for us */
+                if (r == -ESRCH)
+                        DMERR("Userspace log server not found");
+                else
+                        DMERR("Userspace log server failed to create log");
+                goto out;
+        }
+
+        /* Since the region size does not change, get it now */
+        rdata_size = sizeof(rdata);
+        r = dm_consult_userspace(lc->uuid, DM_ULOG_GET_REGION_SIZE,
+                                 NULL, 0, (char *)&rdata, &rdata_size);
+        if (r) {
+                DMERR("Failed to get region size of dirty log");
+                goto out;
+        }
+
+        /*
+         * The region size is used to split up ti->len below, so it
+         * must be non-zero and must survive the narrowing to 32 bits.
+         */
+        lc->region_size = (uint32_t)rdata;
+        if (!lc->region_size || lc->region_size != rdata) {
+                DMERR("Invalid region size reported by userspace log server");
+                r = -EINVAL;
+                goto out;
+        }
+
+        lc->region_count = dm_sector_div_up(ti->len, lc->region_size);
+
+out:
+        if (r) {
+                kfree(lc);
+                kfree(ctr_str);
+                return r;
+        }
+
+        lc->usr_argv_str = ctr_str;
+        lc->usr_argc = argc;
+        log->context = lc;
+
+        return 0;
+}
+/*
+ * userspace_dtr
+ *
+ * Send DM_ULOG_DTR to the server (the result is not examined), then
+ * free the constructor string and the log context.
+ */
+static void userspace_dtr(struct dm_dirty_log *log)
+{
+        struct log_c *lc = log->context;
+
+        (void) dm_consult_userspace(lc->uuid, DM_ULOG_DTR,
+                                    NULL, 0, NULL, NULL);
+
+        kfree(lc->usr_argv_str);
+        kfree(lc);
+}
+/*
+ * userspace_presuspend
+ *
+ * Returns: result of sending DM_ULOG_PRESUSPEND to the server
+ */
+static int userspace_presuspend(struct dm_dirty_log *log)
+{
+        struct log_c *lc = log->context;
+
+        return dm_consult_userspace(lc->uuid, DM_ULOG_PRESUSPEND,
+                                    NULL, 0, NULL, NULL);
+}
+/*
+ * userspace_postsuspend
+ *
+ * Returns: result of sending DM_ULOG_POSTSUSPEND to the server
+ */
+static int userspace_postsuspend(struct dm_dirty_log *log)
+{
+        struct log_c *lc = log->context;
+
+        return dm_consult_userspace(lc->uuid, DM_ULOG_POSTSUSPEND,
+                                    NULL, 0, NULL, NULL);
+}
+/*
+ * userspace_resume
+ *
+ * Forget the cached in_sync_hint before asking the server to resume.
+ *
+ * Returns: result of sending DM_ULOG_RESUME to the server
+ */
+static int userspace_resume(struct dm_dirty_log *log)
+{
+        struct log_c *lc = log->context;
+
+        lc->in_sync_hint = 0;
+
+        return dm_consult_userspace(lc->uuid, DM_ULOG_RESUME,
+                                    NULL, 0, NULL, NULL);
+}
+/*
+ * userspace_get_region_size
+ *
+ * Returns: the region size cached in the log context
+ */
+static uint32_t userspace_get_region_size(struct dm_dirty_log *log)
+{
+        return ((struct log_c *)log->context)->region_size;
+}
+/*
+ * userspace_is_clean
+ *
+ * Ask the server whether a region is clean.  A failed request is
+ * reported to the caller as "not clean".
+ *
+ * Returns: 1 if clean, 0 otherwise
+ */
+static int userspace_is_clean(struct dm_dirty_log *log, region_t region)
+{
+        struct log_c *lc = log->context;
+        uint64_t region64 = (uint64_t)region;
+        int64_t is_clean;
+        size_t rdata_size = sizeof(is_clean);
+
+        if (userspace_do_request(lc, lc->uuid, DM_ULOG_IS_CLEAN,
+                                 (char *)&region64, sizeof(region64),
+                                 (char *)&is_clean, &rdata_size))
+                return 0;
+
+        return (int)is_clean;
+}
+/*
+ * userspace_in_sync
+ *
+ * Ask the server whether a region is in-sync.  A failed request is
+ * reported to the caller as "not in sync".
+ *
+ * If 'can_block' is not set, return -EWOULDBLOCK immediately.
+ *
+ * Returns: 1 if in-sync, 0 if not-in-sync, -EWOULDBLOCK
+ */
+static int userspace_in_sync(struct dm_dirty_log *log, region_t region,
+                             int can_block)
+{
+        struct log_c *lc = log->context;
+        uint64_t region64 = region;
+        int64_t in_sync;
+        size_t rdata_size = sizeof(in_sync);
+
+        /*
+         * Answering locally is never safe, not even with in_sync_hint
+         * set: another machine may have seen a device failure and marked
+         * the region out-of-sync.  Without asking userspace we could
+         * believe the region is in-sync and let a read return stale
+         * data.  (Very unlikely when a device really fails, but very
+         * likely when one machine loses its connection to one device.)
+         *
+         * A mirror that caches the in-sync state could still get this
+         * wrong, but then it would not call us at all - so that is a
+         * mirror problem.
+         */
+        if (!can_block)
+                return -EWOULDBLOCK;
+
+        if (userspace_do_request(lc, lc->uuid, DM_ULOG_IN_SYNC,
+                                 (char *)&region64, sizeof(region64),
+                                 (char *)&in_sync, &rdata_size))
+                return 0;
+
+        return (int)in_sync;
+}
+/*
+ * userspace_flush
+ *
+ * This function is ok to block.
+ * Flushing is done in two steps: every queued clear/mark request is
+ * sent to the server, and then a single DM_ULOG_FLUSH asks the server
+ * to commit them.  That lets the server optimise the commit rather
+ * than committing once per request.
+ *
+ * A separate thread could push the requests up to the server ahead of
+ * time, leaving the flush with a shorter list and the final commit.
+ *
+ * If any request fails, a table event is raised for the target.
+ *
+ * Returns: 0 on success, < 0 on failure
+ */
+static int userspace_flush(struct dm_dirty_log *log)
+{
+        struct log_c *lc = log->context;
+        struct flush_entry *entry, *next;
+        unsigned long irq_flags;
+        LIST_HEAD(pending);
+        int r = 0;
+
+        /* Take ownership of everything queued so far */
+        spin_lock_irqsave(&lc->flush_lock, irq_flags);
+        list_splice_init(&lc->flush_list, &pending);
+        spin_unlock_irqrestore(&lc->flush_lock, irq_flags);
+
+        if (list_empty(&pending))
+                return 0;
+
+        /*
+         * FIXME: Count up requests, group request types,
+         * allocate memory to stick all requests in and
+         * send to server in one go.  Failing the allocation,
+         * do it one by one.
+         */
+        list_for_each_entry(entry, &pending, list) {
+                r = userspace_do_request(lc, lc->uuid, entry->type,
+                                         (char *)&entry->region,
+                                         sizeof(entry->region),
+                                         NULL, NULL);
+                if (r)
+                        break;
+        }
+
+        /* Only commit if every queued request went through */
+        if (!r)
+                r = userspace_do_request(lc, lc->uuid, DM_ULOG_FLUSH,
+                                         NULL, 0, NULL, NULL);
+
+        /*
+         * The entries can be dropped even after a failure: the caller
+         * gets the error and knows that the log facility has failed.
+         */
+        list_for_each_entry_safe(entry, next, &pending, list) {
+                list_del(&entry->list);
+                mempool_free(entry, flush_entry_pool);
+        }
+
+        if (r)
+                dm_table_event(lc->ti->table);
+
+        return r;
+}
+/*
+ * userspace_mark_region
+ *
+ * Queue a DM_ULOG_MARK_REGION request for the next flush.
+ *
+ * This function should avoid blocking unless absolutely required.
+ * (Memory allocation is valid for blocking.)
+ */
+static void userspace_mark_region(struct dm_dirty_log *log, region_t region)
+{
+        struct log_c *lc = log->context;
+        struct flush_entry *entry;
+        unsigned long irq_flags;
+
+        /* Wait for an allocation, but _never_ fail */
+        entry = mempool_alloc(flush_entry_pool, GFP_NOIO);
+        BUG_ON(!entry);
+
+        /* The entry is private until it is on the list */
+        entry->type = DM_ULOG_MARK_REGION;
+        entry->region = region;
+
+        spin_lock_irqsave(&lc->flush_lock, irq_flags);
+        list_add(&entry->list, &lc->flush_list);
+        spin_unlock_irqrestore(&lc->flush_lock, irq_flags);
+}
+/*
+ * userspace_clear_region
+ *
+ * Queue a DM_ULOG_CLEAR_REGION request for the next flush.
+ *
+ * This function must not block, so the allocation must not block
+ * either.  In the worst case it is ok to fail: the region simply is
+ * not cleared.  That does nothing to the current sync context, but it
+ * does mean the region will be re-sync'ed on a reload of the mirror
+ * even though it is in-sync.
+ */
+static void userspace_clear_region(struct dm_dirty_log *log, region_t region)
+{
+        struct log_c *lc = log->context;
+        struct flush_entry *entry;
+        unsigned long irq_flags;
+
+        /*
+         * A failed allocation just skips the clear.  The only cost is
+         * that the region gets resync'ed the next time the device is
+         * activated.
+         */
+        entry = mempool_alloc(flush_entry_pool, GFP_ATOMIC);
+        if (!entry) {
+                DMERR("Failed to allocate memory to clear region.");
+                return;
+        }
+
+        /* The entry is private until it is on the list */
+        entry->type = DM_ULOG_CLEAR_REGION;
+        entry->region = region;
+
+        spin_lock_irqsave(&lc->flush_lock, irq_flags);
+        list_add(&entry->list, &lc->flush_list);
+        spin_unlock_irqrestore(&lc->flush_lock, irq_flags);
+}
+/*
+ * userspace_get_resync_work
+ *
+ * Get a region that needs recovery.  It is valid to return
+ * an error for this function.
+ *
+ * The server is not consulted when in_sync_hint says that every region
+ * is already in sync.  '*region' is only written when the request to
+ * the server succeeded.
+ *
+ * Returns: 1 if region filled, 0 if no work, <0 on error
+ */
+static int userspace_get_resync_work(struct dm_dirty_log *log, region_t *region)
+{
+        int r;
+        size_t rdata_size;
+        struct log_c *lc = log->context;
+        struct {
+                int64_t i; /* 64-bit for mix arch compatibility */
+                region_t r;
+        } pkg;
+
+        if (lc->in_sync_hint >= lc->region_count)
+                return 0;
+
+        rdata_size = sizeof(pkg);
+        r = userspace_do_request(lc, lc->uuid, DM_ULOG_GET_RESYNC_WORK,
+                                 NULL, 0,
+                                 (char *)&pkg, &rdata_size);
+        if (r)
+                return r; /* pkg was not filled in - don't read it */
+
+        *region = pkg.r;
+        return (int)pkg.i;
+}
+/*
+ * userspace_set_region_sync
+ *
+ * Tell the server the sync status of a given region.  This function
+ * must not fail, so the result of the request is not examined.
+ */
+static void userspace_set_region_sync(struct dm_dirty_log *log,
+                                      region_t region, int in_sync)
+{
+        struct log_c *lc = log->context;
+        struct {
+                region_t r;
+                int64_t i;
+        } pkg = {
+                .r = region,
+                .i = (int64_t)in_sync,
+        };
+
+        /*
+         * It would be nice to be able to report failures.
+         * However, it is easy enough to detect and resolve.
+         */
+        (void) userspace_do_request(lc, lc->uuid, DM_ULOG_SET_REGION_SYNC,
+                                    (char *)&pkg, sizeof(pkg),
+                                    NULL, NULL);
+}
+/*
+ * userspace_get_sync_count
+ *
+ * Ask the server how many regions are in sync.  A failed request is
+ * reported as a sync count of zero.  When the count covers every
+ * region, in_sync_hint is moved to region_count.
+ *
+ * Returns: sync count on success, 0 on failure
+ */
+static region_t userspace_get_sync_count(struct dm_dirty_log *log)
+{
+        struct log_c *lc = log->context;
+        uint64_t sync_count;
+        size_t rdata_size = sizeof(sync_count);
+
+        if (userspace_do_request(lc, lc->uuid, DM_ULOG_GET_SYNC_COUNT,
+                                 NULL, 0,
+                                 (char *)&sync_count, &rdata_size))
+                return 0;
+
+        if (sync_count >= lc->region_count)
+                lc->in_sync_hint = lc->region_count;
+
+        return (region_t)sync_count;
+}
+/*
+ * userspace_status
+ *
+ * STATUSTYPE_INFO:  the server fills 'result' directly; if that fails,
+ *                   a COM_FAILURE line is emitted instead.
+ * STATUSTYPE_TABLE: emits the log type name, argument count, UUID and
+ *                   the saved constructor string.
+ *
+ * Returns: amount of space consumed, 0 if the server request failed
+ */
+static int userspace_status(struct dm_dirty_log *log, status_type_t status_type,
+                            char *result, unsigned maxlen)
+{
+        struct log_c *lc = log->context;
+        /* 'sz', 'result' and 'maxlen' are the names DMEMIT() works on */
+        size_t sz = (size_t)maxlen;
+        int r = 0;
+
+        if (status_type == STATUSTYPE_INFO) {
+                r = userspace_do_request(lc, lc->uuid, DM_ULOG_STATUS_INFO,
+                                         NULL, 0,
+                                         result, &sz);
+                if (r) {
+                        sz = 0;
+                        DMEMIT("%s 1 COM_FAILURE", log->type->name);
+                }
+        } else if (status_type == STATUSTYPE_TABLE) {
+                sz = 0;
+                DMEMIT("%s %u %s %s", log->type->name, lc->usr_argc + 1,
+                       lc->uuid, lc->usr_argv_str);
+        }
+
+        if (r)
+                return 0;
+
+        return (int)sz;
+}
+/*
+ * userspace_is_remote_recovering
+ *
+ * Regions below in_sync_hint are answered locally.  For other regions
+ * the server is asked at most once every HZ/4 jiffies; in between,
+ * the region is reported as recovering.  A failed request is also
+ * reported as recovering.  A successful reply refreshes in_sync_hint.
+ *
+ * Note: the rate-limit state is a function-level static, so it is
+ * shared by every userspace log rather than kept per log.
+ *
+ * Returns: 1 if region recovering, 0 otherwise
+ */
+static int userspace_is_remote_recovering(struct dm_dirty_log *log,
+                                          region_t region)
+{
+        int r;
+        uint64_t region64 = region;
+        struct log_c *lc = log->context;
+        static unsigned long limit;     /* same type as jiffies */
+        static int limit_valid;         /* set once 'limit' has been armed */
+        struct {
+                int64_t is_recovering;
+                uint64_t in_sync_hint;
+        } pkg;
+        size_t rdata_size = sizeof(pkg);
+
+        /*
+         * Once the mirror has been reported to be in-sync,
+         * it will never again ask for recovery work.  So,
+         * we can safely say there is not a remote machine
+         * recovering if the device is in-sync.  (in_sync_hint
+         * must be reset at resume time.)
+         */
+        if (region < lc->in_sync_hint)
+                return 0;
+
+        /*
+         * time_before() copes with jiffies wrapping around; a plain
+         * '<' comparison does not, and could keep answering "recovering"
+         * without ever asking the server again after a wrap.
+         */
+        if (limit_valid && time_before(jiffies, limit))
+                return 1;
+
+        limit = jiffies + (HZ / 4);
+        limit_valid = 1;
+
+        r = userspace_do_request(lc, lc->uuid, DM_ULOG_IS_REMOTE_RECOVERING,
+                                 (char *)&region64, sizeof(region64),
+                                 (char *)&pkg, &rdata_size);
+        if (r)
+                return 1;
+
+        lc->in_sync_hint = pkg.in_sync_hint;
+
+        return (int)pkg.is_recovering;
+}
+/*
+ * Dirty log type "userspace": binds the dirty-log operations to the
+ * userspace_* handlers in this file.  Registered at module init and
+ * unregistered at module exit.
+ */
+static struct dm_dirty_log_type _userspace_type = {
+        .name = "userspace",
+        .module = THIS_MODULE,
+        .ctr = userspace_ctr,
+        .dtr = userspace_dtr,
+        .presuspend = userspace_presuspend,
+        .postsuspend = userspace_postsuspend,
+        .resume = userspace_resume,
+        .get_region_size = userspace_get_region_size,
+        .is_clean = userspace_is_clean,
+        .in_sync = userspace_in_sync,
+        .flush = userspace_flush,
+        .mark_region = userspace_mark_region,
+        .clear_region = userspace_clear_region,
+        .get_resync_work = userspace_get_resync_work,
+        .set_region_sync = userspace_set_region_sync,
+        .get_sync_count = userspace_get_sync_count,
+        .status = userspace_status,
+        .is_remote_recovering = userspace_is_remote_recovering,
+};
+/*
+ * Module init: create the flush_entry pool, bring up the transfer
+ * layer and register the "userspace" dirty log type.  Anything already
+ * set up is torn down again, in reverse order, if a later step fails.
+ *
+ * Returns: 0 on success, -ENOMEM if the pool cannot be created,
+ *          otherwise the error from the step that failed
+ */
+static int __init userspace_dirty_log_init(void)
+{
+        int r;
+
+        flush_entry_pool = mempool_create(100, flush_entry_alloc,
+                                          flush_entry_free, NULL);
+        if (!flush_entry_pool) {
+                DMWARN("Unable to create flush_entry_pool:  No memory.");
+                return -ENOMEM;
+        }
+
+        r = dm_ulog_tfr_init();
+        if (r) {
+                DMWARN("Unable to initialize userspace log communications");
+                goto bad_tfr;
+        }
+
+        r = dm_dirty_log_type_register(&_userspace_type);
+        if (r) {
+                DMWARN("Couldn't register userspace dirty log type");
+                goto bad_register;
+        }
+
+        DMINFO("version 1.0.0 loaded");
+        return 0;
+
+bad_register:
+        dm_ulog_tfr_exit();
+bad_tfr:
+        mempool_destroy(flush_entry_pool);
+        return r;
+}
+/*
+ * Module exit: undo userspace_dirty_log_init() in reverse order -
+ * unregister the log type, shut down the transfer layer, destroy the
+ * flush_entry pool.
+ */
+static void __exit userspace_dirty_log_exit(void)
+{
+        dm_dirty_log_type_unregister(&_userspace_type);
+
+        dm_ulog_tfr_exit();
+
+        mempool_destroy(flush_entry_pool);
+
+        DMINFO("version 1.0.0 unloaded");
+}
+/* Module entry/exit points and metadata. */
+module_init(userspace_dirty_log_init);
+module_exit(userspace_dirty_log_exit);
+MODULE_DESCRIPTION(DM_NAME " userspace dirty log link");
+MODULE_AUTHOR("Jonathan Brassow <dm-devel@redhat.com>");
+/* NOTE(review): the file header says LGPL while this declares "GPL" - confirm intended. */
+MODULE_LICENSE("GPL");
author	Jonthan Brassow <jbrassow@redhat.com>	2009-06-22 05:12:35 -0400
committer	Alasdair G Kergon <agk@redhat.com>	2009-06-22 05:12:35 -0400
commit	f5db4af466e2dca0fe822019812d586ca910b00c (patch)
tree	1bbaaa36509df9f7eecc19ccffa434048cf4b555 /drivers/md/dm-log-userspace-base.c
parent	754c5fc7ebb417b23601a6222a6005cc2e7f2913 (diff)

diff --git a/drivers/md/dm-log-userspace-base.c b/drivers/md/dm-log-userspace-base.c new file mode 100644 index 000000000000..e69b96560997 --- /dev/null +++ b/drivers/md/dm-log-userspace-base.c
@@ -0,0 +1,696 @@
	1	/*
	2	* Copyright (C) 2006-2009 Red Hat, Inc.
	3	*
	4	* This file is released under the LGPL.
	5	*/
	6
	7	#include <linux/bio.h>
	8	#include <linux/dm-dirty-log.h>
	9	#include <linux/device-mapper.h>
	10	#include <linux/dm-log-userspace.h>
	11
	12	#include "dm-log-userspace-transfer.h"
	13
	14	struct flush_entry {
	15	int type;
	16	region_t region;
	17	struct list_head list;
	18	};
	19
	20	struct log_c {
	21	struct dm_target *ti;
	22	uint32_t region_size;
	23	region_t region_count;
	24	char uuid[DM_UUID_LEN];
	25
	26	char *usr_argv_str;
	27	uint32_t usr_argc;
	28
	29	/*
	30	* in_sync_hint gets set when doing is_remote_recovering. It
	31	* represents the first region that needs recovery. IOW, the
	32	* first zero bit of sync_bits. This can be useful for to limit
	33	* traffic for calls like is_remote_recovering and get_resync_work,
	34	* but be take care in its use for anything else.
	35	*/
	36	uint64_t in_sync_hint;
	37
	38	spinlock_t flush_lock;
	39	struct list_head flush_list; /* only for clear and mark requests */
	40	};
	41
	42	static mempool_t *flush_entry_pool;
	43
	44	static void flush_entry_alloc(gfp_t gfp_mask, void pool_data)
	45	{
	46	return kmalloc(sizeof(struct flush_entry), gfp_mask);
	47	}
	48
	49	static void flush_entry_free(void element, void pool_data)
	50	{
	51	kfree(element);
	52	}
	53
	54	static int userspace_do_request(struct log_c lc, const char uuid,
	55	int request_type, char *data, size_t data_size,
	56	char rdata, size_t rdata_size)
	57	{
	58	int r;
	59
	60	/*
	61	* If the server isn't there, -ESRCH is returned,
	62	* and we must keep trying until the server is
	63	* restored.
	64	*/
	65	retry:
	66	r = dm_consult_userspace(uuid, request_type, data,
	67	data_size, rdata, rdata_size);
	68
	69	if (r != -ESRCH)
	70	return r;
	71
	72	DMERR(" Userspace log server not found.");
	73	while (1) {
	74	set_current_state(TASK_INTERRUPTIBLE);
	75	schedule_timeout(2*HZ);
	76	DMWARN("Attempting to contact userspace log server...");
	77	r = dm_consult_userspace(uuid, DM_ULOG_CTR, lc->usr_argv_str,
	78	strlen(lc->usr_argv_str) + 1,
	79	NULL, NULL);
	80	if (!r)
	81	break;
	82	}
	83	DMINFO("Reconnected to userspace log server... DM_ULOG_CTR complete");
	84	r = dm_consult_userspace(uuid, DM_ULOG_RESUME, NULL,
	85	0, NULL, NULL);
	86	if (!r)
	87	goto retry;
	88
	89	DMERR("Error trying to resume userspace log: %d", r);
	90
	91	return -ESRCH;
	92	}
	93
	94	static int build_constructor_string(struct dm_target *ti,
	95	unsigned argc, char **argv,
	96	char **ctr_str)
	97	{
	98	int i, str_size;
	99	char *str = NULL;
	100
	101	*ctr_str = NULL;
	102
	103	for (i = 0, str_size = 0; i < argc; i++)
	104	str_size += strlen(argv[i]) + 1; /* +1 for space between args */
	105
	106	str_size += 20; /* Max number of chars in a printed u64 number */
	107
	108	str = kzalloc(str_size, GFP_KERNEL);
	109	if (!str) {
	110	DMWARN("Unable to allocate memory for constructor string");
	111	return -ENOMEM;
	112	}
	113
	114	for (i = 0, str_size = 0; i < argc; i++)
	115	str_size += sprintf(str + str_size, "%s ", argv[i]);
	116	str_size += sprintf(str + str_size, "%llu",
	117	(unsigned long long)ti->len);
	118
	119	*ctr_str = str;
	120	return str_size;
	121	}
	122
	123	/*
	124	* userspace_ctr
	125	*
	126	* argv contains:
	127	* <UUID> <other args>
	128	* Where 'other args' is the userspace implementation specific log
	129	* arguments. An example might be:
	130	* <UUID> clustered_disk <arg count> <log dev> <region_size> [[no]sync]
	131	*
	132	* So, this module will strip off the <UUID> for identification purposes
	133	* when communicating with userspace about a log; but will pass on everything
	134	* else.
	135	*/
	136	static int userspace_ctr(struct dm_dirty_log log, struct dm_target ti,
	137	unsigned argc, char **argv)
	138	{
	139	int r = 0;
	140	int str_size;
	141	char *ctr_str = NULL;
	142	struct log_c *lc = NULL;
	143	uint64_t rdata;
	144	size_t rdata_size = sizeof(rdata);
	145
	146	if (argc < 3) {
	147	DMWARN("Too few arguments to userspace dirty log");
	148	return -EINVAL;
	149	}
	150
	151	lc = kmalloc(sizeof(*lc), GFP_KERNEL);
	152	if (!lc) {
	153	DMWARN("Unable to allocate userspace log context.");
	154	return -ENOMEM;
	155	}
	156
	157	lc->ti = ti;
	158
	159	if (strlen(argv[0]) > (DM_UUID_LEN - 1)) {
	160	DMWARN("UUID argument too long.");
	161	kfree(lc);
	162	return -EINVAL;
	163	}
	164
	165	strncpy(lc->uuid, argv[0], DM_UUID_LEN);
	166	spin_lock_init(&lc->flush_lock);
	167	INIT_LIST_HEAD(&lc->flush_list);
	168
	169	str_size = build_constructor_string(ti, argc - 1, argv + 1, &ctr_str);
	170	if (str_size < 0) {
	171	kfree(lc);
	172	return str_size;
	173	}
	174
	175	/* Send table string */
	176	r = dm_consult_userspace(lc->uuid, DM_ULOG_CTR,
	177	ctr_str, str_size, NULL, NULL);
	178
	179	if (r == -ESRCH) {
	180	DMERR("Userspace log server not found");
	181	goto out;
	182	}
	183
	184	/* Since the region size does not change, get it now */
	185	rdata_size = sizeof(rdata);
	186	r = dm_consult_userspace(lc->uuid, DM_ULOG_GET_REGION_SIZE,
	187	NULL, 0, (char *)&rdata, &rdata_size);
	188
	189	if (r) {
	190	DMERR("Failed to get region size of dirty log");
	191	goto out;
	192	}
	193
	194	lc->region_size = (uint32_t)rdata;
	195	lc->region_count = dm_sector_div_up(ti->len, lc->region_size);
	196
	197	out:
	198	if (r) {
	199	kfree(lc);
	200	kfree(ctr_str);
	201	} else {
	202	lc->usr_argv_str = ctr_str;
	203	lc->usr_argc = argc;
	204	log->context = lc;
	205	}
	206
	207	return r;
	208	}
	209
	210	static void userspace_dtr(struct dm_dirty_log *log)
	211	{
	212	int r;
	213	struct log_c *lc = log->context;
	214
	215	r = dm_consult_userspace(lc->uuid, DM_ULOG_DTR,
	216	NULL, 0,
	217	NULL, NULL);
	218
	219	kfree(lc->usr_argv_str);
	220	kfree(lc);
	221
	222	return;
	223	}
	224
	225	static int userspace_presuspend(struct dm_dirty_log *log)
	226	{
	227	int r;
	228	struct log_c *lc = log->context;
	229
	230	r = dm_consult_userspace(lc->uuid, DM_ULOG_PRESUSPEND,
	231	NULL, 0,
	232	NULL, NULL);
	233
	234	return r;
	235	}
	236
	237	static int userspace_postsuspend(struct dm_dirty_log *log)
	238	{
	239	int r;
	240	struct log_c *lc = log->context;
	241
	242	r = dm_consult_userspace(lc->uuid, DM_ULOG_POSTSUSPEND,
	243	NULL, 0,
	244	NULL, NULL);
	245
	246	return r;
	247	}
	248
	249	static int userspace_resume(struct dm_dirty_log *log)
	250	{
	251	int r;
	252	struct log_c *lc = log->context;
	253
	254	lc->in_sync_hint = 0;
	255	r = dm_consult_userspace(lc->uuid, DM_ULOG_RESUME,
	256	NULL, 0,
	257	NULL, NULL);
	258
	259	return r;
	260	}
	261
	262	static uint32_t userspace_get_region_size(struct dm_dirty_log *log)
	263	{
	264	struct log_c *lc = log->context;
	265
	266	return lc->region_size;
	267	}
	268
	269	/*
	270	* userspace_is_clean
	271	*
	272	* Check whether a region is clean. If there is any sort of
	273	* failure when consulting the server, we return not clean.
	274	*
	275	* Returns: 1 if clean, 0 otherwise
	276	*/
	277	static int userspace_is_clean(struct dm_dirty_log *log, region_t region)
	278	{
	279	int r;
	280	uint64_t region64 = (uint64_t)region;
	281	int64_t is_clean;
	282	size_t rdata_size;
	283	struct log_c *lc = log->context;
	284
	285	rdata_size = sizeof(is_clean);
	286	r = userspace_do_request(lc, lc->uuid, DM_ULOG_IS_CLEAN,
	287	(char *)&region64, sizeof(region64),
	288	(char *)&is_clean, &rdata_size);
	289
	290	return (r) ? 0 : (int)is_clean;
	291	}
	292
	293	/*
	294	* userspace_in_sync
	295	*
	296	* Check if the region is in-sync. If there is any sort
	297	* of failure when consulting the server, we assume that
	298	* the region is not in sync.
	299	*
	300	* If 'can_block' is set, return immediately
	301	*
	302	* Returns: 1 if in-sync, 0 if not-in-sync, -EWOULDBLOCK
	303	*/
	304	static int userspace_in_sync(struct dm_dirty_log *log, region_t region,
	305	int can_block)
	306	{
	307	int r;
	308	uint64_t region64 = region;
	309	int64_t in_sync;
	310	size_t rdata_size;
	311	struct log_c *lc = log->context;
	312
	313	/*
	314	* We can never respond directly - even if in_sync_hint is
	315	* set. This is because another machine could see a device
	316	* failure and mark the region out-of-sync. If we don't go
	317	* to userspace to ask, we might think the region is in-sync
	318	* and allow a read to pick up data that is stale. (This is
	319	* very unlikely if a device actually fails; but it is very
	320	* likely if a connection to one device from one machine fails.)
	321	*
	322	* There still might be a problem if the mirror caches the region
	323	* state as in-sync... but then this call would not be made. So,
	324	* that is a mirror problem.
	325	*/
	326	if (!can_block)
	327	return -EWOULDBLOCK;
	328
	329	rdata_size = sizeof(in_sync);
	330	r = userspace_do_request(lc, lc->uuid, DM_ULOG_IN_SYNC,
	331	(char *)&region64, sizeof(region64),
	332	(char *)&in_sync, &rdata_size);
	333	return (r) ? 0 : (int)in_sync;
	334	}
	335
	336	/*
	337	* userspace_flush
	338	*
	339	* This function is ok to block.
	340	* The flush happens in two stages. First, it sends all
	341	* clear/mark requests that are on the list. Then it
	342	* tells the server to commit them. This gives the
	343	* server a chance to optimise the commit, instead of
	344	* doing it for every request.
	345	*
	346	* Additionally, we could implement another thread that
	347	* sends the requests up to the server - reducing the
	348	* load on flush. Then the flush would have less in
	349	* the list and be responsible for the finishing commit.
	350	*
	351	* Returns: 0 on success, < 0 on failure
	352	*/
	353	static int userspace_flush(struct dm_dirty_log *log)
	354	{
	355	int r = 0;
	356	unsigned long flags;
	357	struct log_c *lc = log->context;
	358	LIST_HEAD(flush_list);
	359	struct flush_entry fe, tmp_fe;
	360
	361	spin_lock_irqsave(&lc->flush_lock, flags);
	362	list_splice_init(&lc->flush_list, &flush_list);
	363	spin_unlock_irqrestore(&lc->flush_lock, flags);
	364
	365	if (list_empty(&flush_list))
	366	return 0;
	367
	368	/*
	369	* FIXME: Count up requests, group request types,
	370	* allocate memory to stick all requests in and
	371	* send to server in one go. Failing the allocation,
	372	* do it one by one.
	373	*/
	374
	375	list_for_each_entry(fe, &flush_list, list) {
	376	r = userspace_do_request(lc, lc->uuid, fe->type,
	377	(char *)&fe->region,
	378	sizeof(fe->region),
	379	NULL, NULL);
	380	if (r)
	381	goto fail;
	382	}
	383
	384	r = userspace_do_request(lc, lc->uuid, DM_ULOG_FLUSH,
	385	NULL, 0, NULL, NULL);
	386
	387	fail:
	388	/*
	389	* We can safely remove these entries, even if failure.
	390	* Calling code will receive an error and will know that
	391	* the log facility has failed.
	392	*/
	393	list_for_each_entry_safe(fe, tmp_fe, &flush_list, list) {
	394	list_del(&fe->list);
	395	mempool_free(fe, flush_entry_pool);
	396	}
	397
	398	if (r)
	399	dm_table_event(lc->ti->table);
	400
	401	return r;
	402	}
	403
	404	/*
	405	* userspace_mark_region
	406	*
	407	* This function should avoid blocking unless absolutely required.
	408	* (Memory allocation is valid for blocking.)
	409	*/
	410	static void userspace_mark_region(struct dm_dirty_log *log, region_t region)
	411	{
	412	unsigned long flags;
	413	struct log_c *lc = log->context;
	414	struct flush_entry *fe;
	415
	416	/* Wait for an allocation, but _never_ fail */
	417	fe = mempool_alloc(flush_entry_pool, GFP_NOIO);
	418	BUG_ON(!fe);
	419
	420	spin_lock_irqsave(&lc->flush_lock, flags);
	421	fe->type = DM_ULOG_MARK_REGION;
	422	fe->region = region;
	423	list_add(&fe->list, &lc->flush_list);
	424	spin_unlock_irqrestore(&lc->flush_lock, flags);
	425
	426	return;
	427	}
	428
	429	/*
	430	* userspace_clear_region
	431	*
	432	* This function must not block.
	433	* So, the alloc can't block. In the worst case, it is ok to
	434	* fail. It would simply mean we can't clear the region.
	435	* Does nothing to current sync context, but does mean
	436	* the region will be re-sync'ed on a reload of the mirror
	437	* even though it is in-sync.
	438	*/
	439	static void userspace_clear_region(struct dm_dirty_log *log, region_t region)
	440	{
	441	unsigned long flags;
	442	struct log_c *lc = log->context;
	443	struct flush_entry *fe;
	444
	445	/*
	446	* If we fail to allocate, we skip the clearing of
	447	* the region. This doesn't hurt us in any way, except
	448	* to cause the region to be resync'ed when the
	449	* device is activated next time.
	450	*/
	451	fe = mempool_alloc(flush_entry_pool, GFP_ATOMIC);
	452	if (!fe) {
	453	DMERR("Failed to allocate memory to clear region.");
	454	return;
	455	}
	456
	457	spin_lock_irqsave(&lc->flush_lock, flags);
	458	fe->type = DM_ULOG_CLEAR_REGION;
	459	fe->region = region;
	460	list_add(&fe->list, &lc->flush_list);
	461	spin_unlock_irqrestore(&lc->flush_lock, flags);
	462
	463	return;
	464	}
	465
	466	/*
	467	* userspace_get_resync_work
	468	*
	469	* Get a region that needs recovery. It is valid to return
	470	* an error for this function.
	471	*
	472	* Returns: 1 if region filled, 0 if no work, <0 on error
	473	*/
	474	static int userspace_get_resync_work(struct dm_dirty_log log, region_t region)
	475	{
	476	int r;
	477	size_t rdata_size;
	478	struct log_c *lc = log->context;
	479	struct {
	480	int64_t i; /* 64-bit for mix arch compatibility */
	481	region_t r;
	482	} pkg;
	483
	484	if (lc->in_sync_hint >= lc->region_count)
	485	return 0;
	486
	487	rdata_size = sizeof(pkg);
	488	r = userspace_do_request(lc, lc->uuid, DM_ULOG_GET_RESYNC_WORK,
	489	NULL, 0,
	490	(char *)&pkg, &rdata_size);
	491
	492	*region = pkg.r;
	493	return (r) ? r : (int)pkg.i;
	494	}
	495
	496	/*
	497	* userspace_set_region_sync
	498	*
	499	* Set the sync status of a given region. This function
	500	* must not fail.
	501	*/
	502	static void userspace_set_region_sync(struct dm_dirty_log *log,
	503	region_t region, int in_sync)
	504	{
	505	int r;
	506	struct log_c *lc = log->context;
	507	struct {
	508	region_t r;
	509	int64_t i;
	510	} pkg;
	511
	512	pkg.r = region;
	513	pkg.i = (int64_t)in_sync;
	514
	515	r = userspace_do_request(lc, lc->uuid, DM_ULOG_SET_REGION_SYNC,
	516	(char *)&pkg, sizeof(pkg),
	517	NULL, NULL);
	518
	519	/*
	520	* It would be nice to be able to report failures.
	521	* However, it is easy emough to detect and resolve.
	522	*/
	523	return;
	524	}
	525
	526	/*
	527	* userspace_get_sync_count
	528	*
	529	* If there is any sort of failure when consulting the server,
	530	* we assume that the sync count is zero.
	531	*
	532	* Returns: sync count on success, 0 on failure
	533	*/
	534	static region_t userspace_get_sync_count(struct dm_dirty_log *log)
	535	{
	536	int r;
	537	size_t rdata_size;
	538	uint64_t sync_count;
	539	struct log_c *lc = log->context;
	540
	541	rdata_size = sizeof(sync_count);
	542	r = userspace_do_request(lc, lc->uuid, DM_ULOG_GET_SYNC_COUNT,
	543	NULL, 0,
	544	(char *)&sync_count, &rdata_size);
	545
	546	if (r)
	547	return 0;
	548
	549	if (sync_count >= lc->region_count)
	550	lc->in_sync_hint = lc->region_count;
	551
	552	return (region_t)sync_count;
	553	}
	554
	555	/*
	556	* userspace_status
	557	*
	558	* Returns: amount of space consumed
	559	*/
	560	static int userspace_status(struct dm_dirty_log *log, status_type_t status_type,
	561	char *result, unsigned maxlen)
	562	{
	563	int r = 0;
	564	size_t sz = (size_t)maxlen;
	565	struct log_c *lc = log->context;
	566
	567	switch (status_type) {
	568	case STATUSTYPE_INFO:
	569	r = userspace_do_request(lc, lc->uuid, DM_ULOG_STATUS_INFO,
	570	NULL, 0,
	571	result, &sz);
	572
	573	if (r) {
	574	sz = 0;
	575	DMEMIT("%s 1 COM_FAILURE", log->type->name);
	576	}
	577	break;
	578	case STATUSTYPE_TABLE:
	579	sz = 0;
	580	DMEMIT("%s %u %s %s", log->type->name, lc->usr_argc + 1,
	581	lc->uuid, lc->usr_argv_str);
	582	break;
	583	}
	584	return (r) ? 0 : (int)sz;
	585	}
	586
	587	/*
	588	* userspace_is_remote_recovering
	589	*
	590	* Returns: 1 if region recovering, 0 otherwise
	591	*/
	592	static int userspace_is_remote_recovering(struct dm_dirty_log *log,
	593	region_t region)
	594	{
	595	int r;
	596	uint64_t region64 = region;
	597	struct log_c *lc = log->context;
	598	static unsigned long long limit;
	599	struct {
	600	int64_t is_recovering;
	601	uint64_t in_sync_hint;
	602	} pkg;
	603	size_t rdata_size = sizeof(pkg);
	604
	605	/*
	606	* Once the mirror has been reported to be in-sync,
	607	* it will never again ask for recovery work. So,
	608	* we can safely say there is not a remote machine
	609	* recovering if the device is in-sync. (in_sync_hint
	610	* must be reset at resume time.)
	611	*/
	612	if (region < lc->in_sync_hint)
	613	return 0;
	614	else if (jiffies < limit)
	615	return 1;
	616
	617	limit = jiffies + (HZ / 4);
	618	r = userspace_do_request(lc, lc->uuid, DM_ULOG_IS_REMOTE_RECOVERING,
	619	(char *)&region64, sizeof(region64),
	620	(char *)&pkg, &rdata_size);
	621	if (r)
	622	return 1;
	623
	624	lc->in_sync_hint = pkg.in_sync_hint;
	625
	626	return (int)pkg.is_recovering;
	627	}
	628
	629	static struct dm_dirty_log_type _userspace_type = {
	630	.name = "userspace",
	631	.module = THIS_MODULE,
	632	.ctr = userspace_ctr,
	633	.dtr = userspace_dtr,
	634	.presuspend = userspace_presuspend,
	635	.postsuspend = userspace_postsuspend,
	636	.resume = userspace_resume,
	637	.get_region_size = userspace_get_region_size,
	638	.is_clean = userspace_is_clean,
	639	.in_sync = userspace_in_sync,
	640	.flush = userspace_flush,
	641	.mark_region = userspace_mark_region,
	642	.clear_region = userspace_clear_region,
	643	.get_resync_work = userspace_get_resync_work,
	644	.set_region_sync = userspace_set_region_sync,
	645	.get_sync_count = userspace_get_sync_count,
	646	.status = userspace_status,
	647	.is_remote_recovering = userspace_is_remote_recovering,
	648	};
	649
	650	static int __init userspace_dirty_log_init(void)
	651	{
	652	int r = 0;
	653
	654	flush_entry_pool = mempool_create(100, flush_entry_alloc,
	655	flush_entry_free, NULL);
	656
	657	if (!flush_entry_pool) {
	658	DMWARN("Unable to create flush_entry_pool: No memory.");
	659	return -ENOMEM;
	660	}
	661
	662	r = dm_ulog_tfr_init();
	663	if (r) {
	664	DMWARN("Unable to initialize userspace log communications");
	665	mempool_destroy(flush_entry_pool);
	666	return r;
	667	}
	668
	669	r = dm_dirty_log_type_register(&_userspace_type);
	670	if (r) {
	671	DMWARN("Couldn't register userspace dirty log type");
	672	dm_ulog_tfr_exit();
	673	mempool_destroy(flush_entry_pool);
	674	return r;
	675	}
	676
	677	DMINFO("version 1.0.0 loaded");
	678	return 0;
	679	}
	680
	681	static void __exit userspace_dirty_log_exit(void)
	682	{
	683	dm_dirty_log_type_unregister(&_userspace_type);
	684	dm_ulog_tfr_exit();
	685	mempool_destroy(flush_entry_pool);
	686
	687	DMINFO("version 1.0.0 unloaded");
	688	return;
	689	}
	690
	691	module_init(userspace_dirty_log_init);
	692	module_exit(userspace_dirty_log_exit);
	693
	694	MODULE_DESCRIPTION(DM_NAME " userspace dirty log link");
	695	MODULE_AUTHOR("Jonathan Brassow <dm-devel@redhat.com>");
	696	MODULE_LICENSE("GPL");