aboutsummaryrefslogtreecommitdiffstats
path: root/drivers/md/dm-log-userspace-base.c
diff options
context:
space:
mode:
authorJonthan Brassow <jbrassow@redhat.com>2009-06-22 05:12:35 -0400
committerAlasdair G Kergon <agk@redhat.com>2009-06-22 05:12:35 -0400
commitf5db4af466e2dca0fe822019812d586ca910b00c (patch)
tree1bbaaa36509df9f7eecc19ccffa434048cf4b555 /drivers/md/dm-log-userspace-base.c
parent754c5fc7ebb417b23601a6222a6005cc2e7f2913 (diff)
dm raid1: add userspace log
This patch contains a device-mapper mirror log module that forwards requests to userspace for processing. The structures used for communication between kernel and userspace are located in include/linux/dm-log-userspace.h. Due to the frequency, diversity, and 2-way communication nature of the exchanges between kernel and userspace, 'connector' was chosen as the interface for communication. The first log implementations written in userspace - "clustered-disk" and "clustered-core" - support clustered shared storage. A userspace daemon (in the LVM2 source code repository) uses openAIS/corosync to process requests in an ordered fashion with the rest of the nodes in the cluster so as to prevent log state corruption. Other implementations with no association to LVM or openAIS/corosync, are certainly possible. (Imagine if two machines are writing to the same region of a mirror. They would both mark the region dirty, but you need a cluster-aware entity that can handle properly marking the region clean when they are done. Otherwise, you might clear the region when the first machine is done, not the second.) Signed-off-by: Jonathan Brassow <jbrassow@redhat.com> Cc: Evgeniy Polyakov <johnpol@2ka.mipt.ru> Signed-off-by: Alasdair G Kergon <agk@redhat.com>
Diffstat (limited to 'drivers/md/dm-log-userspace-base.c')
-rw-r--r--drivers/md/dm-log-userspace-base.c696
1 files changed, 696 insertions, 0 deletions
diff --git a/drivers/md/dm-log-userspace-base.c b/drivers/md/dm-log-userspace-base.c
new file mode 100644
index 000000000000..e69b96560997
--- /dev/null
+++ b/drivers/md/dm-log-userspace-base.c
@@ -0,0 +1,696 @@
1/*
2 * Copyright (C) 2006-2009 Red Hat, Inc.
3 *
4 * This file is released under the LGPL.
5 */
6
7#include <linux/bio.h>
8#include <linux/dm-dirty-log.h>
9#include <linux/device-mapper.h>
10#include <linux/dm-log-userspace.h>
11
12#include "dm-log-userspace-transfer.h"
13
14struct flush_entry {
15 int type;
16 region_t region;
17 struct list_head list;
18};
19
20struct log_c {
21 struct dm_target *ti;
22 uint32_t region_size;
23 region_t region_count;
24 char uuid[DM_UUID_LEN];
25
26 char *usr_argv_str;
27 uint32_t usr_argc;
28
29 /*
30 * in_sync_hint gets set when doing is_remote_recovering. It
31 * represents the first region that needs recovery. IOW, the
32 * first zero bit of sync_bits. This can be useful for to limit
33 * traffic for calls like is_remote_recovering and get_resync_work,
34 * but be take care in its use for anything else.
35 */
36 uint64_t in_sync_hint;
37
38 spinlock_t flush_lock;
39 struct list_head flush_list; /* only for clear and mark requests */
40};
41
42static mempool_t *flush_entry_pool;
43
44static void *flush_entry_alloc(gfp_t gfp_mask, void *pool_data)
45{
46 return kmalloc(sizeof(struct flush_entry), gfp_mask);
47}
48
49static void flush_entry_free(void *element, void *pool_data)
50{
51 kfree(element);
52}
53
54static int userspace_do_request(struct log_c *lc, const char *uuid,
55 int request_type, char *data, size_t data_size,
56 char *rdata, size_t *rdata_size)
57{
58 int r;
59
60 /*
61 * If the server isn't there, -ESRCH is returned,
62 * and we must keep trying until the server is
63 * restored.
64 */
65retry:
66 r = dm_consult_userspace(uuid, request_type, data,
67 data_size, rdata, rdata_size);
68
69 if (r != -ESRCH)
70 return r;
71
72 DMERR(" Userspace log server not found.");
73 while (1) {
74 set_current_state(TASK_INTERRUPTIBLE);
75 schedule_timeout(2*HZ);
76 DMWARN("Attempting to contact userspace log server...");
77 r = dm_consult_userspace(uuid, DM_ULOG_CTR, lc->usr_argv_str,
78 strlen(lc->usr_argv_str) + 1,
79 NULL, NULL);
80 if (!r)
81 break;
82 }
83 DMINFO("Reconnected to userspace log server... DM_ULOG_CTR complete");
84 r = dm_consult_userspace(uuid, DM_ULOG_RESUME, NULL,
85 0, NULL, NULL);
86 if (!r)
87 goto retry;
88
89 DMERR("Error trying to resume userspace log: %d", r);
90
91 return -ESRCH;
92}
93
94static int build_constructor_string(struct dm_target *ti,
95 unsigned argc, char **argv,
96 char **ctr_str)
97{
98 int i, str_size;
99 char *str = NULL;
100
101 *ctr_str = NULL;
102
103 for (i = 0, str_size = 0; i < argc; i++)
104 str_size += strlen(argv[i]) + 1; /* +1 for space between args */
105
106 str_size += 20; /* Max number of chars in a printed u64 number */
107
108 str = kzalloc(str_size, GFP_KERNEL);
109 if (!str) {
110 DMWARN("Unable to allocate memory for constructor string");
111 return -ENOMEM;
112 }
113
114 for (i = 0, str_size = 0; i < argc; i++)
115 str_size += sprintf(str + str_size, "%s ", argv[i]);
116 str_size += sprintf(str + str_size, "%llu",
117 (unsigned long long)ti->len);
118
119 *ctr_str = str;
120 return str_size;
121}
122
123/*
124 * userspace_ctr
125 *
126 * argv contains:
127 * <UUID> <other args>
128 * Where 'other args' is the userspace implementation specific log
129 * arguments. An example might be:
130 * <UUID> clustered_disk <arg count> <log dev> <region_size> [[no]sync]
131 *
132 * So, this module will strip off the <UUID> for identification purposes
133 * when communicating with userspace about a log; but will pass on everything
134 * else.
135 */
136static int userspace_ctr(struct dm_dirty_log *log, struct dm_target *ti,
137 unsigned argc, char **argv)
138{
139 int r = 0;
140 int str_size;
141 char *ctr_str = NULL;
142 struct log_c *lc = NULL;
143 uint64_t rdata;
144 size_t rdata_size = sizeof(rdata);
145
146 if (argc < 3) {
147 DMWARN("Too few arguments to userspace dirty log");
148 return -EINVAL;
149 }
150
151 lc = kmalloc(sizeof(*lc), GFP_KERNEL);
152 if (!lc) {
153 DMWARN("Unable to allocate userspace log context.");
154 return -ENOMEM;
155 }
156
157 lc->ti = ti;
158
159 if (strlen(argv[0]) > (DM_UUID_LEN - 1)) {
160 DMWARN("UUID argument too long.");
161 kfree(lc);
162 return -EINVAL;
163 }
164
165 strncpy(lc->uuid, argv[0], DM_UUID_LEN);
166 spin_lock_init(&lc->flush_lock);
167 INIT_LIST_HEAD(&lc->flush_list);
168
169 str_size = build_constructor_string(ti, argc - 1, argv + 1, &ctr_str);
170 if (str_size < 0) {
171 kfree(lc);
172 return str_size;
173 }
174
175 /* Send table string */
176 r = dm_consult_userspace(lc->uuid, DM_ULOG_CTR,
177 ctr_str, str_size, NULL, NULL);
178
179 if (r == -ESRCH) {
180 DMERR("Userspace log server not found");
181 goto out;
182 }
183
184 /* Since the region size does not change, get it now */
185 rdata_size = sizeof(rdata);
186 r = dm_consult_userspace(lc->uuid, DM_ULOG_GET_REGION_SIZE,
187 NULL, 0, (char *)&rdata, &rdata_size);
188
189 if (r) {
190 DMERR("Failed to get region size of dirty log");
191 goto out;
192 }
193
194 lc->region_size = (uint32_t)rdata;
195 lc->region_count = dm_sector_div_up(ti->len, lc->region_size);
196
197out:
198 if (r) {
199 kfree(lc);
200 kfree(ctr_str);
201 } else {
202 lc->usr_argv_str = ctr_str;
203 lc->usr_argc = argc;
204 log->context = lc;
205 }
206
207 return r;
208}
209
210static void userspace_dtr(struct dm_dirty_log *log)
211{
212 int r;
213 struct log_c *lc = log->context;
214
215 r = dm_consult_userspace(lc->uuid, DM_ULOG_DTR,
216 NULL, 0,
217 NULL, NULL);
218
219 kfree(lc->usr_argv_str);
220 kfree(lc);
221
222 return;
223}
224
225static int userspace_presuspend(struct dm_dirty_log *log)
226{
227 int r;
228 struct log_c *lc = log->context;
229
230 r = dm_consult_userspace(lc->uuid, DM_ULOG_PRESUSPEND,
231 NULL, 0,
232 NULL, NULL);
233
234 return r;
235}
236
237static int userspace_postsuspend(struct dm_dirty_log *log)
238{
239 int r;
240 struct log_c *lc = log->context;
241
242 r = dm_consult_userspace(lc->uuid, DM_ULOG_POSTSUSPEND,
243 NULL, 0,
244 NULL, NULL);
245
246 return r;
247}
248
249static int userspace_resume(struct dm_dirty_log *log)
250{
251 int r;
252 struct log_c *lc = log->context;
253
254 lc->in_sync_hint = 0;
255 r = dm_consult_userspace(lc->uuid, DM_ULOG_RESUME,
256 NULL, 0,
257 NULL, NULL);
258
259 return r;
260}
261
262static uint32_t userspace_get_region_size(struct dm_dirty_log *log)
263{
264 struct log_c *lc = log->context;
265
266 return lc->region_size;
267}
268
269/*
270 * userspace_is_clean
271 *
272 * Check whether a region is clean. If there is any sort of
273 * failure when consulting the server, we return not clean.
274 *
275 * Returns: 1 if clean, 0 otherwise
276 */
277static int userspace_is_clean(struct dm_dirty_log *log, region_t region)
278{
279 int r;
280 uint64_t region64 = (uint64_t)region;
281 int64_t is_clean;
282 size_t rdata_size;
283 struct log_c *lc = log->context;
284
285 rdata_size = sizeof(is_clean);
286 r = userspace_do_request(lc, lc->uuid, DM_ULOG_IS_CLEAN,
287 (char *)&region64, sizeof(region64),
288 (char *)&is_clean, &rdata_size);
289
290 return (r) ? 0 : (int)is_clean;
291}
292
293/*
294 * userspace_in_sync
295 *
296 * Check if the region is in-sync. If there is any sort
297 * of failure when consulting the server, we assume that
298 * the region is not in sync.
299 *
300 * If 'can_block' is set, return immediately
301 *
302 * Returns: 1 if in-sync, 0 if not-in-sync, -EWOULDBLOCK
303 */
304static int userspace_in_sync(struct dm_dirty_log *log, region_t region,
305 int can_block)
306{
307 int r;
308 uint64_t region64 = region;
309 int64_t in_sync;
310 size_t rdata_size;
311 struct log_c *lc = log->context;
312
313 /*
314 * We can never respond directly - even if in_sync_hint is
315 * set. This is because another machine could see a device
316 * failure and mark the region out-of-sync. If we don't go
317 * to userspace to ask, we might think the region is in-sync
318 * and allow a read to pick up data that is stale. (This is
319 * very unlikely if a device actually fails; but it is very
320 * likely if a connection to one device from one machine fails.)
321 *
322 * There still might be a problem if the mirror caches the region
323 * state as in-sync... but then this call would not be made. So,
324 * that is a mirror problem.
325 */
326 if (!can_block)
327 return -EWOULDBLOCK;
328
329 rdata_size = sizeof(in_sync);
330 r = userspace_do_request(lc, lc->uuid, DM_ULOG_IN_SYNC,
331 (char *)&region64, sizeof(region64),
332 (char *)&in_sync, &rdata_size);
333 return (r) ? 0 : (int)in_sync;
334}
335
336/*
337 * userspace_flush
338 *
339 * This function is ok to block.
340 * The flush happens in two stages. First, it sends all
341 * clear/mark requests that are on the list. Then it
342 * tells the server to commit them. This gives the
343 * server a chance to optimise the commit, instead of
344 * doing it for every request.
345 *
346 * Additionally, we could implement another thread that
347 * sends the requests up to the server - reducing the
348 * load on flush. Then the flush would have less in
349 * the list and be responsible for the finishing commit.
350 *
351 * Returns: 0 on success, < 0 on failure
352 */
353static int userspace_flush(struct dm_dirty_log *log)
354{
355 int r = 0;
356 unsigned long flags;
357 struct log_c *lc = log->context;
358 LIST_HEAD(flush_list);
359 struct flush_entry *fe, *tmp_fe;
360
361 spin_lock_irqsave(&lc->flush_lock, flags);
362 list_splice_init(&lc->flush_list, &flush_list);
363 spin_unlock_irqrestore(&lc->flush_lock, flags);
364
365 if (list_empty(&flush_list))
366 return 0;
367
368 /*
369 * FIXME: Count up requests, group request types,
370 * allocate memory to stick all requests in and
371 * send to server in one go. Failing the allocation,
372 * do it one by one.
373 */
374
375 list_for_each_entry(fe, &flush_list, list) {
376 r = userspace_do_request(lc, lc->uuid, fe->type,
377 (char *)&fe->region,
378 sizeof(fe->region),
379 NULL, NULL);
380 if (r)
381 goto fail;
382 }
383
384 r = userspace_do_request(lc, lc->uuid, DM_ULOG_FLUSH,
385 NULL, 0, NULL, NULL);
386
387fail:
388 /*
389 * We can safely remove these entries, even if failure.
390 * Calling code will receive an error and will know that
391 * the log facility has failed.
392 */
393 list_for_each_entry_safe(fe, tmp_fe, &flush_list, list) {
394 list_del(&fe->list);
395 mempool_free(fe, flush_entry_pool);
396 }
397
398 if (r)
399 dm_table_event(lc->ti->table);
400
401 return r;
402}
403
404/*
405 * userspace_mark_region
406 *
407 * This function should avoid blocking unless absolutely required.
408 * (Memory allocation is valid for blocking.)
409 */
410static void userspace_mark_region(struct dm_dirty_log *log, region_t region)
411{
412 unsigned long flags;
413 struct log_c *lc = log->context;
414 struct flush_entry *fe;
415
416 /* Wait for an allocation, but _never_ fail */
417 fe = mempool_alloc(flush_entry_pool, GFP_NOIO);
418 BUG_ON(!fe);
419
420 spin_lock_irqsave(&lc->flush_lock, flags);
421 fe->type = DM_ULOG_MARK_REGION;
422 fe->region = region;
423 list_add(&fe->list, &lc->flush_list);
424 spin_unlock_irqrestore(&lc->flush_lock, flags);
425
426 return;
427}
428
429/*
430 * userspace_clear_region
431 *
432 * This function must not block.
433 * So, the alloc can't block. In the worst case, it is ok to
434 * fail. It would simply mean we can't clear the region.
435 * Does nothing to current sync context, but does mean
436 * the region will be re-sync'ed on a reload of the mirror
437 * even though it is in-sync.
438 */
439static void userspace_clear_region(struct dm_dirty_log *log, region_t region)
440{
441 unsigned long flags;
442 struct log_c *lc = log->context;
443 struct flush_entry *fe;
444
445 /*
446 * If we fail to allocate, we skip the clearing of
447 * the region. This doesn't hurt us in any way, except
448 * to cause the region to be resync'ed when the
449 * device is activated next time.
450 */
451 fe = mempool_alloc(flush_entry_pool, GFP_ATOMIC);
452 if (!fe) {
453 DMERR("Failed to allocate memory to clear region.");
454 return;
455 }
456
457 spin_lock_irqsave(&lc->flush_lock, flags);
458 fe->type = DM_ULOG_CLEAR_REGION;
459 fe->region = region;
460 list_add(&fe->list, &lc->flush_list);
461 spin_unlock_irqrestore(&lc->flush_lock, flags);
462
463 return;
464}
465
466/*
467 * userspace_get_resync_work
468 *
469 * Get a region that needs recovery. It is valid to return
470 * an error for this function.
471 *
472 * Returns: 1 if region filled, 0 if no work, <0 on error
473 */
474static int userspace_get_resync_work(struct dm_dirty_log *log, region_t *region)
475{
476 int r;
477 size_t rdata_size;
478 struct log_c *lc = log->context;
479 struct {
480 int64_t i; /* 64-bit for mix arch compatibility */
481 region_t r;
482 } pkg;
483
484 if (lc->in_sync_hint >= lc->region_count)
485 return 0;
486
487 rdata_size = sizeof(pkg);
488 r = userspace_do_request(lc, lc->uuid, DM_ULOG_GET_RESYNC_WORK,
489 NULL, 0,
490 (char *)&pkg, &rdata_size);
491
492 *region = pkg.r;
493 return (r) ? r : (int)pkg.i;
494}
495
496/*
497 * userspace_set_region_sync
498 *
499 * Set the sync status of a given region. This function
500 * must not fail.
501 */
502static void userspace_set_region_sync(struct dm_dirty_log *log,
503 region_t region, int in_sync)
504{
505 int r;
506 struct log_c *lc = log->context;
507 struct {
508 region_t r;
509 int64_t i;
510 } pkg;
511
512 pkg.r = region;
513 pkg.i = (int64_t)in_sync;
514
515 r = userspace_do_request(lc, lc->uuid, DM_ULOG_SET_REGION_SYNC,
516 (char *)&pkg, sizeof(pkg),
517 NULL, NULL);
518
519 /*
520 * It would be nice to be able to report failures.
521 * However, it is easy emough to detect and resolve.
522 */
523 return;
524}
525
526/*
527 * userspace_get_sync_count
528 *
529 * If there is any sort of failure when consulting the server,
530 * we assume that the sync count is zero.
531 *
532 * Returns: sync count on success, 0 on failure
533 */
534static region_t userspace_get_sync_count(struct dm_dirty_log *log)
535{
536 int r;
537 size_t rdata_size;
538 uint64_t sync_count;
539 struct log_c *lc = log->context;
540
541 rdata_size = sizeof(sync_count);
542 r = userspace_do_request(lc, lc->uuid, DM_ULOG_GET_SYNC_COUNT,
543 NULL, 0,
544 (char *)&sync_count, &rdata_size);
545
546 if (r)
547 return 0;
548
549 if (sync_count >= lc->region_count)
550 lc->in_sync_hint = lc->region_count;
551
552 return (region_t)sync_count;
553}
554
555/*
556 * userspace_status
557 *
558 * Returns: amount of space consumed
559 */
560static int userspace_status(struct dm_dirty_log *log, status_type_t status_type,
561 char *result, unsigned maxlen)
562{
563 int r = 0;
564 size_t sz = (size_t)maxlen;
565 struct log_c *lc = log->context;
566
567 switch (status_type) {
568 case STATUSTYPE_INFO:
569 r = userspace_do_request(lc, lc->uuid, DM_ULOG_STATUS_INFO,
570 NULL, 0,
571 result, &sz);
572
573 if (r) {
574 sz = 0;
575 DMEMIT("%s 1 COM_FAILURE", log->type->name);
576 }
577 break;
578 case STATUSTYPE_TABLE:
579 sz = 0;
580 DMEMIT("%s %u %s %s", log->type->name, lc->usr_argc + 1,
581 lc->uuid, lc->usr_argv_str);
582 break;
583 }
584 return (r) ? 0 : (int)sz;
585}
586
587/*
588 * userspace_is_remote_recovering
589 *
590 * Returns: 1 if region recovering, 0 otherwise
591 */
592static int userspace_is_remote_recovering(struct dm_dirty_log *log,
593 region_t region)
594{
595 int r;
596 uint64_t region64 = region;
597 struct log_c *lc = log->context;
598 static unsigned long long limit;
599 struct {
600 int64_t is_recovering;
601 uint64_t in_sync_hint;
602 } pkg;
603 size_t rdata_size = sizeof(pkg);
604
605 /*
606 * Once the mirror has been reported to be in-sync,
607 * it will never again ask for recovery work. So,
608 * we can safely say there is not a remote machine
609 * recovering if the device is in-sync. (in_sync_hint
610 * must be reset at resume time.)
611 */
612 if (region < lc->in_sync_hint)
613 return 0;
614 else if (jiffies < limit)
615 return 1;
616
617 limit = jiffies + (HZ / 4);
618 r = userspace_do_request(lc, lc->uuid, DM_ULOG_IS_REMOTE_RECOVERING,
619 (char *)&region64, sizeof(region64),
620 (char *)&pkg, &rdata_size);
621 if (r)
622 return 1;
623
624 lc->in_sync_hint = pkg.in_sync_hint;
625
626 return (int)pkg.is_recovering;
627}
628
629static struct dm_dirty_log_type _userspace_type = {
630 .name = "userspace",
631 .module = THIS_MODULE,
632 .ctr = userspace_ctr,
633 .dtr = userspace_dtr,
634 .presuspend = userspace_presuspend,
635 .postsuspend = userspace_postsuspend,
636 .resume = userspace_resume,
637 .get_region_size = userspace_get_region_size,
638 .is_clean = userspace_is_clean,
639 .in_sync = userspace_in_sync,
640 .flush = userspace_flush,
641 .mark_region = userspace_mark_region,
642 .clear_region = userspace_clear_region,
643 .get_resync_work = userspace_get_resync_work,
644 .set_region_sync = userspace_set_region_sync,
645 .get_sync_count = userspace_get_sync_count,
646 .status = userspace_status,
647 .is_remote_recovering = userspace_is_remote_recovering,
648};
649
650static int __init userspace_dirty_log_init(void)
651{
652 int r = 0;
653
654 flush_entry_pool = mempool_create(100, flush_entry_alloc,
655 flush_entry_free, NULL);
656
657 if (!flush_entry_pool) {
658 DMWARN("Unable to create flush_entry_pool: No memory.");
659 return -ENOMEM;
660 }
661
662 r = dm_ulog_tfr_init();
663 if (r) {
664 DMWARN("Unable to initialize userspace log communications");
665 mempool_destroy(flush_entry_pool);
666 return r;
667 }
668
669 r = dm_dirty_log_type_register(&_userspace_type);
670 if (r) {
671 DMWARN("Couldn't register userspace dirty log type");
672 dm_ulog_tfr_exit();
673 mempool_destroy(flush_entry_pool);
674 return r;
675 }
676
677 DMINFO("version 1.0.0 loaded");
678 return 0;
679}
680
681static void __exit userspace_dirty_log_exit(void)
682{
683 dm_dirty_log_type_unregister(&_userspace_type);
684 dm_ulog_tfr_exit();
685 mempool_destroy(flush_entry_pool);
686
687 DMINFO("version 1.0.0 unloaded");
688 return;
689}
690
691module_init(userspace_dirty_log_init);
692module_exit(userspace_dirty_log_exit);
693
694MODULE_DESCRIPTION(DM_NAME " userspace dirty log link");
695MODULE_AUTHOR("Jonathan Brassow <dm-devel@redhat.com>");
696MODULE_LICENSE("GPL");