author	Joe Thornber <thornber@redhat.com>	2011-10-31 16:19:11 -0400
committer	Alasdair G Kergon <agk@redhat.com>	2011-10-31 16:19:11 -0400
commit	3241b1d3e0aaafbfcd320f4d71ade629728cc4f4 (patch)
tree	499461f724d4db3d7118641f4a20f5be23549edd /drivers/md/persistent-data
parent	95d402f057f2e208e4631893f6cd4a59c7c05e41 (diff)
dm: add persistent data library
The persistent-data library offers a re-usable framework for the storage
and management of on-disk metadata in device-mapper targets. It's used by
the thin-provisioning target in the next patch and in an upcoming
hierarchical storage target.

For further information, please read
Documentation/device-mapper/persistent-data.txt

Signed-off-by: Joe Thornber <thornber@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
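As a rough editorial sketch of how a target might drive the btree interface added by this patch (not code from the patch itself; it assumes the caller has already built a struct dm_btree_info, since the transaction-manager setup is not part of this excerpt):

	/* Hypothetical caller; only functions introduced below are used. */
	static int demo_btree(struct dm_btree_info *info)
	{
		dm_block_t root, new_root;
		uint64_t key = 1234;	/* arbitrary example key */
		__le64 value_le;
		int r;

		r = dm_btree_empty(info, &root);	/* create an empty tree */
		if (r)
			return r;

		/* keys[] carries one entry per btree level; one level here */
		r = dm_btree_lookup(info, root, &key, &value_le);
		/* expect -ENODATA: the tree is empty */

		r = dm_btree_remove(info, root, &key, &new_root);
		/* likewise -ENODATA; *new_root is the shadowed root on success */

		return dm_btree_del(info, root);	/* free the whole tree */
	}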
Diffstat (limited to 'drivers/md/persistent-data')
-rw-r--r--	drivers/md/persistent-data/Kconfig	8
-rw-r--r--	drivers/md/persistent-data/Makefile	11
-rw-r--r--	drivers/md/persistent-data/dm-block-manager.c	620
-rw-r--r--	drivers/md/persistent-data/dm-block-manager.h	123
-rw-r--r--	drivers/md/persistent-data/dm-btree-internal.h	137
-rw-r--r--	drivers/md/persistent-data/dm-btree-remove.c	566
-rw-r--r--	drivers/md/persistent-data/dm-btree-spine.c	244
-rw-r--r--	drivers/md/persistent-data/dm-btree.c	805
-rw-r--r--	drivers/md/persistent-data/dm-btree.h	145
-rw-r--r--	drivers/md/persistent-data/dm-persistent-data-internal.h	19
-rw-r--r--	drivers/md/persistent-data/dm-space-map-checker.c	437
-rw-r--r--	drivers/md/persistent-data/dm-space-map-checker.h	26
-rw-r--r--	drivers/md/persistent-data/dm-space-map-common.c	705
-rw-r--r--	drivers/md/persistent-data/dm-space-map-common.h	126
-rw-r--r--	drivers/md/persistent-data/dm-space-map-disk.c	335
-rw-r--r--	drivers/md/persistent-data/dm-space-map-disk.h	25
-rw-r--r--	drivers/md/persistent-data/dm-space-map-metadata.c	596
-rw-r--r--	drivers/md/persistent-data/dm-space-map-metadata.h	33
-rw-r--r--	drivers/md/persistent-data/dm-space-map.h	134
-rw-r--r--	drivers/md/persistent-data/dm-transaction-manager.c	400
-rw-r--r--	drivers/md/persistent-data/dm-transaction-manager.h	130
21 files changed, 5625 insertions, 0 deletions
diff --git a/drivers/md/persistent-data/Kconfig b/drivers/md/persistent-data/Kconfig
new file mode 100644
index 000000000000..ceb359050a59
--- /dev/null
+++ b/drivers/md/persistent-data/Kconfig
@@ -0,0 +1,8 @@
1config DM_PERSISTENT_DATA
2 tristate
3 depends on BLK_DEV_DM && EXPERIMENTAL
4 select LIBCRC32C
5 select DM_BUFIO
6 ---help---
7 Library providing immutable on-disk data structure support for
8 device-mapper targets such as the thin provisioning target.
diff --git a/drivers/md/persistent-data/Makefile b/drivers/md/persistent-data/Makefile
new file mode 100644
index 000000000000..cfa95f662230
--- /dev/null
+++ b/drivers/md/persistent-data/Makefile
@@ -0,0 +1,11 @@
1obj-$(CONFIG_DM_PERSISTENT_DATA) += dm-persistent-data.o
2dm-persistent-data-objs := \
3 dm-block-manager.o \
4 dm-space-map-checker.o \
5 dm-space-map-common.o \
6 dm-space-map-disk.o \
7 dm-space-map-metadata.o \
8 dm-transaction-manager.o \
9 dm-btree.o \
10 dm-btree-remove.o \
11 dm-btree-spine.o
diff --git a/drivers/md/persistent-data/dm-block-manager.c b/drivers/md/persistent-data/dm-block-manager.c
new file mode 100644
index 000000000000..0317ecdc6e53
--- /dev/null
+++ b/drivers/md/persistent-data/dm-block-manager.c
@@ -0,0 +1,620 @@
1/*
2 * Copyright (C) 2011 Red Hat, Inc.
3 *
4 * This file is released under the GPL.
5 */
6#include "dm-block-manager.h"
7#include "dm-persistent-data-internal.h"
8#include "../dm-bufio.h"
9
10#include <linux/crc32c.h>
11#include <linux/module.h>
12#include <linux/slab.h>
13#include <linux/rwsem.h>
14#include <linux/device-mapper.h>
15#include <linux/stacktrace.h>
16
17#define DM_MSG_PREFIX "block manager"
18
19/*----------------------------------------------------------------*/
20
21/*
22 * This is a read/write semaphore with a couple of differences.
23 *
24 * i) There is a restriction on the number of concurrent read locks that
25 * may be held at once. This is just an implementation detail.
26 *
27 * ii) Recursive locking attempts are detected and return -EINVAL. A stack
28 * trace is also emitted for the previous lock acquisition.
29 *
30 * iii) Priority is given to write locks.
31 */
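An editorial illustration of point (ii), a sketch only, using the static helpers defined below: a task that already holds one of these locks and tries to take it again gets an error back rather than deadlocking.

	static int demo_recursion(void)
	{
		struct block_lock bl;
		int r;

		bl_init(&bl);
		r = bl_down_read(&bl);	/* 0: current task recorded in holders[] */
		r = bl_down_read(&bl);	/* same task: __check_holder() returns -EINVAL,
					 * and logs the first acquisition's stack when
					 * CONFIG_DM_DEBUG_BLOCK_STACK_TRACING is set */
		bl_up_read(&bl);	/* release the one lock actually held */
		return r;
	}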
32#define MAX_HOLDERS 4
33#define MAX_STACK 10
34
35typedef unsigned long stack_entries[MAX_STACK];
36
37struct block_lock {
38 spinlock_t lock;
39 __s32 count;
40 struct list_head waiters;
41 struct task_struct *holders[MAX_HOLDERS];
42
43#ifdef CONFIG_DM_DEBUG_BLOCK_STACK_TRACING
44 struct stack_trace traces[MAX_HOLDERS];
45 stack_entries entries[MAX_HOLDERS];
46#endif
47};
48
49struct waiter {
50 struct list_head list;
51 struct task_struct *task;
52 int wants_write;
53};
54
55static unsigned __find_holder(struct block_lock *lock,
56 struct task_struct *task)
57{
58 unsigned i;
59
60 for (i = 0; i < MAX_HOLDERS; i++)
61 if (lock->holders[i] == task)
62 break;
63
64 BUG_ON(i == MAX_HOLDERS);
65 return i;
66}
67
68/* call this *after* you increment lock->count */
69static void __add_holder(struct block_lock *lock, struct task_struct *task)
70{
71 unsigned h = __find_holder(lock, NULL);
72#ifdef CONFIG_DM_DEBUG_BLOCK_STACK_TRACING
73 struct stack_trace *t;
74#endif
75
76 get_task_struct(task);
77 lock->holders[h] = task;
78
79#ifdef CONFIG_DM_DEBUG_BLOCK_STACK_TRACING
80 t = lock->traces + h;
81 t->nr_entries = 0;
82 t->max_entries = MAX_STACK;
83 t->entries = lock->entries[h];
84 t->skip = 2;
85 save_stack_trace(t);
86#endif
87}
88
89/* call this *before* you decrement lock->count */
90static void __del_holder(struct block_lock *lock, struct task_struct *task)
91{
92 unsigned h = __find_holder(lock, task);
93 lock->holders[h] = NULL;
94 put_task_struct(task);
95}
96
97static int __check_holder(struct block_lock *lock)
98{
99 unsigned i;
100#ifdef CONFIG_DM_DEBUG_BLOCK_STACK_TRACING
101 static struct stack_trace t;
102 static stack_entries entries;
103#endif
104
105 for (i = 0; i < MAX_HOLDERS; i++) {
106 if (lock->holders[i] == current) {
107 DMERR("recursive lock detected in pool metadata");
108#ifdef CONFIG_DM_DEBUG_BLOCK_STACK_TRACING
109 DMERR("previously held here:");
110 print_stack_trace(lock->traces + i, 4);
111
112 DMERR("subsequent acquisition attempted here:");
113 t.nr_entries = 0;
114 t.max_entries = MAX_STACK;
115 t.entries = entries;
116 t.skip = 3;
117 save_stack_trace(&t);
118 print_stack_trace(&t, 4);
119#endif
120 return -EINVAL;
121 }
122 }
123
124 return 0;
125}
126
127static void __wait(struct waiter *w)
128{
129 for (;;) {
130 set_task_state(current, TASK_UNINTERRUPTIBLE);
131
132 if (!w->task)
133 break;
134
135 schedule();
136 }
137
138 set_task_state(current, TASK_RUNNING);
139}
140
141static void __wake_waiter(struct waiter *w)
142{
143 struct task_struct *task;
144
145 list_del(&w->list);
146 task = w->task;
147 smp_mb();
148 w->task = NULL;
149 wake_up_process(task);
150}
151
152/*
153 * We either wake a few readers or a single writer.
154 */
155static void __wake_many(struct block_lock *lock)
156{
157 struct waiter *w, *tmp;
158
159 BUG_ON(lock->count < 0);
160 list_for_each_entry_safe(w, tmp, &lock->waiters, list) {
161 if (lock->count >= MAX_HOLDERS)
162 return;
163
164 if (w->wants_write) {
165 if (lock->count > 0)
166 return; /* still read locked */
167
168 lock->count = -1;
169 __add_holder(lock, w->task);
170 __wake_waiter(w);
171 return;
172 }
173
174 lock->count++;
175 __add_holder(lock, w->task);
176 __wake_waiter(w);
177 }
178}
179
180static void bl_init(struct block_lock *lock)
181{
182 int i;
183
184 spin_lock_init(&lock->lock);
185 lock->count = 0;
186 INIT_LIST_HEAD(&lock->waiters);
187 for (i = 0; i < MAX_HOLDERS; i++)
188 lock->holders[i] = NULL;
189}
190
191static int __available_for_read(struct block_lock *lock)
192{
193 return lock->count >= 0 &&
194 lock->count < MAX_HOLDERS &&
195 list_empty(&lock->waiters);
196}
197
198static int bl_down_read(struct block_lock *lock)
199{
200 int r;
201 struct waiter w;
202
203 spin_lock(&lock->lock);
204 r = __check_holder(lock);
205 if (r) {
206 spin_unlock(&lock->lock);
207 return r;
208 }
209
210 if (__available_for_read(lock)) {
211 lock->count++;
212 __add_holder(lock, current);
213 spin_unlock(&lock->lock);
214 return 0;
215 }
216
217 get_task_struct(current);
218
219 w.task = current;
220 w.wants_write = 0;
221 list_add_tail(&w.list, &lock->waiters);
222 spin_unlock(&lock->lock);
223
224 __wait(&w);
225 put_task_struct(current);
226 return 0;
227}
228
229static int bl_down_read_nonblock(struct block_lock *lock)
230{
231 int r;
232
233 spin_lock(&lock->lock);
234 r = __check_holder(lock);
235 if (r)
236 goto out;
237
238 if (__available_for_read(lock)) {
239 lock->count++;
240 __add_holder(lock, current);
241 r = 0;
242 } else
243 r = -EWOULDBLOCK;
244
245out:
246 spin_unlock(&lock->lock);
247 return r;
248}
249
250static void bl_up_read(struct block_lock *lock)
251{
252 spin_lock(&lock->lock);
253 BUG_ON(lock->count <= 0);
254 __del_holder(lock, current);
255 --lock->count;
256 if (!list_empty(&lock->waiters))
257 __wake_many(lock);
258 spin_unlock(&lock->lock);
259}
260
261static int bl_down_write(struct block_lock *lock)
262{
263 int r;
264 struct waiter w;
265
266 spin_lock(&lock->lock);
267 r = __check_holder(lock);
268 if (r) {
269 spin_unlock(&lock->lock);
270 return r;
271 }
272
273 if (lock->count == 0 && list_empty(&lock->waiters)) {
274 lock->count = -1;
275 __add_holder(lock, current);
276 spin_unlock(&lock->lock);
277 return 0;
278 }
279
280 get_task_struct(current);
281 w.task = current;
282 w.wants_write = 1;
283
284 /*
285 * Writers are given priority. We know there's only one mutator in the
286 * system, so we can ignore the ordering reversal.
287 */
288 list_add(&w.list, &lock->waiters);
289 spin_unlock(&lock->lock);
290
291 __wait(&w);
292 put_task_struct(current);
293
294 return 0;
295}
296
297static void bl_up_write(struct block_lock *lock)
298{
299 spin_lock(&lock->lock);
300 __del_holder(lock, current);
301 lock->count = 0;
302 if (!list_empty(&lock->waiters))
303 __wake_many(lock);
304 spin_unlock(&lock->lock);
305}
306
307static void report_recursive_bug(dm_block_t b, int r)
308{
309 if (r == -EINVAL)
310 DMERR("recursive acquisition of block %llu requested.",
311 (unsigned long long) b);
312}
313
314/*----------------------------------------------------------------*/
315
316/*
317 * Block manager is currently implemented using dm-bufio. struct
318 * dm_block_manager and struct dm_block map directly onto a couple of
319 * structs in the bufio interface. I want to retain the freedom to move
320 * away from bufio in the future. So these structs are just cast within
321 * this .c file, rather than making it through to the public interface.
322 */
323static struct dm_buffer *to_buffer(struct dm_block *b)
324{
325 return (struct dm_buffer *) b;
326}
327
328static struct dm_bufio_client *to_bufio(struct dm_block_manager *bm)
329{
330 return (struct dm_bufio_client *) bm;
331}
332
333dm_block_t dm_block_location(struct dm_block *b)
334{
335 return dm_bufio_get_block_number(to_buffer(b));
336}
337EXPORT_SYMBOL_GPL(dm_block_location);
338
339void *dm_block_data(struct dm_block *b)
340{
341 return dm_bufio_get_block_data(to_buffer(b));
342}
343EXPORT_SYMBOL_GPL(dm_block_data);
344
345struct buffer_aux {
346 struct dm_block_validator *validator;
347 struct block_lock lock;
348 int write_locked;
349};
350
351static void dm_block_manager_alloc_callback(struct dm_buffer *buf)
352{
353 struct buffer_aux *aux = dm_bufio_get_aux_data(buf);
354 aux->validator = NULL;
355 bl_init(&aux->lock);
356}
357
358static void dm_block_manager_write_callback(struct dm_buffer *buf)
359{
360 struct buffer_aux *aux = dm_bufio_get_aux_data(buf);
361 if (aux->validator) {
362 aux->validator->prepare_for_write(aux->validator, (struct dm_block *) buf,
363 dm_bufio_get_block_size(dm_bufio_get_client(buf)));
364 }
365}
366
367/*----------------------------------------------------------------
368 * Public interface
369 *--------------------------------------------------------------*/
370struct dm_block_manager *dm_block_manager_create(struct block_device *bdev,
371 unsigned block_size,
372 unsigned cache_size,
373 unsigned max_held_per_thread)
374{
375 return (struct dm_block_manager *)
376 dm_bufio_client_create(bdev, block_size, max_held_per_thread,
377 sizeof(struct buffer_aux),
378 dm_block_manager_alloc_callback,
379 dm_block_manager_write_callback);
380}
381EXPORT_SYMBOL_GPL(dm_block_manager_create);
382
383void dm_block_manager_destroy(struct dm_block_manager *bm)
384{
385 return dm_bufio_client_destroy(to_bufio(bm));
386}
387EXPORT_SYMBOL_GPL(dm_block_manager_destroy);
388
389unsigned dm_bm_block_size(struct dm_block_manager *bm)
390{
391 return dm_bufio_get_block_size(to_bufio(bm));
392}
393EXPORT_SYMBOL_GPL(dm_bm_block_size);
394
395dm_block_t dm_bm_nr_blocks(struct dm_block_manager *bm)
396{
397 return dm_bufio_get_device_size(to_bufio(bm));
398}
399
400static int dm_bm_validate_buffer(struct dm_block_manager *bm,
401 struct dm_buffer *buf,
402 struct buffer_aux *aux,
403 struct dm_block_validator *v)
404{
405 if (unlikely(!aux->validator)) {
406 int r;
407 if (!v)
408 return 0;
409 r = v->check(v, (struct dm_block *) buf, dm_bufio_get_block_size(to_bufio(bm)));
410 if (unlikely(r))
411 return r;
412 aux->validator = v;
413 } else {
414 if (unlikely(aux->validator != v)) {
415 DMERR("validator mismatch (old=%s vs new=%s) for block %llu",
416 aux->validator->name, v ? v->name : "NULL",
417 (unsigned long long)
418 dm_bufio_get_block_number(buf));
419 return -EINVAL;
420 }
421 }
422
423 return 0;
424}
425int dm_bm_read_lock(struct dm_block_manager *bm, dm_block_t b,
426 struct dm_block_validator *v,
427 struct dm_block **result)
428{
429 struct buffer_aux *aux;
430 void *p;
431 int r;
432
433 p = dm_bufio_read(to_bufio(bm), b, (struct dm_buffer **) result);
434 if (unlikely(IS_ERR(p)))
435 return PTR_ERR(p);
436
437 aux = dm_bufio_get_aux_data(to_buffer(*result));
438 r = bl_down_read(&aux->lock);
439 if (unlikely(r)) {
440 dm_bufio_release(to_buffer(*result));
441 report_recursive_bug(b, r);
442 return r;
443 }
444
445 aux->write_locked = 0;
446
447 r = dm_bm_validate_buffer(bm, to_buffer(*result), aux, v);
448 if (unlikely(r)) {
449 bl_up_read(&aux->lock);
450 dm_bufio_release(to_buffer(*result));
451 return r;
452 }
453
454 return 0;
455}
456EXPORT_SYMBOL_GPL(dm_bm_read_lock);
457
458int dm_bm_write_lock(struct dm_block_manager *bm,
459 dm_block_t b, struct dm_block_validator *v,
460 struct dm_block **result)
461{
462 struct buffer_aux *aux;
463 void *p;
464 int r;
465
466 p = dm_bufio_read(to_bufio(bm), b, (struct dm_buffer **) result);
467 if (unlikely(IS_ERR(p)))
468 return PTR_ERR(p);
469
470 aux = dm_bufio_get_aux_data(to_buffer(*result));
471 r = bl_down_write(&aux->lock);
472 if (r) {
473 dm_bufio_release(to_buffer(*result));
474 report_recursive_bug(b, r);
475 return r;
476 }
477
478 aux->write_locked = 1;
479
480 r = dm_bm_validate_buffer(bm, to_buffer(*result), aux, v);
481 if (unlikely(r)) {
482 bl_up_write(&aux->lock);
483 dm_bufio_release(to_buffer(*result));
484 return r;
485 }
486
487 return 0;
488}
489EXPORT_SYMBOL_GPL(dm_bm_write_lock);
490
491int dm_bm_read_try_lock(struct dm_block_manager *bm,
492 dm_block_t b, struct dm_block_validator *v,
493 struct dm_block **result)
494{
495 struct buffer_aux *aux;
496 void *p;
497 int r;
498
499 p = dm_bufio_get(to_bufio(bm), b, (struct dm_buffer **) result);
500 if (unlikely(IS_ERR(p)))
501 return PTR_ERR(p);
502 if (unlikely(!p))
503 return -EWOULDBLOCK;
504
505 aux = dm_bufio_get_aux_data(to_buffer(*result));
506 r = bl_down_read_nonblock(&aux->lock);
507 if (r < 0) {
508 dm_bufio_release(to_buffer(*result));
509 report_recursive_bug(b, r);
510 return r;
511 }
512 aux->write_locked = 0;
513
514 r = dm_bm_validate_buffer(bm, to_buffer(*result), aux, v);
515 if (unlikely(r)) {
516 bl_up_read(&aux->lock);
517 dm_bufio_release(to_buffer(*result));
518 return r;
519 }
520
521 return 0;
522}
523
524int dm_bm_write_lock_zero(struct dm_block_manager *bm,
525 dm_block_t b, struct dm_block_validator *v,
526 struct dm_block **result)
527{
528 int r;
529 struct buffer_aux *aux;
530 void *p;
531
532 p = dm_bufio_new(to_bufio(bm), b, (struct dm_buffer **) result);
533 if (unlikely(IS_ERR(p)))
534 return PTR_ERR(p);
535
536 memset(p, 0, dm_bm_block_size(bm));
537
538 aux = dm_bufio_get_aux_data(to_buffer(*result));
539 r = bl_down_write(&aux->lock);
540 if (r) {
541 dm_bufio_release(to_buffer(*result));
542 return r;
543 }
544
545 aux->write_locked = 1;
546 aux->validator = v;
547
548 return 0;
549}
550
551int dm_bm_unlock(struct dm_block *b)
552{
553 struct buffer_aux *aux;
554 aux = dm_bufio_get_aux_data(to_buffer(b));
555
556 if (aux->write_locked) {
557 dm_bufio_mark_buffer_dirty(to_buffer(b));
558 bl_up_write(&aux->lock);
559 } else
560 bl_up_read(&aux->lock);
561
562 dm_bufio_release(to_buffer(b));
563
564 return 0;
565}
566EXPORT_SYMBOL_GPL(dm_bm_unlock);
567
568int dm_bm_unlock_move(struct dm_block *b, dm_block_t n)
569{
570 struct buffer_aux *aux;
571
572 aux = dm_bufio_get_aux_data(to_buffer(b));
573
574 if (aux->write_locked) {
575 dm_bufio_mark_buffer_dirty(to_buffer(b));
576 bl_up_write(&aux->lock);
577 } else
578 bl_up_read(&aux->lock);
579
580 dm_bufio_release_move(to_buffer(b), n);
581 return 0;
582}
583
584int dm_bm_flush_and_unlock(struct dm_block_manager *bm,
585 struct dm_block *superblock)
586{
587 int r;
588
589 r = dm_bufio_write_dirty_buffers(to_bufio(bm));
590 if (unlikely(r))
591 return r;
592 r = dm_bufio_issue_flush(to_bufio(bm));
593 if (unlikely(r))
594 return r;
595
596 dm_bm_unlock(superblock);
597
598 r = dm_bufio_write_dirty_buffers(to_bufio(bm));
599 if (unlikely(r))
600 return r;
601 r = dm_bufio_issue_flush(to_bufio(bm));
602 if (unlikely(r))
603 return r;
604
605 return 0;
606}
607
608u32 dm_bm_checksum(const void *data, size_t len, u32 init_xor)
609{
610 return crc32c(~(u32) 0, data, len) ^ init_xor;
611}
612EXPORT_SYMBOL_GPL(dm_bm_checksum);
613
614/*----------------------------------------------------------------*/
615
616MODULE_LICENSE("GPL");
617MODULE_AUTHOR("Joe Thornber <dm-devel@redhat.com>");
618MODULE_DESCRIPTION("Immutable metadata library for dm");
619
620/*----------------------------------------------------------------*/
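A minimal usage sketch for the interface above (editorial, not part of the patch): the sizing and locking parameters are arbitrary, a NULL validator is passed so no verification happens, and bufio's ERR_PTR failure convention is assumed.

	static int demo_read(struct block_device *bdev, dm_block_t b, void *copy)
	{
		struct dm_block_manager *bm;
		struct dm_block *blk;
		int r;

		/* 4KiB blocks; note cache_size isn't forwarded to bufio above */
		bm = dm_block_manager_create(bdev, 4096, 0, 1);
		if (IS_ERR(bm))
			return PTR_ERR(bm);

		r = dm_bm_read_lock(bm, b, NULL, &blk);
		if (!r) {
			/* the data pointer is only valid while the lock is held */
			memcpy(copy, dm_block_data(blk), dm_bm_block_size(bm));
			dm_bm_unlock(blk);
		}

		dm_block_manager_destroy(bm);
		return r;
	}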
diff --git a/drivers/md/persistent-data/dm-block-manager.h b/drivers/md/persistent-data/dm-block-manager.h
new file mode 100644
index 000000000000..924833d2dfa6
--- /dev/null
+++ b/drivers/md/persistent-data/dm-block-manager.h
@@ -0,0 +1,123 @@
1/*
2 * Copyright (C) 2011 Red Hat, Inc.
3 *
4 * This file is released under the GPL.
5 */
6
7#ifndef _LINUX_DM_BLOCK_MANAGER_H
8#define _LINUX_DM_BLOCK_MANAGER_H
9
10#include <linux/types.h>
11#include <linux/blkdev.h>
12
13/*----------------------------------------------------------------*/
14
15/*
16 * Block number.
17 */
18typedef uint64_t dm_block_t;
19struct dm_block;
20
21dm_block_t dm_block_location(struct dm_block *b);
22void *dm_block_data(struct dm_block *b);
23
24/*----------------------------------------------------------------*/
25
26/*
27 * @name should be a unique identifier for the block manager, no longer
28 * than 32 chars.
29 *
30 * @max_held_per_thread should be the maximum number of locks, read or
31 * write, that an individual thread holds at any one time.
32 */
33struct dm_block_manager;
34struct dm_block_manager *dm_block_manager_create(
35 struct block_device *bdev, unsigned block_size,
36 unsigned cache_size, unsigned max_held_per_thread);
37void dm_block_manager_destroy(struct dm_block_manager *bm);
38
39unsigned dm_bm_block_size(struct dm_block_manager *bm);
40dm_block_t dm_bm_nr_blocks(struct dm_block_manager *bm);
41
42/*----------------------------------------------------------------*/
43
44/*
45 * The validator allows the caller to verify newly-read data and modify
46 * the data just before writing, e.g. to calculate checksums. It's
47 * important to be consistent with your use of validators. The only time
48 * you can change validators is if you call dm_bm_write_lock_zero.
49 */
50struct dm_block_validator {
51 const char *name;
52 void (*prepare_for_write)(struct dm_block_validator *v, struct dm_block *b, size_t block_size);
53
54 /*
55 * Return 0 if the checksum is valid or < 0 on error.
56 */
57 int (*check)(struct dm_block_validator *v, struct dm_block *b, size_t block_size);
58};
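An editorial example of a validator, modelled on the btree node validator that appears later in this patch; the XOR salt and the assumption that the checksum is the first __le32 of the block are made up for illustration:

	#define DEMO_CSUM_XOR 7919	/* arbitrary salt, one per block type */

	static void demo_prepare_for_write(struct dm_block_validator *v,
					   struct dm_block *b, size_t block_size)
	{
		__le32 *csum = dm_block_data(b);

		/* checksum everything after the csum field itself */
		*csum = cpu_to_le32(dm_bm_checksum(csum + 1,
						   block_size - sizeof(__le32),
						   DEMO_CSUM_XOR));
	}

	static int demo_check(struct dm_block_validator *v,
			      struct dm_block *b, size_t block_size)
	{
		__le32 *csum = dm_block_data(b);

		if (*csum != cpu_to_le32(dm_bm_checksum(csum + 1,
							block_size - sizeof(__le32),
							DEMO_CSUM_XOR)))
			return -EILSEQ;	/* corrupt or foreign block */

		return 0;
	}

	static struct dm_block_validator demo_validator = {
		.name = "demo",
		.prepare_for_write = demo_prepare_for_write,
		.check = demo_check,
	};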
59
60/*----------------------------------------------------------------*/
61
62/*
63 * You can have multiple concurrent readers or a single writer holding a
64 * block lock.
65 */
66
67/*
68 * dm_bm_read_lock() and dm_bm_write_lock() lock a block and return in
69 * @result a pointer to memory holding a copy of that block. If you have
70 * write-locked the block then any changes you make to that memory will
71 * be written back to the disk sometime after dm_bm_unlock is called.
72 */
73int dm_bm_read_lock(struct dm_block_manager *bm, dm_block_t b,
74 struct dm_block_validator *v,
75 struct dm_block **result);
76
77int dm_bm_write_lock(struct dm_block_manager *bm, dm_block_t b,
78 struct dm_block_validator *v,
79 struct dm_block **result);
80
81/*
82 * The *_try_lock variants return -EWOULDBLOCK if the block isn't
83 * available immediately.
84 */
85int dm_bm_read_try_lock(struct dm_block_manager *bm, dm_block_t b,
86 struct dm_block_validator *v,
87 struct dm_block **result);
88
89/*
90 * Use dm_bm_write_lock_zero() when you know you're going to
91 * overwrite the block completely. It saves a disk read.
92 */
93int dm_bm_write_lock_zero(struct dm_block_manager *bm, dm_block_t b,
94 struct dm_block_validator *v,
95 struct dm_block **result);
96
97int dm_bm_unlock(struct dm_block *b);
98
99/*
100 * An optimisation: we often want to copy a block's contents to a new
101 * block, e.g. as part of the shadowing operation. It's far better for
102 * bufio to do this move behind the scenes than to hold 2 locks and
103 * memcpy the data.
104 */
105int dm_bm_unlock_move(struct dm_block *b, dm_block_t n);
106
107/*
108 * It's a common idiom to have a superblock that should be committed last.
109 *
110 * @superblock should be write-locked on entry. It will be unlocked during
111 * this function. All dirty blocks are guaranteed to be written and flushed
112 * before the superblock.
113 *
114 * This method always blocks.
115 */
116int dm_bm_flush_and_unlock(struct dm_block_manager *bm,
117 struct dm_block *superblock);
118
119u32 dm_bm_checksum(const void *data, size_t len, u32 init_xor);
120
121/*----------------------------------------------------------------*/
122
123#endif /* _LINUX_DM_BLOCK_MANAGER_H */
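A sketch of the dm_bm_flush_and_unlock() idiom described above (editorial; block 0 as the superblock location and the validator are assumptions):

	static int demo_commit(struct dm_block_manager *bm,
			       struct dm_block_validator *sb_validator)
	{
		struct dm_block *sblock;
		int r;

		r = dm_bm_write_lock(bm, 0, sb_validator, &sblock);
		if (r)
			return r;

		/* ... update the superblock through dm_block_data(sblock) ... */

		/* writes + flushes all other dirty blocks, then the superblock */
		return dm_bm_flush_and_unlock(bm, sblock);
	}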
diff --git a/drivers/md/persistent-data/dm-btree-internal.h b/drivers/md/persistent-data/dm-btree-internal.h
new file mode 100644
index 000000000000..d279c768f8f1
--- /dev/null
+++ b/drivers/md/persistent-data/dm-btree-internal.h
@@ -0,0 +1,137 @@
1/*
2 * Copyright (C) 2011 Red Hat, Inc.
3 *
4 * This file is released under the GPL.
5 */
6
7#ifndef DM_BTREE_INTERNAL_H
8#define DM_BTREE_INTERNAL_H
9
10#include "dm-btree.h"
11
12/*----------------------------------------------------------------*/
13
14/*
15 * We'll need 2 accessor functions for n->csum and n->blocknr
16 * to support dm-btree-spine.c.
17 */
18
19enum node_flags {
20 INTERNAL_NODE = 1,
21 LEAF_NODE = 1 << 1
22};
23
24/*
25 * Every btree node begins with this structure. Make sure it's a multiple
26 * of 8 bytes in size, otherwise the 64-bit keys will be misaligned.
27 */
28struct node_header {
29 __le32 csum;
30 __le32 flags;
31 __le64 blocknr; /* Block this node is supposed to live in. */
32
33 __le32 nr_entries;
34 __le32 max_entries;
35 __le32 value_size;
36 __le32 padding;
37} __packed;
38
39struct node {
40 struct node_header header;
41 __le64 keys[0];
42} __packed;
43
44
45void inc_children(struct dm_transaction_manager *tm, struct node *n,
46 struct dm_btree_value_type *vt);
47
48int new_block(struct dm_btree_info *info, struct dm_block **result);
49int unlock_block(struct dm_btree_info *info, struct dm_block *b);
50
51/*
52 * Spines keep track of the rolling locks. There are 2 variants: one
53 * read-only and one that uses shadowing. These are separate structs to
54 * allow the type checker to spot misuse, for example accidentally calling
55 * read_lock on a shadow spine.
56 */
57struct ro_spine {
58 struct dm_btree_info *info;
59
60 int count;
61 struct dm_block *nodes[2];
62};
63
64void init_ro_spine(struct ro_spine *s, struct dm_btree_info *info);
65int exit_ro_spine(struct ro_spine *s);
66int ro_step(struct ro_spine *s, dm_block_t new_child);
67struct node *ro_node(struct ro_spine *s);
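An editorial sketch of a read-only walk using the spine API above: ro_step() keeps at most two nodes locked, releasing the grandparent as it descends (the first-child descent is just for illustration).

	static int demo_walk_to_leaf(struct dm_btree_info *info, dm_block_t root)
	{
		struct ro_spine spine;
		struct node *n;
		int r;

		init_ro_spine(&spine, info);

		r = ro_step(&spine, root);		/* read-lock the root */
		while (!r) {
			n = ro_node(&spine);		/* deepest locked node */
			if (le32_to_cpu(n->header.flags) & LEAF_NODE)
				break;
			r = ro_step(&spine, value64(n, 0));
		}

		exit_ro_spine(&spine);			/* drop remaining locks */
		return r;
	}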
68
69struct shadow_spine {
70 struct dm_btree_info *info;
71
72 int count;
73 struct dm_block *nodes[2];
74
75 dm_block_t root;
76};
77
78void init_shadow_spine(struct shadow_spine *s, struct dm_btree_info *info);
79int exit_shadow_spine(struct shadow_spine *s);
80
81int shadow_step(struct shadow_spine *s, dm_block_t b,
82 struct dm_btree_value_type *vt);
83
84/*
85 * The spine must have at least one entry before calling this.
86 */
87struct dm_block *shadow_current(struct shadow_spine *s);
88
89/*
90 * The spine must have at least two entries before calling this.
91 */
92struct dm_block *shadow_parent(struct shadow_spine *s);
93
94int shadow_has_parent(struct shadow_spine *s);
95
96int shadow_root(struct shadow_spine *s);
97
98/*
99 * Some inlines.
100 */
101static inline __le64 *key_ptr(struct node *n, uint32_t index)
102{
103 return n->keys + index;
104}
105
106static inline void *value_base(struct node *n)
107{
108 return &n->keys[le32_to_cpu(n->header.max_entries)];
109}
110
111/*
112 * FIXME: Now that value size is stored in node we don't need the third parm.
113 */
114static inline void *value_ptr(struct node *n, uint32_t index, size_t value_size)
115{
116 BUG_ON(value_size != le32_to_cpu(n->header.value_size));
117 return value_base(n) + (value_size * index);
118}
119
120/*
121 * Assumes the values are suitably-aligned and converts to core format.
122 */
123static inline uint64_t value64(struct node *n, uint32_t index)
124{
125 __le64 *values_le = value_base(n);
126
127 return le64_to_cpu(values_le[index]);
128}
129
130/*
131 * Searching for a key within a single node.
132 */
133int lower_bound(struct node *n, uint64_t key);
134
135extern struct dm_block_validator btree_node_validator;
136
137#endif /* DM_BTREE_INTERNAL_H */
diff --git a/drivers/md/persistent-data/dm-btree-remove.c b/drivers/md/persistent-data/dm-btree-remove.c
new file mode 100644
index 000000000000..65fd85ec6514
--- /dev/null
+++ b/drivers/md/persistent-data/dm-btree-remove.c
@@ -0,0 +1,566 @@
1/*
2 * Copyright (C) 2011 Red Hat, Inc.
3 *
4 * This file is released under the GPL.
5 */
6
7#include "dm-btree.h"
8#include "dm-btree-internal.h"
9#include "dm-transaction-manager.h"
10
11#include <linux/module.h>
12
13/*
14 * Removing an entry from a btree
15 * ==============================
16 *
17 * A very important constraint for our btree is that no node, except the
18 * root, may have fewer than a certain number of entries.
19 * (MIN_ENTRIES <= nr_entries <= MAX_ENTRIES).
20 *
21 * Ensuring this is complicated by the way we want to only ever hold the
22 * locks on 2 nodes concurrently, and only change nodes in a top to bottom
23 * fashion.
24 *
25 * Each node may have a left or right sibling. When descending the spine,
26 * if a node contains only MIN_ENTRIES then we try to increase this to at
27 * least MIN_ENTRIES + 1. We do this in the following ways:
28 *
29 * [A] No siblings => this can only happen if the node is the root, in which
30 * case we copy the child's contents over the root.
31 *
32 * [B] No left sibling
33 * ==> rebalance(node, right sibling)
34 *
35 * [C] No right sibling
36 * ==> rebalance(left sibling, node)
37 *
38 * [D] Both siblings, total_entries(left, node, right) <= DEL_THRESHOLD
39 * ==> delete the node, adding its contents to left and right
40 *
41 * [E] Both siblings, total_entries(left, node, right) > DEL_THRESHOLD
42 * ==> rebalance(left, node, right)
43 *
44 * After these operations it's possible that our original node no
45 * longer contains the desired sub tree. For this reason this rebalancing
46 * is performed on the children of the current node. This also avoids
47 * having a special case for the root.
48 *
49 * Once this rebalancing has occurred we can then step into the child
50 * node for internal nodes, or delete the entry for leaf nodes.
51 */
52
53/*
54 * Some little utilities for moving node data around.
55 */
56static void node_shift(struct node *n, int shift)
57{
58 uint32_t nr_entries = le32_to_cpu(n->header.nr_entries);
59 uint32_t value_size = le32_to_cpu(n->header.value_size);
60
61 if (shift < 0) {
62 shift = -shift;
63 BUG_ON(shift > nr_entries);
64 BUG_ON((void *) key_ptr(n, shift) >= value_ptr(n, shift, value_size));
65 memmove(key_ptr(n, 0),
66 key_ptr(n, shift),
67 (nr_entries - shift) * sizeof(__le64));
68 memmove(value_ptr(n, 0, value_size),
69 value_ptr(n, shift, value_size),
70 (nr_entries - shift) * value_size);
71 } else {
72 BUG_ON(nr_entries + shift > le32_to_cpu(n->header.max_entries));
73 memmove(key_ptr(n, shift),
74 key_ptr(n, 0),
75 nr_entries * sizeof(__le64));
76 memmove(value_ptr(n, shift, value_size),
77 value_ptr(n, 0, value_size),
78 nr_entries * value_size);
79 }
80}
81
82static void node_copy(struct node *left, struct node *right, int shift)
83{
84 uint32_t nr_left = le32_to_cpu(left->header.nr_entries);
85 uint32_t value_size = le32_to_cpu(left->header.value_size);
86 BUG_ON(value_size != le32_to_cpu(right->header.value_size));
87
88 if (shift < 0) {
89 shift = -shift;
90 BUG_ON(nr_left + shift > le32_to_cpu(left->header.max_entries));
91 memcpy(key_ptr(left, nr_left),
92 key_ptr(right, 0),
93 shift * sizeof(__le64));
94 memcpy(value_ptr(left, nr_left, value_size),
95 value_ptr(right, 0, value_size),
96 shift * value_size);
97 } else {
98 BUG_ON(shift > le32_to_cpu(right->header.max_entries));
99 memcpy(key_ptr(right, 0),
100 key_ptr(left, nr_left - shift),
101 shift * sizeof(__le64));
102 memcpy(value_ptr(right, 0, value_size),
103 value_ptr(left, nr_left - shift, value_size),
104 shift * value_size);
105 }
106}
107
108/*
109 * Delete a specific entry from a leaf node.
110 */
111static void delete_at(struct node *n, unsigned index)
112{
113 unsigned nr_entries = le32_to_cpu(n->header.nr_entries);
114 unsigned nr_to_copy = nr_entries - (index + 1);
115 uint32_t value_size = le32_to_cpu(n->header.value_size);
116 BUG_ON(index >= nr_entries);
117
118 if (nr_to_copy) {
119 memmove(key_ptr(n, index),
120 key_ptr(n, index + 1),
121 nr_to_copy * sizeof(__le64));
122
123 memmove(value_ptr(n, index, value_size),
124 value_ptr(n, index + 1, value_size),
125 nr_to_copy * value_size);
126 }
127
128 n->header.nr_entries = cpu_to_le32(nr_entries - 1);
129}
130
131static unsigned del_threshold(struct node *n)
132{
133 return le32_to_cpu(n->header.max_entries) / 3;
134}
135
136static unsigned merge_threshold(struct node *n)
137{
138 /*
139 * The extra one is because we know we're potentially going to
140 * delete an entry.
141 */
142 return 2 * (le32_to_cpu(n->header.max_entries) / 3) + 1;
143}
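For a concrete feel (editorial worked example): with 4096-byte blocks and 8-byte values, calc_max_entries() in dm-btree.c yields max_entries = 252, so del_threshold() is 84 and merge_threshold() is 169. A child is therefore left alone until its population drops to 84 entries or fewer, and two siblings are collapsed into one node in __rebalance2() only while their combined population is at most 169.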
144
145struct child {
146 unsigned index;
147 struct dm_block *block;
148 struct node *n;
149};
150
151static struct dm_btree_value_type le64_type = {
152 .context = NULL,
153 .size = sizeof(__le64),
154 .inc = NULL,
155 .dec = NULL,
156 .equal = NULL
157};
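le64_type above treats values as plain 64-bit data, with no per-value callbacks. A value type whose values reference other on-disk blocks would supply inc/dec so that shadowing and deletion keep reference counts right; a hypothetical example (the context wiring and names are made up):

	static void demo_ref_inc(void *context, void *value_le)
	{
		struct dm_transaction_manager *tm = context;

		dm_tm_inc(tm, le64_to_cpu(*(__le64 *) value_le));
	}

	static void demo_ref_dec(void *context, void *value_le)
	{
		struct dm_transaction_manager *tm = context;

		dm_tm_dec(tm, le64_to_cpu(*(__le64 *) value_le));
	}

	static struct dm_btree_value_type demo_ref_type = {
		.context = NULL,	/* filled in with the tm at runtime */
		.size = sizeof(__le64),
		.inc = demo_ref_inc,
		.dec = demo_ref_dec,
		.equal = NULL,
	};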
158
159static int init_child(struct dm_btree_info *info, struct node *parent,
160 unsigned index, struct child *result)
161{
162 int r, inc;
163 dm_block_t root;
164
165 result->index = index;
166 root = value64(parent, index);
167
168 r = dm_tm_shadow_block(info->tm, root, &btree_node_validator,
169 &result->block, &inc);
170 if (r)
171 return r;
172
173 result->n = dm_block_data(result->block);
174
175 if (inc)
176 inc_children(info->tm, result->n, &le64_type);
177
178 *((__le64 *) value_ptr(parent, index, sizeof(__le64))) =
179 cpu_to_le64(dm_block_location(result->block));
180
181 return 0;
182}
183
184static int exit_child(struct dm_btree_info *info, struct child *c)
185{
186 return dm_tm_unlock(info->tm, c->block);
187}
188
189static void shift(struct node *left, struct node *right, int count)
190{
191 if (!count)
192 return;
193
194 if (count > 0) {
195 node_shift(right, count);
196 node_copy(left, right, count);
197 } else {
198 node_copy(left, right, count);
199 node_shift(right, count);
200 }
201
202 left->header.nr_entries =
203 cpu_to_le32(le32_to_cpu(left->header.nr_entries) - count);
204 BUG_ON(le32_to_cpu(left->header.nr_entries) > le32_to_cpu(left->header.max_entries));
205
206 right->header.nr_entries =
207 cpu_to_le32(le32_to_cpu(right->header.nr_entries) + count);
208 BUG_ON(le32_to_cpu(right->header.nr_entries) > le32_to_cpu(right->header.max_entries));
209}
210
211static void __rebalance2(struct dm_btree_info *info, struct node *parent,
212 struct child *l, struct child *r)
213{
214 struct node *left = l->n;
215 struct node *right = r->n;
216 uint32_t nr_left = le32_to_cpu(left->header.nr_entries);
217 uint32_t nr_right = le32_to_cpu(right->header.nr_entries);
218
219 if (nr_left + nr_right <= merge_threshold(left)) {
220 /*
221 * Merge
222 */
223 node_copy(left, right, -nr_right);
224 left->header.nr_entries = cpu_to_le32(nr_left + nr_right);
225 delete_at(parent, r->index);
226
227 /*
228 * We need to decrement the right block, but not its
229 * children, since they're still referenced by left.
230 */
231 dm_tm_dec(info->tm, dm_block_location(r->block));
232 } else {
233 /*
234 * Rebalance.
235 */
236 unsigned target_left = (nr_left + nr_right) / 2;
237 unsigned shift_ = nr_left - target_left;
238 BUG_ON(le32_to_cpu(left->header.max_entries) <= nr_left - shift_);
239 BUG_ON(le32_to_cpu(right->header.max_entries) <= nr_right + shift_);
240 shift(left, right, nr_left - target_left);
241 *key_ptr(parent, r->index) = right->keys[0];
242 }
243}
244
245static int rebalance2(struct shadow_spine *s, struct dm_btree_info *info,
246 unsigned left_index)
247{
248 int r;
249 struct node *parent;
250 struct child left, right;
251
252 parent = dm_block_data(shadow_current(s));
253
254 r = init_child(info, parent, left_index, &left);
255 if (r)
256 return r;
257
258 r = init_child(info, parent, left_index + 1, &right);
259 if (r) {
260 exit_child(info, &left);
261 return r;
262 }
263
264 __rebalance2(info, parent, &left, &right);
265
266 r = exit_child(info, &left);
267 if (r) {
268 exit_child(info, &right);
269 return r;
270 }
271
272 return exit_child(info, &right);
273}
274
275static void __rebalance3(struct dm_btree_info *info, struct node *parent,
276 struct child *l, struct child *c, struct child *r)
277{
278 struct node *left = l->n;
279 struct node *center = c->n;
280 struct node *right = r->n;
281
282 uint32_t nr_left = le32_to_cpu(left->header.nr_entries);
283 uint32_t nr_center = le32_to_cpu(center->header.nr_entries);
284 uint32_t nr_right = le32_to_cpu(right->header.nr_entries);
285 uint32_t max_entries = le32_to_cpu(left->header.max_entries);
286
287 unsigned target;
288
289 BUG_ON(left->header.max_entries != center->header.max_entries);
290 BUG_ON(center->header.max_entries != right->header.max_entries);
291
292 if (((nr_left + nr_center + nr_right) / 2) < merge_threshold(center)) {
293 /*
294 * Delete center node:
295 *
296 * We dump as many entries from center as possible into
297 * left, then the rest in right, then rebalance2. This
298 * wastes some cpu, but I want something simple atm.
299 */
300 unsigned shift = min(max_entries - nr_left, nr_center);
301
302 BUG_ON(nr_left + shift > max_entries);
303 node_copy(left, center, -shift);
304 left->header.nr_entries = cpu_to_le32(nr_left + shift);
305
306 if (shift != nr_center) {
307 shift = nr_center - shift;
308 BUG_ON((nr_right + shift) >= max_entries);
309 node_shift(right, shift);
310 node_copy(center, right, shift);
311 right->header.nr_entries = cpu_to_le32(nr_right + shift);
312 }
313 *key_ptr(parent, r->index) = right->keys[0];
314
315 delete_at(parent, c->index);
316 r->index--;
317
318 dm_tm_dec(info->tm, dm_block_location(c->block));
319 __rebalance2(info, parent, l, r);
320
321 return;
322 }
323
324 /*
325 * Rebalance
326 */
327 target = (nr_left + nr_center + nr_right) / 3;
328 BUG_ON(target > max_entries);
329
330 /*
331 * Adjust the left node
332 */
333 shift(left, center, nr_left - target);
334
335 /*
336 * Adjust the right node
337 */
338 shift(center, right, target - nr_right);
339 *key_ptr(parent, c->index) = center->keys[0];
340 *key_ptr(parent, r->index) = right->keys[0];
341}
342
343static int rebalance3(struct shadow_spine *s, struct dm_btree_info *info,
344 unsigned left_index)
345{
346 int r;
347 struct node *parent = dm_block_data(shadow_current(s));
348 struct child left, center, right;
349
350 /*
351 * FIXME: fill out an array?
352 */
353 r = init_child(info, parent, left_index, &left);
354 if (r)
355 return r;
356
357 r = init_child(info, parent, left_index + 1, &center);
358 if (r) {
359 exit_child(info, &left);
360 return r;
361 }
362
363 r = init_child(info, parent, left_index + 2, &right);
364 if (r) {
365 exit_child(info, &left);
366 exit_child(info, &center);
367 return r;
368 }
369
370 __rebalance3(info, parent, &left, &center, &right);
371
372 r = exit_child(info, &left);
373 if (r) {
374 exit_child(info, &center);
375 exit_child(info, &right);
376 return r;
377 }
378
379 r = exit_child(info, &center);
380 if (r) {
381 exit_child(info, &right);
382 return r;
383 }
384
385 r = exit_child(info, &right);
386 if (r)
387 return r;
388
389 return 0;
390}
391
392static int get_nr_entries(struct dm_transaction_manager *tm,
393 dm_block_t b, uint32_t *result)
394{
395 int r;
396 struct dm_block *block;
397 struct node *n;
398
399 r = dm_tm_read_lock(tm, b, &btree_node_validator, &block);
400 if (r)
401 return r;
402
403 n = dm_block_data(block);
404 *result = le32_to_cpu(n->header.nr_entries);
405
406 return dm_tm_unlock(tm, block);
407}
408
409static int rebalance_children(struct shadow_spine *s,
410 struct dm_btree_info *info, uint64_t key)
411{
412 int i, r, has_left_sibling, has_right_sibling;
413 uint32_t child_entries;
414 struct node *n;
415
416 n = dm_block_data(shadow_current(s));
417
418 if (le32_to_cpu(n->header.nr_entries) == 1) {
419 struct dm_block *child;
420 dm_block_t b = value64(n, 0);
421
422 r = dm_tm_read_lock(info->tm, b, &btree_node_validator, &child);
423 if (r)
424 return r;
425
426 memcpy(n, dm_block_data(child),
427 dm_bm_block_size(dm_tm_get_bm(info->tm)));
428 r = dm_tm_unlock(info->tm, child);
429 if (r)
430 return r;
431
432 dm_tm_dec(info->tm, dm_block_location(child));
433 return 0;
434 }
435
436 i = lower_bound(n, key);
437 if (i < 0)
438 return -ENODATA;
439
440 r = get_nr_entries(info->tm, value64(n, i), &child_entries);
441 if (r)
442 return r;
443
444 if (child_entries > del_threshold(n))
445 return 0;
446
447 has_left_sibling = i > 0;
448 has_right_sibling = i < (le32_to_cpu(n->header.nr_entries) - 1);
449
450 if (!has_left_sibling)
451 r = rebalance2(s, info, i);
452
453 else if (!has_right_sibling)
454 r = rebalance2(s, info, i - 1);
455
456 else
457 r = rebalance3(s, info, i - 1);
458
459 return r;
460}
461
462static int do_leaf(struct node *n, uint64_t key, unsigned *index)
463{
464 int i = lower_bound(n, key);
465
466 if ((i < 0) ||
467 (i >= le32_to_cpu(n->header.nr_entries)) ||
468 (le64_to_cpu(n->keys[i]) != key))
469 return -ENODATA;
470
471 *index = i;
472
473 return 0;
474}
475
476/*
477 * Prepares for removal from one level of the hierarchy. The caller must
478 * call delete_at() to remove the entry at index.
479 */
480static int remove_raw(struct shadow_spine *s, struct dm_btree_info *info,
481 struct dm_btree_value_type *vt, dm_block_t root,
482 uint64_t key, unsigned *index)
483{
484 int i = *index, r;
485 struct node *n;
486
487 for (;;) {
488 r = shadow_step(s, root, vt);
489 if (r < 0)
490 break;
491
492 /*
493 * We have to patch up the parent node, ugly, but I don't
494 * see a way to do this automatically as part of the spine
495 * op.
496 */
497 if (shadow_has_parent(s)) {
498 __le64 location = cpu_to_le64(dm_block_location(shadow_current(s)));
499 memcpy(value_ptr(dm_block_data(shadow_parent(s)), i, sizeof(__le64)),
500 &location, sizeof(__le64));
501 }
502
503 n = dm_block_data(shadow_current(s));
504
505 if (le32_to_cpu(n->header.flags) & LEAF_NODE)
506 return do_leaf(n, key, index);
507
508 r = rebalance_children(s, info, key);
509 if (r)
510 break;
511
512 n = dm_block_data(shadow_current(s));
513 if (le32_to_cpu(n->header.flags) & LEAF_NODE)
514 return do_leaf(n, key, index);
515
516 i = lower_bound(n, key);
517
518 /*
519 * We know the key is present, or else
520 * rebalance_children would have returned
521 * -ENODATA
522 */
523 root = value64(n, i);
524 }
525
526 return r;
527}
528
529int dm_btree_remove(struct dm_btree_info *info, dm_block_t root,
530 uint64_t *keys, dm_block_t *new_root)
531{
532 unsigned level, last_level = info->levels - 1;
533 int index = 0, r = 0;
534 struct shadow_spine spine;
535 struct node *n;
536
537 init_shadow_spine(&spine, info);
538 for (level = 0; level < info->levels; level++) {
539 r = remove_raw(&spine, info,
540 (level == last_level ?
541 &info->value_type : &le64_type),
542 root, keys[level], (unsigned *)&index);
543 if (r < 0)
544 break;
545
546 n = dm_block_data(shadow_current(&spine));
547 if (level != last_level) {
548 root = value64(n, index);
549 continue;
550 }
551
552 BUG_ON(index < 0 || index >= le32_to_cpu(n->header.nr_entries));
553
554 if (info->value_type.dec)
555 info->value_type.dec(info->value_type.context,
556 value_ptr(n, index, info->value_type.size));
557
558 delete_at(n, index);
559 }
560
561 *new_root = shadow_root(&spine);
562 exit_shadow_spine(&spine);
563
564 return r;
565}
566EXPORT_SYMBOL_GPL(dm_btree_remove);
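An editorial sketch of a caller of dm_btree_remove() (a two-level key layout is assumed; keys[] carries one key per level):

	static int demo_remove_mapping(struct dm_btree_info *info, dm_block_t root,
				       uint64_t dev, uint64_t block,
				       dm_block_t *new_root)
	{
		uint64_t keys[2] = { dev, block };	/* assumes info->levels == 2 */

		/*
		 * On success *new_root is the new (shadowed) root to commit;
		 * -ENODATA means the key was not present.
		 */
		return dm_btree_remove(info, root, keys, new_root);
	}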
diff --git a/drivers/md/persistent-data/dm-btree-spine.c b/drivers/md/persistent-data/dm-btree-spine.c
new file mode 100644
index 000000000000..d9a7912ee8ee
--- /dev/null
+++ b/drivers/md/persistent-data/dm-btree-spine.c
@@ -0,0 +1,244 @@
1/*
2 * Copyright (C) 2011 Red Hat, Inc.
3 *
4 * This file is released under the GPL.
5 */
6
7#include "dm-btree-internal.h"
8#include "dm-transaction-manager.h"
9
10#include <linux/device-mapper.h>
11
12#define DM_MSG_PREFIX "btree spine"
13
14/*----------------------------------------------------------------*/
15
16#define BTREE_CSUM_XOR 121107
17
18static int node_check(struct dm_block_validator *v,
19 struct dm_block *b,
20 size_t block_size);
21
22static void node_prepare_for_write(struct dm_block_validator *v,
23 struct dm_block *b,
24 size_t block_size)
25{
26 struct node *n = dm_block_data(b);
27 struct node_header *h = &n->header;
28
29 h->blocknr = cpu_to_le64(dm_block_location(b));
30 h->csum = cpu_to_le32(dm_bm_checksum(&h->flags,
31 block_size - sizeof(__le32),
32 BTREE_CSUM_XOR));
33
34 BUG_ON(node_check(v, b, 4096));
35}
36
37static int node_check(struct dm_block_validator *v,
38 struct dm_block *b,
39 size_t block_size)
40{
41 struct node *n = dm_block_data(b);
42 struct node_header *h = &n->header;
43 size_t value_size;
44 __le32 csum_disk;
45 uint32_t flags;
46
47 if (dm_block_location(b) != le64_to_cpu(h->blocknr)) {
48 DMERR("node_check failed blocknr %llu wanted %llu",
49 le64_to_cpu(h->blocknr), dm_block_location(b));
50 return -ENOTBLK;
51 }
52
53 csum_disk = cpu_to_le32(dm_bm_checksum(&h->flags,
54 block_size - sizeof(__le32),
55 BTREE_CSUM_XOR));
56 if (csum_disk != h->csum) {
57 DMERR("node_check failed csum %u wanted %u",
58 le32_to_cpu(csum_disk), le32_to_cpu(h->csum));
59 return -EILSEQ;
60 }
61
62 value_size = le32_to_cpu(h->value_size);
63
64 if (sizeof(struct node_header) +
65 (sizeof(__le64) + value_size) * le32_to_cpu(h->max_entries) > block_size) {
66 DMERR("node_check failed: max_entries too large");
67 return -EILSEQ;
68 }
69
70 if (le32_to_cpu(h->nr_entries) > le32_to_cpu(h->max_entries)) {
71 DMERR("node_check failed, too many entries");
72 return -EILSEQ;
73 }
74
75 /*
76 * The node must be either INTERNAL or LEAF.
77 */
78 flags = le32_to_cpu(h->flags);
79 if (!(flags & INTERNAL_NODE) && !(flags & LEAF_NODE)) {
80 DMERR("node_check failed: node is neither INTERNAL nor LEAF");
81 return -EILSEQ;
82 }
83
84 return 0;
85}
86
87struct dm_block_validator btree_node_validator = {
88 .name = "btree_node",
89 .prepare_for_write = node_prepare_for_write,
90 .check = node_check
91};
92
93/*----------------------------------------------------------------*/
94
95static int bn_read_lock(struct dm_btree_info *info, dm_block_t b,
96 struct dm_block **result)
97{
98 return dm_tm_read_lock(info->tm, b, &btree_node_validator, result);
99}
100
101static int bn_shadow(struct dm_btree_info *info, dm_block_t orig,
102 struct dm_btree_value_type *vt,
103 struct dm_block **result)
104{
105 int r, inc;
106
107 r = dm_tm_shadow_block(info->tm, orig, &btree_node_validator,
108 result, &inc);
109 if (!r && inc)
110 inc_children(info->tm, dm_block_data(*result), vt);
111
112 return r;
113}
114
115int new_block(struct dm_btree_info *info, struct dm_block **result)
116{
117 return dm_tm_new_block(info->tm, &btree_node_validator, result);
118}
119
120int unlock_block(struct dm_btree_info *info, struct dm_block *b)
121{
122 return dm_tm_unlock(info->tm, b);
123}
124
125/*----------------------------------------------------------------*/
126
127void init_ro_spine(struct ro_spine *s, struct dm_btree_info *info)
128{
129 s->info = info;
130 s->count = 0;
131 s->nodes[0] = NULL;
132 s->nodes[1] = NULL;
133}
134
135int exit_ro_spine(struct ro_spine *s)
136{
137 int r = 0, i;
138
139 for (i = 0; i < s->count; i++) {
140 int r2 = unlock_block(s->info, s->nodes[i]);
141 if (r2 < 0)
142 r = r2;
143 }
144
145 return r;
146}
147
148int ro_step(struct ro_spine *s, dm_block_t new_child)
149{
150 int r;
151
152 if (s->count == 2) {
153 r = unlock_block(s->info, s->nodes[0]);
154 if (r < 0)
155 return r;
156 s->nodes[0] = s->nodes[1];
157 s->count--;
158 }
159
160 r = bn_read_lock(s->info, new_child, s->nodes + s->count);
161 if (!r)
162 s->count++;
163
164 return r;
165}
166
167struct node *ro_node(struct ro_spine *s)
168{
169 struct dm_block *block;
170
171 BUG_ON(!s->count);
172 block = s->nodes[s->count - 1];
173
174 return dm_block_data(block);
175}
176
177/*----------------------------------------------------------------*/
178
179void init_shadow_spine(struct shadow_spine *s, struct dm_btree_info *info)
180{
181 s->info = info;
182 s->count = 0;
183}
184
185int exit_shadow_spine(struct shadow_spine *s)
186{
187 int r = 0, i;
188
189 for (i = 0; i < s->count; i++) {
190 int r2 = unlock_block(s->info, s->nodes[i]);
191 if (r2 < 0)
192 r = r2;
193 }
194
195 return r;
196}
197
198int shadow_step(struct shadow_spine *s, dm_block_t b,
199 struct dm_btree_value_type *vt)
200{
201 int r;
202
203 if (s->count == 2) {
204 r = unlock_block(s->info, s->nodes[0]);
205 if (r < 0)
206 return r;
207 s->nodes[0] = s->nodes[1];
208 s->count--;
209 }
210
211 r = bn_shadow(s->info, b, vt, s->nodes + s->count);
212 if (!r) {
213 if (!s->count)
214 s->root = dm_block_location(s->nodes[0]);
215
216 s->count++;
217 }
218
219 return r;
220}
221
222struct dm_block *shadow_current(struct shadow_spine *s)
223{
224 BUG_ON(!s->count);
225
226 return s->nodes[s->count - 1];
227}
228
229struct dm_block *shadow_parent(struct shadow_spine *s)
230{
231 BUG_ON(s->count != 2);
232
233 return s->count == 2 ? s->nodes[0] : NULL;
234}
235
236int shadow_has_parent(struct shadow_spine *s)
237{
238 return s->count >= 2;
239}
240
241int shadow_root(struct shadow_spine *s)
242{
243 return s->root;
244}
diff --git a/drivers/md/persistent-data/dm-btree.c b/drivers/md/persistent-data/dm-btree.c
new file mode 100644
index 000000000000..e0638be53ea4
--- /dev/null
+++ b/drivers/md/persistent-data/dm-btree.c
@@ -0,0 +1,805 @@
1/*
2 * Copyright (C) 2011 Red Hat, Inc.
3 *
4 * This file is released under the GPL.
5 */
6
7#include "dm-btree-internal.h"
8#include "dm-space-map.h"
9#include "dm-transaction-manager.h"
10
11#include <linux/module.h>
12#include <linux/device-mapper.h>
13
14#define DM_MSG_PREFIX "btree"
15
16/*----------------------------------------------------------------
17 * Array manipulation
18 *--------------------------------------------------------------*/
19static void memcpy_disk(void *dest, const void *src, size_t len)
20 __dm_written_to_disk(src)
21{
22 memcpy(dest, src, len);
23 __dm_unbless_for_disk(src);
24}
25
26static void array_insert(void *base, size_t elt_size, unsigned nr_elts,
27 unsigned index, void *elt)
28 __dm_written_to_disk(elt)
29{
30 if (index < nr_elts)
31 memmove(base + (elt_size * (index + 1)),
32 base + (elt_size * index),
33 (nr_elts - index) * elt_size);
34
35 memcpy_disk(base + (elt_size * index), elt, elt_size);
36}
37
38/*----------------------------------------------------------------*/
39
40/* makes the assumption that no two keys are the same. */
41static int bsearch(struct node *n, uint64_t key, int want_hi)
42{
43 int lo = -1, hi = le32_to_cpu(n->header.nr_entries);
44
45 while (hi - lo > 1) {
46 int mid = lo + ((hi - lo) / 2);
47 uint64_t mid_key = le64_to_cpu(n->keys[mid]);
48
49 if (mid_key == key)
50 return mid;
51
52 if (mid_key < key)
53 lo = mid;
54 else
55 hi = mid;
56 }
57
58 return want_hi ? hi : lo;
59}
60
61int lower_bound(struct node *n, uint64_t key)
62{
63 return bsearch(n, key, 0);
64}
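For example, with keys {10, 20, 30}: lower_bound() returns 1 for key 20 (an exact hit) and also 1 for key 25 (the index of the largest key below it), and returns -1 for key 5, which callers such as btree_lookup_raw() map to -ENODATA.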
65
66void inc_children(struct dm_transaction_manager *tm, struct node *n,
67 struct dm_btree_value_type *vt)
68{
69 unsigned i;
70 uint32_t nr_entries = le32_to_cpu(n->header.nr_entries);
71
72 if (le32_to_cpu(n->header.flags) & INTERNAL_NODE)
73 for (i = 0; i < nr_entries; i++)
74 dm_tm_inc(tm, value64(n, i));
75 else if (vt->inc)
76 for (i = 0; i < nr_entries; i++)
77 vt->inc(vt->context,
78 value_ptr(n, i, vt->size));
79}
80
81static int insert_at(size_t value_size, struct node *node, unsigned index,
82 uint64_t key, void *value)
83 __dm_written_to_disk(value)
84{
85 uint32_t nr_entries = le32_to_cpu(node->header.nr_entries);
86 __le64 key_le = cpu_to_le64(key);
87
88 if (index > nr_entries ||
89 index >= le32_to_cpu(node->header.max_entries)) {
90 DMERR("too many entries in btree node for insert");
91 __dm_unbless_for_disk(value);
92 return -ENOMEM;
93 }
94
95 __dm_bless_for_disk(&key_le);
96
97 array_insert(node->keys, sizeof(*node->keys), nr_entries, index, &key_le);
98 array_insert(value_base(node), value_size, nr_entries, index, value);
99 node->header.nr_entries = cpu_to_le32(nr_entries + 1);
100
101 return 0;
102}
103
104/*----------------------------------------------------------------*/
105
106/*
107 * We want 3n entries (for some n). This works more nicely for repeated
108 * insert/remove loops than (2n + 1).
109 */
110static uint32_t calc_max_entries(size_t value_size, size_t block_size)
111{
112 uint32_t total, n;
113 size_t elt_size = sizeof(uint64_t) + value_size; /* key + value */
114
115 block_size -= sizeof(struct node_header);
116 total = block_size / elt_size;
117 n = total / 3; /* rounds down */
118
119 return 3 * n;
120}
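Worked example: with 4096-byte blocks and 8-byte values, block_size becomes 4096 - 32 = 4064 after subtracting the node header, elt_size is 16, so total = 254, n = 84 and each node holds 3 * 84 = 252 entries.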
121
122int dm_btree_empty(struct dm_btree_info *info, dm_block_t *root)
123{
124 int r;
125 struct dm_block *b;
126 struct node *n;
127 size_t block_size;
128 uint32_t max_entries;
129
130 r = new_block(info, &b);
131 if (r < 0)
132 return r;
133
134 block_size = dm_bm_block_size(dm_tm_get_bm(info->tm));
135 max_entries = calc_max_entries(info->value_type.size, block_size);
136
137 n = dm_block_data(b);
138 memset(n, 0, block_size);
139 n->header.flags = cpu_to_le32(LEAF_NODE);
140 n->header.nr_entries = cpu_to_le32(0);
141 n->header.max_entries = cpu_to_le32(max_entries);
142 n->header.value_size = cpu_to_le32(info->value_type.size);
143
144 *root = dm_block_location(b);
145 return unlock_block(info, b);
146}
147EXPORT_SYMBOL_GPL(dm_btree_empty);
148
149/*----------------------------------------------------------------*/
150
151/*
152 * Deletion uses a recursive algorithm; since we have limited stack
153 * space we explicitly manage our own stack on the heap.
154 */
155#define MAX_SPINE_DEPTH 64
156struct frame {
157 struct dm_block *b;
158 struct node *n;
159 unsigned level;
160 unsigned nr_children;
161 unsigned current_child;
162};
163
164struct del_stack {
165 struct dm_transaction_manager *tm;
166 int top;
167 struct frame spine[MAX_SPINE_DEPTH];
168};
169
170static int top_frame(struct del_stack *s, struct frame **f)
171{
172 if (s->top < 0) {
173 DMERR("btree deletion stack empty");
174 return -EINVAL;
175 }
176
177 *f = s->spine + s->top;
178
179 return 0;
180}
181
182static int unprocessed_frames(struct del_stack *s)
183{
184 return s->top >= 0;
185}
186
187static int push_frame(struct del_stack *s, dm_block_t b, unsigned level)
188{
189 int r;
190 uint32_t ref_count;
191
192 if (s->top >= MAX_SPINE_DEPTH - 1) {
193 DMERR("btree deletion stack out of memory");
194 return -ENOMEM;
195 }
196
197 r = dm_tm_ref(s->tm, b, &ref_count);
198 if (r)
199 return r;
200
201 if (ref_count > 1)
202 /*
202 * This is a shared node, so we can just decrement its
204 * reference counter and leave the children.
205 */
206 dm_tm_dec(s->tm, b);
207
208 else {
209 struct frame *f = s->spine + ++s->top;
210
211 r = dm_tm_read_lock(s->tm, b, &btree_node_validator, &f->b);
212 if (r) {
213 s->top--;
214 return r;
215 }
216
217 f->n = dm_block_data(f->b);
218 f->level = level;
219 f->nr_children = le32_to_cpu(f->n->header.nr_entries);
220 f->current_child = 0;
221 }
222
223 return 0;
224}
225
226static void pop_frame(struct del_stack *s)
227{
228 struct frame *f = s->spine + s->top--;
229
230 dm_tm_dec(s->tm, dm_block_location(f->b));
231 dm_tm_unlock(s->tm, f->b);
232}
233
234int dm_btree_del(struct dm_btree_info *info, dm_block_t root)
235{
236 int r;
237 struct del_stack *s;
238
239 s = kmalloc(sizeof(*s), GFP_KERNEL);
240 if (!s)
241 return -ENOMEM;
242 s->tm = info->tm;
243 s->top = -1;
244
245 r = push_frame(s, root, 1);
246 if (r)
247 goto out;
248
249 while (unprocessed_frames(s)) {
250 uint32_t flags;
251 struct frame *f;
252 dm_block_t b;
253
254 r = top_frame(s, &f);
255 if (r)
256 goto out;
257
258 if (f->current_child >= f->nr_children) {
259 pop_frame(s);
260 continue;
261 }
262
263 flags = le32_to_cpu(f->n->header.flags);
264 if (flags & INTERNAL_NODE) {
265 b = value64(f->n, f->current_child);
266 f->current_child++;
267 r = push_frame(s, b, f->level);
268 if (r)
269 goto out;
270
271 } else if (f->level != (info->levels - 1)) {
272 b = value64(f->n, f->current_child);
273 f->current_child++;
274 r = push_frame(s, b, f->level + 1);
275 if (r)
276 goto out;
277
278 } else {
279 if (info->value_type.dec) {
280 unsigned i;
281
282 for (i = 0; i < f->nr_children; i++)
283 info->value_type.dec(info->value_type.context,
284 value_ptr(f->n, i, info->value_type.size));
285 }
286 f->current_child = f->nr_children;
287 }
288 }
289
290out:
291 kfree(s);
292 return r;
293}
294EXPORT_SYMBOL_GPL(dm_btree_del);
295
296/*----------------------------------------------------------------*/
297
298static int btree_lookup_raw(struct ro_spine *s, dm_block_t block, uint64_t key,
299 int (*search_fn)(struct node *, uint64_t),
300 uint64_t *result_key, void *v, size_t value_size)
301{
302 int i, r;
303 uint32_t flags, nr_entries;
304
305 do {
306 r = ro_step(s, block);
307 if (r < 0)
308 return r;
309
310 i = search_fn(ro_node(s), key);
311
312 flags = le32_to_cpu(ro_node(s)->header.flags);
313 nr_entries = le32_to_cpu(ro_node(s)->header.nr_entries);
314 if (i < 0 || i >= nr_entries)
315 return -ENODATA;
316
317 if (flags & INTERNAL_NODE)
318 block = value64(ro_node(s), i);
319
320 } while (!(flags & LEAF_NODE));
321
322 *result_key = le64_to_cpu(ro_node(s)->keys[i]);
323 memcpy(v, value_ptr(ro_node(s), i, value_size), value_size);
324
325 return 0;
326}
327
328int dm_btree_lookup(struct dm_btree_info *info, dm_block_t root,
329 uint64_t *keys, void *value_le)
330{
331 unsigned level, last_level = info->levels - 1;
332 int r = -ENODATA;
333 uint64_t rkey;
334 __le64 internal_value_le;
335 struct ro_spine spine;
336
337 init_ro_spine(&spine, info);
338 for (level = 0; level < info->levels; level++) {
339 size_t size;
340 void *value_p;
341
342 if (level == last_level) {
343 value_p = value_le;
344 size = info->value_type.size;
345
346 } else {
347 value_p = &internal_value_le;
348 size = sizeof(uint64_t);
349 }
350
351 r = btree_lookup_raw(&spine, root, keys[level],
352 lower_bound, &rkey,
353 value_p, size);
354
355 if (!r) {
356 if (rkey != keys[level]) {
357 exit_ro_spine(&spine);
358 return -ENODATA;
359 }
360 } else {
361 exit_ro_spine(&spine);
362 return r;
363 }
364
365 root = le64_to_cpu(internal_value_le);
366 }
367 exit_ro_spine(&spine);
368
369 return r;
370}
371EXPORT_SYMBOL_GPL(dm_btree_lookup);
372
373/*
374 * Splits a node by creating a sibling node and shifting half the node's
375 * contents across. Assumes there is a parent node, and it has room for
376 * another child.
377 *
378 * Before:
379 * +--------+
380 * | Parent |
381 * +--------+
382 * |
383 * v
384 * +----------+
385 * | A ++++++ |
386 * +----------+
387 *
388 *
389 * After:
390 * +--------+
391 * | Parent |
392 * +--------+
393 * | |
394 * v +------+
395 * +---------+ |
396 * | A* +++ | v
397 * +---------+ +-------+
398 * | B +++ |
399 * +-------+
400 *
401 * Where A* is a shadow of A.
402 */
403static int btree_split_sibling(struct shadow_spine *s, dm_block_t root,
404 unsigned parent_index, uint64_t key)
405{
406 int r;
407 size_t size;
408 unsigned nr_left, nr_right;
409 struct dm_block *left, *right, *parent;
410 struct node *ln, *rn, *pn;
411 __le64 location;
412
413 left = shadow_current(s);
414
415 r = new_block(s->info, &right);
416 if (r < 0)
417 return r;
418
419 ln = dm_block_data(left);
420 rn = dm_block_data(right);
421
422 nr_left = le32_to_cpu(ln->header.nr_entries) / 2;
423 nr_right = le32_to_cpu(ln->header.nr_entries) - nr_left;
424
425 ln->header.nr_entries = cpu_to_le32(nr_left);
426
427 rn->header.flags = ln->header.flags;
428 rn->header.nr_entries = cpu_to_le32(nr_right);
429 rn->header.max_entries = ln->header.max_entries;
430 rn->header.value_size = ln->header.value_size;
431 memcpy(rn->keys, ln->keys + nr_left, nr_right * sizeof(rn->keys[0]));
432
433 size = le32_to_cpu(ln->header.flags) & INTERNAL_NODE ?
434 sizeof(uint64_t) : s->info->value_type.size;
435 memcpy(value_ptr(rn, 0, size), value_ptr(ln, nr_left, size),
436 size * nr_right);
437
438 /*
439 * Patch up the parent
440 */
441 parent = shadow_parent(s);
442
443 pn = dm_block_data(parent);
444 location = cpu_to_le64(dm_block_location(left));
445 __dm_bless_for_disk(&location);
446 memcpy_disk(value_ptr(pn, parent_index, sizeof(__le64)),
447 &location, sizeof(__le64));
448
449 location = cpu_to_le64(dm_block_location(right));
450 __dm_bless_for_disk(&location);
451
452 r = insert_at(sizeof(__le64), pn, parent_index + 1,
453 le64_to_cpu(rn->keys[0]), &location);
454 if (r)
455 return r;
456
457 if (key < le64_to_cpu(rn->keys[0])) {
458 unlock_block(s->info, right);
459 s->nodes[1] = left;
460 } else {
461 unlock_block(s->info, left);
462 s->nodes[1] = right;
463 }
464
465 return 0;
466}
467
468/*
469 * Splits a node by creating two new children beneath the given node.
470 *
471 * Before:
472 * +----------+
473 * | A ++++++ |
474 * +----------+
475 *
476 *
477 * After:
478 * +------------+
479 * | A (shadow) |
480 * +------------+
481 * | |
482 * +------+ +----+
483 * | |
484 * v v
485 * +-------+ +-------+
486 * | B +++ | | C +++ |
487 * +-------+ +-------+
488 */
489static int btree_split_beneath(struct shadow_spine *s, uint64_t key)
490{
491 int r;
492 size_t size;
493 unsigned nr_left, nr_right;
494 struct dm_block *left, *right, *new_parent;
495 struct node *pn, *ln, *rn;
496 __le64 val;
497
498 new_parent = shadow_current(s);
499
500 r = new_block(s->info, &left);
501 if (r < 0)
502 return r;
503
504 r = new_block(s->info, &right);
505 if (r < 0) {
506 unlock_block(s->info, left);
507 return r;
508 }
509
510 pn = dm_block_data(new_parent);
511 ln = dm_block_data(left);
512 rn = dm_block_data(right);
513
514 nr_left = le32_to_cpu(pn->header.nr_entries) / 2;
515 nr_right = le32_to_cpu(pn->header.nr_entries) - nr_left;
516
517 ln->header.flags = pn->header.flags;
518 ln->header.nr_entries = cpu_to_le32(nr_left);
519 ln->header.max_entries = pn->header.max_entries;
520 ln->header.value_size = pn->header.value_size;
521
522 rn->header.flags = pn->header.flags;
523 rn->header.nr_entries = cpu_to_le32(nr_right);
524 rn->header.max_entries = pn->header.max_entries;
525 rn->header.value_size = pn->header.value_size;
526
527 memcpy(ln->keys, pn->keys, nr_left * sizeof(pn->keys[0]));
528 memcpy(rn->keys, pn->keys + nr_left, nr_right * sizeof(pn->keys[0]));
529
530 size = le32_to_cpu(pn->header.flags) & INTERNAL_NODE ?
531 sizeof(__le64) : s->info->value_type.size;
532 memcpy(value_ptr(ln, 0, size), value_ptr(pn, 0, size), nr_left * size);
533 memcpy(value_ptr(rn, 0, size), value_ptr(pn, nr_left, size),
534 nr_right * size);
535
536 /* new_parent should just point to l and r now */
537 pn->header.flags = cpu_to_le32(INTERNAL_NODE);
538 pn->header.nr_entries = cpu_to_le32(2);
539 pn->header.max_entries = cpu_to_le32(
540 calc_max_entries(sizeof(__le64),
541 dm_bm_block_size(
542 dm_tm_get_bm(s->info->tm))));
543 pn->header.value_size = cpu_to_le32(sizeof(__le64));
544
545 val = cpu_to_le64(dm_block_location(left));
546 __dm_bless_for_disk(&val);
547 pn->keys[0] = ln->keys[0];
548 memcpy_disk(value_ptr(pn, 0, sizeof(__le64)), &val, sizeof(__le64));
549
550 val = cpu_to_le64(dm_block_location(right));
551 __dm_bless_for_disk(&val);
552 pn->keys[1] = rn->keys[0];
553 memcpy_disk(value_ptr(pn, 1, sizeof(__le64)), &val, sizeof(__le64));
554
555 /*
556 * Rejig the spine. This is ugly, since it knows too
557 * much about the spine.
558 */
559 if (s->nodes[0] != new_parent) {
560 unlock_block(s->info, s->nodes[0]);
561 s->nodes[0] = new_parent;
562 }
563 if (key < le64_to_cpu(rn->keys[0])) {
564 unlock_block(s->info, right);
565 s->nodes[1] = left;
566 } else {
567 unlock_block(s->info, left);
568 s->nodes[1] = right;
569 }
570 s->count = 2;
571
572 return 0;
573}
574
575static int btree_insert_raw(struct shadow_spine *s, dm_block_t root,
576 struct dm_btree_value_type *vt,
577 uint64_t key, unsigned *index)
578{
579 int r, i = *index, top = 1;
580 struct node *node;
581
582 for (;;) {
583 r = shadow_step(s, root, vt);
584 if (r < 0)
585 return r;
586
587 node = dm_block_data(shadow_current(s));
588
589 /*
590 * We have to patch up the parent node; it's ugly, but I don't
591 * see a way to do this automatically as part of the spine
592 * op.
593 */
594 if (shadow_has_parent(s) && i >= 0) { /* FIXME: second clause unnecessary. */
595 __le64 location = cpu_to_le64(dm_block_location(shadow_current(s)));
596
597 __dm_bless_for_disk(&location);
598 memcpy_disk(value_ptr(dm_block_data(shadow_parent(s)), i, sizeof(uint64_t)),
599 &location, sizeof(__le64));
600 }
601
602 node = dm_block_data(shadow_current(s));
603
604 if (node->header.nr_entries == node->header.max_entries) {
605 if (top)
606 r = btree_split_beneath(s, key);
607 else
608 r = btree_split_sibling(s, root, i, key);
609
610 if (r < 0)
611 return r;
612 }
613
614 node = dm_block_data(shadow_current(s));
615
616 i = lower_bound(node, key);
617
618 if (le32_to_cpu(node->header.flags) & LEAF_NODE)
619 break;
620
621 if (i < 0) {
622 /* change the bounds on the lowest key */
623 node->keys[0] = cpu_to_le64(key);
624 i = 0;
625 }
626
627 root = value64(node, i);
628 top = 0;
629 }
630
631 if (i < 0 || le64_to_cpu(node->keys[i]) != key)
632 i++;
633
634 *index = i;
635 return 0;
636}
637
638static int insert(struct dm_btree_info *info, dm_block_t root,
639 uint64_t *keys, void *value, dm_block_t *new_root,
640 int *inserted)
641 __dm_written_to_disk(value)
642{
643 int r, need_insert;
644 unsigned level, index = -1, last_level = info->levels - 1;
645 dm_block_t block = root;
646 struct shadow_spine spine;
647 struct node *n;
648 struct dm_btree_value_type le64_type;
649
650 le64_type.context = NULL;
651 le64_type.size = sizeof(__le64);
652 le64_type.inc = NULL;
653 le64_type.dec = NULL;
654 le64_type.equal = NULL;
655
656 init_shadow_spine(&spine, info);
657
658 for (level = 0; level < (info->levels - 1); level++) {
659 r = btree_insert_raw(&spine, block, &le64_type, keys[level], &index);
660 if (r < 0)
661 goto bad;
662
663 n = dm_block_data(shadow_current(&spine));
664 need_insert = ((index >= le32_to_cpu(n->header.nr_entries)) ||
665 (le64_to_cpu(n->keys[index]) != keys[level]));
666
667 if (need_insert) {
668 dm_block_t new_tree;
669 __le64 new_le;
670
671 r = dm_btree_empty(info, &new_tree);
672 if (r < 0)
673 goto bad;
674
675 new_le = cpu_to_le64(new_tree);
676 __dm_bless_for_disk(&new_le);
677
678 r = insert_at(sizeof(uint64_t), n, index,
679 keys[level], &new_le);
680 if (r)
681 goto bad;
682 }
683
684 if (level < last_level)
685 block = value64(n, index);
686 }
687
688 r = btree_insert_raw(&spine, block, &info->value_type,
689 keys[level], &index);
690 if (r < 0)
691 goto bad;
692
693 n = dm_block_data(shadow_current(&spine));
694 need_insert = ((index >= le32_to_cpu(n->header.nr_entries)) ||
695 (le64_to_cpu(n->keys[index]) != keys[level]));
696
697 if (need_insert) {
698 if (inserted)
699 *inserted = 1;
700
701 r = insert_at(info->value_type.size, n, index,
702 keys[level], value);
703 if (r)
704 goto bad_unblessed;
705 } else {
706 if (inserted)
707 *inserted = 0;
708
709 if (info->value_type.dec &&
710 (!info->value_type.equal ||
711 !info->value_type.equal(
712 info->value_type.context,
713 value_ptr(n, index, info->value_type.size),
714 value))) {
715 info->value_type.dec(info->value_type.context,
716 value_ptr(n, index, info->value_type.size));
717 }
718 memcpy_disk(value_ptr(n, index, info->value_type.size),
719 value, info->value_type.size);
720 }
721
722 *new_root = shadow_root(&spine);
723 exit_shadow_spine(&spine);
724
725 return 0;
726
727bad:
728 __dm_unbless_for_disk(value);
729bad_unblessed:
730 exit_shadow_spine(&spine);
731 return r;
732}
733
734int dm_btree_insert(struct dm_btree_info *info, dm_block_t root,
735 uint64_t *keys, void *value, dm_block_t *new_root)
736 __dm_written_to_disk(value)
737{
738 return insert(info, root, keys, value, new_root, NULL);
739}
740EXPORT_SYMBOL_GPL(dm_btree_insert);
741
742int dm_btree_insert_notify(struct dm_btree_info *info, dm_block_t root,
743 uint64_t *keys, void *value, dm_block_t *new_root,
744 int *inserted)
745 __dm_written_to_disk(value)
746{
747 return insert(info, root, keys, value, new_root, inserted);
748}
749EXPORT_SYMBOL_GPL(dm_btree_insert_notify);
750
751/*----------------------------------------------------------------*/
752
753static int find_highest_key(struct ro_spine *s, dm_block_t block,
754 uint64_t *result_key, dm_block_t *next_block)
755{
756 int i, r;
757 uint32_t flags;
758
759 do {
760 r = ro_step(s, block);
761 if (r < 0)
762 return r;
763
764 flags = le32_to_cpu(ro_node(s)->header.flags);
765 i = le32_to_cpu(ro_node(s)->header.nr_entries);
766 if (!i)
767 return -ENODATA;
768 else
769 i--;
770
771 *result_key = le64_to_cpu(ro_node(s)->keys[i]);
772 if (next_block || flags & INTERNAL_NODE)
773 block = value64(ro_node(s), i);
774
775 } while (flags & INTERNAL_NODE);
776
777 if (next_block)
778 *next_block = block;
779 return 0;
780}
781
782int dm_btree_find_highest_key(struct dm_btree_info *info, dm_block_t root,
783 uint64_t *result_keys)
784{
785 int r = 0, count = 0, level;
786 struct ro_spine spine;
787
788 init_ro_spine(&spine, info);
789 for (level = 0; level < info->levels; level++) {
790 r = find_highest_key(&spine, root, result_keys + level,
791 level == info->levels - 1 ? NULL : &root);
792 if (r == -ENODATA) {
793 r = 0;
794 break;
795
796 } else if (r)
797 break;
798
799 count++;
800 }
801 exit_ro_spine(&spine);
802
803 return r ? r : count;
804}
805EXPORT_SYMBOL_GPL(dm_btree_find_highest_key);
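
For illustration only, a minimal sketch of how a target might drive the
btree API above, assuming an existing transaction manager 'tm'; the
function name is hypothetical and error handling is abbreviated:

static int example_btree_usage(struct dm_transaction_manager *tm)
{
	int r;
	dm_block_t root;
	uint64_t key = 42;
	__le64 value = cpu_to_le64(1234), result;
	struct dm_btree_info info;

	memset(&info, 0, sizeof(info));
	info.tm = tm;
	info.levels = 1;			/* a single, non-nested tree */
	info.value_type.size = sizeof(__le64);	/* no inc/dec/equal hooks */

	r = dm_btree_empty(&info, &root);	/* O(1): one empty leaf */
	if (r)
		return r;

	__dm_bless_for_disk(&value);		/* value is little-endian */
	r = dm_btree_insert(&info, root, &key, &value, &root);
	if (r)
		return r;

	/* the looked-up value comes back little-endian, as stored */
	return dm_btree_lookup(&info, root, &key, &result);
}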
diff --git a/drivers/md/persistent-data/dm-btree.h b/drivers/md/persistent-data/dm-btree.h
new file mode 100644
index 000000000000..ae02c84410ff
--- /dev/null
+++ b/drivers/md/persistent-data/dm-btree.h
@@ -0,0 +1,145 @@
1/*
2 * Copyright (C) 2011 Red Hat, Inc.
3 *
4 * This file is released under the GPL.
5 */
6#ifndef _LINUX_DM_BTREE_H
7#define _LINUX_DM_BTREE_H
8
9#include "dm-block-manager.h"
10
11struct dm_transaction_manager;
12
13/*----------------------------------------------------------------*/
14
15/*
16 * Annotations used to check on-disk metadata is handled as little-endian.
17 */
18#ifdef __CHECKER__
19# define __dm_written_to_disk(x) __releases(x)
20# define __dm_reads_from_disk(x) __acquires(x)
21# define __dm_bless_for_disk(x) __acquire(x)
22# define __dm_unbless_for_disk(x) __release(x)
23#else
24# define __dm_written_to_disk(x)
25# define __dm_reads_from_disk(x)
26# define __dm_bless_for_disk(x)
27# define __dm_unbless_for_disk(x)
28#endif
29
30/*----------------------------------------------------------------*/
31
32/*
33 * Manipulates hierarchical B+ trees with 64-bit keys and arbitrary-sized
34 * values.
35 */
36
37/*
38 * Information about the values stored within the btree.
39 */
40struct dm_btree_value_type {
41 void *context;
42
43 /*
44 * The size in bytes of each value.
45 */
46 uint32_t size;
47
48 /*
49 * Any of these methods can be safely set to NULL if you do not
50 * need the corresponding feature.
51 */
52
53 /*
54 * The btree is making a duplicate of the value, for instance
55 * because previously-shared btree nodes have now diverged.
56 * @value argument is the new copy that the copy function may modify.
57 * (Probably it just wants to increment a reference count
58 * somewhere.) This method is _not_ called for insertion of a new
59 * value: It is assumed the ref count is already 1.
60 */
61 void (*inc)(void *context, void *value);
62
63 /*
64 * This value is being deleted. The btree takes care of freeing
65 * the memory pointed to by @value. Often the dec method just
66 * needs to decrement a reference count somewhere.
67 */
68 void (*dec)(void *context, void *value);
69
70 /*
71 * A test for equality between two values. When a value is
72 * overwritten with a new one, the old one has the dec method
73 * called _unless_ the new and old value are deemed equal.
74 */
75 int (*equal)(void *context, void *value1, void *value2);
76};
77
78/*
79 * The shape and contents of a btree.
80 */
81struct dm_btree_info {
82 struct dm_transaction_manager *tm;
83
84 /*
85 * Number of nested btrees. (Not the depth of a single tree.)
86 */
87 unsigned levels;
88 struct dm_btree_value_type value_type;
89};
90
91/*
92 * Set up an empty tree. O(1).
93 */
94int dm_btree_empty(struct dm_btree_info *info, dm_block_t *root);
95
96/*
97 * Delete a tree. O(n) - this is the slow one! It can also block, so
98 * please don't call it on an IO path.
99 */
100int dm_btree_del(struct dm_btree_info *info, dm_block_t root);
101
102/*
103 * All the lookup functions return -ENODATA if the key cannot be found.
104 */
105
106/*
107 * Tries to find a key that matches exactly. O(ln(n))
108 */
109int dm_btree_lookup(struct dm_btree_info *info, dm_block_t root,
110 uint64_t *keys, void *value_le);
111
112/*
113 * Insertion (or overwrite an existing value). O(ln(n))
114 */
115int dm_btree_insert(struct dm_btree_info *info, dm_block_t root,
116 uint64_t *keys, void *value, dm_block_t *new_root)
117 __dm_written_to_disk(value);
118
119/*
120 * A variant of insert that indicates whether it actually inserted or just
121 * overwrote. Useful if you're keeping track of the number of entries in a
122 * tree.
123 */
124int dm_btree_insert_notify(struct dm_btree_info *info, dm_block_t root,
125 uint64_t *keys, void *value, dm_block_t *new_root,
126 int *inserted)
127 __dm_written_to_disk(value);
128
129/*
130 * Remove a key if present. This doesn't remove empty subtrees. Normally
131 * subtrees represent a separate entity, like a snapshot map, so this is
132 * correct behaviour. O(ln(n)).
133 */
134int dm_btree_remove(struct dm_btree_info *info, dm_block_t root,
135 uint64_t *keys, dm_block_t *new_root);
136
137/*
138 * Returns < 0 on failure. Otherwise the number of key entries that have
139 * been filled out. Remember trees can have zero entries, and as such have
140 * no highest key.
141 */
142int dm_btree_find_highest_key(struct dm_btree_info *info, dm_block_t root,
143 uint64_t *result_keys);
144
145#endif /* _LINUX_DM_BTREE_H */
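
To make the inc/dec/equal hooks concrete, here is a hypothetical value
type for a btree whose values are __le64 block pointers with reference
counts kept in a space map (dm_sm_inc_block() and dm_sm_dec_block() come
from dm-space-map.h).  A sketch, not part of the patch:

static void block_inc(void *context, void *value_le)
{
	__le64 v;
	struct dm_space_map *sm = context;

	/* values inside a node may be unaligned, so copy out first */
	memcpy(&v, value_le, sizeof(v));
	dm_sm_inc_block(sm, le64_to_cpu(v));	/* sketch ignores errors */
}

static void block_dec(void *context, void *value_le)
{
	__le64 v;
	struct dm_space_map *sm = context;

	memcpy(&v, value_le, sizeof(v));
	dm_sm_dec_block(sm, le64_to_cpu(v));
}

static int block_equal(void *context, void *value1_le, void *value2_le)
{
	return !memcmp(value1_le, value2_le, sizeof(__le64));
}

Wired up via dm_btree_info:

	info.value_type.context = sm;
	info.value_type.size = sizeof(__le64);
	info.value_type.inc = block_inc;
	info.value_type.dec = block_dec;
	info.value_type.equal = block_equal;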
diff --git a/drivers/md/persistent-data/dm-persistent-data-internal.h b/drivers/md/persistent-data/dm-persistent-data-internal.h
new file mode 100644
index 000000000000..c49e26fff36c
--- /dev/null
+++ b/drivers/md/persistent-data/dm-persistent-data-internal.h
@@ -0,0 +1,19 @@
1/*
2 * Copyright (C) 2011 Red Hat, Inc.
3 *
4 * This file is released under the GPL.
5 */
6
7#ifndef _DM_PERSISTENT_DATA_INTERNAL_H
8#define _DM_PERSISTENT_DATA_INTERNAL_H
9
10#include "dm-block-manager.h"
11
12static inline unsigned dm_hash_block(dm_block_t b, unsigned hash_mask)
13{
14 const unsigned BIG_PRIME = 4294967291UL;
15
16 return (((unsigned) b) * BIG_PRIME) & hash_mask;
17}
18
19#endif /* _DM_PERSISTENT_DATA_INTERNAL_H */
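
A usage note, for illustration: dm_hash_block() assumes hash_mask is one
less than a power-of-two table size (the values below are hypothetical):

	#define EXAMPLE_HASH_SIZE 64			/* power of two */
	#define EXAMPLE_HASH_MASK (EXAMPLE_HASH_SIZE - 1)

	unsigned bucket = dm_hash_block(b, EXAMPLE_HASH_MASK);	/* 0..63 */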
diff --git a/drivers/md/persistent-data/dm-space-map-checker.c b/drivers/md/persistent-data/dm-space-map-checker.c
new file mode 100644
index 000000000000..bb44a937fe63
--- /dev/null
+++ b/drivers/md/persistent-data/dm-space-map-checker.c
@@ -0,0 +1,437 @@
1/*
2 * Copyright (C) 2011 Red Hat, Inc.
3 *
4 * This file is released under the GPL.
5 */
6
7#include "dm-space-map-checker.h"
8
9#include <linux/device-mapper.h>
10
11#ifdef CONFIG_DM_DEBUG_SPACE_MAPS
12
13#define DM_MSG_PREFIX "space map checker"
14
15/*----------------------------------------------------------------*/
16
17struct count_array {
18 dm_block_t nr;
19 dm_block_t nr_free;
20
21 uint32_t *counts;
22};
23
24static int ca_get_count(struct count_array *ca, dm_block_t b, uint32_t *count)
25{
26 if (b >= ca->nr)
27 return -EINVAL;
28
29 *count = ca->counts[b];
30 return 0;
31}
32
33static int ca_count_more_than_one(struct count_array *ca, dm_block_t b, int *r)
34{
35 if (b >= ca->nr)
36 return -EINVAL;
37
38 *r = ca->counts[b] > 1;
39 return 0;
40}
41
42static int ca_set_count(struct count_array *ca, dm_block_t b, uint32_t count)
43{
44 uint32_t old_count;
45
46 if (b >= ca->nr)
47 return -EINVAL;
48
49 old_count = ca->counts[b];
50
51 if (!count && old_count)
52 ca->nr_free++;
53
54 else if (count && !old_count)
55 ca->nr_free--;
56
57 ca->counts[b] = count;
58 return 0;
59}
60
61static int ca_inc_block(struct count_array *ca, dm_block_t b)
62{
63 if (b >= ca->nr)
64 return -EINVAL;
65
66 ca_set_count(ca, b, ca->counts[b] + 1);
67 return 0;
68}
69
70static int ca_dec_block(struct count_array *ca, dm_block_t b)
71{
72 if (b >= ca->nr)
73 return -EINVAL;
74
75 BUG_ON(ca->counts[b] == 0);
76 ca_set_count(ca, b, ca->counts[b] - 1);
77 return 0;
78}
79
80static int ca_create(struct count_array *ca, struct dm_space_map *sm)
81{
82 int r;
83 dm_block_t nr_blocks;
84
85 r = dm_sm_get_nr_blocks(sm, &nr_blocks);
86 if (r)
87 return r;
88
89 ca->nr = nr_blocks;
90 ca->nr_free = nr_blocks;
91 ca->counts = kzalloc(sizeof(*ca->counts) * nr_blocks, GFP_KERNEL);
92 if (!ca->counts)
93 return -ENOMEM;
94
95 return 0;
96}
97
98static int ca_load(struct count_array *ca, struct dm_space_map *sm)
99{
100 int r;
101 uint32_t count;
102 dm_block_t nr_blocks, i;
103
104 r = dm_sm_get_nr_blocks(sm, &nr_blocks);
105 if (r)
106 return r;
107
108 BUG_ON(ca->nr != nr_blocks);
109
110 DMWARN("Loading debug space map from disk. This may take some time");
111 for (i = 0; i < nr_blocks; i++) {
112 r = dm_sm_get_count(sm, i, &count);
113 if (r) {
114 DMERR("load failed");
115 return r;
116 }
117
118 ca_set_count(ca, i, count);
119 }
120 DMWARN("Load complete");
121
122 return 0;
123}
124
125static int ca_extend(struct count_array *ca, dm_block_t extra_blocks)
126{
127 dm_block_t nr_blocks = ca->nr + extra_blocks;
128 uint32_t *counts = kzalloc(sizeof(*counts) * nr_blocks, GFP_KERNEL);
129 if (!counts)
130 return -ENOMEM;
131
132 memcpy(counts, ca->counts, sizeof(*counts) * ca->nr);
133 kfree(ca->counts);
134 ca->nr = nr_blocks;
135 ca->nr_free += extra_blocks;
136 ca->counts = counts;
137 return 0;
138}
139
140static int ca_commit(struct count_array *old, struct count_array *new)
141{
142 if (old->nr != new->nr) {
143 BUG_ON(old->nr > new->nr);
144 if (ca_extend(old, new->nr - old->nr))
145 return -ENOMEM;
146 }
147 BUG_ON(old->nr != new->nr);
148 old->nr_free = new->nr_free;
149 memcpy(old->counts, new->counts, sizeof(*old->counts) * old->nr);
150 return 0;
151}
152
153static void ca_destroy(struct count_array *ca)
154{
155 kfree(ca->counts);
156}
157
158/*----------------------------------------------------------------*/
159
160struct sm_checker {
161 struct dm_space_map sm;
162
163 struct count_array old_counts;
164 struct count_array counts;
165
166 struct dm_space_map *real_sm;
167};
168
169static void sm_checker_destroy(struct dm_space_map *sm)
170{
171 struct sm_checker *smc = container_of(sm, struct sm_checker, sm);
172
173 dm_sm_destroy(smc->real_sm);
174 ca_destroy(&smc->old_counts);
175 ca_destroy(&smc->counts);
176 kfree(smc);
177}
178
179static int sm_checker_get_nr_blocks(struct dm_space_map *sm, dm_block_t *count)
180{
181 struct sm_checker *smc = container_of(sm, struct sm_checker, sm);
182 int r = dm_sm_get_nr_blocks(smc->real_sm, count);
183 if (!r)
184 BUG_ON(smc->old_counts.nr != *count);
185 return r;
186}
187
188static int sm_checker_get_nr_free(struct dm_space_map *sm, dm_block_t *count)
189{
190 struct sm_checker *smc = container_of(sm, struct sm_checker, sm);
191 int r = dm_sm_get_nr_free(smc->real_sm, count);
192 if (!r) {
193 /*
194 * Slow, but we know it's correct.
195 */
196 dm_block_t b, n = 0;
197 for (b = 0; b < smc->old_counts.nr; b++)
198 if (smc->old_counts.counts[b] == 0 &&
199 smc->counts.counts[b] == 0)
200 n++;
201
202 if (n != *count)
203 DMERR("free block counts differ, checker %u, sm-disk:%u",
204 (unsigned) n, (unsigned) *count);
205 }
206 return r;
207}
208
209static int sm_checker_new_block(struct dm_space_map *sm, dm_block_t *b)
210{
211 struct sm_checker *smc = container_of(sm, struct sm_checker, sm);
212 int r = dm_sm_new_block(smc->real_sm, b);
213
214 if (!r) {
215 BUG_ON(*b >= smc->old_counts.nr);
216 BUG_ON(smc->old_counts.counts[*b] != 0);
217 BUG_ON(*b >= smc->counts.nr);
218 BUG_ON(smc->counts.counts[*b] != 0);
219 ca_set_count(&smc->counts, *b, 1);
220 }
221
222 return r;
223}
224
225static int sm_checker_inc_block(struct dm_space_map *sm, dm_block_t b)
226{
227 struct sm_checker *smc = container_of(sm, struct sm_checker, sm);
228 int r = dm_sm_inc_block(smc->real_sm, b);
229 int r2 = ca_inc_block(&smc->counts, b);
230 BUG_ON(r != r2);
231 return r;
232}
233
234static int sm_checker_dec_block(struct dm_space_map *sm, dm_block_t b)
235{
236 struct sm_checker *smc = container_of(sm, struct sm_checker, sm);
237 int r = dm_sm_dec_block(smc->real_sm, b);
238 int r2 = ca_dec_block(&smc->counts, b);
239 BUG_ON(r != r2);
240 return r;
241}
242
243static int sm_checker_get_count(struct dm_space_map *sm, dm_block_t b, uint32_t *result)
244{
245 struct sm_checker *smc = container_of(sm, struct sm_checker, sm);
246 uint32_t result2 = 0;
247 int r = dm_sm_get_count(smc->real_sm, b, result);
248 int r2 = ca_get_count(&smc->counts, b, &result2);
249
250 BUG_ON(r != r2);
251 if (!r)
252 BUG_ON(*result != result2);
253 return r;
254}
255
256static int sm_checker_count_more_than_one(struct dm_space_map *sm, dm_block_t b, int *result)
257{
258 struct sm_checker *smc = container_of(sm, struct sm_checker, sm);
259 int result2 = 0;
260 int r = dm_sm_count_is_more_than_one(smc->real_sm, b, result);
261 int r2 = ca_count_more_than_one(&smc->counts, b, &result2);
262
263 BUG_ON(r != r2);
264 if (!r)
265 BUG_ON(!(*result) && result2);
266 return r;
267}
268
269static int sm_checker_set_count(struct dm_space_map *sm, dm_block_t b, uint32_t count)
270{
271 struct sm_checker *smc = container_of(sm, struct sm_checker, sm);
272 uint32_t old_rc;
273 int r = dm_sm_set_count(smc->real_sm, b, count);
274 int r2;
275
276 BUG_ON(b >= smc->counts.nr);
277 old_rc = smc->counts.counts[b];
278 r2 = ca_set_count(&smc->counts, b, count);
279 BUG_ON(r != r2);
280
281 return r;
282}
283
284static int sm_checker_commit(struct dm_space_map *sm)
285{
286 struct sm_checker *smc = container_of(sm, struct sm_checker, sm);
287 int r;
288
289 r = dm_sm_commit(smc->real_sm);
290 if (r)
291 return r;
292
293 r = ca_commit(&smc->old_counts, &smc->counts);
294 if (r)
295 return r;
296
297 return 0;
298}
299
300static int sm_checker_extend(struct dm_space_map *sm, dm_block_t extra_blocks)
301{
302 struct sm_checker *smc = container_of(sm, struct sm_checker, sm);
303 int r = dm_sm_extend(smc->real_sm, extra_blocks);
304 if (r)
305 return r;
306
307 return ca_extend(&smc->counts, extra_blocks);
308}
309
310static int sm_checker_root_size(struct dm_space_map *sm, size_t *result)
311{
312 struct sm_checker *smc = container_of(sm, struct sm_checker, sm);
313 return dm_sm_root_size(smc->real_sm, result);
314}
315
316static int sm_checker_copy_root(struct dm_space_map *sm, void *copy_to_here_le, size_t len)
317{
318 struct sm_checker *smc = container_of(sm, struct sm_checker, sm);
319 return dm_sm_copy_root(smc->real_sm, copy_to_here_le, len);
320}
321
322/*----------------------------------------------------------------*/
323
324static struct dm_space_map ops_ = {
325 .destroy = sm_checker_destroy,
326 .get_nr_blocks = sm_checker_get_nr_blocks,
327 .get_nr_free = sm_checker_get_nr_free,
328 .inc_block = sm_checker_inc_block,
329 .dec_block = sm_checker_dec_block,
330 .new_block = sm_checker_new_block,
331 .get_count = sm_checker_get_count,
332 .count_is_more_than_one = sm_checker_count_more_than_one,
333 .set_count = sm_checker_set_count,
334 .commit = sm_checker_commit,
335 .extend = sm_checker_extend,
336 .root_size = sm_checker_root_size,
337 .copy_root = sm_checker_copy_root
338};
339
340struct dm_space_map *dm_sm_checker_create(struct dm_space_map *sm)
341{
342 int r;
343 struct sm_checker *smc;
344
345 if (!sm)
346 return NULL;
347
348 smc = kmalloc(sizeof(*smc), GFP_KERNEL);
349 if (!smc)
350 return NULL;
351
352 memcpy(&smc->sm, &ops_, sizeof(smc->sm));
353 r = ca_create(&smc->old_counts, sm);
354 if (r) {
355 kfree(smc);
356 return NULL;
357 }
358
359 r = ca_create(&smc->counts, sm);
360 if (r) {
361 ca_destroy(&smc->old_counts);
362 kfree(smc);
363 return NULL;
364 }
365
366 smc->real_sm = sm;
367
368 r = ca_load(&smc->counts, sm);
369 if (r) {
370 ca_destroy(&smc->counts);
371 ca_destroy(&smc->old_counts);
372 kfree(smc);
373 return NULL;
374 }
375
376 r = ca_commit(&smc->old_counts, &smc->counts);
377 if (r) {
378 ca_destroy(&smc->counts);
379 ca_destroy(&smc->old_counts);
380 kfree(smc);
381 return NULL;
382 }
383
384 return &smc->sm;
385}
386EXPORT_SYMBOL_GPL(dm_sm_checker_create);
387
388struct dm_space_map *dm_sm_checker_create_fresh(struct dm_space_map *sm)
389{
390 int r;
391 struct sm_checker *smc;
392
393 if (!sm)
394 return NULL;
395
396 smc = kmalloc(sizeof(*smc), GFP_KERNEL);
397 if (!smc)
398 return NULL;
399
400 memcpy(&smc->sm, &ops_, sizeof(smc->sm));
401 r = ca_create(&smc->old_counts, sm);
402 if (r) {
403 kfree(smc);
404 return NULL;
405 }
406
407 r = ca_create(&smc->counts, sm);
408 if (r) {
409 ca_destroy(&smc->old_counts);
410 kfree(smc);
411 return NULL;
412 }
413
414 smc->real_sm = sm;
415 return &smc->sm;
416}
417EXPORT_SYMBOL_GPL(dm_sm_checker_create_fresh);
418
419/*----------------------------------------------------------------*/
420
421#else
422
423struct dm_space_map *dm_sm_checker_create(struct dm_space_map *sm)
424{
425 return sm;
426}
427EXPORT_SYMBOL_GPL(dm_sm_checker_create);
428
429struct dm_space_map *dm_sm_checker_create_fresh(struct dm_space_map *sm)
430{
431 return sm;
432}
433EXPORT_SYMBOL_GPL(dm_sm_checker_create_fresh);
434
435/*----------------------------------------------------------------*/
436
437#endif
diff --git a/drivers/md/persistent-data/dm-space-map-checker.h b/drivers/md/persistent-data/dm-space-map-checker.h
new file mode 100644
index 000000000000..444dccf6688c
--- /dev/null
+++ b/drivers/md/persistent-data/dm-space-map-checker.h
@@ -0,0 +1,26 @@
1/*
2 * Copyright (C) 2011 Red Hat, Inc.
3 *
4 * This file is released under the GPL.
5 */
6
7#ifndef SNAPSHOTS_SPACE_MAP_CHECKER_H
8#define SNAPSHOTS_SPACE_MAP_CHECKER_H
9
10#include "dm-space-map.h"
11
12/*----------------------------------------------------------------*/
13
14/*
15 * This space map wraps a real on-disk space map, and verifies all of its
16 * operations. It uses a lot of memory, so only use it if you have a
17 * specific problem that you're debugging.
18 *
19 * Ownership of @sm passes.
20 */
21struct dm_space_map *dm_sm_checker_create(struct dm_space_map *sm);
22struct dm_space_map *dm_sm_checker_create_fresh(struct dm_space_map *sm);
23
24/*----------------------------------------------------------------*/
25
26#endif
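
For illustration, the intended usage pattern (a sketch; the function
name is hypothetical).  With CONFIG_DM_DEBUG_SPACE_MAPS disabled both
wrappers are no-ops that simply return @sm:

static struct dm_space_map *example_wrap_sm(struct dm_space_map *real_sm)
{
	/*
	 * Ownership of real_sm passes to the checker on success; on
	 * failure NULL is returned and real_sm is left untouched.
	 */
	return dm_sm_checker_create_fresh(real_sm);
}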
diff --git a/drivers/md/persistent-data/dm-space-map-common.c b/drivers/md/persistent-data/dm-space-map-common.c
new file mode 100644
index 000000000000..df2494c06cdc
--- /dev/null
+++ b/drivers/md/persistent-data/dm-space-map-common.c
@@ -0,0 +1,705 @@
1/*
2 * Copyright (C) 2011 Red Hat, Inc.
3 *
4 * This file is released under the GPL.
5 */
6
7#include "dm-space-map-common.h"
8#include "dm-transaction-manager.h"
9
10#include <linux/bitops.h>
11#include <linux/device-mapper.h>
12
13#define DM_MSG_PREFIX "space map common"
14
15/*----------------------------------------------------------------*/
16
17/*
18 * Index validator.
19 */
20#define INDEX_CSUM_XOR 160478
21
22static void index_prepare_for_write(struct dm_block_validator *v,
23 struct dm_block *b,
24 size_t block_size)
25{
26 struct disk_metadata_index *mi_le = dm_block_data(b);
27
28 mi_le->blocknr = cpu_to_le64(dm_block_location(b));
29 mi_le->csum = cpu_to_le32(dm_bm_checksum(&mi_le->padding,
30 block_size - sizeof(__le32),
31 INDEX_CSUM_XOR));
32}
33
34static int index_check(struct dm_block_validator *v,
35 struct dm_block *b,
36 size_t block_size)
37{
38 struct disk_metadata_index *mi_le = dm_block_data(b);
39 __le32 csum_disk;
40
41 if (dm_block_location(b) != le64_to_cpu(mi_le->blocknr)) {
42 DMERR("index_check failed blocknr %llu wanted %llu",
43 le64_to_cpu(mi_le->blocknr), dm_block_location(b));
44 return -ENOTBLK;
45 }
46
47 csum_disk = cpu_to_le32(dm_bm_checksum(&mi_le->padding,
48 block_size - sizeof(__le32),
49 INDEX_CSUM_XOR));
50 if (csum_disk != mi_le->csum) {
51 DMERR("index_check failed csum %u wanted %u",
52 le32_to_cpu(csum_disk), le32_to_cpu(mi_le->csum));
53 return -EILSEQ;
54 }
55
56 return 0;
57}
58
59static struct dm_block_validator index_validator = {
60 .name = "index",
61 .prepare_for_write = index_prepare_for_write,
62 .check = index_check
63};
64
65/*----------------------------------------------------------------*/
66
67/*
68 * Bitmap validator
69 */
70#define BITMAP_CSUM_XOR 240779
71
72static void bitmap_prepare_for_write(struct dm_block_validator *v,
73 struct dm_block *b,
74 size_t block_size)
75{
76 struct disk_bitmap_header *disk_header = dm_block_data(b);
77
78 disk_header->blocknr = cpu_to_le64(dm_block_location(b));
79 disk_header->csum = cpu_to_le32(dm_bm_checksum(&disk_header->not_used,
80 block_size - sizeof(__le32),
81 BITMAP_CSUM_XOR));
82}
83
84static int bitmap_check(struct dm_block_validator *v,
85 struct dm_block *b,
86 size_t block_size)
87{
88 struct disk_bitmap_header *disk_header = dm_block_data(b);
89 __le32 csum_disk;
90
91 if (dm_block_location(b) != le64_to_cpu(disk_header->blocknr)) {
92 DMERR("bitmap check failed blocknr %llu wanted %llu",
93 le64_to_cpu(disk_header->blocknr), dm_block_location(b));
94 return -ENOTBLK;
95 }
96
97 csum_disk = cpu_to_le32(dm_bm_checksum(&disk_header->not_used,
98 block_size - sizeof(__le32),
99 BITMAP_CSUM_XOR));
100 if (csum_disk != disk_header->csum) {
101 DMERR("bitmap check failed csum %u wanted %u",
102 le32_to_cpu(csum_disk), le32_to_cpu(disk_header->csum));
103 return -EILSEQ;
104 }
105
106 return 0;
107}
108
109static struct dm_block_validator dm_sm_bitmap_validator = {
110 .name = "sm_bitmap",
111 .prepare_for_write = bitmap_prepare_for_write,
112 .check = bitmap_check
113};
114
115/*----------------------------------------------------------------*/
116
117#define ENTRIES_PER_WORD 32
118#define ENTRIES_SHIFT 5
119
120static void *dm_bitmap_data(struct dm_block *b)
121{
122 return dm_block_data(b) + sizeof(struct disk_bitmap_header);
123}
124
125#define WORD_MASK_HIGH 0xAAAAAAAAAAAAAAAAULL
126
127static unsigned bitmap_word_used(void *addr, unsigned b)
128{
129 __le64 *words_le = addr;
130 __le64 *w_le = words_le + (b >> ENTRIES_SHIFT);
131
132 uint64_t bits = le64_to_cpu(*w_le);
133 uint64_t mask = (bits + WORD_MASK_HIGH + 1) & WORD_MASK_HIGH;
134
135 return !(~bits & mask);
136}
137
138static unsigned sm_lookup_bitmap(void *addr, unsigned b)
139{
140 __le64 *words_le = addr;
141 __le64 *w_le = words_le + (b >> ENTRIES_SHIFT);
142 unsigned hi, lo;
143
144 b = (b & (ENTRIES_PER_WORD - 1)) << 1;
145 hi = !!test_bit_le(b, (void *) w_le);
146 lo = !!test_bit_le(b + 1, (void *) w_le);
147 return (hi << 1) | lo;
148}
149
150static void sm_set_bitmap(void *addr, unsigned b, unsigned val)
151{
152 __le64 *words_le = addr;
153 __le64 *w_le = words_le + (b >> ENTRIES_SHIFT);
154
155 b = (b & (ENTRIES_PER_WORD - 1)) << 1;
156
157 if (val & 2)
158 __set_bit_le(b, (void *) w_le);
159 else
160 __clear_bit_le(b, (void *) w_le);
161
162 if (val & 1)
163 __set_bit_le(b + 1, (void *) w_le);
164 else
165 __clear_bit_le(b + 1, (void *) w_le);
166}
167
168static int sm_find_free(void *addr, unsigned begin, unsigned end,
169 unsigned *result)
170{
171 while (begin < end) {
172 if (!(begin & (ENTRIES_PER_WORD - 1)) &&
173 bitmap_word_used(addr, begin)) {
174 begin += ENTRIES_PER_WORD;
175 continue;
176 }
177
178 if (!sm_lookup_bitmap(addr, begin)) {
179 *result = begin;
180 return 0;
181 }
182
183 begin++;
184 }
185
186 return -ENOSPC;
187}
188
189/*----------------------------------------------------------------*/
190
191static int sm_ll_init(struct ll_disk *ll, struct dm_transaction_manager *tm)
192{
193 ll->tm = tm;
194
195 ll->bitmap_info.tm = tm;
196 ll->bitmap_info.levels = 1;
197
198 /*
199 * Because the new bitmap blocks are created via a shadow
200 * operation, the old entry has already had its reference count
201 * decremented and we don't need the btree to do any bookkeeping.
202 */
203 ll->bitmap_info.value_type.size = sizeof(struct disk_index_entry);
204 ll->bitmap_info.value_type.inc = NULL;
205 ll->bitmap_info.value_type.dec = NULL;
206 ll->bitmap_info.value_type.equal = NULL;
207
208 ll->ref_count_info.tm = tm;
209 ll->ref_count_info.levels = 1;
210 ll->ref_count_info.value_type.size = sizeof(uint32_t);
211 ll->ref_count_info.value_type.inc = NULL;
212 ll->ref_count_info.value_type.dec = NULL;
213 ll->ref_count_info.value_type.equal = NULL;
214
215 ll->block_size = dm_bm_block_size(dm_tm_get_bm(tm));
216
217 if (ll->block_size > (1 << 30)) {
218 DMERR("block size too big to hold bitmaps");
219 return -EINVAL;
220 }
221
222 ll->entries_per_block = (ll->block_size - sizeof(struct disk_bitmap_header)) *
223 ENTRIES_PER_BYTE;
224 ll->nr_blocks = 0;
225 ll->bitmap_root = 0;
226 ll->ref_count_root = 0;
227
228 return 0;
229}
230
231int sm_ll_extend(struct ll_disk *ll, dm_block_t extra_blocks)
232{
233 int r;
234 dm_block_t i, nr_blocks, nr_indexes;
235 unsigned old_blocks, blocks;
236
237 nr_blocks = ll->nr_blocks + extra_blocks;
238 old_blocks = dm_sector_div_up(ll->nr_blocks, ll->entries_per_block);
239 blocks = dm_sector_div_up(nr_blocks, ll->entries_per_block);
240
241 nr_indexes = dm_sector_div_up(nr_blocks, ll->entries_per_block);
242 if (nr_indexes > ll->max_entries(ll)) {
243 DMERR("space map too large");
244 return -EINVAL;
245 }
246
247 for (i = old_blocks; i < blocks; i++) {
248 struct dm_block *b;
249 struct disk_index_entry idx;
250
251 r = dm_tm_new_block(ll->tm, &dm_sm_bitmap_validator, &b);
252 if (r < 0)
253 return r;
254 idx.blocknr = cpu_to_le64(dm_block_location(b));
255
256 r = dm_tm_unlock(ll->tm, b);
257 if (r < 0)
258 return r;
259
260 idx.nr_free = cpu_to_le32(ll->entries_per_block);
261 idx.none_free_before = 0;
262
263 r = ll->save_ie(ll, i, &idx);
264 if (r < 0)
265 return r;
266 }
267
268 ll->nr_blocks = nr_blocks;
269 return 0;
270}
271
272int sm_ll_lookup_bitmap(struct ll_disk *ll, dm_block_t b, uint32_t *result)
273{
274 int r;
275 dm_block_t index = b;
276 struct disk_index_entry ie_disk;
277 struct dm_block *blk;
278
279 b = do_div(index, ll->entries_per_block);
280 r = ll->load_ie(ll, index, &ie_disk);
281 if (r < 0)
282 return r;
283
284 r = dm_tm_read_lock(ll->tm, le64_to_cpu(ie_disk.blocknr),
285 &dm_sm_bitmap_validator, &blk);
286 if (r < 0)
287 return r;
288
289 *result = sm_lookup_bitmap(dm_bitmap_data(blk), b);
290
291 return dm_tm_unlock(ll->tm, blk);
292}
293
294int sm_ll_lookup(struct ll_disk *ll, dm_block_t b, uint32_t *result)
295{
296 __le32 le_rc;
297 int r = sm_ll_lookup_bitmap(ll, b, result);
298
299 if (r)
300 return r;
301
302 if (*result != 3)
303 return r;
304
305 r = dm_btree_lookup(&ll->ref_count_info, ll->ref_count_root, &b, &le_rc);
306 if (r < 0)
307 return r;
308
309 *result = le32_to_cpu(le_rc);
310
311 return r;
312}
313
314int sm_ll_find_free_block(struct ll_disk *ll, dm_block_t begin,
315 dm_block_t end, dm_block_t *result)
316{
317 int r;
318 struct disk_index_entry ie_disk;
319 dm_block_t i, index_begin = begin;
320 dm_block_t index_end = dm_sector_div_up(end, ll->entries_per_block);
321
322 /*
323 * FIXME: Use shifts
324 */
325 begin = do_div(index_begin, ll->entries_per_block);
326 end = do_div(end, ll->entries_per_block);
327
328 for (i = index_begin; i < index_end; i++, begin = 0) {
329 struct dm_block *blk;
330 unsigned position;
331 uint32_t bit_end;
332
333 r = ll->load_ie(ll, i, &ie_disk);
334 if (r < 0)
335 return r;
336
337 if (le32_to_cpu(ie_disk.nr_free) == 0)
338 continue;
339
340 r = dm_tm_read_lock(ll->tm, le64_to_cpu(ie_disk.blocknr),
341 &dm_sm_bitmap_validator, &blk);
342 if (r < 0)
343 return r;
344
345 bit_end = (i == index_end - 1) ? end : ll->entries_per_block;
346
347 r = sm_find_free(dm_bitmap_data(blk),
348 max_t(unsigned, begin, le32_to_cpu(ie_disk.none_free_before)),
349 bit_end, &position);
350 if (r == -ENOSPC) {
351 /*
352 * This might happen because we started searching
353 * part way through the bitmap.
354 */
355 dm_tm_unlock(ll->tm, blk);
356 continue;
357
358 } else if (r < 0) {
359 dm_tm_unlock(ll->tm, blk);
360 return r;
361 }
362
363 r = dm_tm_unlock(ll->tm, blk);
364 if (r < 0)
365 return r;
366
367 *result = i * ll->entries_per_block + (dm_block_t) position;
368 return 0;
369 }
370
371 return -ENOSPC;
372}
373
374int sm_ll_insert(struct ll_disk *ll, dm_block_t b,
375 uint32_t ref_count, enum allocation_event *ev)
376{
377 int r;
378 uint32_t bit, old;
379 struct dm_block *nb;
380 dm_block_t index = b;
381 struct disk_index_entry ie_disk;
382 void *bm_le;
383 int inc;
384
385 bit = do_div(index, ll->entries_per_block);
386 r = ll->load_ie(ll, index, &ie_disk);
387 if (r < 0)
388 return r;
389
390 r = dm_tm_shadow_block(ll->tm, le64_to_cpu(ie_disk.blocknr),
391 &dm_sm_bitmap_validator, &nb, &inc);
392 if (r < 0) {
393 DMERR("dm_tm_shadow_block() failed");
394 return r;
395 }
396 ie_disk.blocknr = cpu_to_le64(dm_block_location(nb));
397
398 bm_le = dm_bitmap_data(nb);
399 old = sm_lookup_bitmap(bm_le, bit);
400
401 if (ref_count <= 2) {
402 sm_set_bitmap(bm_le, bit, ref_count);
403
404 r = dm_tm_unlock(ll->tm, nb);
405 if (r < 0)
406 return r;
407
408#if 0
409 /* FIXME: dm_btree_remove doesn't handle this yet */
410 if (old > 2) {
411 r = dm_btree_remove(&ll->ref_count_info,
412 ll->ref_count_root,
413 &b, &ll->ref_count_root);
414 if (r)
415 return r;
416 }
417#endif
418
419 } else {
420 __le32 le_rc = cpu_to_le32(ref_count);
421
422 sm_set_bitmap(bm_le, bit, 3);
423 r = dm_tm_unlock(ll->tm, nb);
424 if (r < 0)
425 return r;
426
427 __dm_bless_for_disk(&le_rc);
428 r = dm_btree_insert(&ll->ref_count_info, ll->ref_count_root,
429 &b, &le_rc, &ll->ref_count_root);
430 if (r < 0) {
431 DMERR("ref count insert failed");
432 return r;
433 }
434 }
435
436 if (ref_count && !old) {
437 *ev = SM_ALLOC;
438 ll->nr_allocated++;
439 ie_disk.nr_free = cpu_to_le32(le32_to_cpu(ie_disk.nr_free) - 1);
440 if (le32_to_cpu(ie_disk.none_free_before) == bit)
441 ie_disk.none_free_before = cpu_to_le32(bit + 1);
442
443 } else if (old && !ref_count) {
444 *ev = SM_FREE;
445 ll->nr_allocated--;
446 ie_disk.nr_free = cpu_to_le32(le32_to_cpu(ie_disk.nr_free) + 1);
447 ie_disk.none_free_before = cpu_to_le32(min(le32_to_cpu(ie_disk.none_free_before), bit));
448 }
449
450 return ll->save_ie(ll, index, &ie_disk);
451}
452
453int sm_ll_inc(struct ll_disk *ll, dm_block_t b, enum allocation_event *ev)
454{
455 int r;
456 uint32_t rc;
457
458 r = sm_ll_lookup(ll, b, &rc);
459 if (r)
460 return r;
461
462 return sm_ll_insert(ll, b, rc + 1, ev);
463}
464
465int sm_ll_dec(struct ll_disk *ll, dm_block_t b, enum allocation_event *ev)
466{
467 int r;
468 uint32_t rc;
469
470 r = sm_ll_lookup(ll, b, &rc);
471 if (r)
472 return r;
473
474 if (!rc)
475 return -EINVAL;
476
477 return sm_ll_insert(ll, b, rc - 1, ev);
478}
479
480int sm_ll_commit(struct ll_disk *ll)
481{
482 return ll->commit(ll);
483}
484
485/*----------------------------------------------------------------*/
486
487static int metadata_ll_load_ie(struct ll_disk *ll, dm_block_t index,
488 struct disk_index_entry *ie)
489{
490 memcpy(ie, ll->mi_le.index + index, sizeof(*ie));
491 return 0;
492}
493
494static int metadata_ll_save_ie(struct ll_disk *ll, dm_block_t index,
495 struct disk_index_entry *ie)
496{
497 memcpy(ll->mi_le.index + index, ie, sizeof(*ie));
498 return 0;
499}
500
501static int metadata_ll_init_index(struct ll_disk *ll)
502{
503 int r;
504 struct dm_block *b;
505
506 r = dm_tm_new_block(ll->tm, &index_validator, &b);
507 if (r < 0)
508 return r;
509
510 memcpy(dm_block_data(b), &ll->mi_le, sizeof(ll->mi_le));
511 ll->bitmap_root = dm_block_location(b);
512
513 return dm_tm_unlock(ll->tm, b);
514}
515
516static int metadata_ll_open(struct ll_disk *ll)
517{
518 int r;
519 struct dm_block *block;
520
521 r = dm_tm_read_lock(ll->tm, ll->bitmap_root,
522 &index_validator, &block);
523 if (r)
524 return r;
525
526 memcpy(&ll->mi_le, dm_block_data(block), sizeof(ll->mi_le));
527 return dm_tm_unlock(ll->tm, block);
528}
529
530static dm_block_t metadata_ll_max_entries(struct ll_disk *ll)
531{
532 return MAX_METADATA_BITMAPS;
533}
534
535static int metadata_ll_commit(struct ll_disk *ll)
536{
537 int r, inc;
538 struct dm_block *b;
539
540 r = dm_tm_shadow_block(ll->tm, ll->bitmap_root, &index_validator, &b, &inc);
541 if (r)
542 return r;
543
544 memcpy(dm_block_data(b), &ll->mi_le, sizeof(ll->mi_le));
545 ll->bitmap_root = dm_block_location(b);
546
547 return dm_tm_unlock(ll->tm, b);
548}
549
550int sm_ll_new_metadata(struct ll_disk *ll, struct dm_transaction_manager *tm)
551{
552 int r;
553
554 r = sm_ll_init(ll, tm);
555 if (r < 0)
556 return r;
557
558 ll->load_ie = metadata_ll_load_ie;
559 ll->save_ie = metadata_ll_save_ie;
560 ll->init_index = metadata_ll_init_index;
561 ll->open_index = metadata_ll_open;
562 ll->max_entries = metadata_ll_max_entries;
563 ll->commit = metadata_ll_commit;
564
565 ll->nr_blocks = 0;
566 ll->nr_allocated = 0;
567
568 r = ll->init_index(ll);
569 if (r < 0)
570 return r;
571
572 r = dm_btree_empty(&ll->ref_count_info, &ll->ref_count_root);
573 if (r < 0)
574 return r;
575
576 return 0;
577}
578
579int sm_ll_open_metadata(struct ll_disk *ll, struct dm_transaction_manager *tm,
580 void *root_le, size_t len)
581{
582 int r;
583 struct disk_sm_root *smr = root_le;
584
585 if (len < sizeof(struct disk_sm_root)) {
586 DMERR("sm_metadata root too small");
587 return -ENOMEM;
588 }
589
590 r = sm_ll_init(ll, tm);
591 if (r < 0)
592 return r;
593
594 ll->load_ie = metadata_ll_load_ie;
595 ll->save_ie = metadata_ll_save_ie;
596 ll->init_index = metadata_ll_init_index;
597 ll->open_index = metadata_ll_open;
598 ll->max_entries = metadata_ll_max_entries;
599 ll->commit = metadata_ll_commit;
600
601 ll->nr_blocks = le64_to_cpu(smr->nr_blocks);
602 ll->nr_allocated = le64_to_cpu(smr->nr_allocated);
603 ll->bitmap_root = le64_to_cpu(smr->bitmap_root);
604 ll->ref_count_root = le64_to_cpu(smr->ref_count_root);
605
606 return ll->open_index(ll);
607}
608
609/*----------------------------------------------------------------*/
610
611static int disk_ll_load_ie(struct ll_disk *ll, dm_block_t index,
612 struct disk_index_entry *ie)
613{
614 return dm_btree_lookup(&ll->bitmap_info, ll->bitmap_root, &index, ie);
615}
616
617static int disk_ll_save_ie(struct ll_disk *ll, dm_block_t index,
618 struct disk_index_entry *ie)
619{
620 __dm_bless_for_disk(ie);
621 return dm_btree_insert(&ll->bitmap_info, ll->bitmap_root,
622 &index, ie, &ll->bitmap_root);
623}
624
625static int disk_ll_init_index(struct ll_disk *ll)
626{
627 return dm_btree_empty(&ll->bitmap_info, &ll->bitmap_root);
628}
629
630static int disk_ll_open(struct ll_disk *ll)
631{
632 /* nothing to do */
633 return 0;
634}
635
636static dm_block_t disk_ll_max_entries(struct ll_disk *ll)
637{
638 return -1ULL;
639}
640
641static int disk_ll_commit(struct ll_disk *ll)
642{
643 return 0;
644}
645
646int sm_ll_new_disk(struct ll_disk *ll, struct dm_transaction_manager *tm)
647{
648 int r;
649
650 r = sm_ll_init(ll, tm);
651 if (r < 0)
652 return r;
653
654 ll->load_ie = disk_ll_load_ie;
655 ll->save_ie = disk_ll_save_ie;
656 ll->init_index = disk_ll_init_index;
657 ll->open_index = disk_ll_open;
658 ll->max_entries = disk_ll_max_entries;
659 ll->commit = disk_ll_commit;
660
661 ll->nr_blocks = 0;
662 ll->nr_allocated = 0;
663
664 r = ll->init_index(ll);
665 if (r < 0)
666 return r;
667
668 r = dm_btree_empty(&ll->ref_count_info, &ll->ref_count_root);
669 if (r < 0)
670 return r;
671
672 return 0;
673}
674
675int sm_ll_open_disk(struct ll_disk *ll, struct dm_transaction_manager *tm,
676 void *root_le, size_t len)
677{
678 int r;
679 struct disk_sm_root *smr = root_le;
680
681 if (len < sizeof(struct disk_sm_root)) {
682 DMERR("sm_disk root too small");
683 return -ENOMEM;
684 }
685
686 r = sm_ll_init(ll, tm);
687 if (r < 0)
688 return r;
689
690 ll->load_ie = disk_ll_load_ie;
691 ll->save_ie = disk_ll_save_ie;
692 ll->init_index = disk_ll_init_index;
693 ll->open_index = disk_ll_open;
694 ll->max_entries = disk_ll_max_entries;
695 ll->commit = disk_ll_commit;
696
697 ll->nr_blocks = le64_to_cpu(smr->nr_blocks);
698 ll->nr_allocated = le64_to_cpu(smr->nr_allocated);
699 ll->bitmap_root = le64_to_cpu(smr->bitmap_root);
700 ll->ref_count_root = le64_to_cpu(smr->ref_count_root);
701
702 return ll->open_index(ll);
703}
704
705/*----------------------------------------------------------------*/
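
For illustration, how a caller is expected to react to the allocation
events reported by the low-level interface (a sketch mirroring what the
disk space map does; the function name is hypothetical):

static int example_ll_inc(struct ll_disk *ll, dm_block_t b,
			  dm_block_t *nr_allocated_this_transaction)
{
	enum allocation_event ev;
	int r = sm_ll_inc(ll, b, &ev);

	/* SM_ALLOC means the count went 0 -> 1: a fresh allocation */
	if (!r && ev == SM_ALLOC)
		(*nr_allocated_this_transaction)++;

	return r;
}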
diff --git a/drivers/md/persistent-data/dm-space-map-common.h b/drivers/md/persistent-data/dm-space-map-common.h
new file mode 100644
index 000000000000..8f220821a9a9
--- /dev/null
+++ b/drivers/md/persistent-data/dm-space-map-common.h
@@ -0,0 +1,126 @@
1/*
2 * Copyright (C) 2011 Red Hat, Inc.
3 *
4 * This file is released under the GPL.
5 */
6
7#ifndef DM_SPACE_MAP_COMMON_H
8#define DM_SPACE_MAP_COMMON_H
9
10#include "dm-btree.h"
11
12/*----------------------------------------------------------------*/
13
14/*
15 * Low level disk format
16 *
17 * Bitmap btree
18 * ------------
19 *
20 * Each value stored in the btree is an index_entry. This points to a
21 * block that is used as a bitmap. Within the bitmap we hold 2 bits per
22 * entry, which represent UNUSED = 0, REF_COUNT = 1, REF_COUNT = 2 and
23 * REF_COUNT = many (bit pattern 3).
24 *
25 * Refcount btree
26 * --------------
27 *
28 * Any entry that has a ref count higher than 2 gets entered in the ref
29 * count tree. The leaf values for this tree are the 32-bit ref counts.
30 */
31
32struct disk_index_entry {
33 __le64 blocknr;
34 __le32 nr_free;
35 __le32 none_free_before;
36} __packed;
37
38
39#define MAX_METADATA_BITMAPS 255
40struct disk_metadata_index {
41 __le32 csum;
42 __le32 padding;
43 __le64 blocknr;
44
45 struct disk_index_entry index[MAX_METADATA_BITMAPS];
46} __packed;
47
48struct ll_disk;
49
50typedef int (*load_ie_fn)(struct ll_disk *ll, dm_block_t index, struct disk_index_entry *result);
51typedef int (*save_ie_fn)(struct ll_disk *ll, dm_block_t index, struct disk_index_entry *ie);
52typedef int (*init_index_fn)(struct ll_disk *ll);
53typedef int (*open_index_fn)(struct ll_disk *ll);
54typedef dm_block_t (*max_index_entries_fn)(struct ll_disk *ll);
55typedef int (*commit_fn)(struct ll_disk *ll);
56
57struct ll_disk {
58 struct dm_transaction_manager *tm;
59 struct dm_btree_info bitmap_info;
60 struct dm_btree_info ref_count_info;
61
62 uint32_t block_size;
63 uint32_t entries_per_block;
64 dm_block_t nr_blocks;
65 dm_block_t nr_allocated;
66
67 /*
68 * bitmap_root may be a btree root or a simple index.
69 */
70 dm_block_t bitmap_root;
71
72 dm_block_t ref_count_root;
73
74 struct disk_metadata_index mi_le;
75 load_ie_fn load_ie;
76 save_ie_fn save_ie;
77 init_index_fn init_index;
78 open_index_fn open_index;
79 max_index_entries_fn max_entries;
80 commit_fn commit;
81};
82
83struct disk_sm_root {
84 __le64 nr_blocks;
85 __le64 nr_allocated;
86 __le64 bitmap_root;
87 __le64 ref_count_root;
88} __packed;
89
90#define ENTRIES_PER_BYTE 4
91
92struct disk_bitmap_header {
93 __le32 csum;
94 __le32 not_used;
95 __le64 blocknr;
96} __packed;
97
98enum allocation_event {
99 SM_NONE,
100 SM_ALLOC,
101 SM_FREE,
102};
103
104/*----------------------------------------------------------------*/
105
106int sm_ll_extend(struct ll_disk *ll, dm_block_t extra_blocks);
107int sm_ll_lookup_bitmap(struct ll_disk *ll, dm_block_t b, uint32_t *result);
108int sm_ll_lookup(struct ll_disk *ll, dm_block_t b, uint32_t *result);
109int sm_ll_find_free_block(struct ll_disk *ll, dm_block_t begin,
110 dm_block_t end, dm_block_t *result);
111int sm_ll_insert(struct ll_disk *ll, dm_block_t b, uint32_t ref_count, enum allocation_event *ev);
112int sm_ll_inc(struct ll_disk *ll, dm_block_t b, enum allocation_event *ev);
113int sm_ll_dec(struct ll_disk *ll, dm_block_t b, enum allocation_event *ev);
114int sm_ll_commit(struct ll_disk *ll);
115
116int sm_ll_new_metadata(struct ll_disk *ll, struct dm_transaction_manager *tm);
117int sm_ll_open_metadata(struct ll_disk *ll, struct dm_transaction_manager *tm,
118 void *root_le, size_t len);
119
120int sm_ll_new_disk(struct ll_disk *ll, struct dm_transaction_manager *tm);
121int sm_ll_open_disk(struct ll_disk *ll, struct dm_transaction_manager *tm,
122 void *root_le, size_t len);
123
124/*----------------------------------------------------------------*/
125
126#endif /* DM_SPACE_MAP_COMMON_H */
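
A worked example of the bitmap geometry, assuming a hypothetical
4096-byte metadata block:

	entries_per_block = (4096 - sizeof(struct disk_bitmap_header))
	                    * ENTRIES_PER_BYTE
	                  = (4096 - 16) * 4 = 16320 entries per bitmap

so a single metadata index (MAX_METADATA_BITMAPS = 255) can describe at
most 255 * 16320 = 4,161,600 metadata blocks before sm_ll_extend()
fails with "space map too large".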
diff --git a/drivers/md/persistent-data/dm-space-map-disk.c b/drivers/md/persistent-data/dm-space-map-disk.c
new file mode 100644
index 000000000000..aeff7852cf79
--- /dev/null
+++ b/drivers/md/persistent-data/dm-space-map-disk.c
@@ -0,0 +1,335 @@
1/*
2 * Copyright (C) 2011 Red Hat, Inc.
3 *
4 * This file is released under the GPL.
5 */
6
7#include "dm-space-map-checker.h"
8#include "dm-space-map-common.h"
9#include "dm-space-map-disk.h"
10#include "dm-space-map.h"
11#include "dm-transaction-manager.h"
12
13#include <linux/list.h>
14#include <linux/slab.h>
15#include <linux/module.h>
16#include <linux/device-mapper.h>
17
18#define DM_MSG_PREFIX "space map disk"
19
20/*----------------------------------------------------------------*/
21
22/*
23 * Space map interface.
24 */
25struct sm_disk {
26 struct dm_space_map sm;
27
28 struct ll_disk ll;
29 struct ll_disk old_ll;
30
31 dm_block_t begin;
32 dm_block_t nr_allocated_this_transaction;
33};
34
35static void sm_disk_destroy(struct dm_space_map *sm)
36{
37 struct sm_disk *smd = container_of(sm, struct sm_disk, sm);
38
39 kfree(smd);
40}
41
42static int sm_disk_extend(struct dm_space_map *sm, dm_block_t extra_blocks)
43{
44 struct sm_disk *smd = container_of(sm, struct sm_disk, sm);
45
46 return sm_ll_extend(&smd->ll, extra_blocks);
47}
48
49static int sm_disk_get_nr_blocks(struct dm_space_map *sm, dm_block_t *count)
50{
51 struct sm_disk *smd = container_of(sm, struct sm_disk, sm);
52 *count = smd->old_ll.nr_blocks;
53
54 return 0;
55}
56
57static int sm_disk_get_nr_free(struct dm_space_map *sm, dm_block_t *count)
58{
59 struct sm_disk *smd = container_of(sm, struct sm_disk, sm);
60 *count = (smd->old_ll.nr_blocks - smd->old_ll.nr_allocated) - smd->nr_allocated_this_transaction;
61
62 return 0;
63}
64
65static int sm_disk_get_count(struct dm_space_map *sm, dm_block_t b,
66 uint32_t *result)
67{
68 struct sm_disk *smd = container_of(sm, struct sm_disk, sm);
69 return sm_ll_lookup(&smd->ll, b, result);
70}
71
72static int sm_disk_count_is_more_than_one(struct dm_space_map *sm, dm_block_t b,
73 int *result)
74{
75 int r;
76 uint32_t count;
77
78 r = sm_disk_get_count(sm, b, &count);
79 if (r)
80 return r;
81 *result = count > 1;
82 return 0;
83}
84
85static int sm_disk_set_count(struct dm_space_map *sm, dm_block_t b,
86 uint32_t count)
87{
88 int r;
89 uint32_t old_count;
90 enum allocation_event ev;
91 struct sm_disk *smd = container_of(sm, struct sm_disk, sm);
92
93 r = sm_ll_insert(&smd->ll, b, count, &ev);
94 if (!r) {
95 switch (ev) {
96 case SM_NONE:
97 break;
98
99 case SM_ALLOC:
100 /*
101 * This _must_ be free in the prior transaction
102 * otherwise we've lost atomicity.
103 */
104 smd->nr_allocated_this_transaction++;
105 break;
106
107 case SM_FREE:
108 /*
109 * It's only free if it's also free in the last
110 * transaction.
111 */
112 r = sm_ll_lookup(&smd->old_ll, b, &old_count);
113 if (r)
114 return r;
115
116 if (!old_count)
117 smd->nr_allocated_this_transaction--;
118 break;
119 }
120 }
121
122 return r;
123}
124
125static int sm_disk_inc_block(struct dm_space_map *sm, dm_block_t b)
126{
127 int r;
128 enum allocation_event ev;
129 struct sm_disk *smd = container_of(sm, struct sm_disk, sm);
130
131 r = sm_ll_inc(&smd->ll, b, &ev);
132 if (!r && (ev == SM_ALLOC))
133 /*
134 * This _must_ be free in the prior transaction
135 * otherwise we've lost atomicity.
136 */
137 smd->nr_allocated_this_transaction++;
138
139 return r;
140}
141
142static int sm_disk_dec_block(struct dm_space_map *sm, dm_block_t b)
143{
144 int r;
145 uint32_t old_count;
146 enum allocation_event ev;
147 struct sm_disk *smd = container_of(sm, struct sm_disk, sm);
148
149 r = sm_ll_dec(&smd->ll, b, &ev);
150 if (!r && (ev == SM_FREE)) {
151 /*
152 * It's only free if it's also free in the last
153 * transaction.
154 */
155 r = sm_ll_lookup(&smd->old_ll, b, &old_count);
156 if (r)
157 return r;
158
159 if (!old_count)
160 smd->nr_allocated_this_transaction--;
161 }
162
163 return r;
164}
165
166static int sm_disk_new_block(struct dm_space_map *sm, dm_block_t *b)
167{
168 int r;
169 enum allocation_event ev;
170 struct sm_disk *smd = container_of(sm, struct sm_disk, sm);
171
172 /* FIXME: we should loop round a couple of times */
173 r = sm_ll_find_free_block(&smd->old_ll, smd->begin, smd->old_ll.nr_blocks, b);
174 if (r)
175 return r;
176
177 smd->begin = *b + 1;
178 r = sm_ll_inc(&smd->ll, *b, &ev);
179 if (!r) {
180 BUG_ON(ev != SM_ALLOC);
181 smd->nr_allocated_this_transaction++;
182 }
183
184 return r;
185}
186
187static int sm_disk_commit(struct dm_space_map *sm)
188{
189 int r;
190 dm_block_t nr_free;
191 struct sm_disk *smd = container_of(sm, struct sm_disk, sm);
192
193 r = sm_disk_get_nr_free(sm, &nr_free);
194 if (r)
195 return r;
196
197 r = sm_ll_commit(&smd->ll);
198 if (r)
199 return r;
200
201 memcpy(&smd->old_ll, &smd->ll, sizeof(smd->old_ll));
202 smd->begin = 0;
203 smd->nr_allocated_this_transaction = 0;
204
205 r = sm_disk_get_nr_free(sm, &nr_free);
206 if (r)
207 return r;
208
209 return 0;
210}
211
212static int sm_disk_root_size(struct dm_space_map *sm, size_t *result)
213{
214 *result = sizeof(struct disk_sm_root);
215
216 return 0;
217}
218
219static int sm_disk_copy_root(struct dm_space_map *sm, void *where_le, size_t max)
220{
221 struct sm_disk *smd = container_of(sm, struct sm_disk, sm);
222 struct disk_sm_root root_le;
223
224 root_le.nr_blocks = cpu_to_le64(smd->ll.nr_blocks);
225 root_le.nr_allocated = cpu_to_le64(smd->ll.nr_allocated);
226 root_le.bitmap_root = cpu_to_le64(smd->ll.bitmap_root);
227 root_le.ref_count_root = cpu_to_le64(smd->ll.ref_count_root);
228
229 if (max < sizeof(root_le))
230 return -ENOSPC;
231
232 memcpy(where_le, &root_le, sizeof(root_le));
233
234 return 0;
235}
236
237/*----------------------------------------------------------------*/
238
239static struct dm_space_map ops = {
240 .destroy = sm_disk_destroy,
241 .extend = sm_disk_extend,
242 .get_nr_blocks = sm_disk_get_nr_blocks,
243 .get_nr_free = sm_disk_get_nr_free,
244 .get_count = sm_disk_get_count,
245 .count_is_more_than_one = sm_disk_count_is_more_than_one,
246 .set_count = sm_disk_set_count,
247 .inc_block = sm_disk_inc_block,
248 .dec_block = sm_disk_dec_block,
249 .new_block = sm_disk_new_block,
250 .commit = sm_disk_commit,
251 .root_size = sm_disk_root_size,
252 .copy_root = sm_disk_copy_root
253};
254
255static struct dm_space_map *dm_sm_disk_create_real(
256 struct dm_transaction_manager *tm,
257 dm_block_t nr_blocks)
258{
259 int r;
260 struct sm_disk *smd;
261
262 smd = kmalloc(sizeof(*smd), GFP_KERNEL);
263 if (!smd)
264 return ERR_PTR(-ENOMEM);
265
266 smd->begin = 0;
267 smd->nr_allocated_this_transaction = 0;
268 memcpy(&smd->sm, &ops, sizeof(smd->sm));
269
270 r = sm_ll_new_disk(&smd->ll, tm);
271 if (r)
272 goto bad;
273
274 r = sm_ll_extend(&smd->ll, nr_blocks);
275 if (r)
276 goto bad;
277
278 r = sm_disk_commit(&smd->sm);
279 if (r)
280 goto bad;
281
282 return &smd->sm;
283
284bad:
285 kfree(smd);
286 return ERR_PTR(r);
287}
288
289struct dm_space_map *dm_sm_disk_create(struct dm_transaction_manager *tm,
290 dm_block_t nr_blocks)
291{
292 struct dm_space_map *sm = dm_sm_disk_create_real(tm, nr_blocks);
293 return dm_sm_checker_create_fresh(sm);
294}
295EXPORT_SYMBOL_GPL(dm_sm_disk_create);
296
297static struct dm_space_map *dm_sm_disk_open_real(
298 struct dm_transaction_manager *tm,
299 void *root_le, size_t len)
300{
301 int r;
302 struct sm_disk *smd;
303
304 smd = kmalloc(sizeof(*smd), GFP_KERNEL);
305 if (!smd)
306 return ERR_PTR(-ENOMEM);
307
308 smd->begin = 0;
309 smd->nr_allocated_this_transaction = 0;
310 memcpy(&smd->sm, &ops, sizeof(smd->sm));
311
312 r = sm_ll_open_disk(&smd->ll, tm, root_le, len);
313 if (r)
314 goto bad;
315
316 r = sm_disk_commit(&smd->sm);
317 if (r)
318 goto bad;
319
320 return &smd->sm;
321
322bad:
323 kfree(smd);
324 return ERR_PTR(r);
325}
326
327struct dm_space_map *dm_sm_disk_open(struct dm_transaction_manager *tm,
328 void *root_le, size_t len)
329{
330 return dm_sm_checker_create(
331 dm_sm_disk_open_real(tm, root_le, len));
332}
333EXPORT_SYMBOL_GPL(dm_sm_disk_open);
334
335/*----------------------------------------------------------------*/
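
For orientation, a sketch of the allocation cycle these ops implement (illustrative only; sm is a map returned by dm_sm_disk_create() and error paths are simplified): new_block only hands out blocks that were free in the prior transaction, so a block freed with dec_block becomes allocatable again only after the next commit.

	dm_block_t b;
	int r;

	r = dm_sm_new_block(sm, &b);	/* only blocks free in the prior transaction */
	if (r)
		return r;

	r = dm_sm_dec_block(sm, b);	/* freed, but not reusable yet */
	if (r)
		return r;

	r = dm_sm_commit(sm);		/* after this, b may be allocated again */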
diff --git a/drivers/md/persistent-data/dm-space-map-disk.h b/drivers/md/persistent-data/dm-space-map-disk.h
new file mode 100644
index 000000000000..447a0a9a2d9f
--- /dev/null
+++ b/drivers/md/persistent-data/dm-space-map-disk.h
@@ -0,0 +1,25 @@
1/*
2 * Copyright (C) 2011 Red Hat, Inc.
3 *
4 * This file is released under the GPL.
5 */
6
7#ifndef _LINUX_DM_SPACE_MAP_DISK_H
8#define _LINUX_DM_SPACE_MAP_DISK_H
9
10#include "dm-block-manager.h"
11
12struct dm_space_map;
13struct dm_transaction_manager;
14
15/*
16 * Unfortunately we have to use two-phase construction due to the cycle
17 * between the tm and sm.
18 */
19struct dm_space_map *dm_sm_disk_create(struct dm_transaction_manager *tm,
20 dm_block_t nr_blocks);
21
22struct dm_space_map *dm_sm_disk_open(struct dm_transaction_manager *tm,
23 void *root, size_t len);
24
25#endif /* _LINUX_DM_SPACE_MAP_DISK_H */
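
A sketch of typical use, assuming a tm built beforehand (e.g. via dm_tm_create_with_sm(), declared later in this patch); error handling is simplified:

	struct dm_space_map *data_sm;

	data_sm = dm_sm_disk_create(tm, nr_blocks);
	if (IS_ERR(data_sm))
		return PTR_ERR(data_sm);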
diff --git a/drivers/md/persistent-data/dm-space-map-metadata.c b/drivers/md/persistent-data/dm-space-map-metadata.c
new file mode 100644
index 000000000000..e89ae5e7a519
--- /dev/null
+++ b/drivers/md/persistent-data/dm-space-map-metadata.c
@@ -0,0 +1,596 @@
1/*
2 * Copyright (C) 2011 Red Hat, Inc.
3 *
4 * This file is released under the GPL.
5 */
6
7#include "dm-space-map.h"
8#include "dm-space-map-common.h"
9#include "dm-space-map-metadata.h"
10
11#include <linux/list.h>
12#include <linux/slab.h>
13#include <linux/device-mapper.h>
14
15#define DM_MSG_PREFIX "space map metadata"
16
17/*----------------------------------------------------------------*/
18
19/*
20 * Space map interface.
21 *
22 * The low level disk format is written using the standard btree and
23 * transaction manager. This means that performing disk operations may
24 * cause us to recurse into the space map in order to allocate new blocks.
25 * For this reason we have a pool of pre-allocated blocks large enough to
26 * service any metadata_ll_disk operation.
27 */
28
29/*
30 * FIXME: we should calculate this based on the size of the device.
31 * Only the metadata space map needs this functionality.
32 */
33#define MAX_RECURSIVE_ALLOCATIONS 1024
34
35enum block_op_type {
36 BOP_INC,
37 BOP_DEC
38};
39
40struct block_op {
41 enum block_op_type type;
42 dm_block_t block;
43};
44
45struct sm_metadata {
46 struct dm_space_map sm;
47
48 struct ll_disk ll;
49 struct ll_disk old_ll;
50
51 dm_block_t begin;
52
53 unsigned recursion_count;
54 unsigned allocated_this_transaction;
55 unsigned nr_uncommitted;
56 struct block_op uncommitted[MAX_RECURSIVE_ALLOCATIONS];
57};
58
59static int add_bop(struct sm_metadata *smm, enum block_op_type type, dm_block_t b)
60{
61 struct block_op *op;
62
63 if (smm->nr_uncommitted == MAX_RECURSIVE_ALLOCATIONS) {
64 DMERR("too many recursive allocations");
65 return -ENOMEM;
66 }
67
68 op = smm->uncommitted + smm->nr_uncommitted++;
69 op->type = type;
70 op->block = b;
71
72 return 0;
73}
74
75static int commit_bop(struct sm_metadata *smm, struct block_op *op)
76{
77 int r = 0;
78 enum allocation_event ev;
79
80 switch (op->type) {
81 case BOP_INC:
82 r = sm_ll_inc(&smm->ll, op->block, &ev);
83 break;
84
85 case BOP_DEC:
86 r = sm_ll_dec(&smm->ll, op->block, &ev);
87 break;
88 }
89
90 return r;
91}
92
93static void in(struct sm_metadata *smm)
94{
95 smm->recursion_count++;
96}
97
98static int out(struct sm_metadata *smm)
99{
100 int r = 0;
101
102 /*
103 * If we're not recursing then very bad things are happening.
104 */
105 if (!smm->recursion_count) {
106 DMERR("lost track of recursion depth");
107 return -ENOMEM;
108 }
109
110 if (smm->recursion_count == 1 && smm->nr_uncommitted) {
111 while (smm->nr_uncommitted && !r) {
112 smm->nr_uncommitted--;
113 r = commit_bop(smm, smm->uncommitted +
114 smm->nr_uncommitted);
115 if (r)
116 break;
117 }
118 }
119
120 smm->recursion_count--;
121
122 return r;
123}
124
125/*
126 * When using the out() function above, we often want to combine an error
127 * code for the operation run in the recursive context with that from
128 * out().
129 */
130static int combine_errors(int r1, int r2)
131{
132 return r1 ? r1 : r2;
133}
134
135static int recursing(struct sm_metadata *smm)
136{
137 return smm->recursion_count;
138}
139
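/*
 * For illustration, the mutators below all follow this shape (a sketch,
 * not a real function):
 *
 *	if (recursing(smm))
 *		r = add_bop(smm, BOP_INC, b);	  queued, replayed by out()
 *	else {
 *		in(smm);
 *		r = sm_ll_inc(&smm->ll, b, &ev);  may recurse back into the sm
 *		r2 = out(smm);			  replays queued ops at depth 1
 *	}
 *	return combine_errors(r, r2);
 */
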
140static void sm_metadata_destroy(struct dm_space_map *sm)
141{
142 struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm);
143
144 kfree(smm);
145}
146
147static int sm_metadata_extend(struct dm_space_map *sm, dm_block_t extra_blocks)
148{
149 DMERR("doesn't support extend");
150 return -EINVAL;
151}
152
153static int sm_metadata_get_nr_blocks(struct dm_space_map *sm, dm_block_t *count)
154{
155 struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm);
156
157 *count = smm->ll.nr_blocks;
158
159 return 0;
160}
161
162static int sm_metadata_get_nr_free(struct dm_space_map *sm, dm_block_t *count)
163{
164 struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm);
165
166 *count = smm->old_ll.nr_blocks - smm->old_ll.nr_allocated -
167 smm->allocated_this_transaction;
168
169 return 0;
170}
171
172static int sm_metadata_get_count(struct dm_space_map *sm, dm_block_t b,
173 uint32_t *result)
174{
175 int r, i;
176 struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm);
177 unsigned adjustment = 0;
178
179 /*
180 * We may have some uncommitted adjustments to add. This list
181 * should always be really short.
182 */
183 for (i = 0; i < smm->nr_uncommitted; i++) {
184 struct block_op *op = smm->uncommitted + i;
185
186 if (op->block != b)
187 continue;
188
189 switch (op->type) {
190 case BOP_INC:
191 adjustment++;
192 break;
193
194 case BOP_DEC:
195 adjustment--;
196 break;
197 }
198 }
199
200 r = sm_ll_lookup(&smm->ll, b, result);
201 if (r)
202 return r;
203
204 *result += adjustment;
205
206 return 0;
207}
208
209static int sm_metadata_count_is_more_than_one(struct dm_space_map *sm,
210 dm_block_t b, int *result)
211{
212 int r, i, adjustment = 0;
213 struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm);
214 uint32_t rc;
215
216 /*
217 * We may have some uncommitted adjustments to add. This list
218 * should always be really short.
219 */
220 for (i = 0; i < smm->nr_uncommitted; i++) {
221 struct block_op *op = smm->uncommitted + i;
222
223 if (op->block != b)
224 continue;
225
226 switch (op->type) {
227 case BOP_INC:
228 adjustment++;
229 break;
230
231 case BOP_DEC:
232 adjustment--;
233 break;
234 }
235 }
236
237 if (adjustment > 1) {
238 *result = 1;
239 return 0;
240 }
241
242 r = sm_ll_lookup_bitmap(&smm->ll, b, &rc);
243 if (r)
244 return r;
245
246 if (rc == 3)
 247		/*
 248		 * A bitmap value of 3 means the real count is in the ref count
 249		 * tree; we err on the side of caution and always return true.
 250		 */
250 *result = 1;
251 else
252 *result = rc + adjustment > 1;
253
254 return 0;
255}
256
257static int sm_metadata_set_count(struct dm_space_map *sm, dm_block_t b,
258 uint32_t count)
259{
260 int r, r2;
261 enum allocation_event ev;
262 struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm);
263
264 if (smm->recursion_count) {
265 DMERR("cannot recurse set_count()");
266 return -EINVAL;
267 }
268
269 in(smm);
270 r = sm_ll_insert(&smm->ll, b, count, &ev);
271 r2 = out(smm);
272
273 return combine_errors(r, r2);
274}
275
276static int sm_metadata_inc_block(struct dm_space_map *sm, dm_block_t b)
277{
278 int r, r2 = 0;
279 enum allocation_event ev;
280 struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm);
281
282 if (recursing(smm))
283 r = add_bop(smm, BOP_INC, b);
284 else {
285 in(smm);
286 r = sm_ll_inc(&smm->ll, b, &ev);
287 r2 = out(smm);
288 }
289
290 return combine_errors(r, r2);
291}
292
293static int sm_metadata_dec_block(struct dm_space_map *sm, dm_block_t b)
294{
295 int r, r2 = 0;
296 enum allocation_event ev;
297 struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm);
298
299 if (recursing(smm))
300 r = add_bop(smm, BOP_DEC, b);
301 else {
302 in(smm);
303 r = sm_ll_dec(&smm->ll, b, &ev);
304 r2 = out(smm);
305 }
306
307 return combine_errors(r, r2);
308}
309
310static int sm_metadata_new_block_(struct dm_space_map *sm, dm_block_t *b)
311{
312 int r, r2 = 0;
313 enum allocation_event ev;
314 struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm);
315
316 r = sm_ll_find_free_block(&smm->old_ll, smm->begin, smm->old_ll.nr_blocks, b);
317 if (r)
318 return r;
319
320 smm->begin = *b + 1;
321
322 if (recursing(smm))
323 r = add_bop(smm, BOP_INC, *b);
324 else {
325 in(smm);
326 r = sm_ll_inc(&smm->ll, *b, &ev);
327 r2 = out(smm);
328 }
329
330 if (!r)
331 smm->allocated_this_transaction++;
332
333 return combine_errors(r, r2);
334}
335
336static int sm_metadata_new_block(struct dm_space_map *sm, dm_block_t *b)
337{
338 int r = sm_metadata_new_block_(sm, b);
339 if (r)
340 DMERR("out of metadata space");
341 return r;
342}
343
344static int sm_metadata_commit(struct dm_space_map *sm)
345{
346 int r;
347 struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm);
348
349 r = sm_ll_commit(&smm->ll);
350 if (r)
351 return r;
352
353 memcpy(&smm->old_ll, &smm->ll, sizeof(smm->old_ll));
354 smm->begin = 0;
355 smm->allocated_this_transaction = 0;
356
357 return 0;
358}
359
360static int sm_metadata_root_size(struct dm_space_map *sm, size_t *result)
361{
362 *result = sizeof(struct disk_sm_root);
363
364 return 0;
365}
366
367static int sm_metadata_copy_root(struct dm_space_map *sm, void *where_le, size_t max)
368{
369 struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm);
370 struct disk_sm_root root_le;
371
372 root_le.nr_blocks = cpu_to_le64(smm->ll.nr_blocks);
373 root_le.nr_allocated = cpu_to_le64(smm->ll.nr_allocated);
374 root_le.bitmap_root = cpu_to_le64(smm->ll.bitmap_root);
375 root_le.ref_count_root = cpu_to_le64(smm->ll.ref_count_root);
376
377 if (max < sizeof(root_le))
378 return -ENOSPC;
379
380 memcpy(where_le, &root_le, sizeof(root_le));
381
382 return 0;
383}
384
385static struct dm_space_map ops = {
386 .destroy = sm_metadata_destroy,
387 .extend = sm_metadata_extend,
388 .get_nr_blocks = sm_metadata_get_nr_blocks,
389 .get_nr_free = sm_metadata_get_nr_free,
390 .get_count = sm_metadata_get_count,
391 .count_is_more_than_one = sm_metadata_count_is_more_than_one,
392 .set_count = sm_metadata_set_count,
393 .inc_block = sm_metadata_inc_block,
394 .dec_block = sm_metadata_dec_block,
395 .new_block = sm_metadata_new_block,
396 .commit = sm_metadata_commit,
397 .root_size = sm_metadata_root_size,
398 .copy_root = sm_metadata_copy_root
399};
400
401/*----------------------------------------------------------------*/
402
403/*
 404 * When a new space map is created that manages its own space, we use
405 * this tiny bootstrap allocator.
406 */
407static void sm_bootstrap_destroy(struct dm_space_map *sm)
408{
409}
410
411static int sm_bootstrap_extend(struct dm_space_map *sm, dm_block_t extra_blocks)
412{
413 DMERR("boostrap doesn't support extend");
414
415 return -EINVAL;
416}
417
418static int sm_bootstrap_get_nr_blocks(struct dm_space_map *sm, dm_block_t *count)
419{
420 struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm);
421
 422	*count = smm->ll.nr_blocks;
 423
 424	return 0;
 425}
424
425static int sm_bootstrap_get_nr_free(struct dm_space_map *sm, dm_block_t *count)
426{
427 struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm);
428
429 *count = smm->ll.nr_blocks - smm->begin;
430
431 return 0;
432}
433
434static int sm_bootstrap_get_count(struct dm_space_map *sm, dm_block_t b,
435 uint32_t *result)
436{
437 struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm);
438
 439	*result = (b < smm->begin) ? 1 : 0;
 440
 441	return 0;
 442}
441
442static int sm_bootstrap_count_is_more_than_one(struct dm_space_map *sm,
443 dm_block_t b, int *result)
444{
445 *result = 0;
446
447 return 0;
448}
449
450static int sm_bootstrap_set_count(struct dm_space_map *sm, dm_block_t b,
451 uint32_t count)
452{
453 DMERR("boostrap doesn't support set_count");
454
455 return -EINVAL;
456}
457
458static int sm_bootstrap_new_block(struct dm_space_map *sm, dm_block_t *b)
459{
460 struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm);
461
462 /*
463 * We know the entire device is unused.
464 */
465 if (smm->begin == smm->ll.nr_blocks)
466 return -ENOSPC;
467
468 *b = smm->begin++;
469
470 return 0;
471}
472
473static int sm_bootstrap_inc_block(struct dm_space_map *sm, dm_block_t b)
474{
475 struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm);
476
477 return add_bop(smm, BOP_INC, b);
478}
479
480static int sm_bootstrap_dec_block(struct dm_space_map *sm, dm_block_t b)
481{
482 struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm);
483
484 return add_bop(smm, BOP_DEC, b);
485}
486
487static int sm_bootstrap_commit(struct dm_space_map *sm)
488{
489 return 0;
490}
491
492static int sm_bootstrap_root_size(struct dm_space_map *sm, size_t *result)
493{
494 DMERR("boostrap doesn't support root_size");
495
496 return -EINVAL;
497}
498
499static int sm_bootstrap_copy_root(struct dm_space_map *sm, void *where,
500 size_t max)
501{
502 DMERR("boostrap doesn't support copy_root");
503
504 return -EINVAL;
505}
506
507static struct dm_space_map bootstrap_ops = {
508 .destroy = sm_bootstrap_destroy,
509 .extend = sm_bootstrap_extend,
510 .get_nr_blocks = sm_bootstrap_get_nr_blocks,
511 .get_nr_free = sm_bootstrap_get_nr_free,
512 .get_count = sm_bootstrap_get_count,
513 .count_is_more_than_one = sm_bootstrap_count_is_more_than_one,
514 .set_count = sm_bootstrap_set_count,
515 .inc_block = sm_bootstrap_inc_block,
516 .dec_block = sm_bootstrap_dec_block,
517 .new_block = sm_bootstrap_new_block,
518 .commit = sm_bootstrap_commit,
519 .root_size = sm_bootstrap_root_size,
520 .copy_root = sm_bootstrap_copy_root
521};
522
523/*----------------------------------------------------------------*/
524
525struct dm_space_map *dm_sm_metadata_init(void)
526{
527 struct sm_metadata *smm;
528
529 smm = kmalloc(sizeof(*smm), GFP_KERNEL);
530 if (!smm)
531 return ERR_PTR(-ENOMEM);
532
533 memcpy(&smm->sm, &ops, sizeof(smm->sm));
534
535 return &smm->sm;
536}
537
538int dm_sm_metadata_create(struct dm_space_map *sm,
539 struct dm_transaction_manager *tm,
540 dm_block_t nr_blocks,
541 dm_block_t superblock)
542{
543 int r;
544 dm_block_t i;
545 enum allocation_event ev;
546 struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm);
547
548 smm->begin = superblock + 1;
549 smm->recursion_count = 0;
550 smm->allocated_this_transaction = 0;
551 smm->nr_uncommitted = 0;
552
553 memcpy(&smm->sm, &bootstrap_ops, sizeof(smm->sm));
554
555 r = sm_ll_new_metadata(&smm->ll, tm);
556 if (r)
557 return r;
558
559 r = sm_ll_extend(&smm->ll, nr_blocks);
560 if (r)
561 return r;
562
563 memcpy(&smm->sm, &ops, sizeof(smm->sm));
564
565 /*
566 * Now we need to update the newly created data structures with the
567 * allocated blocks that they were built from.
568 */
569 for (i = superblock; !r && i < smm->begin; i++)
570 r = sm_ll_inc(&smm->ll, i, &ev);
571
572 if (r)
573 return r;
574
575 return sm_metadata_commit(sm);
576}
577
578int dm_sm_metadata_open(struct dm_space_map *sm,
579 struct dm_transaction_manager *tm,
580 void *root_le, size_t len)
581{
582 int r;
583 struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm);
584
585 r = sm_ll_open_metadata(&smm->ll, tm, root_le, len);
586 if (r)
587 return r;
588
589 smm->begin = 0;
590 smm->recursion_count = 0;
591 smm->allocated_this_transaction = 0;
592 smm->nr_uncommitted = 0;
593
594 memcpy(&smm->old_ll, &smm->ll, sizeof(smm->old_ll));
595 return 0;
596}
diff --git a/drivers/md/persistent-data/dm-space-map-metadata.h b/drivers/md/persistent-data/dm-space-map-metadata.h
new file mode 100644
index 000000000000..39bba0801cf2
--- /dev/null
+++ b/drivers/md/persistent-data/dm-space-map-metadata.h
@@ -0,0 +1,33 @@
1/*
2 * Copyright (C) 2011 Red Hat, Inc.
3 *
4 * This file is released under the GPL.
5 */
6
7#ifndef DM_SPACE_MAP_METADATA_H
8#define DM_SPACE_MAP_METADATA_H
9
10#include "dm-transaction-manager.h"
11
12/*
13 * Unfortunately we have to use two-phase construction due to the cycle
14 * between the tm and sm.
15 */
16struct dm_space_map *dm_sm_metadata_init(void);
17
18/*
19 * Create a fresh space map.
20 */
21int dm_sm_metadata_create(struct dm_space_map *sm,
22 struct dm_transaction_manager *tm,
23 dm_block_t nr_blocks,
24 dm_block_t superblock);
25
26/*
27 * Open from a previously-recorded root.
28 */
29int dm_sm_metadata_open(struct dm_space_map *sm,
30 struct dm_transaction_manager *tm,
31 void *root_le, size_t len);
32
33#endif /* DM_SPACE_MAP_METADATA_H */
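
A sketch of the two-phase construction described above (tm creation happens in between, as dm_tm_create_internal() in dm-transaction-manager.c does; error handling elided):

	struct dm_space_map *sm;
	int r;

	sm = dm_sm_metadata_init();		/* phase 1: no tm yet */
	if (IS_ERR(sm))
		return PTR_ERR(sm);

	/* ... create the tm, passing it this sm ... */

	r = dm_sm_metadata_create(sm, tm, nr_blocks, superblock);	/* phase 2 */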
diff --git a/drivers/md/persistent-data/dm-space-map.h b/drivers/md/persistent-data/dm-space-map.h
new file mode 100644
index 000000000000..1cbfc6b1638a
--- /dev/null
+++ b/drivers/md/persistent-data/dm-space-map.h
@@ -0,0 +1,134 @@
1/*
2 * Copyright (C) 2011 Red Hat, Inc.
3 *
4 * This file is released under the GPL.
5 */
6
7#ifndef _LINUX_DM_SPACE_MAP_H
8#define _LINUX_DM_SPACE_MAP_H
9
10#include "dm-block-manager.h"
11
12/*
13 * struct dm_space_map keeps a record of how many times each block in a device
14 * is referenced. It needs to be fixed on disk as part of the transaction.
15 */
16struct dm_space_map {
17 void (*destroy)(struct dm_space_map *sm);
18
19 /*
20 * You must commit before allocating the newly added space.
21 */
22 int (*extend)(struct dm_space_map *sm, dm_block_t extra_blocks);
23
24 /*
25 * Extensions do not appear in this count until after commit has
26 * been called.
27 */
28 int (*get_nr_blocks)(struct dm_space_map *sm, dm_block_t *count);
29
30 /*
31 * Space maps must never allocate a block from the previous
32 * transaction, in case we need to rollback. This complicates the
33 * semantics of get_nr_free(), it should return the number of blocks
34 * that are available for allocation _now_. For instance you may
35 * have blocks with a zero reference count that will not be
36 * available for allocation until after the next commit.
37 */
38 int (*get_nr_free)(struct dm_space_map *sm, dm_block_t *count);
39
40 int (*get_count)(struct dm_space_map *sm, dm_block_t b, uint32_t *result);
41 int (*count_is_more_than_one)(struct dm_space_map *sm, dm_block_t b,
42 int *result);
43 int (*set_count)(struct dm_space_map *sm, dm_block_t b, uint32_t count);
44
45 int (*commit)(struct dm_space_map *sm);
46
47 int (*inc_block)(struct dm_space_map *sm, dm_block_t b);
48 int (*dec_block)(struct dm_space_map *sm, dm_block_t b);
49
50 /*
51 * new_block will increment the returned block.
52 */
53 int (*new_block)(struct dm_space_map *sm, dm_block_t *b);
54
55 /*
56 * The root contains all the information needed to fix the space map.
57 * Generally this info is small, so squirrel it away in a disk block
58 * along with other info.
59 */
60 int (*root_size)(struct dm_space_map *sm, size_t *result);
61 int (*copy_root)(struct dm_space_map *sm, void *copy_to_here_le, size_t len);
62};
63
64/*----------------------------------------------------------------*/
65
66static inline void dm_sm_destroy(struct dm_space_map *sm)
67{
68 sm->destroy(sm);
69}
70
71static inline int dm_sm_extend(struct dm_space_map *sm, dm_block_t extra_blocks)
72{
73 return sm->extend(sm, extra_blocks);
74}
75
76static inline int dm_sm_get_nr_blocks(struct dm_space_map *sm, dm_block_t *count)
77{
78 return sm->get_nr_blocks(sm, count);
79}
80
81static inline int dm_sm_get_nr_free(struct dm_space_map *sm, dm_block_t *count)
82{
83 return sm->get_nr_free(sm, count);
84}
85
86static inline int dm_sm_get_count(struct dm_space_map *sm, dm_block_t b,
87 uint32_t *result)
88{
89 return sm->get_count(sm, b, result);
90}
91
92static inline int dm_sm_count_is_more_than_one(struct dm_space_map *sm,
93 dm_block_t b, int *result)
94{
95 return sm->count_is_more_than_one(sm, b, result);
96}
97
98static inline int dm_sm_set_count(struct dm_space_map *sm, dm_block_t b,
99 uint32_t count)
100{
101 return sm->set_count(sm, b, count);
102}
103
104static inline int dm_sm_commit(struct dm_space_map *sm)
105{
106 return sm->commit(sm);
107}
108
109static inline int dm_sm_inc_block(struct dm_space_map *sm, dm_block_t b)
110{
111 return sm->inc_block(sm, b);
112}
113
114static inline int dm_sm_dec_block(struct dm_space_map *sm, dm_block_t b)
115{
116 return sm->dec_block(sm, b);
117}
118
119static inline int dm_sm_new_block(struct dm_space_map *sm, dm_block_t *b)
120{
121 return sm->new_block(sm, b);
122}
123
124static inline int dm_sm_root_size(struct dm_space_map *sm, size_t *result)
125{
126 return sm->root_size(sm, result);
127}
128
129static inline int dm_sm_copy_root(struct dm_space_map *sm, void *copy_to_here_le, size_t len)
130{
131 return sm->copy_root(sm, copy_to_here_le, len);
132}
133
134#endif /* _LINUX_DM_SPACE_MAP_H */
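
To illustrate the root_size()/copy_root() contract, a sketch of squirrelling a space map root away in a superblock buffer (sb_data and max are caller-supplied; this mirrors how a target would embed the root):

	static int save_sm_root(struct dm_space_map *sm, void *sb_data, size_t max)
	{
		int r;
		size_t len;

		r = dm_sm_root_size(sm, &len);
		if (r)
			return r;

		if (len > max)
			return -ENOSPC;

		return dm_sm_copy_root(sm, sb_data, max);
	}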
diff --git a/drivers/md/persistent-data/dm-transaction-manager.c b/drivers/md/persistent-data/dm-transaction-manager.c
new file mode 100644
index 000000000000..728e89a3f978
--- /dev/null
+++ b/drivers/md/persistent-data/dm-transaction-manager.c
@@ -0,0 +1,400 @@
1/*
2 * Copyright (C) 2011 Red Hat, Inc.
3 *
4 * This file is released under the GPL.
5 */
6#include "dm-transaction-manager.h"
7#include "dm-space-map.h"
8#include "dm-space-map-checker.h"
9#include "dm-space-map-disk.h"
10#include "dm-space-map-metadata.h"
11#include "dm-persistent-data-internal.h"
12
13#include <linux/module.h>
14#include <linux/slab.h>
15#include <linux/device-mapper.h>
16
17#define DM_MSG_PREFIX "transaction manager"
18
19/*----------------------------------------------------------------*/
20
21struct shadow_info {
22 struct hlist_node hlist;
23 dm_block_t where;
24};
25
26/*
 27 * It would be nice if we scaled with the size of the transaction.
28 */
29#define HASH_SIZE 256
30#define HASH_MASK (HASH_SIZE - 1)
31
32struct dm_transaction_manager {
33 int is_clone;
34 struct dm_transaction_manager *real;
35
36 struct dm_block_manager *bm;
37 struct dm_space_map *sm;
38
39 spinlock_t lock;
40 struct hlist_head buckets[HASH_SIZE];
41};
42
43/*----------------------------------------------------------------*/
44
45static int is_shadow(struct dm_transaction_manager *tm, dm_block_t b)
46{
47 int r = 0;
48 unsigned bucket = dm_hash_block(b, HASH_MASK);
49 struct shadow_info *si;
50 struct hlist_node *n;
51
52 spin_lock(&tm->lock);
53 hlist_for_each_entry(si, n, tm->buckets + bucket, hlist)
54 if (si->where == b) {
55 r = 1;
56 break;
57 }
58 spin_unlock(&tm->lock);
59
60 return r;
61}
62
63/*
64 * This can silently fail if there's no memory. We're ok with this since
65 * creating redundant shadows causes no harm.
66 */
67static void insert_shadow(struct dm_transaction_manager *tm, dm_block_t b)
68{
69 unsigned bucket;
70 struct shadow_info *si;
71
72 si = kmalloc(sizeof(*si), GFP_NOIO);
73 if (si) {
74 si->where = b;
75 bucket = dm_hash_block(b, HASH_MASK);
76 spin_lock(&tm->lock);
77 hlist_add_head(&si->hlist, tm->buckets + bucket);
78 spin_unlock(&tm->lock);
79 }
80}
81
82static void wipe_shadow_table(struct dm_transaction_manager *tm)
83{
84 struct shadow_info *si;
85 struct hlist_node *n, *tmp;
86 struct hlist_head *bucket;
87 int i;
88
89 spin_lock(&tm->lock);
90 for (i = 0; i < HASH_SIZE; i++) {
91 bucket = tm->buckets + i;
92 hlist_for_each_entry_safe(si, n, tmp, bucket, hlist)
93 kfree(si);
94
95 INIT_HLIST_HEAD(bucket);
96 }
97
98 spin_unlock(&tm->lock);
99}
100
101/*----------------------------------------------------------------*/
102
103static struct dm_transaction_manager *dm_tm_create(struct dm_block_manager *bm,
104 struct dm_space_map *sm)
105{
106 int i;
107 struct dm_transaction_manager *tm;
108
109 tm = kmalloc(sizeof(*tm), GFP_KERNEL);
110 if (!tm)
111 return ERR_PTR(-ENOMEM);
112
113 tm->is_clone = 0;
114 tm->real = NULL;
115 tm->bm = bm;
116 tm->sm = sm;
117
118 spin_lock_init(&tm->lock);
119 for (i = 0; i < HASH_SIZE; i++)
120 INIT_HLIST_HEAD(tm->buckets + i);
121
122 return tm;
123}
124
125struct dm_transaction_manager *dm_tm_create_non_blocking_clone(struct dm_transaction_manager *real)
126{
127 struct dm_transaction_manager *tm;
128
129 tm = kmalloc(sizeof(*tm), GFP_KERNEL);
130 if (tm) {
131 tm->is_clone = 1;
132 tm->real = real;
133 }
134
135 return tm;
136}
137EXPORT_SYMBOL_GPL(dm_tm_create_non_blocking_clone);
138
139void dm_tm_destroy(struct dm_transaction_manager *tm)
140{
141 kfree(tm);
142}
143EXPORT_SYMBOL_GPL(dm_tm_destroy);
144
145int dm_tm_pre_commit(struct dm_transaction_manager *tm)
146{
147 int r;
148
149 if (tm->is_clone)
150 return -EWOULDBLOCK;
151
152 r = dm_sm_commit(tm->sm);
153 if (r < 0)
154 return r;
155
156 return 0;
157}
158EXPORT_SYMBOL_GPL(dm_tm_pre_commit);
159
160int dm_tm_commit(struct dm_transaction_manager *tm, struct dm_block *root)
161{
162 if (tm->is_clone)
163 return -EWOULDBLOCK;
164
165 wipe_shadow_table(tm);
166
167 return dm_bm_flush_and_unlock(tm->bm, root);
168}
169EXPORT_SYMBOL_GPL(dm_tm_commit);
170
171int dm_tm_new_block(struct dm_transaction_manager *tm,
172 struct dm_block_validator *v,
173 struct dm_block **result)
174{
175 int r;
176 dm_block_t new_block;
177
178 if (tm->is_clone)
179 return -EWOULDBLOCK;
180
181 r = dm_sm_new_block(tm->sm, &new_block);
182 if (r < 0)
183 return r;
184
185 r = dm_bm_write_lock_zero(tm->bm, new_block, v, result);
186 if (r < 0) {
187 dm_sm_dec_block(tm->sm, new_block);
188 return r;
189 }
190
191 /*
192 * New blocks count as shadows in that they don't need to be
193 * shadowed again.
194 */
195 insert_shadow(tm, new_block);
196
197 return 0;
198}
199
200static int __shadow_block(struct dm_transaction_manager *tm, dm_block_t orig,
201 struct dm_block_validator *v,
202 struct dm_block **result)
203{
204 int r;
205 dm_block_t new;
206 struct dm_block *orig_block;
207
208 r = dm_sm_new_block(tm->sm, &new);
209 if (r < 0)
210 return r;
211
212 r = dm_sm_dec_block(tm->sm, orig);
213 if (r < 0)
214 return r;
215
216 r = dm_bm_read_lock(tm->bm, orig, v, &orig_block);
217 if (r < 0)
218 return r;
219
220 r = dm_bm_unlock_move(orig_block, new);
221 if (r < 0) {
222 dm_bm_unlock(orig_block);
223 return r;
224 }
225
226 return dm_bm_write_lock(tm->bm, new, v, result);
227}
228
229int dm_tm_shadow_block(struct dm_transaction_manager *tm, dm_block_t orig,
230 struct dm_block_validator *v, struct dm_block **result,
231 int *inc_children)
232{
233 int r;
234
235 if (tm->is_clone)
236 return -EWOULDBLOCK;
237
238 r = dm_sm_count_is_more_than_one(tm->sm, orig, inc_children);
239 if (r < 0)
240 return r;
241
242 if (is_shadow(tm, orig) && !*inc_children)
243 return dm_bm_write_lock(tm->bm, orig, v, result);
244
245 r = __shadow_block(tm, orig, v, result);
246 if (r < 0)
247 return r;
248 insert_shadow(tm, dm_block_location(*result));
249
250 return r;
251}
252
253int dm_tm_read_lock(struct dm_transaction_manager *tm, dm_block_t b,
254 struct dm_block_validator *v,
255 struct dm_block **blk)
256{
257 if (tm->is_clone)
258 return dm_bm_read_try_lock(tm->real->bm, b, v, blk);
259
260 return dm_bm_read_lock(tm->bm, b, v, blk);
261}
262
263int dm_tm_unlock(struct dm_transaction_manager *tm, struct dm_block *b)
264{
265 return dm_bm_unlock(b);
266}
267EXPORT_SYMBOL_GPL(dm_tm_unlock);
268
269void dm_tm_inc(struct dm_transaction_manager *tm, dm_block_t b)
270{
271 /*
272 * The non-blocking clone doesn't support this.
273 */
274 BUG_ON(tm->is_clone);
275
276 dm_sm_inc_block(tm->sm, b);
277}
278EXPORT_SYMBOL_GPL(dm_tm_inc);
279
280void dm_tm_dec(struct dm_transaction_manager *tm, dm_block_t b)
281{
282 /*
283 * The non-blocking clone doesn't support this.
284 */
285 BUG_ON(tm->is_clone);
286
287 dm_sm_dec_block(tm->sm, b);
288}
289EXPORT_SYMBOL_GPL(dm_tm_dec);
290
291int dm_tm_ref(struct dm_transaction_manager *tm, dm_block_t b,
292 uint32_t *result)
293{
294 if (tm->is_clone)
295 return -EWOULDBLOCK;
296
297 return dm_sm_get_count(tm->sm, b, result);
298}
299
300struct dm_block_manager *dm_tm_get_bm(struct dm_transaction_manager *tm)
301{
302 return tm->bm;
303}
304
305/*----------------------------------------------------------------*/
306
307static int dm_tm_create_internal(struct dm_block_manager *bm,
308 dm_block_t sb_location,
309 struct dm_block_validator *sb_validator,
310 size_t root_offset, size_t root_max_len,
311 struct dm_transaction_manager **tm,
312 struct dm_space_map **sm,
313 struct dm_block **sblock,
314 int create)
315{
316 int r;
317 struct dm_space_map *inner;
318
319 inner = dm_sm_metadata_init();
320 if (IS_ERR(inner))
321 return PTR_ERR(inner);
322
323 *tm = dm_tm_create(bm, inner);
324 if (IS_ERR(*tm)) {
325 dm_sm_destroy(inner);
326 return PTR_ERR(*tm);
327 }
328
329 if (create) {
330 r = dm_bm_write_lock_zero(dm_tm_get_bm(*tm), sb_location,
331 sb_validator, sblock);
332 if (r < 0) {
333 DMERR("couldn't lock superblock");
334 goto bad1;
335 }
336
337 r = dm_sm_metadata_create(inner, *tm, dm_bm_nr_blocks(bm),
338 sb_location);
339 if (r) {
340 DMERR("couldn't create metadata space map");
341 goto bad2;
342 }
343
344 *sm = dm_sm_checker_create(inner);
345 if (!*sm)
346 goto bad2;
347
348 } else {
349 r = dm_bm_write_lock(dm_tm_get_bm(*tm), sb_location,
350 sb_validator, sblock);
351 if (r < 0) {
352 DMERR("couldn't lock superblock");
353 goto bad1;
354 }
355
356 r = dm_sm_metadata_open(inner, *tm,
357 dm_block_data(*sblock) + root_offset,
358 root_max_len);
359 if (r) {
360 DMERR("couldn't open metadata space map");
361 goto bad2;
362 }
363
364 *sm = dm_sm_checker_create(inner);
365 if (!*sm)
366 goto bad2;
367 }
368
369 return 0;
370
371bad2:
372 dm_tm_unlock(*tm, *sblock);
373bad1:
374 dm_tm_destroy(*tm);
375 dm_sm_destroy(inner);
376 return r;
377}
378
379int dm_tm_create_with_sm(struct dm_block_manager *bm, dm_block_t sb_location,
380 struct dm_block_validator *sb_validator,
381 struct dm_transaction_manager **tm,
382 struct dm_space_map **sm, struct dm_block **sblock)
383{
384 return dm_tm_create_internal(bm, sb_location, sb_validator,
385 0, 0, tm, sm, sblock, 1);
386}
387EXPORT_SYMBOL_GPL(dm_tm_create_with_sm);
388
389int dm_tm_open_with_sm(struct dm_block_manager *bm, dm_block_t sb_location,
390 struct dm_block_validator *sb_validator,
391 size_t root_offset, size_t root_max_len,
392 struct dm_transaction_manager **tm,
393 struct dm_space_map **sm, struct dm_block **sblock)
394{
395 return dm_tm_create_internal(bm, sb_location, sb_validator, root_offset,
396 root_max_len, tm, sm, sblock, 0);
397}
398EXPORT_SYMBOL_GPL(dm_tm_open_with_sm);
399
400/*----------------------------------------------------------------*/
diff --git a/drivers/md/persistent-data/dm-transaction-manager.h b/drivers/md/persistent-data/dm-transaction-manager.h
new file mode 100644
index 000000000000..6da784871db4
--- /dev/null
+++ b/drivers/md/persistent-data/dm-transaction-manager.h
@@ -0,0 +1,130 @@
1/*
2 * Copyright (C) 2011 Red Hat, Inc.
3 *
4 * This file is released under the GPL.
5 */
6
7#ifndef _LINUX_DM_TRANSACTION_MANAGER_H
8#define _LINUX_DM_TRANSACTION_MANAGER_H
9
10#include "dm-block-manager.h"
11
12struct dm_transaction_manager;
13struct dm_space_map;
14
15/*----------------------------------------------------------------*/
16
17/*
18 * This manages the scope of a transaction. It also enforces immutability
19 * of the on-disk data structures by limiting access to writeable blocks.
20 *
21 * Clients should not fiddle with the block manager directly.
22 */
23
24void dm_tm_destroy(struct dm_transaction_manager *tm);
25
26/*
27 * The non-blocking version of a transaction manager is intended for use in
28 * fast path code that needs to do lookups e.g. a dm mapping function.
29 * You create the non-blocking variant from a normal tm. The interface is
30 * the same, except that most functions will just return -EWOULDBLOCK.
 31 * Methods that return void yet may block (viz. dm_tm_inc and dm_tm_dec)
 32 * must not be called on a clone.  Call dm_tm_destroy() as you would with
 33 * a normal tm when you've finished with it.  You must not destroy the
 34 * original tm before destroying its clones.
35 */
36struct dm_transaction_manager *dm_tm_create_non_blocking_clone(struct dm_transaction_manager *real);
37
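/*
 * For example, fast path lookup code might do (a sketch only):
 *
 *	nb_tm = dm_tm_create_non_blocking_clone(tm);
 *	if (!nb_tm)
 *		return -ENOMEM;
 *
 *	r = dm_tm_read_lock(nb_tm, b, v, &blk);    uses dm_bm_read_try_lock()
 *	if (r == -EWOULDBLOCK)
 *		... defer to a worker that uses the real tm ...
 *	else if (!r) {
 *		... use dm_block_data(blk) ...
 *		dm_tm_unlock(nb_tm, blk);
 *	}
 *
 *	dm_tm_destroy(nb_tm);
 */
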
38/*
39 * We use a 2-phase commit here.
40 *
41 * i) In the first phase the block manager is told to start flushing, and
42 * the changes to the space map are written to disk. You should interrogate
43 * your particular space map to get detail of its root node etc. to be
44 * included in your superblock.
45 *
46 * ii) @root will be committed last. You shouldn't use more than the
47 * first 512 bytes of @root if you wish the transaction to survive a power
48 * failure. You *must* have a write lock held on @root for both stage (i)
49 * and (ii). The commit will drop the write lock.
50 */
51int dm_tm_pre_commit(struct dm_transaction_manager *tm);
52int dm_tm_commit(struct dm_transaction_manager *tm, struct dm_block *root);
53
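/*
 * Schematically (a sketch; @sblock is the write-locked superblock and
 * root_offset is a caller-chosen offset within it):
 *
 *	r = dm_tm_pre_commit(tm);			stage (i)
 *	if (r)
 *		return r;
 *
 *	r = dm_sm_copy_root(sm, dm_block_data(sblock) + root_offset,
 *			    root_max_len);
 *	if (r)
 *		return r;
 *
 *	return dm_tm_commit(tm, sblock);		stage (ii), drops the lock
 */
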
54/*
55 * These methods are the only way to get hold of a writeable block.
56 */
57
58/*
 59 * dm_tm_new_block() zeroes the new block and returns it with a write
 60 * lock held.  Make sure you do actually write to the whole of @data
 61 * before you unlock, otherwise you could get a data leak.  (Zeroing
 62 * new blocks before handing them out guards against this, though it
 63 * will be redundant in most, if not all, cases.)
65 */
66int dm_tm_new_block(struct dm_transaction_manager *tm,
67 struct dm_block_validator *v,
68 struct dm_block **result);
69
70/*
71 * dm_tm_shadow_block() allocates a new block and copies the data from @orig
 72 * to it. It then decrements the reference count on the original block.
 73 * Use this to update the contents of a block in a data structure; don't
 74 * confuse this with a clone - you shouldn't access the orig block after
75 * this operation. Because the tm knows the scope of the transaction it
76 * can optimise requests for a shadow of a shadow to a no-op. Don't forget
77 * to unlock when you've finished with the shadow.
78 *
79 * The @inc_children flag is used to tell the caller whether it needs to
80 * adjust reference counts for children. (Data in the block may refer to
81 * other blocks.)
82 *
83 * Shadowing implicitly drops a reference on @orig so you must not have
84 * it locked when you call this.
85 */
86int dm_tm_shadow_block(struct dm_transaction_manager *tm, dm_block_t orig,
87 struct dm_block_validator *v,
88 struct dm_block **result, int *inc_children);
89
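/*
 * Typical use (a sketch, error handling elided):
 *
 *	r = dm_tm_shadow_block(tm, orig, v, &blk, &inc_children);
 *	if (r < 0)
 *		return r;
 *
 *	if (inc_children)
 *		... dm_tm_inc() the blocks this one references ...
 *
 *	... modify dm_block_data(blk); record dm_block_location(blk)
 *	    in the parent in place of orig ...
 *
 *	dm_tm_unlock(tm, blk);
 */
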
90/*
91 * Read access. You can lock any block you want. If there's a write lock
92 * on it outstanding then it'll block.
93 */
94int dm_tm_read_lock(struct dm_transaction_manager *tm, dm_block_t b,
95 struct dm_block_validator *v,
96 struct dm_block **result);
97
98int dm_tm_unlock(struct dm_transaction_manager *tm, struct dm_block *b);
99
100/*
101 * Functions for altering the reference count of a block directly.
102 */
103void dm_tm_inc(struct dm_transaction_manager *tm, dm_block_t b);
104
105void dm_tm_dec(struct dm_transaction_manager *tm, dm_block_t b);
106
107int dm_tm_ref(struct dm_transaction_manager *tm, dm_block_t b,
108 uint32_t *result);
109
110struct dm_block_manager *dm_tm_get_bm(struct dm_transaction_manager *tm);
111
112/*
113 * A little utility that ties the knot by producing a transaction manager
114 * that has a space map managed by the transaction manager...
115 *
116 * Returns a tm that has an open transaction to write the new disk sm.
117 * Caller should store the new sm root and commit.
118 */
119int dm_tm_create_with_sm(struct dm_block_manager *bm, dm_block_t sb_location,
120 struct dm_block_validator *sb_validator,
121 struct dm_transaction_manager **tm,
122 struct dm_space_map **sm, struct dm_block **sblock);
123
124int dm_tm_open_with_sm(struct dm_block_manager *bm, dm_block_t sb_location,
125 struct dm_block_validator *sb_validator,
126 size_t root_offset, size_t root_max_len,
127 struct dm_transaction_manager **tm,
128 struct dm_space_map **sm, struct dm_block **sblock);
129
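/*
 * A sketch of creating a fresh metadata device with these helpers
 * (sb_location, sb_validator, root_offset and root_max_len are
 * caller-supplied; error handling elided):
 *
 *	r = dm_tm_create_with_sm(bm, sb_location, &sb_validator,
 *				 &tm, &sm, &sblock);
 *	if (r)
 *		return r;
 *
 *	... populate the superblock ...
 *
 *	r = dm_tm_pre_commit(tm);
 *	if (!r)
 *		r = dm_sm_copy_root(sm, dm_block_data(sblock) + root_offset,
 *				    root_max_len);
 *	if (!r)
 *		r = dm_tm_commit(tm, sblock);	drops the superblock lock
 */
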
130#endif /* _LINUX_DM_TRANSACTION_MANAGER_H */