-rw-r--r--Documentation/device-mapper/persistent-data.txt84
-rw-r--r--drivers/md/persistent-data/Kconfig8
-rw-r--r--drivers/md/persistent-data/Makefile11
-rw-r--r--drivers/md/persistent-data/dm-block-manager.c620
-rw-r--r--drivers/md/persistent-data/dm-block-manager.h123
-rw-r--r--drivers/md/persistent-data/dm-btree-internal.h137
-rw-r--r--drivers/md/persistent-data/dm-btree-remove.c566
-rw-r--r--drivers/md/persistent-data/dm-btree-spine.c244
-rw-r--r--drivers/md/persistent-data/dm-btree.c805
-rw-r--r--drivers/md/persistent-data/dm-btree.h145
-rw-r--r--drivers/md/persistent-data/dm-persistent-data-internal.h19
-rw-r--r--drivers/md/persistent-data/dm-space-map-checker.c437
-rw-r--r--drivers/md/persistent-data/dm-space-map-checker.h26
-rw-r--r--drivers/md/persistent-data/dm-space-map-common.c705
-rw-r--r--drivers/md/persistent-data/dm-space-map-common.h126
-rw-r--r--drivers/md/persistent-data/dm-space-map-disk.c335
-rw-r--r--drivers/md/persistent-data/dm-space-map-disk.h25
-rw-r--r--drivers/md/persistent-data/dm-space-map-metadata.c596
-rw-r--r--drivers/md/persistent-data/dm-space-map-metadata.h33
-rw-r--r--drivers/md/persistent-data/dm-space-map.h134
-rw-r--r--drivers/md/persistent-data/dm-transaction-manager.c400
-rw-r--r--drivers/md/persistent-data/dm-transaction-manager.h130
22 files changed, 5709 insertions, 0 deletions
diff --git a/Documentation/device-mapper/persistent-data.txt b/Documentation/device-mapper/persistent-data.txt
new file mode 100644
index 000000000000..0e5df9b04ad2
--- /dev/null
+++ b/Documentation/device-mapper/persistent-data.txt
@@ -0,0 +1,84 @@
1Introduction
2============
3
4The more-sophisticated device-mapper targets require complex metadata
5that is managed in kernel. In late 2010 we were seeing that various
6different targets were rolling their own data structures, for example:
7
8- Mikulas Patocka's multisnap implementation
9- Heinz Mauelshagen's thin provisioning target
10- Another btree-based caching target posted to dm-devel
11- Another multi-snapshot target based on a design of Daniel Phillips
12
13Maintaining these data structures takes a lot of work, so if possible
14we'd like to reduce the number.
15
16The persistent-data library is an attempt to provide a re-usable
17framework for people who want to store metadata in device-mapper
18targets. It's currently used by the thin-provisioning target and an
19upcoming hierarchical storage target.
20
21Overview
22========
23
24The main documentation is in the header files which can all be found
25under drivers/md/persistent-data.
26
27The block manager
28-----------------
29
30dm-block-manager.[hc]
31
32This provides access to the data on disk in fixed-sized blocks. There
33is a read/write locking interface to prevent concurrent accesses, and to
34keep data that is being used in the cache.
35
36Clients of persistent-data are unlikely to use this directly.
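If a client did use it directly, the pattern would be lock, use, unlock.
A minimal sketch against the declarations in dm-block-manager.h (no
validator is used, and do_something_with() and block_nr are just
placeholders):

	struct dm_block *b;
	int r;

	r = dm_bm_read_lock(bm, block_nr, NULL, &b);
	if (r < 0)
		return r;

	do_something_with(dm_block_data(b), dm_bm_block_size(bm));

	dm_bm_unlock(b);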
37
38The transaction manager
39-----------------------
40
41dm-transaction-manager.[hc]
42
43This restricts access to blocks and enforces copy-on-write semantics.
44The only way you can get hold of a writable block through the
45transaction manager is by shadowing an existing block (i.e. doing
46copy-on-write) or allocating a fresh one. Shadowing is elided within
47the same transaction so performance is reasonable. The commit method
48ensures that all data is flushed before it writes the superblock.
49On power failure your metadata will be as it was when last committed.
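A typical update within a transaction therefore looks something like the
sketch below, using the dm_tm_* calls that appear later in this patch
(root and my_validator are placeholders):

	struct dm_block *b;
	int inc, r;

	/* inc reports whether a fresh copy was made, in which case the
	   caller must increment the reference counts of any children. */
	r = dm_tm_shadow_block(tm, root, &my_validator, &b, &inc);
	if (r < 0)
		return r;

	/* ... modify dm_block_data(b) ... */

	root = dm_block_location(b);	/* the shadow may have moved */
	dm_tm_unlock(tm, b);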
50
51The Space Maps
52--------------
53
54dm-space-map.h
55dm-space-map-metadata.[hc]
56dm-space-map-disk.[hc]
57
58On-disk data structures that keep track of reference counts of blocks.
59They also act as the allocator of new blocks. There are currently two
60implementations: a simpler one for managing blocks on a different
61device (e.g. thinly-provisioned data blocks); and one for managing
62the metadata space. The latter is complicated by the need to store
63its own data within the space it's managing.
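The interface is essentially allocate-and-refcount. A rough sketch,
assuming the dm_sm_* wrappers declared in dm-space-map.h (not reproduced
in this excerpt):

	dm_block_t b;
	uint32_t count;
	int r;

	r = dm_sm_new_block(sm, &b);		/* allocate; count is 1 */
	r = dm_sm_inc_block(sm, b);		/* block becomes shared */
	r = dm_sm_get_count(sm, b, &count);	/* count is now 2 */
	r = dm_sm_dec_block(sm, b);		/* drop one reference */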
64
65The data structures
66-------------------
67
68dm-btree.[hc]
69dm-btree-remove.c
70dm-btree-spine.c
71dm-btree-internal.h
72
73Currently there is only one data structure, a hierarchical btree.
74There are plans to add more. For example, something with an
75array-like interface would see a lot of use.
76
77The btree is 'hierarchical' in that you can define it to be composed
78of nested btrees, and take multiple keys. For example, the
79thin-provisioning target uses a btree with two levels of nesting.
80The first maps a device id to a mapping tree, and that in turn maps a
81virtual block to a physical block.
82
83Values stored in the btrees can have arbitrary size. Keys are always
8464 bits, although nesting allows you to use multiple keys.
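For example, looking up a mapping in the thin-provisioning tree above is
a single call with one key per level (a sketch; the dm_btree_info is
assumed to have been set up with levels = 2 and a __le64 value type):

	uint64_t keys[2] = { dev_id, virtual_block };
	__le64 value_le;
	int r;

	r = dm_btree_lookup(&info, root, keys, &value_le);
	if (r == -ENODATA) {
		/* no mapping present for this virtual block */
	}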
diff --git a/drivers/md/persistent-data/Kconfig b/drivers/md/persistent-data/Kconfig
new file mode 100644
index 000000000000..ceb359050a59
--- /dev/null
+++ b/drivers/md/persistent-data/Kconfig
@@ -0,0 +1,8 @@
1config DM_PERSISTENT_DATA
2 tristate
3 depends on BLK_DEV_DM && EXPERIMENTAL
4 select LIBCRC32C
5 select DM_BUFIO
6 ---help---
7 Library providing immutable on-disk data structure support for
8 device-mapper targets such as the thin provisioning target.
diff --git a/drivers/md/persistent-data/Makefile b/drivers/md/persistent-data/Makefile
new file mode 100644
index 000000000000..cfa95f662230
--- /dev/null
+++ b/drivers/md/persistent-data/Makefile
@@ -0,0 +1,11 @@
1obj-$(CONFIG_DM_PERSISTENT_DATA) += dm-persistent-data.o
2dm-persistent-data-objs := \
3 dm-block-manager.o \
4 dm-space-map-checker.o \
5 dm-space-map-common.o \
6 dm-space-map-disk.o \
7 dm-space-map-metadata.o \
8 dm-transaction-manager.o \
9 dm-btree.o \
10 dm-btree-remove.o \
11 dm-btree-spine.o
diff --git a/drivers/md/persistent-data/dm-block-manager.c b/drivers/md/persistent-data/dm-block-manager.c
new file mode 100644
index 000000000000..0317ecdc6e53
--- /dev/null
+++ b/drivers/md/persistent-data/dm-block-manager.c
@@ -0,0 +1,620 @@
1/*
2 * Copyright (C) 2011 Red Hat, Inc.
3 *
4 * This file is released under the GPL.
5 */
6#include "dm-block-manager.h"
7#include "dm-persistent-data-internal.h"
8#include "../dm-bufio.h"
9
10#include <linux/crc32c.h>
11#include <linux/module.h>
12#include <linux/slab.h>
13#include <linux/rwsem.h>
14#include <linux/device-mapper.h>
15#include <linux/stacktrace.h>
16
17#define DM_MSG_PREFIX "block manager"
18
19/*----------------------------------------------------------------*/
20
21/*
22 * This is a read/write semaphore with a couple of differences.
23 *
24 * i) There is a restriction on the number of concurrent read locks that
25 * may be held at once. This is just an implementation detail.
26 *
27 * ii) Recursive locking attempts are detected and return EINVAL. A stack
28 * trace is also emitted for the previous lock acquisition.
29 *
30 * iii) Priority is given to write locks.
31 */
32#define MAX_HOLDERS 4
33#define MAX_STACK 10
34
35typedef unsigned long stack_entries[MAX_STACK];
36
37struct block_lock {
38 spinlock_t lock;
39 __s32 count;
40 struct list_head waiters;
41 struct task_struct *holders[MAX_HOLDERS];
42
43#ifdef CONFIG_DM_DEBUG_BLOCK_STACK_TRACING
44 struct stack_trace traces[MAX_HOLDERS];
45 stack_entries entries[MAX_HOLDERS];
46#endif
47};
48
49struct waiter {
50 struct list_head list;
51 struct task_struct *task;
52 int wants_write;
53};
54
55static unsigned __find_holder(struct block_lock *lock,
56 struct task_struct *task)
57{
58 unsigned i;
59
60 for (i = 0; i < MAX_HOLDERS; i++)
61 if (lock->holders[i] == task)
62 break;
63
64 BUG_ON(i == MAX_HOLDERS);
65 return i;
66}
67
68/* call this *after* you increment lock->count */
69static void __add_holder(struct block_lock *lock, struct task_struct *task)
70{
71 unsigned h = __find_holder(lock, NULL);
72#ifdef CONFIG_DM_DEBUG_BLOCK_STACK_TRACING
73 struct stack_trace *t;
74#endif
75
76 get_task_struct(task);
77 lock->holders[h] = task;
78
79#ifdef CONFIG_DM_DEBUG_BLOCK_STACK_TRACING
80 t = lock->traces + h;
81 t->nr_entries = 0;
82 t->max_entries = MAX_STACK;
83 t->entries = lock->entries[h];
84 t->skip = 2;
85 save_stack_trace(t);
86#endif
87}
88
89/* call this *before* you decrement lock->count */
90static void __del_holder(struct block_lock *lock, struct task_struct *task)
91{
92 unsigned h = __find_holder(lock, task);
93 lock->holders[h] = NULL;
94 put_task_struct(task);
95}
96
97static int __check_holder(struct block_lock *lock)
98{
99 unsigned i;
100#ifdef CONFIG_DM_DEBUG_BLOCK_STACK_TRACING
101 static struct stack_trace t;
102 static stack_entries entries;
103#endif
104
105 for (i = 0; i < MAX_HOLDERS; i++) {
106 if (lock->holders[i] == current) {
107 DMERR("recursive lock detected in pool metadata");
108#ifdef CONFIG_DM_DEBUG_BLOCK_STACK_TRACING
109 DMERR("previously held here:");
110 print_stack_trace(lock->traces + i, 4);
111
112 DMERR("subsequent acquisition attempted here:");
113 t.nr_entries = 0;
114 t.max_entries = MAX_STACK;
115 t.entries = entries;
116 t.skip = 3;
117 save_stack_trace(&t);
118 print_stack_trace(&t, 4);
119#endif
120 return -EINVAL;
121 }
122 }
123
124 return 0;
125}
126
127static void __wait(struct waiter *w)
128{
129 for (;;) {
130 set_task_state(current, TASK_UNINTERRUPTIBLE);
131
132 if (!w->task)
133 break;
134
135 schedule();
136 }
137
138 set_task_state(current, TASK_RUNNING);
139}
140
141static void __wake_waiter(struct waiter *w)
142{
143 struct task_struct *task;
144
145 list_del(&w->list);
146 task = w->task;
147 smp_mb();
148 w->task = NULL;
149 wake_up_process(task);
150}
151
152/*
153 * We either wake a few readers or a single writer.
154 */
155static void __wake_many(struct block_lock *lock)
156{
157 struct waiter *w, *tmp;
158
159 BUG_ON(lock->count < 0);
160 list_for_each_entry_safe(w, tmp, &lock->waiters, list) {
161 if (lock->count >= MAX_HOLDERS)
162 return;
163
164 if (w->wants_write) {
165 if (lock->count > 0)
166 return; /* still read locked */
167
168 lock->count = -1;
169 __add_holder(lock, w->task);
170 __wake_waiter(w);
171 return;
172 }
173
174 lock->count++;
175 __add_holder(lock, w->task);
176 __wake_waiter(w);
177 }
178}
179
180static void bl_init(struct block_lock *lock)
181{
182 int i;
183
184 spin_lock_init(&lock->lock);
185 lock->count = 0;
186 INIT_LIST_HEAD(&lock->waiters);
187 for (i = 0; i < MAX_HOLDERS; i++)
188 lock->holders[i] = NULL;
189}
190
191static int __available_for_read(struct block_lock *lock)
192{
193 return lock->count >= 0 &&
194 lock->count < MAX_HOLDERS &&
195 list_empty(&lock->waiters);
196}
197
198static int bl_down_read(struct block_lock *lock)
199{
200 int r;
201 struct waiter w;
202
203 spin_lock(&lock->lock);
204 r = __check_holder(lock);
205 if (r) {
206 spin_unlock(&lock->lock);
207 return r;
208 }
209
210 if (__available_for_read(lock)) {
211 lock->count++;
212 __add_holder(lock, current);
213 spin_unlock(&lock->lock);
214 return 0;
215 }
216
217 get_task_struct(current);
218
219 w.task = current;
220 w.wants_write = 0;
221 list_add_tail(&w.list, &lock->waiters);
222 spin_unlock(&lock->lock);
223
224 __wait(&w);
225 put_task_struct(current);
226 return 0;
227}
228
229static int bl_down_read_nonblock(struct block_lock *lock)
230{
231 int r;
232
233 spin_lock(&lock->lock);
234 r = __check_holder(lock);
235 if (r)
236 goto out;
237
238 if (__available_for_read(lock)) {
239 lock->count++;
240 __add_holder(lock, current);
241 r = 0;
242 } else
243 r = -EWOULDBLOCK;
244
245out:
246 spin_unlock(&lock->lock);
247 return r;
248}
249
250static void bl_up_read(struct block_lock *lock)
251{
252 spin_lock(&lock->lock);
253 BUG_ON(lock->count <= 0);
254 __del_holder(lock, current);
255 --lock->count;
256 if (!list_empty(&lock->waiters))
257 __wake_many(lock);
258 spin_unlock(&lock->lock);
259}
260
261static int bl_down_write(struct block_lock *lock)
262{
263 int r;
264 struct waiter w;
265
266 spin_lock(&lock->lock);
267 r = __check_holder(lock);
268 if (r) {
269 spin_unlock(&lock->lock);
270 return r;
271 }
272
273 if (lock->count == 0 && list_empty(&lock->waiters)) {
274 lock->count = -1;
275 __add_holder(lock, current);
276 spin_unlock(&lock->lock);
277 return 0;
278 }
279
280 get_task_struct(current);
281 w.task = current;
282 w.wants_write = 1;
283
284 /*
285 * Writers are given priority. We know there's only one mutator in the
286 * system, so we ignore the ordering reversal.
287 */
288 list_add(&w.list, &lock->waiters);
289 spin_unlock(&lock->lock);
290
291 __wait(&w);
292 put_task_struct(current);
293
294 return 0;
295}
296
297static void bl_up_write(struct block_lock *lock)
298{
299 spin_lock(&lock->lock);
300 __del_holder(lock, current);
301 lock->count = 0;
302 if (!list_empty(&lock->waiters))
303 __wake_many(lock);
304 spin_unlock(&lock->lock);
305}
306
307static void report_recursive_bug(dm_block_t b, int r)
308{
309 if (r == -EINVAL)
310 DMERR("recursive acquisition of block %llu requested.",
311 (unsigned long long) b);
312}
313
314/*----------------------------------------------------------------*/
315
316/*
317 * Block manager is currently implemented using dm-bufio. struct
318 * dm_block_manager and struct dm_block map directly onto a couple of
319 * structs in the bufio interface. I want to retain the freedom to move
320 * away from bufio in the future. So these structs are just cast within
321 * this .c file, rather than making it through to the public interface.
322 */
323static struct dm_buffer *to_buffer(struct dm_block *b)
324{
325 return (struct dm_buffer *) b;
326}
327
328static struct dm_bufio_client *to_bufio(struct dm_block_manager *bm)
329{
330 return (struct dm_bufio_client *) bm;
331}
332
333dm_block_t dm_block_location(struct dm_block *b)
334{
335 return dm_bufio_get_block_number(to_buffer(b));
336}
337EXPORT_SYMBOL_GPL(dm_block_location);
338
339void *dm_block_data(struct dm_block *b)
340{
341 return dm_bufio_get_block_data(to_buffer(b));
342}
343EXPORT_SYMBOL_GPL(dm_block_data);
344
345struct buffer_aux {
346 struct dm_block_validator *validator;
347 struct block_lock lock;
348 int write_locked;
349};
350
351static void dm_block_manager_alloc_callback(struct dm_buffer *buf)
352{
353 struct buffer_aux *aux = dm_bufio_get_aux_data(buf);
354 aux->validator = NULL;
355 bl_init(&aux->lock);
356}
357
358static void dm_block_manager_write_callback(struct dm_buffer *buf)
359{
360 struct buffer_aux *aux = dm_bufio_get_aux_data(buf);
361 if (aux->validator) {
362 aux->validator->prepare_for_write(aux->validator, (struct dm_block *) buf,
363 dm_bufio_get_block_size(dm_bufio_get_client(buf)));
364 }
365}
366
367/*----------------------------------------------------------------
368 * Public interface
369 *--------------------------------------------------------------*/
370struct dm_block_manager *dm_block_manager_create(struct block_device *bdev,
371 unsigned block_size,
372 unsigned cache_size,
373 unsigned max_held_per_thread)
374{
375 return (struct dm_block_manager *)
376 dm_bufio_client_create(bdev, block_size, max_held_per_thread,
377 sizeof(struct buffer_aux),
378 dm_block_manager_alloc_callback,
379 dm_block_manager_write_callback);
380}
381EXPORT_SYMBOL_GPL(dm_block_manager_create);
382
383void dm_block_manager_destroy(struct dm_block_manager *bm)
384{
385 return dm_bufio_client_destroy(to_bufio(bm));
386}
387EXPORT_SYMBOL_GPL(dm_block_manager_destroy);
388
389unsigned dm_bm_block_size(struct dm_block_manager *bm)
390{
391 return dm_bufio_get_block_size(to_bufio(bm));
392}
393EXPORT_SYMBOL_GPL(dm_bm_block_size);
394
395dm_block_t dm_bm_nr_blocks(struct dm_block_manager *bm)
396{
397 return dm_bufio_get_device_size(to_bufio(bm));
398}
399
400static int dm_bm_validate_buffer(struct dm_block_manager *bm,
401 struct dm_buffer *buf,
402 struct buffer_aux *aux,
403 struct dm_block_validator *v)
404{
405 if (unlikely(!aux->validator)) {
406 int r;
407 if (!v)
408 return 0;
409 r = v->check(v, (struct dm_block *) buf, dm_bufio_get_block_size(to_bufio(bm)));
410 if (unlikely(r))
411 return r;
412 aux->validator = v;
413 } else {
414 if (unlikely(aux->validator != v)) {
415 DMERR("validator mismatch (old=%s vs new=%s) for block %llu",
416 aux->validator->name, v ? v->name : "NULL",
417 (unsigned long long)
418 dm_bufio_get_block_number(buf));
419 return -EINVAL;
420 }
421 }
422
423 return 0;
424}
425int dm_bm_read_lock(struct dm_block_manager *bm, dm_block_t b,
426 struct dm_block_validator *v,
427 struct dm_block **result)
428{
429 struct buffer_aux *aux;
430 void *p;
431 int r;
432
433 p = dm_bufio_read(to_bufio(bm), b, (struct dm_buffer **) result);
434 if (unlikely(IS_ERR(p)))
435 return PTR_ERR(p);
436
437 aux = dm_bufio_get_aux_data(to_buffer(*result));
438 r = bl_down_read(&aux->lock);
439 if (unlikely(r)) {
440 dm_bufio_release(to_buffer(*result));
441 report_recursive_bug(b, r);
442 return r;
443 }
444
445 aux->write_locked = 0;
446
447 r = dm_bm_validate_buffer(bm, to_buffer(*result), aux, v);
448 if (unlikely(r)) {
449 bl_up_read(&aux->lock);
450 dm_bufio_release(to_buffer(*result));
451 return r;
452 }
453
454 return 0;
455}
456EXPORT_SYMBOL_GPL(dm_bm_read_lock);
457
458int dm_bm_write_lock(struct dm_block_manager *bm,
459 dm_block_t b, struct dm_block_validator *v,
460 struct dm_block **result)
461{
462 struct buffer_aux *aux;
463 void *p;
464 int r;
465
466 p = dm_bufio_read(to_bufio(bm), b, (struct dm_buffer **) result);
467 if (unlikely(IS_ERR(p)))
468 return PTR_ERR(p);
469
470 aux = dm_bufio_get_aux_data(to_buffer(*result));
471 r = bl_down_write(&aux->lock);
472 if (r) {
473 dm_bufio_release(to_buffer(*result));
474 report_recursive_bug(b, r);
475 return r;
476 }
477
478 aux->write_locked = 1;
479
480 r = dm_bm_validate_buffer(bm, to_buffer(*result), aux, v);
481 if (unlikely(r)) {
482 bl_up_write(&aux->lock);
483 dm_bufio_release(to_buffer(*result));
484 return r;
485 }
486
487 return 0;
488}
489EXPORT_SYMBOL_GPL(dm_bm_write_lock);
490
491int dm_bm_read_try_lock(struct dm_block_manager *bm,
492 dm_block_t b, struct dm_block_validator *v,
493 struct dm_block **result)
494{
495 struct buffer_aux *aux;
496 void *p;
497 int r;
498
499 p = dm_bufio_get(to_bufio(bm), b, (struct dm_buffer **) result);
500 if (unlikely(IS_ERR(p)))
501 return PTR_ERR(p);
502 if (unlikely(!p))
503 return -EWOULDBLOCK;
504
505 aux = dm_bufio_get_aux_data(to_buffer(*result));
506 r = bl_down_read_nonblock(&aux->lock);
507 if (r < 0) {
508 dm_bufio_release(to_buffer(*result));
509 report_recursive_bug(b, r);
510 return r;
511 }
512 aux->write_locked = 0;
513
514 r = dm_bm_validate_buffer(bm, to_buffer(*result), aux, v);
515 if (unlikely(r)) {
516 bl_up_read(&aux->lock);
517 dm_bufio_release(to_buffer(*result));
518 return r;
519 }
520
521 return 0;
522}
523
524int dm_bm_write_lock_zero(struct dm_block_manager *bm,
525 dm_block_t b, struct dm_block_validator *v,
526 struct dm_block **result)
527{
528 int r;
529 struct buffer_aux *aux;
530 void *p;
531
532 p = dm_bufio_new(to_bufio(bm), b, (struct dm_buffer **) result);
533 if (unlikely(IS_ERR(p)))
534 return PTR_ERR(p);
535
536 memset(p, 0, dm_bm_block_size(bm));
537
538 aux = dm_bufio_get_aux_data(to_buffer(*result));
539 r = bl_down_write(&aux->lock);
540 if (r) {
541 dm_bufio_release(to_buffer(*result));
542 return r;
543 }
544
545 aux->write_locked = 1;
546 aux->validator = v;
547
548 return 0;
549}
550
551int dm_bm_unlock(struct dm_block *b)
552{
553 struct buffer_aux *aux;
554 aux = dm_bufio_get_aux_data(to_buffer(b));
555
556 if (aux->write_locked) {
557 dm_bufio_mark_buffer_dirty(to_buffer(b));
558 bl_up_write(&aux->lock);
559 } else
560 bl_up_read(&aux->lock);
561
562 dm_bufio_release(to_buffer(b));
563
564 return 0;
565}
566EXPORT_SYMBOL_GPL(dm_bm_unlock);
567
568int dm_bm_unlock_move(struct dm_block *b, dm_block_t n)
569{
570 struct buffer_aux *aux;
571
572 aux = dm_bufio_get_aux_data(to_buffer(b));
573
574 if (aux->write_locked) {
575 dm_bufio_mark_buffer_dirty(to_buffer(b));
576 bl_up_write(&aux->lock);
577 } else
578 bl_up_read(&aux->lock);
579
580 dm_bufio_release_move(to_buffer(b), n);
581 return 0;
582}
583
584int dm_bm_flush_and_unlock(struct dm_block_manager *bm,
585 struct dm_block *superblock)
586{
587 int r;
588
589 r = dm_bufio_write_dirty_buffers(to_bufio(bm));
590 if (unlikely(r))
591 return r;
592 r = dm_bufio_issue_flush(to_bufio(bm));
593 if (unlikely(r))
594 return r;
595
596 dm_bm_unlock(superblock);
597
598 r = dm_bufio_write_dirty_buffers(to_bufio(bm));
599 if (unlikely(r))
600 return r;
601 r = dm_bufio_issue_flush(to_bufio(bm));
602 if (unlikely(r))
603 return r;
604
605 return 0;
606}
607
608u32 dm_bm_checksum(const void *data, size_t len, u32 init_xor)
609{
610 return crc32c(~(u32) 0, data, len) ^ init_xor;
611}
612EXPORT_SYMBOL_GPL(dm_bm_checksum);
613
614/*----------------------------------------------------------------*/
615
616MODULE_LICENSE("GPL");
617MODULE_AUTHOR("Joe Thornber <dm-devel@redhat.com>");
618MODULE_DESCRIPTION("Immutable metadata library for dm");
619
620/*----------------------------------------------------------------*/
diff --git a/drivers/md/persistent-data/dm-block-manager.h b/drivers/md/persistent-data/dm-block-manager.h
new file mode 100644
index 000000000000..924833d2dfa6
--- /dev/null
+++ b/drivers/md/persistent-data/dm-block-manager.h
@@ -0,0 +1,123 @@
1/*
2 * Copyright (C) 2011 Red Hat, Inc.
3 *
4 * This file is released under the GPL.
5 */
6
7#ifndef _LINUX_DM_BLOCK_MANAGER_H
8#define _LINUX_DM_BLOCK_MANAGER_H
9
10#include <linux/types.h>
11#include <linux/blkdev.h>
12
13/*----------------------------------------------------------------*/
14
15/*
16 * Block number.
17 */
18typedef uint64_t dm_block_t;
19struct dm_block;
20
21dm_block_t dm_block_location(struct dm_block *b);
22void *dm_block_data(struct dm_block *b);
23
24/*----------------------------------------------------------------*/
25
26/*
27 * @name should be a unique identifier for the block manager, no longer
28 * than 32 chars.
29 *
30 * @max_held_per_thread should be the maximum number of locks, read or
31 * write, that an individual thread holds at any one time.
32 */
33struct dm_block_manager;
34struct dm_block_manager *dm_block_manager_create(
35 struct block_device *bdev, unsigned block_size,
36 unsigned cache_size, unsigned max_held_per_thread);
37void dm_block_manager_destroy(struct dm_block_manager *bm);
38
39unsigned dm_bm_block_size(struct dm_block_manager *bm);
40dm_block_t dm_bm_nr_blocks(struct dm_block_manager *bm);
41
42/*----------------------------------------------------------------*/
43
44/*
45 * The validator allows the caller to verify newly-read data and modify
46 * the data just before writing, e.g. to calculate checksums. It's
47 * important to be consistent with your use of validators. The only time
48 * you can change validators is if you call dm_bm_write_lock_zero.
49 */
50struct dm_block_validator {
51 const char *name;
52 void (*prepare_for_write)(struct dm_block_validator *v, struct dm_block *b, size_t block_size);
53
54 /*
55 * Return 0 if the checksum is valid or < 0 on error.
56 */
57 int (*check)(struct dm_block_validator *v, struct dm_block *b, size_t block_size);
58};
59
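/*
 * As a minimal sketch, a validator that keeps a crc32c of the block in its
 * first 4 bytes might look like this (EXAMPLE_CSUM_XOR and the names are
 * placeholders, following the pattern btree_node_validator uses in
 * dm-btree-spine.c):
 *
 *	static void example_prepare_for_write(struct dm_block_validator *v,
 *					      struct dm_block *b,
 *					      size_t block_size)
 *	{
 *		__le32 *csum = dm_block_data(b);
 *
 *		*csum = cpu_to_le32(dm_bm_checksum(csum + 1,
 *						   block_size - sizeof(__le32),
 *						   EXAMPLE_CSUM_XOR));
 *	}
 *
 *	static int example_check(struct dm_block_validator *v,
 *				 struct dm_block *b, size_t block_size)
 *	{
 *		__le32 *csum = dm_block_data(b);
 *		__le32 expected =
 *			cpu_to_le32(dm_bm_checksum(csum + 1,
 *						   block_size - sizeof(__le32),
 *						   EXAMPLE_CSUM_XOR));
 *
 *		return *csum == expected ? 0 : -EILSEQ;
 *	}
 *
 *	static struct dm_block_validator example_validator = {
 *		.name = "example",
 *		.prepare_for_write = example_prepare_for_write,
 *		.check = example_check
 *	};
 */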
60/*----------------------------------------------------------------*/
61
62/*
63 * You can have multiple concurrent readers or a single writer holding a
64 * block lock.
65 */
66
67/*
68 * dm_bm_lock() locks a block and returns through @result a pointer to
69 * memory that holds a copy of that block. If you have write-locked the
70 * block then any changes you make to memory pointed to by @result will be
71 * written back to the disk sometime after dm_bm_unlock is called.
72 */
73int dm_bm_read_lock(struct dm_block_manager *bm, dm_block_t b,
74 struct dm_block_validator *v,
75 struct dm_block **result);
76
77int dm_bm_write_lock(struct dm_block_manager *bm, dm_block_t b,
78 struct dm_block_validator *v,
79 struct dm_block **result);
80
81/*
82 * The *_try_lock variants return -EWOULDBLOCK if the block isn't
83 * available immediately.
84 */
85int dm_bm_read_try_lock(struct dm_block_manager *bm, dm_block_t b,
86 struct dm_block_validator *v,
87 struct dm_block **result);
88
89/*
90 * Use dm_bm_write_lock_zero() when you know you're going to
91 * overwrite the block completely. It saves a disk read.
92 */
93int dm_bm_write_lock_zero(struct dm_block_manager *bm, dm_block_t b,
94 struct dm_block_validator *v,
95 struct dm_block **result);
96
97int dm_bm_unlock(struct dm_block *b);
98
99/*
100 * An optimisation; we often want to copy a block's contents to a new
101 * block, e.g. as part of the shadowing operation. It's far better for
102 * bufio to do this move behind the scenes than hold 2 locks and memcpy the
103 * data.
104 */
105int dm_bm_unlock_move(struct dm_block *b, dm_block_t n);
106
107/*
108 * It's a common idiom to have a superblock that should be committed last.
109 *
110 * @superblock should be write-locked on entry. It will be unlocked during
111 * this function. All dirty blocks are guaranteed to be written and flushed
112 * before the superblock.
113 *
114 * This method always blocks.
115 */
116int dm_bm_flush_and_unlock(struct dm_block_manager *bm,
117 struct dm_block *superblock);
118
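/*
 * For example, a commit sequence is typically (a sketch;
 * SUPERBLOCK_LOCATION and sb_validator are placeholders):
 *
 *	struct dm_block *sblock;
 *	int r;
 *
 *	r = dm_bm_write_lock(bm, SUPERBLOCK_LOCATION, &sb_validator, &sblock);
 *	if (r)
 *		return r;
 *
 *	... update dm_block_data(sblock) ...
 *
 *	r = dm_bm_flush_and_unlock(bm, sblock);
 */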
119u32 dm_bm_checksum(const void *data, size_t len, u32 init_xor);
120
121/*----------------------------------------------------------------*/
122
123#endif /* _LINUX_DM_BLOCK_MANAGER_H */
diff --git a/drivers/md/persistent-data/dm-btree-internal.h b/drivers/md/persistent-data/dm-btree-internal.h
new file mode 100644
index 000000000000..d279c768f8f1
--- /dev/null
+++ b/drivers/md/persistent-data/dm-btree-internal.h
@@ -0,0 +1,137 @@
1/*
2 * Copyright (C) 2011 Red Hat, Inc.
3 *
4 * This file is released under the GPL.
5 */
6
7#ifndef DM_BTREE_INTERNAL_H
8#define DM_BTREE_INTERNAL_H
9
10#include "dm-btree.h"
11
12/*----------------------------------------------------------------*/
13
14/*
15 * We'll need 2 accessor functions for n->csum and n->blocknr
16 * to support dm-btree-spine.c in that case.
17 */
18
19enum node_flags {
20 INTERNAL_NODE = 1,
21 LEAF_NODE = 1 << 1
22};
23
24/*
25 * Every btree node begins with this structure. Make sure it's a multiple
26 * of 8 bytes in size, otherwise the 64-bit keys will be misaligned.
27 */
28struct node_header {
29 __le32 csum;
30 __le32 flags;
31 __le64 blocknr; /* Block this node is supposed to live in. */
32
33 __le32 nr_entries;
34 __le32 max_entries;
35 __le32 value_size;
36 __le32 padding;
37} __packed;
38
39struct node {
40 struct node_header header;
41 __le64 keys[0];
42} __packed;
43
44
45void inc_children(struct dm_transaction_manager *tm, struct node *n,
46 struct dm_btree_value_type *vt);
47
48int new_block(struct dm_btree_info *info, struct dm_block **result);
49int unlock_block(struct dm_btree_info *info, struct dm_block *b);
50
51/*
52 * Spines keep track of the rolling locks. There are 2 variants, read-only
53 * and one that uses shadowing. These are separate structs to allow the
54 * type checker to spot misuse, for example accidentally calling read_lock
55 * on a shadow spine.
56 */
57struct ro_spine {
58 struct dm_btree_info *info;
59
60 int count;
61 struct dm_block *nodes[2];
62};
63
64void init_ro_spine(struct ro_spine *s, struct dm_btree_info *info);
65int exit_ro_spine(struct ro_spine *s);
66int ro_step(struct ro_spine *s, dm_block_t new_child);
67struct node *ro_node(struct ro_spine *s);
68
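/*
 * A typical read-only descent, as btree_lookup_raw() does in dm-btree.c
 * (a sketch with error handling trimmed):
 *
 *	struct ro_spine spine;
 *	struct node *n;
 *
 *	init_ro_spine(&spine, info);
 *	r = ro_step(&spine, root);	    /* read lock the root node */
 *	n = ro_node(&spine);		    /* data of the current node */
 *	r = ro_step(&spine, value64(n, i)); /* step to a child; at most 2 locks held */
 *	...
 *	exit_ro_spine(&spine);		    /* drop any remaining locks */
 */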
69struct shadow_spine {
70 struct dm_btree_info *info;
71
72 int count;
73 struct dm_block *nodes[2];
74
75 dm_block_t root;
76};
77
78void init_shadow_spine(struct shadow_spine *s, struct dm_btree_info *info);
79int exit_shadow_spine(struct shadow_spine *s);
80
81int shadow_step(struct shadow_spine *s, dm_block_t b,
82 struct dm_btree_value_type *vt);
83
84/*
85 * The spine must have at least one entry before calling this.
86 */
87struct dm_block *shadow_current(struct shadow_spine *s);
88
89/*
90 * The spine must have at least two entries before calling this.
91 */
92struct dm_block *shadow_parent(struct shadow_spine *s);
93
94int shadow_has_parent(struct shadow_spine *s);
95
96int shadow_root(struct shadow_spine *s);
97
98/*
99 * Some inlines.
100 */
101static inline __le64 *key_ptr(struct node *n, uint32_t index)
102{
103 return n->keys + index;
104}
105
106static inline void *value_base(struct node *n)
107{
108 return &n->keys[le32_to_cpu(n->header.max_entries)];
109}
110
111/*
112 * FIXME: Now that value size is stored in node we don't need the third parm.
113 */
114static inline void *value_ptr(struct node *n, uint32_t index, size_t value_size)
115{
116 BUG_ON(value_size != le32_to_cpu(n->header.value_size));
117 return value_base(n) + (value_size * index);
118}
119
120/*
121 * Assumes the values are suitably-aligned and converts to core format.
122 */
123static inline uint64_t value64(struct node *n, uint32_t index)
124{
125 __le64 *values_le = value_base(n);
126
127 return le64_to_cpu(values_le[index]);
128}
129
130/*
131 * Searching for a key within a single node.
132 */
133int lower_bound(struct node *n, uint64_t key);
134
135extern struct dm_block_validator btree_node_validator;
136
137#endif /* DM_BTREE_INTERNAL_H */
diff --git a/drivers/md/persistent-data/dm-btree-remove.c b/drivers/md/persistent-data/dm-btree-remove.c
new file mode 100644
index 000000000000..65fd85ec6514
--- /dev/null
+++ b/drivers/md/persistent-data/dm-btree-remove.c
@@ -0,0 +1,566 @@
1/*
2 * Copyright (C) 2011 Red Hat, Inc.
3 *
4 * This file is released under the GPL.
5 */
6
7#include "dm-btree.h"
8#include "dm-btree-internal.h"
9#include "dm-transaction-manager.h"
10
11#include <linux/module.h>
12
13/*
14 * Removing an entry from a btree
15 * ==============================
16 *
17 * A very important constraint for our btree is that no node, except the
18 * root, may have fewer than a certain number of entries.
19 * (MIN_ENTRIES <= nr_entries <= MAX_ENTRIES).
20 *
21 * Ensuring this is complicated by the way we want to only ever hold the
22 * locks on 2 nodes concurrently, and only change nodes in a top to bottom
23 * fashion.
24 *
25 * Each node may have a left or right sibling. When descending the spine,
26 * if a node contains only MIN_ENTRIES then we try and increase this to at
27 * least MIN_ENTRIES + 1. We do this in the following ways:
28 *
29 * [A] No siblings => this can only happen if the node is the root, in which
30 * case we copy the child's contents over the root.
31 *
32 * [B] No left sibling
33 * ==> rebalance(node, right sibling)
34 *
35 * [C] No right sibling
36 * ==> rebalance(left sibling, node)
37 *
38 * [D] Both siblings, total_entries(left, node, right) <= DEL_THRESHOLD
39 * ==> delete node, adding its contents to left and right
40 *
41 * [E] Both siblings, total_entries(left, node, right) > DEL_THRESHOLD
42 * ==> rebalance(left, node, right)
43 *
44 * After these operations it's possible that our original node no
45 * longer contains the desired subtree. For this reason this rebalancing
46 * is performed on the children of the current node. This also avoids
47 * having a special case for the root.
48 *
49 * Once this rebalancing has occurred we can then step into the child node
50 * for internal nodes, or delete the entry for leaf nodes.
51 */
52
53/*
54 * Some little utilities for moving node data around.
55 */
56static void node_shift(struct node *n, int shift)
57{
58 uint32_t nr_entries = le32_to_cpu(n->header.nr_entries);
59 uint32_t value_size = le32_to_cpu(n->header.value_size);
60
61 if (shift < 0) {
62 shift = -shift;
63 BUG_ON(shift > nr_entries);
64 BUG_ON((void *) key_ptr(n, shift) >= value_ptr(n, shift, value_size));
65 memmove(key_ptr(n, 0),
66 key_ptr(n, shift),
67 (nr_entries - shift) * sizeof(__le64));
68 memmove(value_ptr(n, 0, value_size),
69 value_ptr(n, shift, value_size),
70 (nr_entries - shift) * value_size);
71 } else {
72 BUG_ON(nr_entries + shift > le32_to_cpu(n->header.max_entries));
73 memmove(key_ptr(n, shift),
74 key_ptr(n, 0),
75 nr_entries * sizeof(__le64));
76 memmove(value_ptr(n, shift, value_size),
77 value_ptr(n, 0, value_size),
78 nr_entries * value_size);
79 }
80}
81
82static void node_copy(struct node *left, struct node *right, int shift)
83{
84 uint32_t nr_left = le32_to_cpu(left->header.nr_entries);
85 uint32_t value_size = le32_to_cpu(left->header.value_size);
86 BUG_ON(value_size != le32_to_cpu(right->header.value_size));
87
88 if (shift < 0) {
89 shift = -shift;
90 BUG_ON(nr_left + shift > le32_to_cpu(left->header.max_entries));
91 memcpy(key_ptr(left, nr_left),
92 key_ptr(right, 0),
93 shift * sizeof(__le64));
94 memcpy(value_ptr(left, nr_left, value_size),
95 value_ptr(right, 0, value_size),
96 shift * value_size);
97 } else {
98 BUG_ON(shift > le32_to_cpu(right->header.max_entries));
99 memcpy(key_ptr(right, 0),
100 key_ptr(left, nr_left - shift),
101 shift * sizeof(__le64));
102 memcpy(value_ptr(right, 0, value_size),
103 value_ptr(left, nr_left - shift, value_size),
104 shift * value_size);
105 }
106}
107
108/*
109 * Delete a specific entry from a leaf node.
110 */
111static void delete_at(struct node *n, unsigned index)
112{
113 unsigned nr_entries = le32_to_cpu(n->header.nr_entries);
114 unsigned nr_to_copy = nr_entries - (index + 1);
115 uint32_t value_size = le32_to_cpu(n->header.value_size);
116 BUG_ON(index >= nr_entries);
117
118 if (nr_to_copy) {
119 memmove(key_ptr(n, index),
120 key_ptr(n, index + 1),
121 nr_to_copy * sizeof(__le64));
122
123 memmove(value_ptr(n, index, value_size),
124 value_ptr(n, index + 1, value_size),
125 nr_to_copy * value_size);
126 }
127
128 n->header.nr_entries = cpu_to_le32(nr_entries - 1);
129}
130
131static unsigned del_threshold(struct node *n)
132{
133 return le32_to_cpu(n->header.max_entries) / 3;
134}
135
136static unsigned merge_threshold(struct node *n)
137{
138 /*
139 * The extra one is because we know we're potentially going to
140 * delete an entry.
141 */
142 return 2 * (le32_to_cpu(n->header.max_entries) / 3) + 1;
143}
144
145struct child {
146 unsigned index;
147 struct dm_block *block;
148 struct node *n;
149};
150
151static struct dm_btree_value_type le64_type = {
152 .context = NULL,
153 .size = sizeof(__le64),
154 .inc = NULL,
155 .dec = NULL,
156 .equal = NULL
157};
158
159static int init_child(struct dm_btree_info *info, struct node *parent,
160 unsigned index, struct child *result)
161{
162 int r, inc;
163 dm_block_t root;
164
165 result->index = index;
166 root = value64(parent, index);
167
168 r = dm_tm_shadow_block(info->tm, root, &btree_node_validator,
169 &result->block, &inc);
170 if (r)
171 return r;
172
173 result->n = dm_block_data(result->block);
174
175 if (inc)
176 inc_children(info->tm, result->n, &le64_type);
177
178 *((__le64 *) value_ptr(parent, index, sizeof(__le64))) =
179 cpu_to_le64(dm_block_location(result->block));
180
181 return 0;
182}
183
184static int exit_child(struct dm_btree_info *info, struct child *c)
185{
186 return dm_tm_unlock(info->tm, c->block);
187}
188
189static void shift(struct node *left, struct node *right, int count)
190{
191 if (!count)
192 return;
193
194 if (count > 0) {
195 node_shift(right, count);
196 node_copy(left, right, count);
197 } else {
198 node_copy(left, right, count);
199 node_shift(right, count);
200 }
201
202 left->header.nr_entries =
203 cpu_to_le32(le32_to_cpu(left->header.nr_entries) - count);
204 BUG_ON(le32_to_cpu(left->header.nr_entries) > le32_to_cpu(left->header.max_entries));
205
206 right->header.nr_entries =
207 cpu_to_le32(le32_to_cpu(right->header.nr_entries) + count);
208 BUG_ON(le32_to_cpu(right->header.nr_entries) > le32_to_cpu(right->header.max_entries));
209}
210
211static void __rebalance2(struct dm_btree_info *info, struct node *parent,
212 struct child *l, struct child *r)
213{
214 struct node *left = l->n;
215 struct node *right = r->n;
216 uint32_t nr_left = le32_to_cpu(left->header.nr_entries);
217 uint32_t nr_right = le32_to_cpu(right->header.nr_entries);
218
219 if (nr_left + nr_right <= merge_threshold(left)) {
220 /*
221 * Merge
222 */
223 node_copy(left, right, -nr_right);
224 left->header.nr_entries = cpu_to_le32(nr_left + nr_right);
225 delete_at(parent, r->index);
226
227 /*
228 * We need to decrement the right block, but not its
229 * children, since they're still referenced by left.
230 */
231 dm_tm_dec(info->tm, dm_block_location(r->block));
232 } else {
233 /*
234 * Rebalance.
235 */
236 unsigned target_left = (nr_left + nr_right) / 2;
237 unsigned shift_ = nr_left - target_left;
238 BUG_ON(le32_to_cpu(left->header.max_entries) <= nr_left - shift_);
239 BUG_ON(le32_to_cpu(right->header.max_entries) <= nr_right + shift_);
240 shift(left, right, nr_left - target_left);
241 *key_ptr(parent, r->index) = right->keys[0];
242 }
243}
244
245static int rebalance2(struct shadow_spine *s, struct dm_btree_info *info,
246 unsigned left_index)
247{
248 int r;
249 struct node *parent;
250 struct child left, right;
251
252 parent = dm_block_data(shadow_current(s));
253
254 r = init_child(info, parent, left_index, &left);
255 if (r)
256 return r;
257
258 r = init_child(info, parent, left_index + 1, &right);
259 if (r) {
260 exit_child(info, &left);
261 return r;
262 }
263
264 __rebalance2(info, parent, &left, &right);
265
266 r = exit_child(info, &left);
267 if (r) {
268 exit_child(info, &right);
269 return r;
270 }
271
272 return exit_child(info, &right);
273}
274
275static void __rebalance3(struct dm_btree_info *info, struct node *parent,
276 struct child *l, struct child *c, struct child *r)
277{
278 struct node *left = l->n;
279 struct node *center = c->n;
280 struct node *right = r->n;
281
282 uint32_t nr_left = le32_to_cpu(left->header.nr_entries);
283 uint32_t nr_center = le32_to_cpu(center->header.nr_entries);
284 uint32_t nr_right = le32_to_cpu(right->header.nr_entries);
285 uint32_t max_entries = le32_to_cpu(left->header.max_entries);
286
287 unsigned target;
288
289 BUG_ON(left->header.max_entries != center->header.max_entries);
290 BUG_ON(center->header.max_entries != right->header.max_entries);
291
292 if (((nr_left + nr_center + nr_right) / 2) < merge_threshold(center)) {
293 /*
294 * Delete center node:
295 *
296 * We dump as many entries from center as possible into
297 * left, then the rest in right, then rebalance2. This
298 * wastes some cpu, but I want something simple atm.
299 */
300 unsigned shift = min(max_entries - nr_left, nr_center);
301
302 BUG_ON(nr_left + shift > max_entries);
303 node_copy(left, center, -shift);
304 left->header.nr_entries = cpu_to_le32(nr_left + shift);
305
306 if (shift != nr_center) {
307 shift = nr_center - shift;
308 BUG_ON((nr_right + shift) >= max_entries);
309 node_shift(right, shift);
310 node_copy(center, right, shift);
311 right->header.nr_entries = cpu_to_le32(nr_right + shift);
312 }
313 *key_ptr(parent, r->index) = right->keys[0];
314
315 delete_at(parent, c->index);
316 r->index--;
317
318 dm_tm_dec(info->tm, dm_block_location(c->block));
319 __rebalance2(info, parent, l, r);
320
321 return;
322 }
323
324 /*
325 * Rebalance
326 */
327 target = (nr_left + nr_center + nr_right) / 3;
328 BUG_ON(target > max_entries);
329
330 /*
331 * Adjust the left node
332 */
333 shift(left, center, nr_left - target);
334
335 /*
336 * Adjust the right node
337 */
338 shift(center, right, target - nr_right);
339 *key_ptr(parent, c->index) = center->keys[0];
340 *key_ptr(parent, r->index) = right->keys[0];
341}
342
343static int rebalance3(struct shadow_spine *s, struct dm_btree_info *info,
344 unsigned left_index)
345{
346 int r;
347 struct node *parent = dm_block_data(shadow_current(s));
348 struct child left, center, right;
349
350 /*
351 * FIXME: fill out an array?
352 */
353 r = init_child(info, parent, left_index, &left);
354 if (r)
355 return r;
356
357 r = init_child(info, parent, left_index + 1, &center);
358 if (r) {
359 exit_child(info, &left);
360 return r;
361 }
362
363 r = init_child(info, parent, left_index + 2, &right);
364 if (r) {
365 exit_child(info, &left);
366 exit_child(info, &center);
367 return r;
368 }
369
370 __rebalance3(info, parent, &left, &center, &right);
371
372 r = exit_child(info, &left);
373 if (r) {
374 exit_child(info, &center);
375 exit_child(info, &right);
376 return r;
377 }
378
379 r = exit_child(info, &center);
380 if (r) {
381 exit_child(info, &right);
382 return r;
383 }
384
385 r = exit_child(info, &right);
386 if (r)
387 return r;
388
389 return 0;
390}
391
392static int get_nr_entries(struct dm_transaction_manager *tm,
393 dm_block_t b, uint32_t *result)
394{
395 int r;
396 struct dm_block *block;
397 struct node *n;
398
399 r = dm_tm_read_lock(tm, b, &btree_node_validator, &block);
400 if (r)
401 return r;
402
403 n = dm_block_data(block);
404 *result = le32_to_cpu(n->header.nr_entries);
405
406 return dm_tm_unlock(tm, block);
407}
408
409static int rebalance_children(struct shadow_spine *s,
410 struct dm_btree_info *info, uint64_t key)
411{
412 int i, r, has_left_sibling, has_right_sibling;
413 uint32_t child_entries;
414 struct node *n;
415
416 n = dm_block_data(shadow_current(s));
417
418 if (le32_to_cpu(n->header.nr_entries) == 1) {
419 struct dm_block *child;
420 dm_block_t b = value64(n, 0);
421
422 r = dm_tm_read_lock(info->tm, b, &btree_node_validator, &child);
423 if (r)
424 return r;
425
426 memcpy(n, dm_block_data(child),
427 dm_bm_block_size(dm_tm_get_bm(info->tm)));
428 r = dm_tm_unlock(info->tm, child);
429 if (r)
430 return r;
431
432 dm_tm_dec(info->tm, dm_block_location(child));
433 return 0;
434 }
435
436 i = lower_bound(n, key);
437 if (i < 0)
438 return -ENODATA;
439
440 r = get_nr_entries(info->tm, value64(n, i), &child_entries);
441 if (r)
442 return r;
443
444 if (child_entries > del_threshold(n))
445 return 0;
446
447 has_left_sibling = i > 0;
448 has_right_sibling = i < (le32_to_cpu(n->header.nr_entries) - 1);
449
450 if (!has_left_sibling)
451 r = rebalance2(s, info, i);
452
453 else if (!has_right_sibling)
454 r = rebalance2(s, info, i - 1);
455
456 else
457 r = rebalance3(s, info, i - 1);
458
459 return r;
460}
461
462static int do_leaf(struct node *n, uint64_t key, unsigned *index)
463{
464 int i = lower_bound(n, key);
465
466 if ((i < 0) ||
467 (i >= le32_to_cpu(n->header.nr_entries)) ||
468 (le64_to_cpu(n->keys[i]) != key))
469 return -ENODATA;
470
471 *index = i;
472
473 return 0;
474}
475
476/*
477 * Prepares for removal from one level of the hierarchy. The caller must
478 * call delete_at() to remove the entry at index.
479 */
480static int remove_raw(struct shadow_spine *s, struct dm_btree_info *info,
481 struct dm_btree_value_type *vt, dm_block_t root,
482 uint64_t key, unsigned *index)
483{
484 int i = *index, r;
485 struct node *n;
486
487 for (;;) {
488 r = shadow_step(s, root, vt);
489 if (r < 0)
490 break;
491
492 /*
493 * We have to patch up the parent node, ugly, but I don't
494 * see a way to do this automatically as part of the spine
495 * op.
496 */
497 if (shadow_has_parent(s)) {
498 __le64 location = cpu_to_le64(dm_block_location(shadow_current(s)));
499 memcpy(value_ptr(dm_block_data(shadow_parent(s)), i, sizeof(__le64)),
500 &location, sizeof(__le64));
501 }
502
503 n = dm_block_data(shadow_current(s));
504
505 if (le32_to_cpu(n->header.flags) & LEAF_NODE)
506 return do_leaf(n, key, index);
507
508 r = rebalance_children(s, info, key);
509 if (r)
510 break;
511
512 n = dm_block_data(shadow_current(s));
513 if (le32_to_cpu(n->header.flags) & LEAF_NODE)
514 return do_leaf(n, key, index);
515
516 i = lower_bound(n, key);
517
518 /*
519 * We know the key is present, or else
520 * rebalance_children would have returned
521 * -ENODATA
522 */
523 root = value64(n, i);
524 }
525
526 return r;
527}
528
529int dm_btree_remove(struct dm_btree_info *info, dm_block_t root,
530 uint64_t *keys, dm_block_t *new_root)
531{
532 unsigned level, last_level = info->levels - 1;
533 int index = 0, r = 0;
534 struct shadow_spine spine;
535 struct node *n;
536
537 init_shadow_spine(&spine, info);
538 for (level = 0; level < info->levels; level++) {
539 r = remove_raw(&spine, info,
540 (level == last_level ?
541 &info->value_type : &le64_type),
542 root, keys[level], (unsigned *)&index);
543 if (r < 0)
544 break;
545
546 n = dm_block_data(shadow_current(&spine));
547 if (level != last_level) {
548 root = value64(n, index);
549 continue;
550 }
551
552 BUG_ON(index < 0 || index >= le32_to_cpu(n->header.nr_entries));
553
554 if (info->value_type.dec)
555 info->value_type.dec(info->value_type.context,
556 value_ptr(n, index, info->value_type.size));
557
558 delete_at(n, index);
559 }
560
561 *new_root = shadow_root(&spine);
562 exit_shadow_spine(&spine);
563
564 return r;
565}
566EXPORT_SYMBOL_GPL(dm_btree_remove);
diff --git a/drivers/md/persistent-data/dm-btree-spine.c b/drivers/md/persistent-data/dm-btree-spine.c
new file mode 100644
index 000000000000..d9a7912ee8ee
--- /dev/null
+++ b/drivers/md/persistent-data/dm-btree-spine.c
@@ -0,0 +1,244 @@
1/*
2 * Copyright (C) 2011 Red Hat, Inc.
3 *
4 * This file is released under the GPL.
5 */
6
7#include "dm-btree-internal.h"
8#include "dm-transaction-manager.h"
9
10#include <linux/device-mapper.h>
11
12#define DM_MSG_PREFIX "btree spine"
13
14/*----------------------------------------------------------------*/
15
16#define BTREE_CSUM_XOR 121107
17
18static int node_check(struct dm_block_validator *v,
19 struct dm_block *b,
20 size_t block_size);
21
22static void node_prepare_for_write(struct dm_block_validator *v,
23 struct dm_block *b,
24 size_t block_size)
25{
26 struct node *n = dm_block_data(b);
27 struct node_header *h = &n->header;
28
29 h->blocknr = cpu_to_le64(dm_block_location(b));
30 h->csum = cpu_to_le32(dm_bm_checksum(&h->flags,
31 block_size - sizeof(__le32),
32 BTREE_CSUM_XOR));
33
34 BUG_ON(node_check(v, b, 4096));
35}
36
37static int node_check(struct dm_block_validator *v,
38 struct dm_block *b,
39 size_t block_size)
40{
41 struct node *n = dm_block_data(b);
42 struct node_header *h = &n->header;
43 size_t value_size;
44 __le32 csum_disk;
45 uint32_t flags;
46
47 if (dm_block_location(b) != le64_to_cpu(h->blocknr)) {
48 DMERR("node_check failed blocknr %llu wanted %llu",
49 le64_to_cpu(h->blocknr), dm_block_location(b));
50 return -ENOTBLK;
51 }
52
53 csum_disk = cpu_to_le32(dm_bm_checksum(&h->flags,
54 block_size - sizeof(__le32),
55 BTREE_CSUM_XOR));
56 if (csum_disk != h->csum) {
57 DMERR("node_check failed csum %u wanted %u",
58 le32_to_cpu(csum_disk), le32_to_cpu(h->csum));
59 return -EILSEQ;
60 }
61
62 value_size = le32_to_cpu(h->value_size);
63
64 if (sizeof(struct node_header) +
65 (sizeof(__le64) + value_size) * le32_to_cpu(h->max_entries) > block_size) {
66 DMERR("node_check failed: max_entries too large");
67 return -EILSEQ;
68 }
69
70 if (le32_to_cpu(h->nr_entries) > le32_to_cpu(h->max_entries)) {
71 DMERR("node_check failed, too many entries");
72 return -EILSEQ;
73 }
74
75 /*
76 * The node must be either INTERNAL or LEAF.
77 */
78 flags = le32_to_cpu(h->flags);
79 if (!(flags & INTERNAL_NODE) && !(flags & LEAF_NODE)) {
80 DMERR("node_check failed, node is neither INTERNAL nor LEAF");
81 return -EILSEQ;
82 }
83
84 return 0;
85}
86
87struct dm_block_validator btree_node_validator = {
88 .name = "btree_node",
89 .prepare_for_write = node_prepare_for_write,
90 .check = node_check
91};
92
93/*----------------------------------------------------------------*/
94
95static int bn_read_lock(struct dm_btree_info *info, dm_block_t b,
96 struct dm_block **result)
97{
98 return dm_tm_read_lock(info->tm, b, &btree_node_validator, result);
99}
100
101static int bn_shadow(struct dm_btree_info *info, dm_block_t orig,
102 struct dm_btree_value_type *vt,
103 struct dm_block **result)
104{
105 int r, inc;
106
107 r = dm_tm_shadow_block(info->tm, orig, &btree_node_validator,
108 result, &inc);
109 if (!r && inc)
110 inc_children(info->tm, dm_block_data(*result), vt);
111
112 return r;
113}
114
115int new_block(struct dm_btree_info *info, struct dm_block **result)
116{
117 return dm_tm_new_block(info->tm, &btree_node_validator, result);
118}
119
120int unlock_block(struct dm_btree_info *info, struct dm_block *b)
121{
122 return dm_tm_unlock(info->tm, b);
123}
124
125/*----------------------------------------------------------------*/
126
127void init_ro_spine(struct ro_spine *s, struct dm_btree_info *info)
128{
129 s->info = info;
130 s->count = 0;
131 s->nodes[0] = NULL;
132 s->nodes[1] = NULL;
133}
134
135int exit_ro_spine(struct ro_spine *s)
136{
137 int r = 0, i;
138
139 for (i = 0; i < s->count; i++) {
140 int r2 = unlock_block(s->info, s->nodes[i]);
141 if (r2 < 0)
142 r = r2;
143 }
144
145 return r;
146}
147
148int ro_step(struct ro_spine *s, dm_block_t new_child)
149{
150 int r;
151
152 if (s->count == 2) {
153 r = unlock_block(s->info, s->nodes[0]);
154 if (r < 0)
155 return r;
156 s->nodes[0] = s->nodes[1];
157 s->count--;
158 }
159
160 r = bn_read_lock(s->info, new_child, s->nodes + s->count);
161 if (!r)
162 s->count++;
163
164 return r;
165}
166
167struct node *ro_node(struct ro_spine *s)
168{
169 struct dm_block *block;
170
171 BUG_ON(!s->count);
172 block = s->nodes[s->count - 1];
173
174 return dm_block_data(block);
175}
176
177/*----------------------------------------------------------------*/
178
179void init_shadow_spine(struct shadow_spine *s, struct dm_btree_info *info)
180{
181 s->info = info;
182 s->count = 0;
183}
184
185int exit_shadow_spine(struct shadow_spine *s)
186{
187 int r = 0, i;
188
189 for (i = 0; i < s->count; i++) {
190 int r2 = unlock_block(s->info, s->nodes[i]);
191 if (r2 < 0)
192 r = r2;
193 }
194
195 return r;
196}
197
198int shadow_step(struct shadow_spine *s, dm_block_t b,
199 struct dm_btree_value_type *vt)
200{
201 int r;
202
203 if (s->count == 2) {
204 r = unlock_block(s->info, s->nodes[0]);
205 if (r < 0)
206 return r;
207 s->nodes[0] = s->nodes[1];
208 s->count--;
209 }
210
211 r = bn_shadow(s->info, b, vt, s->nodes + s->count);
212 if (!r) {
213 if (!s->count)
214 s->root = dm_block_location(s->nodes[0]);
215
216 s->count++;
217 }
218
219 return r;
220}
221
222struct dm_block *shadow_current(struct shadow_spine *s)
223{
224 BUG_ON(!s->count);
225
226 return s->nodes[s->count - 1];
227}
228
229struct dm_block *shadow_parent(struct shadow_spine *s)
230{
231 BUG_ON(s->count != 2);
232
233 return s->count == 2 ? s->nodes[0] : NULL;
234}
235
236int shadow_has_parent(struct shadow_spine *s)
237{
238 return s->count >= 2;
239}
240
241int shadow_root(struct shadow_spine *s)
242{
243 return s->root;
244}
diff --git a/drivers/md/persistent-data/dm-btree.c b/drivers/md/persistent-data/dm-btree.c
new file mode 100644
index 000000000000..e0638be53ea4
--- /dev/null
+++ b/drivers/md/persistent-data/dm-btree.c
@@ -0,0 +1,805 @@
1/*
2 * Copyright (C) 2011 Red Hat, Inc.
3 *
4 * This file is released under the GPL.
5 */
6
7#include "dm-btree-internal.h"
8#include "dm-space-map.h"
9#include "dm-transaction-manager.h"
10
11#include <linux/module.h>
12#include <linux/device-mapper.h>
13
14#define DM_MSG_PREFIX "btree"
15
16/*----------------------------------------------------------------
17 * Array manipulation
18 *--------------------------------------------------------------*/
19static void memcpy_disk(void *dest, const void *src, size_t len)
20 __dm_written_to_disk(src)
21{
22 memcpy(dest, src, len);
23 __dm_unbless_for_disk(src);
24}
25
26static void array_insert(void *base, size_t elt_size, unsigned nr_elts,
27 unsigned index, void *elt)
28 __dm_written_to_disk(elt)
29{
30 if (index < nr_elts)
31 memmove(base + (elt_size * (index + 1)),
32 base + (elt_size * index),
33 (nr_elts - index) * elt_size);
34
35 memcpy_disk(base + (elt_size * index), elt, elt_size);
36}
37
38/*----------------------------------------------------------------*/
39
40/* makes the assumption that no two keys are the same. */
41static int bsearch(struct node *n, uint64_t key, int want_hi)
42{
43 int lo = -1, hi = le32_to_cpu(n->header.nr_entries);
44
45 while (hi - lo > 1) {
46 int mid = lo + ((hi - lo) / 2);
47 uint64_t mid_key = le64_to_cpu(n->keys[mid]);
48
49 if (mid_key == key)
50 return mid;
51
52 if (mid_key < key)
53 lo = mid;
54 else
55 hi = mid;
56 }
57
58 return want_hi ? hi : lo;
59}
60
61int lower_bound(struct node *n, uint64_t key)
62{
63 return bsearch(n, key, 0);
64}
65
66void inc_children(struct dm_transaction_manager *tm, struct node *n,
67 struct dm_btree_value_type *vt)
68{
69 unsigned i;
70 uint32_t nr_entries = le32_to_cpu(n->header.nr_entries);
71
72 if (le32_to_cpu(n->header.flags) & INTERNAL_NODE)
73 for (i = 0; i < nr_entries; i++)
74 dm_tm_inc(tm, value64(n, i));
75 else if (vt->inc)
76 for (i = 0; i < nr_entries; i++)
77 vt->inc(vt->context,
78 value_ptr(n, i, vt->size));
79}
80
81static int insert_at(size_t value_size, struct node *node, unsigned index,
82 uint64_t key, void *value)
83 __dm_written_to_disk(value)
84{
85 uint32_t nr_entries = le32_to_cpu(node->header.nr_entries);
86 __le64 key_le = cpu_to_le64(key);
87
88 if (index > nr_entries ||
89 index >= le32_to_cpu(node->header.max_entries)) {
90 DMERR("too many entries in btree node for insert");
91 __dm_unbless_for_disk(value);
92 return -ENOMEM;
93 }
94
95 __dm_bless_for_disk(&key_le);
96
97 array_insert(node->keys, sizeof(*node->keys), nr_entries, index, &key_le);
98 array_insert(value_base(node), value_size, nr_entries, index, value);
99 node->header.nr_entries = cpu_to_le32(nr_entries + 1);
100
101 return 0;
102}
103
104/*----------------------------------------------------------------*/
105
106/*
107 * We want 3n entries (for some n). This works more nicely for repeated
108 * insert/remove loops than (2n + 1).
109 */
110static uint32_t calc_max_entries(size_t value_size, size_t block_size)
111{
112 uint32_t total, n;
113 size_t elt_size = sizeof(uint64_t) + value_size; /* key + value */
114
115 block_size -= sizeof(struct node_header);
116 total = block_size / elt_size;
117 n = total / 3; /* rounds down */
118
119 return 3 * n;
120}
121
122int dm_btree_empty(struct dm_btree_info *info, dm_block_t *root)
123{
124 int r;
125 struct dm_block *b;
126 struct node *n;
127 size_t block_size;
128 uint32_t max_entries;
129
130 r = new_block(info, &b);
131 if (r < 0)
132 return r;
133
134 block_size = dm_bm_block_size(dm_tm_get_bm(info->tm));
135 max_entries = calc_max_entries(info->value_type.size, block_size);
136
137 n = dm_block_data(b);
138 memset(n, 0, block_size);
139 n->header.flags = cpu_to_le32(LEAF_NODE);
140 n->header.nr_entries = cpu_to_le32(0);
141 n->header.max_entries = cpu_to_le32(max_entries);
142 n->header.value_size = cpu_to_le32(info->value_type.size);
143
144 *root = dm_block_location(b);
145 return unlock_block(info, b);
146}
147EXPORT_SYMBOL_GPL(dm_btree_empty);
148
149/*----------------------------------------------------------------*/
150
151/*
152 * Deletion uses a recursive algorithm. Since we have limited stack space,
153 * we explicitly manage our own stack on the heap.
154 */
155#define MAX_SPINE_DEPTH 64
156struct frame {
157 struct dm_block *b;
158 struct node *n;
159 unsigned level;
160 unsigned nr_children;
161 unsigned current_child;
162};
163
164struct del_stack {
165 struct dm_transaction_manager *tm;
166 int top;
167 struct frame spine[MAX_SPINE_DEPTH];
168};
169
170static int top_frame(struct del_stack *s, struct frame **f)
171{
172 if (s->top < 0) {
173 DMERR("btree deletion stack empty");
174 return -EINVAL;
175 }
176
177 *f = s->spine + s->top;
178
179 return 0;
180}
181
182static int unprocessed_frames(struct del_stack *s)
183{
184 return s->top >= 0;
185}
186
187static int push_frame(struct del_stack *s, dm_block_t b, unsigned level)
188{
189 int r;
190 uint32_t ref_count;
191
192 if (s->top >= MAX_SPINE_DEPTH - 1) {
193 DMERR("btree deletion stack out of memory");
194 return -ENOMEM;
195 }
196
197 r = dm_tm_ref(s->tm, b, &ref_count);
198 if (r)
199 return r;
200
201 if (ref_count > 1)
202 /*
203 * This is a shared node, so we can just decrement its
204 * reference counter and leave the children.
205 */
206 dm_tm_dec(s->tm, b);
207
208 else {
209 struct frame *f = s->spine + ++s->top;
210
211 r = dm_tm_read_lock(s->tm, b, &btree_node_validator, &f->b);
212 if (r) {
213 s->top--;
214 return r;
215 }
216
217 f->n = dm_block_data(f->b);
218 f->level = level;
219 f->nr_children = le32_to_cpu(f->n->header.nr_entries);
220 f->current_child = 0;
221 }
222
223 return 0;
224}
225
226static void pop_frame(struct del_stack *s)
227{
228 struct frame *f = s->spine + s->top--;
229
230 dm_tm_dec(s->tm, dm_block_location(f->b));
231 dm_tm_unlock(s->tm, f->b);
232}
233
234int dm_btree_del(struct dm_btree_info *info, dm_block_t root)
235{
236 int r;
237 struct del_stack *s;
238
239 s = kmalloc(sizeof(*s), GFP_KERNEL);
240 if (!s)
241 return -ENOMEM;
242 s->tm = info->tm;
243 s->top = -1;
244
245 r = push_frame(s, root, 1);
246 if (r)
247 goto out;
248
249 while (unprocessed_frames(s)) {
250 uint32_t flags;
251 struct frame *f;
252 dm_block_t b;
253
254 r = top_frame(s, &f);
255 if (r)
256 goto out;
257
258 if (f->current_child >= f->nr_children) {
259 pop_frame(s);
260 continue;
261 }
262
263 flags = le32_to_cpu(f->n->header.flags);
264 if (flags & INTERNAL_NODE) {
265 b = value64(f->n, f->current_child);
266 f->current_child++;
267 r = push_frame(s, b, f->level);
268 if (r)
269 goto out;
270
271 } else if (f->level != (info->levels - 1)) {
272 b = value64(f->n, f->current_child);
273 f->current_child++;
274 r = push_frame(s, b, f->level + 1);
275 if (r)
276 goto out;
277
278 } else {
279 if (info->value_type.dec) {
280 unsigned i;
281
282 for (i = 0; i < f->nr_children; i++)
283 info->value_type.dec(info->value_type.context,
284 value_ptr(f->n, i, info->value_type.size));
285 }
286 f->current_child = f->nr_children;
287 }
288 }
289
290out:
291 kfree(s);
292 return r;
293}
294EXPORT_SYMBOL_GPL(dm_btree_del);
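A usage sketch (not part of this patch): a target would typically call this when an entire logical object goes away, for example dropping a device's mapping tree during destruction, and never from an I/O path since it can block. md->info and md->mapping_root below stand for whatever metadata the caller keeps:

        r = dm_btree_del(&md->info, md->mapping_root);
        if (r)
                DMERR("could not delete mapping tree");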
295
296/*----------------------------------------------------------------*/
297
298static int btree_lookup_raw(struct ro_spine *s, dm_block_t block, uint64_t key,
299 int (*search_fn)(struct node *, uint64_t),
300 uint64_t *result_key, void *v, size_t value_size)
301{
302 int i, r;
303 uint32_t flags, nr_entries;
304
305 do {
306 r = ro_step(s, block);
307 if (r < 0)
308 return r;
309
310 i = search_fn(ro_node(s), key);
311
312 flags = le32_to_cpu(ro_node(s)->header.flags);
313 nr_entries = le32_to_cpu(ro_node(s)->header.nr_entries);
314 if (i < 0 || i >= nr_entries)
315 return -ENODATA;
316
317 if (flags & INTERNAL_NODE)
318 block = value64(ro_node(s), i);
319
320 } while (!(flags & LEAF_NODE));
321
322 *result_key = le64_to_cpu(ro_node(s)->keys[i]);
323 memcpy(v, value_ptr(ro_node(s), i, value_size), value_size);
324
325 return 0;
326}
327
328int dm_btree_lookup(struct dm_btree_info *info, dm_block_t root,
329 uint64_t *keys, void *value_le)
330{
331 unsigned level, last_level = info->levels - 1;
332 int r = -ENODATA;
333 uint64_t rkey;
334 __le64 internal_value_le;
335 struct ro_spine spine;
336
337 init_ro_spine(&spine, info);
338 for (level = 0; level < info->levels; level++) {
339 size_t size;
340 void *value_p;
341
342 if (level == last_level) {
343 value_p = value_le;
344 size = info->value_type.size;
345
346 } else {
347 value_p = &internal_value_le;
348 size = sizeof(uint64_t);
349 }
350
351 r = btree_lookup_raw(&spine, root, keys[level],
352 lower_bound, &rkey,
353 value_p, size);
354
355 if (!r) {
356 if (rkey != keys[level]) {
357 exit_ro_spine(&spine);
358 return -ENODATA;
359 }
360 } else {
361 exit_ro_spine(&spine);
362 return r;
363 }
364
365 root = le64_to_cpu(internal_value_le);
366 }
367 exit_ro_spine(&spine);
368
369 return r;
370}
371EXPORT_SYMBOL_GPL(dm_btree_lookup);
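For illustration, a lookup in a two-level tree (info->levels == 2, e.g. device id then logical block) with __le64 values might look like this; dev_id, lblock, result and the __le64 value type are assumptions for the sketch:

        uint64_t keys[2] = { dev_id, lblock };
        __le64 value_le;
        int r;

        r = dm_btree_lookup(info, root, keys, &value_le);
        if (r == -ENODATA)
                return r;                       /* key not present */
        if (r)
                return r;                       /* I/O or validator error */
        result = le64_to_cpu(value_le);         /* values come back little-endian */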
372
373/*
374 * Splits a node by creating a sibling node and shifting half the node's
375 * contents across. Assumes there is a parent node, and it has room for
376 * another child.
377 *
378 * Before:
379 * +--------+
380 * | Parent |
381 * +--------+
382 * |
383 * v
384 * +----------+
385 * | A ++++++ |
386 * +----------+
387 *
388 *
389 * After:
390 * +--------+
391 * | Parent |
392 * +--------+
393 * | |
394 * v +------+
395 * +---------+ |
396 * | A* +++ | v
397 * +---------+ +-------+
398 * | B +++ |
399 * +-------+
400 *
401 * Where A* is a shadow of A.
402 */
403static int btree_split_sibling(struct shadow_spine *s, dm_block_t root,
404 unsigned parent_index, uint64_t key)
405{
406 int r;
407 size_t size;
408 unsigned nr_left, nr_right;
409 struct dm_block *left, *right, *parent;
410 struct node *ln, *rn, *pn;
411 __le64 location;
412
413 left = shadow_current(s);
414
415 r = new_block(s->info, &right);
416 if (r < 0)
417 return r;
418
419 ln = dm_block_data(left);
420 rn = dm_block_data(right);
421
422 nr_left = le32_to_cpu(ln->header.nr_entries) / 2;
423 nr_right = le32_to_cpu(ln->header.nr_entries) - nr_left;
424
425 ln->header.nr_entries = cpu_to_le32(nr_left);
426
427 rn->header.flags = ln->header.flags;
428 rn->header.nr_entries = cpu_to_le32(nr_right);
429 rn->header.max_entries = ln->header.max_entries;
430 rn->header.value_size = ln->header.value_size;
431 memcpy(rn->keys, ln->keys + nr_left, nr_right * sizeof(rn->keys[0]));
432
433 size = le32_to_cpu(ln->header.flags) & INTERNAL_NODE ?
434 sizeof(uint64_t) : s->info->value_type.size;
435 memcpy(value_ptr(rn, 0, size), value_ptr(ln, nr_left, size),
436 size * nr_right);
437
438 /*
439 * Patch up the parent
440 */
441 parent = shadow_parent(s);
442
443 pn = dm_block_data(parent);
444 location = cpu_to_le64(dm_block_location(left));
445 __dm_bless_for_disk(&location);
446 memcpy_disk(value_ptr(pn, parent_index, sizeof(__le64)),
447 &location, sizeof(__le64));
448
449 location = cpu_to_le64(dm_block_location(right));
450 __dm_bless_for_disk(&location);
451
452 r = insert_at(sizeof(__le64), pn, parent_index + 1,
453 le64_to_cpu(rn->keys[0]), &location);
454 if (r)
455 return r;
456
457 if (key < le64_to_cpu(rn->keys[0])) {
458 unlock_block(s->info, right);
459 s->nodes[1] = left;
460 } else {
461 unlock_block(s->info, left);
462 s->nodes[1] = right;
463 }
464
465 return 0;
466}
467
468/*
469 * Splits a node by creating two new children beneath the given node.
470 *
471 * Before:
472 * +----------+
473 * | A ++++++ |
474 * +----------+
475 *
476 *
477 * After:
478 * +------------+
479 * | A (shadow) |
480 * +------------+
481 * | |
482 * +------+ +----+
483 * | |
484 * v v
485 * +-------+ +-------+
486 * | B +++ | | C +++ |
487 * +-------+ +-------+
488 */
489static int btree_split_beneath(struct shadow_spine *s, uint64_t key)
490{
491 int r;
492 size_t size;
493 unsigned nr_left, nr_right;
494 struct dm_block *left, *right, *new_parent;
495 struct node *pn, *ln, *rn;
496 __le64 val;
497
498 new_parent = shadow_current(s);
499
500 r = new_block(s->info, &left);
501 if (r < 0)
502 return r;
503
504 r = new_block(s->info, &right);
505 if (r < 0) {
506 /* FIXME: put left */
507 return r;
508 }
509
510 pn = dm_block_data(new_parent);
511 ln = dm_block_data(left);
512 rn = dm_block_data(right);
513
514 nr_left = le32_to_cpu(pn->header.nr_entries) / 2;
515 nr_right = le32_to_cpu(pn->header.nr_entries) - nr_left;
516
517 ln->header.flags = pn->header.flags;
518 ln->header.nr_entries = cpu_to_le32(nr_left);
519 ln->header.max_entries = pn->header.max_entries;
520 ln->header.value_size = pn->header.value_size;
521
522 rn->header.flags = pn->header.flags;
523 rn->header.nr_entries = cpu_to_le32(nr_right);
524 rn->header.max_entries = pn->header.max_entries;
525 rn->header.value_size = pn->header.value_size;
526
527 memcpy(ln->keys, pn->keys, nr_left * sizeof(pn->keys[0]));
528 memcpy(rn->keys, pn->keys + nr_left, nr_right * sizeof(pn->keys[0]));
529
530 size = le32_to_cpu(pn->header.flags) & INTERNAL_NODE ?
531 sizeof(__le64) : s->info->value_type.size;
532 memcpy(value_ptr(ln, 0, size), value_ptr(pn, 0, size), nr_left * size);
533 memcpy(value_ptr(rn, 0, size), value_ptr(pn, nr_left, size),
534 nr_right * size);
535
536 /* new_parent should just point to l and r now */
537 pn->header.flags = cpu_to_le32(INTERNAL_NODE);
538 pn->header.nr_entries = cpu_to_le32(2);
539 pn->header.max_entries = cpu_to_le32(
540 calc_max_entries(sizeof(__le64),
541 dm_bm_block_size(
542 dm_tm_get_bm(s->info->tm))));
543 pn->header.value_size = cpu_to_le32(sizeof(__le64));
544
545 val = cpu_to_le64(dm_block_location(left));
546 __dm_bless_for_disk(&val);
547 pn->keys[0] = ln->keys[0];
548 memcpy_disk(value_ptr(pn, 0, sizeof(__le64)), &val, sizeof(__le64));
549
550 val = cpu_to_le64(dm_block_location(right));
551 __dm_bless_for_disk(&val);
552 pn->keys[1] = rn->keys[0];
553 memcpy_disk(value_ptr(pn, 1, sizeof(__le64)), &val, sizeof(__le64));
554
555 /*
556 * rejig the spine. This is ugly, since it knows too
557 * much about the spine
558 */
559 if (s->nodes[0] != new_parent) {
560 unlock_block(s->info, s->nodes[0]);
561 s->nodes[0] = new_parent;
562 }
563 if (key < le64_to_cpu(rn->keys[0])) {
564 unlock_block(s->info, right);
565 s->nodes[1] = left;
566 } else {
567 unlock_block(s->info, left);
568 s->nodes[1] = right;
569 }
570 s->count = 2;
571
572 return 0;
573}
574
575static int btree_insert_raw(struct shadow_spine *s, dm_block_t root,
576 struct dm_btree_value_type *vt,
577 uint64_t key, unsigned *index)
578{
579 int r, i = *index, top = 1;
580 struct node *node;
581
582 for (;;) {
583 r = shadow_step(s, root, vt);
584 if (r < 0)
585 return r;
586
587 node = dm_block_data(shadow_current(s));
588
589 /*
590 * We have to patch up the parent node. It's ugly, but I don't
591 * see a way to do this automatically as part of the spine
592 * op.
593 */
594 if (shadow_has_parent(s) && i >= 0) { /* FIXME: second clause is unnecessary */
595 __le64 location = cpu_to_le64(dm_block_location(shadow_current(s)));
596
597 __dm_bless_for_disk(&location);
598 memcpy_disk(value_ptr(dm_block_data(shadow_parent(s)), i, sizeof(uint64_t)),
599 &location, sizeof(__le64));
600 }
601
602 node = dm_block_data(shadow_current(s));
603
604 if (node->header.nr_entries == node->header.max_entries) {
605 if (top)
606 r = btree_split_beneath(s, key);
607 else
608 r = btree_split_sibling(s, root, i, key);
609
610 if (r < 0)
611 return r;
612 }
613
614 node = dm_block_data(shadow_current(s));
615
616 i = lower_bound(node, key);
617
618 if (le32_to_cpu(node->header.flags) & LEAF_NODE)
619 break;
620
621 if (i < 0) {
622 /* change the bounds on the lowest key */
623 node->keys[0] = cpu_to_le64(key);
624 i = 0;
625 }
626
627 root = value64(node, i);
628 top = 0;
629 }
630
631 if (i < 0 || le64_to_cpu(node->keys[i]) != key)
632 i++;
633
634 *index = i;
635 return 0;
636}
637
638static int insert(struct dm_btree_info *info, dm_block_t root,
639 uint64_t *keys, void *value, dm_block_t *new_root,
640 int *inserted)
641 __dm_written_to_disk(value)
642{
643 int r, need_insert;
644 unsigned level, index = -1, last_level = info->levels - 1;
645 dm_block_t block = root;
646 struct shadow_spine spine;
647 struct node *n;
648 struct dm_btree_value_type le64_type;
649
650 le64_type.context = NULL;
651 le64_type.size = sizeof(__le64);
652 le64_type.inc = NULL;
653 le64_type.dec = NULL;
654 le64_type.equal = NULL;
655
656 init_shadow_spine(&spine, info);
657
658 for (level = 0; level < (info->levels - 1); level++) {
659 r = btree_insert_raw(&spine, block, &le64_type, keys[level], &index);
660 if (r < 0)
661 goto bad;
662
663 n = dm_block_data(shadow_current(&spine));
664 need_insert = ((index >= le32_to_cpu(n->header.nr_entries)) ||
665 (le64_to_cpu(n->keys[index]) != keys[level]));
666
667 if (need_insert) {
668 dm_block_t new_tree;
669 __le64 new_le;
670
671 r = dm_btree_empty(info, &new_tree);
672 if (r < 0)
673 goto bad;
674
675 new_le = cpu_to_le64(new_tree);
676 __dm_bless_for_disk(&new_le);
677
678 r = insert_at(sizeof(uint64_t), n, index,
679 keys[level], &new_le);
680 if (r)
681 goto bad;
682 }
683
684 if (level < last_level)
685 block = value64(n, index);
686 }
687
688 r = btree_insert_raw(&spine, block, &info->value_type,
689 keys[level], &index);
690 if (r < 0)
691 goto bad;
692
693 n = dm_block_data(shadow_current(&spine));
694 need_insert = ((index >= le32_to_cpu(n->header.nr_entries)) ||
695 (le64_to_cpu(n->keys[index]) != keys[level]));
696
697 if (need_insert) {
698 if (inserted)
699 *inserted = 1;
700
701 r = insert_at(info->value_type.size, n, index,
702 keys[level], value);
703 if (r)
704 goto bad_unblessed;
705 } else {
706 if (inserted)
707 *inserted = 0;
708
709 if (info->value_type.dec &&
710 (!info->value_type.equal ||
711 !info->value_type.equal(
712 info->value_type.context,
713 value_ptr(n, index, info->value_type.size),
714 value))) {
715 info->value_type.dec(info->value_type.context,
716 value_ptr(n, index, info->value_type.size));
717 }
718 memcpy_disk(value_ptr(n, index, info->value_type.size),
719 value, info->value_type.size);
720 }
721
722 *new_root = shadow_root(&spine);
723 exit_shadow_spine(&spine);
724
725 return 0;
726
727bad:
728 __dm_unbless_for_disk(value);
729bad_unblessed:
730 exit_shadow_spine(&spine);
731 return r;
732}
733
734int dm_btree_insert(struct dm_btree_info *info, dm_block_t root,
735 uint64_t *keys, void *value, dm_block_t *new_root)
736 __dm_written_to_disk(value)
737{
738 return insert(info, root, keys, value, new_root, NULL);
739}
740EXPORT_SYMBOL_GPL(dm_btree_insert);
741
742int dm_btree_insert_notify(struct dm_btree_info *info, dm_block_t root,
743 uint64_t *keys, void *value, dm_block_t *new_root,
744 int *inserted)
745 __dm_written_to_disk(value)
746{
747 return insert(info, root, keys, value, new_root, inserted);
748}
749EXPORT_SYMBOL_GPL(dm_btree_insert_notify);
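A sketch of the notify variant being used to keep a per-tree entry count; nr_entries, data_block and new_root are placeholders for the caller's own state:

        __le64 value_le = cpu_to_le64(data_block);
        int r, inserted;

        __dm_bless_for_disk(&value_le);
        r = dm_btree_insert_notify(info, root, keys, &value_le,
                                   &new_root, &inserted);
        if (r)
                return r;

        root = new_root;
        if (inserted)
                nr_entries++;           /* a genuinely new key, not an overwrite */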
750
751/*----------------------------------------------------------------*/
752
753static int find_highest_key(struct ro_spine *s, dm_block_t block,
754 uint64_t *result_key, dm_block_t *next_block)
755{
756 int i, r;
757 uint32_t flags;
758
759 do {
760 r = ro_step(s, block);
761 if (r < 0)
762 return r;
763
764 flags = le32_to_cpu(ro_node(s)->header.flags);
765 i = le32_to_cpu(ro_node(s)->header.nr_entries);
766 if (!i)
767 return -ENODATA;
768 else
769 i--;
770
771 *result_key = le64_to_cpu(ro_node(s)->keys[i]);
772 if (next_block || flags & INTERNAL_NODE)
773 block = value64(ro_node(s), i);
774
775 } while (flags & INTERNAL_NODE);
776
777 if (next_block)
778 *next_block = block;
779 return 0;
780}
781
782int dm_btree_find_highest_key(struct dm_btree_info *info, dm_block_t root,
783 uint64_t *result_keys)
784{
785 int r = 0, count = 0, level;
786 struct ro_spine spine;
787
788 init_ro_spine(&spine, info);
789 for (level = 0; level < info->levels; level++) {
790 r = find_highest_key(&spine, root, result_keys + level,
791 level == info->levels - 1 ? NULL : &root);
792 if (r == -ENODATA) {
793 r = 0;
794 break;
795
796 } else if (r)
797 break;
798
799 count++;
800 }
801 exit_ro_spine(&spine);
802
803 return r ? r : count;
804}
805EXPORT_SYMBOL_GPL(dm_btree_find_highest_key);
diff --git a/drivers/md/persistent-data/dm-btree.h b/drivers/md/persistent-data/dm-btree.h
new file mode 100644
index 000000000000..ae02c84410ff
--- /dev/null
+++ b/drivers/md/persistent-data/dm-btree.h
@@ -0,0 +1,145 @@
1/*
2 * Copyright (C) 2011 Red Hat, Inc.
3 *
4 * This file is released under the GPL.
5 */
6#ifndef _LINUX_DM_BTREE_H
7#define _LINUX_DM_BTREE_H
8
9#include "dm-block-manager.h"
10
11struct dm_transaction_manager;
12
13/*----------------------------------------------------------------*/
14
15/*
16 * Annotations used to check on-disk metadata is handled as little-endian.
17 */
18#ifdef __CHECKER__
19# define __dm_written_to_disk(x) __releases(x)
20# define __dm_reads_from_disk(x) __acquires(x)
21# define __dm_bless_for_disk(x) __acquire(x)
22# define __dm_unbless_for_disk(x) __release(x)
23#else
24# define __dm_written_to_disk(x)
25# define __dm_reads_from_disk(x)
26# define __dm_bless_for_disk(x)
27# define __dm_unbless_for_disk(x)
28#endif
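Under sparse these expand to acquire/release style annotations, so a value handed to an insert must be "blessed" after it has been converted to little-endian. A typical call site (a sketch only) looks like:

        __le64 value_le = cpu_to_le64(block);

        __dm_bless_for_disk(&value_le);         /* pairs with __dm_written_to_disk */
        r = dm_btree_insert(info, root, keys, &value_le, &new_root);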
29
30/*----------------------------------------------------------------*/
31
32/*
33 * Manipulates hierarchical B+ trees with 64-bit keys and arbitrary-sized
34 * values.
35 */
36
37/*
38 * Information about the values stored within the btree.
39 */
40struct dm_btree_value_type {
41 void *context;
42
43 /*
44 * The size in bytes of each value.
45 */
46 uint32_t size;
47
48 /*
49 * Any of these methods can be safely set to NULL if you do not
50 * need the corresponding feature.
51 */
52
53 /*
54 * The btree is making a duplicate of the value, for instance
55 * because previously-shared btree nodes have now diverged.
56 * @value argument is the new copy that the copy function may modify.
57 * (Probably it just wants to increment a reference count
58 * somewhere.) This method is _not_ called for insertion of a new
59 * value: It is assumed the ref count is already 1.
60 */
61 void (*inc)(void *context, void *value);
62
63 /*
64 * This value is being deleted. The btree takes care of freeing
65 * the memory pointed to by @value. Often the del function just
66 * needs to decrement a reference count somewhere.
67 */
68 void (*dec)(void *context, void *value);
69
70 /*
71 * A test for equality between two values. When a value is
72 * overwritten with a new one, the old one has the dec method
73 * called _unless_ the new and old value are deemed equal.
74 */
75 int (*equal)(void *context, void *value1, void *value2);
76};
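As an illustration only (none of this is in the patch), a tree whose values are little-endian data-block addresses could hook these callbacks up to a space map, so that sharing or deleting btree nodes keeps the block reference counts correct. value_type.size would be sizeof(__le64) and value_type.context would point at the space map:

static void block_value_inc(void *context, void *value_le)
{
        struct dm_space_map *sm = context;
        __le64 v;

        memcpy(&v, value_le, sizeof(v));        /* value may be unaligned */
        dm_sm_inc_block(sm, le64_to_cpu(v));
}

static void block_value_dec(void *context, void *value_le)
{
        struct dm_space_map *sm = context;
        __le64 v;

        memcpy(&v, value_le, sizeof(v));
        dm_sm_dec_block(sm, le64_to_cpu(v));
}

static int block_value_equal(void *context, void *value1_le, void *value2_le)
{
        return !memcmp(value1_le, value2_le, sizeof(__le64));
}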
77
78/*
79 * The shape and contents of a btree.
80 */
81struct dm_btree_info {
82 struct dm_transaction_manager *tm;
83
84 /*
85 * Number of nested btrees. (Not the depth of a single tree.)
86 */
87 unsigned levels;
88 struct dm_btree_value_type value_type;
89};
90
91/*
92 * Set up an empty tree. O(1).
93 */
94int dm_btree_empty(struct dm_btree_info *info, dm_block_t *root);
95
96/*
97 * Delete a tree. O(n) - this is the slow one! It can also block, so
98 * please don't call it on an IO path.
99 */
100int dm_btree_del(struct dm_btree_info *info, dm_block_t root);
101
102/*
103 * All the lookup functions return -ENODATA if the key cannot be found.
104 */
105
106/*
107 * Tries to find a key that matches exactly. O(ln(n))
108 */
109int dm_btree_lookup(struct dm_btree_info *info, dm_block_t root,
110 uint64_t *keys, void *value_le);
111
112/*
113 * Insertion (or overwrite an existing value). O(ln(n))
114 */
115int dm_btree_insert(struct dm_btree_info *info, dm_block_t root,
116 uint64_t *keys, void *value, dm_block_t *new_root)
117 __dm_written_to_disk(value);
118
119/*
120 * A variant of insert that indicates whether it actually inserted or just
121 * overwrote. Useful if you're keeping track of the number of entries in a
122 * tree.
123 */
124int dm_btree_insert_notify(struct dm_btree_info *info, dm_block_t root,
125 uint64_t *keys, void *value, dm_block_t *new_root,
126 int *inserted)
127 __dm_written_to_disk(value);
128
129/*
130 * Remove a key if present. This doesn't remove empty sub trees. Normally
131 * subtrees represent a separate entity, like a snapshot map, so this is
132 * correct behaviour. O(ln(n)).
133 */
134int dm_btree_remove(struct dm_btree_info *info, dm_block_t root,
135 uint64_t *keys, dm_block_t *new_root);
136
137/*
138 * Returns < 0 on failure. Otherwise the number of key entries that have
139 * been filled out. Remember trees can have zero entries, and as such have
140 * no highest key.
141 */
142int dm_btree_find_highest_key(struct dm_btree_info *info, dm_block_t root,
143 uint64_t *result_keys);
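For example, a caller might use it like this (a sketch; two levels is an assumption):

        uint64_t highest[2];            /* one slot per level */
        int r = dm_btree_find_highest_key(info, root, highest);

        if (r < 0)
                return r;
        if (r < info->levels)
                return -ENODATA;        /* some level was empty */
        /* highest[info->levels - 1] is the largest key along the rightmost path */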
144
145#endif /* _LINUX_DM_BTREE_H */
diff --git a/drivers/md/persistent-data/dm-persistent-data-internal.h b/drivers/md/persistent-data/dm-persistent-data-internal.h
new file mode 100644
index 000000000000..c49e26fff36c
--- /dev/null
+++ b/drivers/md/persistent-data/dm-persistent-data-internal.h
@@ -0,0 +1,19 @@
1/*
2 * Copyright (C) 2011 Red Hat, Inc.
3 *
4 * This file is released under the GPL.
5 */
6
7#ifndef _DM_PERSISTENT_DATA_INTERNAL_H
8#define _DM_PERSISTENT_DATA_INTERNAL_H
9
10#include "dm-block-manager.h"
11
12static inline unsigned dm_hash_block(dm_block_t b, unsigned hash_mask)
13{
14 const unsigned BIG_PRIME = 4294967291UL;
15
16 return (((unsigned) b) * BIG_PRIME) & hash_mask;
17}
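hash_mask is expected to be one less than a power-of-two table size, for example (a sketch; NR_BUCKETS is arbitrary):

#define NR_BUCKETS 1024                         /* power of two */
        unsigned bucket = dm_hash_block(b, NR_BUCKETS - 1);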
18
19#endif /* _DM_PERSISTENT_DATA_INTERNAL_H */
diff --git a/drivers/md/persistent-data/dm-space-map-checker.c b/drivers/md/persistent-data/dm-space-map-checker.c
new file mode 100644
index 000000000000..bb44a937fe63
--- /dev/null
+++ b/drivers/md/persistent-data/dm-space-map-checker.c
@@ -0,0 +1,437 @@
1/*
2 * Copyright (C) 2011 Red Hat, Inc.
3 *
4 * This file is released under the GPL.
5 */
6
7#include "dm-space-map-checker.h"
8
9#include <linux/device-mapper.h>
10
11#ifdef CONFIG_DM_DEBUG_SPACE_MAPS
12
13#define DM_MSG_PREFIX "space map checker"
14
15/*----------------------------------------------------------------*/
16
17struct count_array {
18 dm_block_t nr;
19 dm_block_t nr_free;
20
21 uint32_t *counts;
22};
23
24static int ca_get_count(struct count_array *ca, dm_block_t b, uint32_t *count)
25{
26 if (b >= ca->nr)
27 return -EINVAL;
28
29 *count = ca->counts[b];
30 return 0;
31}
32
33static int ca_count_more_than_one(struct count_array *ca, dm_block_t b, int *r)
34{
35 if (b >= ca->nr)
36 return -EINVAL;
37
38 *r = ca->counts[b] > 1;
39 return 0;
40}
41
42static int ca_set_count(struct count_array *ca, dm_block_t b, uint32_t count)
43{
44 uint32_t old_count;
45
46 if (b >= ca->nr)
47 return -EINVAL;
48
49 old_count = ca->counts[b];
50
51 if (!count && old_count)
52 ca->nr_free++;
53
54 else if (count && !old_count)
55 ca->nr_free--;
56
57 ca->counts[b] = count;
58 return 0;
59}
60
61static int ca_inc_block(struct count_array *ca, dm_block_t b)
62{
63 if (b >= ca->nr)
64 return -EINVAL;
65
66 ca_set_count(ca, b, ca->counts[b] + 1);
67 return 0;
68}
69
70static int ca_dec_block(struct count_array *ca, dm_block_t b)
71{
72 if (b >= ca->nr)
73 return -EINVAL;
74
75 BUG_ON(ca->counts[b] == 0);
76 ca_set_count(ca, b, ca->counts[b] - 1);
77 return 0;
78}
79
80static int ca_create(struct count_array *ca, struct dm_space_map *sm)
81{
82 int r;
83 dm_block_t nr_blocks;
84
85 r = dm_sm_get_nr_blocks(sm, &nr_blocks);
86 if (r)
87 return r;
88
89 ca->nr = nr_blocks;
90 ca->nr_free = nr_blocks;
91 ca->counts = kzalloc(sizeof(*ca->counts) * nr_blocks, GFP_KERNEL);
92 if (!ca->counts)
93 return -ENOMEM;
94
95 return 0;
96}
97
98static int ca_load(struct count_array *ca, struct dm_space_map *sm)
99{
100 int r;
101 uint32_t count;
102 dm_block_t nr_blocks, i;
103
104 r = dm_sm_get_nr_blocks(sm, &nr_blocks);
105 if (r)
106 return r;
107
108 BUG_ON(ca->nr != nr_blocks);
109
110 DMWARN("Loading debug space map from disk. This may take some time");
111 for (i = 0; i < nr_blocks; i++) {
112 r = dm_sm_get_count(sm, i, &count);
113 if (r) {
114 DMERR("load failed");
115 return r;
116 }
117
118 ca_set_count(ca, i, count);
119 }
120 DMWARN("Load complete");
121
122 return 0;
123}
124
125static int ca_extend(struct count_array *ca, dm_block_t extra_blocks)
126{
127 dm_block_t nr_blocks = ca->nr + extra_blocks;
128 uint32_t *counts = kzalloc(sizeof(*counts) * nr_blocks, GFP_KERNEL);
129 if (!counts)
130 return -ENOMEM;
131
132 memcpy(counts, ca->counts, sizeof(*counts) * ca->nr);
133 kfree(ca->counts);
134 ca->nr = nr_blocks;
135 ca->nr_free += extra_blocks;
136 ca->counts = counts;
137 return 0;
138}
139
140static int ca_commit(struct count_array *old, struct count_array *new)
141{
142 if (old->nr != new->nr) {
143 BUG_ON(old->nr > new->nr);
144 ca_extend(old, new->nr - old->nr);
145 }
146
147 BUG_ON(old->nr != new->nr);
148 old->nr_free = new->nr_free;
149 memcpy(old->counts, new->counts, sizeof(*old->counts) * old->nr);
150 return 0;
151}
152
153static void ca_destroy(struct count_array *ca)
154{
155 kfree(ca->counts);
156}
157
158/*----------------------------------------------------------------*/
159
160struct sm_checker {
161 struct dm_space_map sm;
162
163 struct count_array old_counts;
164 struct count_array counts;
165
166 struct dm_space_map *real_sm;
167};
168
169static void sm_checker_destroy(struct dm_space_map *sm)
170{
171 struct sm_checker *smc = container_of(sm, struct sm_checker, sm);
172
173 dm_sm_destroy(smc->real_sm);
174 ca_destroy(&smc->old_counts);
175 ca_destroy(&smc->counts);
176 kfree(smc);
177}
178
179static int sm_checker_get_nr_blocks(struct dm_space_map *sm, dm_block_t *count)
180{
181 struct sm_checker *smc = container_of(sm, struct sm_checker, sm);
182 int r = dm_sm_get_nr_blocks(smc->real_sm, count);
183 if (!r)
184 BUG_ON(smc->old_counts.nr != *count);
185 return r;
186}
187
188static int sm_checker_get_nr_free(struct dm_space_map *sm, dm_block_t *count)
189{
190 struct sm_checker *smc = container_of(sm, struct sm_checker, sm);
191 int r = dm_sm_get_nr_free(smc->real_sm, count);
192 if (!r) {
193 /*
194 * Slow, but we know it's correct.
195 */
196 dm_block_t b, n = 0;
197 for (b = 0; b < smc->old_counts.nr; b++)
198 if (smc->old_counts.counts[b] == 0 &&
199 smc->counts.counts[b] == 0)
200 n++;
201
202 if (n != *count)
203 DMERR("free block counts differ, checker %u, sm-disk:%u",
204 (unsigned) n, (unsigned) *count);
205 }
206 return r;
207}
208
209static int sm_checker_new_block(struct dm_space_map *sm, dm_block_t *b)
210{
211 struct sm_checker *smc = container_of(sm, struct sm_checker, sm);
212 int r = dm_sm_new_block(smc->real_sm, b);
213
214 if (!r) {
215 BUG_ON(*b >= smc->old_counts.nr);
216 BUG_ON(smc->old_counts.counts[*b] != 0);
217 BUG_ON(*b >= smc->counts.nr);
218 BUG_ON(smc->counts.counts[*b] != 0);
219 ca_set_count(&smc->counts, *b, 1);
220 }
221
222 return r;
223}
224
225static int sm_checker_inc_block(struct dm_space_map *sm, dm_block_t b)
226{
227 struct sm_checker *smc = container_of(sm, struct sm_checker, sm);
228 int r = dm_sm_inc_block(smc->real_sm, b);
229 int r2 = ca_inc_block(&smc->counts, b);
230 BUG_ON(r != r2);
231 return r;
232}
233
234static int sm_checker_dec_block(struct dm_space_map *sm, dm_block_t b)
235{
236 struct sm_checker *smc = container_of(sm, struct sm_checker, sm);
237 int r = dm_sm_dec_block(smc->real_sm, b);
238 int r2 = ca_dec_block(&smc->counts, b);
239 BUG_ON(r != r2);
240 return r;
241}
242
243static int sm_checker_get_count(struct dm_space_map *sm, dm_block_t b, uint32_t *result)
244{
245 struct sm_checker *smc = container_of(sm, struct sm_checker, sm);
246 uint32_t result2 = 0;
247 int r = dm_sm_get_count(smc->real_sm, b, result);
248 int r2 = ca_get_count(&smc->counts, b, &result2);
249
250 BUG_ON(r != r2);
251 if (!r)
252 BUG_ON(*result != result2);
253 return r;
254}
255
256static int sm_checker_count_more_than_one(struct dm_space_map *sm, dm_block_t b, int *result)
257{
258 struct sm_checker *smc = container_of(sm, struct sm_checker, sm);
259 int result2 = 0;
260 int r = dm_sm_count_is_more_than_one(smc->real_sm, b, result);
261 int r2 = ca_count_more_than_one(&smc->counts, b, &result2);
262
263 BUG_ON(r != r2);
264 if (!r)
265 BUG_ON(!(*result) && result2);
266 return r;
267}
268
269static int sm_checker_set_count(struct dm_space_map *sm, dm_block_t b, uint32_t count)
270{
271 struct sm_checker *smc = container_of(sm, struct sm_checker, sm);
272 uint32_t old_rc;
273 int r = dm_sm_set_count(smc->real_sm, b, count);
274 int r2;
275
276 BUG_ON(b >= smc->counts.nr);
277 old_rc = smc->counts.counts[b];
278 r2 = ca_set_count(&smc->counts, b, count);
279 BUG_ON(r != r2);
280
281 return r;
282}
283
284static int sm_checker_commit(struct dm_space_map *sm)
285{
286 struct sm_checker *smc = container_of(sm, struct sm_checker, sm);
287 int r;
288
289 r = dm_sm_commit(smc->real_sm);
290 if (r)
291 return r;
292
293 r = ca_commit(&smc->old_counts, &smc->counts);
294 if (r)
295 return r;
296
297 return 0;
298}
299
300static int sm_checker_extend(struct dm_space_map *sm, dm_block_t extra_blocks)
301{
302 struct sm_checker *smc = container_of(sm, struct sm_checker, sm);
303 int r = dm_sm_extend(smc->real_sm, extra_blocks);
304 if (r)
305 return r;
306
307 return ca_extend(&smc->counts, extra_blocks);
308}
309
310static int sm_checker_root_size(struct dm_space_map *sm, size_t *result)
311{
312 struct sm_checker *smc = container_of(sm, struct sm_checker, sm);
313 return dm_sm_root_size(smc->real_sm, result);
314}
315
316static int sm_checker_copy_root(struct dm_space_map *sm, void *copy_to_here_le, size_t len)
317{
318 struct sm_checker *smc = container_of(sm, struct sm_checker, sm);
319 return dm_sm_copy_root(smc->real_sm, copy_to_here_le, len);
320}
321
322/*----------------------------------------------------------------*/
323
324static struct dm_space_map ops_ = {
325 .destroy = sm_checker_destroy,
326 .get_nr_blocks = sm_checker_get_nr_blocks,
327 .get_nr_free = sm_checker_get_nr_free,
328 .inc_block = sm_checker_inc_block,
329 .dec_block = sm_checker_dec_block,
330 .new_block = sm_checker_new_block,
331 .get_count = sm_checker_get_count,
332 .count_is_more_than_one = sm_checker_count_more_than_one,
333 .set_count = sm_checker_set_count,
334 .commit = sm_checker_commit,
335 .extend = sm_checker_extend,
336 .root_size = sm_checker_root_size,
337 .copy_root = sm_checker_copy_root
338};
339
340struct dm_space_map *dm_sm_checker_create(struct dm_space_map *sm)
341{
342 int r;
343 struct sm_checker *smc;
344
345 if (!sm)
346 return NULL;
347
348 smc = kmalloc(sizeof(*smc), GFP_KERNEL);
349 if (!smc)
350 return NULL;
351
352 memcpy(&smc->sm, &ops_, sizeof(smc->sm));
353 r = ca_create(&smc->old_counts, sm);
354 if (r) {
355 kfree(smc);
356 return NULL;
357 }
358
359 r = ca_create(&smc->counts, sm);
360 if (r) {
361 ca_destroy(&smc->old_counts);
362 kfree(smc);
363 return NULL;
364 }
365
366 smc->real_sm = sm;
367
368 r = ca_load(&smc->counts, sm);
369 if (r) {
370 ca_destroy(&smc->counts);
371 ca_destroy(&smc->old_counts);
372 kfree(smc);
373 return NULL;
374 }
375
376 r = ca_commit(&smc->old_counts, &smc->counts);
377 if (r) {
378 ca_destroy(&smc->counts);
379 ca_destroy(&smc->old_counts);
380 kfree(smc);
381 return NULL;
382 }
383
384 return &smc->sm;
385}
386EXPORT_SYMBOL_GPL(dm_sm_checker_create);
387
388struct dm_space_map *dm_sm_checker_create_fresh(struct dm_space_map *sm)
389{
390 int r;
391 struct sm_checker *smc;
392
393 if (!sm)
394 return NULL;
395
396 smc = kmalloc(sizeof(*smc), GFP_KERNEL);
397 if (!smc)
398 return NULL;
399
400 memcpy(&smc->sm, &ops_, sizeof(smc->sm));
401 r = ca_create(&smc->old_counts, sm);
402 if (r) {
403 kfree(smc);
404 return NULL;
405 }
406
407 r = ca_create(&smc->counts, sm);
408 if (r) {
409 ca_destroy(&smc->old_counts);
410 kfree(smc);
411 return NULL;
412 }
413
414 smc->real_sm = sm;
415 return &smc->sm;
416}
417EXPORT_SYMBOL_GPL(dm_sm_checker_create_fresh);
418
419/*----------------------------------------------------------------*/
420
421#else
422
423struct dm_space_map *dm_sm_checker_create(struct dm_space_map *sm)
424{
425 return sm;
426}
427EXPORT_SYMBOL_GPL(dm_sm_checker_create);
428
429struct dm_space_map *dm_sm_checker_create_fresh(struct dm_space_map *sm)
430{
431 return sm;
432}
433EXPORT_SYMBOL_GPL(dm_sm_checker_create_fresh);
434
435/*----------------------------------------------------------------*/
436
437#endif
diff --git a/drivers/md/persistent-data/dm-space-map-checker.h b/drivers/md/persistent-data/dm-space-map-checker.h
new file mode 100644
index 000000000000..444dccf6688c
--- /dev/null
+++ b/drivers/md/persistent-data/dm-space-map-checker.h
@@ -0,0 +1,26 @@
1/*
2 * Copyright (C) 2011 Red Hat, Inc.
3 *
4 * This file is released under the GPL.
5 */
6
7#ifndef SNAPSHOTS_SPACE_MAP_CHECKER_H
8#define SNAPSHOTS_SPACE_MAP_CHECKER_H
9
10#include "dm-space-map.h"
11
12/*----------------------------------------------------------------*/
13
14/*
15 * This space map wraps a real on-disk space map, and verifies all of its
16 * operations. It uses a lot of memory, so only use if you have a specific
17 * problem that you're debugging.
18 *
19 * Ownership of @sm passes.
20 */
21struct dm_space_map *dm_sm_checker_create(struct dm_space_map *sm);
22struct dm_space_map *dm_sm_checker_create_fresh(struct dm_space_map *sm);
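A sketch of the intended use: wrap the real space map as soon as it has been created or opened, and use the wrapper everywhere else. real_sm below is a placeholder for a map returned by one of the sm-disk or sm-metadata constructors. With CONFIG_DM_DEBUG_SPACE_MAPS unset the call is a no-op and simply returns the map it was given:

        struct dm_space_map *sm = dm_sm_checker_create(real_sm);

        if (!sm)
                return -ENOMEM;         /* real_sm is still owned by the caller */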
23
24/*----------------------------------------------------------------*/
25
26#endif
diff --git a/drivers/md/persistent-data/dm-space-map-common.c b/drivers/md/persistent-data/dm-space-map-common.c
new file mode 100644
index 000000000000..df2494c06cdc
--- /dev/null
+++ b/drivers/md/persistent-data/dm-space-map-common.c
@@ -0,0 +1,705 @@
1/*
2 * Copyright (C) 2011 Red Hat, Inc.
3 *
4 * This file is released under the GPL.
5 */
6
7#include "dm-space-map-common.h"
8#include "dm-transaction-manager.h"
9
10#include <linux/bitops.h>
11#include <linux/device-mapper.h>
12
13#define DM_MSG_PREFIX "space map common"
14
15/*----------------------------------------------------------------*/
16
17/*
18 * Index validator.
19 */
20#define INDEX_CSUM_XOR 160478
21
22static void index_prepare_for_write(struct dm_block_validator *v,
23 struct dm_block *b,
24 size_t block_size)
25{
26 struct disk_metadata_index *mi_le = dm_block_data(b);
27
28 mi_le->blocknr = cpu_to_le64(dm_block_location(b));
29 mi_le->csum = cpu_to_le32(dm_bm_checksum(&mi_le->padding,
30 block_size - sizeof(__le32),
31 INDEX_CSUM_XOR));
32}
33
34static int index_check(struct dm_block_validator *v,
35 struct dm_block *b,
36 size_t block_size)
37{
38 struct disk_metadata_index *mi_le = dm_block_data(b);
39 __le32 csum_disk;
40
41 if (dm_block_location(b) != le64_to_cpu(mi_le->blocknr)) {
42 DMERR("index_check failed blocknr %llu wanted %llu",
43 le64_to_cpu(mi_le->blocknr), dm_block_location(b));
44 return -ENOTBLK;
45 }
46
47 csum_disk = cpu_to_le32(dm_bm_checksum(&mi_le->padding,
48 block_size - sizeof(__le32),
49 INDEX_CSUM_XOR));
50 if (csum_disk != mi_le->csum) {
51 DMERR("index_check failed csum %u wanted %u",
52 le32_to_cpu(csum_disk), le32_to_cpu(mi_le->csum));
53 return -EILSEQ;
54 }
55
56 return 0;
57}
58
59static struct dm_block_validator index_validator = {
60 .name = "index",
61 .prepare_for_write = index_prepare_for_write,
62 .check = index_check
63};
64
65/*----------------------------------------------------------------*/
66
67/*
68 * Bitmap validator
69 */
70#define BITMAP_CSUM_XOR 240779
71
72static void bitmap_prepare_for_write(struct dm_block_validator *v,
73 struct dm_block *b,
74 size_t block_size)
75{
76 struct disk_bitmap_header *disk_header = dm_block_data(b);
77
78 disk_header->blocknr = cpu_to_le64(dm_block_location(b));
79 disk_header->csum = cpu_to_le32(dm_bm_checksum(&disk_header->not_used,
80 block_size - sizeof(__le32),
81 BITMAP_CSUM_XOR));
82}
83
84static int bitmap_check(struct dm_block_validator *v,
85 struct dm_block *b,
86 size_t block_size)
87{
88 struct disk_bitmap_header *disk_header = dm_block_data(b);
89 __le32 csum_disk;
90
91 if (dm_block_location(b) != le64_to_cpu(disk_header->blocknr)) {
92 DMERR("bitmap check failed blocknr %llu wanted %llu",
93 le64_to_cpu(disk_header->blocknr), dm_block_location(b));
94 return -ENOTBLK;
95 }
96
97 csum_disk = cpu_to_le32(dm_bm_checksum(&disk_header->not_used,
98 block_size - sizeof(__le32),
99 BITMAP_CSUM_XOR));
100 if (csum_disk != disk_header->csum) {
101 DMERR("bitmap check failed csum %u wanted %u",
102 le32_to_cpu(csum_disk), le32_to_cpu(disk_header->csum));
103 return -EILSEQ;
104 }
105
106 return 0;
107}
108
109static struct dm_block_validator dm_sm_bitmap_validator = {
110 .name = "sm_bitmap",
111 .prepare_for_write = bitmap_prepare_for_write,
112 .check = bitmap_check
113};
114
115/*----------------------------------------------------------------*/
116
117#define ENTRIES_PER_WORD 32
118#define ENTRIES_SHIFT 5
119
120static void *dm_bitmap_data(struct dm_block *b)
121{
122 return dm_block_data(b) + sizeof(struct disk_bitmap_header);
123}
124
125#define WORD_MASK_HIGH 0xAAAAAAAAAAAAAAAAULL
126
127static unsigned bitmap_word_used(void *addr, unsigned b)
128{
129 __le64 *words_le = addr;
130 __le64 *w_le = words_le + (b >> ENTRIES_SHIFT);
131
132 uint64_t bits = le64_to_cpu(*w_le);
133 uint64_t mask = (bits + WORD_MASK_HIGH + 1) & WORD_MASK_HIGH;
134
135 return !(~bits & mask);
136}
137
138static unsigned sm_lookup_bitmap(void *addr, unsigned b)
139{
140 __le64 *words_le = addr;
141 __le64 *w_le = words_le + (b >> ENTRIES_SHIFT);
142 unsigned hi, lo;
143
144 b = (b & (ENTRIES_PER_WORD - 1)) << 1;
145 hi = !!test_bit_le(b, (void *) w_le);
146 lo = !!test_bit_le(b + 1, (void *) w_le);
147 return (hi << 1) | lo;
148}
149
150static void sm_set_bitmap(void *addr, unsigned b, unsigned val)
151{
152 __le64 *words_le = addr;
153 __le64 *w_le = words_le + (b >> ENTRIES_SHIFT);
154
155 b = (b & (ENTRIES_PER_WORD - 1)) << 1;
156
157 if (val & 2)
158 __set_bit_le(b, (void *) w_le);
159 else
160 __clear_bit_le(b, (void *) w_le);
161
162 if (val & 1)
163 __set_bit_le(b + 1, (void *) w_le);
164 else
165 __clear_bit_le(b + 1, (void *) w_le);
166}
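So each little-endian 64 bit word packs 32 two-bit entries. As a worked example, entry 70 lives in word 70 >> ENTRIES_SHIFT = 2, at bit pair (70 & 31) << 1 = 12, i.e. test_bit_le() positions 12 (high bit) and 13 (low bit) of that word.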
167
168static int sm_find_free(void *addr, unsigned begin, unsigned end,
169 unsigned *result)
170{
171 while (begin < end) {
172 if (!(begin & (ENTRIES_PER_WORD - 1)) &&
173 bitmap_word_used(addr, begin)) {
174 begin += ENTRIES_PER_WORD;
175 continue;
176 }
177
178 if (!sm_lookup_bitmap(addr, begin)) {
179 *result = begin;
180 return 0;
181 }
182
183 begin++;
184 }
185
186 return -ENOSPC;
187}
188
189/*----------------------------------------------------------------*/
190
191static int sm_ll_init(struct ll_disk *ll, struct dm_transaction_manager *tm)
192{
193 ll->tm = tm;
194
195 ll->bitmap_info.tm = tm;
196 ll->bitmap_info.levels = 1;
197
198 /*
199 * Because the new bitmap blocks are created via a shadow
200 * operation, the old entry has already had its reference count
201 * decremented and we don't need the btree to do any bookkeeping.
202 */
203 ll->bitmap_info.value_type.size = sizeof(struct disk_index_entry);
204 ll->bitmap_info.value_type.inc = NULL;
205 ll->bitmap_info.value_type.dec = NULL;
206 ll->bitmap_info.value_type.equal = NULL;
207
208 ll->ref_count_info.tm = tm;
209 ll->ref_count_info.levels = 1;
210 ll->ref_count_info.value_type.size = sizeof(uint32_t);
211 ll->ref_count_info.value_type.inc = NULL;
212 ll->ref_count_info.value_type.dec = NULL;
213 ll->ref_count_info.value_type.equal = NULL;
214
215 ll->block_size = dm_bm_block_size(dm_tm_get_bm(tm));
216
217 if (ll->block_size > (1 << 30)) {
218 DMERR("block size too big to hold bitmaps");
219 return -EINVAL;
220 }
221
222 ll->entries_per_block = (ll->block_size - sizeof(struct disk_bitmap_header)) *
223 ENTRIES_PER_BYTE;
224 ll->nr_blocks = 0;
225 ll->bitmap_root = 0;
226 ll->ref_count_root = 0;
227
228 return 0;
229}
230
231int sm_ll_extend(struct ll_disk *ll, dm_block_t extra_blocks)
232{
233 int r;
234 dm_block_t i, nr_blocks, nr_indexes;
235 unsigned old_blocks, blocks;
236
237 nr_blocks = ll->nr_blocks + extra_blocks;
238 old_blocks = dm_sector_div_up(ll->nr_blocks, ll->entries_per_block);
239 blocks = dm_sector_div_up(nr_blocks, ll->entries_per_block);
240
241 nr_indexes = dm_sector_div_up(nr_blocks, ll->entries_per_block);
242 if (nr_indexes > ll->max_entries(ll)) {
243 DMERR("space map too large");
244 return -EINVAL;
245 }
246
247 for (i = old_blocks; i < blocks; i++) {
248 struct dm_block *b;
249 struct disk_index_entry idx;
250
251 r = dm_tm_new_block(ll->tm, &dm_sm_bitmap_validator, &b);
252 if (r < 0)
253 return r;
254 idx.blocknr = cpu_to_le64(dm_block_location(b));
255
256 r = dm_tm_unlock(ll->tm, b);
257 if (r < 0)
258 return r;
259
260 idx.nr_free = cpu_to_le32(ll->entries_per_block);
261 idx.none_free_before = 0;
262
263 r = ll->save_ie(ll, i, &idx);
264 if (r < 0)
265 return r;
266 }
267
268 ll->nr_blocks = nr_blocks;
269 return 0;
270}
271
272int sm_ll_lookup_bitmap(struct ll_disk *ll, dm_block_t b, uint32_t *result)
273{
274 int r;
275 dm_block_t index = b;
276 struct disk_index_entry ie_disk;
277 struct dm_block *blk;
278
279 b = do_div(index, ll->entries_per_block);
280 r = ll->load_ie(ll, index, &ie_disk);
281 if (r < 0)
282 return r;
283
284 r = dm_tm_read_lock(ll->tm, le64_to_cpu(ie_disk.blocknr),
285 &dm_sm_bitmap_validator, &blk);
286 if (r < 0)
287 return r;
288
289 *result = sm_lookup_bitmap(dm_bitmap_data(blk), b);
290
291 return dm_tm_unlock(ll->tm, blk);
292}
293
294int sm_ll_lookup(struct ll_disk *ll, dm_block_t b, uint32_t *result)
295{
296 __le32 le_rc;
297 int r = sm_ll_lookup_bitmap(ll, b, result);
298
299 if (r)
300 return r;
301
302 if (*result != 3)
303 return r;
304
305 r = dm_btree_lookup(&ll->ref_count_info, ll->ref_count_root, &b, &le_rc);
306 if (r < 0)
307 return r;
308
309 *result = le32_to_cpu(le_rc);
310
311 return r;
312}
313
314int sm_ll_find_free_block(struct ll_disk *ll, dm_block_t begin,
315 dm_block_t end, dm_block_t *result)
316{
317 int r;
318 struct disk_index_entry ie_disk;
319 dm_block_t i, index_begin = begin;
320 dm_block_t index_end = dm_sector_div_up(end, ll->entries_per_block);
321
322 /*
323 * FIXME: Use shifts
324 */
325 begin = do_div(index_begin, ll->entries_per_block);
326 end = do_div(end, ll->entries_per_block);
327
328 for (i = index_begin; i < index_end; i++, begin = 0) {
329 struct dm_block *blk;
330 unsigned position;
331 uint32_t bit_end;
332
333 r = ll->load_ie(ll, i, &ie_disk);
334 if (r < 0)
335 return r;
336
337 if (le32_to_cpu(ie_disk.nr_free) == 0)
338 continue;
339
340 r = dm_tm_read_lock(ll->tm, le64_to_cpu(ie_disk.blocknr),
341 &dm_sm_bitmap_validator, &blk);
342 if (r < 0)
343 return r;
344
345 bit_end = (i == index_end - 1) ? end : ll->entries_per_block;
346
347 r = sm_find_free(dm_bitmap_data(blk),
348 max_t(unsigned, begin, le32_to_cpu(ie_disk.none_free_before)),
349 bit_end, &position);
350 if (r == -ENOSPC) {
351 /*
352 * This might happen because we started searching
353 * part way through the bitmap.
354 */
355 dm_tm_unlock(ll->tm, blk);
356 continue;
357
358 } else if (r < 0) {
359 dm_tm_unlock(ll->tm, blk);
360 return r;
361 }
362
363 r = dm_tm_unlock(ll->tm, blk);
364 if (r < 0)
365 return r;
366
367 *result = i * ll->entries_per_block + (dm_block_t) position;
368 return 0;
369 }
370
371 return -ENOSPC;
372}
373
374int sm_ll_insert(struct ll_disk *ll, dm_block_t b,
375 uint32_t ref_count, enum allocation_event *ev)
376{
377 int r;
378 uint32_t bit, old;
379 struct dm_block *nb;
380 dm_block_t index = b;
381 struct disk_index_entry ie_disk;
382 void *bm_le;
383 int inc;
384
385 bit = do_div(index, ll->entries_per_block);
386 r = ll->load_ie(ll, index, &ie_disk);
387 if (r < 0)
388 return r;
389
390 r = dm_tm_shadow_block(ll->tm, le64_to_cpu(ie_disk.blocknr),
391 &dm_sm_bitmap_validator, &nb, &inc);
392 if (r < 0) {
393 DMERR("dm_tm_shadow_block() failed");
394 return r;
395 }
396 ie_disk.blocknr = cpu_to_le64(dm_block_location(nb));
397
398 bm_le = dm_bitmap_data(nb);
399 old = sm_lookup_bitmap(bm_le, bit);
400
401 if (ref_count <= 2) {
402 sm_set_bitmap(bm_le, bit, ref_count);
403
404 r = dm_tm_unlock(ll->tm, nb);
405 if (r < 0)
406 return r;
407
408#if 0
409 /* FIXME: dm_btree_remove doesn't handle this yet */
410 if (old > 2) {
411 r = dm_btree_remove(&ll->ref_count_info,
412 ll->ref_count_root,
413 &b, &ll->ref_count_root);
414 if (r)
415 return r;
416 }
417#endif
418
419 } else {
420 __le32 le_rc = cpu_to_le32(ref_count);
421
422 sm_set_bitmap(bm_le, bit, 3);
423 r = dm_tm_unlock(ll->tm, nb);
424 if (r < 0)
425 return r;
426
427 __dm_bless_for_disk(&le_rc);
428 r = dm_btree_insert(&ll->ref_count_info, ll->ref_count_root,
429 &b, &le_rc, &ll->ref_count_root);
430 if (r < 0) {
431 DMERR("ref count insert failed");
432 return r;
433 }
434 }
435
436 if (ref_count && !old) {
437 *ev = SM_ALLOC;
438 ll->nr_allocated++;
439 ie_disk.nr_free = cpu_to_le32(le32_to_cpu(ie_disk.nr_free) - 1);
440 if (le32_to_cpu(ie_disk.none_free_before) == bit)
441 ie_disk.none_free_before = cpu_to_le32(bit + 1);
442
443 } else if (old && !ref_count) {
444 *ev = SM_FREE;
445 ll->nr_allocated--;
446 ie_disk.nr_free = cpu_to_le32(le32_to_cpu(ie_disk.nr_free) + 1);
447 ie_disk.none_free_before = cpu_to_le32(min(le32_to_cpu(ie_disk.none_free_before), bit));
448 }
449
450 return ll->save_ie(ll, index, &ie_disk);
451}
452
453int sm_ll_inc(struct ll_disk *ll, dm_block_t b, enum allocation_event *ev)
454{
455 int r;
456 uint32_t rc;
457
458 r = sm_ll_lookup(ll, b, &rc);
459 if (r)
460 return r;
461
462 return sm_ll_insert(ll, b, rc + 1, ev);
463}
464
465int sm_ll_dec(struct ll_disk *ll, dm_block_t b, enum allocation_event *ev)
466{
467 int r;
468 uint32_t rc;
469
470 r = sm_ll_lookup(ll, b, &rc);
471 if (r)
472 return r;
473
474 if (!rc)
475 return -EINVAL;
476
477 return sm_ll_insert(ll, b, rc - 1, ev);
478}
479
480int sm_ll_commit(struct ll_disk *ll)
481{
482 return ll->commit(ll);
483}
484
485/*----------------------------------------------------------------*/
486
487static int metadata_ll_load_ie(struct ll_disk *ll, dm_block_t index,
488 struct disk_index_entry *ie)
489{
490 memcpy(ie, ll->mi_le.index + index, sizeof(*ie));
491 return 0;
492}
493
494static int metadata_ll_save_ie(struct ll_disk *ll, dm_block_t index,
495 struct disk_index_entry *ie)
496{
497 memcpy(ll->mi_le.index + index, ie, sizeof(*ie));
498 return 0;
499}
500
501static int metadata_ll_init_index(struct ll_disk *ll)
502{
503 int r;
504 struct dm_block *b;
505
506 r = dm_tm_new_block(ll->tm, &index_validator, &b);
507 if (r < 0)
508 return r;
509
510 memcpy(dm_block_data(b), &ll->mi_le, sizeof(ll->mi_le));
511 ll->bitmap_root = dm_block_location(b);
512
513 return dm_tm_unlock(ll->tm, b);
514}
515
516static int metadata_ll_open(struct ll_disk *ll)
517{
518 int r;
519 struct dm_block *block;
520
521 r = dm_tm_read_lock(ll->tm, ll->bitmap_root,
522 &index_validator, &block);
523 if (r)
524 return r;
525
526 memcpy(&ll->mi_le, dm_block_data(block), sizeof(ll->mi_le));
527 return dm_tm_unlock(ll->tm, block);
528}
529
530static dm_block_t metadata_ll_max_entries(struct ll_disk *ll)
531{
532 return MAX_METADATA_BITMAPS;
533}
534
535static int metadata_ll_commit(struct ll_disk *ll)
536{
537 int r, inc;
538 struct dm_block *b;
539
540 r = dm_tm_shadow_block(ll->tm, ll->bitmap_root, &index_validator, &b, &inc);
541 if (r)
542 return r;
543
544 memcpy(dm_block_data(b), &ll->mi_le, sizeof(ll->mi_le));
545 ll->bitmap_root = dm_block_location(b);
546
547 return dm_tm_unlock(ll->tm, b);
548}
549
550int sm_ll_new_metadata(struct ll_disk *ll, struct dm_transaction_manager *tm)
551{
552 int r;
553
554 r = sm_ll_init(ll, tm);
555 if (r < 0)
556 return r;
557
558 ll->load_ie = metadata_ll_load_ie;
559 ll->save_ie = metadata_ll_save_ie;
560 ll->init_index = metadata_ll_init_index;
561 ll->open_index = metadata_ll_open;
562 ll->max_entries = metadata_ll_max_entries;
563 ll->commit = metadata_ll_commit;
564
565 ll->nr_blocks = 0;
566 ll->nr_allocated = 0;
567
568 r = ll->init_index(ll);
569 if (r < 0)
570 return r;
571
572 r = dm_btree_empty(&ll->ref_count_info, &ll->ref_count_root);
573 if (r < 0)
574 return r;
575
576 return 0;
577}
578
579int sm_ll_open_metadata(struct ll_disk *ll, struct dm_transaction_manager *tm,
580 void *root_le, size_t len)
581{
582 int r;
583 struct disk_sm_root *smr = root_le;
584
585 if (len < sizeof(struct disk_sm_root)) {
586 DMERR("sm_metadata root too small");
587 return -ENOMEM;
588 }
589
590 r = sm_ll_init(ll, tm);
591 if (r < 0)
592 return r;
593
594 ll->load_ie = metadata_ll_load_ie;
595 ll->save_ie = metadata_ll_save_ie;
596 ll->init_index = metadata_ll_init_index;
597 ll->open_index = metadata_ll_open;
598 ll->max_entries = metadata_ll_max_entries;
599 ll->commit = metadata_ll_commit;
600
601 ll->nr_blocks = le64_to_cpu(smr->nr_blocks);
602 ll->nr_allocated = le64_to_cpu(smr->nr_allocated);
603 ll->bitmap_root = le64_to_cpu(smr->bitmap_root);
604 ll->ref_count_root = le64_to_cpu(smr->ref_count_root);
605
606 return ll->open_index(ll);
607}
608
609/*----------------------------------------------------------------*/
610
611static int disk_ll_load_ie(struct ll_disk *ll, dm_block_t index,
612 struct disk_index_entry *ie)
613{
614 return dm_btree_lookup(&ll->bitmap_info, ll->bitmap_root, &index, ie);
615}
616
617static int disk_ll_save_ie(struct ll_disk *ll, dm_block_t index,
618 struct disk_index_entry *ie)
619{
620 __dm_bless_for_disk(ie);
621 return dm_btree_insert(&ll->bitmap_info, ll->bitmap_root,
622 &index, ie, &ll->bitmap_root);
623}
624
625static int disk_ll_init_index(struct ll_disk *ll)
626{
627 return dm_btree_empty(&ll->bitmap_info, &ll->bitmap_root);
628}
629
630static int disk_ll_open(struct ll_disk *ll)
631{
632 /* nothing to do */
633 return 0;
634}
635
636static dm_block_t disk_ll_max_entries(struct ll_disk *ll)
637{
638 return -1ULL;
639}
640
641static int disk_ll_commit(struct ll_disk *ll)
642{
643 return 0;
644}
645
646int sm_ll_new_disk(struct ll_disk *ll, struct dm_transaction_manager *tm)
647{
648 int r;
649
650 r = sm_ll_init(ll, tm);
651 if (r < 0)
652 return r;
653
654 ll->load_ie = disk_ll_load_ie;
655 ll->save_ie = disk_ll_save_ie;
656 ll->init_index = disk_ll_init_index;
657 ll->open_index = disk_ll_open;
658 ll->max_entries = disk_ll_max_entries;
659 ll->commit = disk_ll_commit;
660
661 ll->nr_blocks = 0;
662 ll->nr_allocated = 0;
663
664 r = ll->init_index(ll);
665 if (r < 0)
666 return r;
667
668 r = dm_btree_empty(&ll->ref_count_info, &ll->ref_count_root);
669 if (r < 0)
670 return r;
671
672 return 0;
673}
674
675int sm_ll_open_disk(struct ll_disk *ll, struct dm_transaction_manager *tm,
676 void *root_le, size_t len)
677{
678 int r;
679 struct disk_sm_root *smr = root_le;
680
681 if (len < sizeof(struct disk_sm_root)) {
682 DMERR("sm_disk root too small");
683 return -ENOMEM;
684 }
685
686 r = sm_ll_init(ll, tm);
687 if (r < 0)
688 return r;
689
690 ll->load_ie = disk_ll_load_ie;
691 ll->save_ie = disk_ll_save_ie;
692 ll->init_index = disk_ll_init_index;
693 ll->open_index = disk_ll_open;
694 ll->max_entries = disk_ll_max_entries;
695 ll->commit = disk_ll_commit;
696
697 ll->nr_blocks = le64_to_cpu(smr->nr_blocks);
698 ll->nr_allocated = le64_to_cpu(smr->nr_allocated);
699 ll->bitmap_root = le64_to_cpu(smr->bitmap_root);
700 ll->ref_count_root = le64_to_cpu(smr->ref_count_root);
701
702 return ll->open_index(ll);
703}
704
705/*----------------------------------------------------------------*/
diff --git a/drivers/md/persistent-data/dm-space-map-common.h b/drivers/md/persistent-data/dm-space-map-common.h
new file mode 100644
index 000000000000..8f220821a9a9
--- /dev/null
+++ b/drivers/md/persistent-data/dm-space-map-common.h
@@ -0,0 +1,126 @@
1/*
2 * Copyright (C) 2011 Red Hat, Inc.
3 *
4 * This file is released under the GPL.
5 */
6
7#ifndef DM_SPACE_MAP_COMMON_H
8#define DM_SPACE_MAP_COMMON_H
9
10#include "dm-btree.h"
11
12/*----------------------------------------------------------------*/
13
14/*
15 * Low level disk format
16 *
17 * Bitmap btree
18 * ------------
19 *
20 * Each value stored in the btree is an index_entry. This points to a
21 * block that is used as a bitmap. The bitmap holds 2 bits per
22 * entry, which represent UNUSED = 0, REF_COUNT = 1, REF_COUNT = 2 and
23 * REF_COUNT = many.
24 *
25 * Refcount btree
26 * --------------
27 *
28 * Any entry that has a ref count higher than 2 gets entered in the ref
29 * count tree. The leaf values for this tree are the 32-bit ref counts.
30 */
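Put differently, a 2 bit bitmap value of 0, 1 or 2 is the literal reference count, and 3 means "the real count lives in the ref count btree". sm_ll_lookup() in dm-space-map-common.c follows exactly that rule; condensed:

        r = sm_ll_lookup_bitmap(ll, b, &count);         /* 0, 1, 2 or 3 */
        if (!r && count == 3)
                r = dm_btree_lookup(&ll->ref_count_info, ll->ref_count_root,
                                    &b, &le_rc);        /* full 32 bit count */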
31
32struct disk_index_entry {
33 __le64 blocknr;
34 __le32 nr_free;
35 __le32 none_free_before;
36} __packed;
37
38
39#define MAX_METADATA_BITMAPS 255
40struct disk_metadata_index {
41 __le32 csum;
42 __le32 padding;
43 __le64 blocknr;
44
45 struct disk_index_entry index[MAX_METADATA_BITMAPS];
46} __packed;
47
48struct ll_disk;
49
50typedef int (*load_ie_fn)(struct ll_disk *ll, dm_block_t index, struct disk_index_entry *result);
51typedef int (*save_ie_fn)(struct ll_disk *ll, dm_block_t index, struct disk_index_entry *ie);
52typedef int (*init_index_fn)(struct ll_disk *ll);
53typedef int (*open_index_fn)(struct ll_disk *ll);
54typedef dm_block_t (*max_index_entries_fn)(struct ll_disk *ll);
55typedef int (*commit_fn)(struct ll_disk *ll);
56
57struct ll_disk {
58 struct dm_transaction_manager *tm;
59 struct dm_btree_info bitmap_info;
60 struct dm_btree_info ref_count_info;
61
62 uint32_t block_size;
63 uint32_t entries_per_block;
64 dm_block_t nr_blocks;
65 dm_block_t nr_allocated;
66
67 /*
68 * bitmap_root may be a btree root or a simple index.
69 */
70 dm_block_t bitmap_root;
71
72 dm_block_t ref_count_root;
73
74 struct disk_metadata_index mi_le;
75 load_ie_fn load_ie;
76 save_ie_fn save_ie;
77 init_index_fn init_index;
78 open_index_fn open_index;
79 max_index_entries_fn max_entries;
80 commit_fn commit;
81};
82
83struct disk_sm_root {
84 __le64 nr_blocks;
85 __le64 nr_allocated;
86 __le64 bitmap_root;
87 __le64 ref_count_root;
88} __packed;
89
90#define ENTRIES_PER_BYTE 4
91
92struct disk_bitmap_header {
93 __le32 csum;
94 __le32 not_used;
95 __le64 blocknr;
96} __packed;
97
98enum allocation_event {
99 SM_NONE,
100 SM_ALLOC,
101 SM_FREE,
102};
103
104/*----------------------------------------------------------------*/
105
106int sm_ll_extend(struct ll_disk *ll, dm_block_t extra_blocks);
107int sm_ll_lookup_bitmap(struct ll_disk *ll, dm_block_t b, uint32_t *result);
108int sm_ll_lookup(struct ll_disk *ll, dm_block_t b, uint32_t *result);
109int sm_ll_find_free_block(struct ll_disk *ll, dm_block_t begin,
110 dm_block_t end, dm_block_t *result);
111int sm_ll_insert(struct ll_disk *ll, dm_block_t b, uint32_t ref_count, enum allocation_event *ev);
112int sm_ll_inc(struct ll_disk *ll, dm_block_t b, enum allocation_event *ev);
113int sm_ll_dec(struct ll_disk *ll, dm_block_t b, enum allocation_event *ev);
114int sm_ll_commit(struct ll_disk *ll);
115
116int sm_ll_new_metadata(struct ll_disk *ll, struct dm_transaction_manager *tm);
117int sm_ll_open_metadata(struct ll_disk *ll, struct dm_transaction_manager *tm,
118 void *root_le, size_t len);
119
120int sm_ll_new_disk(struct ll_disk *ll, struct dm_transaction_manager *tm);
121int sm_ll_open_disk(struct ll_disk *ll, struct dm_transaction_manager *tm,
122 void *root_le, size_t len);
123
124/*----------------------------------------------------------------*/
125
126#endif /* DM_SPACE_MAP_COMMON_H */
diff --git a/drivers/md/persistent-data/dm-space-map-disk.c b/drivers/md/persistent-data/dm-space-map-disk.c
new file mode 100644
index 000000000000..aeff7852cf79
--- /dev/null
+++ b/drivers/md/persistent-data/dm-space-map-disk.c
@@ -0,0 +1,335 @@
1/*
2 * Copyright (C) 2011 Red Hat, Inc.
3 *
4 * This file is released under the GPL.
5 */
6
7#include "dm-space-map-checker.h"
8#include "dm-space-map-common.h"
9#include "dm-space-map-disk.h"
10#include "dm-space-map.h"
11#include "dm-transaction-manager.h"
12
13#include <linux/list.h>
14#include <linux/slab.h>
15#include <linux/module.h>
16#include <linux/device-mapper.h>
17
18#define DM_MSG_PREFIX "space map disk"
19
20/*----------------------------------------------------------------*/
21
22/*
23 * Space map interface.
24 */
25struct sm_disk {
26 struct dm_space_map sm;
27
28 struct ll_disk ll;
29 struct ll_disk old_ll;
30
31 dm_block_t begin;
32 dm_block_t nr_allocated_this_transaction;
33};
34
35static void sm_disk_destroy(struct dm_space_map *sm)
36{
37 struct sm_disk *smd = container_of(sm, struct sm_disk, sm);
38
39 kfree(smd);
40}
41
42static int sm_disk_extend(struct dm_space_map *sm, dm_block_t extra_blocks)
43{
44 struct sm_disk *smd = container_of(sm, struct sm_disk, sm);
45
46 return sm_ll_extend(&smd->ll, extra_blocks);
47}
48
49static int sm_disk_get_nr_blocks(struct dm_space_map *sm, dm_block_t *count)
50{
51 struct sm_disk *smd = container_of(sm, struct sm_disk, sm);
52 *count = smd->old_ll.nr_blocks;
53
54 return 0;
55}
56
57static int sm_disk_get_nr_free(struct dm_space_map *sm, dm_block_t *count)
58{
59 struct sm_disk *smd = container_of(sm, struct sm_disk, sm);
60 *count = (smd->old_ll.nr_blocks - smd->old_ll.nr_allocated) - smd->nr_allocated_this_transaction;
61
62 return 0;
63}
64
65static int sm_disk_get_count(struct dm_space_map *sm, dm_block_t b,
66 uint32_t *result)
67{
68 struct sm_disk *smd = container_of(sm, struct sm_disk, sm);
69 return sm_ll_lookup(&smd->ll, b, result);
70}
71
72static int sm_disk_count_is_more_than_one(struct dm_space_map *sm, dm_block_t b,
73 int *result)
74{
75 int r;
76 uint32_t count;
77
78 r = sm_disk_get_count(sm, b, &count);
79 if (r)
80 return r;
81
82 return count > 1;
83}
84
85static int sm_disk_set_count(struct dm_space_map *sm, dm_block_t b,
86 uint32_t count)
87{
88 int r;
89 uint32_t old_count;
90 enum allocation_event ev;
91 struct sm_disk *smd = container_of(sm, struct sm_disk, sm);
92
93 r = sm_ll_insert(&smd->ll, b, count, &ev);
94 if (!r) {
95 switch (ev) {
96 case SM_NONE:
97 break;
98
99 case SM_ALLOC:
100 /*
101 * This _must_ be free in the prior transaction
102 * otherwise we've lost atomicity.
103 */
104 smd->nr_allocated_this_transaction++;
105 break;
106
107 case SM_FREE:
108 /*
109 * It's only free if it's also free in the last
110 * transaction.
111 */
112 r = sm_ll_lookup(&smd->old_ll, b, &old_count);
113 if (r)
114 return r;
115
116 if (!old_count)
117 smd->nr_allocated_this_transaction--;
118 break;
119 }
120 }
121
122 return r;
123}
124
125static int sm_disk_inc_block(struct dm_space_map *sm, dm_block_t b)
126{
127 int r;
128 enum allocation_event ev;
129 struct sm_disk *smd = container_of(sm, struct sm_disk, sm);
130
131 r = sm_ll_inc(&smd->ll, b, &ev);
132 if (!r && (ev == SM_ALLOC))
133 /*
134 * This _must_ be free in the prior transaction
135 * otherwise we've lost atomicity.
136 */
137 smd->nr_allocated_this_transaction++;
138
139 return r;
140}
141
142static int sm_disk_dec_block(struct dm_space_map *sm, dm_block_t b)
143{
144 int r;
145 uint32_t old_count;
146 enum allocation_event ev;
147 struct sm_disk *smd = container_of(sm, struct sm_disk, sm);
148
149 r = sm_ll_dec(&smd->ll, b, &ev);
150 if (!r && (ev == SM_FREE)) {
151 /*
152 * It's only free if it's also free in the last
153 * transaction.
154 */
155 r = sm_ll_lookup(&smd->old_ll, b, &old_count);
156 if (r)
157 return r;
158
159 if (!old_count)
160 smd->nr_allocated_this_transaction--;
161 }
162
163 return r;
164}
165
166static int sm_disk_new_block(struct dm_space_map *sm, dm_block_t *b)
167{
168 int r;
169 enum allocation_event ev;
170 struct sm_disk *smd = container_of(sm, struct sm_disk, sm);
171
172 /* FIXME: we should loop round a couple of times */
173 r = sm_ll_find_free_block(&smd->old_ll, smd->begin, smd->old_ll.nr_blocks, b);
174 if (r)
175 return r;
176
177 smd->begin = *b + 1;
178 r = sm_ll_inc(&smd->ll, *b, &ev);
179 if (!r) {
180 BUG_ON(ev != SM_ALLOC);
181 smd->nr_allocated_this_transaction++;
182 }
183
184 return r;
185}
186
187static int sm_disk_commit(struct dm_space_map *sm)
188{
189 int r;
190 dm_block_t nr_free;
191 struct sm_disk *smd = container_of(sm, struct sm_disk, sm);
192
193 r = sm_disk_get_nr_free(sm, &nr_free);
194 if (r)
195 return r;
196
197 r = sm_ll_commit(&smd->ll);
198 if (r)
199 return r;
200
201 memcpy(&smd->old_ll, &smd->ll, sizeof(smd->old_ll));
202 smd->begin = 0;
203 smd->nr_allocated_this_transaction = 0;
204
205 r = sm_disk_get_nr_free(sm, &nr_free);
206 if (r)
207 return r;
208
209 return 0;
210}
211
212static int sm_disk_root_size(struct dm_space_map *sm, size_t *result)
213{
214 *result = sizeof(struct disk_sm_root);
215
216 return 0;
217}
218
219static int sm_disk_copy_root(struct dm_space_map *sm, void *where_le, size_t max)
220{
221 struct sm_disk *smd = container_of(sm, struct sm_disk, sm);
222 struct disk_sm_root root_le;
223
224 root_le.nr_blocks = cpu_to_le64(smd->ll.nr_blocks);
225 root_le.nr_allocated = cpu_to_le64(smd->ll.nr_allocated);
226 root_le.bitmap_root = cpu_to_le64(smd->ll.bitmap_root);
227 root_le.ref_count_root = cpu_to_le64(smd->ll.ref_count_root);
228
229 if (max < sizeof(root_le))
230 return -ENOSPC;
231
232 memcpy(where_le, &root_le, sizeof(root_le));
233
234 return 0;
235}
236
237/*----------------------------------------------------------------*/
238
239static struct dm_space_map ops = {
240 .destroy = sm_disk_destroy,
241 .extend = sm_disk_extend,
242 .get_nr_blocks = sm_disk_get_nr_blocks,
243 .get_nr_free = sm_disk_get_nr_free,
244 .get_count = sm_disk_get_count,
245 .count_is_more_than_one = sm_disk_count_is_more_than_one,
246 .set_count = sm_disk_set_count,
247 .inc_block = sm_disk_inc_block,
248 .dec_block = sm_disk_dec_block,
249 .new_block = sm_disk_new_block,
250 .commit = sm_disk_commit,
251 .root_size = sm_disk_root_size,
252 .copy_root = sm_disk_copy_root
253};
254
255static struct dm_space_map *dm_sm_disk_create_real(
256 struct dm_transaction_manager *tm,
257 dm_block_t nr_blocks)
258{
259 int r;
260 struct sm_disk *smd;
261
262 smd = kmalloc(sizeof(*smd), GFP_KERNEL);
263 if (!smd)
264 return ERR_PTR(-ENOMEM);
265
266 smd->begin = 0;
267 smd->nr_allocated_this_transaction = 0;
268 memcpy(&smd->sm, &ops, sizeof(smd->sm));
269
270 r = sm_ll_new_disk(&smd->ll, tm);
271 if (r)
272 goto bad;
273
274 r = sm_ll_extend(&smd->ll, nr_blocks);
275 if (r)
276 goto bad;
277
278 r = sm_disk_commit(&smd->sm);
279 if (r)
280 goto bad;
281
282 return &smd->sm;
283
284bad:
285 kfree(smd);
286 return ERR_PTR(r);
287}
288
289struct dm_space_map *dm_sm_disk_create(struct dm_transaction_manager *tm,
290 dm_block_t nr_blocks)
291{
292 struct dm_space_map *sm = dm_sm_disk_create_real(tm, nr_blocks);
293 return dm_sm_checker_create_fresh(sm);
294}
295EXPORT_SYMBOL_GPL(dm_sm_disk_create);
296
297static struct dm_space_map *dm_sm_disk_open_real(
298 struct dm_transaction_manager *tm,
299 void *root_le, size_t len)
300{
301 int r;
302 struct sm_disk *smd;
303
304 smd = kmalloc(sizeof(*smd), GFP_KERNEL);
305 if (!smd)
306 return ERR_PTR(-ENOMEM);
307
308 smd->begin = 0;
309 smd->nr_allocated_this_transaction = 0;
310 memcpy(&smd->sm, &ops, sizeof(smd->sm));
311
312 r = sm_ll_open_disk(&smd->ll, tm, root_le, len);
313 if (r)
314 goto bad;
315
316 r = sm_disk_commit(&smd->sm);
317 if (r)
318 goto bad;
319
320 return &smd->sm;
321
322bad:
323 kfree(smd);
324 return ERR_PTR(r);
325}
326
327struct dm_space_map *dm_sm_disk_open(struct dm_transaction_manager *tm,
328 void *root_le, size_t len)
329{
330 return dm_sm_checker_create(
331 dm_sm_disk_open_real(tm, root_le, len));
332}
333EXPORT_SYMBOL_GPL(dm_sm_disk_open);
334
335/*----------------------------------------------------------------*/
diff --git a/drivers/md/persistent-data/dm-space-map-disk.h b/drivers/md/persistent-data/dm-space-map-disk.h
new file mode 100644
index 000000000000..447a0a9a2d9f
--- /dev/null
+++ b/drivers/md/persistent-data/dm-space-map-disk.h
@@ -0,0 +1,25 @@
1/*
2 * Copyright (C) 2011 Red Hat, Inc.
3 *
4 * This file is released under the GPL.
5 */
6
7#ifndef _LINUX_DM_SPACE_MAP_DISK_H
8#define _LINUX_DM_SPACE_MAP_DISK_H
9
10#include "dm-block-manager.h"
11
12struct dm_space_map;
13struct dm_transaction_manager;
14
15/*
16 * Unfortunately we have to use two-phase construction due to the cycle
17 * between the tm and sm.
18 */
19struct dm_space_map *dm_sm_disk_create(struct dm_transaction_manager *tm,
20 dm_block_t nr_blocks);
21
22struct dm_space_map *dm_sm_disk_open(struct dm_transaction_manager *tm,
23 void *root, size_t len);
24
25#endif /* _LINUX_DM_SPACE_MAP_DISK_H */
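A minimal sketch (not from the patch itself) of how a target might bring up a space map for its data device using the interface above; md_tm is assumed to be a metadata transaction manager obtained elsewhere, e.g. from dm_tm_create_with_sm():

#include <linux/err.h>

#include "dm-space-map-disk.h"
#include "dm-space-map.h"

static int example_bring_up_data_sm(struct dm_transaction_manager *md_tm,
				    dm_block_t data_dev_size)
{
	int r;
	dm_block_t b;
	struct dm_space_map *data_sm;

	data_sm = dm_sm_disk_create(md_tm, data_dev_size);
	if (IS_ERR(data_sm))
		return PTR_ERR(data_sm);

	/* Allocate one data block; its reference count becomes 1. */
	r = dm_sm_new_block(data_sm, &b);
	if (r)
		goto out;

	/*
	 * Commit the space map so the allocation is reflected in its
	 * on-disk structures.  It only survives a power failure once the
	 * enclosing transaction's superblock has been committed via
	 * dm_tm_pre_commit()/dm_tm_commit().
	 */
	r = dm_sm_commit(data_sm);

out:
	if (r)
		dm_sm_destroy(data_sm);
	return r;
}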
diff --git a/drivers/md/persistent-data/dm-space-map-metadata.c b/drivers/md/persistent-data/dm-space-map-metadata.c
new file mode 100644
index 000000000000..e89ae5e7a519
--- /dev/null
+++ b/drivers/md/persistent-data/dm-space-map-metadata.c
@@ -0,0 +1,596 @@
1/*
2 * Copyright (C) 2011 Red Hat, Inc.
3 *
4 * This file is released under the GPL.
5 */
6
7#include "dm-space-map.h"
8#include "dm-space-map-common.h"
9#include "dm-space-map-metadata.h"
10
11#include <linux/list.h>
12#include <linux/slab.h>
13#include <linux/device-mapper.h>
14
15#define DM_MSG_PREFIX "space map metadata"
16
17/*----------------------------------------------------------------*/
18
19/*
20 * Space map interface.
21 *
22 * The low level disk format is written using the standard btree and
23 * transaction manager. This means that performing disk operations may
24 * cause us to recurse into the space map in order to allocate new blocks.
25 * For this reason we have a pool of pre-allocated blocks large enough to
26 * service any ll_disk operation.
27 */
28
29/*
30 * FIXME: we should calculate this based on the size of the device.
31 * Only the metadata space map needs this functionality.
32 */
33#define MAX_RECURSIVE_ALLOCATIONS 1024
34
35enum block_op_type {
36 BOP_INC,
37 BOP_DEC
38};
39
40struct block_op {
41 enum block_op_type type;
42 dm_block_t block;
43};
44
45struct sm_metadata {
46 struct dm_space_map sm;
47
48 struct ll_disk ll;
49 struct ll_disk old_ll;
50
51 dm_block_t begin;
52
53 unsigned recursion_count;
54 unsigned allocated_this_transaction;
55 unsigned nr_uncommitted;
56 struct block_op uncommitted[MAX_RECURSIVE_ALLOCATIONS];
57};
58
59static int add_bop(struct sm_metadata *smm, enum block_op_type type, dm_block_t b)
60{
61 struct block_op *op;
62
63 if (smm->nr_uncommitted == MAX_RECURSIVE_ALLOCATIONS) {
64 DMERR("too many recursive allocations");
65 return -ENOMEM;
66 }
67
68 op = smm->uncommitted + smm->nr_uncommitted++;
69 op->type = type;
70 op->block = b;
71
72 return 0;
73}
74
75static int commit_bop(struct sm_metadata *smm, struct block_op *op)
76{
77 int r = 0;
78 enum allocation_event ev;
79
80 switch (op->type) {
81 case BOP_INC:
82 r = sm_ll_inc(&smm->ll, op->block, &ev);
83 break;
84
85 case BOP_DEC:
86 r = sm_ll_dec(&smm->ll, op->block, &ev);
87 break;
88 }
89
90 return r;
91}
92
93static void in(struct sm_metadata *smm)
94{
95 smm->recursion_count++;
96}
97
98static int out(struct sm_metadata *smm)
99{
100 int r = 0;
101
102 /*
103 * If we're not recursing then very bad things are happening.
104 */
105 if (!smm->recursion_count) {
106 DMERR("lost track of recursion depth");
107 return -ENOMEM;
108 }
109
110 if (smm->recursion_count == 1 && smm->nr_uncommitted) {
111 while (smm->nr_uncommitted && !r) {
112 smm->nr_uncommitted--;
113 r = commit_bop(smm, smm->uncommitted +
114 smm->nr_uncommitted);
115 if (r)
116 break;
117 }
118 }
119
120 smm->recursion_count--;
121
122 return r;
123}
124
125/*
126 * When using the out() function above, we often want to combine an error
127 * code for the operation run in the recursive context with that from
128 * out().
129 */
130static int combine_errors(int r1, int r2)
131{
132 return r1 ? r1 : r2;
133}
134
135static int recursing(struct sm_metadata *smm)
136{
137 return smm->recursion_count;
138}
139
140static void sm_metadata_destroy(struct dm_space_map *sm)
141{
142 struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm);
143
144 kfree(smm);
145}
146
147static int sm_metadata_extend(struct dm_space_map *sm, dm_block_t extra_blocks)
148{
149 DMERR("doesn't support extend");
150 return -EINVAL;
151}
152
153static int sm_metadata_get_nr_blocks(struct dm_space_map *sm, dm_block_t *count)
154{
155 struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm);
156
157 *count = smm->ll.nr_blocks;
158
159 return 0;
160}
161
162static int sm_metadata_get_nr_free(struct dm_space_map *sm, dm_block_t *count)
163{
164 struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm);
165
166 *count = smm->old_ll.nr_blocks - smm->old_ll.nr_allocated -
167 smm->allocated_this_transaction;
168
169 return 0;
170}
171
172static int sm_metadata_get_count(struct dm_space_map *sm, dm_block_t b,
173 uint32_t *result)
174{
175 int r, i;
176 struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm);
177 unsigned adjustment = 0;
178
179 /*
180 * We may have some uncommitted adjustments to add. This list
181 * should always be really short.
182 */
183 for (i = 0; i < smm->nr_uncommitted; i++) {
184 struct block_op *op = smm->uncommitted + i;
185
186 if (op->block != b)
187 continue;
188
189 switch (op->type) {
190 case BOP_INC:
191 adjustment++;
192 break;
193
194 case BOP_DEC:
195 adjustment--;
196 break;
197 }
198 }
199
200 r = sm_ll_lookup(&smm->ll, b, result);
201 if (r)
202 return r;
203
204 *result += adjustment;
205
206 return 0;
207}
208
209static int sm_metadata_count_is_more_than_one(struct dm_space_map *sm,
210 dm_block_t b, int *result)
211{
212 int r, i, adjustment = 0;
213 struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm);
214 uint32_t rc;
215
216 /*
217 * We may have some uncommitted adjustments to add. This list
218 * should always be really short.
219 */
220 for (i = 0; i < smm->nr_uncommitted; i++) {
221 struct block_op *op = smm->uncommitted + i;
222
223 if (op->block != b)
224 continue;
225
226 switch (op->type) {
227 case BOP_INC:
228 adjustment++;
229 break;
230
231 case BOP_DEC:
232 adjustment--;
233 break;
234 }
235 }
236
237 if (adjustment > 1) {
238 *result = 1;
239 return 0;
240 }
241
242 r = sm_ll_lookup_bitmap(&smm->ll, b, &rc);
243 if (r)
244 return r;
245
246 if (rc == 3)
247 /*
248 * We err on the side of caution, and always return true.
249 */
250 *result = 1;
251 else
252 *result = rc + adjustment > 1;
253
254 return 0;
255}
256
257static int sm_metadata_set_count(struct dm_space_map *sm, dm_block_t b,
258 uint32_t count)
259{
260 int r, r2;
261 enum allocation_event ev;
262 struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm);
263
264 if (smm->recursion_count) {
265 DMERR("cannot recurse set_count()");
266 return -EINVAL;
267 }
268
269 in(smm);
270 r = sm_ll_insert(&smm->ll, b, count, &ev);
271 r2 = out(smm);
272
273 return combine_errors(r, r2);
274}
275
276static int sm_metadata_inc_block(struct dm_space_map *sm, dm_block_t b)
277{
278 int r, r2 = 0;
279 enum allocation_event ev;
280 struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm);
281
282 if (recursing(smm))
283 r = add_bop(smm, BOP_INC, b);
284 else {
285 in(smm);
286 r = sm_ll_inc(&smm->ll, b, &ev);
287 r2 = out(smm);
288 }
289
290 return combine_errors(r, r2);
291}
292
293static int sm_metadata_dec_block(struct dm_space_map *sm, dm_block_t b)
294{
295 int r, r2 = 0;
296 enum allocation_event ev;
297 struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm);
298
299 if (recursing(smm))
300 r = add_bop(smm, BOP_DEC, b);
301 else {
302 in(smm);
303 r = sm_ll_dec(&smm->ll, b, &ev);
304 r2 = out(smm);
305 }
306
307 return combine_errors(r, r2);
308}
309
310static int sm_metadata_new_block_(struct dm_space_map *sm, dm_block_t *b)
311{
312 int r, r2 = 0;
313 enum allocation_event ev;
314 struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm);
315
316 r = sm_ll_find_free_block(&smm->old_ll, smm->begin, smm->old_ll.nr_blocks, b);
317 if (r)
318 return r;
319
320 smm->begin = *b + 1;
321
322 if (recursing(smm))
323 r = add_bop(smm, BOP_INC, *b);
324 else {
325 in(smm);
326 r = sm_ll_inc(&smm->ll, *b, &ev);
327 r2 = out(smm);
328 }
329
330 if (!r)
331 smm->allocated_this_transaction++;
332
333 return combine_errors(r, r2);
334}
335
336static int sm_metadata_new_block(struct dm_space_map *sm, dm_block_t *b)
337{
338 int r = sm_metadata_new_block_(sm, b);
339 if (r)
340 DMERR("out of metadata space");
341 return r;
342}
343
344static int sm_metadata_commit(struct dm_space_map *sm)
345{
346 int r;
347 struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm);
348
349 r = sm_ll_commit(&smm->ll);
350 if (r)
351 return r;
352
353 memcpy(&smm->old_ll, &smm->ll, sizeof(smm->old_ll));
354 smm->begin = 0;
355 smm->allocated_this_transaction = 0;
356
357 return 0;
358}
359
360static int sm_metadata_root_size(struct dm_space_map *sm, size_t *result)
361{
362 *result = sizeof(struct disk_sm_root);
363
364 return 0;
365}
366
367static int sm_metadata_copy_root(struct dm_space_map *sm, void *where_le, size_t max)
368{
369 struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm);
370 struct disk_sm_root root_le;
371
372 root_le.nr_blocks = cpu_to_le64(smm->ll.nr_blocks);
373 root_le.nr_allocated = cpu_to_le64(smm->ll.nr_allocated);
374 root_le.bitmap_root = cpu_to_le64(smm->ll.bitmap_root);
375 root_le.ref_count_root = cpu_to_le64(smm->ll.ref_count_root);
376
377 if (max < sizeof(root_le))
378 return -ENOSPC;
379
380 memcpy(where_le, &root_le, sizeof(root_le));
381
382 return 0;
383}
384
385static struct dm_space_map ops = {
386 .destroy = sm_metadata_destroy,
387 .extend = sm_metadata_extend,
388 .get_nr_blocks = sm_metadata_get_nr_blocks,
389 .get_nr_free = sm_metadata_get_nr_free,
390 .get_count = sm_metadata_get_count,
391 .count_is_more_than_one = sm_metadata_count_is_more_than_one,
392 .set_count = sm_metadata_set_count,
393 .inc_block = sm_metadata_inc_block,
394 .dec_block = sm_metadata_dec_block,
395 .new_block = sm_metadata_new_block,
396 .commit = sm_metadata_commit,
397 .root_size = sm_metadata_root_size,
398 .copy_root = sm_metadata_copy_root
399};
400
401/*----------------------------------------------------------------*/
402
403/*
404 * When a new space map is created that manages its own space, we use
405 * this tiny bootstrap allocator.
406 */
407static void sm_bootstrap_destroy(struct dm_space_map *sm)
408{
409}
410
411static int sm_bootstrap_extend(struct dm_space_map *sm, dm_block_t extra_blocks)
412{
413	DMERR("bootstrap doesn't support extend");
414
415 return -EINVAL;
416}
417
418static int sm_bootstrap_get_nr_blocks(struct dm_space_map *sm, dm_block_t *count)
419{
420 struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm);
421	*count = smm->ll.nr_blocks;
422	return 0;
423}
424
425static int sm_bootstrap_get_nr_free(struct dm_space_map *sm, dm_block_t *count)
426{
427 struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm);
428
429 *count = smm->ll.nr_blocks - smm->begin;
430
431 return 0;
432}
433
434static int sm_bootstrap_get_count(struct dm_space_map *sm, dm_block_t b,
435 uint32_t *result)
436{
437 struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm);
438	*result = (b < smm->begin) ? 1 : 0;
439	return 0;
440}
441
442static int sm_bootstrap_count_is_more_than_one(struct dm_space_map *sm,
443 dm_block_t b, int *result)
444{
445 *result = 0;
446
447 return 0;
448}
449
450static int sm_bootstrap_set_count(struct dm_space_map *sm, dm_block_t b,
451 uint32_t count)
452{
453	DMERR("bootstrap doesn't support set_count");
454
455 return -EINVAL;
456}
457
458static int sm_bootstrap_new_block(struct dm_space_map *sm, dm_block_t *b)
459{
460 struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm);
461
462 /*
463 * We know the entire device is unused.
464 */
465 if (smm->begin == smm->ll.nr_blocks)
466 return -ENOSPC;
467
468 *b = smm->begin++;
469
470 return 0;
471}
472
473static int sm_bootstrap_inc_block(struct dm_space_map *sm, dm_block_t b)
474{
475 struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm);
476
477 return add_bop(smm, BOP_INC, b);
478}
479
480static int sm_bootstrap_dec_block(struct dm_space_map *sm, dm_block_t b)
481{
482 struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm);
483
484 return add_bop(smm, BOP_DEC, b);
485}
486
487static int sm_bootstrap_commit(struct dm_space_map *sm)
488{
489 return 0;
490}
491
492static int sm_bootstrap_root_size(struct dm_space_map *sm, size_t *result)
493{
494	DMERR("bootstrap doesn't support root_size");
495
496 return -EINVAL;
497}
498
499static int sm_bootstrap_copy_root(struct dm_space_map *sm, void *where,
500 size_t max)
501{
502	DMERR("bootstrap doesn't support copy_root");
503
504 return -EINVAL;
505}
506
507static struct dm_space_map bootstrap_ops = {
508 .destroy = sm_bootstrap_destroy,
509 .extend = sm_bootstrap_extend,
510 .get_nr_blocks = sm_bootstrap_get_nr_blocks,
511 .get_nr_free = sm_bootstrap_get_nr_free,
512 .get_count = sm_bootstrap_get_count,
513 .count_is_more_than_one = sm_bootstrap_count_is_more_than_one,
514 .set_count = sm_bootstrap_set_count,
515 .inc_block = sm_bootstrap_inc_block,
516 .dec_block = sm_bootstrap_dec_block,
517 .new_block = sm_bootstrap_new_block,
518 .commit = sm_bootstrap_commit,
519 .root_size = sm_bootstrap_root_size,
520 .copy_root = sm_bootstrap_copy_root
521};
522
523/*----------------------------------------------------------------*/
524
525struct dm_space_map *dm_sm_metadata_init(void)
526{
527 struct sm_metadata *smm;
528
529 smm = kmalloc(sizeof(*smm), GFP_KERNEL);
530 if (!smm)
531 return ERR_PTR(-ENOMEM);
532
533 memcpy(&smm->sm, &ops, sizeof(smm->sm));
534
535 return &smm->sm;
536}
537
538int dm_sm_metadata_create(struct dm_space_map *sm,
539 struct dm_transaction_manager *tm,
540 dm_block_t nr_blocks,
541 dm_block_t superblock)
542{
543 int r;
544 dm_block_t i;
545 enum allocation_event ev;
546 struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm);
547
548 smm->begin = superblock + 1;
549 smm->recursion_count = 0;
550 smm->allocated_this_transaction = 0;
551 smm->nr_uncommitted = 0;
552
553 memcpy(&smm->sm, &bootstrap_ops, sizeof(smm->sm));
554
555 r = sm_ll_new_metadata(&smm->ll, tm);
556 if (r)
557 return r;
558
559 r = sm_ll_extend(&smm->ll, nr_blocks);
560 if (r)
561 return r;
562
563 memcpy(&smm->sm, &ops, sizeof(smm->sm));
564
565 /*
566 * Now we need to update the newly created data structures with the
567 * allocated blocks that they were built from.
568 */
569 for (i = superblock; !r && i < smm->begin; i++)
570 r = sm_ll_inc(&smm->ll, i, &ev);
571
572 if (r)
573 return r;
574
575 return sm_metadata_commit(sm);
576}
577
578int dm_sm_metadata_open(struct dm_space_map *sm,
579 struct dm_transaction_manager *tm,
580 void *root_le, size_t len)
581{
582 int r;
583 struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm);
584
585 r = sm_ll_open_metadata(&smm->ll, tm, root_le, len);
586 if (r)
587 return r;
588
589 smm->begin = 0;
590 smm->recursion_count = 0;
591 smm->allocated_this_transaction = 0;
592 smm->nr_uncommitted = 0;
593
594 memcpy(&smm->old_ll, &smm->ll, sizeof(smm->old_ll));
595 return 0;
596}
diff --git a/drivers/md/persistent-data/dm-space-map-metadata.h b/drivers/md/persistent-data/dm-space-map-metadata.h
new file mode 100644
index 000000000000..39bba0801cf2
--- /dev/null
+++ b/drivers/md/persistent-data/dm-space-map-metadata.h
@@ -0,0 +1,33 @@
1/*
2 * Copyright (C) 2011 Red Hat, Inc.
3 *
4 * This file is released under the GPL.
5 */
6
7#ifndef DM_SPACE_MAP_METADATA_H
8#define DM_SPACE_MAP_METADATA_H
9
10#include "dm-transaction-manager.h"
11
12/*
13 * Unfortunately we have to use two-phase construction due to the cycle
14 * between the tm and sm.
15 */
16struct dm_space_map *dm_sm_metadata_init(void);
17
18/*
19 * Create a fresh space map.
20 */
21int dm_sm_metadata_create(struct dm_space_map *sm,
22 struct dm_transaction_manager *tm,
23 dm_block_t nr_blocks,
24 dm_block_t superblock);
25
26/*
27 * Open from a previously-recorded root.
28 */
29int dm_sm_metadata_open(struct dm_space_map *sm,
30 struct dm_transaction_manager *tm,
31 void *root_le, size_t len);
32
33#endif /* DM_SPACE_MAP_METADATA_H */
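In practice the two-phase construction is hidden behind dm_tm_create_with_sm() (see dm-transaction-manager.[hc] below), which performs the init/create steps internally.  A minimal sketch of formatting a fresh metadata area with it; the superblock validator sb_v and the SB_SM_ROOT_OFFSET constant are assumptions for illustration only:

#include "dm-transaction-manager.h"
#include "dm-space-map.h"

#define SB_SM_ROOT_OFFSET 32	/* hypothetical offset of the sm root within the superblock */

static int example_format_metadata(struct dm_block_manager *bm,
				   struct dm_block_validator *sb_v,
				   struct dm_transaction_manager **tm,
				   struct dm_space_map **sm)
{
	int r;
	size_t root_len;
	struct dm_block *sblock;

	/* Use block 0 as the superblock; returns with it write locked. */
	r = dm_tm_create_with_sm(bm, 0, sb_v, tm, sm, &sblock);
	if (r)
		return r;

	/* Phase one: write the space map changes out ... */
	r = dm_tm_pre_commit(*tm);
	if (r)
		goto bad;

	/* ... record the space map root in the superblock ... */
	r = dm_sm_root_size(*sm, &root_len);
	if (r)
		goto bad;

	r = dm_sm_copy_root(*sm, dm_block_data(sblock) + SB_SM_ROOT_OFFSET,
			    root_len);
	if (r)
		goto bad;

	/* ... phase two: commit the superblock last, dropping its lock. */
	return dm_tm_commit(*tm, sblock);

bad:
	dm_tm_unlock(*tm, sblock);	/* fuller teardown elided for brevity */
	return r;
}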
diff --git a/drivers/md/persistent-data/dm-space-map.h b/drivers/md/persistent-data/dm-space-map.h
new file mode 100644
index 000000000000..1cbfc6b1638a
--- /dev/null
+++ b/drivers/md/persistent-data/dm-space-map.h
@@ -0,0 +1,134 @@
1/*
2 * Copyright (C) 2011 Red Hat, Inc.
3 *
4 * This file is released under the GPL.
5 */
6
7#ifndef _LINUX_DM_SPACE_MAP_H
8#define _LINUX_DM_SPACE_MAP_H
9
10#include "dm-block-manager.h"
11
12/*
13 * struct dm_space_map keeps a record of how many times each block in a device
14 * is referenced. It needs to be fixed on disk as part of the transaction.
15 */
16struct dm_space_map {
17 void (*destroy)(struct dm_space_map *sm);
18
19 /*
20 * You must commit before allocating the newly added space.
21 */
22 int (*extend)(struct dm_space_map *sm, dm_block_t extra_blocks);
23
24 /*
25 * Extensions do not appear in this count until after commit has
26 * been called.
27 */
28 int (*get_nr_blocks)(struct dm_space_map *sm, dm_block_t *count);
29
30 /*
31 * Space maps must never allocate a block from the previous
32 * transaction, in case we need to rollback. This complicates the
33 * semantics of get_nr_free(): it should return the number of blocks
34 * that are available for allocation _now_.  For instance, you may
35 * have blocks with a zero reference count that will not be
36 * available for allocation until after the next commit.
37 */
38 int (*get_nr_free)(struct dm_space_map *sm, dm_block_t *count);
39
40 int (*get_count)(struct dm_space_map *sm, dm_block_t b, uint32_t *result);
41 int (*count_is_more_than_one)(struct dm_space_map *sm, dm_block_t b,
42 int *result);
43 int (*set_count)(struct dm_space_map *sm, dm_block_t b, uint32_t count);
44
45 int (*commit)(struct dm_space_map *sm);
46
47 int (*inc_block)(struct dm_space_map *sm, dm_block_t b);
48 int (*dec_block)(struct dm_space_map *sm, dm_block_t b);
49
50 /*
51 * new_block will increment the returned block.
52 */
53 int (*new_block)(struct dm_space_map *sm, dm_block_t *b);
54
55 /*
56 * The root contains all the information needed to fix the space map.
57 * Generally this info is small, so squirrel it away in a disk block
58 * along with other info.
59 */
60 int (*root_size)(struct dm_space_map *sm, size_t *result);
61 int (*copy_root)(struct dm_space_map *sm, void *copy_to_here_le, size_t len);
62};
63
64/*----------------------------------------------------------------*/
65
66static inline void dm_sm_destroy(struct dm_space_map *sm)
67{
68 sm->destroy(sm);
69}
70
71static inline int dm_sm_extend(struct dm_space_map *sm, dm_block_t extra_blocks)
72{
73 return sm->extend(sm, extra_blocks);
74}
75
76static inline int dm_sm_get_nr_blocks(struct dm_space_map *sm, dm_block_t *count)
77{
78 return sm->get_nr_blocks(sm, count);
79}
80
81static inline int dm_sm_get_nr_free(struct dm_space_map *sm, dm_block_t *count)
82{
83 return sm->get_nr_free(sm, count);
84}
85
86static inline int dm_sm_get_count(struct dm_space_map *sm, dm_block_t b,
87 uint32_t *result)
88{
89 return sm->get_count(sm, b, result);
90}
91
92static inline int dm_sm_count_is_more_than_one(struct dm_space_map *sm,
93 dm_block_t b, int *result)
94{
95 return sm->count_is_more_than_one(sm, b, result);
96}
97
98static inline int dm_sm_set_count(struct dm_space_map *sm, dm_block_t b,
99 uint32_t count)
100{
101 return sm->set_count(sm, b, count);
102}
103
104static inline int dm_sm_commit(struct dm_space_map *sm)
105{
106 return sm->commit(sm);
107}
108
109static inline int dm_sm_inc_block(struct dm_space_map *sm, dm_block_t b)
110{
111 return sm->inc_block(sm, b);
112}
113
114static inline int dm_sm_dec_block(struct dm_space_map *sm, dm_block_t b)
115{
116 return sm->dec_block(sm, b);
117}
118
119static inline int dm_sm_new_block(struct dm_space_map *sm, dm_block_t *b)
120{
121 return sm->new_block(sm, b);
122}
123
124static inline int dm_sm_root_size(struct dm_space_map *sm, size_t *result)
125{
126 return sm->root_size(sm, result);
127}
128
129static inline int dm_sm_copy_root(struct dm_space_map *sm, void *copy_to_here_le, size_t len)
130{
131 return sm->copy_root(sm, copy_to_here_le, len);
132}
133
134#endif /* _LINUX_DM_SPACE_MAP_H */
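A small usage sketch of the inline wrappers above (sm is assumed to be a space map created elsewhere in this patch; error handling is abbreviated):

#include "dm-space-map.h"

static int example_refcount_round_trip(struct dm_space_map *sm)
{
	int r, shared;
	uint32_t count;
	dm_block_t b;

	r = dm_sm_new_block(sm, &b);		/* allocates b with a count of 1 */
	if (r)
		return r;

	r = dm_sm_inc_block(sm, b);		/* the block is now shared */
	if (r)
		return r;

	r = dm_sm_get_count(sm, b, &count);
	if (r)
		return r;

	r = dm_sm_count_is_more_than_one(sm, b, &shared);
	if (r)
		return r;

	if (count != 2 || !shared)
		return -EINVAL;			/* would indicate a reference counting bug */

	/* Drop one reference; the block stays allocated with a count of 1. */
	return dm_sm_dec_block(sm, b);
}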
diff --git a/drivers/md/persistent-data/dm-transaction-manager.c b/drivers/md/persistent-data/dm-transaction-manager.c
new file mode 100644
index 000000000000..728e89a3f978
--- /dev/null
+++ b/drivers/md/persistent-data/dm-transaction-manager.c
@@ -0,0 +1,400 @@
1/*
2 * Copyright (C) 2011 Red Hat, Inc.
3 *
4 * This file is released under the GPL.
5 */
6#include "dm-transaction-manager.h"
7#include "dm-space-map.h"
8#include "dm-space-map-checker.h"
9#include "dm-space-map-disk.h"
10#include "dm-space-map-metadata.h"
11#include "dm-persistent-data-internal.h"
12
13#include <linux/module.h>
14#include <linux/slab.h>
15#include <linux/device-mapper.h>
16
17#define DM_MSG_PREFIX "transaction manager"
18
19/*----------------------------------------------------------------*/
20
21struct shadow_info {
22 struct hlist_node hlist;
23 dm_block_t where;
24};
25
26/*
27 * It would be nice if we scaled with the size of the transaction.
28 */
29#define HASH_SIZE 256
30#define HASH_MASK (HASH_SIZE - 1)
31
32struct dm_transaction_manager {
33 int is_clone;
34 struct dm_transaction_manager *real;
35
36 struct dm_block_manager *bm;
37 struct dm_space_map *sm;
38
39 spinlock_t lock;
40 struct hlist_head buckets[HASH_SIZE];
41};
42
43/*----------------------------------------------------------------*/
44
45static int is_shadow(struct dm_transaction_manager *tm, dm_block_t b)
46{
47 int r = 0;
48 unsigned bucket = dm_hash_block(b, HASH_MASK);
49 struct shadow_info *si;
50 struct hlist_node *n;
51
52 spin_lock(&tm->lock);
53 hlist_for_each_entry(si, n, tm->buckets + bucket, hlist)
54 if (si->where == b) {
55 r = 1;
56 break;
57 }
58 spin_unlock(&tm->lock);
59
60 return r;
61}
62
63/*
64 * This can silently fail if there's no memory. We're ok with this since
65 * creating redundant shadows causes no harm.
66 */
67static void insert_shadow(struct dm_transaction_manager *tm, dm_block_t b)
68{
69 unsigned bucket;
70 struct shadow_info *si;
71
72 si = kmalloc(sizeof(*si), GFP_NOIO);
73 if (si) {
74 si->where = b;
75 bucket = dm_hash_block(b, HASH_MASK);
76 spin_lock(&tm->lock);
77 hlist_add_head(&si->hlist, tm->buckets + bucket);
78 spin_unlock(&tm->lock);
79 }
80}
81
82static void wipe_shadow_table(struct dm_transaction_manager *tm)
83{
84 struct shadow_info *si;
85 struct hlist_node *n, *tmp;
86 struct hlist_head *bucket;
87 int i;
88
89 spin_lock(&tm->lock);
90 for (i = 0; i < HASH_SIZE; i++) {
91 bucket = tm->buckets + i;
92 hlist_for_each_entry_safe(si, n, tmp, bucket, hlist)
93 kfree(si);
94
95 INIT_HLIST_HEAD(bucket);
96 }
97
98 spin_unlock(&tm->lock);
99}
100
101/*----------------------------------------------------------------*/
102
103static struct dm_transaction_manager *dm_tm_create(struct dm_block_manager *bm,
104 struct dm_space_map *sm)
105{
106 int i;
107 struct dm_transaction_manager *tm;
108
109 tm = kmalloc(sizeof(*tm), GFP_KERNEL);
110 if (!tm)
111 return ERR_PTR(-ENOMEM);
112
113 tm->is_clone = 0;
114 tm->real = NULL;
115 tm->bm = bm;
116 tm->sm = sm;
117
118 spin_lock_init(&tm->lock);
119 for (i = 0; i < HASH_SIZE; i++)
120 INIT_HLIST_HEAD(tm->buckets + i);
121
122 return tm;
123}
124
125struct dm_transaction_manager *dm_tm_create_non_blocking_clone(struct dm_transaction_manager *real)
126{
127 struct dm_transaction_manager *tm;
128
129 tm = kmalloc(sizeof(*tm), GFP_KERNEL);
130 if (tm) {
131 tm->is_clone = 1;
132 tm->real = real;
133 }
134
135 return tm;
136}
137EXPORT_SYMBOL_GPL(dm_tm_create_non_blocking_clone);
138
139void dm_tm_destroy(struct dm_transaction_manager *tm)
140{
141 kfree(tm);
142}
143EXPORT_SYMBOL_GPL(dm_tm_destroy);
144
145int dm_tm_pre_commit(struct dm_transaction_manager *tm)
146{
147 int r;
148
149 if (tm->is_clone)
150 return -EWOULDBLOCK;
151
152 r = dm_sm_commit(tm->sm);
153 if (r < 0)
154 return r;
155
156 return 0;
157}
158EXPORT_SYMBOL_GPL(dm_tm_pre_commit);
159
160int dm_tm_commit(struct dm_transaction_manager *tm, struct dm_block *root)
161{
162 if (tm->is_clone)
163 return -EWOULDBLOCK;
164
165 wipe_shadow_table(tm);
166
167 return dm_bm_flush_and_unlock(tm->bm, root);
168}
169EXPORT_SYMBOL_GPL(dm_tm_commit);
170
171int dm_tm_new_block(struct dm_transaction_manager *tm,
172 struct dm_block_validator *v,
173 struct dm_block **result)
174{
175 int r;
176 dm_block_t new_block;
177
178 if (tm->is_clone)
179 return -EWOULDBLOCK;
180
181 r = dm_sm_new_block(tm->sm, &new_block);
182 if (r < 0)
183 return r;
184
185 r = dm_bm_write_lock_zero(tm->bm, new_block, v, result);
186 if (r < 0) {
187 dm_sm_dec_block(tm->sm, new_block);
188 return r;
189 }
190
191 /*
192 * New blocks count as shadows in that they don't need to be
193 * shadowed again.
194 */
195 insert_shadow(tm, new_block);
196
197 return 0;
198}
199
200static int __shadow_block(struct dm_transaction_manager *tm, dm_block_t orig,
201 struct dm_block_validator *v,
202 struct dm_block **result)
203{
204 int r;
205 dm_block_t new;
206 struct dm_block *orig_block;
207
208 r = dm_sm_new_block(tm->sm, &new);
209 if (r < 0)
210 return r;
211
212 r = dm_sm_dec_block(tm->sm, orig);
213 if (r < 0)
214 return r;
215
216 r = dm_bm_read_lock(tm->bm, orig, v, &orig_block);
217 if (r < 0)
218 return r;
219
220 r = dm_bm_unlock_move(orig_block, new);
221 if (r < 0) {
222 dm_bm_unlock(orig_block);
223 return r;
224 }
225
226 return dm_bm_write_lock(tm->bm, new, v, result);
227}
228
229int dm_tm_shadow_block(struct dm_transaction_manager *tm, dm_block_t orig,
230 struct dm_block_validator *v, struct dm_block **result,
231 int *inc_children)
232{
233 int r;
234
235 if (tm->is_clone)
236 return -EWOULDBLOCK;
237
238 r = dm_sm_count_is_more_than_one(tm->sm, orig, inc_children);
239 if (r < 0)
240 return r;
241
242 if (is_shadow(tm, orig) && !*inc_children)
243 return dm_bm_write_lock(tm->bm, orig, v, result);
244
245 r = __shadow_block(tm, orig, v, result);
246 if (r < 0)
247 return r;
248 insert_shadow(tm, dm_block_location(*result));
249
250 return r;
251}
252
253int dm_tm_read_lock(struct dm_transaction_manager *tm, dm_block_t b,
254 struct dm_block_validator *v,
255 struct dm_block **blk)
256{
257 if (tm->is_clone)
258 return dm_bm_read_try_lock(tm->real->bm, b, v, blk);
259
260 return dm_bm_read_lock(tm->bm, b, v, blk);
261}
262
263int dm_tm_unlock(struct dm_transaction_manager *tm, struct dm_block *b)
264{
265 return dm_bm_unlock(b);
266}
267EXPORT_SYMBOL_GPL(dm_tm_unlock);
268
269void dm_tm_inc(struct dm_transaction_manager *tm, dm_block_t b)
270{
271 /*
272 * The non-blocking clone doesn't support this.
273 */
274 BUG_ON(tm->is_clone);
275
276 dm_sm_inc_block(tm->sm, b);
277}
278EXPORT_SYMBOL_GPL(dm_tm_inc);
279
280void dm_tm_dec(struct dm_transaction_manager *tm, dm_block_t b)
281{
282 /*
283 * The non-blocking clone doesn't support this.
284 */
285 BUG_ON(tm->is_clone);
286
287 dm_sm_dec_block(tm->sm, b);
288}
289EXPORT_SYMBOL_GPL(dm_tm_dec);
290
291int dm_tm_ref(struct dm_transaction_manager *tm, dm_block_t b,
292 uint32_t *result)
293{
294 if (tm->is_clone)
295 return -EWOULDBLOCK;
296
297 return dm_sm_get_count(tm->sm, b, result);
298}
299
300struct dm_block_manager *dm_tm_get_bm(struct dm_transaction_manager *tm)
301{
302 return tm->bm;
303}
304
305/*----------------------------------------------------------------*/
306
307static int dm_tm_create_internal(struct dm_block_manager *bm,
308 dm_block_t sb_location,
309 struct dm_block_validator *sb_validator,
310 size_t root_offset, size_t root_max_len,
311 struct dm_transaction_manager **tm,
312 struct dm_space_map **sm,
313 struct dm_block **sblock,
314 int create)
315{
316 int r;
317 struct dm_space_map *inner;
318
319 inner = dm_sm_metadata_init();
320 if (IS_ERR(inner))
321 return PTR_ERR(inner);
322
323 *tm = dm_tm_create(bm, inner);
324 if (IS_ERR(*tm)) {
325 dm_sm_destroy(inner);
326 return PTR_ERR(*tm);
327 }
328
329 if (create) {
330 r = dm_bm_write_lock_zero(dm_tm_get_bm(*tm), sb_location,
331 sb_validator, sblock);
332 if (r < 0) {
333 DMERR("couldn't lock superblock");
334 goto bad1;
335 }
336
337 r = dm_sm_metadata_create(inner, *tm, dm_bm_nr_blocks(bm),
338 sb_location);
339 if (r) {
340 DMERR("couldn't create metadata space map");
341 goto bad2;
342 }
343
344 *sm = dm_sm_checker_create(inner);
345 if (!*sm)
346 goto bad2;
347
348 } else {
349 r = dm_bm_write_lock(dm_tm_get_bm(*tm), sb_location,
350 sb_validator, sblock);
351 if (r < 0) {
352 DMERR("couldn't lock superblock");
353 goto bad1;
354 }
355
356 r = dm_sm_metadata_open(inner, *tm,
357 dm_block_data(*sblock) + root_offset,
358 root_max_len);
359 if (r) {
360 DMERR("couldn't open metadata space map");
361 goto bad2;
362 }
363
364 *sm = dm_sm_checker_create(inner);
365 if (!*sm)
366 goto bad2;
367 }
368
369 return 0;
370
371bad2:
372 dm_tm_unlock(*tm, *sblock);
373bad1:
374 dm_tm_destroy(*tm);
375 dm_sm_destroy(inner);
376 return r;
377}
378
379int dm_tm_create_with_sm(struct dm_block_manager *bm, dm_block_t sb_location,
380 struct dm_block_validator *sb_validator,
381 struct dm_transaction_manager **tm,
382 struct dm_space_map **sm, struct dm_block **sblock)
383{
384 return dm_tm_create_internal(bm, sb_location, sb_validator,
385 0, 0, tm, sm, sblock, 1);
386}
387EXPORT_SYMBOL_GPL(dm_tm_create_with_sm);
388
389int dm_tm_open_with_sm(struct dm_block_manager *bm, dm_block_t sb_location,
390 struct dm_block_validator *sb_validator,
391 size_t root_offset, size_t root_max_len,
392 struct dm_transaction_manager **tm,
393 struct dm_space_map **sm, struct dm_block **sblock)
394{
395 return dm_tm_create_internal(bm, sb_location, sb_validator, root_offset,
396 root_max_len, tm, sm, sblock, 0);
397}
398EXPORT_SYMBOL_GPL(dm_tm_open_with_sm);
399
400/*----------------------------------------------------------------*/
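A minimal sketch of the copy-on-write pattern these functions support: shadow a block within the current transaction, update it, and hand the new location back so the parent structure can be repointed.  The example_node layout and node_validator are assumptions for illustration only:

#include "dm-transaction-manager.h"

struct example_node {
	__le64 value;	/* hypothetical single-field on-disk node */
};

static int example_update_node(struct dm_transaction_manager *tm,
			       struct dm_block_validator *node_validator,
			       dm_block_t node, uint64_t value,
			       dm_block_t *new_location)
{
	int r, inc_children;
	struct dm_block *b;
	struct example_node *n;

	/*
	 * Shadowing the same block twice within one transaction is elided,
	 * so this is cheap to call on every update.
	 */
	r = dm_tm_shadow_block(tm, node, node_validator, &b, &inc_children);
	if (r)
		return r;

	/*
	 * If the node had been shared (inc_children set), any blocks it
	 * references would need dm_tm_inc() calling on them.  This
	 * single-field node references nothing, so there's nothing to do.
	 */
	n = dm_block_data(b);
	n->value = cpu_to_le64(value);

	*new_location = dm_block_location(b);	/* the parent must be updated to point here */

	return dm_tm_unlock(tm, b);
}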
diff --git a/drivers/md/persistent-data/dm-transaction-manager.h b/drivers/md/persistent-data/dm-transaction-manager.h
new file mode 100644
index 000000000000..6da784871db4
--- /dev/null
+++ b/drivers/md/persistent-data/dm-transaction-manager.h
@@ -0,0 +1,130 @@
1/*
2 * Copyright (C) 2011 Red Hat, Inc.
3 *
4 * This file is released under the GPL.
5 */
6
7#ifndef _LINUX_DM_TRANSACTION_MANAGER_H
8#define _LINUX_DM_TRANSACTION_MANAGER_H
9
10#include "dm-block-manager.h"
11
12struct dm_transaction_manager;
13struct dm_space_map;
14
15/*----------------------------------------------------------------*/
16
17/*
18 * This manages the scope of a transaction. It also enforces immutability
19 * of the on-disk data structures by limiting access to writeable blocks.
20 *
21 * Clients should not fiddle with the block manager directly.
22 */
23
24void dm_tm_destroy(struct dm_transaction_manager *tm);
25
26/*
27 * The non-blocking version of a transaction manager is intended for use in
28 * fast path code that needs to do lookups, e.g. a dm mapping function.
29 * You create the non-blocking variant from a normal tm. The interface is
30 * the same, except that most functions will just return -EWOULDBLOCK.
31 * Methods that return void but may block (viz. dm_tm_inc, dm_tm_dec) must
32 * not be called on a clone.  Call dm_tm_destroy() as you would with a normal
33 * tm when you've finished with it.  You may not destroy the original before
34 * its clones have been destroyed.
35 */
36struct dm_transaction_manager *dm_tm_create_non_blocking_clone(struct dm_transaction_manager *real);
37
38/*
39 * We use a 2-phase commit here.
40 *
41 * i) In the first phase the block manager is told to start flushing, and
42 * the changes to the space map are written to disk. You should interrogate
43 * your particular space map to get detail of its root node etc. to be
44 * included in your superblock.
45 *
46 * ii) @root will be committed last. You shouldn't use more than the
47 * first 512 bytes of @root if you wish the transaction to survive a power
48 * failure. You *must* have a write lock held on @root for both stage (i)
49 * and (ii). The commit will drop the write lock.
50 */
51int dm_tm_pre_commit(struct dm_transaction_manager *tm);
52int dm_tm_commit(struct dm_transaction_manager *tm, struct dm_block *root);
53
54/*
55 * These methods are the only way to get hold of a writeable block.
56 */
57
58/*
59 * dm_tm_new_block() is pretty self-explanatory. Make sure you do actually
60 * write to the whole of @data before you unlock, otherwise you could get
61 * a data leak. (The other option is for tm_new_block() to zero new blocks
62 * before handing them out, which will be redundant in most, if not all,
63 * cases).
64 * Zeroes the new block and returns with write lock held.
65 */
66int dm_tm_new_block(struct dm_transaction_manager *tm,
67 struct dm_block_validator *v,
68 struct dm_block **result);
69
70/*
71 * dm_tm_shadow_block() allocates a new block and copies the data from @orig
72 * to it.  It then decrements the reference count on the original block.  Use
73 * this to update the contents of a block in a data structure, don't
74 * confuse this with a clone - you shouldn't access the orig block after
75 * this operation. Because the tm knows the scope of the transaction it
76 * can optimise requests for a shadow of a shadow to a no-op. Don't forget
77 * to unlock when you've finished with the shadow.
78 *
79 * The @inc_children flag is used to tell the caller whether it needs to
80 * adjust reference counts for children. (Data in the block may refer to
81 * other blocks.)
82 *
83 * Shadowing implicitly drops a reference on @orig so you must not have
84 * it locked when you call this.
85 */
86int dm_tm_shadow_block(struct dm_transaction_manager *tm, dm_block_t orig,
87 struct dm_block_validator *v,
88 struct dm_block **result, int *inc_children);
89
90/*
91 * Read access. You can lock any block you want. If there's a write lock
92 * on it outstanding then it'll block.
93 */
94int dm_tm_read_lock(struct dm_transaction_manager *tm, dm_block_t b,
95 struct dm_block_validator *v,
96 struct dm_block **result);
97
98int dm_tm_unlock(struct dm_transaction_manager *tm, struct dm_block *b);
99
100/*
101 * Functions for altering the reference count of a block directly.
102 */
103void dm_tm_inc(struct dm_transaction_manager *tm, dm_block_t b);
104
105void dm_tm_dec(struct dm_transaction_manager *tm, dm_block_t b);
106
107int dm_tm_ref(struct dm_transaction_manager *tm, dm_block_t b,
108 uint32_t *result);
109
110struct dm_block_manager *dm_tm_get_bm(struct dm_transaction_manager *tm);
111
112/*
113 * A little utility that ties the knot by producing a transaction manager
114 * that has a space map managed by the transaction manager...
115 *
116 * Returns a tm that has an open transaction to write the new space map to disk.
117 * Caller should store the new sm root and commit.
118 */
119int dm_tm_create_with_sm(struct dm_block_manager *bm, dm_block_t sb_location,
120 struct dm_block_validator *sb_validator,
121 struct dm_transaction_manager **tm,
122 struct dm_space_map **sm, struct dm_block **sblock);
123
124int dm_tm_open_with_sm(struct dm_block_manager *bm, dm_block_t sb_location,
125 struct dm_block_validator *sb_validator,
126 size_t root_offset, size_t root_max_len,
127 struct dm_transaction_manager **tm,
128 struct dm_space_map **sm, struct dm_block **sblock);
129
130#endif /* _LINUX_DM_TRANSACTION_MANAGER_H */
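Finally, a sketch of the reopen path that pairs with dm_tm_create_with_sm(); sb_v, SB_SM_ROOT_OFFSET and SB_SM_ROOT_MAX_LEN are assumptions that must match whatever the formatting code originally stored in the superblock:

#include "dm-transaction-manager.h"
#include "dm-space-map.h"

#define SB_SM_ROOT_OFFSET 32	/* hypothetical location of the saved sm root */
#define SB_SM_ROOT_MAX_LEN 128	/* hypothetical upper bound on its size */

static int example_open_metadata(struct dm_block_manager *bm,
				 struct dm_block_validator *sb_v,
				 struct dm_transaction_manager **tm,
				 struct dm_space_map **sm)
{
	int r;
	struct dm_block *sblock;

	/*
	 * Reads the space map root back out of the superblock (block 0)
	 * and rebuilds the in-core space map from it.  Returns with a
	 * write lock held on the superblock.
	 */
	r = dm_tm_open_with_sm(bm, 0, sb_v, SB_SM_ROOT_OFFSET,
			       SB_SM_ROOT_MAX_LEN, tm, sm, &sblock);
	if (r)
		return r;

	/* A real target would parse the rest of its superblock here. */

	return dm_tm_unlock(*tm, sblock);
}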