Diffstat (limited to 'drivers/md')
-rw-r--r--  drivers/md/Makefile             |    5
-rw-r--r--  drivers/md/bitmap.c             |   11
-rw-r--r--  drivers/md/dm-crypt.c           |    8
-rw-r--r--  drivers/md/dm-delay.c           |    6
-rw-r--r--  drivers/md/dm-exception-store.c |  749
-rw-r--r--  drivers/md/dm-exception-store.h |  148
-rw-r--r--  drivers/md/dm-io.c              |    2
-rw-r--r--  drivers/md/dm-ioctl.c           |   16
-rw-r--r--  drivers/md/dm-linear.c          |    6
-rw-r--r--  drivers/md/dm-log.c             |   40
-rw-r--r--  drivers/md/dm-mpath.c           |   14
-rw-r--r--  drivers/md/dm-raid1.c           |   24
-rw-r--r--  drivers/md/dm-snap-persistent.c |  704
-rw-r--r--  drivers/md/dm-snap-transient.c  |   98
-rw-r--r--  drivers/md/dm-snap.c            |   48
-rw-r--r--  drivers/md/dm-snap.h            |  129
-rw-r--r--  drivers/md/dm-stripe.c          |    4
-rw-r--r--  drivers/md/dm-sysfs.c           |   99
-rw-r--r--  drivers/md/dm-table.c           |   47
-rw-r--r--  drivers/md/dm-target.c          |   15
-rw-r--r--  drivers/md/dm-zero.c            |    5
-rw-r--r--  drivers/md/dm.c                 |  111
-rw-r--r--  drivers/md/dm.h                 |   10
-rw-r--r--  drivers/md/faulty.c             |    3
-rw-r--r--  drivers/md/linear.c             |    3
-rw-r--r--  drivers/md/md.c                 |  416
-rw-r--r--  drivers/md/multipath.c          |    3
-rw-r--r--  drivers/md/raid0.c              |  178
-rw-r--r--  drivers/md/raid1.c              |   11
-rw-r--r--  drivers/md/raid10.c             |    3
-rw-r--r--  drivers/md/raid5.c              |    8
31 files changed, 1702 insertions(+), 1222 deletions(-)
diff --git a/drivers/md/Makefile b/drivers/md/Makefile
index 1c615804ea76..72880b7e28d9 100644
--- a/drivers/md/Makefile
+++ b/drivers/md/Makefile
@@ -3,9 +3,10 @@
 #
 
 dm-mod-objs	:= dm.o dm-table.o dm-target.o dm-linear.o dm-stripe.o \
-		   dm-ioctl.o dm-io.o dm-kcopyd.o
+		   dm-ioctl.o dm-io.o dm-kcopyd.o dm-sysfs.o
 dm-multipath-objs := dm-path-selector.o dm-mpath.o
-dm-snapshot-objs := dm-snap.o dm-exception-store.o
+dm-snapshot-objs := dm-snap.o dm-exception-store.o dm-snap-transient.o \
+		    dm-snap-persistent.o
 dm-mirror-objs	:= dm-raid1.o
 md-mod-objs	:= md.o bitmap.o
 raid456-objs	:= raid5.o raid6algos.o raid6recov.o raid6tables.o \
diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c
index ab7c8e4a61f9..719943763391 100644
--- a/drivers/md/bitmap.c
+++ b/drivers/md/bitmap.c
@@ -215,7 +215,6 @@ static struct page *read_sb_page(mddev_t *mddev, long offset,
 	/* choose a good rdev and read the page from there */
 
 	mdk_rdev_t *rdev;
-	struct list_head *tmp;
 	sector_t target;
 
 	if (!page)
@@ -223,7 +222,7 @@ static struct page *read_sb_page(mddev_t *mddev, long offset,
 	if (!page)
 		return ERR_PTR(-ENOMEM);
 
-	rdev_for_each(rdev, tmp, mddev) {
+	list_for_each_entry(rdev, &mddev->disks, same_set) {
 		if (! test_bit(In_sync, &rdev->flags)
 		    || test_bit(Faulty, &rdev->flags))
 			continue;
@@ -964,9 +963,11 @@ static int bitmap_init_from_disk(struct bitmap *bitmap, sector_t start)
 				 */
 				page = bitmap->sb_page;
 				offset = sizeof(bitmap_super_t);
-				read_sb_page(bitmap->mddev, bitmap->offset,
-					     page,
-					     index, count);
+				if (!file)
+					read_sb_page(bitmap->mddev,
+						     bitmap->offset,
+						     page,
+						     index, count);
 			} else if (file) {
 				page = read_page(file, index, bitmap, count);
 				offset = 0;
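
Note on the bitmap.c change above: rdev_for_each() needed a caller-supplied struct list_head cursor, while list_for_each_entry() keeps the cursor internal, so the tmp variable can go. A minimal sketch of the pattern, with hypothetical struct and field names (not kernel code):

	#include <linux/list.h>

	struct item {
		int value;
		struct list_head same_set;	/* linkage in the owner's list */
	};

	/* Walk every item on 'disks' without a separate cursor variable. */
	static int sum_items(struct list_head *disks)
	{
		struct item *it;
		int total = 0;

		list_for_each_entry(it, disks, same_set)
			total += it->value;

		return total;
	}
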
diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index ce26c84af064..35bda49796fb 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -1060,7 +1060,7 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 		goto bad_page_pool;
 	}
 
-	cc->bs = bioset_create(MIN_IOS, MIN_IOS);
+	cc->bs = bioset_create(MIN_IOS, 0);
 	if (!cc->bs) {
 		ti->error = "Cannot allocate crypt bioset";
 		goto bad_bs;
@@ -1322,11 +1322,7 @@ static int __init dm_crypt_init(void)
 
 static void __exit dm_crypt_exit(void)
 {
-	int r = dm_unregister_target(&crypt_target);
-
-	if (r < 0)
-		DMERR("unregister failed %d", r);
-
+	dm_unregister_target(&crypt_target);
 	kmem_cache_destroy(_crypt_io_pool);
 }
 
diff --git a/drivers/md/dm-delay.c b/drivers/md/dm-delay.c
index 848b381f1173..59ee1b015d2d 100644
--- a/drivers/md/dm-delay.c
+++ b/drivers/md/dm-delay.c
@@ -364,11 +364,7 @@ bad_queue:
 
 static void __exit dm_delay_exit(void)
 {
-	int r = dm_unregister_target(&delay_target);
-
-	if (r < 0)
-		DMERR("unregister failed %d", r);
-
+	dm_unregister_target(&delay_target);
 	kmem_cache_destroy(delayed_cache);
 	destroy_workqueue(kdelayd_wq);
 }
diff --git a/drivers/md/dm-exception-store.c b/drivers/md/dm-exception-store.c
index 01590f3e0009..dccbfb0e010f 100644
--- a/drivers/md/dm-exception-store.c
+++ b/drivers/md/dm-exception-store.c
@@ -1,756 +1,45 @@
 /*
- * dm-exception-store.c
- *
  * Copyright (C) 2001-2002 Sistina Software (UK) Limited.
- * Copyright (C) 2006 Red Hat GmbH
+ * Copyright (C) 2006-2008 Red Hat GmbH
  *
  * This file is released under the GPL.
  */
 
-#include "dm-snap.h"
+#include "dm-exception-store.h"
 
 #include <linux/mm.h>
 #include <linux/pagemap.h>
 #include <linux/vmalloc.h>
 #include <linux/slab.h>
-#include <linux/dm-io.h>
-#include <linux/dm-kcopyd.h>
-
-#define DM_MSG_PREFIX "snapshots"
-#define DM_CHUNK_SIZE_DEFAULT_SECTORS 32	/* 16KB */
-
-/*-----------------------------------------------------------------
- * Persistent snapshots, by persistent we mean that the snapshot
- * will survive a reboot.
- *---------------------------------------------------------------*/
-
-/*
- * We need to store a record of which parts of the origin have
- * been copied to the snapshot device.  The snapshot code
- * requires that we copy exception chunks to chunk aligned areas
- * of the COW store.  It makes sense therefore, to store the
- * metadata in chunk size blocks.
- *
- * There is no backward or forward compatibility implemented,
- * snapshots with different disk versions than the kernel will
- * not be usable.  It is expected that "lvcreate" will blank out
- * the start of a fresh COW device before calling the snapshot
- * constructor.
- *
- * The first chunk of the COW device just contains the header.
- * After this there is a chunk filled with exception metadata,
- * followed by as many exception chunks as can fit in the
- * metadata areas.
- *
- * All on disk structures are in little-endian format.  The end
- * of the exceptions info is indicated by an exception with a
- * new_chunk of 0, which is invalid since it would point to the
- * header chunk.
- */
-
-/*
- * Magic for persistent snapshots: "SnAp" - Feeble isn't it.
- */
-#define SNAP_MAGIC 0x70416e53
-
-/*
- * The on-disk version of the metadata.
- */
-#define SNAPSHOT_DISK_VERSION 1
-
-struct disk_header {
-	uint32_t magic;
-
-	/*
-	 * Is this snapshot valid.  There is no way of recovering
-	 * an invalid snapshot.
-	 */
-	uint32_t valid;
-
-	/*
-	 * Simple, incrementing version. no backward
-	 * compatibility.
-	 */
-	uint32_t version;
-
-	/* In sectors */
-	uint32_t chunk_size;
-};
-
-struct disk_exception {
-	uint64_t old_chunk;
-	uint64_t new_chunk;
-};
-
-struct commit_callback {
-	void (*callback)(void *, int success);
-	void *context;
-};
-
-/*
- * The top level structure for a persistent exception store.
- */
-struct pstore {
-	struct dm_snapshot *snap;	/* up pointer to my snapshot */
-	int version;
-	int valid;
-	uint32_t exceptions_per_area;
-
-	/*
-	 * Now that we have an asynchronous kcopyd there is no
-	 * need for large chunk sizes, so it won't hurt to have a
-	 * whole chunk's worth of metadata in memory at once.
-	 */
-	void *area;
-
-	/*
-	 * An area of zeros used to clear the next area.
-	 */
-	void *zero_area;
-
-	/*
-	 * Used to keep track of which metadata area the data in
-	 * 'chunk' refers to.
-	 */
-	chunk_t current_area;
-
-	/*
-	 * The next free chunk for an exception.
-	 */
-	chunk_t next_free;
-
-	/*
-	 * The index of next free exception in the current
-	 * metadata area.
-	 */
-	uint32_t current_committed;
-
-	atomic_t pending_count;
-	uint32_t callback_count;
-	struct commit_callback *callbacks;
-	struct dm_io_client *io_client;
-
-	struct workqueue_struct *metadata_wq;
-};
-
-static unsigned sectors_to_pages(unsigned sectors)
-{
-	return DIV_ROUND_UP(sectors, PAGE_SIZE >> 9);
-}
-
-static int alloc_area(struct pstore *ps)
-{
-	int r = -ENOMEM;
-	size_t len;
-
-	len = ps->snap->chunk_size << SECTOR_SHIFT;
-
-	/*
-	 * Allocate the chunk_size block of memory that will hold
-	 * a single metadata area.
-	 */
-	ps->area = vmalloc(len);
-	if (!ps->area)
-		return r;
-
-	ps->zero_area = vmalloc(len);
-	if (!ps->zero_area) {
-		vfree(ps->area);
-		return r;
-	}
-	memset(ps->zero_area, 0, len);
-
-	return 0;
-}
-
-static void free_area(struct pstore *ps)
-{
-	vfree(ps->area);
-	ps->area = NULL;
-	vfree(ps->zero_area);
-	ps->zero_area = NULL;
-}
-
-struct mdata_req {
-	struct dm_io_region *where;
-	struct dm_io_request *io_req;
-	struct work_struct work;
-	int result;
-};
-
-static void do_metadata(struct work_struct *work)
-{
-	struct mdata_req *req = container_of(work, struct mdata_req, work);
-
-	req->result = dm_io(req->io_req, 1, req->where, NULL);
-}
-
-/*
- * Read or write a chunk aligned and sized block of data from a device.
- */
-static int chunk_io(struct pstore *ps, chunk_t chunk, int rw, int metadata)
-{
-	struct dm_io_region where = {
-		.bdev = ps->snap->cow->bdev,
-		.sector = ps->snap->chunk_size * chunk,
-		.count = ps->snap->chunk_size,
-	};
-	struct dm_io_request io_req = {
-		.bi_rw = rw,
-		.mem.type = DM_IO_VMA,
-		.mem.ptr.vma = ps->area,
-		.client = ps->io_client,
-		.notify.fn = NULL,
-	};
-	struct mdata_req req;
-
-	if (!metadata)
-		return dm_io(&io_req, 1, &where, NULL);
-
-	req.where = &where;
-	req.io_req = &io_req;
-
-	/*
-	 * Issue the synchronous I/O from a different thread
-	 * to avoid generic_make_request recursion.
-	 */
-	INIT_WORK(&req.work, do_metadata);
-	queue_work(ps->metadata_wq, &req.work);
-	flush_workqueue(ps->metadata_wq);
-
-	return req.result;
-}
-
-/*
- * Convert a metadata area index to a chunk index.
- */
-static chunk_t area_location(struct pstore *ps, chunk_t area)
-{
-	return 1 + ((ps->exceptions_per_area + 1) * area);
-}
-
-/*
- * Read or write a metadata area.  Remembering to skip the first
- * chunk which holds the header.
- */
-static int area_io(struct pstore *ps, int rw)
-{
-	int r;
-	chunk_t chunk;
-
-	chunk = area_location(ps, ps->current_area);
-
-	r = chunk_io(ps, chunk, rw, 0);
-	if (r)
-		return r;
-
-	return 0;
-}
-
-static void zero_memory_area(struct pstore *ps)
-{
-	memset(ps->area, 0, ps->snap->chunk_size << SECTOR_SHIFT);
-}
-
-static int zero_disk_area(struct pstore *ps, chunk_t area)
-{
-	struct dm_io_region where = {
-		.bdev = ps->snap->cow->bdev,
-		.sector = ps->snap->chunk_size * area_location(ps, area),
-		.count = ps->snap->chunk_size,
-	};
-	struct dm_io_request io_req = {
-		.bi_rw = WRITE,
-		.mem.type = DM_IO_VMA,
-		.mem.ptr.vma = ps->zero_area,
-		.client = ps->io_client,
-		.notify.fn = NULL,
-	};
-
-	return dm_io(&io_req, 1, &where, NULL);
-}
-
-static int read_header(struct pstore *ps, int *new_snapshot)
-{
-	int r;
-	struct disk_header *dh;
-	chunk_t chunk_size;
-	int chunk_size_supplied = 1;
-
-	/*
-	 * Use default chunk size (or hardsect_size, if larger) if none supplied
-	 */
-	if (!ps->snap->chunk_size) {
-		ps->snap->chunk_size = max(DM_CHUNK_SIZE_DEFAULT_SECTORS,
-		    bdev_hardsect_size(ps->snap->cow->bdev) >> 9);
-		ps->snap->chunk_mask = ps->snap->chunk_size - 1;
-		ps->snap->chunk_shift = ffs(ps->snap->chunk_size) - 1;
-		chunk_size_supplied = 0;
-	}
-
-	ps->io_client = dm_io_client_create(sectors_to_pages(ps->snap->
-							     chunk_size));
-	if (IS_ERR(ps->io_client))
-		return PTR_ERR(ps->io_client);
-
-	r = alloc_area(ps);
-	if (r)
-		return r;
-
-	r = chunk_io(ps, 0, READ, 1);
-	if (r)
-		goto bad;
-
-	dh = (struct disk_header *) ps->area;
-
-	if (le32_to_cpu(dh->magic) == 0) {
-		*new_snapshot = 1;
-		return 0;
-	}
-
-	if (le32_to_cpu(dh->magic) != SNAP_MAGIC) {
-		DMWARN("Invalid or corrupt snapshot");
-		r = -ENXIO;
-		goto bad;
-	}
-
-	*new_snapshot = 0;
-	ps->valid = le32_to_cpu(dh->valid);
-	ps->version = le32_to_cpu(dh->version);
-	chunk_size = le32_to_cpu(dh->chunk_size);
-
-	if (!chunk_size_supplied || ps->snap->chunk_size == chunk_size)
-		return 0;
-
-	DMWARN("chunk size %llu in device metadata overrides "
-	       "table chunk size of %llu.",
-	       (unsigned long long)chunk_size,
-	       (unsigned long long)ps->snap->chunk_size);
-
-	/* We had a bogus chunk_size. Fix stuff up. */
-	free_area(ps);
-
-	ps->snap->chunk_size = chunk_size;
-	ps->snap->chunk_mask = chunk_size - 1;
-	ps->snap->chunk_shift = ffs(chunk_size) - 1;
-
-	r = dm_io_client_resize(sectors_to_pages(ps->snap->chunk_size),
-				ps->io_client);
-	if (r)
-		return r;
-
-	r = alloc_area(ps);
-	return r;
-
-bad:
-	free_area(ps);
-	return r;
-}
-
-static int write_header(struct pstore *ps)
-{
-	struct disk_header *dh;
-
-	memset(ps->area, 0, ps->snap->chunk_size << SECTOR_SHIFT);
-
-	dh = (struct disk_header *) ps->area;
-	dh->magic = cpu_to_le32(SNAP_MAGIC);
-	dh->valid = cpu_to_le32(ps->valid);
-	dh->version = cpu_to_le32(ps->version);
-	dh->chunk_size = cpu_to_le32(ps->snap->chunk_size);
-
-	return chunk_io(ps, 0, WRITE, 1);
-}
-
-/*
- * Access functions for the disk exceptions, these do the endian conversions.
- */
-static struct disk_exception *get_exception(struct pstore *ps, uint32_t index)
-{
-	BUG_ON(index >= ps->exceptions_per_area);
-
-	return ((struct disk_exception *) ps->area) + index;
-}
 
-static void read_exception(struct pstore *ps,
-			   uint32_t index, struct disk_exception *result)
-{
-	struct disk_exception *e = get_exception(ps, index);
-
-	/* copy it */
-	result->old_chunk = le64_to_cpu(e->old_chunk);
-	result->new_chunk = le64_to_cpu(e->new_chunk);
-}
-
-static void write_exception(struct pstore *ps,
-			    uint32_t index, struct disk_exception *de)
-{
-	struct disk_exception *e = get_exception(ps, index);
-
-	/* copy it */
-	e->old_chunk = cpu_to_le64(de->old_chunk);
-	e->new_chunk = cpu_to_le64(de->new_chunk);
-}
+#define DM_MSG_PREFIX "snapshot exception stores"
 
-/*
- * Registers the exceptions that are present in the current area.
- * 'full' is filled in to indicate if the area has been
- * filled.
- */
-static int insert_exceptions(struct pstore *ps, int *full)
+int dm_exception_store_init(void)
 {
 	int r;
-	unsigned int i;
-	struct disk_exception de;
-
-	/* presume the area is full */
-	*full = 1;
-
-	for (i = 0; i < ps->exceptions_per_area; i++) {
-		read_exception(ps, i, &de);
-
-		/*
-		 * If the new_chunk is pointing at the start of
-		 * the COW device, where the first metadata area
-		 * is we know that we've hit the end of the
-		 * exceptions.  Therefore the area is not full.
-		 */
-		if (de.new_chunk == 0LL) {
-			ps->current_committed = i;
-			*full = 0;
-			break;
-		}
-
-		/*
-		 * Keep track of the start of the free chunks.
-		 */
-		if (ps->next_free <= de.new_chunk)
-			ps->next_free = de.new_chunk + 1;
-
-		/*
-		 * Otherwise we add the exception to the snapshot.
-		 */
-		r = dm_add_exception(ps->snap, de.old_chunk, de.new_chunk);
-		if (r)
-			return r;
-	}
-
-	return 0;
-}
-
-static int read_exceptions(struct pstore *ps)
-{
-	int r, full = 1;
-
-	/*
-	 * Keep reading chunks and inserting exceptions until
-	 * we find a partially full area.
-	 */
-	for (ps->current_area = 0; full; ps->current_area++) {
-		r = area_io(ps, READ);
-		if (r)
-			return r;
 
-		r = insert_exceptions(ps, &full);
-		if (r)
-			return r;
+	r = dm_transient_snapshot_init();
+	if (r) {
+		DMERR("Unable to register transient exception store type.");
+		goto transient_fail;
 	}
 
-	ps->current_area--;
-
-	return 0;
-}
+	r = dm_persistent_snapshot_init();
+	if (r) {
+		DMERR("Unable to register persistent exception store type");
+		goto persistent_fail;
+	}
 
-static struct pstore *get_info(struct exception_store *store)
-{
-	return (struct pstore *) store->context;
-}
-
-static void persistent_fraction_full(struct exception_store *store,
-				     sector_t *numerator, sector_t *denominator)
-{
-	*numerator = get_info(store)->next_free * store->snap->chunk_size;
-	*denominator = get_dev_size(store->snap->cow->bdev);
-}
-
-static void persistent_destroy(struct exception_store *store)
-{
-	struct pstore *ps = get_info(store);
-
-	destroy_workqueue(ps->metadata_wq);
-	dm_io_client_destroy(ps->io_client);
-	vfree(ps->callbacks);
-	free_area(ps);
-	kfree(ps);
-}
-
-static int persistent_read_metadata(struct exception_store *store)
-{
-	int r, uninitialized_var(new_snapshot);
-	struct pstore *ps = get_info(store);
-
-	/*
-	 * Read the snapshot header.
-	 */
-	r = read_header(ps, &new_snapshot);
-	if (r)
-		return r;
-
-	/*
-	 * Now we know correct chunk_size, complete the initialisation.
-	 */
-	ps->exceptions_per_area = (ps->snap->chunk_size << SECTOR_SHIFT) /
-				  sizeof(struct disk_exception);
-	ps->callbacks = dm_vcalloc(ps->exceptions_per_area,
-				   sizeof(*ps->callbacks));
-	if (!ps->callbacks)
-		return -ENOMEM;
-
-	/*
-	 * Do we need to setup a new snapshot ?
-	 */
-	if (new_snapshot) {
-		r = write_header(ps);
-		if (r) {
-			DMWARN("write_header failed");
-			return r;
-		}
-
-		ps->current_area = 0;
-		zero_memory_area(ps);
-		r = zero_disk_area(ps, 0);
-		if (r) {
-			DMWARN("zero_disk_area(0) failed");
-			return r;
-		}
-	} else {
-		/*
-		 * Sanity checks.
-		 */
-		if (ps->version != SNAPSHOT_DISK_VERSION) {
-			DMWARN("unable to handle snapshot disk version %d",
-			       ps->version);
-			return -EINVAL;
-		}
-
-		/*
-		 * Metadata are valid, but snapshot is invalidated
-		 */
-		if (!ps->valid)
-			return 1;
-
-		/*
-		 * Read the metadata.
-		 */
-		r = read_exceptions(ps);
-		if (r)
-			return r;
-	}
 
 	return 0;
-}
-
-static int persistent_prepare(struct exception_store *store,
-			      struct dm_snap_exception *e)
-{
-	struct pstore *ps = get_info(store);
-	uint32_t stride;
-	chunk_t next_free;
-	sector_t size = get_dev_size(store->snap->cow->bdev);
-
-	/* Is there enough room ? */
-	if (size < ((ps->next_free + 1) * store->snap->chunk_size))
-		return -ENOSPC;
 
-	e->new_chunk = ps->next_free;
-
-	/*
-	 * Move onto the next free pending, making sure to take
-	 * into account the location of the metadata chunks.
-	 */
-	stride = (ps->exceptions_per_area + 1);
-	next_free = ++ps->next_free;
-	if (sector_div(next_free, stride) == 1)
-		ps->next_free++;
-
-	atomic_inc(&ps->pending_count);
-	return 0;
-}
-
-static void persistent_commit(struct exception_store *store,
-			      struct dm_snap_exception *e,
-			      void (*callback) (void *, int success),
-			      void *callback_context)
-{
-	unsigned int i;
-	struct pstore *ps = get_info(store);
-	struct disk_exception de;
-	struct commit_callback *cb;
-
-	de.old_chunk = e->old_chunk;
-	de.new_chunk = e->new_chunk;
-	write_exception(ps, ps->current_committed++, &de);
-
-	/*
-	 * Add the callback to the back of the array.  This code
-	 * is the only place where the callback array is
-	 * manipulated, and we know that it will never be called
-	 * multiple times concurrently.
-	 */
-	cb = ps->callbacks + ps->callback_count++;
-	cb->callback = callback;
-	cb->context = callback_context;
-
-	/*
-	 * If there are exceptions in flight and we have not yet
-	 * filled this metadata area there's nothing more to do.
-	 */
-	if (!atomic_dec_and_test(&ps->pending_count) &&
-	    (ps->current_committed != ps->exceptions_per_area))
-		return;
-
-	/*
-	 * If we completely filled the current area, then wipe the next one.
-	 */
-	if ((ps->current_committed == ps->exceptions_per_area) &&
-	    zero_disk_area(ps, ps->current_area + 1))
-		ps->valid = 0;
-
-	/*
-	 * Commit exceptions to disk.
-	 */
-	if (ps->valid && area_io(ps, WRITE))
-		ps->valid = 0;
-
-	/*
-	 * Advance to the next area if this one is full.
-	 */
-	if (ps->current_committed == ps->exceptions_per_area) {
-		ps->current_committed = 0;
-		ps->current_area++;
-		zero_memory_area(ps);
-	}
-
-	for (i = 0; i < ps->callback_count; i++) {
-		cb = ps->callbacks + i;
-		cb->callback(cb->context, ps->valid);
-	}
-
-	ps->callback_count = 0;
-}
-
-static void persistent_drop(struct exception_store *store)
-{
-	struct pstore *ps = get_info(store);
-
-	ps->valid = 0;
-	if (write_header(ps))
-		DMWARN("write header failed");
-}
-
-int dm_create_persistent(struct exception_store *store)
-{
-	struct pstore *ps;
-
-	/* allocate the pstore */
-	ps = kmalloc(sizeof(*ps), GFP_KERNEL);
-	if (!ps)
-		return -ENOMEM;
-
-	ps->snap = store->snap;
-	ps->valid = 1;
-	ps->version = SNAPSHOT_DISK_VERSION;
-	ps->area = NULL;
-	ps->next_free = 2;	/* skipping the header and first area */
-	ps->current_committed = 0;
-
-	ps->callback_count = 0;
-	atomic_set(&ps->pending_count, 0);
-	ps->callbacks = NULL;
-
-	ps->metadata_wq = create_singlethread_workqueue("ksnaphd");
-	if (!ps->metadata_wq) {
-		kfree(ps);
-		DMERR("couldn't start header metadata update thread");
-		return -ENOMEM;
-	}
-
-	store->destroy = persistent_destroy;
-	store->read_metadata = persistent_read_metadata;
-	store->prepare_exception = persistent_prepare;
-	store->commit_exception = persistent_commit;
-	store->drop_snapshot = persistent_drop;
-	store->fraction_full = persistent_fraction_full;
-	store->context = ps;
-
-	return 0;
-}
-
-/*-----------------------------------------------------------------
- * Implementation of the store for non-persistent snapshots.
- *---------------------------------------------------------------*/
-struct transient_c {
-	sector_t next_free;
-};
-
-static void transient_destroy(struct exception_store *store)
-{
-	kfree(store->context);
-}
-
-static int transient_read_metadata(struct exception_store *store)
-{
-	return 0;
-}
-
-static int transient_prepare(struct exception_store *store,
-			     struct dm_snap_exception *e)
-{
-	struct transient_c *tc = (struct transient_c *) store->context;
-	sector_t size = get_dev_size(store->snap->cow->bdev);
-
-	if (size < (tc->next_free + store->snap->chunk_size))
-		return -1;
-
-	e->new_chunk = sector_to_chunk(store->snap, tc->next_free);
-	tc->next_free += store->snap->chunk_size;
-
-	return 0;
-}
-
-static void transient_commit(struct exception_store *store,
-			     struct dm_snap_exception *e,
-			     void (*callback) (void *, int success),
-			     void *callback_context)
-{
-	/* Just succeed */
-	callback(callback_context, 1);
-}
-
-static void transient_fraction_full(struct exception_store *store,
-				    sector_t *numerator, sector_t *denominator)
-{
-	*numerator = ((struct transient_c *) store->context)->next_free;
-	*denominator = get_dev_size(store->snap->cow->bdev);
+persistent_fail:
+	dm_persistent_snapshot_exit();
+transient_fail:
+	return r;
 }
 
-int dm_create_transient(struct exception_store *store)
+void dm_exception_store_exit(void)
 {
-	struct transient_c *tc;
-
-	store->destroy = transient_destroy;
-	store->read_metadata = transient_read_metadata;
-	store->prepare_exception = transient_prepare;
-	store->commit_exception = transient_commit;
-	store->drop_snapshot = NULL;
-	store->fraction_full = transient_fraction_full;
-
-	tc = kmalloc(sizeof(struct transient_c), GFP_KERNEL);
-	if (!tc)
-		return -ENOMEM;
-
-	tc->next_free = 0;
-	store->context = tc;
-
-	return 0;
+	dm_persistent_snapshot_exit();
+	dm_transient_snapshot_exit();
 }
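
The rewritten dm_exception_store_init() above follows the usual kernel unwind idiom: register each store type in order and, on failure, fall through labels that undo what already succeeded. A generic sketch of the idiom with hypothetical step functions (not part of this commit):

	static int example_init(void)
	{
		int r;

		r = step_a_init();
		if (r)
			goto a_fail;

		r = step_b_init();
		if (r)
			goto b_fail;

		return 0;

	b_fail:
		step_a_exit();	/* undo the step that did succeed */
	a_fail:
		return r;
	}
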
diff --git a/drivers/md/dm-exception-store.h b/drivers/md/dm-exception-store.h
new file mode 100644
index 000000000000..bb9f33d5daa2
--- /dev/null
+++ b/drivers/md/dm-exception-store.h
@@ -0,0 +1,148 @@
+/*
+ * Copyright (C) 2001-2002 Sistina Software (UK) Limited.
+ * Copyright (C) 2008 Red Hat, Inc. All rights reserved.
+ *
+ * Device-mapper snapshot exception store.
+ *
+ * This file is released under the GPL.
+ */
+
+#ifndef _LINUX_DM_EXCEPTION_STORE
+#define _LINUX_DM_EXCEPTION_STORE
+
+#include <linux/blkdev.h>
+#include <linux/device-mapper.h>
+
+/*
+ * The snapshot code deals with largish chunks of the disk at a
+ * time. Typically 32k - 512k.
+ */
+typedef sector_t chunk_t;
+
+/*
+ * An exception is used where an old chunk of data has been
+ * replaced by a new one.
+ * If chunk_t is 64 bits in size, the top 8 bits of new_chunk hold the number
+ * of chunks that follow contiguously. Remaining bits hold the number of the
+ * chunk within the device.
+ */
+struct dm_snap_exception {
+	struct list_head hash_list;
+
+	chunk_t old_chunk;
+	chunk_t new_chunk;
+};
+
+/*
+ * Abstraction to handle the meta/layout of exception stores (the
+ * COW device).
+ */
+struct dm_exception_store {
+	/*
+	 * Destroys this object when you've finished with it.
+	 */
+	void (*destroy) (struct dm_exception_store *store);
+
+	/*
+	 * The target shouldn't read the COW device until this is
+	 * called. As exceptions are read from the COW, they are
+	 * reported back via the callback.
+	 */
+	int (*read_metadata) (struct dm_exception_store *store,
+			      int (*callback)(void *callback_context,
+					      chunk_t old, chunk_t new),
+			      void *callback_context);
+
+	/*
+	 * Find somewhere to store the next exception.
+	 */
+	int (*prepare_exception) (struct dm_exception_store *store,
+				  struct dm_snap_exception *e);
+
+	/*
+	 * Update the metadata with this exception.
+	 */
+	void (*commit_exception) (struct dm_exception_store *store,
+				  struct dm_snap_exception *e,
+				  void (*callback) (void *, int success),
+				  void *callback_context);
+
+	/*
+	 * The snapshot is invalid, note this in the metadata.
+	 */
+	void (*drop_snapshot) (struct dm_exception_store *store);
+
+	int (*status) (struct dm_exception_store *store, status_type_t status,
+		       char *result, unsigned int maxlen);
+
+	/*
+	 * Return how full the snapshot is.
+	 */
+	void (*fraction_full) (struct dm_exception_store *store,
+			       sector_t *numerator,
+			       sector_t *denominator);
+
+	struct dm_snapshot *snap;
+	void *context;
+};
+
+/*
+ * Functions to manipulate consecutive chunks
+ */
+# if defined(CONFIG_LBD) || (BITS_PER_LONG == 64)
+#  define DM_CHUNK_CONSECUTIVE_BITS 8
+#  define DM_CHUNK_NUMBER_BITS 56
+
+static inline chunk_t dm_chunk_number(chunk_t chunk)
+{
+	return chunk & (chunk_t)((1ULL << DM_CHUNK_NUMBER_BITS) - 1ULL);
+}
+
+static inline unsigned dm_consecutive_chunk_count(struct dm_snap_exception *e)
+{
+	return e->new_chunk >> DM_CHUNK_NUMBER_BITS;
+}
+
+static inline void dm_consecutive_chunk_count_inc(struct dm_snap_exception *e)
+{
+	e->new_chunk += (1ULL << DM_CHUNK_NUMBER_BITS);
+
+	BUG_ON(!dm_consecutive_chunk_count(e));
+}
+
+# else
+#  define DM_CHUNK_CONSECUTIVE_BITS 0
+
+static inline chunk_t dm_chunk_number(chunk_t chunk)
+{
+	return chunk;
+}
+
+static inline unsigned dm_consecutive_chunk_count(struct dm_snap_exception *e)
+{
+	return 0;
+}
+
+static inline void dm_consecutive_chunk_count_inc(struct dm_snap_exception *e)
+{
+}
+
+# endif
+
+int dm_exception_store_init(void);
+void dm_exception_store_exit(void);
+
+/*
+ * Two exception store implementations.
+ */
+int dm_persistent_snapshot_init(void);
+void dm_persistent_snapshot_exit(void);
+
+int dm_transient_snapshot_init(void);
+void dm_transient_snapshot_exit(void);
+
+int dm_create_persistent(struct dm_exception_store *store);
+
+int dm_create_transient(struct dm_exception_store *store);
+
+#endif /* _LINUX_DM_EXCEPTION_STORE */
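
On 64-bit chunk_t configurations, the header above packs a run length into the top DM_CHUNK_CONSECUTIVE_BITS of new_chunk, leaving the low 56 bits for the chunk number. A standalone user-space sketch of the same packing arithmetic (not kernel code):

	#include <stdint.h>
	#include <stdio.h>

	#define NUMBER_BITS 56

	static uint64_t chunk_number(uint64_t chunk)
	{
		return chunk & ((1ULL << NUMBER_BITS) - 1);
	}

	static unsigned consecutive_count(uint64_t chunk)
	{
		return chunk >> NUMBER_BITS;
	}

	int main(void)
	{
		uint64_t new_chunk = 1000;		/* run starts at chunk 1000 */

		new_chunk += 2ULL << NUMBER_BITS;	/* two more chunks follow */
		printf("chunk %llu, %u consecutive\n",
		       (unsigned long long)chunk_number(new_chunk),
		       consecutive_count(new_chunk));	/* chunk 1000, 2 consecutive */
		return 0;
	}
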
diff --git a/drivers/md/dm-io.c b/drivers/md/dm-io.c
index 2fd6d4450637..a34338567a2a 100644
--- a/drivers/md/dm-io.c
+++ b/drivers/md/dm-io.c
@@ -56,7 +56,7 @@ struct dm_io_client *dm_io_client_create(unsigned num_pages)
 	if (!client->pool)
 		goto bad;
 
-	client->bios = bioset_create(16, 16);
+	client->bios = bioset_create(16, 0);
 	if (!client->bios)
 		goto bad;
 
diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c
index 777c948180f9..54d0588fc1f6 100644
--- a/drivers/md/dm-ioctl.c
+++ b/drivers/md/dm-ioctl.c
@@ -233,7 +233,7 @@ static void __hash_remove(struct hash_cell *hc)
 	}
 
 	if (hc->new_map)
-		dm_table_put(hc->new_map);
+		dm_table_destroy(hc->new_map);
 	dm_put(hc->md);
 	free_cell(hc);
 }
@@ -827,8 +827,8 @@ static int do_resume(struct dm_ioctl *param)
 
 	r = dm_swap_table(md, new_map);
 	if (r) {
+		dm_table_destroy(new_map);
 		dm_put(md);
-		dm_table_put(new_map);
 		return r;
 	}
 
@@ -836,8 +836,6 @@ static int do_resume(struct dm_ioctl *param)
 			set_disk_ro(dm_disk(md), 0);
 		else
 			set_disk_ro(dm_disk(md), 1);
-
-		dm_table_put(new_map);
 	}
 
 	if (dm_suspended(md))
@@ -1080,7 +1078,7 @@ static int table_load(struct dm_ioctl *param, size_t param_size)
 	}
 
 	if (hc->new_map)
-		dm_table_put(hc->new_map);
+		dm_table_destroy(hc->new_map);
 	hc->new_map = t;
 	up_write(&_hash_lock);
 
@@ -1109,7 +1107,7 @@ static int table_clear(struct dm_ioctl *param, size_t param_size)
 	}
 
 	if (hc->new_map) {
-		dm_table_put(hc->new_map);
+		dm_table_destroy(hc->new_map);
 		hc->new_map = NULL;
 	}
 
@@ -1550,8 +1548,10 @@ int dm_copy_name_and_uuid(struct mapped_device *md, char *name, char *uuid)
 		goto out;
 	}
 
-	strcpy(name, hc->name);
-	strcpy(uuid, hc->uuid ? : "");
+	if (name)
+		strcpy(name, hc->name);
+	if (uuid)
+		strcpy(uuid, hc->uuid ? : "");
 
 out:
 	up_read(&_hash_lock);
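
With the dm-ioctl.c change above, dm_copy_name_and_uuid() tolerates a NULL name or uuid argument, so a caller wanting only one of the two values no longer has to supply scratch space for the other. A hypothetical caller sketch (buffer size as in the dm ioctl interface):

	char uuid[DM_UUID_LEN];

	/* Only the uuid is wanted; passing NULL for the name is now safe. */
	if (!dm_copy_name_and_uuid(md, NULL, uuid))
		DMINFO("device uuid: %s", uuid);
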
diff --git a/drivers/md/dm-linear.c b/drivers/md/dm-linear.c
index 44042becad8a..bfa107f59d96 100644
--- a/drivers/md/dm-linear.c
+++ b/drivers/md/dm-linear.c
@@ -142,6 +142,7 @@ static struct target_type linear_target = {
 	.status = linear_status,
 	.ioctl  = linear_ioctl,
 	.merge  = linear_merge,
+	.features = DM_TARGET_SUPPORTS_BARRIERS,
 };
 
 int __init dm_linear_init(void)
@@ -156,8 +157,5 @@ int __init dm_linear_init(void)
 
 void dm_linear_exit(void)
 {
-	int r = dm_unregister_target(&linear_target);
-
-	if (r < 0)
-		DMERR("unregister failed %d", r);
+	dm_unregister_target(&linear_target);
 }
diff --git a/drivers/md/dm-log.c b/drivers/md/dm-log.c
index a8c0fc79ca78..737961f275c1 100644
--- a/drivers/md/dm-log.c
+++ b/drivers/md/dm-log.c
@@ -326,8 +326,6 @@ static void header_from_disk(struct log_header *core, struct log_header *disk)
 static int rw_header(struct log_c *lc, int rw)
 {
 	lc->io_req.bi_rw = rw;
-	lc->io_req.mem.ptr.vma = lc->disk_header;
-	lc->io_req.notify.fn = NULL;
 
 	return dm_io(&lc->io_req, 1, &lc->header_location, NULL);
 }
@@ -362,10 +360,15 @@ static int read_header(struct log_c *log)
 	return 0;
 }
 
-static inline int write_header(struct log_c *log)
+static int _check_region_size(struct dm_target *ti, uint32_t region_size)
 {
-	header_to_disk(&log->header, log->disk_header);
-	return rw_header(log, WRITE);
+	if (region_size < 2 || region_size > ti->len)
+		return 0;
+
+	if (!is_power_of_2(region_size))
+		return 0;
+
+	return 1;
 }
 
 /*----------------------------------------------------------------
@@ -403,8 +406,9 @@ static int create_log_context(struct dm_dirty_log *log, struct dm_target *ti,
 		}
 	}
 
-	if (sscanf(argv[0], "%u", &region_size) != 1) {
-		DMWARN("invalid region size string");
+	if (sscanf(argv[0], "%u", &region_size) != 1 ||
+	    !_check_region_size(ti, region_size)) {
+		DMWARN("invalid region size %s", argv[0]);
 		return -EINVAL;
 	}
 
@@ -453,8 +457,18 @@ static int create_log_context(struct dm_dirty_log *log, struct dm_target *ti,
 	 */
 	buf_size = dm_round_up((LOG_OFFSET << SECTOR_SHIFT) +
 			       bitset_size, ti->limits.hardsect_size);
+
+	if (buf_size > dev->bdev->bd_inode->i_size) {
+		DMWARN("log device %s too small: need %llu bytes",
+		       dev->name, (unsigned long long)buf_size);
+		kfree(lc);
+		return -EINVAL;
+	}
+
 	lc->header_location.count = buf_size >> SECTOR_SHIFT;
+
 	lc->io_req.mem.type = DM_IO_VMA;
+	lc->io_req.notify.fn = NULL;
 	lc->io_req.client = dm_io_client_create(dm_div_up(buf_size,
 							  PAGE_SIZE));
 	if (IS_ERR(lc->io_req.client)) {
@@ -467,10 +481,12 @@ static int create_log_context(struct dm_dirty_log *log, struct dm_target *ti,
 		lc->disk_header = vmalloc(buf_size);
 		if (!lc->disk_header) {
 			DMWARN("couldn't allocate disk log buffer");
+			dm_io_client_destroy(lc->io_req.client);
 			kfree(lc);
 			return -ENOMEM;
 		}
 
+		lc->io_req.mem.ptr.vma = lc->disk_header;
 		lc->clean_bits = (void *)lc->disk_header +
 				 (LOG_OFFSET << SECTOR_SHIFT);
 	}
@@ -482,6 +498,8 @@ static int create_log_context(struct dm_dirty_log *log, struct dm_target *ti,
 		DMWARN("couldn't allocate sync bitset");
 		if (!dev)
 			vfree(lc->clean_bits);
+		else
+			dm_io_client_destroy(lc->io_req.client);
 		vfree(lc->disk_header);
 		kfree(lc);
 		return -ENOMEM;
@@ -495,6 +513,8 @@ static int create_log_context(struct dm_dirty_log *log, struct dm_target *ti,
 		vfree(lc->sync_bits);
 		if (!dev)
 			vfree(lc->clean_bits);
+		else
+			dm_io_client_destroy(lc->io_req.client);
 		vfree(lc->disk_header);
 		kfree(lc);
 		return -ENOMEM;
@@ -631,8 +651,10 @@ static int disk_resume(struct dm_dirty_log *log)
 	/* set the correct number of regions in the header */
 	lc->header.nr_regions = lc->region_count;
 
+	header_to_disk(&lc->header, lc->disk_header);
+
 	/* write the new header */
-	r = write_header(lc);
+	r = rw_header(lc, WRITE);
 	if (r) {
 		DMWARN("%s: Failed to write header on dirty region log device",
 		       lc->log_dev->name);
@@ -682,7 +704,7 @@ static int disk_flush(struct dm_dirty_log *log)
 	if (!lc->touched)
 		return 0;
 
-	r = write_header(lc);
+	r = rw_header(lc, WRITE);
 	if (r)
 		fail_log_device(lc);
 	else
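
The new _check_region_size() in dm-log.c centralizes the sanity rules for a user-supplied region size: at least 2 sectors, no larger than the target, and a power of two so region arithmetic can use shifts and masks. A standalone sketch of the same checks, with the power-of-two test open-coded:

	/* Mirrors the new check; plain integer types instead of dm ones. */
	static int check_region_size(unsigned long long ti_len,
				     unsigned int region_size)
	{
		if (region_size < 2 || region_size > ti_len)
			return 0;

		if (region_size & (region_size - 1))	/* !is_power_of_2() */
			return 0;

		return 1;
	}

	/* check_region_size(1024, 512) -> 1; check_region_size(1024, 384) -> 0. */
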
diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c
index 3d7f4923cd13..095f77bf9681 100644
--- a/drivers/md/dm-mpath.c
+++ b/drivers/md/dm-mpath.c
@@ -889,7 +889,7 @@ static int fail_path(struct pgpath *pgpath)
 	dm_path_uevent(DM_UEVENT_PATH_FAILED, m->ti,
 		      pgpath->path.dev->name, m->nr_valid_paths);
 
-	queue_work(kmultipathd, &m->trigger_event);
+	schedule_work(&m->trigger_event);
 	queue_work(kmultipathd, &pgpath->deactivate_path);
 
 out:
@@ -932,7 +932,7 @@ static int reinstate_path(struct pgpath *pgpath)
 	dm_path_uevent(DM_UEVENT_PATH_REINSTATED, m->ti,
 		      pgpath->path.dev->name, m->nr_valid_paths);
 
-	queue_work(kmultipathd, &m->trigger_event);
+	schedule_work(&m->trigger_event);
 
 out:
 	spin_unlock_irqrestore(&m->lock, flags);
@@ -976,7 +976,7 @@ static void bypass_pg(struct multipath *m, struct priority_group *pg,
 
 	spin_unlock_irqrestore(&m->lock, flags);
 
-	queue_work(kmultipathd, &m->trigger_event);
+	schedule_work(&m->trigger_event);
 }
 
 /*
@@ -1006,7 +1006,7 @@ static int switch_pg_num(struct multipath *m, const char *pgstr)
 	}
 	spin_unlock_irqrestore(&m->lock, flags);
 
-	queue_work(kmultipathd, &m->trigger_event);
+	schedule_work(&m->trigger_event);
 	return 0;
 }
 
@@ -1495,14 +1495,10 @@ static int __init dm_multipath_init(void)
 
 static void __exit dm_multipath_exit(void)
 {
-	int r;
-
 	destroy_workqueue(kmpath_handlerd);
 	destroy_workqueue(kmultipathd);
 
-	r = dm_unregister_target(&multipath_target);
-	if (r < 0)
-		DMERR("target unregister failed %d", r);
+	dm_unregister_target(&multipath_target);
 	kmem_cache_destroy(_mpio_cache);
 }
 
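
The dm-mpath.c hunks above move trigger_event onto the shared kernel workqueue via schedule_work(), leaving the private kmultipathd queue to path work such as deactivate_path. A minimal sketch of the two-queue split, with hypothetical work items:

	#include <linux/workqueue.h>

	static struct workqueue_struct *private_wq;	/* create_workqueue()d elsewhere */
	static struct work_struct event_work;		/* INIT_WORK()ed elsewhere */
	static struct work_struct path_work;

	static void dispatch(void)
	{
		schedule_work(&event_work);		/* shared system workqueue */
		queue_work(private_wq, &path_work);	/* dedicated workqueue */
	}
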
diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c
index ec43f9fa4b2a..4d6bc101962e 100644
--- a/drivers/md/dm-raid1.c
+++ b/drivers/md/dm-raid1.c
@@ -197,9 +197,6 @@ static void fail_mirror(struct mirror *m, enum dm_raid1_error error_type)
 	struct mirror_set *ms = m->ms;
 	struct mirror *new;
 
-	if (!errors_handled(ms))
-		return;
-
 	/*
 	 * error_count is used for nothing more than a
 	 * simple way to tell if a device has encountered
@@ -210,6 +207,9 @@ static void fail_mirror(struct mirror *m, enum dm_raid1_error error_type)
 	if (test_and_set_bit(error_type, &m->error_type))
 		return;
 
+	if (!errors_handled(ms))
+		return;
+
 	if (m != get_default_mirror(ms))
 		goto out;
 
@@ -808,12 +808,6 @@ static void free_context(struct mirror_set *ms, struct dm_target *ti,
 	kfree(ms);
 }
 
-static inline int _check_region_size(struct dm_target *ti, uint32_t size)
-{
-	return !(size % (PAGE_SIZE >> 9) || !is_power_of_2(size) ||
-		 size > ti->len);
-}
-
 static int get_mirror(struct mirror_set *ms, struct dm_target *ti,
 		      unsigned int mirror, char **argv)
 {
@@ -872,12 +866,6 @@ static struct dm_dirty_log *create_dirty_log(struct dm_target *ti,
 		return NULL;
 	}
 
-	if (!_check_region_size(ti, dl->type->get_region_size(dl))) {
-		ti->error = "Invalid region size";
-		dm_dirty_log_destroy(dl);
-		return NULL;
-	}
-
 	return dl;
 }
 
@@ -1300,11 +1288,7 @@ static int __init dm_mirror_init(void)
 
 static void __exit dm_mirror_exit(void)
 {
-	int r;
-
-	r = dm_unregister_target(&mirror_target);
-	if (r < 0)
-		DMERR("unregister failed %d", r);
+	dm_unregister_target(&mirror_target);
 }
 
 /* Module hooks */
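
In the reordered fail_mirror() above, the error bit is recorded with test_and_set_bit() before errors_handled() is consulted, so a fault is remembered even when no handler is configured, and repeat faults of the same type return early. A sketch of the pattern with a hypothetical flags word:

	#include <linux/bitops.h>

	static unsigned long error_type;	/* one bit per error kind */

	static void record_fault(int type, int handled)
	{
		/* Remember the fault first; only the first occurrence continues. */
		if (test_and_set_bit(type, &error_type))
			return;

		if (!handled)
			return;	/* recorded, but nothing more to do */

		/* ... switch the default mirror, wake recovery, etc. ... */
	}
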
diff --git a/drivers/md/dm-snap-persistent.c b/drivers/md/dm-snap-persistent.c
new file mode 100644
index 000000000000..936b34e0959f
--- /dev/null
+++ b/drivers/md/dm-snap-persistent.c
@@ -0,0 +1,704 @@
1/*
2 * Copyright (C) 2001-2002 Sistina Software (UK) Limited.
3 * Copyright (C) 2006-2008 Red Hat GmbH
4 *
5 * This file is released under the GPL.
6 */
7
8#include "dm-exception-store.h"
9#include "dm-snap.h"
10
11#include <linux/mm.h>
12#include <linux/pagemap.h>
13#include <linux/vmalloc.h>
14#include <linux/slab.h>
15#include <linux/dm-io.h>
16
17#define DM_MSG_PREFIX "persistent snapshot"
18#define DM_CHUNK_SIZE_DEFAULT_SECTORS 32 /* 16KB */
19
20/*-----------------------------------------------------------------
21 * Persistent snapshots, by persistent we mean that the snapshot
22 * will survive a reboot.
23 *---------------------------------------------------------------*/
24
25/*
26 * We need to store a record of which parts of the origin have
27 * been copied to the snapshot device. The snapshot code
28 * requires that we copy exception chunks to chunk aligned areas
29 * of the COW store. It makes sense therefore, to store the
30 * metadata in chunk size blocks.
31 *
32 * There is no backward or forward compatibility implemented,
33 * snapshots with different disk versions than the kernel will
34 * not be usable. It is expected that "lvcreate" will blank out
35 * the start of a fresh COW device before calling the snapshot
36 * constructor.
37 *
38 * The first chunk of the COW device just contains the header.
39 * After this there is a chunk filled with exception metadata,
40 * followed by as many exception chunks as can fit in the
41 * metadata areas.
42 *
43 * All on disk structures are in little-endian format. The end
44 * of the exceptions info is indicated by an exception with a
45 * new_chunk of 0, which is invalid since it would point to the
46 * header chunk.
47 */
48
49/*
50 * Magic for persistent snapshots: "SnAp" - Feeble isn't it.
51 */
52#define SNAP_MAGIC 0x70416e53
53
54/*
55 * The on-disk version of the metadata.
56 */
57#define SNAPSHOT_DISK_VERSION 1
58
59struct disk_header {
60 uint32_t magic;
61
62 /*
63 * Is this snapshot valid. There is no way of recovering
64 * an invalid snapshot.
65 */
66 uint32_t valid;
67
68 /*
69 * Simple, incrementing version. no backward
70 * compatibility.
71 */
72 uint32_t version;
73
74 /* In sectors */
75 uint32_t chunk_size;
76};
77
78struct disk_exception {
79 uint64_t old_chunk;
80 uint64_t new_chunk;
81};
82
83struct commit_callback {
84 void (*callback)(void *, int success);
85 void *context;
86};
87
88/*
89 * The top level structure for a persistent exception store.
90 */
91struct pstore {
92 struct dm_snapshot *snap; /* up pointer to my snapshot */
93 int version;
94 int valid;
95 uint32_t exceptions_per_area;
96
97 /*
98 * Now that we have an asynchronous kcopyd there is no
99 * need for large chunk sizes, so it wont hurt to have a
100 * whole chunks worth of metadata in memory at once.
101 */
102 void *area;
103
104 /*
105 * An area of zeros used to clear the next area.
106 */
107 void *zero_area;
108
109 /*
110 * Used to keep track of which metadata area the data in
111 * 'chunk' refers to.
112 */
113 chunk_t current_area;
114
115 /*
116 * The next free chunk for an exception.
117 */
118 chunk_t next_free;
119
120 /*
121 * The index of next free exception in the current
122 * metadata area.
123 */
124 uint32_t current_committed;
125
126 atomic_t pending_count;
127 uint32_t callback_count;
128 struct commit_callback *callbacks;
129 struct dm_io_client *io_client;
130
131 struct workqueue_struct *metadata_wq;
132};
133
134static unsigned sectors_to_pages(unsigned sectors)
135{
136 return DIV_ROUND_UP(sectors, PAGE_SIZE >> 9);
137}
138
139static int alloc_area(struct pstore *ps)
140{
141 int r = -ENOMEM;
142 size_t len;
143
144 len = ps->snap->chunk_size << SECTOR_SHIFT;
145
146 /*
147 * Allocate the chunk_size block of memory that will hold
148 * a single metadata area.
149 */
150 ps->area = vmalloc(len);
151 if (!ps->area)
152 return r;
153
154 ps->zero_area = vmalloc(len);
155 if (!ps->zero_area) {
156 vfree(ps->area);
157 return r;
158 }
159 memset(ps->zero_area, 0, len);
160
161 return 0;
162}
163
164static void free_area(struct pstore *ps)
165{
166 vfree(ps->area);
167 ps->area = NULL;
168 vfree(ps->zero_area);
169 ps->zero_area = NULL;
170}
171
172struct mdata_req {
173 struct dm_io_region *where;
174 struct dm_io_request *io_req;
175 struct work_struct work;
176 int result;
177};
178
179static void do_metadata(struct work_struct *work)
180{
181 struct mdata_req *req = container_of(work, struct mdata_req, work);
182
183 req->result = dm_io(req->io_req, 1, req->where, NULL);
184}
185
186/*
187 * Read or write a chunk aligned and sized block of data from a device.
188 */
189static int chunk_io(struct pstore *ps, chunk_t chunk, int rw, int metadata)
190{
191 struct dm_io_region where = {
192 .bdev = ps->snap->cow->bdev,
193 .sector = ps->snap->chunk_size * chunk,
194 .count = ps->snap->chunk_size,
195 };
196 struct dm_io_request io_req = {
197 .bi_rw = rw,
198 .mem.type = DM_IO_VMA,
199 .mem.ptr.vma = ps->area,
200 .client = ps->io_client,
201 .notify.fn = NULL,
202 };
203 struct mdata_req req;
204
205 if (!metadata)
206 return dm_io(&io_req, 1, &where, NULL);
207
208 req.where = &where;
209 req.io_req = &io_req;
210
211 /*
212 * Issue the synchronous I/O from a different thread
213 * to avoid generic_make_request recursion.
214 */
215 INIT_WORK(&req.work, do_metadata);
216 queue_work(ps->metadata_wq, &req.work);
217 flush_workqueue(ps->metadata_wq);
218
219 return req.result;
220}
221
222/*
223 * Convert a metadata area index to a chunk index.
224 */
225static chunk_t area_location(struct pstore *ps, chunk_t area)
226{
227 return 1 + ((ps->exceptions_per_area + 1) * area);
228}
229
230/*
231 * Read or write a metadata area. Remembering to skip the first
232 * chunk which holds the header.
233 */
234static int area_io(struct pstore *ps, int rw)
235{
236 int r;
237 chunk_t chunk;
238
239 chunk = area_location(ps, ps->current_area);
240
241 r = chunk_io(ps, chunk, rw, 0);
242 if (r)
243 return r;
244
245 return 0;
246}
247
248static void zero_memory_area(struct pstore *ps)
249{
250 memset(ps->area, 0, ps->snap->chunk_size << SECTOR_SHIFT);
251}
252
253static int zero_disk_area(struct pstore *ps, chunk_t area)
254{
255 struct dm_io_region where = {
256 .bdev = ps->snap->cow->bdev,
257 .sector = ps->snap->chunk_size * area_location(ps, area),
258 .count = ps->snap->chunk_size,
259 };
260 struct dm_io_request io_req = {
261 .bi_rw = WRITE,
262 .mem.type = DM_IO_VMA,
263 .mem.ptr.vma = ps->zero_area,
264 .client = ps->io_client,
265 .notify.fn = NULL,
266 };
267
268 return dm_io(&io_req, 1, &where, NULL);
269}
270
271static int read_header(struct pstore *ps, int *new_snapshot)
272{
273 int r;
274 struct disk_header *dh;
275 chunk_t chunk_size;
276 int chunk_size_supplied = 1;
277
278 /*
279 * Use default chunk size (or hardsect_size, if larger) if none supplied
280 */
281 if (!ps->snap->chunk_size) {
282 ps->snap->chunk_size = max(DM_CHUNK_SIZE_DEFAULT_SECTORS,
283 bdev_hardsect_size(ps->snap->cow->bdev) >> 9);
284 ps->snap->chunk_mask = ps->snap->chunk_size - 1;
285 ps->snap->chunk_shift = ffs(ps->snap->chunk_size) - 1;
286 chunk_size_supplied = 0;
287 }
288
289 ps->io_client = dm_io_client_create(sectors_to_pages(ps->snap->
290 chunk_size));
291 if (IS_ERR(ps->io_client))
292 return PTR_ERR(ps->io_client);
293
294 r = alloc_area(ps);
295 if (r)
296 return r;
297
298 r = chunk_io(ps, 0, READ, 1);
299 if (r)
300 goto bad;
301
302 dh = (struct disk_header *) ps->area;
303
304 if (le32_to_cpu(dh->magic) == 0) {
305 *new_snapshot = 1;
306 return 0;
307 }
308
309 if (le32_to_cpu(dh->magic) != SNAP_MAGIC) {
310 DMWARN("Invalid or corrupt snapshot");
311 r = -ENXIO;
312 goto bad;
313 }
314
315 *new_snapshot = 0;
316 ps->valid = le32_to_cpu(dh->valid);
317 ps->version = le32_to_cpu(dh->version);
318 chunk_size = le32_to_cpu(dh->chunk_size);
319
320 if (!chunk_size_supplied || ps->snap->chunk_size == chunk_size)
321 return 0;
322
323 DMWARN("chunk size %llu in device metadata overrides "
324 "table chunk size of %llu.",
325 (unsigned long long)chunk_size,
326 (unsigned long long)ps->snap->chunk_size);
327
328 /* We had a bogus chunk_size. Fix stuff up. */
329 free_area(ps);
330
331 ps->snap->chunk_size = chunk_size;
332 ps->snap->chunk_mask = chunk_size - 1;
333 ps->snap->chunk_shift = ffs(chunk_size) - 1;
334
335 r = dm_io_client_resize(sectors_to_pages(ps->snap->chunk_size),
336 ps->io_client);
337 if (r)
338 return r;
339
340 r = alloc_area(ps);
341 return r;
342
343bad:
344 free_area(ps);
345 return r;
346}
347
348static int write_header(struct pstore *ps)
349{
350 struct disk_header *dh;
351
352 memset(ps->area, 0, ps->snap->chunk_size << SECTOR_SHIFT);
353
354 dh = (struct disk_header *) ps->area;
355 dh->magic = cpu_to_le32(SNAP_MAGIC);
356 dh->valid = cpu_to_le32(ps->valid);
357 dh->version = cpu_to_le32(ps->version);
358 dh->chunk_size = cpu_to_le32(ps->snap->chunk_size);
359
360 return chunk_io(ps, 0, WRITE, 1);
361}
362
363/*
364 * Access functions for the disk exceptions, these do the endian conversions.
365 */
366static struct disk_exception *get_exception(struct pstore *ps, uint32_t index)
367{
368 BUG_ON(index >= ps->exceptions_per_area);
369
370 return ((struct disk_exception *) ps->area) + index;
371}
372
373static void read_exception(struct pstore *ps,
374 uint32_t index, struct disk_exception *result)
375{
376 struct disk_exception *e = get_exception(ps, index);
377
378 /* copy it */
379 result->old_chunk = le64_to_cpu(e->old_chunk);
380 result->new_chunk = le64_to_cpu(e->new_chunk);
381}
382
383static void write_exception(struct pstore *ps,
384 uint32_t index, struct disk_exception *de)
385{
386 struct disk_exception *e = get_exception(ps, index);
387
388 /* copy it */
389 e->old_chunk = cpu_to_le64(de->old_chunk);
390 e->new_chunk = cpu_to_le64(de->new_chunk);
391}
392
393/*
394 * Registers the exceptions that are present in the current area.
395 * 'full' is filled in to indicate if the area has been
396 * filled.
397 */
398static int insert_exceptions(struct pstore *ps,
399 int (*callback)(void *callback_context,
400 chunk_t old, chunk_t new),
401 void *callback_context,
402 int *full)
403{
404 int r;
405 unsigned int i;
406 struct disk_exception de;
407
408 /* presume the area is full */
409 *full = 1;
410
411 for (i = 0; i < ps->exceptions_per_area; i++) {
412 read_exception(ps, i, &de);
413
414 /*
415 * If the new_chunk is pointing at the start of
416 * the COW device, where the first metadata area
417 * is we know that we've hit the end of the
418 * exceptions. Therefore the area is not full.
419 */
420 if (de.new_chunk == 0LL) {
421 ps->current_committed = i;
422 *full = 0;
423 break;
424 }
425
426 /*
427 * Keep track of the start of the free chunks.
428 */
429 if (ps->next_free <= de.new_chunk)
430 ps->next_free = de.new_chunk + 1;
431
432 /*
433 * Otherwise we add the exception to the snapshot.
434 */
435 r = callback(callback_context, de.old_chunk, de.new_chunk);
436 if (r)
437 return r;
438 }
439
440 return 0;
441}
442
443static int read_exceptions(struct pstore *ps,
444 int (*callback)(void *callback_context, chunk_t old,
445 chunk_t new),
446 void *callback_context)
447{
448 int r, full = 1;
449
450 /*
451 * Keeping reading chunks and inserting exceptions until
452 * we find a partially full area.
453 */
454 for (ps->current_area = 0; full; ps->current_area++) {
455 r = area_io(ps, READ);
456 if (r)
457 return r;
458
459 r = insert_exceptions(ps, callback, callback_context, &full);
460 if (r)
461 return r;
462 }
463
464 ps->current_area--;
465
466 return 0;
467}
468
469static struct pstore *get_info(struct dm_exception_store *store)
470{
471 return (struct pstore *) store->context;
472}
473
474static void persistent_fraction_full(struct dm_exception_store *store,
475 sector_t *numerator, sector_t *denominator)
476{
477 *numerator = get_info(store)->next_free * store->snap->chunk_size;
478 *denominator = get_dev_size(store->snap->cow->bdev);
479}
480
481static void persistent_destroy(struct dm_exception_store *store)
482{
483 struct pstore *ps = get_info(store);
484
485 destroy_workqueue(ps->metadata_wq);
486 dm_io_client_destroy(ps->io_client);
487 vfree(ps->callbacks);
488 free_area(ps);
489 kfree(ps);
490}
491
492static int persistent_read_metadata(struct dm_exception_store *store,
493 int (*callback)(void *callback_context,
494 chunk_t old, chunk_t new),
495 void *callback_context)
496{
497 int r, uninitialized_var(new_snapshot);
498 struct pstore *ps = get_info(store);
499
500 /*
501 * Read the snapshot header.
502 */
503 r = read_header(ps, &new_snapshot);
504 if (r)
505 return r;
506
507 /*
508 * Now we know correct chunk_size, complete the initialisation.
509 */
510 ps->exceptions_per_area = (ps->snap->chunk_size << SECTOR_SHIFT) /
511 sizeof(struct disk_exception);
512 ps->callbacks = dm_vcalloc(ps->exceptions_per_area,
513 sizeof(*ps->callbacks));
514 if (!ps->callbacks)
515 return -ENOMEM;
516
517 /*
518 * Do we need to setup a new snapshot ?
519 */
520 if (new_snapshot) {
521 r = write_header(ps);
522 if (r) {
523 DMWARN("write_header failed");
524 return r;
525 }
526
527 ps->current_area = 0;
528 zero_memory_area(ps);
529 r = zero_disk_area(ps, 0);
530 if (r) {
531 DMWARN("zero_disk_area(0) failed");
532 return r;
533 }
534 } else {
535 /*
536 * Sanity checks.
537 */
538 if (ps->version != SNAPSHOT_DISK_VERSION) {
539 DMWARN("unable to handle snapshot disk version %d",
540 ps->version);
541 return -EINVAL;
542 }
543
544 /*
545 * Metadata are valid, but snapshot is invalidated
546 */
547 if (!ps->valid)
548 return 1;
549
550 /*
551 * Read the metadata.
552 */
553 r = read_exceptions(ps, callback, callback_context);
554 if (r)
555 return r;
556 }
557
558 return 0;
559}
560
561static int persistent_prepare_exception(struct dm_exception_store *store,
562 struct dm_snap_exception *e)
563{
564 struct pstore *ps = get_info(store);
565 uint32_t stride;
566 chunk_t next_free;
567 sector_t size = get_dev_size(store->snap->cow->bdev);
568
569 /* Is there enough room? */
570 if (size < ((ps->next_free + 1) * store->snap->chunk_size))
571 return -ENOSPC;
572
573 e->new_chunk = ps->next_free;
574
575 /*
576 * Move on to the next free slot, making sure to take
577 * into account the location of the metadata chunks.
578 */
579 stride = (ps->exceptions_per_area + 1);
580 next_free = ++ps->next_free;
581 if (sector_div(next_free, stride) == 1)
582 ps->next_free++;
583
584 atomic_inc(&ps->pending_count);
585 return 0;
586}
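
The stride arithmetic above is easier to see with concrete numbers. Chunk 0 of the COW device holds the header, chunk 1 the first metadata area, and every (exceptions_per_area + 1)-th chunk after that is another metadata area, with data chunks filling the gaps. A hedged sketch in plain C, with an ordinary modulus standing in for sector_div() (which in the kernel also divides its first argument in place):

#include <stdio.h>
#include <stdint.h>

/* Return the next free data chunk strictly after 'next_free',
 * skipping the chunks reserved for metadata areas. */
static uint64_t advance_next_free(uint64_t next_free, uint32_t per_area)
{
        uint32_t stride = per_area + 1;  /* one metadata chunk per area */

        next_free++;
        if (next_free % stride == 1)     /* chunks 1, 1+stride, ... hold metadata */
                next_free++;
        return next_free;
}

int main(void)
{
        /* With 3 exceptions per area: metadata at chunks 1, 5, 9, ...
         * so data chunks come out as 2, 3, 4, 6, 7, 8, 10, 11. */
        uint64_t c = 2;                  /* ps->next_free starts at 2 */

        for (int i = 0; i < 8; i++) {
                printf("%llu ", (unsigned long long)c);
                c = advance_next_free(c, 3);
        }
        printf("\n");
        return 0;
}
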
587
588static void persistent_commit_exception(struct dm_exception_store *store,
589 struct dm_snap_exception *e,
590 void (*callback) (void *, int success),
591 void *callback_context)
592{
593 unsigned int i;
594 struct pstore *ps = get_info(store);
595 struct disk_exception de;
596 struct commit_callback *cb;
597
598 de.old_chunk = e->old_chunk;
599 de.new_chunk = e->new_chunk;
600 write_exception(ps, ps->current_committed++, &de);
601
602 /*
603 * Add the callback to the back of the array. This code
604 * is the only place where the callback array is
605 * manipulated, and we know that it will never be called
606 * multiple times concurrently.
607 */
608 cb = ps->callbacks + ps->callback_count++;
609 cb->callback = callback;
610 cb->context = callback_context;
611
612 /*
613 * If there are exceptions in flight and we have not yet
614 * filled this metadata area, there's nothing more to do.
615 */
616 if (!atomic_dec_and_test(&ps->pending_count) &&
617 (ps->current_committed != ps->exceptions_per_area))
618 return;
619
620 /*
621 * If we completely filled the current area, then wipe the next one.
622 */
623 if ((ps->current_committed == ps->exceptions_per_area) &&
624 zero_disk_area(ps, ps->current_area + 1))
625 ps->valid = 0;
626
627 /*
628 * Commit exceptions to disk.
629 */
630 if (ps->valid && area_io(ps, WRITE))
631 ps->valid = 0;
632
633 /*
634 * Advance to the next area if this one is full.
635 */
636 if (ps->current_committed == ps->exceptions_per_area) {
637 ps->current_committed = 0;
638 ps->current_area++;
639 zero_memory_area(ps);
640 }
641
642 for (i = 0; i < ps->callback_count; i++) {
643 cb = ps->callbacks + i;
644 cb->callback(cb->context, ps->valid);
645 }
646
647 ps->callback_count = 0;
648}
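
The callback handling in persistent_commit_exception() is a batching pattern: completions are queued per metadata area and fired only once the whole area has been written (or the store invalidated), so a single flush can acknowledge many exceptions. A minimal standalone sketch of that idea, with invented names and a fixed-size queue:

#include <stdio.h>

typedef void (*commit_fn)(void *context, int success);

struct commit_batch {
        struct { commit_fn fn; void *ctx; } cb[16];
        unsigned count;
};

static void batch_add(struct commit_batch *b, commit_fn fn, void *ctx)
{
        b->cb[b->count].fn = fn;
        b->cb[b->count].ctx = ctx;
        b->count++;
}

/* Called once the metadata area is safely on disk (or has failed). */
static void batch_fire(struct commit_batch *b, int success)
{
        for (unsigned i = 0; i < b->count; i++)
                b->cb[i].fn(b->cb[i].ctx, success);
        b->count = 0;
}

static void done(void *ctx, int success)
{
        printf("exception %s: %s\n", (const char *)ctx,
               success ? "committed" : "failed");
}

int main(void)
{
        struct commit_batch b = { .count = 0 };

        batch_add(&b, done, "A");
        batch_add(&b, done, "B");
        batch_fire(&b, 1);      /* one flush acknowledges both */
        return 0;
}
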
649
650static void persistent_drop_snapshot(struct dm_exception_store *store)
651{
652 struct pstore *ps = get_info(store);
653
654 ps->valid = 0;
655 if (write_header(ps))
656 DMWARN("write header failed");
657}
658
659int dm_create_persistent(struct dm_exception_store *store)
660{
661 struct pstore *ps;
662
663 /* allocate the pstore */
664 ps = kmalloc(sizeof(*ps), GFP_KERNEL);
665 if (!ps)
666 return -ENOMEM;
667
668 ps->snap = store->snap;
669 ps->valid = 1;
670 ps->version = SNAPSHOT_DISK_VERSION;
671 ps->area = NULL;
672 ps->next_free = 2; /* skipping the header and first area */
673 ps->current_committed = 0;
674
675 ps->callback_count = 0;
676 atomic_set(&ps->pending_count, 0);
677 ps->callbacks = NULL;
678
679 ps->metadata_wq = create_singlethread_workqueue("ksnaphd");
680 if (!ps->metadata_wq) {
681 kfree(ps);
682 DMERR("couldn't start header metadata update thread");
683 return -ENOMEM;
684 }
685
686 store->destroy = persistent_destroy;
687 store->read_metadata = persistent_read_metadata;
688 store->prepare_exception = persistent_prepare_exception;
689 store->commit_exception = persistent_commit_exception;
690 store->drop_snapshot = persistent_drop_snapshot;
691 store->fraction_full = persistent_fraction_full;
692 store->context = ps;
693
694 return 0;
695}
696
697int dm_persistent_snapshot_init(void)
698{
699 return 0;
700}
701
702void dm_persistent_snapshot_exit(void)
703{
704}
diff --git a/drivers/md/dm-snap-transient.c b/drivers/md/dm-snap-transient.c
new file mode 100644
index 000000000000..7f6e2e6dcb0d
--- /dev/null
+++ b/drivers/md/dm-snap-transient.c
@@ -0,0 +1,98 @@
1/*
2 * Copyright (C) 2001-2002 Sistina Software (UK) Limited.
3 * Copyright (C) 2006-2008 Red Hat GmbH
4 *
5 * This file is released under the GPL.
6 */
7
8#include "dm-exception-store.h"
9#include "dm-snap.h"
10
11#include <linux/mm.h>
12#include <linux/pagemap.h>
13#include <linux/vmalloc.h>
14#include <linux/slab.h>
15#include <linux/dm-io.h>
16
17#define DM_MSG_PREFIX "transient snapshot"
18
19/*-----------------------------------------------------------------
20 * Implementation of the store for non-persistent snapshots.
21 *---------------------------------------------------------------*/
22struct transient_c {
23 sector_t next_free;
24};
25
26static void transient_destroy(struct dm_exception_store *store)
27{
28 kfree(store->context);
29}
30
31static int transient_read_metadata(struct dm_exception_store *store,
32 int (*callback)(void *callback_context,
33 chunk_t old, chunk_t new),
34 void *callback_context)
35{
36 return 0;
37}
38
39static int transient_prepare_exception(struct dm_exception_store *store,
40 struct dm_snap_exception *e)
41{
42 struct transient_c *tc = (struct transient_c *) store->context;
43 sector_t size = get_dev_size(store->snap->cow->bdev);
44
45 if (size < (tc->next_free + store->snap->chunk_size))
46 return -1;
47
48 e->new_chunk = sector_to_chunk(store->snap, tc->next_free);
49 tc->next_free += store->snap->chunk_size;
50
51 return 0;
52}
53
54static void transient_commit_exception(struct dm_exception_store *store,
55 struct dm_snap_exception *e,
56 void (*callback) (void *, int success),
57 void *callback_context)
58{
59 /* Just succeed */
60 callback(callback_context, 1);
61}
62
63static void transient_fraction_full(struct dm_exception_store *store,
64 sector_t *numerator, sector_t *denominator)
65{
66 *numerator = ((struct transient_c *) store->context)->next_free;
67 *denominator = get_dev_size(store->snap->cow->bdev);
68}
69
70int dm_create_transient(struct dm_exception_store *store)
71{
72 struct transient_c *tc;
73
74 store->destroy = transient_destroy;
75 store->read_metadata = transient_read_metadata;
76 store->prepare_exception = transient_prepare_exception;
77 store->commit_exception = transient_commit_exception;
78 store->drop_snapshot = NULL;
79 store->fraction_full = transient_fraction_full;
80
81 tc = kmalloc(sizeof(struct transient_c), GFP_KERNEL);
82 if (!tc)
83 return -ENOMEM;
84
85 tc->next_free = 0;
86 store->context = tc;
87
88 return 0;
89}
90
91int dm_transient_snapshot_init(void)
92{
93 return 0;
94}
95
96void dm_transient_snapshot_exit(void)
97{
98}
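
Both store flavours plug into the same set of function pointers, so dm_create_persistent() and dm_create_transient() differ only in which functions they wire up and what private context they allocate. A self-contained sketch of that dispatch pattern, modelled on the transient store's bump allocator; all identifiers below are illustrative, not the kernel's:

#include <stdio.h>
#include <stdlib.h>

struct store;

/* Mirror of the dm_exception_store shape: function pointers plus a
 * private context pointer. */
struct store {
        int  (*prepare_exception)(struct store *s, unsigned long *new_chunk);
        void (*destroy)(struct store *s);
        void *context;
};

struct transient_ctx { unsigned long next_free; };

static int transient_prepare(struct store *s, unsigned long *new_chunk)
{
        struct transient_ctx *tc = s->context;

        *new_chunk = tc->next_free++;   /* bump allocation, memory only */
        return 0;
}

static void transient_destroy(struct store *s)
{
        free(s->context);
}

/* Analogue of dm_create_transient(): wire the ops, allocate the context. */
static int create_transient(struct store *s)
{
        struct transient_ctx *tc = malloc(sizeof(*tc));

        if (!tc)
                return -1;
        tc->next_free = 0;

        s->prepare_exception = transient_prepare;
        s->destroy = transient_destroy;
        s->context = tc;
        return 0;
}

int main(void)
{
        struct store s;
        unsigned long chunk;

        if (create_transient(&s))
                return 1;
        s.prepare_exception(&s, &chunk);   /* caller is flavour-agnostic */
        printf("allocated chunk %lu\n", chunk);
        s.destroy(&s);
        return 0;
}

The caller only ever goes through the function pointers, which is what lets dm-snap.c treat persistent and transient stores identically.
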
diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c
index 6c96db26b87c..65ff82ff124e 100644
--- a/drivers/md/dm-snap.c
+++ b/drivers/md/dm-snap.c
@@ -9,6 +9,7 @@
9#include <linux/blkdev.h> 9#include <linux/blkdev.h>
10#include <linux/ctype.h> 10#include <linux/ctype.h>
11#include <linux/device-mapper.h> 11#include <linux/device-mapper.h>
12#include <linux/delay.h>
12#include <linux/fs.h> 13#include <linux/fs.h>
13#include <linux/init.h> 14#include <linux/init.h>
14#include <linux/kdev_t.h> 15#include <linux/kdev_t.h>
@@ -20,6 +21,7 @@
20#include <linux/log2.h> 21#include <linux/log2.h>
21#include <linux/dm-kcopyd.h> 22#include <linux/dm-kcopyd.h>
22 23
24#include "dm-exception-store.h"
23#include "dm-snap.h" 25#include "dm-snap.h"
24#include "dm-bio-list.h" 26#include "dm-bio-list.h"
25 27
@@ -428,8 +430,13 @@ out:
428 list_add(&new_e->hash_list, e ? &e->hash_list : l); 430 list_add(&new_e->hash_list, e ? &e->hash_list : l);
429} 431}
430 432
431int dm_add_exception(struct dm_snapshot *s, chunk_t old, chunk_t new) 433/*
434 * Callback used by the exception stores to load exceptions when
435 * initialising.
436 */
437static int dm_add_exception(void *context, chunk_t old, chunk_t new)
432{ 438{
439 struct dm_snapshot *s = context;
433 struct dm_snap_exception *e; 440 struct dm_snap_exception *e;
434 441
435 e = alloc_exception(); 442 e = alloc_exception();
@@ -658,7 +665,7 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv)
658 spin_lock_init(&s->tracked_chunk_lock); 665 spin_lock_init(&s->tracked_chunk_lock);
659 666
660 /* Metadata must only be loaded into one table at once */ 667 /* Metadata must only be loaded into one table at once */
661 r = s->store.read_metadata(&s->store); 668 r = s->store.read_metadata(&s->store, dm_add_exception, (void *)s);
662 if (r < 0) { 669 if (r < 0) {
663 ti->error = "Failed to read snapshot metadata"; 670 ti->error = "Failed to read snapshot metadata";
664 goto bad_load_and_register; 671 goto bad_load_and_register;
@@ -735,7 +742,7 @@ static void snapshot_dtr(struct dm_target *ti)
735 unregister_snapshot(s); 742 unregister_snapshot(s);
736 743
737 while (atomic_read(&s->pending_exceptions_count)) 744 while (atomic_read(&s->pending_exceptions_count))
738 yield(); 745 msleep(1);
739 /* 746 /*
740 * Ensure instructions in mempool_destroy aren't reordered 747 * Ensure instructions in mempool_destroy aren't reordered
741 * before atomic_read. 748 * before atomic_read.
@@ -888,10 +895,10 @@ static void pending_complete(struct dm_snap_pending_exception *pe, int success)
888 895
889 /* 896 /*
890 * Check for conflicting reads. This is extremely improbable, 897 * Check for conflicting reads. This is extremely improbable,
891 * so yield() is sufficient and there is no need for a wait queue. 898 * so msleep(1) is sufficient and there is no need for a wait queue.
892 */ 899 */
893 while (__chunk_is_tracked(s, pe->e.old_chunk)) 900 while (__chunk_is_tracked(s, pe->e.old_chunk))
894 yield(); 901 msleep(1);
895 902
896 /* 903 /*
897 * Add a proper exception, and remove the 904 * Add a proper exception, and remove the
@@ -1404,6 +1411,12 @@ static int __init dm_snapshot_init(void)
1404{ 1411{
1405 int r; 1412 int r;
1406 1413
1414 r = dm_exception_store_init();
1415 if (r) {
1416 DMERR("Failed to initialize exception stores");
1417 return r;
1418 }
1419
1407 r = dm_register_target(&snapshot_target); 1420 r = dm_register_target(&snapshot_target);
1408 if (r) { 1421 if (r) {
1409 DMERR("snapshot target register failed %d", r); 1422 DMERR("snapshot target register failed %d", r);
@@ -1452,39 +1465,34 @@ static int __init dm_snapshot_init(void)
1452 1465
1453 return 0; 1466 return 0;
1454 1467
1455 bad_pending_pool: 1468bad_pending_pool:
1456 kmem_cache_destroy(tracked_chunk_cache); 1469 kmem_cache_destroy(tracked_chunk_cache);
1457 bad5: 1470bad5:
1458 kmem_cache_destroy(pending_cache); 1471 kmem_cache_destroy(pending_cache);
1459 bad4: 1472bad4:
1460 kmem_cache_destroy(exception_cache); 1473 kmem_cache_destroy(exception_cache);
1461 bad3: 1474bad3:
1462 exit_origin_hash(); 1475 exit_origin_hash();
1463 bad2: 1476bad2:
1464 dm_unregister_target(&origin_target); 1477 dm_unregister_target(&origin_target);
1465 bad1: 1478bad1:
1466 dm_unregister_target(&snapshot_target); 1479 dm_unregister_target(&snapshot_target);
1467 return r; 1480 return r;
1468} 1481}
1469 1482
1470static void __exit dm_snapshot_exit(void) 1483static void __exit dm_snapshot_exit(void)
1471{ 1484{
1472 int r;
1473
1474 destroy_workqueue(ksnapd); 1485 destroy_workqueue(ksnapd);
1475 1486
1476 r = dm_unregister_target(&snapshot_target); 1487 dm_unregister_target(&snapshot_target);
1477 if (r) 1488 dm_unregister_target(&origin_target);
1478 DMERR("snapshot unregister failed %d", r);
1479
1480 r = dm_unregister_target(&origin_target);
1481 if (r)
1482 DMERR("origin unregister failed %d", r);
1483 1489
1484 exit_origin_hash(); 1490 exit_origin_hash();
1485 kmem_cache_destroy(pending_cache); 1491 kmem_cache_destroy(pending_cache);
1486 kmem_cache_destroy(exception_cache); 1492 kmem_cache_destroy(exception_cache);
1487 kmem_cache_destroy(tracked_chunk_cache); 1493 kmem_cache_destroy(tracked_chunk_cache);
1494
1495 dm_exception_store_exit();
1488} 1496}
1489 1497
1490/* Module hooks */ 1498/* Module hooks */
diff --git a/drivers/md/dm-snap.h b/drivers/md/dm-snap.h
index 99c0106ede2d..d9e62b43cf85 100644
--- a/drivers/md/dm-snap.h
+++ b/drivers/md/dm-snap.h
@@ -1,6 +1,4 @@
1/* 1/*
2 * dm-snapshot.c
3 *
4 * Copyright (C) 2001-2002 Sistina Software (UK) Limited. 2 * Copyright (C) 2001-2002 Sistina Software (UK) Limited.
5 * 3 *
6 * This file is released under the GPL. 4 * This file is released under the GPL.
@@ -10,6 +8,7 @@
10#define DM_SNAPSHOT_H 8#define DM_SNAPSHOT_H
11 9
12#include <linux/device-mapper.h> 10#include <linux/device-mapper.h>
11#include "dm-exception-store.h"
13#include "dm-bio-list.h" 12#include "dm-bio-list.h"
14#include <linux/blkdev.h> 13#include <linux/blkdev.h>
15#include <linux/workqueue.h> 14#include <linux/workqueue.h>
@@ -20,116 +19,6 @@ struct exception_table {
20 struct list_head *table; 19 struct list_head *table;
21}; 20};
22 21
23/*
24 * The snapshot code deals with largish chunks of the disk at a
25 * time. Typically 32k - 512k.
26 */
27typedef sector_t chunk_t;
28
29/*
30 * An exception is used where an old chunk of data has been
31 * replaced by a new one.
32 * If chunk_t is 64 bits in size, the top 8 bits of new_chunk hold the number
33 * of chunks that follow contiguously. Remaining bits hold the number of the
34 * chunk within the device.
35 */
36struct dm_snap_exception {
37 struct list_head hash_list;
38
39 chunk_t old_chunk;
40 chunk_t new_chunk;
41};
42
43/*
44 * Functions to manipulate consecutive chunks
45 */
46# if defined(CONFIG_LBD) || (BITS_PER_LONG == 64)
47# define DM_CHUNK_CONSECUTIVE_BITS 8
48# define DM_CHUNK_NUMBER_BITS 56
49
50static inline chunk_t dm_chunk_number(chunk_t chunk)
51{
52 return chunk & (chunk_t)((1ULL << DM_CHUNK_NUMBER_BITS) - 1ULL);
53}
54
55static inline unsigned dm_consecutive_chunk_count(struct dm_snap_exception *e)
56{
57 return e->new_chunk >> DM_CHUNK_NUMBER_BITS;
58}
59
60static inline void dm_consecutive_chunk_count_inc(struct dm_snap_exception *e)
61{
62 e->new_chunk += (1ULL << DM_CHUNK_NUMBER_BITS);
63
64 BUG_ON(!dm_consecutive_chunk_count(e));
65}
66
67# else
68# define DM_CHUNK_CONSECUTIVE_BITS 0
69
70static inline chunk_t dm_chunk_number(chunk_t chunk)
71{
72 return chunk;
73}
74
75static inline unsigned dm_consecutive_chunk_count(struct dm_snap_exception *e)
76{
77 return 0;
78}
79
80static inline void dm_consecutive_chunk_count_inc(struct dm_snap_exception *e)
81{
82}
83
84# endif
85
86/*
87 * Abstraction to handle the meta/layout of exception stores (the
88 * COW device).
89 */
90struct exception_store {
91
92 /*
93 * Destroys this object when you've finished with it.
94 */
95 void (*destroy) (struct exception_store *store);
96
97 /*
98 * The target shouldn't read the COW device until this is
99 * called.
100 */
101 int (*read_metadata) (struct exception_store *store);
102
103 /*
104 * Find somewhere to store the next exception.
105 */
106 int (*prepare_exception) (struct exception_store *store,
107 struct dm_snap_exception *e);
108
109 /*
110 * Update the metadata with this exception.
111 */
112 void (*commit_exception) (struct exception_store *store,
113 struct dm_snap_exception *e,
114 void (*callback) (void *, int success),
115 void *callback_context);
116
117 /*
118 * The snapshot is invalid; note this in the metadata.
119 */
120 void (*drop_snapshot) (struct exception_store *store);
121
122 /*
123 * Return how full the snapshot is.
124 */
125 void (*fraction_full) (struct exception_store *store,
126 sector_t *numerator,
127 sector_t *denominator);
128
129 struct dm_snapshot *snap;
130 void *context;
131};
132
133#define DM_TRACKED_CHUNK_HASH_SIZE 16 22#define DM_TRACKED_CHUNK_HASH_SIZE 16
134#define DM_TRACKED_CHUNK_HASH(x) ((unsigned long)(x) & \ 23#define DM_TRACKED_CHUNK_HASH(x) ((unsigned long)(x) & \
135 (DM_TRACKED_CHUNK_HASH_SIZE - 1)) 24 (DM_TRACKED_CHUNK_HASH_SIZE - 1))
@@ -172,7 +61,7 @@ struct dm_snapshot {
172 spinlock_t pe_lock; 61 spinlock_t pe_lock;
173 62
174 /* The on disk metadata handler */ 63 /* The on disk metadata handler */
175 struct exception_store store; 64 struct dm_exception_store store;
176 65
177 struct dm_kcopyd_client *kcopyd_client; 66 struct dm_kcopyd_client *kcopyd_client;
178 67
@@ -187,20 +76,6 @@ struct dm_snapshot {
187}; 76};
188 77
189/* 78/*
190 * Used by the exception stores to load exceptions when
191 * initialising.
192 */
193int dm_add_exception(struct dm_snapshot *s, chunk_t old, chunk_t new);
194
195/*
196 * Constructor and destructor for the default persistent
197 * store.
198 */
199int dm_create_persistent(struct exception_store *store);
200
201int dm_create_transient(struct exception_store *store);
202
203/*
204 * Return the number of sectors in the device. 79 * Return the number of sectors in the device.
205 */ 80 */
206static inline sector_t get_dev_size(struct block_device *bdev) 81static inline sector_t get_dev_size(struct block_device *bdev)
diff --git a/drivers/md/dm-stripe.c b/drivers/md/dm-stripe.c
index 9e4ef88d421e..41569bc60abc 100644
--- a/drivers/md/dm-stripe.c
+++ b/drivers/md/dm-stripe.c
@@ -337,9 +337,7 @@ int __init dm_stripe_init(void)
337 337
338void dm_stripe_exit(void) 338void dm_stripe_exit(void)
339{ 339{
340 if (dm_unregister_target(&stripe_target)) 340 dm_unregister_target(&stripe_target);
341 DMWARN("target unregistration failed");
342
343 destroy_workqueue(kstriped); 341 destroy_workqueue(kstriped);
344 342
345 return; 343 return;
diff --git a/drivers/md/dm-sysfs.c b/drivers/md/dm-sysfs.c
new file mode 100644
index 000000000000..a2a45e6c7c8b
--- /dev/null
+++ b/drivers/md/dm-sysfs.c
@@ -0,0 +1,99 @@
1/*
2 * Copyright (C) 2008 Red Hat, Inc. All rights reserved.
3 *
4 * This file is released under the GPL.
5 */
6
7#include <linux/sysfs.h>
8#include <linux/dm-ioctl.h>
9#include "dm.h"
10
11struct dm_sysfs_attr {
12 struct attribute attr;
13 ssize_t (*show)(struct mapped_device *, char *);
14 ssize_t (*store)(struct mapped_device *, char *);
15};
16
17#define DM_ATTR_RO(_name) \
18struct dm_sysfs_attr dm_attr_##_name = \
19 __ATTR(_name, S_IRUGO, dm_attr_##_name##_show, NULL)
20
21static ssize_t dm_attr_show(struct kobject *kobj, struct attribute *attr,
22 char *page)
23{
24 struct dm_sysfs_attr *dm_attr;
25 struct mapped_device *md;
26 ssize_t ret;
27
28 dm_attr = container_of(attr, struct dm_sysfs_attr, attr);
29 if (!dm_attr->show)
30 return -EIO;
31
32 md = dm_get_from_kobject(kobj);
33 if (!md)
34 return -EINVAL;
35
36 ret = dm_attr->show(md, page);
37 dm_put(md);
38
39 return ret;
40}
41
42static ssize_t dm_attr_name_show(struct mapped_device *md, char *buf)
43{
44 if (dm_copy_name_and_uuid(md, buf, NULL))
45 return -EIO;
46
47 strcat(buf, "\n");
48 return strlen(buf);
49}
50
51static ssize_t dm_attr_uuid_show(struct mapped_device *md, char *buf)
52{
53 if (dm_copy_name_and_uuid(md, NULL, buf))
54 return -EIO;
55
56 strcat(buf, "\n");
57 return strlen(buf);
58}
59
60static DM_ATTR_RO(name);
61static DM_ATTR_RO(uuid);
62
63static struct attribute *dm_attrs[] = {
64 &dm_attr_name.attr,
65 &dm_attr_uuid.attr,
66 NULL,
67};
68
69static struct sysfs_ops dm_sysfs_ops = {
70 .show = dm_attr_show,
71};
72
73/*
74 * The dm kobject is embedded in the mapped_device structure,
75 * so there is no need to define a release function here.
76 */
77static struct kobj_type dm_ktype = {
78 .sysfs_ops = &dm_sysfs_ops,
79 .default_attrs = dm_attrs,
80};
81
82/*
83 * Initialize kobj.
84 * Because nobody is using md yet, there is no need to call dm_get/put explicitly.
85 */
86int dm_sysfs_init(struct mapped_device *md)
87{
88 return kobject_init_and_add(dm_kobject(md), &dm_ktype,
89 &disk_to_dev(dm_disk(md))->kobj,
90 "%s", "dm");
91}
92
93/*
94 * Remove kobj; called after all references have been dropped.
95 */
96void dm_sysfs_exit(struct mapped_device *md)
97{
98 kobject_put(dm_kobject(md));
99}
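
dm-sysfs.c uses the standard kobject attribute trick: a generic struct attribute is embedded in a wrapper that also carries a typed show() method, and the dispatch routine recovers the wrapper with container_of(). A userspace model of that dispatch, with invented names and a local container_of macro:

#include <stdio.h>
#include <stddef.h>

#define container_of(ptr, type, member) \
        ((type *)((char *)(ptr) - offsetof(type, member)))

struct attribute { const char *name; };

struct dev_attr {
        struct attribute attr;              /* generic part, passed around  */
        int (*show)(char *buf, size_t len); /* typed part, via container_of */
};

static int name_show(char *buf, size_t len)
{
        return snprintf(buf, len, "dm-0\n");
}

static struct dev_attr name_attr = {
        .attr = { .name = "name" },
        .show = name_show,
};

/* The generic layer only ever sees 'struct attribute *'. */
static int attr_show(struct attribute *attr, char *buf, size_t len)
{
        struct dev_attr *da = container_of(attr, struct dev_attr, attr);

        if (!da->show)
                return -1;
        return da->show(buf, len);
}

int main(void)
{
        char buf[32];

        if (attr_show(&name_attr.attr, buf, sizeof(buf)) > 0)
                fputs(buf, stdout);
        return 0;
}
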
diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index 04e5fd742c2c..2fd66c30f7f8 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -1,6 +1,6 @@
1/* 1/*
2 * Copyright (C) 2001 Sistina Software (UK) Limited. 2 * Copyright (C) 2001 Sistina Software (UK) Limited.
3 * Copyright (C) 2004 Red Hat, Inc. All rights reserved. 3 * Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved.
4 * 4 *
5 * This file is released under the GPL. 5 * This file is released under the GPL.
6 */ 6 */
@@ -15,6 +15,7 @@
15#include <linux/slab.h> 15#include <linux/slab.h>
16#include <linux/interrupt.h> 16#include <linux/interrupt.h>
17#include <linux/mutex.h> 17#include <linux/mutex.h>
18#include <linux/delay.h>
18#include <asm/atomic.h> 19#include <asm/atomic.h>
19 20
20#define DM_MSG_PREFIX "table" 21#define DM_MSG_PREFIX "table"
@@ -24,6 +25,19 @@
24#define KEYS_PER_NODE (NODE_SIZE / sizeof(sector_t)) 25#define KEYS_PER_NODE (NODE_SIZE / sizeof(sector_t))
25#define CHILDREN_PER_NODE (KEYS_PER_NODE + 1) 26#define CHILDREN_PER_NODE (KEYS_PER_NODE + 1)
26 27
28/*
29 * The table always has exactly one reference, from either mapped_device->map
30 * or hash_cell->new_map. This reference is not counted in table->holders.
31 * A pair of dm_table_create/dm_table_destroy functions is used for table
32 * creation/destruction.
33 *
34 * Temporary references from the other code increase table->holders. A pair
35 * of dm_table_get/dm_table_put functions is used to manipulate it.
36 *
37 * When the table is about to be destroyed, we wait for table->holders to
38 * drop to zero.
39 */
40
27struct dm_table { 41struct dm_table {
28 struct mapped_device *md; 42 struct mapped_device *md;
29 atomic_t holders; 43 atomic_t holders;
@@ -38,6 +52,8 @@ struct dm_table {
38 sector_t *highs; 52 sector_t *highs;
39 struct dm_target *targets; 53 struct dm_target *targets;
40 54
55 unsigned barriers_supported:1;
56
41 /* 57 /*
42 * Indicates the rw permissions for the new logical 58 * Indicates the rw permissions for the new logical
43 * device. This should be a combination of FMODE_READ 59 * device. This should be a combination of FMODE_READ
@@ -226,7 +242,8 @@ int dm_table_create(struct dm_table **result, fmode_t mode,
226 return -ENOMEM; 242 return -ENOMEM;
227 243
228 INIT_LIST_HEAD(&t->devices); 244 INIT_LIST_HEAD(&t->devices);
229 atomic_set(&t->holders, 1); 245 atomic_set(&t->holders, 0);
246 t->barriers_supported = 1;
230 247
231 if (!num_targets) 248 if (!num_targets)
232 num_targets = KEYS_PER_NODE; 249 num_targets = KEYS_PER_NODE;
@@ -256,10 +273,14 @@ static void free_devices(struct list_head *devices)
256 } 273 }
257} 274}
258 275
259static void table_destroy(struct dm_table *t) 276void dm_table_destroy(struct dm_table *t)
260{ 277{
261 unsigned int i; 278 unsigned int i;
262 279
280 while (atomic_read(&t->holders))
281 msleep(1);
282 smp_mb();
283
263 /* free the indexes (see dm_table_complete) */ 284 /* free the indexes (see dm_table_complete) */
264 if (t->depth >= 2) 285 if (t->depth >= 2)
265 vfree(t->index[t->depth - 2]); 286 vfree(t->index[t->depth - 2]);
@@ -297,8 +318,8 @@ void dm_table_put(struct dm_table *t)
297 if (!t) 318 if (!t)
298 return; 319 return;
299 320
300 if (atomic_dec_and_test(&t->holders)) 321 smp_mb__before_atomic_dec();
301 table_destroy(t); 322 atomic_dec(&t->holders);
302} 323}
303 324
304/* 325/*
@@ -728,6 +749,10 @@ int dm_table_add_target(struct dm_table *t, const char *type,
728 /* FIXME: the plan is to combine high here and then have 749 /* FIXME: the plan is to combine high here and then have
729 * the merge fn apply the target level restrictions. */ 750 * the merge fn apply the target level restrictions. */
730 combine_restrictions_low(&t->limits, &tgt->limits); 751 combine_restrictions_low(&t->limits, &tgt->limits);
752
753 if (!(tgt->type->features & DM_TARGET_SUPPORTS_BARRIERS))
754 t->barriers_supported = 0;
755
731 return 0; 756 return 0;
732 757
733 bad: 758 bad:
@@ -772,6 +797,12 @@ int dm_table_complete(struct dm_table *t)
772 797
773 check_for_valid_limits(&t->limits); 798 check_for_valid_limits(&t->limits);
774 799
800 /*
801 * We only support barriers if there is exactly one underlying device.
802 */
803 if (!list_is_singular(&t->devices))
804 t->barriers_supported = 0;
805
775 /* how many indexes will the btree have ? */ 806 /* how many indexes will the btree have ? */
776 leaf_nodes = dm_div_up(t->num_targets, KEYS_PER_NODE); 807 leaf_nodes = dm_div_up(t->num_targets, KEYS_PER_NODE);
777 t->depth = 1 + int_log(leaf_nodes, CHILDREN_PER_NODE); 808 t->depth = 1 + int_log(leaf_nodes, CHILDREN_PER_NODE);
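
Taken together, the barrier hunks implement a capability intersection: a table advertises barrier support only if every target sets DM_TARGET_SUPPORTS_BARRIERS and the table maps exactly one underlying device, and __split_bio() then fails barrier bios with -EOPNOTSUPP whenever dm_table_barrier_ok() says no. A small sketch of the aggregation rule (flag value and names illustrative):

#include <stdio.h>

#define TARGET_SUPPORTS_BARRIERS 0x1

struct target { unsigned features; };

static int table_barriers_ok(const struct target *tgts, unsigned n_targets,
                             unsigned n_devices)
{
        /* Only a single underlying device can preserve barrier ordering. */
        if (n_devices != 1)
                return 0;

        for (unsigned i = 0; i < n_targets; i++)
                if (!(tgts[i].features & TARGET_SUPPORTS_BARRIERS))
                        return 0;

        return 1;
}

int main(void)
{
        struct target t[2] = { { TARGET_SUPPORTS_BARRIERS }, { 0 } };

        printf("%d\n", table_barriers_ok(t, 1, 1)); /* 1: supported     */
        printf("%d\n", table_barriers_ok(t, 2, 1)); /* 0: t[1] lacks it */
        printf("%d\n", table_barriers_ok(t, 1, 2)); /* 0: two devices   */
        return 0;
}
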
@@ -986,6 +1017,12 @@ struct mapped_device *dm_table_get_md(struct dm_table *t)
986 return t->md; 1017 return t->md;
987} 1018}
988 1019
1020int dm_table_barrier_ok(struct dm_table *t)
1021{
1022 return t->barriers_supported;
1023}
1024EXPORT_SYMBOL(dm_table_barrier_ok);
1025
989EXPORT_SYMBOL(dm_vcalloc); 1026EXPORT_SYMBOL(dm_vcalloc);
990EXPORT_SYMBOL(dm_get_device); 1027EXPORT_SYMBOL(dm_get_device);
991EXPORT_SYMBOL(dm_put_device); 1028EXPORT_SYMBOL(dm_put_device);
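
The new reference scheme is worth restating: the single structural reference (mapped_device->map or hash_cell->new_map) is implicit, temporary users go through dm_table_get()/dm_table_put() which only adjust holders, and dm_table_destroy() waits for holders to drain before freeing anything. A simplified userspace model using C11 atomics, with usleep() standing in for msleep() and the explicit smp_mb() calls folded into the atomics' default ordering:

#include <stdio.h>
#include <stdatomic.h>
#include <unistd.h>

struct table {
        atomic_int holders;     /* temporary references only */
        /* ... targets, devices, indexes ... */
};

static void table_get(struct table *t) { atomic_fetch_add(&t->holders, 1); }
static void table_put(struct table *t) { atomic_fetch_sub(&t->holders, 1); }

/* Called by the single structural owner; waits out temporary users. */
static void table_destroy(struct table *t)
{
        while (atomic_load(&t->holders))
                usleep(1000);   /* the kernel code uses msleep(1) */
        /* now safe to free indexes, drop devices, etc. */
        printf("destroyed\n");
}

int main(void)
{
        struct table t = { .holders = 0 };

        table_get(&t);          /* some other code path peeks at the table */
        table_put(&t);
        table_destroy(&t);      /* returns once no holders remain */
        return 0;
}
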
diff --git a/drivers/md/dm-target.c b/drivers/md/dm-target.c
index 835cf95b857f..7decf10006e4 100644
--- a/drivers/md/dm-target.c
+++ b/drivers/md/dm-target.c
@@ -130,26 +130,26 @@ int dm_register_target(struct target_type *t)
130 return rv; 130 return rv;
131} 131}
132 132
133int dm_unregister_target(struct target_type *t) 133void dm_unregister_target(struct target_type *t)
134{ 134{
135 struct tt_internal *ti; 135 struct tt_internal *ti;
136 136
137 down_write(&_lock); 137 down_write(&_lock);
138 if (!(ti = __find_target_type(t->name))) { 138 if (!(ti = __find_target_type(t->name))) {
139 up_write(&_lock); 139 DMCRIT("Unregistering unrecognised target: %s", t->name);
140 return -EINVAL; 140 BUG();
141 } 141 }
142 142
143 if (ti->use) { 143 if (ti->use) {
144 up_write(&_lock); 144 DMCRIT("Attempt to unregister target still in use: %s",
145 return -ETXTBSY; 145 t->name);
146 BUG();
146 } 147 }
147 148
148 list_del(&ti->list); 149 list_del(&ti->list);
149 kfree(ti); 150 kfree(ti);
150 151
151 up_write(&_lock); 152 up_write(&_lock);
152 return 0;
153} 153}
154 154
155/* 155/*
@@ -187,8 +187,7 @@ int __init dm_target_init(void)
187 187
188void dm_target_exit(void) 188void dm_target_exit(void)
189{ 189{
190 if (dm_unregister_target(&error_target)) 190 dm_unregister_target(&error_target);
191 DMWARN("error target unregistration failed");
192} 191}
193 192
194EXPORT_SYMBOL(dm_register_target); 193EXPORT_SYMBOL(dm_register_target);
diff --git a/drivers/md/dm-zero.c b/drivers/md/dm-zero.c
index cdbf126ec106..bbc97030c0c2 100644
--- a/drivers/md/dm-zero.c
+++ b/drivers/md/dm-zero.c
@@ -69,10 +69,7 @@ static int __init dm_zero_init(void)
69 69
70static void __exit dm_zero_exit(void) 70static void __exit dm_zero_exit(void)
71{ 71{
72 int r = dm_unregister_target(&zero_target); 72 dm_unregister_target(&zero_target);
73
74 if (r < 0)
75 DMERR("unregister failed %d", r);
76} 73}
77 74
78module_init(dm_zero_init) 75module_init(dm_zero_init)
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index c99e4728ff41..51ba1db4b3e7 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -1,6 +1,6 @@
1/* 1/*
2 * Copyright (C) 2001, 2002 Sistina Software (UK) Limited. 2 * Copyright (C) 2001, 2002 Sistina Software (UK) Limited.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. 3 * Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved.
4 * 4 *
5 * This file is released under the GPL. 5 * This file is released under the GPL.
6 */ 6 */
@@ -21,6 +21,7 @@
21#include <linux/idr.h> 21#include <linux/idr.h>
22#include <linux/hdreg.h> 22#include <linux/hdreg.h>
23#include <linux/blktrace_api.h> 23#include <linux/blktrace_api.h>
24#include <trace/block.h>
24 25
25#define DM_MSG_PREFIX "core" 26#define DM_MSG_PREFIX "core"
26 27
@@ -31,6 +32,7 @@ static unsigned int _major = 0;
31 32
32static DEFINE_SPINLOCK(_minor_lock); 33static DEFINE_SPINLOCK(_minor_lock);
33/* 34/*
35 * For bio-based dm.
34 * One of these is allocated per bio. 36 * One of these is allocated per bio.
35 */ 37 */
36struct dm_io { 38struct dm_io {
@@ -42,6 +44,7 @@ struct dm_io {
42}; 44};
43 45
44/* 46/*
47 * For bio-based dm.
45 * One of these is allocated per target within a bio. Hopefully 48 * One of these is allocated per target within a bio. Hopefully
46 * this will be simplified out one day. 49 * this will be simplified out one day.
47 */ 50 */
@@ -51,6 +54,29 @@ struct dm_target_io {
51 union map_info info; 54 union map_info info;
52}; 55};
53 56
57DEFINE_TRACE(block_bio_complete);
58
59/*
60 * For request-based dm.
61 * One of these is allocated per request.
62 */
63struct dm_rq_target_io {
64 struct mapped_device *md;
65 struct dm_target *ti;
66 struct request *orig, clone;
67 int error;
68 union map_info info;
69};
70
71/*
72 * For request-based dm.
73 * One of these is allocated per bio.
74 */
75struct dm_rq_clone_bio_info {
76 struct bio *orig;
77 struct request *rq;
78};
79
54union map_info *dm_get_mapinfo(struct bio *bio) 80union map_info *dm_get_mapinfo(struct bio *bio)
55{ 81{
56 if (bio && bio->bi_private) 82 if (bio && bio->bi_private)
@@ -141,11 +167,16 @@ struct mapped_device {
141 167
142 /* forced geometry settings */ 168 /* forced geometry settings */
143 struct hd_geometry geometry; 169 struct hd_geometry geometry;
170
171 /* sysfs handle */
172 struct kobject kobj;
144}; 173};
145 174
146#define MIN_IOS 256 175#define MIN_IOS 256
147static struct kmem_cache *_io_cache; 176static struct kmem_cache *_io_cache;
148static struct kmem_cache *_tio_cache; 177static struct kmem_cache *_tio_cache;
178static struct kmem_cache *_rq_tio_cache;
179static struct kmem_cache *_rq_bio_info_cache;
149 180
150static int __init local_init(void) 181static int __init local_init(void)
151{ 182{
@@ -161,9 +192,17 @@ static int __init local_init(void)
161 if (!_tio_cache) 192 if (!_tio_cache)
162 goto out_free_io_cache; 193 goto out_free_io_cache;
163 194
195 _rq_tio_cache = KMEM_CACHE(dm_rq_target_io, 0);
196 if (!_rq_tio_cache)
197 goto out_free_tio_cache;
198
199 _rq_bio_info_cache = KMEM_CACHE(dm_rq_clone_bio_info, 0);
200 if (!_rq_bio_info_cache)
201 goto out_free_rq_tio_cache;
202
164 r = dm_uevent_init(); 203 r = dm_uevent_init();
165 if (r) 204 if (r)
166 goto out_free_tio_cache; 205 goto out_free_rq_bio_info_cache;
167 206
168 _major = major; 207 _major = major;
169 r = register_blkdev(_major, _name); 208 r = register_blkdev(_major, _name);
@@ -177,6 +216,10 @@ static int __init local_init(void)
177 216
178out_uevent_exit: 217out_uevent_exit:
179 dm_uevent_exit(); 218 dm_uevent_exit();
219out_free_rq_bio_info_cache:
220 kmem_cache_destroy(_rq_bio_info_cache);
221out_free_rq_tio_cache:
222 kmem_cache_destroy(_rq_tio_cache);
180out_free_tio_cache: 223out_free_tio_cache:
181 kmem_cache_destroy(_tio_cache); 224 kmem_cache_destroy(_tio_cache);
182out_free_io_cache: 225out_free_io_cache:
@@ -187,6 +230,8 @@ out_free_io_cache:
187 230
188static void local_exit(void) 231static void local_exit(void)
189{ 232{
233 kmem_cache_destroy(_rq_bio_info_cache);
234 kmem_cache_destroy(_rq_tio_cache);
190 kmem_cache_destroy(_tio_cache); 235 kmem_cache_destroy(_tio_cache);
191 kmem_cache_destroy(_io_cache); 236 kmem_cache_destroy(_io_cache);
192 unregister_blkdev(_major, _name); 237 unregister_blkdev(_major, _name);
@@ -504,8 +549,7 @@ static void dec_pending(struct dm_io *io, int error)
504 end_io_acct(io); 549 end_io_acct(io);
505 550
506 if (io->error != DM_ENDIO_REQUEUE) { 551 if (io->error != DM_ENDIO_REQUEUE) {
507 blk_add_trace_bio(io->md->queue, io->bio, 552 trace_block_bio_complete(io->md->queue, io->bio);
508 BLK_TA_COMPLETE);
509 553
510 bio_endio(io->bio, io->error); 554 bio_endio(io->bio, io->error);
511 } 555 }
@@ -598,7 +642,7 @@ static void __map_bio(struct dm_target *ti, struct bio *clone,
598 if (r == DM_MAPIO_REMAPPED) { 642 if (r == DM_MAPIO_REMAPPED) {
599 /* the bio has been remapped so dispatch it */ 643 /* the bio has been remapped so dispatch it */
600 644
601 blk_add_trace_remap(bdev_get_queue(clone->bi_bdev), clone, 645 trace_block_remap(bdev_get_queue(clone->bi_bdev), clone,
602 tio->io->bio->bi_bdev->bd_dev, 646 tio->io->bio->bi_bdev->bd_dev,
603 clone->bi_sector, sector); 647 clone->bi_sector, sector);
604 648
@@ -794,7 +838,11 @@ static int __split_bio(struct mapped_device *md, struct bio *bio)
794 ci.map = dm_get_table(md); 838 ci.map = dm_get_table(md);
795 if (unlikely(!ci.map)) 839 if (unlikely(!ci.map))
796 return -EIO; 840 return -EIO;
797 841 if (unlikely(bio_barrier(bio) && !dm_table_barrier_ok(ci.map))) {
842 dm_table_put(ci.map);
843 bio_endio(bio, -EOPNOTSUPP);
844 return 0;
845 }
798 ci.md = md; 846 ci.md = md;
799 ci.bio = bio; 847 ci.bio = bio;
800 ci.io = alloc_io(md); 848 ci.io = alloc_io(md);
@@ -878,15 +926,6 @@ static int dm_request(struct request_queue *q, struct bio *bio)
878 struct mapped_device *md = q->queuedata; 926 struct mapped_device *md = q->queuedata;
879 int cpu; 927 int cpu;
880 928
881 /*
882 * There is no use in forwarding any barrier request since we can't
883 * guarantee it is (or can be) handled by the targets correctly.
884 */
885 if (unlikely(bio_barrier(bio))) {
886 bio_endio(bio, -EOPNOTSUPP);
887 return 0;
888 }
889
890 down_read(&md->io_lock); 929 down_read(&md->io_lock);
891 930
892 cpu = part_stat_lock(); 931 cpu = part_stat_lock();
@@ -941,8 +980,6 @@ static int dm_any_congested(void *congested_data, int bdi_bits)
941 struct mapped_device *md = congested_data; 980 struct mapped_device *md = congested_data;
942 struct dm_table *map; 981 struct dm_table *map;
943 982
944 atomic_inc(&md->pending);
945
946 if (!test_bit(DMF_BLOCK_IO, &md->flags)) { 983 if (!test_bit(DMF_BLOCK_IO, &md->flags)) {
947 map = dm_get_table(md); 984 map = dm_get_table(md);
948 if (map) { 985 if (map) {
@@ -951,10 +988,6 @@ static int dm_any_congested(void *congested_data, int bdi_bits)
951 } 988 }
952 } 989 }
953 990
954 if (!atomic_dec_return(&md->pending))
955 /* nudge anyone waiting on suspend queue */
956 wake_up(&md->wait);
957
958 return r; 991 return r;
959} 992}
960 993
@@ -1091,7 +1124,7 @@ static struct mapped_device *alloc_dev(int minor)
1091 if (!md->tio_pool) 1124 if (!md->tio_pool)
1092 goto bad_tio_pool; 1125 goto bad_tio_pool;
1093 1126
1094 md->bs = bioset_create(16, 16); 1127 md->bs = bioset_create(16, 0);
1095 if (!md->bs) 1128 if (!md->bs)
1096 goto bad_no_bioset; 1129 goto bad_no_bioset;
1097 1130
@@ -1214,10 +1247,12 @@ static int __bind(struct mapped_device *md, struct dm_table *t)
1214 1247
1215 if (md->suspended_bdev) 1248 if (md->suspended_bdev)
1216 __set_size(md, size); 1249 __set_size(md, size);
1217 if (size == 0) 1250
1251 if (!size) {
1252 dm_table_destroy(t);
1218 return 0; 1253 return 0;
1254 }
1219 1255
1220 dm_table_get(t);
1221 dm_table_event_callback(t, event_callback, md); 1256 dm_table_event_callback(t, event_callback, md);
1222 1257
1223 write_lock(&md->map_lock); 1258 write_lock(&md->map_lock);
@@ -1239,7 +1274,7 @@ static void __unbind(struct mapped_device *md)
1239 write_lock(&md->map_lock); 1274 write_lock(&md->map_lock);
1240 md->map = NULL; 1275 md->map = NULL;
1241 write_unlock(&md->map_lock); 1276 write_unlock(&md->map_lock);
1242 dm_table_put(map); 1277 dm_table_destroy(map);
1243} 1278}
1244 1279
1245/* 1280/*
@@ -1253,6 +1288,8 @@ int dm_create(int minor, struct mapped_device **result)
1253 if (!md) 1288 if (!md)
1254 return -ENXIO; 1289 return -ENXIO;
1255 1290
1291 dm_sysfs_init(md);
1292
1256 *result = md; 1293 *result = md;
1257 return 0; 1294 return 0;
1258} 1295}
@@ -1328,8 +1365,9 @@ void dm_put(struct mapped_device *md)
1328 dm_table_presuspend_targets(map); 1365 dm_table_presuspend_targets(map);
1329 dm_table_postsuspend_targets(map); 1366 dm_table_postsuspend_targets(map);
1330 } 1367 }
1331 __unbind(md); 1368 dm_sysfs_exit(md);
1332 dm_table_put(map); 1369 dm_table_put(map);
1370 __unbind(md);
1333 free_dev(md); 1371 free_dev(md);
1334 } 1372 }
1335} 1373}
@@ -1667,6 +1705,27 @@ struct gendisk *dm_disk(struct mapped_device *md)
1667 return md->disk; 1705 return md->disk;
1668} 1706}
1669 1707
1708struct kobject *dm_kobject(struct mapped_device *md)
1709{
1710 return &md->kobj;
1711}
1712
1713/*
1714 * struct mapped_device should not be exported outside of dm.c
1715 * so use this check to verify that kobj is part of the md structure.
1716 */
1717struct mapped_device *dm_get_from_kobject(struct kobject *kobj)
1718{
1719 struct mapped_device *md;
1720
1721 md = container_of(kobj, struct mapped_device, kobj);
1722 if (&md->kobj != kobj)
1723 return NULL;
1724
1725 dm_get(md);
1726 return md;
1727}
1728
1670int dm_suspended(struct mapped_device *md) 1729int dm_suspended(struct mapped_device *md)
1671{ 1730{
1672 return test_bit(DMF_SUSPENDED, &md->flags); 1731 return test_bit(DMF_SUSPENDED, &md->flags);
diff --git a/drivers/md/dm.h b/drivers/md/dm.h
index 0ade60cdef42..20194e000c5a 100644
--- a/drivers/md/dm.h
+++ b/drivers/md/dm.h
@@ -36,6 +36,7 @@ struct dm_table;
36/*----------------------------------------------------------------- 36/*-----------------------------------------------------------------
37 * Internal table functions. 37 * Internal table functions.
38 *---------------------------------------------------------------*/ 38 *---------------------------------------------------------------*/
39void dm_table_destroy(struct dm_table *t);
39void dm_table_event_callback(struct dm_table *t, 40void dm_table_event_callback(struct dm_table *t,
40 void (*fn)(void *), void *context); 41 void (*fn)(void *), void *context);
41struct dm_target *dm_table_get_target(struct dm_table *t, unsigned int index); 42struct dm_target *dm_table_get_target(struct dm_table *t, unsigned int index);
@@ -51,6 +52,7 @@ int dm_table_any_congested(struct dm_table *t, int bdi_bits);
51 * To check the return value from dm_table_find_target(). 52 * To check the return value from dm_table_find_target().
52 */ 53 */
53#define dm_target_is_valid(t) ((t)->table) 54#define dm_target_is_valid(t) ((t)->table)
55int dm_table_barrier_ok(struct dm_table *t);
54 56
55/*----------------------------------------------------------------- 57/*-----------------------------------------------------------------
56 * A registry of target types. 58 * A registry of target types.
@@ -72,6 +74,14 @@ int dm_interface_init(void);
72void dm_interface_exit(void); 74void dm_interface_exit(void);
73 75
74/* 76/*
77 * sysfs interface
78 */
79int dm_sysfs_init(struct mapped_device *md);
80void dm_sysfs_exit(struct mapped_device *md);
81struct kobject *dm_kobject(struct mapped_device *md);
82struct mapped_device *dm_get_from_kobject(struct kobject *kobj);
83
84/*
75 * Targets for linear and striped mappings 85 * Targets for linear and striped mappings
76 */ 86 */
77int dm_linear_init(void); 87int dm_linear_init(void);
diff --git a/drivers/md/faulty.c b/drivers/md/faulty.c
index f26c1f9a475b..86d9adf90e79 100644
--- a/drivers/md/faulty.c
+++ b/drivers/md/faulty.c
@@ -283,7 +283,6 @@ static int reconfig(mddev_t *mddev, int layout, int chunk_size)
283static int run(mddev_t *mddev) 283static int run(mddev_t *mddev)
284{ 284{
285 mdk_rdev_t *rdev; 285 mdk_rdev_t *rdev;
286 struct list_head *tmp;
287 int i; 286 int i;
288 287
289 conf_t *conf = kmalloc(sizeof(*conf), GFP_KERNEL); 288 conf_t *conf = kmalloc(sizeof(*conf), GFP_KERNEL);
@@ -296,7 +295,7 @@ static int run(mddev_t *mddev)
296 } 295 }
297 conf->nfaults = 0; 296 conf->nfaults = 0;
298 297
299 rdev_for_each(rdev, tmp, mddev) 298 list_for_each_entry(rdev, &mddev->disks, same_set)
300 conf->rdev = rdev; 299 conf->rdev = rdev;
301 300
302 mddev->array_sectors = mddev->size * 2; 301 mddev->array_sectors = mddev->size * 2;
diff --git a/drivers/md/linear.c b/drivers/md/linear.c
index 3b90c5c924ec..1e3aea9eecf1 100644
--- a/drivers/md/linear.c
+++ b/drivers/md/linear.c
@@ -105,7 +105,6 @@ static linear_conf_t *linear_conf(mddev_t *mddev, int raid_disks)
105 int i, nb_zone, cnt; 105 int i, nb_zone, cnt;
106 sector_t min_sectors; 106 sector_t min_sectors;
107 sector_t curr_sector; 107 sector_t curr_sector;
108 struct list_head *tmp;
109 108
110 conf = kzalloc (sizeof (*conf) + raid_disks*sizeof(dev_info_t), 109 conf = kzalloc (sizeof (*conf) + raid_disks*sizeof(dev_info_t),
111 GFP_KERNEL); 110 GFP_KERNEL);
@@ -115,7 +114,7 @@ static linear_conf_t *linear_conf(mddev_t *mddev, int raid_disks)
115 cnt = 0; 114 cnt = 0;
116 conf->array_sectors = 0; 115 conf->array_sectors = 0;
117 116
118 rdev_for_each(rdev, tmp, mddev) { 117 list_for_each_entry(rdev, &mddev->disks, same_set) {
119 int j = rdev->raid_disk; 118 int j = rdev->raid_disk;
120 dev_info_t *disk = conf->disks + j; 119 dev_info_t *disk = conf->disks + j;
121 120
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 1b1d32694f6f..41e2509bf896 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -214,20 +214,33 @@ static inline mddev_t *mddev_get(mddev_t *mddev)
214 return mddev; 214 return mddev;
215} 215}
216 216
217static void mddev_delayed_delete(struct work_struct *ws)
218{
219 mddev_t *mddev = container_of(ws, mddev_t, del_work);
220 kobject_del(&mddev->kobj);
221 kobject_put(&mddev->kobj);
222}
223
217static void mddev_put(mddev_t *mddev) 224static void mddev_put(mddev_t *mddev)
218{ 225{
219 if (!atomic_dec_and_lock(&mddev->active, &all_mddevs_lock)) 226 if (!atomic_dec_and_lock(&mddev->active, &all_mddevs_lock))
220 return; 227 return;
221 if (!mddev->raid_disks && list_empty(&mddev->disks)) { 228 if (!mddev->raid_disks && list_empty(&mddev->disks) &&
229 !mddev->hold_active) {
222 list_del(&mddev->all_mddevs); 230 list_del(&mddev->all_mddevs);
223 spin_unlock(&all_mddevs_lock); 231 if (mddev->gendisk) {
224 blk_cleanup_queue(mddev->queue); 232 /* we did a probe so need to clean up.
225 if (mddev->sysfs_state) 233 * Call schedule_work inside the spinlock
226 sysfs_put(mddev->sysfs_state); 234 * so that flush_scheduled_work() after
227 mddev->sysfs_state = NULL; 235 * mddev_find will succeed in waiting for the
228 kobject_put(&mddev->kobj); 236 * work to be done.
229 } else 237 */
230 spin_unlock(&all_mddevs_lock); 238 INIT_WORK(&mddev->del_work, mddev_delayed_delete);
239 schedule_work(&mddev->del_work);
240 } else
241 kfree(mddev);
242 }
243 spin_unlock(&all_mddevs_lock);
231} 244}
232 245
233static mddev_t * mddev_find(dev_t unit) 246static mddev_t * mddev_find(dev_t unit)
@@ -236,15 +249,50 @@ static mddev_t * mddev_find(dev_t unit)
236 249
237 retry: 250 retry:
238 spin_lock(&all_mddevs_lock); 251 spin_lock(&all_mddevs_lock);
239 list_for_each_entry(mddev, &all_mddevs, all_mddevs) 252
240 if (mddev->unit == unit) { 253 if (unit) {
241 mddev_get(mddev); 254 list_for_each_entry(mddev, &all_mddevs, all_mddevs)
255 if (mddev->unit == unit) {
256 mddev_get(mddev);
257 spin_unlock(&all_mddevs_lock);
258 kfree(new);
259 return mddev;
260 }
261
262 if (new) {
263 list_add(&new->all_mddevs, &all_mddevs);
242 spin_unlock(&all_mddevs_lock); 264 spin_unlock(&all_mddevs_lock);
243 kfree(new); 265 new->hold_active = UNTIL_IOCTL;
244 return mddev; 266 return new;
245 } 267 }
246 268 } else if (new) {
247 if (new) { 269 /* find an unused unit number */
270 static int next_minor = 512;
271 int start = next_minor;
272 int is_free = 0;
273 int dev = 0;
274 while (!is_free) {
275 dev = MKDEV(MD_MAJOR, next_minor);
276 next_minor++;
277 if (next_minor > MINORMASK)
278 next_minor = 0;
279 if (next_minor == start) {
280 /* Oh dear, all in use. */
281 spin_unlock(&all_mddevs_lock);
282 kfree(new);
283 return NULL;
284 }
285
286 is_free = 1;
287 list_for_each_entry(mddev, &all_mddevs, all_mddevs)
288 if (mddev->unit == dev) {
289 is_free = 0;
290 break;
291 }
292 }
293 new->unit = dev;
294 new->md_minor = MINOR(dev);
295 new->hold_active = UNTIL_STOP;
248 list_add(&new->all_mddevs, &all_mddevs); 296 list_add(&new->all_mddevs, &all_mddevs);
249 spin_unlock(&all_mddevs_lock); 297 spin_unlock(&all_mddevs_lock);
250 return new; 298 return new;
@@ -275,16 +323,6 @@ static mddev_t * mddev_find(dev_t unit)
275 new->resync_max = MaxSector; 323 new->resync_max = MaxSector;
276 new->level = LEVEL_NONE; 324 new->level = LEVEL_NONE;
277 325
278 new->queue = blk_alloc_queue(GFP_KERNEL);
279 if (!new->queue) {
280 kfree(new);
281 return NULL;
282 }
283 /* Can be unlocked because the queue is new: no concurrency */
284 queue_flag_set_unlocked(QUEUE_FLAG_CLUSTER, new->queue);
285
286 blk_queue_make_request(new->queue, md_fail_request);
287
288 goto retry; 326 goto retry;
289} 327}
290 328
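
When mddev_find() is passed unit == 0 it now hands out an unused minor itself, scanning circularly from a static starting position and giving up if it wraps all the way around. A standalone sketch of that scan, with the in-use test reduced to a flat array for brevity (the kernel walks all_mddevs instead, and its minor space is larger):

#include <stdio.h>

#define MINORMASK 255           /* illustrative; the kernel's is larger */

static int next_minor = 128;    /* static scan position, as in md.c */

/* Return a free minor, or -1 if every minor is taken. */
static int find_free_minor(const unsigned char *in_use)
{
        int start = next_minor;

        for (;;) {
                int dev = next_minor;

                next_minor++;
                if (next_minor > MINORMASK)
                        next_minor = 0;
                if (next_minor == start)
                        return -1;      /* wrapped: all in use */
                if (!in_use[dev])
                        return dev;
        }
}

int main(void)
{
        unsigned char in_use[MINORMASK + 1] = { 0 };

        in_use[128] = in_use[129] = 1;
        printf("got minor %d\n", find_free_minor(in_use)); /* 130 */
        return 0;
}
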
@@ -307,25 +345,23 @@ static inline void mddev_unlock(mddev_t * mddev)
307 345
308static mdk_rdev_t * find_rdev_nr(mddev_t *mddev, int nr) 346static mdk_rdev_t * find_rdev_nr(mddev_t *mddev, int nr)
309{ 347{
310 mdk_rdev_t * rdev; 348 mdk_rdev_t *rdev;
311 struct list_head *tmp;
312 349
313 rdev_for_each(rdev, tmp, mddev) { 350 list_for_each_entry(rdev, &mddev->disks, same_set)
314 if (rdev->desc_nr == nr) 351 if (rdev->desc_nr == nr)
315 return rdev; 352 return rdev;
316 } 353
317 return NULL; 354 return NULL;
318} 355}
319 356
320static mdk_rdev_t * find_rdev(mddev_t * mddev, dev_t dev) 357static mdk_rdev_t * find_rdev(mddev_t * mddev, dev_t dev)
321{ 358{
322 struct list_head *tmp;
323 mdk_rdev_t *rdev; 359 mdk_rdev_t *rdev;
324 360
325 rdev_for_each(rdev, tmp, mddev) { 361 list_for_each_entry(rdev, &mddev->disks, same_set)
326 if (rdev->bdev->bd_dev == dev) 362 if (rdev->bdev->bd_dev == dev)
327 return rdev; 363 return rdev;
328 } 364
329 return NULL; 365 return NULL;
330} 366}
331 367
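
The recurring rdev_for_each to list_for_each_entry conversion in this patch leans on the kernel's intrusive list: a struct list_head embedded in each member, with container_of() recovering the member from the node, so the separate cursor variable ('tmp') is no longer needed. A compact userspace rendition of the idiom; it uses GCC's typeof, as the kernel does, and omits the deletion-safe variant:

#include <stdio.h>
#include <stddef.h>

#define container_of(ptr, type, member) \
        ((type *)((char *)(ptr) - offsetof(type, member)))

struct list_head { struct list_head *next, *prev; };

static void list_add_tail(struct list_head *n, struct list_head *head)
{
        n->prev = head->prev;
        n->next = head;
        head->prev->next = n;
        head->prev = n;
}

#define list_for_each_entry(pos, head, member)                          \
        for (pos = container_of((head)->next, typeof(*pos), member);    \
             &pos->member != (head);                                    \
             pos = container_of(pos->member.next, typeof(*pos), member))

struct rdev {
        int desc_nr;
        struct list_head same_set;      /* links rdevs on one array */
};

int main(void)
{
        struct list_head disks = { &disks, &disks };
        struct rdev a = { .desc_nr = 0 }, b = { .desc_nr = 1 };
        struct rdev *rdev;

        list_add_tail(&a.same_set, &disks);
        list_add_tail(&b.same_set, &disks);

        list_for_each_entry(rdev, &disks, same_set)
                printf("rdev %d\n", rdev->desc_nr);
        return 0;
}
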
@@ -861,7 +897,6 @@ static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev)
861static void super_90_sync(mddev_t *mddev, mdk_rdev_t *rdev) 897static void super_90_sync(mddev_t *mddev, mdk_rdev_t *rdev)
862{ 898{
863 mdp_super_t *sb; 899 mdp_super_t *sb;
864 struct list_head *tmp;
865 mdk_rdev_t *rdev2; 900 mdk_rdev_t *rdev2;
866 int next_spare = mddev->raid_disks; 901 int next_spare = mddev->raid_disks;
867 902
@@ -933,7 +968,7 @@ static void super_90_sync(mddev_t *mddev, mdk_rdev_t *rdev)
933 sb->state |= (1<<MD_SB_BITMAP_PRESENT); 968 sb->state |= (1<<MD_SB_BITMAP_PRESENT);
934 969
935 sb->disks[0].state = (1<<MD_DISK_REMOVED); 970 sb->disks[0].state = (1<<MD_DISK_REMOVED);
936 rdev_for_each(rdev2, tmp, mddev) { 971 list_for_each_entry(rdev2, &mddev->disks, same_set) {
937 mdp_disk_t *d; 972 mdp_disk_t *d;
938 int desc_nr; 973 int desc_nr;
939 if (rdev2->raid_disk >= 0 && test_bit(In_sync, &rdev2->flags) 974 if (rdev2->raid_disk >= 0 && test_bit(In_sync, &rdev2->flags)
@@ -1259,7 +1294,6 @@ static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev)
1259static void super_1_sync(mddev_t *mddev, mdk_rdev_t *rdev) 1294static void super_1_sync(mddev_t *mddev, mdk_rdev_t *rdev)
1260{ 1295{
1261 struct mdp_superblock_1 *sb; 1296 struct mdp_superblock_1 *sb;
1262 struct list_head *tmp;
1263 mdk_rdev_t *rdev2; 1297 mdk_rdev_t *rdev2;
1264 int max_dev, i; 1298 int max_dev, i;
1265 /* make rdev->sb match mddev and rdev data. */ 1299 /* make rdev->sb match mddev and rdev data. */
@@ -1307,7 +1341,7 @@ static void super_1_sync(mddev_t *mddev, mdk_rdev_t *rdev)
1307 } 1341 }
1308 1342
1309 max_dev = 0; 1343 max_dev = 0;
1310 rdev_for_each(rdev2, tmp, mddev) 1344 list_for_each_entry(rdev2, &mddev->disks, same_set)
1311 if (rdev2->desc_nr+1 > max_dev) 1345 if (rdev2->desc_nr+1 > max_dev)
1312 max_dev = rdev2->desc_nr+1; 1346 max_dev = rdev2->desc_nr+1;
1313 1347
@@ -1316,7 +1350,7 @@ static void super_1_sync(mddev_t *mddev, mdk_rdev_t *rdev)
1316 for (i=0; i<max_dev;i++) 1350 for (i=0; i<max_dev;i++)
1317 sb->dev_roles[i] = cpu_to_le16(0xfffe); 1351 sb->dev_roles[i] = cpu_to_le16(0xfffe);
1318 1352
1319 rdev_for_each(rdev2, tmp, mddev) { 1353 list_for_each_entry(rdev2, &mddev->disks, same_set) {
1320 i = rdev2->desc_nr; 1354 i = rdev2->desc_nr;
1321 if (test_bit(Faulty, &rdev2->flags)) 1355 if (test_bit(Faulty, &rdev2->flags))
1322 sb->dev_roles[i] = cpu_to_le16(0xfffe); 1356 sb->dev_roles[i] = cpu_to_le16(0xfffe);
@@ -1466,6 +1500,9 @@ static int bind_rdev_to_array(mdk_rdev_t * rdev, mddev_t * mddev)
1466 1500
1467 list_add_rcu(&rdev->same_set, &mddev->disks); 1501 list_add_rcu(&rdev->same_set, &mddev->disks);
1468 bd_claim_by_disk(rdev->bdev, rdev->bdev->bd_holder, mddev->gendisk); 1502 bd_claim_by_disk(rdev->bdev, rdev->bdev->bd_holder, mddev->gendisk);
1503
1504 /* May as well allow recovery to be retried once */
1505 mddev->recovery_disabled = 0;
1469 return 0; 1506 return 0;
1470 1507
1471 fail: 1508 fail:
@@ -1571,8 +1608,7 @@ static void kick_rdev_from_array(mdk_rdev_t * rdev)
1571 1608
1572static void export_array(mddev_t *mddev) 1609static void export_array(mddev_t *mddev)
1573{ 1610{
1574 struct list_head *tmp; 1611 mdk_rdev_t *rdev, *tmp;
1575 mdk_rdev_t *rdev;
1576 1612
1577 rdev_for_each(rdev, tmp, mddev) { 1613 rdev_for_each(rdev, tmp, mddev) {
1578 if (!rdev->mddev) { 1614 if (!rdev->mddev) {
@@ -1593,7 +1629,7 @@ static void print_desc(mdp_disk_t *desc)
1593 desc->major,desc->minor,desc->raid_disk,desc->state); 1629 desc->major,desc->minor,desc->raid_disk,desc->state);
1594} 1630}
1595 1631
1596static void print_sb(mdp_super_t *sb) 1632static void print_sb_90(mdp_super_t *sb)
1597{ 1633{
1598 int i; 1634 int i;
1599 1635
@@ -1624,10 +1660,57 @@ static void print_sb(mdp_super_t *sb)
1624 } 1660 }
1625 printk(KERN_INFO "md: THIS: "); 1661 printk(KERN_INFO "md: THIS: ");
1626 print_desc(&sb->this_disk); 1662 print_desc(&sb->this_disk);
1627
1628} 1663}
1629 1664
1630static void print_rdev(mdk_rdev_t *rdev) 1665static void print_sb_1(struct mdp_superblock_1 *sb)
1666{
1667 __u8 *uuid;
1668
1669 uuid = sb->set_uuid;
1670 printk(KERN_INFO "md: SB: (V:%u) (F:0x%08x) Array-ID:<%02x%02x%02x%02x"
1671 ":%02x%02x:%02x%02x:%02x%02x:%02x%02x%02x%02x%02x%02x>\n"
1672 KERN_INFO "md: Name: \"%s\" CT:%llu\n",
1673 le32_to_cpu(sb->major_version),
1674 le32_to_cpu(sb->feature_map),
1675 uuid[0], uuid[1], uuid[2], uuid[3],
1676 uuid[4], uuid[5], uuid[6], uuid[7],
1677 uuid[8], uuid[9], uuid[10], uuid[11],
1678 uuid[12], uuid[13], uuid[14], uuid[15],
1679 sb->set_name,
1680 (unsigned long long)le64_to_cpu(sb->ctime)
1681 & MD_SUPERBLOCK_1_TIME_SEC_MASK);
1682
1683 uuid = sb->device_uuid;
1684 printk(KERN_INFO "md: L%u SZ%llu RD:%u LO:%u CS:%u DO:%llu DS:%llu SO:%llu"
1685 " RO:%llu\n"
1686 KERN_INFO "md: Dev:%08x UUID: %02x%02x%02x%02x:%02x%02x:%02x%02x:%02x%02x"
1687 ":%02x%02x%02x%02x%02x%02x\n"
1688 KERN_INFO "md: (F:0x%08x) UT:%llu Events:%llu ResyncOffset:%llu CSUM:0x%08x\n"
1689 KERN_INFO "md: (MaxDev:%u) \n",
1690 le32_to_cpu(sb->level),
1691 (unsigned long long)le64_to_cpu(sb->size),
1692 le32_to_cpu(sb->raid_disks),
1693 le32_to_cpu(sb->layout),
1694 le32_to_cpu(sb->chunksize),
1695 (unsigned long long)le64_to_cpu(sb->data_offset),
1696 (unsigned long long)le64_to_cpu(sb->data_size),
1697 (unsigned long long)le64_to_cpu(sb->super_offset),
1698 (unsigned long long)le64_to_cpu(sb->recovery_offset),
1699 le32_to_cpu(sb->dev_number),
1700 uuid[0], uuid[1], uuid[2], uuid[3],
1701 uuid[4], uuid[5], uuid[6], uuid[7],
1702 uuid[8], uuid[9], uuid[10], uuid[11],
1703 uuid[12], uuid[13], uuid[14], uuid[15],
1704 sb->devflags,
1705 (unsigned long long)le64_to_cpu(sb->utime) & MD_SUPERBLOCK_1_TIME_SEC_MASK,
1706 (unsigned long long)le64_to_cpu(sb->events),
1707 (unsigned long long)le64_to_cpu(sb->resync_offset),
1708 le32_to_cpu(sb->sb_csum),
1709 le32_to_cpu(sb->max_dev)
1710 );
1711}
1712
1713static void print_rdev(mdk_rdev_t *rdev, int major_version)
1631{ 1714{
1632 char b[BDEVNAME_SIZE]; 1715 char b[BDEVNAME_SIZE];
1633 printk(KERN_INFO "md: rdev %s, SZ:%08llu F:%d S:%d DN:%u\n", 1716 printk(KERN_INFO "md: rdev %s, SZ:%08llu F:%d S:%d DN:%u\n",
@@ -1635,15 +1718,22 @@ static void print_rdev(mdk_rdev_t *rdev)
1635 test_bit(Faulty, &rdev->flags), test_bit(In_sync, &rdev->flags), 1718 test_bit(Faulty, &rdev->flags), test_bit(In_sync, &rdev->flags),
1636 rdev->desc_nr); 1719 rdev->desc_nr);
1637 if (rdev->sb_loaded) { 1720 if (rdev->sb_loaded) {
1638 printk(KERN_INFO "md: rdev superblock:\n"); 1721 printk(KERN_INFO "md: rdev superblock (MJ:%d):\n", major_version);
1639 print_sb((mdp_super_t*)page_address(rdev->sb_page)); 1722 switch (major_version) {
1723 case 0:
1724 print_sb_90((mdp_super_t*)page_address(rdev->sb_page));
1725 break;
1726 case 1:
1727 print_sb_1((struct mdp_superblock_1 *)page_address(rdev->sb_page));
1728 break;
1729 }
1640 } else 1730 } else
1641 printk(KERN_INFO "md: no rdev superblock!\n"); 1731 printk(KERN_INFO "md: no rdev superblock!\n");
1642} 1732}
1643 1733
1644static void md_print_devices(void) 1734static void md_print_devices(void)
1645{ 1735{
1646 struct list_head *tmp, *tmp2; 1736 struct list_head *tmp;
1647 mdk_rdev_t *rdev; 1737 mdk_rdev_t *rdev;
1648 mddev_t *mddev; 1738 mddev_t *mddev;
1649 char b[BDEVNAME_SIZE]; 1739 char b[BDEVNAME_SIZE];
@@ -1658,12 +1748,12 @@ static void md_print_devices(void)
1658 bitmap_print_sb(mddev->bitmap); 1748 bitmap_print_sb(mddev->bitmap);
1659 else 1749 else
1660 printk("%s: ", mdname(mddev)); 1750 printk("%s: ", mdname(mddev));
1661 rdev_for_each(rdev, tmp2, mddev) 1751 list_for_each_entry(rdev, &mddev->disks, same_set)
1662 printk("<%s>", bdevname(rdev->bdev,b)); 1752 printk("<%s>", bdevname(rdev->bdev,b));
1663 printk("\n"); 1753 printk("\n");
1664 1754
1665 rdev_for_each(rdev, tmp2, mddev) 1755 list_for_each_entry(rdev, &mddev->disks, same_set)
1666 print_rdev(rdev); 1756 print_rdev(rdev, mddev->major_version);
1667 } 1757 }
1668 printk("md: **********************************\n"); 1758 printk("md: **********************************\n");
1669 printk("\n"); 1759 printk("\n");
@@ -1679,9 +1769,8 @@ static void sync_sbs(mddev_t * mddev, int nospares)
1679 * with the rest of the array) 1769 * with the rest of the array)
1680 */ 1770 */
1681 mdk_rdev_t *rdev; 1771 mdk_rdev_t *rdev;
1682 struct list_head *tmp;
1683 1772
1684 rdev_for_each(rdev, tmp, mddev) { 1773 list_for_each_entry(rdev, &mddev->disks, same_set) {
1685 if (rdev->sb_events == mddev->events || 1774 if (rdev->sb_events == mddev->events ||
1686 (nospares && 1775 (nospares &&
1687 rdev->raid_disk < 0 && 1776 rdev->raid_disk < 0 &&
@@ -1699,7 +1788,6 @@ static void sync_sbs(mddev_t * mddev, int nospares)
1699 1788
1700static void md_update_sb(mddev_t * mddev, int force_change) 1789static void md_update_sb(mddev_t * mddev, int force_change)
1701{ 1790{
1702 struct list_head *tmp;
1703 mdk_rdev_t *rdev; 1791 mdk_rdev_t *rdev;
1704 int sync_req; 1792 int sync_req;
1705 int nospares = 0; 1793 int nospares = 0;
@@ -1790,7 +1878,7 @@ repeat:
1790 mdname(mddev),mddev->in_sync); 1878 mdname(mddev),mddev->in_sync);
1791 1879
1792 bitmap_update_sb(mddev->bitmap); 1880 bitmap_update_sb(mddev->bitmap);
1793 rdev_for_each(rdev, tmp, mddev) { 1881 list_for_each_entry(rdev, &mddev->disks, same_set) {
1794 char b[BDEVNAME_SIZE]; 1882 char b[BDEVNAME_SIZE];
1795 dprintk(KERN_INFO "md: "); 1883 dprintk(KERN_INFO "md: ");
1796 if (rdev->sb_loaded != 1) 1884 if (rdev->sb_loaded != 1)
@@ -1999,7 +2087,6 @@ slot_store(mdk_rdev_t *rdev, const char *buf, size_t len)
1999 md_wakeup_thread(rdev->mddev->thread); 2087 md_wakeup_thread(rdev->mddev->thread);
2000 } else if (rdev->mddev->pers) { 2088 } else if (rdev->mddev->pers) {
2001 mdk_rdev_t *rdev2; 2089 mdk_rdev_t *rdev2;
2002 struct list_head *tmp;
2003 /* Activating a spare .. or possibly reactivating 2090 /* Activating a spare .. or possibly reactivating
2004 * if we ever get bitmaps working here. 2091 * if we ever get bitmaps working here.
2005 */ 2092 */
@@ -2010,7 +2097,7 @@ slot_store(mdk_rdev_t *rdev, const char *buf, size_t len)
2010 if (rdev->mddev->pers->hot_add_disk == NULL) 2097 if (rdev->mddev->pers->hot_add_disk == NULL)
2011 return -EINVAL; 2098 return -EINVAL;
2012 2099
2013 rdev_for_each(rdev2, tmp, rdev->mddev) 2100 list_for_each_entry(rdev2, &rdev->mddev->disks, same_set)
2014 if (rdev2->raid_disk == slot) 2101 if (rdev2->raid_disk == slot)
2015 return -EEXIST; 2102 return -EEXIST;
2016 2103
@@ -2125,14 +2212,14 @@ rdev_size_store(mdk_rdev_t *rdev, const char *buf, size_t len)
2125 */ 2212 */
2126 mddev_t *mddev; 2213 mddev_t *mddev;
2127 int overlap = 0; 2214 int overlap = 0;
2128 struct list_head *tmp, *tmp2; 2215 struct list_head *tmp;
2129 2216
2130 mddev_unlock(my_mddev); 2217 mddev_unlock(my_mddev);
2131 for_each_mddev(mddev, tmp) { 2218 for_each_mddev(mddev, tmp) {
2132 mdk_rdev_t *rdev2; 2219 mdk_rdev_t *rdev2;
2133 2220
2134 mddev_lock(mddev); 2221 mddev_lock(mddev);
2135 rdev_for_each(rdev2, tmp2, mddev) 2222 list_for_each_entry(rdev2, &mddev->disks, same_set)
2136 if (test_bit(AllReserved, &rdev2->flags) || 2223 if (test_bit(AllReserved, &rdev2->flags) ||
2137 (rdev->bdev == rdev2->bdev && 2224 (rdev->bdev == rdev2->bdev &&
2138 rdev != rdev2 && 2225 rdev != rdev2 &&
@@ -2328,8 +2415,7 @@ abort_free:
2328static void analyze_sbs(mddev_t * mddev) 2415static void analyze_sbs(mddev_t * mddev)
2329{ 2416{
2330 int i; 2417 int i;
2331 struct list_head *tmp; 2418 mdk_rdev_t *rdev, *freshest, *tmp;
2332 mdk_rdev_t *rdev, *freshest;
2333 char b[BDEVNAME_SIZE]; 2419 char b[BDEVNAME_SIZE];
2334 2420
2335 freshest = NULL; 2421 freshest = NULL;
@@ -3046,7 +3132,7 @@ action_store(mddev_t *mddev, const char *page, size_t len)
3046 } 3132 }
3047 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 3133 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
3048 md_wakeup_thread(mddev->thread); 3134 md_wakeup_thread(mddev->thread);
3049 sysfs_notify(&mddev->kobj, NULL, "sync_action"); 3135 sysfs_notify_dirent(mddev->sysfs_action);
3050 return len; 3136 return len;
3051} 3137}
3052 3138
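
sysfs_notify(&mddev->kobj, NULL, "sync_action") has to look the attribute up by name on every event; the pattern introduced here caches the sysfs_dirent once and signals it directly with sysfs_notify_dirent(), skipping the per-event lookup. Pieced together from the hunks in this patch, the lifecycle of the cached dirent looks like:

    /* at setup (md_alloc()/do_md_run()): resolve the attribute once */
    mddev->sysfs_action = sysfs_get_dirent(mddev->kobj.sd, "sync_action");

    /* on each state change: no name lookup needed */
    if (mddev->sysfs_action)
            sysfs_notify_dirent(mddev->sysfs_action);

    /* at teardown (do_md_stop()): drop the reference */
    sysfs_put(mddev->sysfs_action);
    mddev->sysfs_action = NULL;
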
@@ -3404,6 +3490,8 @@ md_attr_store(struct kobject *kobj, struct attribute *attr,
3404 if (!capable(CAP_SYS_ADMIN)) 3490 if (!capable(CAP_SYS_ADMIN))
3405 return -EACCES; 3491 return -EACCES;
3406 rv = mddev_lock(mddev); 3492 rv = mddev_lock(mddev);
3493 if (mddev->hold_active == UNTIL_IOCTL)
3494 mddev->hold_active = 0;
3407 if (!rv) { 3495 if (!rv) {
3408 rv = entry->store(mddev, page, length); 3496 rv = entry->store(mddev, page, length);
3409 mddev_unlock(mddev); 3497 mddev_unlock(mddev);
@@ -3414,6 +3502,17 @@ md_attr_store(struct kobject *kobj, struct attribute *attr,
3414static void md_free(struct kobject *ko) 3502static void md_free(struct kobject *ko)
3415{ 3503{
3416 mddev_t *mddev = container_of(ko, mddev_t, kobj); 3504 mddev_t *mddev = container_of(ko, mddev_t, kobj);
3505
3506 if (mddev->sysfs_state)
3507 sysfs_put(mddev->sysfs_state);
3508
3509 if (mddev->gendisk) {
3510 del_gendisk(mddev->gendisk);
3511 put_disk(mddev->gendisk);
3512 }
3513 if (mddev->queue)
3514 blk_cleanup_queue(mddev->queue);
3515
3417 kfree(mddev); 3516 kfree(mddev);
3418} 3517}
3419 3518
@@ -3429,34 +3528,74 @@ static struct kobj_type md_ktype = {
3429 3528
3430int mdp_major = 0; 3529int mdp_major = 0;
3431 3530
3432static struct kobject *md_probe(dev_t dev, int *part, void *data) 3531static int md_alloc(dev_t dev, char *name)
3433{ 3532{
3434 static DEFINE_MUTEX(disks_mutex); 3533 static DEFINE_MUTEX(disks_mutex);
3435 mddev_t *mddev = mddev_find(dev); 3534 mddev_t *mddev = mddev_find(dev);
3436 struct gendisk *disk; 3535 struct gendisk *disk;
3437 int partitioned = (MAJOR(dev) != MD_MAJOR); 3536 int partitioned;
3438 int shift = partitioned ? MdpMinorShift : 0; 3537 int shift;
3439 int unit = MINOR(dev) >> shift; 3538 int unit;
3440 int error; 3539 int error;
3441 3540
3442 if (!mddev) 3541 if (!mddev)
3443 return NULL; 3542 return -ENODEV;
3543
3544 partitioned = (MAJOR(mddev->unit) != MD_MAJOR);
3545 shift = partitioned ? MdpMinorShift : 0;
3546 unit = MINOR(mddev->unit) >> shift;
3547
3548 /* wait for any previous instance of this device
3549 * to be completely removed (mddev_delayed_delete).
3550 */
3551 flush_scheduled_work();
3444 3552
3445 mutex_lock(&disks_mutex); 3553 mutex_lock(&disks_mutex);
3446 if (mddev->gendisk) { 3554 if (mddev->gendisk) {
3447 mutex_unlock(&disks_mutex); 3555 mutex_unlock(&disks_mutex);
3448 mddev_put(mddev); 3556 mddev_put(mddev);
3449 return NULL; 3557 return -EEXIST;
3558 }
3559
3560 if (name) {
3561 /* Need to ensure that 'name' is not a duplicate.
3562 */
3563 mddev_t *mddev2;
3564 spin_lock(&all_mddevs_lock);
3565
3566 list_for_each_entry(mddev2, &all_mddevs, all_mddevs)
3567 if (mddev2->gendisk &&
3568 strcmp(mddev2->gendisk->disk_name, name) == 0) {
3569 spin_unlock(&all_mddevs_lock);
3570 return -EEXIST;
3571 }
3572 spin_unlock(&all_mddevs_lock);
3573 }
3574
3575 mddev->queue = blk_alloc_queue(GFP_KERNEL);
3576 if (!mddev->queue) {
3577 mutex_unlock(&disks_mutex);
3578 mddev_put(mddev);
3579 return -ENOMEM;
3450 } 3580 }
3581 /* Can be unlocked because the queue is new: no concurrency */
3582 queue_flag_set_unlocked(QUEUE_FLAG_CLUSTER, mddev->queue);
3583
3584 blk_queue_make_request(mddev->queue, md_fail_request);
3585
3451 disk = alloc_disk(1 << shift); 3586 disk = alloc_disk(1 << shift);
3452 if (!disk) { 3587 if (!disk) {
3453 mutex_unlock(&disks_mutex); 3588 mutex_unlock(&disks_mutex);
3589 blk_cleanup_queue(mddev->queue);
3590 mddev->queue = NULL;
3454 mddev_put(mddev); 3591 mddev_put(mddev);
3455 return NULL; 3592 return -ENOMEM;
3456 } 3593 }
3457 disk->major = MAJOR(dev); 3594 disk->major = MAJOR(mddev->unit);
3458 disk->first_minor = unit << shift; 3595 disk->first_minor = unit << shift;
3459 if (partitioned) 3596 if (name)
3597 strcpy(disk->disk_name, name);
3598 else if (partitioned)
3460 sprintf(disk->disk_name, "md_d%d", unit); 3599 sprintf(disk->disk_name, "md_d%d", unit);
3461 else 3600 else
3462 sprintf(disk->disk_name, "md%d", unit); 3601 sprintf(disk->disk_name, "md%d", unit);
@@ -3464,7 +3603,7 @@ static struct kobject *md_probe(dev_t dev, int *part, void *data)
3464 disk->private_data = mddev; 3603 disk->private_data = mddev;
3465 disk->queue = mddev->queue; 3604 disk->queue = mddev->queue;
3466 /* Allow extended partitions. This makes the 3605 /* Allow extended partitions. This makes the
3467 * 'mdp' device redundant, but we can really 3606 * 'mdp' device redundant, but we can't really
3468 * remove it now. 3607 * remove it now.
3469 */ 3608 */
3470 disk->flags |= GENHD_FL_EXT_DEVT; 3609 disk->flags |= GENHD_FL_EXT_DEVT;
@@ -3480,9 +3619,35 @@ static struct kobject *md_probe(dev_t dev, int *part, void *data)
3480 kobject_uevent(&mddev->kobj, KOBJ_ADD); 3619 kobject_uevent(&mddev->kobj, KOBJ_ADD);
3481 mddev->sysfs_state = sysfs_get_dirent(mddev->kobj.sd, "array_state"); 3620 mddev->sysfs_state = sysfs_get_dirent(mddev->kobj.sd, "array_state");
3482 } 3621 }
3622 mddev_put(mddev);
3623 return 0;
3624}
3625
3626static struct kobject *md_probe(dev_t dev, int *part, void *data)
3627{
3628 md_alloc(dev, NULL);
3483 return NULL; 3629 return NULL;
3484} 3630}
3485 3631
3632static int add_named_array(const char *val, struct kernel_param *kp)
3633{
3634 /* val must be "md_*" where * is not all digits.
3635 * We allocate an array with a large free minor number, and
3636 * set the name to val. val must not already be an active name.
3637 */
3638 int len = strlen(val);
3639 char buf[DISK_NAME_LEN];
3640
3641 while (len && val[len-1] == '\n')
3642 len--;
3643 if (len >= DISK_NAME_LEN)
3644 return -E2BIG;
3645 strlcpy(buf, val, len+1);
3646 if (strncmp(buf, "md_", 3) != 0)
3647 return -EINVAL;
3648 return md_alloc(0, buf);
3649}
3650
3486static void md_safemode_timeout(unsigned long data) 3651static void md_safemode_timeout(unsigned long data)
3487{ 3652{
3488 mddev_t *mddev = (mddev_t *) data; 3653 mddev_t *mddev = (mddev_t *) data;
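
add_named_array() backs the new_array module parameter registered near the end of this file's diff (module_param_call(new_array, add_named_array, NULL, NULL, S_IWUSR)): writing a string of the form md_<name> allocates an array whose gendisk carries that literal name, via md_alloc(0, buf) with a dynamically chosen minor, instead of the classic md%d/md_d%d numbering. Assuming the standard sysfs layout for module parameters, usage would look like:

    echo md_home > /sys/module/md_mod/parameters/new_array

after which /dev/md_home exists and can be assembled like any other array.
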
@@ -3501,7 +3666,6 @@ static int do_md_run(mddev_t * mddev)
3501{ 3666{
3502 int err; 3667 int err;
3503 int chunk_size; 3668 int chunk_size;
3504 struct list_head *tmp;
3505 mdk_rdev_t *rdev; 3669 mdk_rdev_t *rdev;
3506 struct gendisk *disk; 3670 struct gendisk *disk;
3507 struct mdk_personality *pers; 3671 struct mdk_personality *pers;
@@ -3540,7 +3704,7 @@ static int do_md_run(mddev_t * mddev)
3540 } 3704 }
3541 3705
3542 /* devices must have minimum size of one chunk */ 3706 /* devices must have minimum size of one chunk */
3543 rdev_for_each(rdev, tmp, mddev) { 3707 list_for_each_entry(rdev, &mddev->disks, same_set) {
3544 if (test_bit(Faulty, &rdev->flags)) 3708 if (test_bit(Faulty, &rdev->flags))
3545 continue; 3709 continue;
3546 if (rdev->size < chunk_size / 1024) { 3710 if (rdev->size < chunk_size / 1024) {
@@ -3565,7 +3729,7 @@ static int do_md_run(mddev_t * mddev)
3565 * the only valid external interface is through the md 3729 * the only valid external interface is through the md
3566 * device. 3730 * device.
3567 */ 3731 */
3568 rdev_for_each(rdev, tmp, mddev) { 3732 list_for_each_entry(rdev, &mddev->disks, same_set) {
3569 if (test_bit(Faulty, &rdev->flags)) 3733 if (test_bit(Faulty, &rdev->flags))
3570 continue; 3734 continue;
3571 sync_blockdev(rdev->bdev); 3735 sync_blockdev(rdev->bdev);
@@ -3630,10 +3794,10 @@ static int do_md_run(mddev_t * mddev)
3630 */ 3794 */
3631 char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE]; 3795 char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE];
3632 mdk_rdev_t *rdev2; 3796 mdk_rdev_t *rdev2;
3633 struct list_head *tmp2;
3634 int warned = 0; 3797 int warned = 0;
3635 rdev_for_each(rdev, tmp, mddev) { 3798
3636 rdev_for_each(rdev2, tmp2, mddev) { 3799 list_for_each_entry(rdev, &mddev->disks, same_set)
3800 list_for_each_entry(rdev2, &mddev->disks, same_set) {
3637 if (rdev < rdev2 && 3801 if (rdev < rdev2 &&
3638 rdev->bdev->bd_contains == 3802 rdev->bdev->bd_contains ==
3639 rdev2->bdev->bd_contains) { 3803 rdev2->bdev->bd_contains) {
@@ -3647,7 +3811,7 @@ static int do_md_run(mddev_t * mddev)
3647 warned = 1; 3811 warned = 1;
3648 } 3812 }
3649 } 3813 }
3650 } 3814
3651 if (warned) 3815 if (warned)
3652 printk(KERN_WARNING 3816 printk(KERN_WARNING
3653 "True protection against single-disk" 3817 "True protection against single-disk"
@@ -3684,6 +3848,7 @@ static int do_md_run(mddev_t * mddev)
3684 printk(KERN_WARNING 3848 printk(KERN_WARNING
3685 "md: cannot register extra attributes for %s\n", 3849 "md: cannot register extra attributes for %s\n",
3686 mdname(mddev)); 3850 mdname(mddev));
3851 mddev->sysfs_action = sysfs_get_dirent(mddev->kobj.sd, "sync_action");
3687 } else if (mddev->ro == 2) /* auto-readonly not meaningful */ 3852 } else if (mddev->ro == 2) /* auto-readonly not meaningful */
3688 mddev->ro = 0; 3853 mddev->ro = 0;
3689 3854
@@ -3694,7 +3859,7 @@ static int do_md_run(mddev_t * mddev)
3694 mddev->safemode_delay = (200 * HZ)/1000 +1; /* 200 msec delay */ 3859 mddev->safemode_delay = (200 * HZ)/1000 +1; /* 200 msec delay */
3695 mddev->in_sync = 1; 3860 mddev->in_sync = 1;
3696 3861
3697 rdev_for_each(rdev, tmp, mddev) 3862 list_for_each_entry(rdev, &mddev->disks, same_set)
3698 if (rdev->raid_disk >= 0) { 3863 if (rdev->raid_disk >= 0) {
3699 char nm[20]; 3864 char nm[20];
3700 sprintf(nm, "rd%d", rdev->raid_disk); 3865 sprintf(nm, "rd%d", rdev->raid_disk);
@@ -3725,9 +3890,8 @@ static int do_md_run(mddev_t * mddev)
3725 * it will remove the drives and not do the right thing 3890 * it will remove the drives and not do the right thing
3726 */ 3891 */
3727 if (mddev->degraded && !mddev->sync_thread) { 3892 if (mddev->degraded && !mddev->sync_thread) {
3728 struct list_head *rtmp;
3729 int spares = 0; 3893 int spares = 0;
3730 rdev_for_each(rdev, rtmp, mddev) 3894 list_for_each_entry(rdev, &mddev->disks, same_set)
3731 if (rdev->raid_disk >= 0 && 3895 if (rdev->raid_disk >= 0 &&
3732 !test_bit(In_sync, &rdev->flags) && 3896 !test_bit(In_sync, &rdev->flags) &&
3733 !test_bit(Faulty, &rdev->flags)) 3897 !test_bit(Faulty, &rdev->flags))
@@ -3754,7 +3918,8 @@ static int do_md_run(mddev_t * mddev)
3754 mddev->changed = 1; 3918 mddev->changed = 1;
3755 md_new_event(mddev); 3919 md_new_event(mddev);
3756 sysfs_notify_dirent(mddev->sysfs_state); 3920 sysfs_notify_dirent(mddev->sysfs_state);
3757 sysfs_notify(&mddev->kobj, NULL, "sync_action"); 3921 if (mddev->sysfs_action)
3922 sysfs_notify_dirent(mddev->sysfs_action);
3758 sysfs_notify(&mddev->kobj, NULL, "degraded"); 3923 sysfs_notify(&mddev->kobj, NULL, "degraded");
3759 kobject_uevent(&disk_to_dev(mddev->gendisk)->kobj, KOBJ_CHANGE); 3924 kobject_uevent(&disk_to_dev(mddev->gendisk)->kobj, KOBJ_CHANGE);
3760 return 0; 3925 return 0;
@@ -3854,9 +4019,12 @@ static int do_md_stop(mddev_t * mddev, int mode, int is_open)
3854 mddev->queue->merge_bvec_fn = NULL; 4019 mddev->queue->merge_bvec_fn = NULL;
3855 mddev->queue->unplug_fn = NULL; 4020 mddev->queue->unplug_fn = NULL;
3856 mddev->queue->backing_dev_info.congested_fn = NULL; 4021 mddev->queue->backing_dev_info.congested_fn = NULL;
3857 if (mddev->pers->sync_request) 4022 if (mddev->pers->sync_request) {
3858 sysfs_remove_group(&mddev->kobj, &md_redundancy_group); 4023 sysfs_remove_group(&mddev->kobj, &md_redundancy_group);
3859 4024 if (mddev->sysfs_action)
4025 sysfs_put(mddev->sysfs_action);
4026 mddev->sysfs_action = NULL;
4027 }
3860 module_put(mddev->pers->owner); 4028 module_put(mddev->pers->owner);
3861 mddev->pers = NULL; 4029 mddev->pers = NULL;
3862 /* tell userspace to handle 'inactive' */ 4030 /* tell userspace to handle 'inactive' */
@@ -3883,7 +4051,6 @@ static int do_md_stop(mddev_t * mddev, int mode, int is_open)
3883 */ 4051 */
3884 if (mode == 0) { 4052 if (mode == 0) {
3885 mdk_rdev_t *rdev; 4053 mdk_rdev_t *rdev;
3886 struct list_head *tmp;
3887 4054
3888 printk(KERN_INFO "md: %s stopped.\n", mdname(mddev)); 4055 printk(KERN_INFO "md: %s stopped.\n", mdname(mddev));
3889 4056
@@ -3895,7 +4062,7 @@ static int do_md_stop(mddev_t * mddev, int mode, int is_open)
3895 } 4062 }
3896 mddev->bitmap_offset = 0; 4063 mddev->bitmap_offset = 0;
3897 4064
3898 rdev_for_each(rdev, tmp, mddev) 4065 list_for_each_entry(rdev, &mddev->disks, same_set)
3899 if (rdev->raid_disk >= 0) { 4066 if (rdev->raid_disk >= 0) {
3900 char nm[20]; 4067 char nm[20];
3901 sprintf(nm, "rd%d", rdev->raid_disk); 4068 sprintf(nm, "rd%d", rdev->raid_disk);
@@ -3941,6 +4108,8 @@ static int do_md_stop(mddev_t * mddev, int mode, int is_open)
3941 mddev->barriers_work = 0; 4108 mddev->barriers_work = 0;
3942 mddev->safemode = 0; 4109 mddev->safemode = 0;
3943 kobject_uevent(&disk_to_dev(mddev->gendisk)->kobj, KOBJ_CHANGE); 4110 kobject_uevent(&disk_to_dev(mddev->gendisk)->kobj, KOBJ_CHANGE);
4111 if (mddev->hold_active == UNTIL_STOP)
4112 mddev->hold_active = 0;
3944 4113
3945 } else if (mddev->pers) 4114 } else if (mddev->pers)
3946 printk(KERN_INFO "md: %s switched to read-only mode.\n", 4115 printk(KERN_INFO "md: %s switched to read-only mode.\n",
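
The hold_active manipulation added here and in md_ioctl() below is one half of a delayed-deallocation scheme: an mddev can now be created in a pinned state (UNTIL_IOCTL and UNTIL_STOP are the two pin values visible in these hunks) so that it survives while it still has no member disks; once the array has really been used and stopped, or the first meaningful ioctl has completed, the pin is dropped and the final mddev_put() can schedule mddev_delayed_delete(), the work item that md_alloc() flushes before reusing a unit. The mddev_find()/mddev_put() side is not visible in this patch, so this reading is inferred from the hunks shown here.
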
@@ -3956,7 +4125,6 @@ out:
3956static void autorun_array(mddev_t *mddev) 4125static void autorun_array(mddev_t *mddev)
3957{ 4126{
3958 mdk_rdev_t *rdev; 4127 mdk_rdev_t *rdev;
3959 struct list_head *tmp;
3960 int err; 4128 int err;
3961 4129
3962 if (list_empty(&mddev->disks)) 4130 if (list_empty(&mddev->disks))
@@ -3964,7 +4132,7 @@ static void autorun_array(mddev_t *mddev)
3964 4132
3965 printk(KERN_INFO "md: running: "); 4133 printk(KERN_INFO "md: running: ");
3966 4134
3967 rdev_for_each(rdev, tmp, mddev) { 4135 list_for_each_entry(rdev, &mddev->disks, same_set) {
3968 char b[BDEVNAME_SIZE]; 4136 char b[BDEVNAME_SIZE];
3969 printk("<%s>", bdevname(rdev->bdev,b)); 4137 printk("<%s>", bdevname(rdev->bdev,b));
3970 } 4138 }
@@ -3991,8 +4159,7 @@ static void autorun_array(mddev_t *mddev)
3991 */ 4159 */
3992static void autorun_devices(int part) 4160static void autorun_devices(int part)
3993{ 4161{
3994 struct list_head *tmp; 4162 mdk_rdev_t *rdev0, *rdev, *tmp;
3995 mdk_rdev_t *rdev0, *rdev;
3996 mddev_t *mddev; 4163 mddev_t *mddev;
3997 char b[BDEVNAME_SIZE]; 4164 char b[BDEVNAME_SIZE];
3998 4165
@@ -4007,7 +4174,7 @@ static void autorun_devices(int part)
4007 printk(KERN_INFO "md: considering %s ...\n", 4174 printk(KERN_INFO "md: considering %s ...\n",
4008 bdevname(rdev0->bdev,b)); 4175 bdevname(rdev0->bdev,b));
4009 INIT_LIST_HEAD(&candidates); 4176 INIT_LIST_HEAD(&candidates);
4010 rdev_for_each_list(rdev, tmp, pending_raid_disks) 4177 rdev_for_each_list(rdev, tmp, &pending_raid_disks)
4011 if (super_90_load(rdev, rdev0, 0) >= 0) { 4178 if (super_90_load(rdev, rdev0, 0) >= 0) {
4012 printk(KERN_INFO "md: adding %s ...\n", 4179 printk(KERN_INFO "md: adding %s ...\n",
4013 bdevname(rdev->bdev,b)); 4180 bdevname(rdev->bdev,b));
@@ -4053,7 +4220,7 @@ static void autorun_devices(int part)
4053 } else { 4220 } else {
4054 printk(KERN_INFO "md: created %s\n", mdname(mddev)); 4221 printk(KERN_INFO "md: created %s\n", mdname(mddev));
4055 mddev->persistent = 1; 4222 mddev->persistent = 1;
4056 rdev_for_each_list(rdev, tmp, candidates) { 4223 rdev_for_each_list(rdev, tmp, &candidates) {
4057 list_del_init(&rdev->same_set); 4224 list_del_init(&rdev->same_set);
4058 if (bind_rdev_to_array(rdev, mddev)) 4225 if (bind_rdev_to_array(rdev, mddev))
4059 export_rdev(rdev); 4226 export_rdev(rdev);
@@ -4064,7 +4231,7 @@ static void autorun_devices(int part)
4064 /* on success, candidates will be empty, on error 4231 /* on success, candidates will be empty, on error
4065 * it won't... 4232 * it won't...
4066 */ 4233 */
4067 rdev_for_each_list(rdev, tmp, candidates) { 4234 rdev_for_each_list(rdev, tmp, &candidates) {
4068 list_del_init(&rdev->same_set); 4235 list_del_init(&rdev->same_set);
4069 export_rdev(rdev); 4236 export_rdev(rdev);
4070 } 4237 }
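
Note that rdev_for_each_list() survives the cleanup but now takes a pointer to the list head (&pending_raid_disks, &candidates) and, per the declaration change in autorun_devices() above, a typed mdk_rdev_t *tmp cursor instead of a struct list_head *. That is consistent with it being redefined on top of the generic deletion-safe iterator, roughly:

    /* presumed new definition in md.h; the old one open-coded the
     * walk around a bare struct list_head cursor */
    #define rdev_for_each_list(rdev, tmp, head) \
            list_for_each_entry_safe(rdev, tmp, head, same_set)

the _safe variant being what lets the loop bodies here list_del_init() the current entry while iterating.
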
@@ -4093,10 +4260,9 @@ static int get_array_info(mddev_t * mddev, void __user * arg)
4093 mdu_array_info_t info; 4260 mdu_array_info_t info;
4094 int nr,working,active,failed,spare; 4261 int nr,working,active,failed,spare;
4095 mdk_rdev_t *rdev; 4262 mdk_rdev_t *rdev;
4096 struct list_head *tmp;
4097 4263
4098 nr=working=active=failed=spare=0; 4264 nr=working=active=failed=spare=0;
4099 rdev_for_each(rdev, tmp, mddev) { 4265 list_for_each_entry(rdev, &mddev->disks, same_set) {
4100 nr++; 4266 nr++;
4101 if (test_bit(Faulty, &rdev->flags)) 4267 if (test_bit(Faulty, &rdev->flags))
4102 failed++; 4268 failed++;
@@ -4614,9 +4780,8 @@ static int set_array_info(mddev_t * mddev, mdu_array_info_t *info)
4614 4780
4615static int update_size(mddev_t *mddev, sector_t num_sectors) 4781static int update_size(mddev_t *mddev, sector_t num_sectors)
4616{ 4782{
4617 mdk_rdev_t * rdev; 4783 mdk_rdev_t *rdev;
4618 int rv; 4784 int rv;
4619 struct list_head *tmp;
4620 int fit = (num_sectors == 0); 4785 int fit = (num_sectors == 0);
4621 4786
4622 if (mddev->pers->resize == NULL) 4787 if (mddev->pers->resize == NULL)
@@ -4638,7 +4803,7 @@ static int update_size(mddev_t *mddev, sector_t num_sectors)
4638 * grow, and re-add. 4803 * grow, and re-add.
4639 */ 4804 */
4640 return -EBUSY; 4805 return -EBUSY;
4641 rdev_for_each(rdev, tmp, mddev) { 4806 list_for_each_entry(rdev, &mddev->disks, same_set) {
4642 sector_t avail; 4807 sector_t avail;
4643 avail = rdev->size * 2; 4808 avail = rdev->size * 2;
4644 4809
@@ -5000,6 +5165,9 @@ static int md_ioctl(struct block_device *bdev, fmode_t mode,
5000 5165
5001done_unlock: 5166done_unlock:
5002abort_unlock: 5167abort_unlock:
5168 if (mddev->hold_active == UNTIL_IOCTL &&
5169 err != -EINVAL)
5170 mddev->hold_active = 0;
5003 mddev_unlock(mddev); 5171 mddev_unlock(mddev);
5004 5172
5005 return err; 5173 return err;
@@ -5016,14 +5184,25 @@ static int md_open(struct block_device *bdev, fmode_t mode)
5016 * Succeed if we can lock the mddev, which confirms that 5184 * Succeed if we can lock the mddev, which confirms that
5017 * it isn't being stopped right now. 5185 * it isn't being stopped right now.
5018 */ 5186 */
5019 mddev_t *mddev = bdev->bd_disk->private_data; 5187 mddev_t *mddev = mddev_find(bdev->bd_dev);
5020 int err; 5188 int err;
5021 5189
5190 if (mddev->gendisk != bdev->bd_disk) {
5191 /* we are racing with mddev_put which is discarding this
5192 * bd_disk.
5193 */
5194 mddev_put(mddev);
5195 /* Wait until bdev->bd_disk is definitely gone */
5196 flush_scheduled_work();
5197 /* Then retry the open from the top */
5198 return -ERESTARTSYS;
5199 }
5200 BUG_ON(mddev != bdev->bd_disk->private_data);
5201
5022 if ((err = mutex_lock_interruptible_nested(&mddev->reconfig_mutex, 1))) 5202 if ((err = mutex_lock_interruptible_nested(&mddev->reconfig_mutex, 1)))
5023 goto out; 5203 goto out;
5024 5204
5025 err = 0; 5205 err = 0;
5026 mddev_get(mddev);
5027 atomic_inc(&mddev->openers); 5206 atomic_inc(&mddev->openers);
5028 mddev_unlock(mddev); 5207 mddev_unlock(mddev);
5029 5208
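
md_open() previously trusted bdev->bd_disk->private_data and only took its reference with mddev_get() after locking; it now obtains the mddev (and the reference) up front through mddev_find(), and can therefore detect the race where the same minor has been re-found while the old gendisk is still being torn down by mddev_delayed_delete(). In that window it flushes the pending work and returns -ERESTARTSYS, which, as the comment says, makes the caller retry the open from the top against the fresh disk.
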
@@ -5187,11 +5366,10 @@ static void status_unused(struct seq_file *seq)
5187{ 5366{
5188 int i = 0; 5367 int i = 0;
5189 mdk_rdev_t *rdev; 5368 mdk_rdev_t *rdev;
5190 struct list_head *tmp;
5191 5369
5192 seq_printf(seq, "unused devices: "); 5370 seq_printf(seq, "unused devices: ");
5193 5371
5194 rdev_for_each_list(rdev, tmp, pending_raid_disks) { 5372 list_for_each_entry(rdev, &pending_raid_disks, same_set) {
5195 char b[BDEVNAME_SIZE]; 5373 char b[BDEVNAME_SIZE];
5196 i++; 5374 i++;
5197 seq_printf(seq, "%s ", 5375 seq_printf(seq, "%s ",
@@ -5350,7 +5528,6 @@ static int md_seq_show(struct seq_file *seq, void *v)
5350{ 5528{
5351 mddev_t *mddev = v; 5529 mddev_t *mddev = v;
5352 sector_t size; 5530 sector_t size;
5353 struct list_head *tmp2;
5354 mdk_rdev_t *rdev; 5531 mdk_rdev_t *rdev;
5355 struct mdstat_info *mi = seq->private; 5532 struct mdstat_info *mi = seq->private;
5356 struct bitmap *bitmap; 5533 struct bitmap *bitmap;
@@ -5387,7 +5564,7 @@ static int md_seq_show(struct seq_file *seq, void *v)
5387 } 5564 }
5388 5565
5389 size = 0; 5566 size = 0;
5390 rdev_for_each(rdev, tmp2, mddev) { 5567 list_for_each_entry(rdev, &mddev->disks, same_set) {
5391 char b[BDEVNAME_SIZE]; 5568 char b[BDEVNAME_SIZE];
5392 seq_printf(seq, " %s[%d]", 5569 seq_printf(seq, " %s[%d]",
5393 bdevname(rdev->bdev,b), rdev->desc_nr); 5570 bdevname(rdev->bdev,b), rdev->desc_nr);
@@ -5694,7 +5871,6 @@ void md_do_sync(mddev_t *mddev)
5694 struct list_head *tmp; 5871 struct list_head *tmp;
5695 sector_t last_check; 5872 sector_t last_check;
5696 int skipped = 0; 5873 int skipped = 0;
5697 struct list_head *rtmp;
5698 mdk_rdev_t *rdev; 5874 mdk_rdev_t *rdev;
5699 char *desc; 5875 char *desc;
5700 5876
@@ -5799,7 +5975,7 @@ void md_do_sync(mddev_t *mddev)
5799 /* recovery follows the physical size of devices */ 5975 /* recovery follows the physical size of devices */
5800 max_sectors = mddev->size << 1; 5976 max_sectors = mddev->size << 1;
5801 j = MaxSector; 5977 j = MaxSector;
5802 rdev_for_each(rdev, rtmp, mddev) 5978 list_for_each_entry(rdev, &mddev->disks, same_set)
5803 if (rdev->raid_disk >= 0 && 5979 if (rdev->raid_disk >= 0 &&
5804 !test_bit(Faulty, &rdev->flags) && 5980 !test_bit(Faulty, &rdev->flags) &&
5805 !test_bit(In_sync, &rdev->flags) && 5981 !test_bit(In_sync, &rdev->flags) &&
@@ -5949,7 +6125,7 @@ void md_do_sync(mddev_t *mddev)
5949 } else { 6125 } else {
5950 if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) 6126 if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery))
5951 mddev->curr_resync = MaxSector; 6127 mddev->curr_resync = MaxSector;
5952 rdev_for_each(rdev, rtmp, mddev) 6128 list_for_each_entry(rdev, &mddev->disks, same_set)
5953 if (rdev->raid_disk >= 0 && 6129 if (rdev->raid_disk >= 0 &&
5954 !test_bit(Faulty, &rdev->flags) && 6130 !test_bit(Faulty, &rdev->flags) &&
5955 !test_bit(In_sync, &rdev->flags) && 6131 !test_bit(In_sync, &rdev->flags) &&
@@ -5985,10 +6161,9 @@ EXPORT_SYMBOL_GPL(md_do_sync);
5985static int remove_and_add_spares(mddev_t *mddev) 6161static int remove_and_add_spares(mddev_t *mddev)
5986{ 6162{
5987 mdk_rdev_t *rdev; 6163 mdk_rdev_t *rdev;
5988 struct list_head *rtmp;
5989 int spares = 0; 6164 int spares = 0;
5990 6165
5991 rdev_for_each(rdev, rtmp, mddev) 6166 list_for_each_entry(rdev, &mddev->disks, same_set)
5992 if (rdev->raid_disk >= 0 && 6167 if (rdev->raid_disk >= 0 &&
5993 !test_bit(Blocked, &rdev->flags) && 6168 !test_bit(Blocked, &rdev->flags) &&
5994 (test_bit(Faulty, &rdev->flags) || 6169 (test_bit(Faulty, &rdev->flags) ||
@@ -6003,8 +6178,8 @@ static int remove_and_add_spares(mddev_t *mddev)
6003 } 6178 }
6004 } 6179 }
6005 6180
6006 if (mddev->degraded && ! mddev->ro) { 6181 if (mddev->degraded && ! mddev->ro && !mddev->recovery_disabled) {
6007 rdev_for_each(rdev, rtmp, mddev) { 6182 list_for_each_entry(rdev, &mddev->disks, same_set) {
6008 if (rdev->raid_disk >= 0 && 6183 if (rdev->raid_disk >= 0 &&
6009 !test_bit(In_sync, &rdev->flags) && 6184 !test_bit(In_sync, &rdev->flags) &&
6010 !test_bit(Blocked, &rdev->flags)) 6185 !test_bit(Blocked, &rdev->flags))
@@ -6056,7 +6231,6 @@ static int remove_and_add_spares(mddev_t *mddev)
6056void md_check_recovery(mddev_t *mddev) 6231void md_check_recovery(mddev_t *mddev)
6057{ 6232{
6058 mdk_rdev_t *rdev; 6233 mdk_rdev_t *rdev;
6059 struct list_head *rtmp;
6060 6234
6061 6235
6062 if (mddev->bitmap) 6236 if (mddev->bitmap)
@@ -6120,7 +6294,7 @@ void md_check_recovery(mddev_t *mddev)
6120 if (mddev->flags) 6294 if (mddev->flags)
6121 md_update_sb(mddev, 0); 6295 md_update_sb(mddev, 0);
6122 6296
6123 rdev_for_each(rdev, rtmp, mddev) 6297 list_for_each_entry(rdev, &mddev->disks, same_set)
6124 if (test_and_clear_bit(StateChanged, &rdev->flags)) 6298 if (test_and_clear_bit(StateChanged, &rdev->flags))
6125 sysfs_notify_dirent(rdev->sysfs_state); 6299 sysfs_notify_dirent(rdev->sysfs_state);
6126 6300
@@ -6149,13 +6323,13 @@ void md_check_recovery(mddev_t *mddev)
6149 * information must be scrapped 6323 * information must be scrapped
6150 */ 6324 */
6151 if (!mddev->degraded) 6325 if (!mddev->degraded)
6152 rdev_for_each(rdev, rtmp, mddev) 6326 list_for_each_entry(rdev, &mddev->disks, same_set)
6153 rdev->saved_raid_disk = -1; 6327 rdev->saved_raid_disk = -1;
6154 6328
6155 mddev->recovery = 0; 6329 mddev->recovery = 0;
6156 /* flag recovery needed just to double check */ 6330 /* flag recovery needed just to double check */
6157 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 6331 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
6158 sysfs_notify(&mddev->kobj, NULL, "sync_action"); 6332 sysfs_notify_dirent(mddev->sysfs_action);
6159 md_new_event(mddev); 6333 md_new_event(mddev);
6160 goto unlock; 6334 goto unlock;
6161 } 6335 }
@@ -6216,7 +6390,7 @@ void md_check_recovery(mddev_t *mddev)
6216 mddev->recovery = 0; 6390 mddev->recovery = 0;
6217 } else 6391 } else
6218 md_wakeup_thread(mddev->sync_thread); 6392 md_wakeup_thread(mddev->sync_thread);
6219 sysfs_notify(&mddev->kobj, NULL, "sync_action"); 6393 sysfs_notify_dirent(mddev->sysfs_action);
6220 md_new_event(mddev); 6394 md_new_event(mddev);
6221 } 6395 }
6222 unlock: 6396 unlock:
@@ -6224,7 +6398,8 @@ void md_check_recovery(mddev_t *mddev)
6224 clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery); 6398 clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
6225 if (test_and_clear_bit(MD_RECOVERY_RECOVER, 6399 if (test_and_clear_bit(MD_RECOVERY_RECOVER,
6226 &mddev->recovery)) 6400 &mddev->recovery))
6227 sysfs_notify(&mddev->kobj, NULL, "sync_action"); 6401 if (mddev->sysfs_action)
6402 sysfs_notify_dirent(mddev->sysfs_action);
6228 } 6403 }
6229 mddev_unlock(mddev); 6404 mddev_unlock(mddev);
6230 } 6405 }
@@ -6386,14 +6561,8 @@ static __exit void md_exit(void)
6386 unregister_sysctl_table(raid_table_header); 6561 unregister_sysctl_table(raid_table_header);
6387 remove_proc_entry("mdstat", NULL); 6562 remove_proc_entry("mdstat", NULL);
6388 for_each_mddev(mddev, tmp) { 6563 for_each_mddev(mddev, tmp) {
6389 struct gendisk *disk = mddev->gendisk;
6390 if (!disk)
6391 continue;
6392 export_array(mddev); 6564 export_array(mddev);
6393 del_gendisk(disk); 6565 mddev->hold_active = 0;
6394 put_disk(disk);
6395 mddev->gendisk = NULL;
6396 mddev_put(mddev);
6397 } 6566 }
6398} 6567}
6399 6568
@@ -6418,6 +6587,7 @@ static int set_ro(const char *val, struct kernel_param *kp)
6418module_param_call(start_ro, set_ro, get_ro, NULL, S_IRUSR|S_IWUSR); 6587module_param_call(start_ro, set_ro, get_ro, NULL, S_IRUSR|S_IWUSR);
6419module_param(start_dirty_degraded, int, S_IRUGO|S_IWUSR); 6588module_param(start_dirty_degraded, int, S_IRUGO|S_IWUSR);
6420 6589
6590module_param_call(new_array, add_named_array, NULL, NULL, S_IWUSR);
6421 6591
6422EXPORT_SYMBOL(register_md_personality); 6592EXPORT_SYMBOL(register_md_personality);
6423EXPORT_SYMBOL(unregister_md_personality); 6593EXPORT_SYMBOL(unregister_md_personality);
diff --git a/drivers/md/multipath.c b/drivers/md/multipath.c
index d4ac47d11279..f6d08f241671 100644
--- a/drivers/md/multipath.c
+++ b/drivers/md/multipath.c
@@ -408,7 +408,6 @@ static int multipath_run (mddev_t *mddev)
408 int disk_idx; 408 int disk_idx;
409 struct multipath_info *disk; 409 struct multipath_info *disk;
410 mdk_rdev_t *rdev; 410 mdk_rdev_t *rdev;
411 struct list_head *tmp;
412 411
413 if (mddev->level != LEVEL_MULTIPATH) { 412 if (mddev->level != LEVEL_MULTIPATH) {
414 printk("multipath: %s: raid level not set to multipath IO (%d)\n", 413 printk("multipath: %s: raid level not set to multipath IO (%d)\n",
@@ -441,7 +440,7 @@ static int multipath_run (mddev_t *mddev)
441 } 440 }
442 441
443 conf->working_disks = 0; 442 conf->working_disks = 0;
444 rdev_for_each(rdev, tmp, mddev) { 443 list_for_each_entry(rdev, &mddev->disks, same_set) {
445 disk_idx = rdev->raid_disk; 444 disk_idx = rdev->raid_disk;
446 if (disk_idx < 0 || 445 if (disk_idx < 0 ||
447 disk_idx >= mddev->raid_disks) 446 disk_idx >= mddev->raid_disks)
diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c
index 8ac6488ad0dc..c605ba805586 100644
--- a/drivers/md/raid0.c
+++ b/drivers/md/raid0.c
@@ -53,11 +53,10 @@ static int raid0_congested(void *data, int bits)
53static int create_strip_zones (mddev_t *mddev) 53static int create_strip_zones (mddev_t *mddev)
54{ 54{
55 int i, c, j; 55 int i, c, j;
56 sector_t current_offset, curr_zone_offset; 56 sector_t current_start, curr_zone_start;
57 sector_t min_spacing; 57 sector_t min_spacing;
58 raid0_conf_t *conf = mddev_to_conf(mddev); 58 raid0_conf_t *conf = mddev_to_conf(mddev);
59 mdk_rdev_t *smallest, *rdev1, *rdev2, *rdev; 59 mdk_rdev_t *smallest, *rdev1, *rdev2, *rdev;
60 struct list_head *tmp1, *tmp2;
61 struct strip_zone *zone; 60 struct strip_zone *zone;
62 int cnt; 61 int cnt;
63 char b[BDEVNAME_SIZE]; 62 char b[BDEVNAME_SIZE];
@@ -67,19 +66,19 @@ static int create_strip_zones (mddev_t *mddev)
67 */ 66 */
68 conf->nr_strip_zones = 0; 67 conf->nr_strip_zones = 0;
69 68
70 rdev_for_each(rdev1, tmp1, mddev) { 69 list_for_each_entry(rdev1, &mddev->disks, same_set) {
71 printk("raid0: looking at %s\n", 70 printk(KERN_INFO "raid0: looking at %s\n",
72 bdevname(rdev1->bdev,b)); 71 bdevname(rdev1->bdev,b));
73 c = 0; 72 c = 0;
74 rdev_for_each(rdev2, tmp2, mddev) { 73 list_for_each_entry(rdev2, &mddev->disks, same_set) {
75 printk("raid0: comparing %s(%llu)", 74 printk(KERN_INFO "raid0: comparing %s(%llu)",
76 bdevname(rdev1->bdev,b), 75 bdevname(rdev1->bdev,b),
77 (unsigned long long)rdev1->size); 76 (unsigned long long)rdev1->size);
78 printk(" with %s(%llu)\n", 77 printk(KERN_INFO " with %s(%llu)\n",
79 bdevname(rdev2->bdev,b), 78 bdevname(rdev2->bdev,b),
80 (unsigned long long)rdev2->size); 79 (unsigned long long)rdev2->size);
81 if (rdev2 == rdev1) { 80 if (rdev2 == rdev1) {
82 printk("raid0: END\n"); 81 printk(KERN_INFO "raid0: END\n");
83 break; 82 break;
84 } 83 }
85 if (rdev2->size == rdev1->size) 84 if (rdev2->size == rdev1->size)
@@ -88,19 +87,20 @@ static int create_strip_zones (mddev_t *mddev)
88 * Not unique, don't count it as a new 87 * Not unique, don't count it as a new
89 * group 88 * group
90 */ 89 */
91 printk("raid0: EQUAL\n"); 90 printk(KERN_INFO "raid0: EQUAL\n");
92 c = 1; 91 c = 1;
93 break; 92 break;
94 } 93 }
95 printk("raid0: NOT EQUAL\n"); 94 printk(KERN_INFO "raid0: NOT EQUAL\n");
96 } 95 }
97 if (!c) { 96 if (!c) {
98 printk("raid0: ==> UNIQUE\n"); 97 printk(KERN_INFO "raid0: ==> UNIQUE\n");
99 conf->nr_strip_zones++; 98 conf->nr_strip_zones++;
100 printk("raid0: %d zones\n", conf->nr_strip_zones); 99 printk(KERN_INFO "raid0: %d zones\n",
100 conf->nr_strip_zones);
101 } 101 }
102 } 102 }
103 printk("raid0: FINAL %d zones\n", conf->nr_strip_zones); 103 printk(KERN_INFO "raid0: FINAL %d zones\n", conf->nr_strip_zones);
104 104
105 conf->strip_zone = kzalloc(sizeof(struct strip_zone)* 105 conf->strip_zone = kzalloc(sizeof(struct strip_zone)*
106 conf->nr_strip_zones, GFP_KERNEL); 106 conf->nr_strip_zones, GFP_KERNEL);
@@ -119,16 +119,17 @@ static int create_strip_zones (mddev_t *mddev)
119 cnt = 0; 119 cnt = 0;
120 smallest = NULL; 120 smallest = NULL;
121 zone->dev = conf->devlist; 121 zone->dev = conf->devlist;
122 rdev_for_each(rdev1, tmp1, mddev) { 122 list_for_each_entry(rdev1, &mddev->disks, same_set) {
123 int j = rdev1->raid_disk; 123 int j = rdev1->raid_disk;
124 124
125 if (j < 0 || j >= mddev->raid_disks) { 125 if (j < 0 || j >= mddev->raid_disks) {
126 printk("raid0: bad disk number %d - aborting!\n", j); 126 printk(KERN_ERR "raid0: bad disk number %d - "
127 "aborting!\n", j);
127 goto abort; 128 goto abort;
128 } 129 }
129 if (zone->dev[j]) { 130 if (zone->dev[j]) {
130 printk("raid0: multiple devices for %d - aborting!\n", 131 printk(KERN_ERR "raid0: multiple devices for %d - "
131 j); 132 "aborting!\n", j);
132 goto abort; 133 goto abort;
133 } 134 }
134 zone->dev[j] = rdev1; 135 zone->dev[j] = rdev1;
@@ -149,16 +150,16 @@ static int create_strip_zones (mddev_t *mddev)
149 cnt++; 150 cnt++;
150 } 151 }
151 if (cnt != mddev->raid_disks) { 152 if (cnt != mddev->raid_disks) {
152 printk("raid0: too few disks (%d of %d) - aborting!\n", 153 printk(KERN_ERR "raid0: too few disks (%d of %d) - "
153 cnt, mddev->raid_disks); 154 "aborting!\n", cnt, mddev->raid_disks);
154 goto abort; 155 goto abort;
155 } 156 }
156 zone->nb_dev = cnt; 157 zone->nb_dev = cnt;
157 zone->size = smallest->size * cnt; 158 zone->sectors = smallest->size * cnt * 2;
158 zone->zone_offset = 0; 159 zone->zone_start = 0;
159 160
160 current_offset = smallest->size; 161 current_start = smallest->size * 2;
161 curr_zone_offset = zone->size; 162 curr_zone_start = zone->sectors;
162 163
163 /* now do the other zones */ 164 /* now do the other zones */
164 for (i = 1; i < conf->nr_strip_zones; i++) 165 for (i = 1; i < conf->nr_strip_zones; i++)
@@ -166,40 +167,41 @@ static int create_strip_zones (mddev_t *mddev)
166 zone = conf->strip_zone + i; 167 zone = conf->strip_zone + i;
167 zone->dev = conf->strip_zone[i-1].dev + mddev->raid_disks; 168 zone->dev = conf->strip_zone[i-1].dev + mddev->raid_disks;
168 169
169 printk("raid0: zone %d\n", i); 170 printk(KERN_INFO "raid0: zone %d\n", i);
170 zone->dev_offset = current_offset; 171 zone->dev_start = current_start;
171 smallest = NULL; 172 smallest = NULL;
172 c = 0; 173 c = 0;
173 174
174 for (j=0; j<cnt; j++) { 175 for (j=0; j<cnt; j++) {
175 char b[BDEVNAME_SIZE]; 176 char b[BDEVNAME_SIZE];
176 rdev = conf->strip_zone[0].dev[j]; 177 rdev = conf->strip_zone[0].dev[j];
177 printk("raid0: checking %s ...", bdevname(rdev->bdev,b)); 178 printk(KERN_INFO "raid0: checking %s ...",
178 if (rdev->size > current_offset) 179 bdevname(rdev->bdev, b));
179 { 180 if (rdev->size > current_start / 2) {
180 printk(" contained as device %d\n", c); 181 printk(KERN_INFO " contained as device %d\n",
182 c);
181 zone->dev[c] = rdev; 183 zone->dev[c] = rdev;
182 c++; 184 c++;
183 if (!smallest || (rdev->size <smallest->size)) { 185 if (!smallest || (rdev->size <smallest->size)) {
184 smallest = rdev; 186 smallest = rdev;
185 printk(" (%llu) is smallest!.\n", 187 printk(KERN_INFO " (%llu) is smallest!.\n",
186 (unsigned long long)rdev->size); 188 (unsigned long long)rdev->size);
187 } 189 }
188 } else 190 } else
189 printk(" nope.\n"); 191 printk(KERN_INFO " nope.\n");
190 } 192 }
191 193
192 zone->nb_dev = c; 194 zone->nb_dev = c;
193 zone->size = (smallest->size - current_offset) * c; 195 zone->sectors = (smallest->size * 2 - current_start) * c;
194 printk("raid0: zone->nb_dev: %d, size: %llu\n", 196 printk(KERN_INFO "raid0: zone->nb_dev: %d, sectors: %llu\n",
195 zone->nb_dev, (unsigned long long)zone->size); 197 zone->nb_dev, (unsigned long long)zone->sectors);
196 198
197 zone->zone_offset = curr_zone_offset; 199 zone->zone_start = curr_zone_start;
198 curr_zone_offset += zone->size; 200 curr_zone_start += zone->sectors;
199 201
200 current_offset = smallest->size; 202 current_start = smallest->size * 2;
201 printk("raid0: current zone offset: %llu\n", 203 printk(KERN_INFO "raid0: current zone start: %llu\n",
202 (unsigned long long)current_offset); 204 (unsigned long long)current_start);
203 } 205 }
204 206
205 /* Now find appropriate hash spacing. 207 /* Now find appropriate hash spacing.
@@ -210,16 +212,16 @@ static int create_strip_zones (mddev_t *mddev)
210 * strip though as its size has no bearing on the efficacy of the hash 212 * strip though as its size has no bearing on the efficacy of the hash
211 * table. 213 * table.
212 */ 214 */
213 conf->hash_spacing = curr_zone_offset; 215 conf->spacing = curr_zone_start;
214 min_spacing = curr_zone_offset; 216 min_spacing = curr_zone_start;
215 sector_div(min_spacing, PAGE_SIZE/sizeof(struct strip_zone*)); 217 sector_div(min_spacing, PAGE_SIZE/sizeof(struct strip_zone*));
216 for (i=0; i < conf->nr_strip_zones-1; i++) { 218 for (i=0; i < conf->nr_strip_zones-1; i++) {
217 sector_t sz = 0; 219 sector_t s = 0;
218 for (j=i; j<conf->nr_strip_zones-1 && 220 for (j = i; j < conf->nr_strip_zones - 1 &&
219 sz < min_spacing ; j++) 221 s < min_spacing; j++)
220 sz += conf->strip_zone[j].size; 222 s += conf->strip_zone[j].sectors;
221 if (sz >= min_spacing && sz < conf->hash_spacing) 223 if (s >= min_spacing && s < conf->spacing)
222 conf->hash_spacing = sz; 224 conf->spacing = s;
223 } 225 }
224 226
225 mddev->queue->unplug_fn = raid0_unplug; 227 mddev->queue->unplug_fn = raid0_unplug;
@@ -227,7 +229,7 @@ static int create_strip_zones (mddev_t *mddev)
227 mddev->queue->backing_dev_info.congested_fn = raid0_congested; 229 mddev->queue->backing_dev_info.congested_fn = raid0_congested;
228 mddev->queue->backing_dev_info.congested_data = mddev; 230 mddev->queue->backing_dev_info.congested_data = mddev;
229 231
230 printk("raid0: done.\n"); 232 printk(KERN_INFO "raid0: done.\n");
231 return 0; 233 return 0;
232 abort: 234 abort:
233 return 1; 235 return 1;
@@ -262,10 +264,9 @@ static int raid0_mergeable_bvec(struct request_queue *q,
262static int raid0_run (mddev_t *mddev) 264static int raid0_run (mddev_t *mddev)
263{ 265{
264 unsigned cur=0, i=0, nb_zone; 266 unsigned cur=0, i=0, nb_zone;
265 s64 size; 267 s64 sectors;
266 raid0_conf_t *conf; 268 raid0_conf_t *conf;
267 mdk_rdev_t *rdev; 269 mdk_rdev_t *rdev;
268 struct list_head *tmp;
269 270
270 if (mddev->chunk_size == 0) { 271 if (mddev->chunk_size == 0) {
271 printk(KERN_ERR "md/raid0: non-zero chunk size required.\n"); 272 printk(KERN_ERR "md/raid0: non-zero chunk size required.\n");
@@ -291,54 +292,54 @@ static int raid0_run (mddev_t *mddev)
291 292
292 /* calculate array device size */ 293 /* calculate array device size */
293 mddev->array_sectors = 0; 294 mddev->array_sectors = 0;
294 rdev_for_each(rdev, tmp, mddev) 295 list_for_each_entry(rdev, &mddev->disks, same_set)
295 mddev->array_sectors += rdev->size * 2; 296 mddev->array_sectors += rdev->size * 2;
296 297
297 printk("raid0 : md_size is %llu blocks.\n", 298 printk(KERN_INFO "raid0 : md_size is %llu sectors.\n",
298 (unsigned long long)mddev->array_sectors / 2); 299 (unsigned long long)mddev->array_sectors);
299 printk("raid0 : conf->hash_spacing is %llu blocks.\n", 300 printk(KERN_INFO "raid0 : conf->spacing is %llu sectors.\n",
300 (unsigned long long)conf->hash_spacing); 301 (unsigned long long)conf->spacing);
301 { 302 {
302 sector_t s = mddev->array_sectors / 2; 303 sector_t s = mddev->array_sectors;
303 sector_t space = conf->hash_spacing; 304 sector_t space = conf->spacing;
304 int round; 305 int round;
305 conf->preshift = 0; 306 conf->sector_shift = 0;
306 if (sizeof(sector_t) > sizeof(u32)) { 307 if (sizeof(sector_t) > sizeof(u32)) {
307 /*shift down space and s so that sector_div will work */ 308 /*shift down space and s so that sector_div will work */
308 while (space > (sector_t) (~(u32)0)) { 309 while (space > (sector_t) (~(u32)0)) {
309 s >>= 1; 310 s >>= 1;
310 space >>= 1; 311 space >>= 1;
311 s += 1; /* force round-up */ 312 s += 1; /* force round-up */
312 conf->preshift++; 313 conf->sector_shift++;
313 } 314 }
314 } 315 }
315 round = sector_div(s, (u32)space) ? 1 : 0; 316 round = sector_div(s, (u32)space) ? 1 : 0;
316 nb_zone = s + round; 317 nb_zone = s + round;
317 } 318 }
318 printk("raid0 : nb_zone is %d.\n", nb_zone); 319 printk(KERN_INFO "raid0 : nb_zone is %d.\n", nb_zone);
319 320
320 printk("raid0 : Allocating %Zd bytes for hash.\n", 321 printk(KERN_INFO "raid0 : Allocating %zu bytes for hash.\n",
321 nb_zone*sizeof(struct strip_zone*)); 322 nb_zone*sizeof(struct strip_zone*));
322 conf->hash_table = kmalloc (sizeof (struct strip_zone *)*nb_zone, GFP_KERNEL); 323 conf->hash_table = kmalloc (sizeof (struct strip_zone *)*nb_zone, GFP_KERNEL);
323 if (!conf->hash_table) 324 if (!conf->hash_table)
324 goto out_free_conf; 325 goto out_free_conf;
325 size = conf->strip_zone[cur].size; 326 sectors = conf->strip_zone[cur].sectors;
326 327
327 conf->hash_table[0] = conf->strip_zone + cur; 328 conf->hash_table[0] = conf->strip_zone + cur;
328 for (i=1; i< nb_zone; i++) { 329 for (i=1; i< nb_zone; i++) {
329 while (size <= conf->hash_spacing) { 330 while (sectors <= conf->spacing) {
330 cur++; 331 cur++;
331 size += conf->strip_zone[cur].size; 332 sectors += conf->strip_zone[cur].sectors;
332 } 333 }
333 size -= conf->hash_spacing; 334 sectors -= conf->spacing;
334 conf->hash_table[i] = conf->strip_zone + cur; 335 conf->hash_table[i] = conf->strip_zone + cur;
335 } 336 }
336 if (conf->preshift) { 337 if (conf->sector_shift) {
337 conf->hash_spacing >>= conf->preshift; 338 conf->spacing >>= conf->sector_shift;
338 /* round hash_spacing up so when we divide by it, we 339 /* round spacing up so when we divide by it, we
339 * err on the side of too-low, which is safest 340 * err on the side of too-low, which is safest
340 */ 341 */
341 conf->hash_spacing++; 342 conf->spacing++;
342 } 343 }
343 344
344 /* calculate the max read-ahead size. 345 /* calculate the max read-ahead size.
@@ -387,12 +388,12 @@ static int raid0_stop (mddev_t *mddev)
387static int raid0_make_request (struct request_queue *q, struct bio *bio) 388static int raid0_make_request (struct request_queue *q, struct bio *bio)
388{ 389{
389 mddev_t *mddev = q->queuedata; 390 mddev_t *mddev = q->queuedata;
390 unsigned int sect_in_chunk, chunksize_bits, chunk_size, chunk_sects; 391 unsigned int sect_in_chunk, chunksect_bits, chunk_sects;
391 raid0_conf_t *conf = mddev_to_conf(mddev); 392 raid0_conf_t *conf = mddev_to_conf(mddev);
392 struct strip_zone *zone; 393 struct strip_zone *zone;
393 mdk_rdev_t *tmp_dev; 394 mdk_rdev_t *tmp_dev;
394 sector_t chunk; 395 sector_t chunk;
395 sector_t block, rsect; 396 sector_t sector, rsect;
396 const int rw = bio_data_dir(bio); 397 const int rw = bio_data_dir(bio);
397 int cpu; 398 int cpu;
398 399
@@ -407,11 +408,9 @@ static int raid0_make_request (struct request_queue *q, struct bio *bio)
407 bio_sectors(bio)); 408 bio_sectors(bio));
408 part_stat_unlock(); 409 part_stat_unlock();
409 410
410 chunk_size = mddev->chunk_size >> 10;
411 chunk_sects = mddev->chunk_size >> 9; 411 chunk_sects = mddev->chunk_size >> 9;
412 chunksize_bits = ffz(~chunk_size); 412 chunksect_bits = ffz(~chunk_sects);
413 block = bio->bi_sector >> 1; 413 sector = bio->bi_sector;
414
415 414
416 if (unlikely(chunk_sects < (bio->bi_sector & (chunk_sects - 1)) + (bio->bi_size >> 9))) { 415 if (unlikely(chunk_sects < (bio->bi_sector & (chunk_sects - 1)) + (bio->bi_size >> 9))) {
417 struct bio_pair *bp; 416 struct bio_pair *bp;
@@ -434,28 +433,27 @@ static int raid0_make_request (struct request_queue *q, struct bio *bio)
434 433
435 434
436 { 435 {
437 sector_t x = block >> conf->preshift; 436 sector_t x = sector >> conf->sector_shift;
438 sector_div(x, (u32)conf->hash_spacing); 437 sector_div(x, (u32)conf->spacing);
439 zone = conf->hash_table[x]; 438 zone = conf->hash_table[x];
440 } 439 }
441 440
442 while (block >= (zone->zone_offset + zone->size)) 441 while (sector >= zone->zone_start + zone->sectors)
443 zone++; 442 zone++;
444 443
445 sect_in_chunk = bio->bi_sector & ((chunk_size<<1) -1); 444 sect_in_chunk = bio->bi_sector & (chunk_sects - 1);
446 445
447 446
448 { 447 {
449 sector_t x = (block - zone->zone_offset) >> chunksize_bits; 448 sector_t x = (sector - zone->zone_start) >> chunksect_bits;
450 449
451 sector_div(x, zone->nb_dev); 450 sector_div(x, zone->nb_dev);
452 chunk = x; 451 chunk = x;
453 452
454 x = block >> chunksize_bits; 453 x = sector >> chunksect_bits;
455 tmp_dev = zone->dev[sector_div(x, zone->nb_dev)]; 454 tmp_dev = zone->dev[sector_div(x, zone->nb_dev)];
456 } 455 }
457 rsect = (((chunk << chunksize_bits) + zone->dev_offset)<<1) 456 rsect = (chunk << chunksect_bits) + zone->dev_start + sect_in_chunk;
458 + sect_in_chunk;
459 457
460 bio->bi_bdev = tmp_dev->bdev; 458 bio->bi_bdev = tmp_dev->bdev;
461 bio->bi_sector = rsect + tmp_dev->data_offset; 459 bio->bi_sector = rsect + tmp_dev->data_offset;
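
With the 1 KiB block variable gone, the remap in raid0_make_request() runs entirely in sectors: hash to a zone, locate the chunk and the member device, then compute the member-relative sector. A compilable userspace sketch of that arithmetic for a single zone (the names mirror the kernel code, but zone_start and dev_start are fixed at zero, chunk_sects is assumed a power of two, and plain C division replaces sector_div(), so this is an illustration rather than the driver logic verbatim):

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
            uint64_t sector = 10000;     /* logical array sector */
            unsigned chunk_sects = 128;  /* 64 KiB chunk, in sectors */
            unsigned chunksect_bits = __builtin_ctz(chunk_sects);
            unsigned nb_dev = 3;         /* devices in this zone */
            uint64_t zone_start = 0, dev_start = 0;

            /* offset within the chunk */
            unsigned sect_in_chunk = sector & (chunk_sects - 1);
            /* which stripe-chunk on the target device */
            uint64_t chunk = ((sector - zone_start) >> chunksect_bits) / nb_dev;
            /* which member device the chunk lands on */
            unsigned dev = (unsigned)((sector >> chunksect_bits) % nb_dev);
            /* member-relative sector, as assigned to bio->bi_sector */
            uint64_t rsect = (chunk << chunksect_bits) + dev_start + sect_in_chunk;

            printf("array sector %llu -> device %u, sector %llu\n",
                   (unsigned long long)sector, dev,
                   (unsigned long long)rsect);
            return 0;
    }
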
@@ -467,7 +465,7 @@ static int raid0_make_request (struct request_queue *q, struct bio *bio)
467 465
468bad_map: 466bad_map:
469 printk("raid0_make_request bug: can't convert block across chunks" 467 printk("raid0_make_request bug: can't convert block across chunks"
470 " or bigger than %dk %llu %d\n", chunk_size, 468 " or bigger than %dk %llu %d\n", chunk_sects / 2,
471 (unsigned long long)bio->bi_sector, bio->bi_size >> 10); 469 (unsigned long long)bio->bi_sector, bio->bi_size >> 10);
472 470
473 bio_io_error(bio); 471 bio_io_error(bio);
@@ -492,10 +490,10 @@ static void raid0_status (struct seq_file *seq, mddev_t *mddev)
492 seq_printf(seq, "%s/", bdevname( 490 seq_printf(seq, "%s/", bdevname(
493 conf->strip_zone[j].dev[k]->bdev,b)); 491 conf->strip_zone[j].dev[k]->bdev,b));
494 492
495 seq_printf(seq, "] zo=%d do=%d s=%d\n", 493 seq_printf(seq, "] zs=%d ds=%d s=%d\n",
496 conf->strip_zone[j].zone_offset, 494 conf->strip_zone[j].zone_start,
497 conf->strip_zone[j].dev_offset, 495 conf->strip_zone[j].dev_start,
498 conf->strip_zone[j].size); 496 conf->strip_zone[j].sectors);
499 } 497 }
500#endif 498#endif
501 seq_printf(seq, " %dk chunks", mddev->chunk_size/1024); 499 seq_printf(seq, " %dk chunks", mddev->chunk_size/1024);
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 9c788e2489b1..7b4f5f7155d8 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -1016,12 +1016,16 @@ static void error(mddev_t *mddev, mdk_rdev_t *rdev)
1016 * else mark the drive as failed 1016 * else mark the drive as failed
1017 */ 1017 */
1018 if (test_bit(In_sync, &rdev->flags) 1018 if (test_bit(In_sync, &rdev->flags)
1019 && (conf->raid_disks - mddev->degraded) == 1) 1019 && (conf->raid_disks - mddev->degraded) == 1) {
1020 /* 1020 /*
1021 * Don't fail the drive, act as though we were just a 1021 * Don't fail the drive, act as though we were just a
1022 * normal single drive 1022 * normal single drive.
1023 * However don't try a recovery from this drive as
1024 * it is very likely to fail.
1023 */ 1025 */
1026 mddev->recovery_disabled = 1;
1024 return; 1027 return;
1028 }
1025 if (test_and_clear_bit(In_sync, &rdev->flags)) { 1029 if (test_and_clear_bit(In_sync, &rdev->flags)) {
1026 unsigned long flags; 1030 unsigned long flags;
1027 spin_lock_irqsave(&conf->device_lock, flags); 1031 spin_lock_irqsave(&conf->device_lock, flags);
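
This is the consumer of the mddev->recovery_disabled flag tested in the md.c hunk of remove_and_add_spares() earlier in this patch: when the failing drive is the last working leg of the mirror, raid1 keeps running on it as if it were a plain disk, and the flag stops md from immediately starting a recovery onto a spare, since a full read of the suspect drive would very likely fail the same way.
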
@@ -1919,7 +1923,6 @@ static int run(mddev_t *mddev)
1919 int i, j, disk_idx; 1923 int i, j, disk_idx;
1920 mirror_info_t *disk; 1924 mirror_info_t *disk;
1921 mdk_rdev_t *rdev; 1925 mdk_rdev_t *rdev;
1922 struct list_head *tmp;
1923 1926
1924 if (mddev->level != 1) { 1927 if (mddev->level != 1) {
1925 printk("raid1: %s: raid level not set to mirroring (%d)\n", 1928 printk("raid1: %s: raid level not set to mirroring (%d)\n",
@@ -1964,7 +1967,7 @@ static int run(mddev_t *mddev)
1964 spin_lock_init(&conf->device_lock); 1967 spin_lock_init(&conf->device_lock);
1965 mddev->queue->queue_lock = &conf->device_lock; 1968 mddev->queue->queue_lock = &conf->device_lock;
1966 1969
1967 rdev_for_each(rdev, tmp, mddev) { 1970 list_for_each_entry(rdev, &mddev->disks, same_set) {
1968 disk_idx = rdev->raid_disk; 1971 disk_idx = rdev->raid_disk;
1969 if (disk_idx >= mddev->raid_disks 1972 if (disk_idx >= mddev->raid_disks
1970 || disk_idx < 0) 1973 || disk_idx < 0)
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 970a96ef9b18..6736d6dff981 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -2025,7 +2025,6 @@ static int run(mddev_t *mddev)
2025 int i, disk_idx; 2025 int i, disk_idx;
2026 mirror_info_t *disk; 2026 mirror_info_t *disk;
2027 mdk_rdev_t *rdev; 2027 mdk_rdev_t *rdev;
2028 struct list_head *tmp;
2029 int nc, fc, fo; 2028 int nc, fc, fo;
2030 sector_t stride, size; 2029 sector_t stride, size;
2031 2030
@@ -2108,7 +2107,7 @@ static int run(mddev_t *mddev)
2108 spin_lock_init(&conf->device_lock); 2107 spin_lock_init(&conf->device_lock);
2109 mddev->queue->queue_lock = &conf->device_lock; 2108 mddev->queue->queue_lock = &conf->device_lock;
2110 2109
2111 rdev_for_each(rdev, tmp, mddev) { 2110 list_for_each_entry(rdev, &mddev->disks, same_set) {
2112 disk_idx = rdev->raid_disk; 2111 disk_idx = rdev->raid_disk;
2113 if (disk_idx >= mddev->raid_disks 2112 if (disk_idx >= mddev->raid_disks
2114 || disk_idx < 0) 2113 || disk_idx < 0)
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index a36a7435edf5..a5ba080d303b 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -3998,7 +3998,6 @@ static int run(mddev_t *mddev)
3998 int raid_disk, memory; 3998 int raid_disk, memory;
3999 mdk_rdev_t *rdev; 3999 mdk_rdev_t *rdev;
4000 struct disk_info *disk; 4000 struct disk_info *disk;
4001 struct list_head *tmp;
4002 int working_disks = 0; 4001 int working_disks = 0;
4003 4002
4004 if (mddev->level != 5 && mddev->level != 4 && mddev->level != 6) { 4003 if (mddev->level != 5 && mddev->level != 4 && mddev->level != 6) {
@@ -4108,7 +4107,7 @@ static int run(mddev_t *mddev)
4108 4107
4109 pr_debug("raid5: run(%s) called.\n", mdname(mddev)); 4108 pr_debug("raid5: run(%s) called.\n", mdname(mddev));
4110 4109
4111 rdev_for_each(rdev, tmp, mddev) { 4110 list_for_each_entry(rdev, &mddev->disks, same_set) {
4112 raid_disk = rdev->raid_disk; 4111 raid_disk = rdev->raid_disk;
4113 if (raid_disk >= conf->raid_disks 4112 if (raid_disk >= conf->raid_disks
4114 || raid_disk < 0) 4113 || raid_disk < 0)
@@ -4533,7 +4532,6 @@ static int raid5_start_reshape(mddev_t *mddev)
4533{ 4532{
4534 raid5_conf_t *conf = mddev_to_conf(mddev); 4533 raid5_conf_t *conf = mddev_to_conf(mddev);
4535 mdk_rdev_t *rdev; 4534 mdk_rdev_t *rdev;
4536 struct list_head *rtmp;
4537 int spares = 0; 4535 int spares = 0;
4538 int added_devices = 0; 4536 int added_devices = 0;
4539 unsigned long flags; 4537 unsigned long flags;
@@ -4541,7 +4539,7 @@ static int raid5_start_reshape(mddev_t *mddev)
4541 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) 4539 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
4542 return -EBUSY; 4540 return -EBUSY;
4543 4541
4544 rdev_for_each(rdev, rtmp, mddev) 4542 list_for_each_entry(rdev, &mddev->disks, same_set)
4545 if (rdev->raid_disk < 0 && 4543 if (rdev->raid_disk < 0 &&
4546 !test_bit(Faulty, &rdev->flags)) 4544 !test_bit(Faulty, &rdev->flags))
4547 spares++; 4545 spares++;
@@ -4563,7 +4561,7 @@ static int raid5_start_reshape(mddev_t *mddev)
4563 /* Add some new drives, as many as will fit. 4561 /* Add some new drives, as many as will fit.
4564 * We know there are enough to make the newly sized array work. 4562 * We know there are enough to make the newly sized array work.
4565 */ 4563 */
4566 rdev_for_each(rdev, rtmp, mddev) 4564 list_for_each_entry(rdev, &mddev->disks, same_set)
4567 if (rdev->raid_disk < 0 && 4565 if (rdev->raid_disk < 0 &&
4568 !test_bit(Faulty, &rdev->flags)) { 4566 !test_bit(Faulty, &rdev->flags)) {
4569 if (raid5_add_disk(mddev, rdev) == 0) { 4567 if (raid5_add_disk(mddev, rdev) == 0) {