author		Paul Mackerras <paulus@samba.org>	2006-03-28 21:24:50 -0500
committer	Paul Mackerras <paulus@samba.org>	2006-03-28 21:24:50 -0500
commit		bac30d1a78d0f11c613968fc8b351a91ed465386 (patch)
tree		e52f3c876522a2f6047a6ec1c27df2e8a79486b8 /drivers/md
parent		e8222502ee6157e2713da9e0792c21f4ad458d50 (diff)
parent		ca9ba4471c1203bb6e759b76e83167fec54fe590 (diff)
Merge ../linux-2.6
Diffstat (limited to 'drivers/md')
-rw-r--r--	drivers/md/Kconfig	26
-rw-r--r--	drivers/md/dm-crypt.c	11
-rw-r--r--	drivers/md/dm-ioctl.c	100
-rw-r--r--	drivers/md/dm-linear.c	8
-rw-r--r--	drivers/md/dm-raid1.c	29
-rw-r--r--	drivers/md/dm-snap.c	409
-rw-r--r--	drivers/md/dm-stripe.c	13
-rw-r--r--	drivers/md/dm-table.c	53
-rw-r--r--	drivers/md/dm.c	88
-rw-r--r--	drivers/md/dm.h	23
-rw-r--r--	drivers/md/kcopyd.c	28
-rw-r--r--	drivers/md/md.c	235
-rw-r--r--	drivers/md/raid1.c	27
-rw-r--r--	drivers/md/raid5.c	719
-rw-r--r--	drivers/md/raid6main.c	14
15 files changed, 1423 insertions, 360 deletions
diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig
index ac43f98062fd..fd2aae150ccc 100644
--- a/drivers/md/Kconfig
+++ b/drivers/md/Kconfig
@@ -127,6 +127,32 @@ config MD_RAID5
 
 	  If unsure, say Y.
 
+config MD_RAID5_RESHAPE
+	bool "Support adding drives to a raid-5 array (experimental)"
+	depends on MD_RAID5 && EXPERIMENTAL
+	---help---
+	  A RAID-5 set can be expanded by adding extra drives. This
+	  requires "restriping" the array which means (almost) every
+	  block must be written to a different place.
+
+	  This option allows such restriping to be done while the array
+	  is online. However it is still EXPERIMENTAL code. It should
+	  work, but please be sure that you have backups.
+
+	  You will need a version of mdadm newer than 2.3.1. During the
+	  early stage of reshape there is a critical section where live data
+	  is being over-written. A crash during this time needs extra care
+	  for recovery. The newer mdadm takes a copy of the data in the
+	  critical section and will restore it, if necessary, after a crash.
+
+	  The mdadm usage is e.g.
+	       mdadm --grow /dev/md1 --raid-disks=6
+	  to grow '/dev/md1' to having 6 disks.
+
+	  Note: The array can only be expanded, not contracted.
+	  There should be enough spares already present to make the new
+	  array workable.
+
 config MD_RAID6
 	tristate "RAID-6 mode"
 	depends on BLK_DEV_MD
diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index 259e86f26549..61a590bb6241 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -518,6 +518,7 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 	char *ivopts;
 	unsigned int crypto_flags;
 	unsigned int key_size;
+	unsigned long long tmpll;
 
 	if (argc != 5) {
 		ti->error = PFX "Not enough arguments";
@@ -633,15 +634,17 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 		goto bad5;
 	}
 
-	if (sscanf(argv[2], SECTOR_FORMAT, &cc->iv_offset) != 1) {
+	if (sscanf(argv[2], "%llu", &tmpll) != 1) {
 		ti->error = PFX "Invalid iv_offset sector";
 		goto bad5;
 	}
+	cc->iv_offset = tmpll;
 
-	if (sscanf(argv[4], SECTOR_FORMAT, &cc->start) != 1) {
+	if (sscanf(argv[4], "%llu", &tmpll) != 1) {
 		ti->error = PFX "Invalid device sector";
 		goto bad5;
 	}
+	cc->start = tmpll;
 
 	if (dm_get_device(ti, argv[3], cc->start, ti->len,
 			  dm_table_get_mode(ti->table), &cc->dev)) {
@@ -885,8 +888,8 @@ static int crypt_status(struct dm_target *ti, status_type_t type,
 			result[sz++] = '-';
 		}
 
-		DMEMIT(" " SECTOR_FORMAT " %s " SECTOR_FORMAT,
-		       cc->iv_offset, cc->dev->name, cc->start);
+		DMEMIT(" %llu %s %llu", (unsigned long long)cc->iv_offset,
+		       cc->dev->name, (unsigned long long)cc->start);
 		break;
 	}
 	return 0;
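The dm-crypt, dm-linear, dm-raid1 and dm-stripe hunks in this merge all apply the same idiom: sector_t changes width with CONFIG_LBD, so rather than a conditional SECTOR_FORMAT the constructors scan into an unsigned long long temporary with a fixed "%llu" and then assign, and status output casts back to unsigned long long. A minimal userspace sketch of the idiom, with illustrative names not taken from the patch:

#include <stdio.h>

/* Stand-in for sector_t, which may be 32 or 64 bits wide. */
typedef unsigned long my_sector_t;

/* Parse a sector value with a fixed format, then narrow on assignment. */
static int parse_sector(const char *arg, my_sector_t *out)
{
	unsigned long long tmp;

	if (sscanf(arg, "%llu", &tmp) != 1)
		return -1;	/* not a valid number */

	*out = tmp;		/* implicit narrowing, as in the patch */
	return 0;
}

int main(void)
{
	my_sector_t start;

	if (parse_sector("409600", &start) == 0)
		/* printing goes through the matching cast, as DMEMIT does */
		printf("start=%llu\n", (unsigned long long)start);
	return 0;
}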
diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c
index 442e2be6052e..8edd6435414d 100644
--- a/drivers/md/dm-ioctl.c
+++ b/drivers/md/dm-ioctl.c
@@ -15,6 +15,7 @@
 #include <linux/slab.h>
 #include <linux/devfs_fs_kernel.h>
 #include <linux/dm-ioctl.h>
+#include <linux/hdreg.h>
 
 #include <asm/uaccess.h>
 
@@ -244,9 +245,9 @@ static void __hash_remove(struct hash_cell *hc)
 		dm_table_put(table);
 	}
 
-	dm_put(hc->md);
 	if (hc->new_map)
 		dm_table_put(hc->new_map);
+	dm_put(hc->md);
 	free_cell(hc);
 }
 
@@ -600,12 +601,22 @@ static int dev_create(struct dm_ioctl *param, size_t param_size)
  */
 static struct hash_cell *__find_device_hash_cell(struct dm_ioctl *param)
 {
+	struct mapped_device *md;
+	void *mdptr = NULL;
+
 	if (*param->uuid)
 		return __get_uuid_cell(param->uuid);
-	else if (*param->name)
+
+	if (*param->name)
 		return __get_name_cell(param->name);
-	else
-		return dm_get_mdptr(huge_decode_dev(param->dev));
+
+	md = dm_get_md(huge_decode_dev(param->dev));
+	if (md) {
+		mdptr = dm_get_mdptr(md);
+		dm_put(md);
+	}
+
+	return mdptr;
 }
 
 static struct mapped_device *find_device(struct dm_ioctl *param)
@@ -690,6 +701,54 @@ static int dev_rename(struct dm_ioctl *param, size_t param_size)
 	return dm_hash_rename(param->name, new_name);
 }
 
+static int dev_set_geometry(struct dm_ioctl *param, size_t param_size)
+{
+	int r = -EINVAL, x;
+	struct mapped_device *md;
+	struct hd_geometry geometry;
+	unsigned long indata[4];
+	char *geostr = (char *) param + param->data_start;
+
+	md = find_device(param);
+	if (!md)
+		return -ENXIO;
+
+	if (geostr < (char *) (param + 1) ||
+	    invalid_str(geostr, (void *) param + param_size)) {
+		DMWARN("Invalid geometry supplied.");
+		goto out;
+	}
+
+	x = sscanf(geostr, "%lu %lu %lu %lu", indata,
+		   indata + 1, indata + 2, indata + 3);
+
+	if (x != 4) {
+		DMWARN("Unable to interpret geometry settings.");
+		goto out;
+	}
+
+	if (indata[0] > 65535 || indata[1] > 255 ||
+	    indata[2] > 255 || indata[3] > ULONG_MAX) {
+		DMWARN("Geometry exceeds range limits.");
+		goto out;
+	}
+
+	geometry.cylinders = indata[0];
+	geometry.heads = indata[1];
+	geometry.sectors = indata[2];
+	geometry.start = indata[3];
+
+	r = dm_set_geometry(md, &geometry);
+	if (!r)
+		r = __dev_status(md, param);
+
+	param->data_size = 0;
+
+out:
+	dm_put(md);
+	return r;
+}
+
 static int do_suspend(struct dm_ioctl *param)
 {
 	int r = 0;
@@ -975,33 +1034,43 @@ static int table_load(struct dm_ioctl *param, size_t param_size)
 	int r;
 	struct hash_cell *hc;
 	struct dm_table *t;
+	struct mapped_device *md;
 
-	r = dm_table_create(&t, get_mode(param), param->target_count);
+	md = find_device(param);
+	if (!md)
+		return -ENXIO;
+
+	r = dm_table_create(&t, get_mode(param), param->target_count, md);
 	if (r)
-		return r;
+		goto out;
 
 	r = populate_table(t, param, param_size);
 	if (r) {
 		dm_table_put(t);
-		return r;
+		goto out;
 	}
 
 	down_write(&_hash_lock);
-	hc = __find_device_hash_cell(param);
-	if (!hc) {
-		DMWARN("device doesn't appear to be in the dev hash table.");
-		up_write(&_hash_lock);
+	hc = dm_get_mdptr(md);
+	if (!hc || hc->md != md) {
+		DMWARN("device has been removed from the dev hash table.");
 		dm_table_put(t);
-		return -ENXIO;
+		up_write(&_hash_lock);
+		r = -ENXIO;
+		goto out;
 	}
 
 	if (hc->new_map)
 		dm_table_put(hc->new_map);
 	hc->new_map = t;
+	up_write(&_hash_lock);
+
 	param->flags |= DM_INACTIVE_PRESENT_FLAG;
+	r = __dev_status(md, param);
+
+out:
+	dm_put(md);
 
-	r = __dev_status(hc->md, param);
-	up_write(&_hash_lock);
 	return r;
 }
 
@@ -1214,7 +1283,8 @@ static ioctl_fn lookup_ioctl(unsigned int cmd)
 
 		{DM_LIST_VERSIONS_CMD, list_versions},
 
-		{DM_TARGET_MSG_CMD, target_message}
+		{DM_TARGET_MSG_CMD, target_message},
+		{DM_DEV_SET_GEOMETRY_CMD, dev_set_geometry}
 	};
 
 	return (cmd >= ARRAY_SIZE(_ioctls)) ? NULL : _ioctls[cmd].fn;
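dev_set_geometry() above expects the geometry as one whitespace-separated string, "cylinders heads sectors start". A standalone sketch of the same parsing and validation logic, with the bounds taken from the patch (note the original's indata[3] > ULONG_MAX comparison can never trigger on a value already held in an unsigned long, so it is dropped here):

#include <stdio.h>

struct geometry {
	unsigned long cylinders, heads, sectors, start;
};

/* Parse "cylinders heads sectors start", as DM_DEV_SET_GEOMETRY does. */
static int parse_geometry(const char *geostr, struct geometry *g)
{
	unsigned long indata[4];

	if (sscanf(geostr, "%lu %lu %lu %lu",
		   indata, indata + 1, indata + 2, indata + 3) != 4)
		return -1;	/* unable to interpret the settings */

	if (indata[0] > 65535 || indata[1] > 255 || indata[2] > 255)
		return -1;	/* exceeds range limits */

	g->cylinders = indata[0];
	g->heads = indata[1];
	g->sectors = indata[2];
	g->start = indata[3];
	return 0;
}

int main(void)
{
	struct geometry g;

	if (parse_geometry("1024 255 63 0", &g) == 0)
		printf("capacity = %lu sectors\n",
		       g.cylinders * g.heads * g.sectors);
	return 0;
}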
diff --git a/drivers/md/dm-linear.c b/drivers/md/dm-linear.c
index 6a2cd5dc8a63..daf586c0898d 100644
--- a/drivers/md/dm-linear.c
+++ b/drivers/md/dm-linear.c
@@ -26,6 +26,7 @@ struct linear_c {
 static int linear_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 {
 	struct linear_c *lc;
+	unsigned long long tmp;
 
 	if (argc != 2) {
 		ti->error = "dm-linear: Invalid argument count";
@@ -38,10 +39,11 @@ static int linear_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 		return -ENOMEM;
 	}
 
-	if (sscanf(argv[1], SECTOR_FORMAT, &lc->start) != 1) {
+	if (sscanf(argv[1], "%llu", &tmp) != 1) {
 		ti->error = "dm-linear: Invalid device sector";
 		goto bad;
 	}
+	lc->start = tmp;
 
 	if (dm_get_device(ti, argv[0], lc->start, ti->len,
 			  dm_table_get_mode(ti->table), &lc->dev)) {
@@ -87,8 +89,8 @@ static int linear_status(struct dm_target *ti, status_type_t type,
 		break;
 
 	case STATUSTYPE_TABLE:
-		snprintf(result, maxlen, "%s " SECTOR_FORMAT, lc->dev->name,
-			 lc->start);
+		snprintf(result, maxlen, "%s %llu", lc->dev->name,
+			 (unsigned long long)lc->start);
 		break;
 	}
 	return 0;
diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c
index 4e90f231fbfb..d12cf3e5e076 100644
--- a/drivers/md/dm-raid1.c
+++ b/drivers/md/dm-raid1.c
@@ -402,9 +402,21 @@ static void rh_dec(struct region_hash *rh, region_t region)
 
 	spin_lock_irqsave(&rh->region_lock, flags);
 	if (atomic_dec_and_test(&reg->pending)) {
+		/*
+		 * There is no pending I/O for this region.
+		 * We can move the region to corresponding list for next action.
+		 * At this point, the region is not yet connected to any list.
+		 *
+		 * If the state is RH_NOSYNC, the region should be kept off
+		 * from clean list.
+		 * The hash entry for RH_NOSYNC will remain in memory
+		 * until the region is recovered or the map is reloaded.
+		 */
+
+		/* do nothing for RH_NOSYNC */
 		if (reg->state == RH_RECOVERING) {
 			list_add_tail(&reg->list, &rh->quiesced_regions);
-		} else {
+		} else if (reg->state == RH_DIRTY) {
 			reg->state = RH_CLEAN;
 			list_add(&reg->list, &rh->clean_regions);
 		}
@@ -922,9 +934,9 @@ static inline int _check_region_size(struct dm_target *ti, uint32_t size)
 static int get_mirror(struct mirror_set *ms, struct dm_target *ti,
 		      unsigned int mirror, char **argv)
 {
-	sector_t offset;
+	unsigned long long offset;
 
-	if (sscanf(argv[1], SECTOR_FORMAT, &offset) != 1) {
+	if (sscanf(argv[1], "%llu", &offset) != 1) {
 		ti->error = "dm-mirror: Invalid offset";
 		return -EINVAL;
 	}
@@ -1191,16 +1203,17 @@ static int mirror_status(struct dm_target *ti, status_type_t type,
 		for (m = 0; m < ms->nr_mirrors; m++)
 			DMEMIT("%s ", ms->mirror[m].dev->name);
 
-		DMEMIT(SECTOR_FORMAT "/" SECTOR_FORMAT,
-		       ms->rh.log->type->get_sync_count(ms->rh.log),
-		       ms->nr_regions);
+		DMEMIT("%llu/%llu",
+		       (unsigned long long)ms->rh.log->type->
+				get_sync_count(ms->rh.log),
+		       (unsigned long long)ms->nr_regions);
 		break;
 
 	case STATUSTYPE_TABLE:
 		DMEMIT("%d ", ms->nr_mirrors);
 		for (m = 0; m < ms->nr_mirrors; m++)
-			DMEMIT("%s " SECTOR_FORMAT " ",
-			       ms->mirror[m].dev->name, ms->mirror[m].offset);
+			DMEMIT("%s %llu ", ms->mirror[m].dev->name,
+			       (unsigned long long)ms->mirror[m].offset);
 	}
 
 	return 0;
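The rh_dec() change above narrows a catch-all else: only RH_DIRTY regions move to the clean list, while RH_NOSYNC regions now stay off every list until the region is recovered or the table is reloaded. A self-contained sketch of that dispatch, with the RH_* names from the patch but simplified types and illustrative list names:

#include <stdio.h>

enum region_state { RH_CLEAN, RH_DIRTY, RH_NOSYNC, RH_RECOVERING };

/* Decide where a region goes once its last pending I/O completes. */
static const char *next_list(enum region_state state)
{
	if (state == RH_RECOVERING)
		return "quiesced_regions";
	else if (state == RH_DIRTY)
		return "clean_regions";	/* and its state becomes RH_CLEAN */

	/* RH_NOSYNC: deliberately left off all lists */
	return "none";
}

int main(void)
{
	printf("RH_NOSYNC -> %s\n", next_list(RH_NOSYNC));
	printf("RH_DIRTY  -> %s\n", next_list(RH_DIRTY));
	return 0;
}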
diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c
index 7401540086df..08312b46463a 100644
--- a/drivers/md/dm-snap.c
+++ b/drivers/md/dm-snap.c
@@ -49,11 +49,26 @@ struct pending_exception {
 	struct bio_list snapshot_bios;
 
 	/*
-	 * Other pending_exceptions that are processing this
-	 * chunk.  When this list is empty, we know we can
-	 * complete the origins.
+	 * Short-term queue of pending exceptions prior to submission.
 	 */
-	struct list_head siblings;
+	struct list_head list;
+
+	/*
+	 * The primary pending_exception is the one that holds
+	 * the sibling_count and the list of origin_bios for a
+	 * group of pending_exceptions.  It is always last to get freed.
+	 * These fields get set up when writing to the origin.
+	 */
+	struct pending_exception *primary_pe;
+
+	/*
+	 * Number of pending_exceptions processing this chunk.
+	 * When this drops to zero we must complete the origin bios.
+	 * If incrementing or decrementing this, hold pe->snap->lock for
+	 * the sibling concerned and not pe->primary_pe->snap->lock unless
+	 * they are the same.
+	 */
+	atomic_t sibling_count;
 
 	/* Pointer back to snapshot context */
 	struct dm_snapshot *snap;
@@ -377,6 +392,8 @@ static void read_snapshot_metadata(struct dm_snapshot *s)
 		down_write(&s->lock);
 		s->valid = 0;
 		up_write(&s->lock);
+
+		dm_table_event(s->table);
 	}
 }
 
@@ -542,8 +559,12 @@ static void snapshot_dtr(struct dm_target *ti)
 {
 	struct dm_snapshot *s = (struct dm_snapshot *) ti->private;
 
+	/* Prevent further origin writes from using this snapshot. */
+	/* After this returns there can be no new kcopyd jobs. */
 	unregister_snapshot(s);
 
+	kcopyd_client_destroy(s->kcopyd_client);
+
 	exit_exception_table(&s->pending, pending_cache);
 	exit_exception_table(&s->complete, exception_cache);
 
@@ -552,7 +573,7 @@ static void snapshot_dtr(struct dm_target *ti)
 
 	dm_put_device(ti, s->origin);
 	dm_put_device(ti, s->cow);
-	kcopyd_client_destroy(s->kcopyd_client);
+
 	kfree(s);
 }
 
@@ -586,78 +607,117 @@ static void error_bios(struct bio *bio)
 	}
 }
 
+static inline void error_snapshot_bios(struct pending_exception *pe)
+{
+	error_bios(bio_list_get(&pe->snapshot_bios));
+}
+
 static struct bio *__flush_bios(struct pending_exception *pe)
 {
-	struct pending_exception *sibling;
+	/*
+	 * If this pe is involved in a write to the origin and
+	 * it is the last sibling to complete then release
+	 * the bios for the original write to the origin.
+	 */
+
+	if (pe->primary_pe &&
+	    atomic_dec_and_test(&pe->primary_pe->sibling_count))
+		return bio_list_get(&pe->primary_pe->origin_bios);
+
+	return NULL;
+}
+
+static void __invalidate_snapshot(struct dm_snapshot *s,
+				  struct pending_exception *pe, int err)
+{
+	if (!s->valid)
+		return;
 
-	if (list_empty(&pe->siblings))
-		return bio_list_get(&pe->origin_bios);
+	if (err == -EIO)
+		DMERR("Invalidating snapshot: Error reading/writing.");
+	else if (err == -ENOMEM)
+		DMERR("Invalidating snapshot: Unable to allocate exception.");
 
-	sibling = list_entry(pe->siblings.next,
-			     struct pending_exception, siblings);
+	if (pe)
+		remove_exception(&pe->e);
 
-	list_del(&pe->siblings);
+	if (s->store.drop_snapshot)
+		s->store.drop_snapshot(&s->store);
 
-	/* This is fine as long as kcopyd is single-threaded. If kcopyd
-	 * becomes multi-threaded, we'll need some locking here.
-	 */
-	bio_list_merge(&sibling->origin_bios, &pe->origin_bios);
+	s->valid = 0;
 
-	return NULL;
+	dm_table_event(s->table);
 }
 
 static void pending_complete(struct pending_exception *pe, int success)
 {
 	struct exception *e;
+	struct pending_exception *primary_pe;
 	struct dm_snapshot *s = pe->snap;
 	struct bio *flush = NULL;
 
-	if (success) {
-		e = alloc_exception();
-		if (!e) {
-			DMWARN("Unable to allocate exception.");
-			down_write(&s->lock);
-			s->store.drop_snapshot(&s->store);
-			s->valid = 0;
-			flush = __flush_bios(pe);
-			up_write(&s->lock);
-
-			error_bios(bio_list_get(&pe->snapshot_bios));
-			goto out;
-		}
-		*e = pe->e;
-
-		/*
-		 * Add a proper exception, and remove the
-		 * in-flight exception from the list.
-		 */
+	if (!success) {
+		/* Read/write error - snapshot is unusable */
 		down_write(&s->lock);
-		insert_exception(&s->complete, e);
-		remove_exception(&pe->e);
+		__invalidate_snapshot(s, pe, -EIO);
 		flush = __flush_bios(pe);
-
-		/* Submit any pending write bios */
 		up_write(&s->lock);
 
-		flush_bios(bio_list_get(&pe->snapshot_bios));
-	} else {
-		/* Read/write error - snapshot is unusable */
+		error_snapshot_bios(pe);
+		goto out;
+	}
+
+	e = alloc_exception();
+	if (!e) {
 		down_write(&s->lock);
-		if (s->valid)
-			DMERR("Error reading/writing snapshot");
-		s->store.drop_snapshot(&s->store);
-		s->valid = 0;
-		remove_exception(&pe->e);
+		__invalidate_snapshot(s, pe, -ENOMEM);
 		flush = __flush_bios(pe);
 		up_write(&s->lock);
 
-		error_bios(bio_list_get(&pe->snapshot_bios));
+		error_snapshot_bios(pe);
+		goto out;
+	}
+	*e = pe->e;
 
-		dm_table_event(s->table);
+	/*
+	 * Add a proper exception, and remove the
+	 * in-flight exception from the list.
+	 */
+	down_write(&s->lock);
+	if (!s->valid) {
+		flush = __flush_bios(pe);
+		up_write(&s->lock);
+
+		free_exception(e);
+
+		error_snapshot_bios(pe);
+		goto out;
 	}
 
+	insert_exception(&s->complete, e);
+	remove_exception(&pe->e);
+	flush = __flush_bios(pe);
+
+	up_write(&s->lock);
+
+	/* Submit any pending write bios */
+	flush_bios(bio_list_get(&pe->snapshot_bios));
+
  out:
-	free_pending_exception(pe);
+	primary_pe = pe->primary_pe;
+
+	/*
+	 * Free the pe if it's not linked to an origin write or if
+	 * it's not itself a primary pe.
+	 */
+	if (!primary_pe || primary_pe != pe)
+		free_pending_exception(pe);
+
+	/*
+	 * Free the primary pe if nothing references it.
+	 */
+	if (primary_pe && !atomic_read(&primary_pe->sibling_count))
+		free_pending_exception(primary_pe);
 
 	if (flush)
 		flush_bios(flush);
@@ -734,38 +794,45 @@ __find_pending_exception(struct dm_snapshot *s, struct bio *bio)
 	if (e) {
 		/* cast the exception to a pending exception */
 		pe = container_of(e, struct pending_exception, e);
+		goto out;
+	}
 
-	} else {
-		/*
-		 * Create a new pending exception, we don't want
-		 * to hold the lock while we do this.
-		 */
-		up_write(&s->lock);
-		pe = alloc_pending_exception();
-		down_write(&s->lock);
+	/*
+	 * Create a new pending exception, we don't want
+	 * to hold the lock while we do this.
+	 */
+	up_write(&s->lock);
+	pe = alloc_pending_exception();
+	down_write(&s->lock);
 
-		e = lookup_exception(&s->pending, chunk);
-		if (e) {
-			free_pending_exception(pe);
-			pe = container_of(e, struct pending_exception, e);
-		} else {
-			pe->e.old_chunk = chunk;
-			bio_list_init(&pe->origin_bios);
-			bio_list_init(&pe->snapshot_bios);
-			INIT_LIST_HEAD(&pe->siblings);
-			pe->snap = s;
-			pe->started = 0;
-
-			if (s->store.prepare_exception(&s->store, &pe->e)) {
-				free_pending_exception(pe);
-				s->valid = 0;
-				return NULL;
-			}
+	if (!s->valid) {
+		free_pending_exception(pe);
+		return NULL;
+	}
 
-			insert_exception(&s->pending, &pe->e);
-		}
+	e = lookup_exception(&s->pending, chunk);
+	if (e) {
+		free_pending_exception(pe);
+		pe = container_of(e, struct pending_exception, e);
+		goto out;
+	}
+
+	pe->e.old_chunk = chunk;
+	bio_list_init(&pe->origin_bios);
+	bio_list_init(&pe->snapshot_bios);
+	pe->primary_pe = NULL;
+	atomic_set(&pe->sibling_count, 1);
+	pe->snap = s;
+	pe->started = 0;
+
+	if (s->store.prepare_exception(&s->store, &pe->e)) {
+		free_pending_exception(pe);
+		return NULL;
 	}
 
+	insert_exception(&s->pending, &pe->e);
+
+ out:
 	return pe;
 }
 
@@ -782,13 +849,15 @@ static int snapshot_map(struct dm_target *ti, struct bio *bio,
 {
 	struct exception *e;
 	struct dm_snapshot *s = (struct dm_snapshot *) ti->private;
+	int copy_needed = 0;
 	int r = 1;
 	chunk_t chunk;
-	struct pending_exception *pe;
+	struct pending_exception *pe = NULL;
 
 	chunk = sector_to_chunk(s, bio->bi_sector);
 
 	/* Full snapshots are not usable */
+	/* To get here the table must be live so s->active is always set. */
 	if (!s->valid)
 		return -EIO;
 
@@ -806,36 +875,41 @@ static int snapshot_map(struct dm_target *ti, struct bio *bio,
 		 * to copy an exception */
 		down_write(&s->lock);
 
+		if (!s->valid) {
+			r = -EIO;
+			goto out_unlock;
+		}
+
 		/* If the block is already remapped - use that, else remap it */
 		e = lookup_exception(&s->complete, chunk);
 		if (e) {
 			remap_exception(s, e, bio);
-			up_write(&s->lock);
-
-		} else {
-			pe = __find_pending_exception(s, bio);
-
-			if (!pe) {
-				if (s->store.drop_snapshot)
-					s->store.drop_snapshot(&s->store);
-				s->valid = 0;
-				r = -EIO;
-				up_write(&s->lock);
-			} else {
-				remap_exception(s, &pe->e, bio);
-				bio_list_add(&pe->snapshot_bios, bio);
-
-				if (!pe->started) {
-					/* this is protected by snap->lock */
-					pe->started = 1;
-					up_write(&s->lock);
-					start_copy(pe);
-				} else
-					up_write(&s->lock);
-				r = 0;
-			}
+			goto out_unlock;
+		}
+
+		pe = __find_pending_exception(s, bio);
+		if (!pe) {
+			__invalidate_snapshot(s, pe, -ENOMEM);
+			r = -EIO;
+			goto out_unlock;
+		}
+
+		remap_exception(s, &pe->e, bio);
+		bio_list_add(&pe->snapshot_bios, bio);
+
+		if (!pe->started) {
+			/* this is protected by snap->lock */
+			pe->started = 1;
+			copy_needed = 1;
 		}
 
+		r = 0;
+
+ out_unlock:
+		up_write(&s->lock);
+
+		if (copy_needed)
+			start_copy(pe);
 	} else {
 		/*
 		 * FIXME: this read path scares me because we
@@ -847,6 +921,11 @@ static int snapshot_map(struct dm_target *ti, struct bio *bio,
 		/* Do reads */
 		down_read(&s->lock);
 
+		if (!s->valid) {
+			up_read(&s->lock);
+			return -EIO;
+		}
+
 		/* See if it it has been remapped */
 		e = lookup_exception(&s->complete, chunk);
 		if (e)
@@ -884,9 +963,9 @@ static int snapshot_status(struct dm_target *ti, status_type_t type,
 			snap->store.fraction_full(&snap->store,
 						  &numerator,
 						  &denominator);
-			snprintf(result, maxlen,
-				 SECTOR_FORMAT "/" SECTOR_FORMAT,
-				 numerator, denominator);
+			snprintf(result, maxlen, "%llu/%llu",
+				 (unsigned long long)numerator,
+				 (unsigned long long)denominator);
 		}
 		else
 			snprintf(result, maxlen, "Unknown");
@@ -899,9 +978,10 @@ static int snapshot_status(struct dm_target *ti, status_type_t type,
 		 * to make private copies if the output is to
 		 * make sense.
 		 */
-		snprintf(result, maxlen, "%s %s %c " SECTOR_FORMAT,
+		snprintf(result, maxlen, "%s %s %c %llu",
 			 snap->origin->name, snap->cow->name,
-			 snap->type, snap->chunk_size);
+			 snap->type,
+			 (unsigned long long)snap->chunk_size);
 		break;
 	}
 
@@ -911,40 +991,27 @@ static int snapshot_status(struct dm_target *ti, status_type_t type,
 /*-----------------------------------------------------------------
  * Origin methods
  *---------------------------------------------------------------*/
-static void list_merge(struct list_head *l1, struct list_head *l2)
-{
-	struct list_head *l1_n, *l2_p;
-
-	l1_n = l1->next;
-	l2_p = l2->prev;
-
-	l1->next = l2;
-	l2->prev = l1;
-
-	l2_p->next = l1_n;
-	l1_n->prev = l2_p;
-}
-
 static int __origin_write(struct list_head *snapshots, struct bio *bio)
 {
-	int r = 1, first = 1;
+	int r = 1, first = 0;
 	struct dm_snapshot *snap;
 	struct exception *e;
-	struct pending_exception *pe, *last = NULL;
+	struct pending_exception *pe, *next_pe, *primary_pe = NULL;
 	chunk_t chunk;
+	LIST_HEAD(pe_queue);
 
 	/* Do all the snapshots on this origin */
 	list_for_each_entry (snap, snapshots, list) {
 
+		down_write(&snap->lock);
+
 		/* Only deal with valid and active snapshots */
 		if (!snap->valid || !snap->active)
-			continue;
+			goto next_snapshot;
 
 		/* Nothing to do if writing beyond end of snapshot */
 		if (bio->bi_sector >= dm_table_get_size(snap->table))
-			continue;
-
-		down_write(&snap->lock);
+			goto next_snapshot;
 
 		/*
 		 * Remember, different snapshots can have
@@ -956,49 +1023,75 @@ static int __origin_write(struct list_head *snapshots, struct bio *bio)
 		 * Check exception table to see if block
 		 * is already remapped in this snapshot
 		 * and trigger an exception if not.
+		 *
+		 * sibling_count is initialised to 1 so pending_complete()
+		 * won't destroy the primary_pe while we're inside this loop.
 		 */
 		e = lookup_exception(&snap->complete, chunk);
-		if (!e) {
-			pe = __find_pending_exception(snap, bio);
-			if (!pe) {
-				snap->store.drop_snapshot(&snap->store);
-				snap->valid = 0;
-
-			} else {
-				if (last)
-					list_merge(&pe->siblings,
-						   &last->siblings);
-
-				last = pe;
-				r = 0;
+		if (e)
+			goto next_snapshot;
+
+		pe = __find_pending_exception(snap, bio);
+		if (!pe) {
+			__invalidate_snapshot(snap, pe, ENOMEM);
+			goto next_snapshot;
+		}
+
+		if (!primary_pe) {
+			/*
+			 * Either every pe here has same
+			 * primary_pe or none has one yet.
+			 */
+			if (pe->primary_pe)
+				primary_pe = pe->primary_pe;
+			else {
+				primary_pe = pe;
+				first = 1;
 			}
+
+			bio_list_add(&primary_pe->origin_bios, bio);
+
+			r = 0;
+		}
+
+		if (!pe->primary_pe) {
+			atomic_inc(&primary_pe->sibling_count);
+			pe->primary_pe = primary_pe;
+		}
+
+		if (!pe->started) {
+			pe->started = 1;
+			list_add_tail(&pe->list, &pe_queue);
 		}
 
+ next_snapshot:
 		up_write(&snap->lock);
 	}
 
+	if (!primary_pe)
+		goto out;
+
 	/*
-	 * Now that we have a complete pe list we can start the copying.
+	 * If this is the first time we're processing this chunk and
+	 * sibling_count is now 1 it means all the pending exceptions
+	 * got completed while we were in the loop above, so it falls to
+	 * us here to remove the primary_pe and submit any origin_bios.
 	 */
-	if (last) {
-		pe = last;
-		do {
-			down_write(&pe->snap->lock);
-			if (first)
-				bio_list_add(&pe->origin_bios, bio);
-			if (!pe->started) {
-				pe->started = 1;
-				up_write(&pe->snap->lock);
-				start_copy(pe);
-			} else
-				up_write(&pe->snap->lock);
-			first = 0;
-			pe = list_entry(pe->siblings.next,
-					struct pending_exception, siblings);
-
-		} while (pe != last);
+
+	if (first && atomic_dec_and_test(&primary_pe->sibling_count)) {
+		flush_bios(bio_list_get(&primary_pe->origin_bios));
+		free_pending_exception(primary_pe);
+		/* If we got here, pe_queue is necessarily empty. */
+		goto out;
 	}
 
+	/*
+	 * Now that we have a complete pe list we can start the copying.
+	 */
+	list_for_each_entry_safe(pe, next_pe, &pe_queue, list)
+		start_copy(pe);
+
+ out:
 	return r;
 }
 
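The primary_pe/sibling_count scheme introduced above is a plain reference count: the primary pending_exception starts at 1, each sibling that attaches takes a reference, and whoever drops the count to zero releases the origin bios. A minimal sketch of the dec-and-test idiom using C11 atomics (the kernel uses atomic_t; names here are illustrative):

#include <stdatomic.h>
#include <stdio.h>

static atomic_int sibling_count;

/* Returns 1 when this caller dropped the last reference. */
static int dec_and_test(atomic_int *v)
{
	return atomic_fetch_sub(v, 1) == 1;
}

int main(void)
{
	atomic_init(&sibling_count, 1);		/* the primary's own reference */

	atomic_fetch_add(&sibling_count, 1);	/* a sibling attaches */

	if (!dec_and_test(&sibling_count))	/* the sibling completes */
		printf("primary still referenced\n");
	if (dec_and_test(&sibling_count))	/* the primary completes */
		printf("last reference gone: flush origin bios\n");
	return 0;
}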
diff --git a/drivers/md/dm-stripe.c b/drivers/md/dm-stripe.c
index 697aacafb02a..08328a8f5a3c 100644
--- a/drivers/md/dm-stripe.c
+++ b/drivers/md/dm-stripe.c
@@ -49,9 +49,9 @@ static inline struct stripe_c *alloc_context(unsigned int stripes)
 static int get_stripe(struct dm_target *ti, struct stripe_c *sc,
 		      unsigned int stripe, char **argv)
 {
-	sector_t start;
+	unsigned long long start;
 
-	if (sscanf(argv[1], SECTOR_FORMAT, &start) != 1)
+	if (sscanf(argv[1], "%llu", &start) != 1)
 		return -EINVAL;
 
 	if (dm_get_device(ti, argv[0], start, sc->stripe_width,
@@ -103,7 +103,7 @@ static int stripe_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 		return -EINVAL;
 	}
 
-	if (((uint32_t)ti->len) & (chunk_size - 1)) {
+	if (ti->len & (chunk_size - 1)) {
 		ti->error = "dm-stripe: Target length not divisible by "
 		    "chunk size";
 		return -EINVAL;
@@ -201,10 +201,11 @@ static int stripe_status(struct dm_target *ti,
 		break;
 
 	case STATUSTYPE_TABLE:
-		DMEMIT("%d " SECTOR_FORMAT, sc->stripes, sc->chunk_mask + 1);
+		DMEMIT("%d %llu", sc->stripes,
+		       (unsigned long long)sc->chunk_mask + 1);
 		for (i = 0; i < sc->stripes; i++)
-			DMEMIT(" %s " SECTOR_FORMAT, sc->stripe[i].dev->name,
-			       sc->stripe[i].physical_start);
+			DMEMIT(" %s %llu", sc->stripe[i].dev->name,
+			       (unsigned long long)sc->stripe[i].physical_start);
 		break;
 	}
 	return 0;
diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index 907b08ddb783..8f56a54cf0ce 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -14,6 +14,7 @@
 #include <linux/ctype.h>
 #include <linux/slab.h>
 #include <linux/interrupt.h>
+#include <linux/mutex.h>
 #include <asm/atomic.h>
 
 #define MAX_DEPTH 16
@@ -22,6 +23,7 @@
 #define CHILDREN_PER_NODE (KEYS_PER_NODE + 1)
 
 struct dm_table {
+	struct mapped_device *md;
 	atomic_t holders;
 
 	/* btree table */
@@ -97,6 +99,8 @@ static void combine_restrictions_low(struct io_restrictions *lhs,
 
 	lhs->seg_boundary_mask =
 		min_not_zero(lhs->seg_boundary_mask, rhs->seg_boundary_mask);
+
+	lhs->no_cluster |= rhs->no_cluster;
 }
 
 /*
@@ -204,7 +208,8 @@ static int alloc_targets(struct dm_table *t, unsigned int num)
 	return 0;
 }
 
-int dm_table_create(struct dm_table **result, int mode, unsigned num_targets)
+int dm_table_create(struct dm_table **result, int mode,
+		    unsigned num_targets, struct mapped_device *md)
 {
 	struct dm_table *t = kmalloc(sizeof(*t), GFP_KERNEL);
 
@@ -227,6 +232,7 @@ int dm_table_create(struct dm_table **result, int mode, unsigned num_targets)
 	}
 
 	t->mode = mode;
+	t->md = md;
 	*result = t;
 	return 0;
 }
@@ -345,7 +351,7 @@ static struct dm_dev *find_device(struct list_head *l, dev_t dev)
 /*
  * Open a device so we can use it as a map destination.
  */
-static int open_dev(struct dm_dev *d, dev_t dev)
+static int open_dev(struct dm_dev *d, dev_t dev, struct mapped_device *md)
 {
 	static char *_claim_ptr = "I belong to device-mapper";
 	struct block_device *bdev;
@@ -357,7 +363,7 @@ static int open_dev(struct dm_dev *d, dev_t dev)
 	bdev = open_by_devnum(dev, d->mode);
 	if (IS_ERR(bdev))
 		return PTR_ERR(bdev);
-	r = bd_claim(bdev, _claim_ptr);
+	r = bd_claim_by_disk(bdev, _claim_ptr, dm_disk(md));
 	if (r)
 		blkdev_put(bdev);
 	else
@@ -368,12 +374,12 @@ static int open_dev(struct dm_dev *d, dev_t dev)
 /*
  * Close a device that we've been using.
  */
-static void close_dev(struct dm_dev *d)
+static void close_dev(struct dm_dev *d, struct mapped_device *md)
 {
 	if (!d->bdev)
 		return;
 
-	bd_release(d->bdev);
+	bd_release_from_disk(d->bdev, dm_disk(md));
 	blkdev_put(d->bdev);
 	d->bdev = NULL;
 }
@@ -394,7 +400,7 @@ static int check_device_area(struct dm_dev *dd, sector_t start, sector_t len)
  * careful to leave things as they were if we fail to reopen the
  * device.
  */
-static int upgrade_mode(struct dm_dev *dd, int new_mode)
+static int upgrade_mode(struct dm_dev *dd, int new_mode, struct mapped_device *md)
 {
 	int r;
 	struct dm_dev dd_copy;
@@ -404,9 +410,9 @@ static int upgrade_mode(struct dm_dev *dd, int new_mode)
 
 	dd->mode |= new_mode;
 	dd->bdev = NULL;
-	r = open_dev(dd, dev);
+	r = open_dev(dd, dev, md);
 	if (!r)
-		close_dev(&dd_copy);
+		close_dev(&dd_copy, md);
 	else
 		*dd = dd_copy;
 
@@ -448,7 +454,7 @@ static int __table_get_device(struct dm_table *t, struct dm_target *ti,
 		dd->mode = mode;
 		dd->bdev = NULL;
 
-		if ((r = open_dev(dd, dev))) {
+		if ((r = open_dev(dd, dev, t->md))) {
 			kfree(dd);
 			return r;
 		}
@@ -459,7 +465,7 @@ static int __table_get_device(struct dm_table *t, struct dm_target *ti,
 		list_add(&dd->list, &t->devices);
 
 	} else if (dd->mode != (mode | dd->mode)) {
-		r = upgrade_mode(dd, mode);
+		r = upgrade_mode(dd, mode, t->md);
 		if (r)
 			return r;
 	}
@@ -523,6 +529,8 @@ int dm_get_device(struct dm_target *ti, const char *path, sector_t start,
 		rs->seg_boundary_mask =
 			min_not_zero(rs->seg_boundary_mask,
 				     q->seg_boundary_mask);
+
+		rs->no_cluster |= !test_bit(QUEUE_FLAG_CLUSTER, &q->queue_flags);
 	}
 
 	return r;
@@ -534,7 +542,7 @@ int dm_get_device(struct dm_target *ti, const char *path, sector_t start,
 void dm_put_device(struct dm_target *ti, struct dm_dev *dd)
 {
 	if (atomic_dec_and_test(&dd->count)) {
-		close_dev(dd);
+		close_dev(dd, ti->table->md);
 		list_del(&dd->list);
 		kfree(dd);
 	}
@@ -763,14 +771,14 @@ int dm_table_complete(struct dm_table *t)
 	return r;
 }
 
-static DECLARE_MUTEX(_event_lock);
+static DEFINE_MUTEX(_event_lock);
 void dm_table_event_callback(struct dm_table *t,
 			     void (*fn)(void *), void *context)
 {
-	down(&_event_lock);
+	mutex_lock(&_event_lock);
 	t->event_fn = fn;
 	t->event_context = context;
-	up(&_event_lock);
+	mutex_unlock(&_event_lock);
 }
 
 void dm_table_event(struct dm_table *t)
@@ -781,10 +789,10 @@ void dm_table_event(struct dm_table *t)
 	 */
 	BUG_ON(in_interrupt());
 
-	down(&_event_lock);
+	mutex_lock(&_event_lock);
 	if (t->event_fn)
 		t->event_fn(t->event_context);
-	up(&_event_lock);
+	mutex_unlock(&_event_lock);
 }
 
 sector_t dm_table_get_size(struct dm_table *t)
@@ -832,6 +840,11 @@ void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q)
 	q->hardsect_size = t->limits.hardsect_size;
 	q->max_segment_size = t->limits.max_segment_size;
 	q->seg_boundary_mask = t->limits.seg_boundary_mask;
+	if (t->limits.no_cluster)
+		q->queue_flags &= ~(1 << QUEUE_FLAG_CLUSTER);
+	else
+		q->queue_flags |= (1 << QUEUE_FLAG_CLUSTER);
+
 }
 
 unsigned int dm_table_get_num_targets(struct dm_table *t)
@@ -943,12 +956,20 @@ int dm_table_flush_all(struct dm_table *t)
 	return ret;
 }
 
+struct mapped_device *dm_table_get_md(struct dm_table *t)
+{
+	dm_get(t->md);
+
+	return t->md;
+}
+
 EXPORT_SYMBOL(dm_vcalloc);
 EXPORT_SYMBOL(dm_get_device);
 EXPORT_SYMBOL(dm_put_device);
 EXPORT_SYMBOL(dm_table_event);
 EXPORT_SYMBOL(dm_table_get_size);
 EXPORT_SYMBOL(dm_table_get_mode);
+EXPORT_SYMBOL(dm_table_get_md);
 EXPORT_SYMBOL(dm_table_put);
 EXPORT_SYMBOL(dm_table_get);
 EXPORT_SYMBOL(dm_table_unplug_all);
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index a64798ef481e..4d710b7a133b 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -10,6 +10,7 @@
 
 #include <linux/init.h>
 #include <linux/module.h>
+#include <linux/mutex.h>
 #include <linux/moduleparam.h>
 #include <linux/blkpg.h>
 #include <linux/bio.h>
@@ -17,6 +18,7 @@
 #include <linux/mempool.h>
 #include <linux/slab.h>
 #include <linux/idr.h>
+#include <linux/hdreg.h>
 #include <linux/blktrace_api.h>
 
 static const char *_name = DM_NAME;
@@ -69,6 +71,7 @@ struct mapped_device {
 
 	request_queue_t *queue;
 	struct gendisk *disk;
+	char name[16];
 
 	void *interface_ptr;
 
@@ -101,6 +104,9 @@ struct mapped_device {
 	 */
 	struct super_block *frozen_sb;
 	struct block_device *suspended_bdev;
+
+	/* forced geometry settings */
+	struct hd_geometry geometry;
 };
 
 #define MIN_IOS 256
@@ -226,6 +232,13 @@ static int dm_blk_close(struct inode *inode, struct file *file)
 	return 0;
 }
 
+static int dm_blk_getgeo(struct block_device *bdev, struct hd_geometry *geo)
+{
+	struct mapped_device *md = bdev->bd_disk->private_data;
+
+	return dm_get_geometry(md, geo);
+}
+
 static inline struct dm_io *alloc_io(struct mapped_device *md)
 {
 	return mempool_alloc(md->io_pool, GFP_NOIO);
@@ -312,6 +325,33 @@ struct dm_table *dm_get_table(struct mapped_device *md)
 	return t;
 }
 
+/*
+ * Get the geometry associated with a dm device
+ */
+int dm_get_geometry(struct mapped_device *md, struct hd_geometry *geo)
+{
+	*geo = md->geometry;
+
+	return 0;
+}
+
+/*
+ * Set the geometry of a device.
+ */
+int dm_set_geometry(struct mapped_device *md, struct hd_geometry *geo)
+{
+	sector_t sz = (sector_t)geo->cylinders * geo->heads * geo->sectors;
+
+	if (geo->start > sz) {
+		DMWARN("Start sector is beyond the geometry limits.");
+		return -EINVAL;
+	}
+
+	md->geometry = *geo;
+
+	return 0;
+}
+
 /*-----------------------------------------------------------------
  * CRUD START:
  *   A more elegant soln is in the works that uses the queue
@@ -704,14 +744,14 @@ static int dm_any_congested(void *congested_data, int bdi_bits)
 /*-----------------------------------------------------------------
  * An IDR is used to keep track of allocated minor numbers.
  *---------------------------------------------------------------*/
-static DECLARE_MUTEX(_minor_lock);
+static DEFINE_MUTEX(_minor_lock);
 static DEFINE_IDR(_minor_idr);
 
 static void free_minor(unsigned int minor)
 {
-	down(&_minor_lock);
+	mutex_lock(&_minor_lock);
 	idr_remove(&_minor_idr, minor);
-	up(&_minor_lock);
+	mutex_unlock(&_minor_lock);
 }
 
 /*
@@ -724,7 +764,7 @@ static int specific_minor(struct mapped_device *md, unsigned int minor)
 	if (minor >= (1 << MINORBITS))
 		return -EINVAL;
 
-	down(&_minor_lock);
+	mutex_lock(&_minor_lock);
 
 	if (idr_find(&_minor_idr, minor)) {
 		r = -EBUSY;
@@ -749,7 +789,7 @@ static int specific_minor(struct mapped_device *md, unsigned int minor)
 	}
 
 out:
-	up(&_minor_lock);
+	mutex_unlock(&_minor_lock);
 	return r;
 }
 
@@ -758,7 +798,7 @@ static int next_free_minor(struct mapped_device *md, unsigned int *minor)
 	int r;
 	unsigned int m;
 
-	down(&_minor_lock);
+	mutex_lock(&_minor_lock);
 
 	r = idr_pre_get(&_minor_idr, GFP_KERNEL);
 	if (!r) {
@@ -780,7 +820,7 @@ static int next_free_minor(struct mapped_device *md, unsigned int *minor)
 	*minor = m;
 
 out:
-	up(&_minor_lock);
+	mutex_unlock(&_minor_lock);
 	return r;
 }
 
@@ -842,6 +882,7 @@ static struct mapped_device *alloc_dev(unsigned int minor, int persistent)
 	md->disk->private_data = md;
 	sprintf(md->disk->disk_name, "dm-%d", minor);
 	add_disk(md->disk);
+	format_dev_t(md->name, MKDEV(_major, minor));
 
 	atomic_set(&md->pending, 0);
 	init_waitqueue_head(&md->wait);
@@ -904,6 +945,13 @@ static int __bind(struct mapped_device *md, struct dm_table *t)
 	sector_t size;
 
 	size = dm_table_get_size(t);
+
+	/*
+	 * Wipe any geometry if the size of the table changed.
+	 */
+	if (size != get_capacity(md->disk))
+		memset(&md->geometry, 0, sizeof(md->geometry));
+
 	__set_size(md, size);
 	if (size == 0)
 		return 0;
@@ -967,13 +1015,13 @@ static struct mapped_device *dm_find_md(dev_t dev)
 	if (MAJOR(dev) != _major || minor >= (1 << MINORBITS))
 		return NULL;
 
-	down(&_minor_lock);
+	mutex_lock(&_minor_lock);
 
 	md = idr_find(&_minor_idr, minor);
 	if (!md || (dm_disk(md)->first_minor != minor))
 		md = NULL;
 
-	up(&_minor_lock);
+	mutex_unlock(&_minor_lock);
 
 	return md;
 }
@@ -988,15 +1036,9 @@ struct mapped_device *dm_get_md(dev_t dev)
 	return md;
 }
 
-void *dm_get_mdptr(dev_t dev)
+void *dm_get_mdptr(struct mapped_device *md)
 {
-	struct mapped_device *md;
-	void *mdptr = NULL;
-
-	md = dm_find_md(dev);
-	if (md)
-		mdptr = md->interface_ptr;
-	return mdptr;
+	return md->interface_ptr;
 }
 
 void dm_set_mdptr(struct mapped_device *md, void *ptr)
@@ -1011,18 +1053,18 @@ void dm_get(struct mapped_device *md)
 
 void dm_put(struct mapped_device *md)
 {
-	struct dm_table *map = dm_get_table(md);
+	struct dm_table *map;
 
 	if (atomic_dec_and_test(&md->holders)) {
+		map = dm_get_table(md);
 		if (!dm_suspended(md)) {
 			dm_table_presuspend_targets(map);
 			dm_table_postsuspend_targets(map);
 		}
 		__unbind(md);
+		dm_table_put(map);
 		free_dev(md);
 	}
-
-	dm_table_put(map);
 }
 
 /*
@@ -1107,6 +1149,7 @@ int dm_suspend(struct mapped_device *md, int do_lockfs)
 {
 	struct dm_table *map = NULL;
 	DECLARE_WAITQUEUE(wait, current);
+	struct bio *def;
 	int r = -EINVAL;
 
 	down(&md->suspend_lock);
@@ -1166,9 +1209,11 @@ int dm_suspend(struct mapped_device *md, int do_lockfs)
 	/* were we interrupted ? */
 	r = -EINTR;
 	if (atomic_read(&md->pending)) {
+		clear_bit(DMF_BLOCK_IO, &md->flags);
+		def = bio_list_get(&md->deferred);
+		__flush_deferred_io(md, def);
 		up_write(&md->io_lock);
 		unlock_fs(md);
-		clear_bit(DMF_BLOCK_IO, &md->flags);
 		goto out;
 	}
 	up_write(&md->io_lock);
@@ -1262,6 +1307,7 @@ int dm_suspended(struct mapped_device *md)
 static struct block_device_operations dm_blk_dops = {
 	.open = dm_blk_open,
 	.release = dm_blk_close,
+	.getgeo = dm_blk_getgeo,
 	.owner = THIS_MODULE
 };
 
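dm_set_geometry() above only sanity-checks that the requested start sector fits inside the capacity implied by the fake CHS values. Worked through with the maximum values dev_set_geometry() accepts: 65535 cylinders x 255 heads x 63 sectors = 1,052,819,775 sectors, roughly 502 GiB at 512 bytes per sector. A standalone sketch of that check, with a simplified stand-in for struct hd_geometry:

#include <stdio.h>

struct hd_geom {
	unsigned short cylinders;
	unsigned char heads, sectors;
	unsigned long start;
};

/* Reject a start sector beyond the capacity the CHS values imply. */
static int set_geometry(const struct hd_geom *geo)
{
	unsigned long long sz = (unsigned long long)geo->cylinders *
				geo->heads * geo->sectors;

	if (geo->start > sz)
		return -1;	/* start sector beyond geometry limits */
	return 0;
}

int main(void)
{
	struct hd_geom geo = { 65535, 255, 63, 0 };

	printf("capacity: %llu sectors\n",
	       (unsigned long long)geo.cylinders * geo.heads * geo.sectors);
	printf("accepted: %s\n", set_geometry(&geo) ? "no" : "yes");
	return 0;
}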
diff --git a/drivers/md/dm.h b/drivers/md/dm.h
index 4eaf075da217..fd90bc8f9e45 100644
--- a/drivers/md/dm.h
+++ b/drivers/md/dm.h
@@ -14,6 +14,7 @@
 #include <linux/device-mapper.h>
 #include <linux/list.h>
 #include <linux/blkdev.h>
+#include <linux/hdreg.h>
 
 #define DM_NAME "device-mapper"
 #define DMWARN(f, x...) printk(KERN_WARNING DM_NAME ": " f "\n" , ## x)
@@ -23,16 +24,6 @@
 #define DMEMIT(x...) sz += ((sz >= maxlen) ? \
 			  0 : scnprintf(result + sz, maxlen - sz, x))
 
-/*
- * FIXME: I think this should be with the definition of sector_t
- * in types.h.
- */
-#ifdef CONFIG_LBD
-#define SECTOR_FORMAT "%llu"
-#else
-#define SECTOR_FORMAT "%lu"
-#endif
-
 #define SECTOR_SHIFT 9
 
 /*
@@ -57,7 +48,7 @@ struct mapped_device;
 int dm_create(struct mapped_device **md);
 int dm_create_with_minor(unsigned int minor, struct mapped_device **md);
 void dm_set_mdptr(struct mapped_device *md, void *ptr);
-void *dm_get_mdptr(dev_t dev);
+void *dm_get_mdptr(struct mapped_device *md);
 struct mapped_device *dm_get_md(dev_t dev);
 
 /*
@@ -95,11 +86,18 @@ int dm_wait_event(struct mapped_device *md, int event_nr);
 struct gendisk *dm_disk(struct mapped_device *md);
 int dm_suspended(struct mapped_device *md);
 
+/*
+ * Geometry functions.
+ */
+int dm_get_geometry(struct mapped_device *md, struct hd_geometry *geo);
+int dm_set_geometry(struct mapped_device *md, struct hd_geometry *geo);
+
 /*-----------------------------------------------------------------
  * Functions for manipulating a table. Tables are also reference
  * counted.
  *---------------------------------------------------------------*/
-int dm_table_create(struct dm_table **result, int mode, unsigned num_targets);
+int dm_table_create(struct dm_table **result, int mode,
+		    unsigned num_targets, struct mapped_device *md);
 
 void dm_table_get(struct dm_table *t);
 void dm_table_put(struct dm_table *t);
@@ -117,6 +115,7 @@ void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q);
 unsigned int dm_table_get_num_targets(struct dm_table *t);
 struct list_head *dm_table_get_devices(struct dm_table *t);
 int dm_table_get_mode(struct dm_table *t);
+struct mapped_device *dm_table_get_md(struct dm_table *t);
 void dm_table_presuspend_targets(struct dm_table *t);
 void dm_table_postsuspend_targets(struct dm_table *t);
 void dm_table_resume_targets(struct dm_table *t);
diff --git a/drivers/md/kcopyd.c b/drivers/md/kcopyd.c
index 9dcb2c8a3853..72480a48d88b 100644
--- a/drivers/md/kcopyd.c
+++ b/drivers/md/kcopyd.c
@@ -22,6 +22,7 @@
22#include <linux/slab.h> 22#include <linux/slab.h>
23#include <linux/vmalloc.h> 23#include <linux/vmalloc.h>
24#include <linux/workqueue.h> 24#include <linux/workqueue.h>
25#include <linux/mutex.h>
25 26
26#include "kcopyd.h" 27#include "kcopyd.h"
27 28
@@ -44,6 +45,9 @@ struct kcopyd_client {
44 struct page_list *pages; 45 struct page_list *pages;
45 unsigned int nr_pages; 46 unsigned int nr_pages;
46 unsigned int nr_free_pages; 47 unsigned int nr_free_pages;
48
49 wait_queue_head_t destroyq;
50 atomic_t nr_jobs;
47}; 51};
48 52
49static struct page_list *alloc_pl(void) 53static struct page_list *alloc_pl(void)
@@ -292,10 +296,15 @@ static int run_complete_job(struct kcopyd_job *job)
292 int read_err = job->read_err; 296 int read_err = job->read_err;
293 unsigned int write_err = job->write_err; 297 unsigned int write_err = job->write_err;
294 kcopyd_notify_fn fn = job->fn; 298 kcopyd_notify_fn fn = job->fn;
299 struct kcopyd_client *kc = job->kc;
295 300
296 kcopyd_put_pages(job->kc, job->pages); 301 kcopyd_put_pages(kc, job->pages);
297 mempool_free(job, _job_pool); 302 mempool_free(job, _job_pool);
298 fn(read_err, write_err, context); 303 fn(read_err, write_err, context);
304
305 if (atomic_dec_and_test(&kc->nr_jobs))
306 wake_up(&kc->destroyq);
307
299 return 0; 308 return 0;
300} 309}
301 310
@@ -430,6 +439,7 @@ static void do_work(void *ignored)
430 */ 439 */
431static void dispatch_job(struct kcopyd_job *job) 440static void dispatch_job(struct kcopyd_job *job)
432{ 441{
442 atomic_inc(&job->kc->nr_jobs);
433 push(&_pages_jobs, job); 443 push(&_pages_jobs, job);
434 wake(); 444 wake();
435} 445}
@@ -572,21 +582,21 @@ int kcopyd_cancel(struct kcopyd_job *job, int block)
572/*----------------------------------------------------------------- 582/*-----------------------------------------------------------------
573 * Unit setup 583 * Unit setup
574 *---------------------------------------------------------------*/ 584 *---------------------------------------------------------------*/
575static DECLARE_MUTEX(_client_lock); 585static DEFINE_MUTEX(_client_lock);
576static LIST_HEAD(_clients); 586static LIST_HEAD(_clients);
577 587
578static void client_add(struct kcopyd_client *kc) 588static void client_add(struct kcopyd_client *kc)
579{ 589{
580 down(&_client_lock); 590 mutex_lock(&_client_lock);
581 list_add(&kc->list, &_clients); 591 list_add(&kc->list, &_clients);
582 up(&_client_lock); 592 mutex_unlock(&_client_lock);
583} 593}
584 594
585static void client_del(struct kcopyd_client *kc) 595static void client_del(struct kcopyd_client *kc)
586{ 596{
587 down(&_client_lock); 597 mutex_lock(&_client_lock);
588 list_del(&kc->list); 598 list_del(&kc->list);
589 up(&_client_lock); 599 mutex_unlock(&_client_lock);
590} 600}
591 601
592static DEFINE_MUTEX(kcopyd_init_lock); 602static DEFINE_MUTEX(kcopyd_init_lock);
@@ -669,6 +679,9 @@ int kcopyd_client_create(unsigned int nr_pages, struct kcopyd_client **result)
669 return r; 679 return r;
670 } 680 }
671 681
682 init_waitqueue_head(&kc->destroyq);
683 atomic_set(&kc->nr_jobs, 0);
684
672 client_add(kc); 685 client_add(kc);
673 *result = kc; 686 *result = kc;
674 return 0; 687 return 0;
@@ -676,6 +689,9 @@ int kcopyd_client_create(unsigned int nr_pages, struct kcopyd_client **result)
676 689
677void kcopyd_client_destroy(struct kcopyd_client *kc) 690void kcopyd_client_destroy(struct kcopyd_client *kc)
678{ 691{
692 /* Wait for completion of all jobs submitted by this client. */
693 wait_event(kc->destroyq, !atomic_read(&kc->nr_jobs));
694
679 dm_io_put(kc->nr_pages); 695 dm_io_put(kc->nr_pages);
680 client_free_pages(kc); 696 client_free_pages(kc);
681 client_del(kc); 697 client_del(kc);
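The kcopyd hunks above add simple job accounting so a client cannot be torn down under in-flight copies: dispatch_job() bumps nr_jobs, run_complete_job() drops it and wakes destroyq, and kcopyd_client_destroy() sleeps until the count hits zero. A userspace analogue of the same pattern, with pthreads standing in for the kernel's atomics and wait queues:

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t destroyq = PTHREAD_COND_INITIALIZER;
static int nr_jobs;

static void dispatch_job(void)
{
	pthread_mutex_lock(&lock);
	nr_jobs++;			/* like atomic_inc(&kc->nr_jobs) */
	pthread_mutex_unlock(&lock);
}

static void run_complete_job(void)
{
	pthread_mutex_lock(&lock);
	if (--nr_jobs == 0)		/* like atomic_dec_and_test() */
		pthread_cond_broadcast(&destroyq);	/* wake_up() */
	pthread_mutex_unlock(&lock);
}

static void *worker(void *arg)
{
	(void)arg;
	run_complete_job();
	return NULL;
}

int main(void)
{
	pthread_t th;

	dispatch_job();
	pthread_create(&th, NULL, worker, NULL);

	pthread_mutex_lock(&lock);
	while (nr_jobs)			/* like wait_event(kc->destroyq, ...) */
		pthread_cond_wait(&destroyq, &lock);
	pthread_mutex_unlock(&lock);

	pthread_join(th, NULL);
	printf("all jobs done, safe to destroy client\n");
	return 0;
}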
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 5ed2228745cb..039e071c1007 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -43,6 +43,7 @@
43#include <linux/buffer_head.h> /* for invalidate_bdev */ 43#include <linux/buffer_head.h> /* for invalidate_bdev */
44#include <linux/suspend.h> 44#include <linux/suspend.h>
45#include <linux/poll.h> 45#include <linux/poll.h>
46#include <linux/mutex.h>
46 47
47#include <linux/init.h> 48#include <linux/init.h>
48 49
@@ -158,11 +159,12 @@ static int start_readonly;
158 */ 159 */
159static DECLARE_WAIT_QUEUE_HEAD(md_event_waiters); 160static DECLARE_WAIT_QUEUE_HEAD(md_event_waiters);
160static atomic_t md_event_count; 161static atomic_t md_event_count;
161static void md_new_event(mddev_t *mddev) 162void md_new_event(mddev_t *mddev)
162{ 163{
163 atomic_inc(&md_event_count); 164 atomic_inc(&md_event_count);
164 wake_up(&md_event_waiters); 165 wake_up(&md_event_waiters);
165} 166}
167EXPORT_SYMBOL_GPL(md_new_event);
166 168
167/* 169/*
168 * Enables to iterate over all existing md arrays 170 * Enables to iterate over all existing md arrays
@@ -253,7 +255,7 @@ static mddev_t * mddev_find(dev_t unit)
253 else 255 else
254 new->md_minor = MINOR(unit) >> MdpMinorShift; 256 new->md_minor = MINOR(unit) >> MdpMinorShift;
255 257
256 init_MUTEX(&new->reconfig_sem); 258 mutex_init(&new->reconfig_mutex);
257 INIT_LIST_HEAD(&new->disks); 259 INIT_LIST_HEAD(&new->disks);
258 INIT_LIST_HEAD(&new->all_mddevs); 260 INIT_LIST_HEAD(&new->all_mddevs);
259 init_timer(&new->safemode_timer); 261 init_timer(&new->safemode_timer);
@@ -266,6 +268,7 @@ static mddev_t * mddev_find(dev_t unit)
266 kfree(new); 268 kfree(new);
267 return NULL; 269 return NULL;
268 } 270 }
271 set_bit(QUEUE_FLAG_CLUSTER, &new->queue->queue_flags);
269 272
270 blk_queue_make_request(new->queue, md_fail_request); 273 blk_queue_make_request(new->queue, md_fail_request);
271 274
@@ -274,22 +277,22 @@ static mddev_t * mddev_find(dev_t unit)
274 277
275static inline int mddev_lock(mddev_t * mddev) 278static inline int mddev_lock(mddev_t * mddev)
276{ 279{
277 return down_interruptible(&mddev->reconfig_sem); 280 return mutex_lock_interruptible(&mddev->reconfig_mutex);
278} 281}
279 282
280static inline void mddev_lock_uninterruptible(mddev_t * mddev) 283static inline void mddev_lock_uninterruptible(mddev_t * mddev)
281{ 284{
282 down(&mddev->reconfig_sem); 285 mutex_lock(&mddev->reconfig_mutex);
283} 286}
284 287
285static inline int mddev_trylock(mddev_t * mddev) 288static inline int mddev_trylock(mddev_t * mddev)
286{ 289{
287 return down_trylock(&mddev->reconfig_sem); 290 return mutex_trylock(&mddev->reconfig_mutex);
288} 291}
289 292
290static inline void mddev_unlock(mddev_t * mddev) 293static inline void mddev_unlock(mddev_t * mddev)
291{ 294{
292 up(&mddev->reconfig_sem); 295 mutex_unlock(&mddev->reconfig_mutex);
293 296
294 md_wakeup_thread(mddev->thread); 297 md_wakeup_thread(mddev->thread);
295} 298}
@@ -660,7 +663,8 @@ static int super_90_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version
660 } 663 }
661 664
662 if (sb->major_version != 0 || 665 if (sb->major_version != 0 ||
663 sb->minor_version != 90) { 666 sb->minor_version < 90 ||
667 sb->minor_version > 91) {
664 printk(KERN_WARNING "Bad version number %d.%d on %s\n", 668 printk(KERN_WARNING "Bad version number %d.%d on %s\n",
665 sb->major_version, sb->minor_version, 669 sb->major_version, sb->minor_version,
666 b); 670 b);
@@ -745,6 +749,20 @@ static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev)
745 mddev->bitmap_offset = 0; 749 mddev->bitmap_offset = 0;
746 mddev->default_bitmap_offset = MD_SB_BYTES >> 9; 750 mddev->default_bitmap_offset = MD_SB_BYTES >> 9;
747 751
752 if (mddev->minor_version >= 91) {
753 mddev->reshape_position = sb->reshape_position;
754 mddev->delta_disks = sb->delta_disks;
755 mddev->new_level = sb->new_level;
756 mddev->new_layout = sb->new_layout;
757 mddev->new_chunk = sb->new_chunk;
758 } else {
759 mddev->reshape_position = MaxSector;
760 mddev->delta_disks = 0;
761 mddev->new_level = mddev->level;
762 mddev->new_layout = mddev->layout;
763 mddev->new_chunk = mddev->chunk_size;
764 }
765
748 if (sb->state & (1<<MD_SB_CLEAN)) 766 if (sb->state & (1<<MD_SB_CLEAN))
749 mddev->recovery_cp = MaxSector; 767 mddev->recovery_cp = MaxSector;
750 else { 768 else {
@@ -764,7 +782,8 @@ static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev)
764 782
765 if (sb->state & (1<<MD_SB_BITMAP_PRESENT) && 783 if (sb->state & (1<<MD_SB_BITMAP_PRESENT) &&
766 mddev->bitmap_file == NULL) { 784 mddev->bitmap_file == NULL) {
767 if (mddev->level != 1 && mddev->level != 5 && mddev->level != 6 785 if (mddev->level != 1 && mddev->level != 4
786 && mddev->level != 5 && mddev->level != 6
768 && mddev->level != 10) { 787 && mddev->level != 10) {
769 /* FIXME use a better test */ 788 /* FIXME use a better test */
770 printk(KERN_WARNING "md: bitmaps not supported for this level.\n"); 789 printk(KERN_WARNING "md: bitmaps not supported for this level.\n");
@@ -838,7 +857,6 @@ static void super_90_sync(mddev_t *mddev, mdk_rdev_t *rdev)
838 857
839 sb->md_magic = MD_SB_MAGIC; 858 sb->md_magic = MD_SB_MAGIC;
840 sb->major_version = mddev->major_version; 859 sb->major_version = mddev->major_version;
841 sb->minor_version = mddev->minor_version;
842 sb->patch_version = mddev->patch_version; 860 sb->patch_version = mddev->patch_version;
843 sb->gvalid_words = 0; /* ignored */ 861 sb->gvalid_words = 0; /* ignored */
844 memcpy(&sb->set_uuid0, mddev->uuid+0, 4); 862 memcpy(&sb->set_uuid0, mddev->uuid+0, 4);
@@ -857,6 +875,17 @@ static void super_90_sync(mddev_t *mddev, mdk_rdev_t *rdev)
857 sb->events_hi = (mddev->events>>32); 875 sb->events_hi = (mddev->events>>32);
858 sb->events_lo = (u32)mddev->events; 876 sb->events_lo = (u32)mddev->events;
859 877
878 if (mddev->reshape_position == MaxSector)
879 sb->minor_version = 90;
880 else {
881 sb->minor_version = 91;
882 sb->reshape_position = mddev->reshape_position;
883 sb->new_level = mddev->new_level;
884 sb->delta_disks = mddev->delta_disks;
885 sb->new_layout = mddev->new_layout;
886 sb->new_chunk = mddev->new_chunk;
887 }
888 mddev->minor_version = sb->minor_version;
860 if (mddev->in_sync) 889 if (mddev->in_sync)
861 { 890 {
862 sb->recovery_cp = mddev->recovery_cp; 891 sb->recovery_cp = mddev->recovery_cp;
@@ -893,10 +922,9 @@ static void super_90_sync(mddev_t *mddev, mdk_rdev_t *rdev)
893 d->raid_disk = rdev2->raid_disk; 922 d->raid_disk = rdev2->raid_disk;
894 else 923 else
895 d->raid_disk = rdev2->desc_nr; /* compatibility */ 924 d->raid_disk = rdev2->desc_nr; /* compatibility */
896 if (test_bit(Faulty, &rdev2->flags)) { 925 if (test_bit(Faulty, &rdev2->flags))
897 d->state = (1<<MD_DISK_FAULTY); 926 d->state = (1<<MD_DISK_FAULTY);
898 failed++; 927 else if (test_bit(In_sync, &rdev2->flags)) {
899 } else if (test_bit(In_sync, &rdev2->flags)) {
900 d->state = (1<<MD_DISK_ACTIVE); 928 d->state = (1<<MD_DISK_ACTIVE);
901 d->state |= (1<<MD_DISK_SYNC); 929 d->state |= (1<<MD_DISK_SYNC);
902 active++; 930 active++;
@@ -1102,6 +1130,20 @@ static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev)
1102 } 1130 }
1103 mddev->bitmap_offset = (__s32)le32_to_cpu(sb->bitmap_offset); 1131 mddev->bitmap_offset = (__s32)le32_to_cpu(sb->bitmap_offset);
1104 } 1132 }
1133 if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE)) {
1134 mddev->reshape_position = le64_to_cpu(sb->reshape_position);
1135 mddev->delta_disks = le32_to_cpu(sb->delta_disks);
1136 mddev->new_level = le32_to_cpu(sb->new_level);
1137 mddev->new_layout = le32_to_cpu(sb->new_layout);
1138 mddev->new_chunk = le32_to_cpu(sb->new_chunk)<<9;
1139 } else {
1140 mddev->reshape_position = MaxSector;
1141 mddev->delta_disks = 0;
1142 mddev->new_level = mddev->level;
1143 mddev->new_layout = mddev->layout;
1144 mddev->new_chunk = mddev->chunk_size;
1145 }
1146
1105 } else if (mddev->pers == NULL) { 1147 } else if (mddev->pers == NULL) {
1106 /* Insist on good event counter while assembling */ 1148 /* Insist on good event counter while assembling */
1107 __u64 ev1 = le64_to_cpu(sb->events); 1149 __u64 ev1 = le64_to_cpu(sb->events);
@@ -1173,6 +1215,14 @@ static void super_1_sync(mddev_t *mddev, mdk_rdev_t *rdev)
1173 sb->bitmap_offset = cpu_to_le32((__u32)mddev->bitmap_offset); 1215 sb->bitmap_offset = cpu_to_le32((__u32)mddev->bitmap_offset);
1174 sb->feature_map = cpu_to_le32(MD_FEATURE_BITMAP_OFFSET); 1216 sb->feature_map = cpu_to_le32(MD_FEATURE_BITMAP_OFFSET);
1175 } 1217 }
1218 if (mddev->reshape_position != MaxSector) {
1219 sb->feature_map |= cpu_to_le32(MD_FEATURE_RESHAPE_ACTIVE);
1220 sb->reshape_position = cpu_to_le64(mddev->reshape_position);
1221 sb->new_layout = cpu_to_le32(mddev->new_layout);
1222 sb->delta_disks = cpu_to_le32(mddev->delta_disks);
1223 sb->new_level = cpu_to_le32(mddev->new_level);
1224 sb->new_chunk = cpu_to_le32(mddev->new_chunk>>9);
1225 }
1176 1226
1177 max_dev = 0; 1227 max_dev = 0;
1178 ITERATE_RDEV(mddev,rdev2,tmp) 1228 ITERATE_RDEV(mddev,rdev2,tmp)
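Taken together, the two v1-superblock hunks above round-trip the reshape state: super_1_sync() stores the fields little-endian with new_chunk converted to sectors (bytes >> 9), and super_1_validate() reverses both steps. A hedged userspace model of that save/load pair; the feature-bit value and struct layout here are assumptions for illustration, and htole32()/le32toh() are glibc extensions from <endian.h>:

#include <endian.h>
#include <stdint.h>
#include <stdio.h>

#define MD_FEATURE_RESHAPE_ACTIVE 4	/* value assumed for illustration */

struct sb_model {
	uint32_t feature_map;
	uint64_t reshape_position;
	uint32_t new_chunk;	/* kept in sectors on disk */
};

static void save(struct sb_model *sb, uint64_t pos, unsigned chunk_bytes)
{
	sb->feature_map |= htole32(MD_FEATURE_RESHAPE_ACTIVE);
	sb->reshape_position = htole64(pos);
	sb->new_chunk = htole32(chunk_bytes >> 9);	/* bytes -> sectors */
}

static void load(const struct sb_model *sb, uint64_t *pos, unsigned *chunk_bytes)
{
	if (le32toh(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE) {
		*pos = le64toh(sb->reshape_position);
		*chunk_bytes = le32toh(sb->new_chunk) << 9;	/* sectors -> bytes */
	}
}

int main(void)
{
	struct sb_model sb = { 0 };
	uint64_t pos = 0;
	unsigned chunk = 0;

	save(&sb, 123456, 64 * 1024);
	load(&sb, &pos, &chunk);
	printf("pos=%llu chunk=%u\n", (unsigned long long)pos, chunk);
	return 0;
}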
@@ -1301,6 +1351,7 @@ static int bind_rdev_to_array(mdk_rdev_t * rdev, mddev_t * mddev)
1301 else 1351 else
1302 ko = &rdev->bdev->bd_disk->kobj; 1352 ko = &rdev->bdev->bd_disk->kobj;
1303 sysfs_create_link(&rdev->kobj, ko, "block"); 1353 sysfs_create_link(&rdev->kobj, ko, "block");
1354 bd_claim_by_disk(rdev->bdev, rdev, mddev->gendisk);
1304 return 0; 1355 return 0;
1305} 1356}
1306 1357
@@ -1311,6 +1362,7 @@ static void unbind_rdev_from_array(mdk_rdev_t * rdev)
1311 MD_BUG(); 1362 MD_BUG();
1312 return; 1363 return;
1313 } 1364 }
1365 bd_release_from_disk(rdev->bdev, rdev->mddev->gendisk);
1314 list_del_init(&rdev->same_set); 1366 list_del_init(&rdev->same_set);
1315 printk(KERN_INFO "md: unbind<%s>\n", bdevname(rdev->bdev,b)); 1367 printk(KERN_INFO "md: unbind<%s>\n", bdevname(rdev->bdev,b));
1316 rdev->mddev = NULL; 1368 rdev->mddev = NULL;
@@ -1493,7 +1545,7 @@ static void sync_sbs(mddev_t * mddev)
1493 } 1545 }
1494} 1546}
1495 1547
1496static void md_update_sb(mddev_t * mddev) 1548void md_update_sb(mddev_t * mddev)
1497{ 1549{
1498 int err; 1550 int err;
1499 struct list_head *tmp; 1551 struct list_head *tmp;
@@ -1570,6 +1622,7 @@ repeat:
1570 wake_up(&mddev->sb_wait); 1622 wake_up(&mddev->sb_wait);
1571 1623
1572} 1624}
1625EXPORT_SYMBOL_GPL(md_update_sb);
1573 1626
1574/* words written to sysfs files may, or may not, be \n terminated. 1627/* words written to sysfs files may, or may not, be \n terminated.
1575 * We want to accept in either case. For this we use cmd_match. 1628 * We want to accept in either case. For this we use cmd_match.
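cmd_match() itself is not shown in this hunk; the idea is that a word written to a sysfs file matches a command when the strings are identical up to one optional trailing newline. A small reimplementation of that idea, which may differ in detail from the kernel helper:

#include <stdio.h>

static int cmd_match(const char *cmd, const char *str)
{
	/* match 'str' exactly, allowing cmd to end with a single '\n' */
	while (*cmd && *str && *cmd == *str) {
		cmd++;
		str++;
	}
	if (*cmd == '\n')
		cmd++;
	return !*cmd && !*str;
}

int main(void)
{
	printf("%d %d %d\n", cmd_match("resync\n", "resync"),
	       cmd_match("resync", "resync"),
	       cmd_match("resyncX", "resync"));	/* prints: 1 1 0 */
	return 0;
}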
@@ -2162,7 +2215,9 @@ action_show(mddev_t *mddev, char *page)
2162 char *type = "idle"; 2215 char *type = "idle";
2163 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) || 2216 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) ||
2164 test_bit(MD_RECOVERY_NEEDED, &mddev->recovery)) { 2217 test_bit(MD_RECOVERY_NEEDED, &mddev->recovery)) {
2165 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { 2218 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
2219 type = "reshape";
2220 else if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
2166 if (!test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) 2221 if (!test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
2167 type = "resync"; 2222 type = "resync";
2168 else if (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)) 2223 else if (test_bit(MD_RECOVERY_CHECK, &mddev->recovery))
@@ -2193,7 +2248,14 @@ action_store(mddev_t *mddev, const char *page, size_t len)
2193 return -EBUSY; 2248 return -EBUSY;
2194 else if (cmd_match(page, "resync") || cmd_match(page, "recover")) 2249 else if (cmd_match(page, "resync") || cmd_match(page, "recover"))
2195 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 2250 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
2196 else { 2251 else if (cmd_match(page, "reshape")) {
2252 int err;
2253 if (mddev->pers->start_reshape == NULL)
2254 return -EINVAL;
2255 err = mddev->pers->start_reshape(mddev);
2256 if (err)
2257 return err;
2258 } else {
2197 if (cmd_match(page, "check")) 2259 if (cmd_match(page, "check"))
2198 set_bit(MD_RECOVERY_CHECK, &mddev->recovery); 2260 set_bit(MD_RECOVERY_CHECK, &mddev->recovery);
2199 else if (cmd_match(page, "repair")) 2261 else if (cmd_match(page, "repair"))
@@ -2304,6 +2366,63 @@ sync_completed_show(mddev_t *mddev, char *page)
2304static struct md_sysfs_entry 2366static struct md_sysfs_entry
2305md_sync_completed = __ATTR_RO(sync_completed); 2367md_sync_completed = __ATTR_RO(sync_completed);
2306 2368
2369static ssize_t
2370suspend_lo_show(mddev_t *mddev, char *page)
2371{
2372 return sprintf(page, "%llu\n", (unsigned long long)mddev->suspend_lo);
2373}
2374
2375static ssize_t
2376suspend_lo_store(mddev_t *mddev, const char *buf, size_t len)
2377{
2378 char *e;
2379 unsigned long long new = simple_strtoull(buf, &e, 10);
2380
2381 if (mddev->pers->quiesce == NULL)
2382 return -EINVAL;
2383 if (buf == e || (*e && *e != '\n'))
2384 return -EINVAL;
2385 if (new >= mddev->suspend_hi ||
2386 (new > mddev->suspend_lo && new < mddev->suspend_hi)) {
2387 mddev->suspend_lo = new;
2388 mddev->pers->quiesce(mddev, 2);
2389 return len;
2390 } else
2391 return -EINVAL;
2392}
2393static struct md_sysfs_entry md_suspend_lo =
2394__ATTR(suspend_lo, S_IRUGO|S_IWUSR, suspend_lo_show, suspend_lo_store);
2395
2396
2397static ssize_t
2398suspend_hi_show(mddev_t *mddev, char *page)
2399{
2400 return sprintf(page, "%llu\n", (unsigned long long)mddev->suspend_hi);
2401}
2402
2403static ssize_t
2404suspend_hi_store(mddev_t *mddev, const char *buf, size_t len)
2405{
2406 char *e;
2407 unsigned long long new = simple_strtoull(buf, &e, 10);
2408
2409 if (mddev->pers->quiesce == NULL)
2410 return -EINVAL;
2411 if (buf == e || (*e && *e != '\n'))
2412 return -EINVAL;
2413 if ((new <= mddev->suspend_lo && mddev->suspend_lo >= mddev->suspend_hi) ||
2414 (new > mddev->suspend_lo && new > mddev->suspend_hi)) {
2415 mddev->suspend_hi = new;
2416 mddev->pers->quiesce(mddev, 1);
2417 mddev->pers->quiesce(mddev, 0);
2418 return len;
2419 } else
2420 return -EINVAL;
2421}
2422static struct md_sysfs_entry md_suspend_hi =
2423__ATTR(suspend_hi, S_IRUGO|S_IWUSR, suspend_hi_show, suspend_hi_store);
2424
2425
2307static struct attribute *md_default_attrs[] = { 2426static struct attribute *md_default_attrs[] = {
2308 &md_level.attr, 2427 &md_level.attr,
2309 &md_raid_disks.attr, 2428 &md_raid_disks.attr,
@@ -2321,6 +2440,8 @@ static struct attribute *md_redundancy_attrs[] = {
2321 &md_sync_max.attr, 2440 &md_sync_max.attr,
2322 &md_sync_speed.attr, 2441 &md_sync_speed.attr,
2323 &md_sync_completed.attr, 2442 &md_sync_completed.attr,
2443 &md_suspend_lo.attr,
2444 &md_suspend_hi.attr,
2324 NULL, 2445 NULL,
2325}; 2446};
2326static struct attribute_group md_redundancy_group = { 2447static struct attribute_group md_redundancy_group = {
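Both stores above use the same parse-and-validate idiom: convert with simple_strtoull(), then accept the buffer only if at least one digit was consumed and nothing but an optional '\n' follows. The same check in standalone C, with the libc strtoull() standing in for the kernel helper:

#include <errno.h>
#include <stdio.h>
#include <stdlib.h>

static int parse_sector(const char *buf, unsigned long long *out)
{
	char *e;

	errno = 0;
	*out = strtoull(buf, &e, 10);
	if (buf == e || (*e && *e != '\n') || errno)
		return -EINVAL;	/* same rejection as suspend_lo_store() */
	return 0;
}

int main(void)
{
	unsigned long long v;

	printf("%d\n", parse_sector("12345\n", &v));	/* 0, v == 12345 */
	printf("%d\n", parse_sector("12x", &v));	/* -EINVAL */
	printf("%d\n", parse_sector("", &v));		/* -EINVAL */
	return 0;
}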
@@ -2380,7 +2501,7 @@ int mdp_major = 0;
2380 2501
2381static struct kobject *md_probe(dev_t dev, int *part, void *data) 2502static struct kobject *md_probe(dev_t dev, int *part, void *data)
2382{ 2503{
2383 static DECLARE_MUTEX(disks_sem); 2504 static DEFINE_MUTEX(disks_mutex);
2384 mddev_t *mddev = mddev_find(dev); 2505 mddev_t *mddev = mddev_find(dev);
2385 struct gendisk *disk; 2506 struct gendisk *disk;
2386 int partitioned = (MAJOR(dev) != MD_MAJOR); 2507 int partitioned = (MAJOR(dev) != MD_MAJOR);
@@ -2390,15 +2511,15 @@ static struct kobject *md_probe(dev_t dev, int *part, void *data)
2390 if (!mddev) 2511 if (!mddev)
2391 return NULL; 2512 return NULL;
2392 2513
2393 down(&disks_sem); 2514 mutex_lock(&disks_mutex);
2394 if (mddev->gendisk) { 2515 if (mddev->gendisk) {
2395 up(&disks_sem); 2516 mutex_unlock(&disks_mutex);
2396 mddev_put(mddev); 2517 mddev_put(mddev);
2397 return NULL; 2518 return NULL;
2398 } 2519 }
2399 disk = alloc_disk(1 << shift); 2520 disk = alloc_disk(1 << shift);
2400 if (!disk) { 2521 if (!disk) {
2401 up(&disks_sem); 2522 mutex_unlock(&disks_mutex);
2402 mddev_put(mddev); 2523 mddev_put(mddev);
2403 return NULL; 2524 return NULL;
2404 } 2525 }
@@ -2416,7 +2537,7 @@ static struct kobject *md_probe(dev_t dev, int *part, void *data)
2416 disk->queue = mddev->queue; 2537 disk->queue = mddev->queue;
2417 add_disk(disk); 2538 add_disk(disk);
2418 mddev->gendisk = disk; 2539 mddev->gendisk = disk;
2419 up(&disks_sem); 2540 mutex_unlock(&disks_mutex);
2420 mddev->kobj.parent = &disk->kobj; 2541 mddev->kobj.parent = &disk->kobj;
2421 mddev->kobj.k_name = NULL; 2542 mddev->kobj.k_name = NULL;
2422 snprintf(mddev->kobj.name, KOBJ_NAME_LEN, "%s", "md"); 2543 snprintf(mddev->kobj.name, KOBJ_NAME_LEN, "%s", "md");
@@ -2539,6 +2660,14 @@ static int do_md_run(mddev_t * mddev)
2539 mddev->level = pers->level; 2660 mddev->level = pers->level;
2540 strlcpy(mddev->clevel, pers->name, sizeof(mddev->clevel)); 2661 strlcpy(mddev->clevel, pers->name, sizeof(mddev->clevel));
2541 2662
2663 if (mddev->reshape_position != MaxSector &&
2664 pers->start_reshape == NULL) {
2665 /* This personality cannot handle reshaping... */
2666 mddev->pers = NULL;
2667 module_put(pers->owner);
2668 return -EINVAL;
2669 }
2670
2542 mddev->recovery = 0; 2671 mddev->recovery = 0;
2543 mddev->resync_max_sectors = mddev->size << 1; /* may be over-ridden by personality */ 2672 mddev->resync_max_sectors = mddev->size << 1; /* may be over-ridden by personality */
2544 mddev->barriers_work = 1; 2673 mddev->barriers_work = 1;
@@ -2772,7 +2901,6 @@ static void autorun_array(mddev_t *mddev)
2772 */ 2901 */
2773static void autorun_devices(int part) 2902static void autorun_devices(int part)
2774{ 2903{
2775 struct list_head candidates;
2776 struct list_head *tmp; 2904 struct list_head *tmp;
2777 mdk_rdev_t *rdev0, *rdev; 2905 mdk_rdev_t *rdev0, *rdev;
2778 mddev_t *mddev; 2906 mddev_t *mddev;
@@ -2781,6 +2909,7 @@ static void autorun_devices(int part)
2781 printk(KERN_INFO "md: autorun ...\n"); 2909 printk(KERN_INFO "md: autorun ...\n");
2782 while (!list_empty(&pending_raid_disks)) { 2910 while (!list_empty(&pending_raid_disks)) {
2783 dev_t dev; 2911 dev_t dev;
2912 LIST_HEAD(candidates);
2784 rdev0 = list_entry(pending_raid_disks.next, 2913 rdev0 = list_entry(pending_raid_disks.next,
2785 mdk_rdev_t, same_set); 2914 mdk_rdev_t, same_set);
2786 2915
@@ -3427,11 +3556,18 @@ static int set_array_info(mddev_t * mddev, mdu_array_info_t *info)
3427 mddev->default_bitmap_offset = MD_SB_BYTES >> 9; 3556 mddev->default_bitmap_offset = MD_SB_BYTES >> 9;
3428 mddev->bitmap_offset = 0; 3557 mddev->bitmap_offset = 0;
3429 3558
3559 mddev->reshape_position = MaxSector;
3560
3430 /* 3561 /*
3431 * Generate a 128 bit UUID 3562 * Generate a 128 bit UUID
3432 */ 3563 */
3433 get_random_bytes(mddev->uuid, 16); 3564 get_random_bytes(mddev->uuid, 16);
3434 3565
3566 mddev->new_level = mddev->level;
3567 mddev->new_chunk = mddev->chunk_size;
3568 mddev->new_layout = mddev->layout;
3569 mddev->delta_disks = 0;
3570
3435 return 0; 3571 return 0;
3436} 3572}
3437 3573
@@ -3440,6 +3576,7 @@ static int update_size(mddev_t *mddev, unsigned long size)
3440 mdk_rdev_t * rdev; 3576 mdk_rdev_t * rdev;
3441 int rv; 3577 int rv;
3442 struct list_head *tmp; 3578 struct list_head *tmp;
3579 int fit = (size == 0);
3443 3580
3444 if (mddev->pers->resize == NULL) 3581 if (mddev->pers->resize == NULL)
3445 return -EINVAL; 3582 return -EINVAL;
@@ -3457,7 +3594,6 @@ static int update_size(mddev_t *mddev, unsigned long size)
3457 return -EBUSY; 3594 return -EBUSY;
3458 ITERATE_RDEV(mddev,rdev,tmp) { 3595 ITERATE_RDEV(mddev,rdev,tmp) {
3459 sector_t avail; 3596 sector_t avail;
3460 int fit = (size == 0);
3461 if (rdev->sb_offset > rdev->data_offset) 3597 if (rdev->sb_offset > rdev->data_offset)
3462 avail = (rdev->sb_offset*2) - rdev->data_offset; 3598 avail = (rdev->sb_offset*2) - rdev->data_offset;
3463 else 3599 else
@@ -3487,14 +3623,16 @@ static int update_raid_disks(mddev_t *mddev, int raid_disks)
3487{ 3623{
3488 int rv; 3624 int rv;
3489 /* change the number of raid disks */ 3625 /* change the number of raid disks */
3490 if (mddev->pers->reshape == NULL) 3626 if (mddev->pers->check_reshape == NULL)
3491 return -EINVAL; 3627 return -EINVAL;
3492 if (raid_disks <= 0 || 3628 if (raid_disks <= 0 ||
3493 raid_disks >= mddev->max_disks) 3629 raid_disks >= mddev->max_disks)
3494 return -EINVAL; 3630 return -EINVAL;
3495 if (mddev->sync_thread) 3631 if (mddev->sync_thread || mddev->reshape_position != MaxSector)
3496 return -EBUSY; 3632 return -EBUSY;
3497 rv = mddev->pers->reshape(mddev, raid_disks); 3633 mddev->delta_disks = raid_disks - mddev->raid_disks;
3634
3635 rv = mddev->pers->check_reshape(mddev);
3498 return rv; 3636 return rv;
3499} 3637}
3500 3638
@@ -4041,7 +4179,10 @@ static void status_unused(struct seq_file *seq)
4041 4179
4042static void status_resync(struct seq_file *seq, mddev_t * mddev) 4180static void status_resync(struct seq_file *seq, mddev_t * mddev)
4043{ 4181{
4044 unsigned long max_blocks, resync, res, dt, db, rt; 4182 sector_t max_blocks, resync, res;
4183 unsigned long dt, db, rt;
4184 int scale;
4185 unsigned int per_milli;
4045 4186
4046 resync = (mddev->curr_resync - atomic_read(&mddev->recovery_active))/2; 4187 resync = (mddev->curr_resync - atomic_read(&mddev->recovery_active))/2;
4047 4188
@@ -4057,9 +4198,22 @@ static void status_resync(struct seq_file *seq, mddev_t * mddev)
4057 MD_BUG(); 4198 MD_BUG();
4058 return; 4199 return;
4059 } 4200 }
4060 res = (resync/1024)*1000/(max_blocks/1024 + 1); 4201 /* Pick 'scale' such that (resync>>scale)*1000 will fit
4202 * in a sector_t, and (max_blocks>>scale) will fit in a
4203 * u32, as those are the requirements for sector_div.
4204 * Thus 'scale' must be at least 10
4205 */
4206 scale = 10;
4207 if (sizeof(sector_t) > sizeof(unsigned long)) {
4208 while ( max_blocks/2 > (1ULL<<(scale+32)))
4209 scale++;
4210 }
4211 res = (resync>>scale)*1000;
4212 sector_div(res, (u32)((max_blocks>>scale)+1));
4213
4214 per_milli = res;
4061 { 4215 {
4062 int i, x = res/50, y = 20-x; 4216 int i, x = per_milli/50, y = 20-x;
4063 seq_printf(seq, "["); 4217 seq_printf(seq, "[");
4064 for (i = 0; i < x; i++) 4218 for (i = 0; i < x; i++)
4065 seq_printf(seq, "="); 4219 seq_printf(seq, "=");
@@ -4068,10 +4222,14 @@ static void status_resync(struct seq_file *seq, mddev_t * mddev)
4068 seq_printf(seq, "."); 4222 seq_printf(seq, ".");
4069 seq_printf(seq, "] "); 4223 seq_printf(seq, "] ");
4070 } 4224 }
4071 seq_printf(seq, " %s =%3lu.%lu%% (%lu/%lu)", 4225 seq_printf(seq, " %s =%3u.%u%% (%llu/%llu)",
4226 (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)?
4227 "reshape" :
4072 (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ? 4228 (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ?
4073 "resync" : "recovery"), 4229 "resync" : "recovery")),
4074 res/10, res % 10, resync, max_blocks); 4230 per_milli/10, per_milli % 10,
4231 (unsigned long long) resync,
4232 (unsigned long long) max_blocks);
4075 4233
4076 /* 4234 /*
4077 * We do not want to overflow, so the order of operands and 4235 * We do not want to overflow, so the order of operands and
@@ -4085,7 +4243,7 @@ static void status_resync(struct seq_file *seq, mddev_t * mddev)
4085 dt = ((jiffies - mddev->resync_mark) / HZ); 4243 dt = ((jiffies - mddev->resync_mark) / HZ);
4086 if (!dt) dt++; 4244 if (!dt) dt++;
4087 db = resync - (mddev->resync_mark_cnt/2); 4245 db = resync - (mddev->resync_mark_cnt/2);
4088 rt = (dt * ((max_blocks-resync) / (db/100+1)))/100; 4246 rt = (dt * ((unsigned long)(max_blocks-resync) / (db/100+1)))/100;
4089 4247
4090 seq_printf(seq, " finish=%lu.%lumin", rt / 60, (rt % 60)/6); 4248 seq_printf(seq, " finish=%lu.%lumin", rt / 60, (rt % 60)/6);
4091 4249
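The new progress computation deserves a worked example. sector_div() can only divide a sector_t by a u32, so both operands are shifted right by 'scale' until the divisor fits in 32 bits, trading a little precision for a safe 64-by-32 division. A standalone sketch of the arithmetic, using plain C types rather than kernel ones:

#include <stdint.h>
#include <stdio.h>

static unsigned int per_milli(uint64_t resync, uint64_t max_blocks)
{
	int scale = 10;		/* so (resync >> scale) * 1000 stays safe */
	uint64_t res;

	while (max_blocks / 2 > (1ULL << (scale + 32)))
		scale++;	/* shrink divisor until it fits in a u32 */

	res = (resync >> scale) * 1000;
	res /= (uint32_t)((max_blocks >> scale) + 1);	/* 64/32 divide */
	return (unsigned int)res;
}

int main(void)
{
	/* e.g. 3 TB resynced of a 12 TB array, counted in 1K blocks */
	printf("%u per mille\n", per_milli(3ULL << 30, 12ULL << 30));
	return 0;
}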
@@ -4442,7 +4600,7 @@ static DECLARE_WAIT_QUEUE_HEAD(resync_wait);
4442 4600
4443#define SYNC_MARKS 10 4601#define SYNC_MARKS 10
4444#define SYNC_MARK_STEP (3*HZ) 4602#define SYNC_MARK_STEP (3*HZ)
4445static void md_do_sync(mddev_t *mddev) 4603void md_do_sync(mddev_t *mddev)
4446{ 4604{
4447 mddev_t *mddev2; 4605 mddev_t *mddev2;
4448 unsigned int currspeed = 0, 4606 unsigned int currspeed = 0,
@@ -4522,7 +4680,9 @@ static void md_do_sync(mddev_t *mddev)
4522 */ 4680 */
4523 max_sectors = mddev->resync_max_sectors; 4681 max_sectors = mddev->resync_max_sectors;
4524 mddev->resync_mismatches = 0; 4682 mddev->resync_mismatches = 0;
4525 } else 4683 } else if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
4684 max_sectors = mddev->size << 1;
4685 else
4526 /* recovery follows the physical size of devices */ 4686 /* recovery follows the physical size of devices */
4527 max_sectors = mddev->size << 1; 4687 max_sectors = mddev->size << 1;
4528 4688
@@ -4658,6 +4818,8 @@ static void md_do_sync(mddev_t *mddev)
4658 mddev->pers->sync_request(mddev, max_sectors, &skipped, 1); 4818 mddev->pers->sync_request(mddev, max_sectors, &skipped, 1);
4659 4819
4660 if (!test_bit(MD_RECOVERY_ERR, &mddev->recovery) && 4820 if (!test_bit(MD_RECOVERY_ERR, &mddev->recovery) &&
4821 test_bit(MD_RECOVERY_SYNC, &mddev->recovery) &&
4822 !test_bit(MD_RECOVERY_CHECK, &mddev->recovery) &&
4661 mddev->curr_resync > 2 && 4823 mddev->curr_resync > 2 &&
4662 mddev->curr_resync >= mddev->recovery_cp) { 4824 mddev->curr_resync >= mddev->recovery_cp) {
4663 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) { 4825 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
@@ -4675,6 +4837,7 @@ static void md_do_sync(mddev_t *mddev)
4675 set_bit(MD_RECOVERY_DONE, &mddev->recovery); 4837 set_bit(MD_RECOVERY_DONE, &mddev->recovery);
4676 md_wakeup_thread(mddev->thread); 4838 md_wakeup_thread(mddev->thread);
4677} 4839}
4840EXPORT_SYMBOL_GPL(md_do_sync);
4678 4841
4679 4842
4680/* 4843/*
@@ -4730,7 +4893,7 @@ void md_check_recovery(mddev_t *mddev)
4730 )) 4893 ))
4731 return; 4894 return;
4732 4895
4733 if (mddev_trylock(mddev)==0) { 4896 if (mddev_trylock(mddev)) {
4734 int spares =0; 4897 int spares =0;
4735 4898
4736 spin_lock_irq(&mddev->write_lock); 4899 spin_lock_irq(&mddev->write_lock);
@@ -4866,7 +5029,7 @@ static int md_notify_reboot(struct notifier_block *this,
4866 printk(KERN_INFO "md: stopping all md devices.\n"); 5029 printk(KERN_INFO "md: stopping all md devices.\n");
4867 5030
4868 ITERATE_MDDEV(mddev,tmp) 5031 ITERATE_MDDEV(mddev,tmp)
4869 if (mddev_trylock(mddev)==0) 5032 if (mddev_trylock(mddev))
4870 do_md_stop (mddev, 1); 5033 do_md_stop (mddev, 1);
4871 /* 5034 /*
4872 * certain more exotic SCSI devices are known to be 5035 * certain more exotic SCSI devices are known to be
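The two mddev_trylock() call sites above flip from '== 0' to plain truth tests because the return convention inverted: down_trylock() returns 0 on success, while mutex_trylock() returns 1 on success. POSIX uses a third convention again, as this small demo shows (pthread_mutex_trylock() returns 0 on success and EBUSY on contention):

#include <errno.h>
#include <pthread.h>
#include <stdio.h>

int main(void)
{
	pthread_mutex_t m = PTHREAD_MUTEX_INITIALIZER;

	if (pthread_mutex_trylock(&m) == 0)	/* 0 means "got the lock" */
		printf("first trylock succeeded\n");
	if (pthread_mutex_trylock(&m) == EBUSY)	/* already held */
		printf("second trylock reported EBUSY\n");
	pthread_mutex_unlock(&m);
	return 0;
}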
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 5d88329e3c7a..3cb0872a845d 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -1402,6 +1402,9 @@ static void raid1d(mddev_t *mddev)
1402 clear_bit(R1BIO_BarrierRetry, &r1_bio->state); 1402 clear_bit(R1BIO_BarrierRetry, &r1_bio->state);
1403 clear_bit(R1BIO_Barrier, &r1_bio->state); 1403 clear_bit(R1BIO_Barrier, &r1_bio->state);
1404 for (i=0; i < conf->raid_disks; i++) 1404 for (i=0; i < conf->raid_disks; i++)
1405 if (r1_bio->bios[i])
1406 atomic_inc(&r1_bio->remaining);
1407 for (i=0; i < conf->raid_disks; i++)
1405 if (r1_bio->bios[i]) { 1408 if (r1_bio->bios[i]) {
1406 struct bio_vec *bvec; 1409 struct bio_vec *bvec;
1407 int j; 1410 int j;
@@ -1789,6 +1792,11 @@ static int run(mddev_t *mddev)
1789 mdname(mddev), mddev->level); 1792 mdname(mddev), mddev->level);
1790 goto out; 1793 goto out;
1791 } 1794 }
1795 if (mddev->reshape_position != MaxSector) {
1796 printk("raid1: %s: reshape_position set but not supported\n",
1797 mdname(mddev));
1798 goto out;
1799 }
1792 /* 1800 /*
1793 * copy the already verified devices into our private RAID1 1801 * copy the already verified devices into our private RAID1
1794 * bookkeeping area. [whatever we allocate in run(), 1802 * bookkeeping area. [whatever we allocate in run(),
@@ -1971,7 +1979,7 @@ static int raid1_resize(mddev_t *mddev, sector_t sectors)
1971 return 0; 1979 return 0;
1972} 1980}
1973 1981
1974static int raid1_reshape(mddev_t *mddev, int raid_disks) 1982static int raid1_reshape(mddev_t *mddev)
1975{ 1983{
1976 /* We need to: 1984 /* We need to:
1977 * 1/ resize the r1bio_pool 1985 * 1/ resize the r1bio_pool
@@ -1988,10 +1996,22 @@ static int raid1_reshape(mddev_t *mddev, int raid_disks)
1988 struct pool_info *newpoolinfo; 1996 struct pool_info *newpoolinfo;
1989 mirror_info_t *newmirrors; 1997 mirror_info_t *newmirrors;
1990 conf_t *conf = mddev_to_conf(mddev); 1998 conf_t *conf = mddev_to_conf(mddev);
1991 int cnt; 1999 int cnt, raid_disks;
1992 2000
1993 int d, d2; 2001 int d, d2;
1994 2002
2003 /* Cannot change chunk_size, layout, or level */
2004 if (mddev->chunk_size != mddev->new_chunk ||
2005 mddev->layout != mddev->new_layout ||
2006 mddev->level != mddev->new_level) {
2007 mddev->new_chunk = mddev->chunk_size;
2008 mddev->new_layout = mddev->layout;
2009 mddev->new_level = mddev->level;
2010 return -EINVAL;
2011 }
2012
2013 raid_disks = mddev->raid_disks + mddev->delta_disks;
2014
1995 if (raid_disks < conf->raid_disks) { 2015 if (raid_disks < conf->raid_disks) {
1996 cnt=0; 2016 cnt=0;
1997 for (d= 0; d < conf->raid_disks; d++) 2017 for (d= 0; d < conf->raid_disks; d++)
@@ -2038,6 +2058,7 @@ static int raid1_reshape(mddev_t *mddev, int raid_disks)
2038 2058
2039 mddev->degraded += (raid_disks - conf->raid_disks); 2059 mddev->degraded += (raid_disks - conf->raid_disks);
2040 conf->raid_disks = mddev->raid_disks = raid_disks; 2060 conf->raid_disks = mddev->raid_disks = raid_disks;
2061 mddev->delta_disks = 0;
2041 2062
2042 conf->last_used = 0; /* just make sure it is in-range */ 2063 conf->last_used = 0; /* just make sure it is in-range */
2043 lower_barrier(conf); 2064 lower_barrier(conf);
@@ -2079,7 +2100,7 @@ static struct mdk_personality raid1_personality =
2079 .spare_active = raid1_spare_active, 2100 .spare_active = raid1_spare_active,
2080 .sync_request = sync_request, 2101 .sync_request = sync_request,
2081 .resize = raid1_resize, 2102 .resize = raid1_resize,
2082 .reshape = raid1_reshape, 2103 .check_reshape = raid1_reshape,
2083 .quiesce = raid1_quiesce, 2104 .quiesce = raid1_quiesce,
2084}; 2105};
2085 2106
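raid1_reshape() above illustrates the new check_reshape contract: the hook validates the pending request, reverts anything it cannot honour, and applies only the disk-count change. A reduced model of that policy; the struct here is an invented stand-in for the mddev fields the patch touches:

#include <errno.h>
#include <stdio.h>

struct mddev_model {
	int chunk_size, new_chunk;
	int layout, new_layout;
	int level, new_level;
	int raid_disks, delta_disks;
};

static int raid1_check_reshape(struct mddev_model *m)
{
	if (m->new_chunk != m->chunk_size ||
	    m->new_layout != m->layout ||
	    m->new_level != m->level) {
		m->new_chunk = m->chunk_size;	/* revert the request */
		m->new_layout = m->layout;
		m->new_level = m->level;
		return -EINVAL;
	}
	m->raid_disks += m->delta_disks;	/* only change allowed */
	m->delta_disks = 0;
	return 0;
}

int main(void)
{
	struct mddev_model m = {
		.chunk_size = 64, .new_chunk = 64,
		.layout = 0, .new_layout = 0,
		.level = 1, .new_level = 1,
		.raid_disks = 2, .delta_disks = 1,
	};

	if (raid1_check_reshape(&m) == 0)
		printf("raid1 grown to %d disks\n", m.raid_disks);
	return 0;
}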
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 2dba305daf3c..dae740adaf65 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -22,6 +22,7 @@
22#include <linux/raid/raid5.h> 22#include <linux/raid/raid5.h>
23#include <linux/highmem.h> 23#include <linux/highmem.h>
24#include <linux/bitops.h> 24#include <linux/bitops.h>
25#include <linux/kthread.h>
25#include <asm/atomic.h> 26#include <asm/atomic.h>
26 27
27#include <linux/raid/bitmap.h> 28#include <linux/raid/bitmap.h>
@@ -93,11 +94,11 @@ static void __release_stripe(raid5_conf_t *conf, struct stripe_head *sh)
93 if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD) 94 if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD)
94 md_wakeup_thread(conf->mddev->thread); 95 md_wakeup_thread(conf->mddev->thread);
95 } 96 }
96 list_add_tail(&sh->lru, &conf->inactive_list);
97 atomic_dec(&conf->active_stripes); 97 atomic_dec(&conf->active_stripes);
98 if (!conf->inactive_blocked || 98 if (!test_bit(STRIPE_EXPANDING, &sh->state)) {
99 atomic_read(&conf->active_stripes) < (conf->max_nr_stripes*3/4)) 99 list_add_tail(&sh->lru, &conf->inactive_list);
100 wake_up(&conf->wait_for_stripe); 100 wake_up(&conf->wait_for_stripe);
101 }
101 } 102 }
102 } 103 }
103} 104}
@@ -178,10 +179,10 @@ static int grow_buffers(struct stripe_head *sh, int num)
178 179
179static void raid5_build_block (struct stripe_head *sh, int i); 180static void raid5_build_block (struct stripe_head *sh, int i);
180 181
181static void init_stripe(struct stripe_head *sh, sector_t sector, int pd_idx) 182static void init_stripe(struct stripe_head *sh, sector_t sector, int pd_idx, int disks)
182{ 183{
183 raid5_conf_t *conf = sh->raid_conf; 184 raid5_conf_t *conf = sh->raid_conf;
184 int disks = conf->raid_disks, i; 185 int i;
185 186
186 if (atomic_read(&sh->count) != 0) 187 if (atomic_read(&sh->count) != 0)
187 BUG(); 188 BUG();
@@ -198,7 +199,9 @@ static void init_stripe(struct stripe_head *sh, sector_t sector, int pd_idx)
198 sh->pd_idx = pd_idx; 199 sh->pd_idx = pd_idx;
199 sh->state = 0; 200 sh->state = 0;
200 201
201 for (i=disks; i--; ) { 202 sh->disks = disks;
203
204 for (i = sh->disks; i--; ) {
202 struct r5dev *dev = &sh->dev[i]; 205 struct r5dev *dev = &sh->dev[i];
203 206
204 if (dev->toread || dev->towrite || dev->written || 207 if (dev->toread || dev->towrite || dev->written ||
@@ -215,7 +218,7 @@ static void init_stripe(struct stripe_head *sh, sector_t sector, int pd_idx)
215 insert_hash(conf, sh); 218 insert_hash(conf, sh);
216} 219}
217 220
218static struct stripe_head *__find_stripe(raid5_conf_t *conf, sector_t sector) 221static struct stripe_head *__find_stripe(raid5_conf_t *conf, sector_t sector, int disks)
219{ 222{
220 struct stripe_head *sh; 223 struct stripe_head *sh;
221 struct hlist_node *hn; 224 struct hlist_node *hn;
@@ -223,7 +226,7 @@ static struct stripe_head *__find_stripe(raid5_conf_t *conf, sector_t sector)
223 CHECK_DEVLOCK(); 226 CHECK_DEVLOCK();
224 PRINTK("__find_stripe, sector %llu\n", (unsigned long long)sector); 227 PRINTK("__find_stripe, sector %llu\n", (unsigned long long)sector);
225 hlist_for_each_entry(sh, hn, stripe_hash(conf, sector), hash) 228 hlist_for_each_entry(sh, hn, stripe_hash(conf, sector), hash)
226 if (sh->sector == sector) 229 if (sh->sector == sector && sh->disks == disks)
227 return sh; 230 return sh;
228 PRINTK("__stripe %llu not in cache\n", (unsigned long long)sector); 231 PRINTK("__stripe %llu not in cache\n", (unsigned long long)sector);
229 return NULL; 232 return NULL;
@@ -232,8 +235,8 @@ static struct stripe_head *__find_stripe(raid5_conf_t *conf, sector_t sector)
232static void unplug_slaves(mddev_t *mddev); 235static void unplug_slaves(mddev_t *mddev);
233static void raid5_unplug_device(request_queue_t *q); 236static void raid5_unplug_device(request_queue_t *q);
234 237
235static struct stripe_head *get_active_stripe(raid5_conf_t *conf, sector_t sector, 238static struct stripe_head *get_active_stripe(raid5_conf_t *conf, sector_t sector, int disks,
236 int pd_idx, int noblock) 239 int pd_idx, int noblock)
237{ 240{
238 struct stripe_head *sh; 241 struct stripe_head *sh;
239 242
@@ -245,7 +248,7 @@ static struct stripe_head *get_active_stripe(raid5_conf_t *conf, sector_t sector
245 wait_event_lock_irq(conf->wait_for_stripe, 248 wait_event_lock_irq(conf->wait_for_stripe,
246 conf->quiesce == 0, 249 conf->quiesce == 0,
247 conf->device_lock, /* nothing */); 250 conf->device_lock, /* nothing */);
248 sh = __find_stripe(conf, sector); 251 sh = __find_stripe(conf, sector, disks);
249 if (!sh) { 252 if (!sh) {
250 if (!conf->inactive_blocked) 253 if (!conf->inactive_blocked)
251 sh = get_free_stripe(conf); 254 sh = get_free_stripe(conf);
@@ -259,11 +262,11 @@ static struct stripe_head *get_active_stripe(raid5_conf_t *conf, sector_t sector
259 < (conf->max_nr_stripes *3/4) 262 < (conf->max_nr_stripes *3/4)
260 || !conf->inactive_blocked), 263 || !conf->inactive_blocked),
261 conf->device_lock, 264 conf->device_lock,
262 unplug_slaves(conf->mddev); 265 unplug_slaves(conf->mddev)
263 ); 266 );
264 conf->inactive_blocked = 0; 267 conf->inactive_blocked = 0;
265 } else 268 } else
266 init_stripe(sh, sector, pd_idx); 269 init_stripe(sh, sector, pd_idx, disks);
267 } else { 270 } else {
268 if (atomic_read(&sh->count)) { 271 if (atomic_read(&sh->count)) {
269 if (!list_empty(&sh->lru)) 272 if (!list_empty(&sh->lru))
@@ -271,9 +274,8 @@ static struct stripe_head *get_active_stripe(raid5_conf_t *conf, sector_t sector
271 } else { 274 } else {
272 if (!test_bit(STRIPE_HANDLE, &sh->state)) 275 if (!test_bit(STRIPE_HANDLE, &sh->state))
273 atomic_inc(&conf->active_stripes); 276 atomic_inc(&conf->active_stripes);
274 if (list_empty(&sh->lru)) 277 if (!list_empty(&sh->lru))
275 BUG(); 278 list_del_init(&sh->lru);
276 list_del_init(&sh->lru);
277 } 279 }
278 } 280 }
279 } while (sh == NULL); 281 } while (sh == NULL);
@@ -300,6 +302,7 @@ static int grow_one_stripe(raid5_conf_t *conf)
300 kmem_cache_free(conf->slab_cache, sh); 302 kmem_cache_free(conf->slab_cache, sh);
301 return 0; 303 return 0;
302 } 304 }
305 sh->disks = conf->raid_disks;
303 /* we just created an active stripe so... */ 306 /* we just created an active stripe so... */
304 atomic_set(&sh->count, 1); 307 atomic_set(&sh->count, 1);
305 atomic_inc(&conf->active_stripes); 308 atomic_inc(&conf->active_stripes);
@@ -313,14 +316,16 @@ static int grow_stripes(raid5_conf_t *conf, int num)
313 kmem_cache_t *sc; 316 kmem_cache_t *sc;
314 int devs = conf->raid_disks; 317 int devs = conf->raid_disks;
315 318
316 sprintf(conf->cache_name, "raid5/%s", mdname(conf->mddev)); 319 sprintf(conf->cache_name[0], "raid5/%s", mdname(conf->mddev));
317 320 sprintf(conf->cache_name[1], "raid5/%s-alt", mdname(conf->mddev));
318 sc = kmem_cache_create(conf->cache_name, 321 conf->active_name = 0;
322 sc = kmem_cache_create(conf->cache_name[conf->active_name],
319 sizeof(struct stripe_head)+(devs-1)*sizeof(struct r5dev), 323 sizeof(struct stripe_head)+(devs-1)*sizeof(struct r5dev),
320 0, 0, NULL, NULL); 324 0, 0, NULL, NULL);
321 if (!sc) 325 if (!sc)
322 return 1; 326 return 1;
323 conf->slab_cache = sc; 327 conf->slab_cache = sc;
328 conf->pool_size = devs;
324 while (num--) { 329 while (num--) {
325 if (!grow_one_stripe(conf)) 330 if (!grow_one_stripe(conf))
326 return 1; 331 return 1;
@@ -328,6 +333,129 @@ static int grow_stripes(raid5_conf_t *conf, int num)
328 return 0; 333 return 0;
329} 334}
330 335
336#ifdef CONFIG_MD_RAID5_RESHAPE
337static int resize_stripes(raid5_conf_t *conf, int newsize)
338{
339 /* Make all the stripes able to hold 'newsize' devices.
340 * New slots in each stripe get 'page' set to a new page.
341 *
342 * This happens in stages:
343 * 1/ create a new kmem_cache and allocate the required number of
344 * stripe_heads.
345 * 2/ gather all the old stripe_heads and transfer the pages across
346 * to the new stripe_heads. This will have the side effect of
347 * freezing the array as once all stripe_heads have been collected,
348 * no IO will be possible. Old stripe heads are freed once their
349 * pages have been transferred over, and the old kmem_cache is
350 * freed when all stripes are done.
351 * 3/ reallocate conf->disks to be suitably bigger. If this fails,
352 * we simply return a failure status - no need to clean anything up.
353 * 4/ allocate new pages for the new slots in the new stripe_heads.
354 * If this fails, we don't bother trying to shrink the
355 * stripe_heads down again, we just leave them as they are.
356 * As each stripe_head is processed the new one is released into
357 * active service.
358 *
359 * Once step2 is started, we cannot afford to wait for a write,
360 * so we use GFP_NOIO allocations.
361 */
362 struct stripe_head *osh, *nsh;
363 LIST_HEAD(newstripes);
364 struct disk_info *ndisks;
365 int err = 0;
366 kmem_cache_t *sc;
367 int i;
368
369 if (newsize <= conf->pool_size)
370 return 0; /* never bother to shrink */
371
372 /* Step 1 */
373 sc = kmem_cache_create(conf->cache_name[1-conf->active_name],
374 sizeof(struct stripe_head)+(newsize-1)*sizeof(struct r5dev),
375 0, 0, NULL, NULL);
376 if (!sc)
377 return -ENOMEM;
378
379 for (i = conf->max_nr_stripes; i; i--) {
380 nsh = kmem_cache_alloc(sc, GFP_KERNEL);
381 if (!nsh)
382 break;
383
384 memset(nsh, 0, sizeof(*nsh) + (newsize-1)*sizeof(struct r5dev));
385
386 nsh->raid_conf = conf;
387 spin_lock_init(&nsh->lock);
388
389 list_add(&nsh->lru, &newstripes);
390 }
391 if (i) {
392 /* didn't get enough, give up */
393 while (!list_empty(&newstripes)) {
394 nsh = list_entry(newstripes.next, struct stripe_head, lru);
395 list_del(&nsh->lru);
396 kmem_cache_free(sc, nsh);
397 }
398 kmem_cache_destroy(sc);
399 return -ENOMEM;
400 }
401 /* Step 2 - Must use GFP_NOIO now.
402 * OK, we have enough stripes, start collecting inactive
403 * stripes and copying them over
404 */
405 list_for_each_entry(nsh, &newstripes, lru) {
406 spin_lock_irq(&conf->device_lock);
407 wait_event_lock_irq(conf->wait_for_stripe,
408 !list_empty(&conf->inactive_list),
409 conf->device_lock,
410 unplug_slaves(conf->mddev)
411 );
412 osh = get_free_stripe(conf);
413 spin_unlock_irq(&conf->device_lock);
414 atomic_set(&nsh->count, 1);
415 for(i=0; i<conf->pool_size; i++)
416 nsh->dev[i].page = osh->dev[i].page;
417 for( ; i<newsize; i++)
418 nsh->dev[i].page = NULL;
419 kmem_cache_free(conf->slab_cache, osh);
420 }
421 kmem_cache_destroy(conf->slab_cache);
422
423 /* Step 3.
424 * At this point, we are holding all the stripes so the array
425 * is completely stalled, so now is a good time to resize
426 * conf->disks.
427 */
428 ndisks = kzalloc(newsize * sizeof(struct disk_info), GFP_NOIO);
429 if (ndisks) {
430 for (i=0; i<conf->raid_disks; i++)
431 ndisks[i] = conf->disks[i];
432 kfree(conf->disks);
433 conf->disks = ndisks;
434 } else
435 err = -ENOMEM;
436
437 /* Step 4, return new stripes to service */
438 while(!list_empty(&newstripes)) {
439 nsh = list_entry(newstripes.next, struct stripe_head, lru);
440 list_del_init(&nsh->lru);
441 for (i=conf->raid_disks; i < newsize; i++)
442 if (nsh->dev[i].page == NULL) {
443 struct page *p = alloc_page(GFP_NOIO);
444 nsh->dev[i].page = p;
445 if (!p)
446 err = -ENOMEM;
447 }
448 release_stripe(nsh);
449 }
450 /* critical section passed, GFP_NOIO no longer needed */
451
452 conf->slab_cache = sc;
453 conf->active_name = 1-conf->active_name;
454 conf->pool_size = newsize;
455 return err;
456}
457#endif
458
331static int drop_one_stripe(raid5_conf_t *conf) 459static int drop_one_stripe(raid5_conf_t *conf)
332{ 460{
333 struct stripe_head *sh; 461 struct stripe_head *sh;
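The heart of resize_stripes() is that it never copies data: it allocates bigger stripe_head shells from a second kmem_cache, steals the existing pages across, and only allocates fresh pages for the new slots. A much-reduced userspace analogue of that transfer step, with malloc standing in for the slab caches:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct shell {
	int ndevs;
	char *page[1];		/* really ndevs pointers, like r5dev[1] */
};

static struct shell *resize(struct shell *old, int newsize)
{
	struct shell *nsh;
	int i;

	nsh = calloc(1, sizeof(*nsh) + (newsize - 1) * sizeof(char *));
	if (!nsh)
		return old;	/* like -ENOMEM: keep what we had */

	for (i = 0; i < old->ndevs; i++)
		nsh->page[i] = old->page[i];	/* transfer, never copy */
	for (; i < newsize; i++)
		nsh->page[i] = calloc(1, 4096);	/* new slots get new pages */
	nsh->ndevs = newsize;
	free(old);		/* old shell goes, its payload lives on */
	return nsh;
}

int main(void)
{
	struct shell *sh = calloc(1, sizeof(*sh));

	if (!sh)
		return 1;
	sh->ndevs = 1;
	sh->page[0] = strdup("data");
	sh = resize(sh, 3);
	printf("ndevs=%d page0=%s\n", sh->ndevs, sh->page[0]);
	return 0;
}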
@@ -339,7 +467,7 @@ static int drop_one_stripe(raid5_conf_t *conf)
339 return 0; 467 return 0;
340 if (atomic_read(&sh->count)) 468 if (atomic_read(&sh->count))
341 BUG(); 469 BUG();
342 shrink_buffers(sh, conf->raid_disks); 470 shrink_buffers(sh, conf->pool_size);
343 kmem_cache_free(conf->slab_cache, sh); 471 kmem_cache_free(conf->slab_cache, sh);
344 atomic_dec(&conf->active_stripes); 472 atomic_dec(&conf->active_stripes);
345 return 1; 473 return 1;
@@ -360,7 +488,7 @@ static int raid5_end_read_request(struct bio * bi, unsigned int bytes_done,
360{ 488{
361 struct stripe_head *sh = bi->bi_private; 489 struct stripe_head *sh = bi->bi_private;
362 raid5_conf_t *conf = sh->raid_conf; 490 raid5_conf_t *conf = sh->raid_conf;
363 int disks = conf->raid_disks, i; 491 int disks = sh->disks, i;
364 int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags); 492 int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags);
365 493
366 if (bi->bi_size) 494 if (bi->bi_size)
@@ -458,7 +586,7 @@ static int raid5_end_write_request (struct bio *bi, unsigned int bytes_done,
458{ 586{
459 struct stripe_head *sh = bi->bi_private; 587 struct stripe_head *sh = bi->bi_private;
460 raid5_conf_t *conf = sh->raid_conf; 588 raid5_conf_t *conf = sh->raid_conf;
461 int disks = conf->raid_disks, i; 589 int disks = sh->disks, i;
462 unsigned long flags; 590 unsigned long flags;
463 int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags); 591 int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags);
464 592
@@ -612,7 +740,7 @@ static sector_t raid5_compute_sector(sector_t r_sector, unsigned int raid_disks,
612static sector_t compute_blocknr(struct stripe_head *sh, int i) 740static sector_t compute_blocknr(struct stripe_head *sh, int i)
613{ 741{
614 raid5_conf_t *conf = sh->raid_conf; 742 raid5_conf_t *conf = sh->raid_conf;
615 int raid_disks = conf->raid_disks, data_disks = raid_disks - 1; 743 int raid_disks = sh->disks, data_disks = raid_disks - 1;
616 sector_t new_sector = sh->sector, check; 744 sector_t new_sector = sh->sector, check;
617 int sectors_per_chunk = conf->chunk_size >> 9; 745 int sectors_per_chunk = conf->chunk_size >> 9;
618 sector_t stripe; 746 sector_t stripe;
@@ -713,8 +841,7 @@ static void copy_data(int frombio, struct bio *bio,
713 841
714static void compute_block(struct stripe_head *sh, int dd_idx) 842static void compute_block(struct stripe_head *sh, int dd_idx)
715{ 843{
716 raid5_conf_t *conf = sh->raid_conf; 844 int i, count, disks = sh->disks;
717 int i, count, disks = conf->raid_disks;
718 void *ptr[MAX_XOR_BLOCKS], *p; 845 void *ptr[MAX_XOR_BLOCKS], *p;
719 846
720 PRINTK("compute_block, stripe %llu, idx %d\n", 847 PRINTK("compute_block, stripe %llu, idx %d\n",
@@ -744,7 +871,7 @@ static void compute_block(struct stripe_head *sh, int dd_idx)
744static void compute_parity(struct stripe_head *sh, int method) 871static void compute_parity(struct stripe_head *sh, int method)
745{ 872{
746 raid5_conf_t *conf = sh->raid_conf; 873 raid5_conf_t *conf = sh->raid_conf;
747 int i, pd_idx = sh->pd_idx, disks = conf->raid_disks, count; 874 int i, pd_idx = sh->pd_idx, disks = sh->disks, count;
748 void *ptr[MAX_XOR_BLOCKS]; 875 void *ptr[MAX_XOR_BLOCKS];
749 struct bio *chosen; 876 struct bio *chosen;
750 877
@@ -910,6 +1037,20 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, in
910 return 0; 1037 return 0;
911} 1038}
912 1039
1040static void end_reshape(raid5_conf_t *conf);
1041
1042static int stripe_to_pdidx(sector_t stripe, raid5_conf_t *conf, int disks)
1043{
1044 int sectors_per_chunk = conf->chunk_size >> 9;
1045 sector_t x = stripe;
1046 int pd_idx, dd_idx;
1047 int chunk_offset = sector_div(x, sectors_per_chunk);
1048 stripe = x;
1049 raid5_compute_sector(stripe*(disks-1)*sectors_per_chunk
1050 + chunk_offset, disks, disks-1, &dd_idx, &pd_idx, conf);
1051 return pd_idx;
1052}
1053
913 1054
914/* 1055/*
915 * handle_stripe - do things to a stripe. 1056 * handle_stripe - do things to a stripe.
@@ -932,11 +1073,11 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, in
932static void handle_stripe(struct stripe_head *sh) 1073static void handle_stripe(struct stripe_head *sh)
933{ 1074{
934 raid5_conf_t *conf = sh->raid_conf; 1075 raid5_conf_t *conf = sh->raid_conf;
935 int disks = conf->raid_disks; 1076 int disks = sh->disks;
936 struct bio *return_bi= NULL; 1077 struct bio *return_bi= NULL;
937 struct bio *bi; 1078 struct bio *bi;
938 int i; 1079 int i;
939 int syncing; 1080 int syncing, expanding, expanded;
940 int locked=0, uptodate=0, to_read=0, to_write=0, failed=0, written=0; 1081 int locked=0, uptodate=0, to_read=0, to_write=0, failed=0, written=0;
941 int non_overwrite = 0; 1082 int non_overwrite = 0;
942 int failed_num=0; 1083 int failed_num=0;
@@ -951,6 +1092,8 @@ static void handle_stripe(struct stripe_head *sh)
951 clear_bit(STRIPE_DELAYED, &sh->state); 1092 clear_bit(STRIPE_DELAYED, &sh->state);
952 1093
953 syncing = test_bit(STRIPE_SYNCING, &sh->state); 1094 syncing = test_bit(STRIPE_SYNCING, &sh->state);
1095 expanding = test_bit(STRIPE_EXPAND_SOURCE, &sh->state);
1096 expanded = test_bit(STRIPE_EXPAND_READY, &sh->state);
954 /* Now to look around and see what can be done */ 1097 /* Now to look around and see what can be done */
955 1098
956 rcu_read_lock(); 1099 rcu_read_lock();
@@ -1143,13 +1286,14 @@ static void handle_stripe(struct stripe_head *sh)
1143 * parity, or to satisfy requests 1286 * parity, or to satisfy requests
1144 * or to load a block that is being partially written. 1287 * or to load a block that is being partially written.
1145 */ 1288 */
1146 if (to_read || non_overwrite || (syncing && (uptodate < disks))) { 1289 if (to_read || non_overwrite || (syncing && (uptodate < disks)) || expanding) {
1147 for (i=disks; i--;) { 1290 for (i=disks; i--;) {
1148 dev = &sh->dev[i]; 1291 dev = &sh->dev[i];
1149 if (!test_bit(R5_LOCKED, &dev->flags) && !test_bit(R5_UPTODATE, &dev->flags) && 1292 if (!test_bit(R5_LOCKED, &dev->flags) && !test_bit(R5_UPTODATE, &dev->flags) &&
1150 (dev->toread || 1293 (dev->toread ||
1151 (dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags)) || 1294 (dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags)) ||
1152 syncing || 1295 syncing ||
1296 expanding ||
1153 (failed && (sh->dev[failed_num].toread || 1297 (failed && (sh->dev[failed_num].toread ||
1154 (sh->dev[failed_num].towrite && !test_bit(R5_OVERWRITE, &sh->dev[failed_num].flags)))) 1298 (sh->dev[failed_num].towrite && !test_bit(R5_OVERWRITE, &sh->dev[failed_num].flags))))
1155 ) 1299 )
@@ -1339,13 +1483,77 @@ static void handle_stripe(struct stripe_head *sh)
1339 set_bit(R5_Wantwrite, &dev->flags); 1483 set_bit(R5_Wantwrite, &dev->flags);
1340 set_bit(R5_ReWrite, &dev->flags); 1484 set_bit(R5_ReWrite, &dev->flags);
1341 set_bit(R5_LOCKED, &dev->flags); 1485 set_bit(R5_LOCKED, &dev->flags);
1486 locked++;
1342 } else { 1487 } else {
1343 /* let's read it back */ 1488 /* let's read it back */
1344 set_bit(R5_Wantread, &dev->flags); 1489 set_bit(R5_Wantread, &dev->flags);
1345 set_bit(R5_LOCKED, &dev->flags); 1490 set_bit(R5_LOCKED, &dev->flags);
1491 locked++;
1346 } 1492 }
1347 } 1493 }
1348 1494
1495 if (expanded && test_bit(STRIPE_EXPANDING, &sh->state)) {
1496 /* Need to write out all blocks after computing parity */
1497 sh->disks = conf->raid_disks;
1498 sh->pd_idx = stripe_to_pdidx(sh->sector, conf, conf->raid_disks);
1499 compute_parity(sh, RECONSTRUCT_WRITE);
1500 for (i= conf->raid_disks; i--;) {
1501 set_bit(R5_LOCKED, &sh->dev[i].flags);
1502 locked++;
1503 set_bit(R5_Wantwrite, &sh->dev[i].flags);
1504 }
1505 clear_bit(STRIPE_EXPANDING, &sh->state);
1506 } else if (expanded) {
1507 clear_bit(STRIPE_EXPAND_READY, &sh->state);
1508 atomic_dec(&conf->reshape_stripes);
1509 wake_up(&conf->wait_for_overlap);
1510 md_done_sync(conf->mddev, STRIPE_SECTORS, 1);
1511 }
1512
1513 if (expanding && locked == 0) {
1514 /* We have read all the blocks in this stripe and now we need to
1515 * copy some of them into a target stripe for expand.
1516 */
1517 clear_bit(STRIPE_EXPAND_SOURCE, &sh->state);
1518 for (i=0; i< sh->disks; i++)
1519 if (i != sh->pd_idx) {
1520 int dd_idx, pd_idx, j;
1521 struct stripe_head *sh2;
1522
1523 sector_t bn = compute_blocknr(sh, i);
1524 sector_t s = raid5_compute_sector(bn, conf->raid_disks,
1525 conf->raid_disks-1,
1526 &dd_idx, &pd_idx, conf);
1527 sh2 = get_active_stripe(conf, s, conf->raid_disks, pd_idx, 1);
1528 if (sh2 == NULL)
1529 /* so far only the early blocks of this stripe
1530 * have been requested. When later blocks
1531 * get requested, we will try again
1532 */
1533 continue;
1534 if(!test_bit(STRIPE_EXPANDING, &sh2->state) ||
1535 test_bit(R5_Expanded, &sh2->dev[dd_idx].flags)) {
1536 /* must have already done this block */
1537 release_stripe(sh2);
1538 continue;
1539 }
1540 memcpy(page_address(sh2->dev[dd_idx].page),
1541 page_address(sh->dev[i].page),
1542 STRIPE_SIZE);
1543 set_bit(R5_Expanded, &sh2->dev[dd_idx].flags);
1544 set_bit(R5_UPTODATE, &sh2->dev[dd_idx].flags);
1545 for (j=0; j<conf->raid_disks; j++)
1546 if (j != sh2->pd_idx &&
1547 !test_bit(R5_Expanded, &sh2->dev[j].flags))
1548 break;
1549 if (j == conf->raid_disks) {
1550 set_bit(STRIPE_EXPAND_READY, &sh2->state);
1551 set_bit(STRIPE_HANDLE, &sh2->state);
1552 }
1553 release_stripe(sh2);
1554 }
1555 }
1556
1349 spin_unlock(&sh->lock); 1557 spin_unlock(&sh->lock);
1350 1558
1351 while ((bi=return_bi)) { 1559 while ((bi=return_bi)) {
@@ -1384,7 +1592,7 @@ static void handle_stripe(struct stripe_head *sh)
1384 rcu_read_unlock(); 1592 rcu_read_unlock();
1385 1593
1386 if (rdev) { 1594 if (rdev) {
1387 if (syncing) 1595 if (syncing || expanding || expanded)
1388 md_sync_acct(rdev->bdev, STRIPE_SECTORS); 1596 md_sync_acct(rdev->bdev, STRIPE_SECTORS);
1389 1597
1390 bi->bi_bdev = rdev->bdev; 1598 bi->bi_bdev = rdev->bdev;
@@ -1526,17 +1734,16 @@ static inline void raid5_plug_device(raid5_conf_t *conf)
1526 spin_unlock_irq(&conf->device_lock); 1734 spin_unlock_irq(&conf->device_lock);
1527} 1735}
1528 1736
1529static int make_request (request_queue_t *q, struct bio * bi) 1737static int make_request(request_queue_t *q, struct bio * bi)
1530{ 1738{
1531 mddev_t *mddev = q->queuedata; 1739 mddev_t *mddev = q->queuedata;
1532 raid5_conf_t *conf = mddev_to_conf(mddev); 1740 raid5_conf_t *conf = mddev_to_conf(mddev);
1533 const unsigned int raid_disks = conf->raid_disks;
1534 const unsigned int data_disks = raid_disks - 1;
1535 unsigned int dd_idx, pd_idx; 1741 unsigned int dd_idx, pd_idx;
1536 sector_t new_sector; 1742 sector_t new_sector;
1537 sector_t logical_sector, last_sector; 1743 sector_t logical_sector, last_sector;
1538 struct stripe_head *sh; 1744 struct stripe_head *sh;
1539 const int rw = bio_data_dir(bi); 1745 const int rw = bio_data_dir(bi);
1746 int remaining;
1540 1747
1541 if (unlikely(bio_barrier(bi))) { 1748 if (unlikely(bio_barrier(bi))) {
1542 bio_endio(bi, bi->bi_size, -EOPNOTSUPP); 1749 bio_endio(bi, bi->bi_size, -EOPNOTSUPP);
@@ -1555,20 +1762,77 @@ static int make_request (request_queue_t *q, struct bio * bi)
1555 1762
1556 for (;logical_sector < last_sector; logical_sector += STRIPE_SECTORS) { 1763 for (;logical_sector < last_sector; logical_sector += STRIPE_SECTORS) {
1557 DEFINE_WAIT(w); 1764 DEFINE_WAIT(w);
1558 1765 int disks;
1559 new_sector = raid5_compute_sector(logical_sector,
1560 raid_disks, data_disks, &dd_idx, &pd_idx, conf);
1561 1766
1767 retry:
1768 prepare_to_wait(&conf->wait_for_overlap, &w, TASK_UNINTERRUPTIBLE);
1769 if (likely(conf->expand_progress == MaxSector))
1770 disks = conf->raid_disks;
1771 else {
1772 /* spinlock is needed as expand_progress may be
1773 * 64bit on a 32bit platform, and so it might be
1774 * possible to see a half-updated value
1775 * Of course expand_progress could change after
1776 * the lock is dropped, so once we get a reference
1777 * to the stripe that we think it is, we will have
1778 * to check again.
1779 */
1780 spin_lock_irq(&conf->device_lock);
1781 disks = conf->raid_disks;
1782 if (logical_sector >= conf->expand_progress)
1783 disks = conf->previous_raid_disks;
1784 else {
1785 if (logical_sector >= conf->expand_lo) {
1786 spin_unlock_irq(&conf->device_lock);
1787 schedule();
1788 goto retry;
1789 }
1790 }
1791 spin_unlock_irq(&conf->device_lock);
1792 }
1793 new_sector = raid5_compute_sector(logical_sector, disks, disks - 1,
1794 &dd_idx, &pd_idx, conf);
1562 PRINTK("raid5: make_request, sector %llu logical %llu\n", 1795 PRINTK("raid5: make_request, sector %llu logical %llu\n",
1563 (unsigned long long)new_sector, 1796 (unsigned long long)new_sector,
1564 (unsigned long long)logical_sector); 1797 (unsigned long long)logical_sector);
1565 1798
1566 retry: 1799 sh = get_active_stripe(conf, new_sector, disks, pd_idx, (bi->bi_rw&RWA_MASK));
1567 prepare_to_wait(&conf->wait_for_overlap, &w, TASK_UNINTERRUPTIBLE);
1568 sh = get_active_stripe(conf, new_sector, pd_idx, (bi->bi_rw&RWA_MASK));
1569 if (sh) { 1800 if (sh) {
1570 if (!add_stripe_bio(sh, bi, dd_idx, (bi->bi_rw&RW_MASK))) { 1801 if (unlikely(conf->expand_progress != MaxSector)) {
1571 /* Add failed due to overlap. Flush everything 1802 /* expansion might have moved on while waiting for a
1803 * stripe, so we must do the range check again.
1804 * Expansion could still move past after this
1805 * test, but as we are holding a reference to
1806 * 'sh', we know that if that happens,
1807 * STRIPE_EXPANDING will get set and the expansion
1808 * won't proceed until we finish with the stripe.
1809 */
1810 int must_retry = 0;
1811 spin_lock_irq(&conf->device_lock);
1812 if (logical_sector < conf->expand_progress &&
1813 disks == conf->previous_raid_disks)
1814 /* mismatch, need to try again */
1815 must_retry = 1;
1816 spin_unlock_irq(&conf->device_lock);
1817 if (must_retry) {
1818 release_stripe(sh);
1819 goto retry;
1820 }
1821 }
1822 /* FIXME what if we get a false positive because these
1823 * are being updated.
1824 */
1825 if (logical_sector >= mddev->suspend_lo &&
1826 logical_sector < mddev->suspend_hi) {
1827 release_stripe(sh);
1828 schedule();
1829 goto retry;
1830 }
1831
1832 if (test_bit(STRIPE_EXPANDING, &sh->state) ||
1833 !add_stripe_bio(sh, bi, dd_idx, (bi->bi_rw&RW_MASK))) {
1834 /* Stripe is busy expanding or
1835 * add failed due to overlap. Flush everything
1572 * and wait a while 1836 * and wait a while
1573 */ 1837 */
1574 raid5_unplug_device(mddev->queue); 1838 raid5_unplug_device(mddev->queue);
@@ -1580,7 +1844,6 @@ static int make_request (request_queue_t *q, struct bio * bi)
 			raid5_plug_device(conf);
 			handle_stripe(sh);
 			release_stripe(sh);
-
 		} else {
 			/* cannot get stripe for read-ahead, just give-up */
 			clear_bit(BIO_UPTODATE, &bi->bi_flags);
@@ -1590,7 +1853,9 @@ static int make_request (request_queue_t *q, struct bio * bi)
 
 	}
 	spin_lock_irq(&conf->device_lock);
-	if (--bi->bi_phys_segments == 0) {
+	remaining = --bi->bi_phys_segments;
+	spin_unlock_irq(&conf->device_lock);
+	if (remaining == 0) {
 		int bytes = bi->bi_size;
 
 		if ( bio_data_dir(bi) == WRITE )
@@ -1598,7 +1863,6 @@ static int make_request (request_queue_t *q, struct bio * bi)
 		bi->bi_size = 0;
 		bi->bi_end_io(bi, bytes, 0);
 	}
-	spin_unlock_irq(&conf->device_lock);
 	return 0;
 }
 
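One detail from the hunks above is worth spelling out: conf->expand_progress is a sector_t, which can be a 64-bit quantity on a 32-bit host, so an unlocked reader could observe a half-written value, exactly as the new comment warns. The retry path therefore samples it under device_lock. A minimal user-space sketch of the same pattern, with a pthread mutex standing in for the kernel spinlock and all names invented:

#include <pthread.h>
#include <stdio.h>

typedef unsigned long long sector64_t;	/* what sector_t may be on 32-bit */

struct reshape_state {
	pthread_mutex_t lock;		/* plays the role of device_lock */
	sector64_t expand_progress;
};

/* writer: both 32-bit halves are updated while holding the lock */
static void advance(struct reshape_state *st, sector64_t delta)
{
	pthread_mutex_lock(&st->lock);
	st->expand_progress += delta;
	pthread_mutex_unlock(&st->lock);
}

/* reader: take a consistent snapshot instead of a possibly torn load */
static sector64_t snapshot(struct reshape_state *st)
{
	sector64_t v;

	pthread_mutex_lock(&st->lock);
	v = st->expand_progress;
	pthread_mutex_unlock(&st->lock);
	return v;
}

int main(void)
{
	struct reshape_state st = { PTHREAD_MUTEX_INITIALIZER, 0 };

	advance(&st, 1024);
	printf("progress: %llu sectors\n", snapshot(&st));
	return 0;
}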
@@ -1607,12 +1871,8 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
 {
 	raid5_conf_t *conf = (raid5_conf_t *) mddev->private;
 	struct stripe_head *sh;
-	int sectors_per_chunk = conf->chunk_size >> 9;
-	sector_t x;
-	unsigned long stripe;
-	int chunk_offset;
-	int dd_idx, pd_idx;
-	sector_t first_sector;
+	int pd_idx;
+	sector_t first_sector, last_sector;
 	int raid_disks = conf->raid_disks;
 	int data_disks = raid_disks-1;
 	sector_t max_sector = mddev->size << 1;
@@ -1621,6 +1881,10 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
 	if (sector_nr >= max_sector) {
 		/* just being told to finish up .. nothing much to do */
 		unplug_slaves(mddev);
+		if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) {
+			end_reshape(conf);
+			return 0;
+		}
 
 		if (mddev->curr_resync < max_sector) /* aborted */
 			bitmap_end_sync(mddev->bitmap, mddev->curr_resync,
@@ -1631,6 +1895,123 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
 
 		return 0;
 	}
+
+	if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) {
+		/* reshaping is quite different to recovery/resync so it is
+		 * handled quite separately ... here.
+		 *
+		 * On each call to sync_request, we gather one chunk worth of
+		 * destination stripes and flag them as expanding.
+		 * Then we find all the source stripes and request reads.
+		 * As the reads complete, handle_stripe will copy the data
+		 * into the destination stripe and release that stripe.
+		 */
+		int i;
+		int dd_idx;
+		sector_t writepos, safepos, gap;
+
+		if (sector_nr == 0 &&
+		    conf->expand_progress != 0) {
+			/* restarting in the middle, skip the initial sectors */
+			sector_nr = conf->expand_progress;
+			sector_div(sector_nr, conf->raid_disks-1);
+			*skipped = 1;
+			return sector_nr;
+		}
+
+		/* we update the metadata when there is more than 3Meg
+		 * in the block range (that is rather arbitrary, should
+		 * probably be time based) or when the data about to be
+		 * copied would over-write the source of the data at
+		 * the front of the range.
+		 * i.e. the stripe one new-stripe forward of expand_progress,
+		 * mapped with the new geometry, must land after the point
+		 * where expand_lo maps in the old geometry.
+		 */
+		writepos = conf->expand_progress +
+			conf->chunk_size/512*(conf->raid_disks-1);
+		sector_div(writepos, conf->raid_disks-1);
+		safepos = conf->expand_lo;
+		sector_div(safepos, conf->previous_raid_disks-1);
+		gap = conf->expand_progress - conf->expand_lo;
+
+		if (writepos >= safepos ||
+		    gap > (conf->raid_disks-1)*3000*2 /*3Meg*/) {
+			/* Cannot proceed until we've updated the superblock... */
+			wait_event(conf->wait_for_overlap,
+				   atomic_read(&conf->reshape_stripes)==0);
+			mddev->reshape_position = conf->expand_progress;
+			mddev->sb_dirty = 1;
+			md_wakeup_thread(mddev->thread);
+			wait_event(mddev->sb_wait, mddev->sb_dirty == 0 ||
+				   kthread_should_stop());
+			spin_lock_irq(&conf->device_lock);
+			conf->expand_lo = mddev->reshape_position;
+			spin_unlock_irq(&conf->device_lock);
+			wake_up(&conf->wait_for_overlap);
+		}
+
+		for (i=0; i < conf->chunk_size/512; i+= STRIPE_SECTORS) {
+			int j;
+			int skipped = 0;
+			pd_idx = stripe_to_pdidx(sector_nr+i, conf, conf->raid_disks);
+			sh = get_active_stripe(conf, sector_nr+i,
+					       conf->raid_disks, pd_idx, 0);
+			set_bit(STRIPE_EXPANDING, &sh->state);
+			atomic_inc(&conf->reshape_stripes);
+			/* If any of this stripe is beyond the end of the old
+			 * array, then we need to zero those blocks
+			 */
+			for (j=sh->disks; j--;) {
+				sector_t s;
+				if (j == sh->pd_idx)
+					continue;
+				s = compute_blocknr(sh, j);
+				if (s < (mddev->array_size<<1)) {
+					skipped = 1;
+					continue;
+				}
+				memset(page_address(sh->dev[j].page), 0, STRIPE_SIZE);
+				set_bit(R5_Expanded, &sh->dev[j].flags);
+				set_bit(R5_UPTODATE, &sh->dev[j].flags);
+			}
+			if (!skipped) {
+				set_bit(STRIPE_EXPAND_READY, &sh->state);
+				set_bit(STRIPE_HANDLE, &sh->state);
+			}
+			release_stripe(sh);
+		}
+		spin_lock_irq(&conf->device_lock);
+		conf->expand_progress = (sector_nr + i)*(conf->raid_disks-1);
+		spin_unlock_irq(&conf->device_lock);
+		/* Ok, those stripes are ready.  We can start scheduling
+		 * reads on the source stripes.
+		 * The source stripes are determined by mapping the first and last
+		 * block on the destination stripes.
+		 */
+		raid_disks = conf->previous_raid_disks;
+		data_disks = raid_disks - 1;
+		first_sector =
+			raid5_compute_sector(sector_nr*(conf->raid_disks-1),
+					     raid_disks, data_disks,
+					     &dd_idx, &pd_idx, conf);
+		last_sector =
+			raid5_compute_sector((sector_nr+conf->chunk_size/512)
+					     *(conf->raid_disks-1) -1,
+					     raid_disks, data_disks,
+					     &dd_idx, &pd_idx, conf);
+		if (last_sector >= (mddev->size<<1))
+			last_sector = (mddev->size<<1)-1;
+		while (first_sector <= last_sector) {
+			pd_idx = stripe_to_pdidx(first_sector, conf, conf->previous_raid_disks);
+			sh = get_active_stripe(conf, first_sector,
+					       conf->previous_raid_disks, pd_idx, 0);
+			set_bit(STRIPE_EXPAND_SOURCE, &sh->state);
+			set_bit(STRIPE_HANDLE, &sh->state);
+			release_stripe(sh);
+			first_sector += STRIPE_SECTORS;
+		}
+		return conf->chunk_size>>9;
+	}
 	/* if there is 1 or more failed drives and we are trying
 	 * to resync, then assert that we are finished, because there is
 	 * nothing we can do.
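The writepos/safepos test above is easier to see with numbers. Dividing an array sector count by the number of data disks converts it to a per-disk offset: writepos is where the chunk about to be written lands in the new layout, safepos is the oldest data not yet copied in the old layout, and the superblock checkpoint must be written before the former overtakes the latter. A standalone sketch with invented sample values (5 -> 6 disks, 64K chunks); plain division replaces the in-place sector_div() of the patch:

#include <stdio.h>

int main(void)
{
	unsigned long long chunk_sectors = 128;		/* 64K chunk */
	int raid_disks = 6, previous_raid_disks = 5;
	unsigned long long expand_progress = 5120;	/* array sectors copied */
	unsigned long long expand_lo = 4096;		/* checkpoint in superblock */

	unsigned long long writepos = (expand_progress +
		chunk_sectors * (raid_disks - 1)) / (raid_disks - 1);
	unsigned long long safepos = expand_lo / (previous_raid_disks - 1);
	unsigned long long gap = expand_progress - expand_lo;

	printf("writepos=%llu safepos=%llu gap=%llu\n", writepos, safepos, gap);
	if (writepos >= safepos ||
	    gap > (unsigned long long)(raid_disks - 1) * 3000 * 2)
		printf("must checkpoint the superblock before writing\n");
	else
		printf("safe to keep copying\n");
	return 0;
}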
@@ -1649,16 +2030,10 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
 		return sync_blocks * STRIPE_SECTORS; /* keep things rounded to whole stripes */
 	}
 
-	x = sector_nr;
-	chunk_offset = sector_div(x, sectors_per_chunk);
-	stripe = x;
-	BUG_ON(x != stripe);
-
-	first_sector = raid5_compute_sector((sector_t)stripe*data_disks*sectors_per_chunk
-		+ chunk_offset, raid_disks, data_disks, &dd_idx, &pd_idx, conf);
-	sh = get_active_stripe(conf, sector_nr, pd_idx, 1);
+	pd_idx = stripe_to_pdidx(sector_nr, conf, raid_disks);
+	sh = get_active_stripe(conf, sector_nr, raid_disks, pd_idx, 1);
 	if (sh == NULL) {
-		sh = get_active_stripe(conf, sector_nr, pd_idx, 0);
+		sh = get_active_stripe(conf, sector_nr, raid_disks, pd_idx, 0);
 		/* make sure we don't swamp the stripe cache if someone else
 		 * is trying to get access
 		 */
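stripe_to_pdidx(), used above and in the reshape loop, is a helper added earlier in this patch; it exists so a caller can ask "which disk holds parity for the stripe containing this sector" in either the old or the new geometry just by passing the disk count. A standalone approximation of the idea, assuming a simple one-step-per-stripe parity rotation rather than raid5's configurable layouts; the real helper defers to raid5_compute_sector():

#include <stdio.h>

static int stripe_to_pdidx_sketch(unsigned long long sector, int disks,
				  int sectors_per_chunk)
{
	unsigned long long chunk = sector / sectors_per_chunk;
	unsigned long long stripe = chunk / (disks - 1);

	return (int)(stripe % disks);	/* parity rotates one disk per stripe */
}

int main(void)
{
	unsigned long long s;

	for (s = 0; s < 1024; s += 256)
		printf("sector %4llu -> pd_idx %d\n", s,
		       stripe_to_pdidx_sketch(s, 5, 128));
	return 0;
}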
@@ -1822,11 +2197,64 @@ static int run(mddev_t *mddev)
 		return -EIO;
 	}
 
-	mddev->private = kzalloc(sizeof (raid5_conf_t)
-				 + mddev->raid_disks * sizeof(struct disk_info),
-				 GFP_KERNEL);
+	if (mddev->reshape_position != MaxSector) {
+		/* Check that we can continue the reshape.
+		 * Currently only the number of disks can change, it must
+		 * increase, and we must be past the point where
+		 * a stripe over-writes itself
+		 */
+		sector_t here_new, here_old;
+		int old_disks;
+
+		if (mddev->new_level != mddev->level ||
+		    mddev->new_layout != mddev->layout ||
+		    mddev->new_chunk != mddev->chunk_size) {
+			printk(KERN_ERR "raid5: %s: unsupported reshape required - aborting.\n",
+			       mdname(mddev));
+			return -EINVAL;
+		}
+		if (mddev->delta_disks <= 0) {
+			printk(KERN_ERR "raid5: %s: unsupported reshape (reduce disks) required - aborting.\n",
+			       mdname(mddev));
+			return -EINVAL;
+		}
+		old_disks = mddev->raid_disks - mddev->delta_disks;
+		/* reshape_position must be on a new-stripe boundary, and one
+		 * further up in new geometry must map after here in old geometry.
+		 */
+		here_new = mddev->reshape_position;
+		if (sector_div(here_new, (mddev->chunk_size>>9)*(mddev->raid_disks-1))) {
+			printk(KERN_ERR "raid5: reshape_position not on a stripe boundary\n");
+			return -EINVAL;
+		}
+		/* here_new is the stripe we will write to */
+		here_old = mddev->reshape_position;
+		sector_div(here_old, (mddev->chunk_size>>9)*(old_disks-1));
+		/* here_old is the first stripe that we might need to read from */
+		if (here_new >= here_old) {
+			/* Reading from the same stripe as writing to - bad */
+			printk(KERN_ERR "raid5: reshape_position too early for auto-recovery - aborting.\n");
+			return -EINVAL;
+		}
+		printk(KERN_INFO "raid5: reshape will continue\n");
+		/* OK, we should be able to continue; */
+	}
+
+
+	mddev->private = kzalloc(sizeof (raid5_conf_t), GFP_KERNEL);
 	if ((conf = mddev->private) == NULL)
 		goto abort;
+	if (mddev->reshape_position == MaxSector) {
+		conf->previous_raid_disks = conf->raid_disks = mddev->raid_disks;
+	} else {
+		conf->raid_disks = mddev->raid_disks;
+		conf->previous_raid_disks = mddev->raid_disks - mddev->delta_disks;
+	}
+
+	conf->disks = kzalloc(conf->raid_disks * sizeof(struct disk_info),
+			      GFP_KERNEL);
+	if (!conf->disks)
+		goto abort;
 
 	conf->mddev = mddev;
 
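The here_new/here_old test above guards restart after a crash: the stripe that will be written next (new geometry) must still lie strictly before the first stripe that might be read (old geometry), otherwise the copy would overwrite its own source. Worked through with invented numbers for a 4 -> 5 disk reshape with 64K (128-sector) chunks, so new stripes span 512 sectors and old stripes 384:

#include <stdio.h>

int main(void)
{
	unsigned long long reshape_position = 39936;	/* sample value, sectors */
	int chunk_sectors = 128, new_disks = 5, old_disks = 4;
	unsigned long long new_stripe = chunk_sectors * (new_disks - 1);	/* 512 */
	unsigned long long old_stripe = chunk_sectors * (old_disks - 1);	/* 384 */
	unsigned long long here_new, here_old;

	if (reshape_position % new_stripe) {
		printf("not on a new-stripe boundary - cannot restart\n");
		return 1;
	}
	here_new = reshape_position / new_stripe;	/* stripe we will write */
	here_old = reshape_position / old_stripe;	/* first stripe we may read */
	printf("here_new=%llu here_old=%llu\n", here_new, here_old);
	puts(here_new >= here_old ? "overlap - refuse to continue"
				  : "safe - reshape will continue");
	return 0;
}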
@@ -1847,7 +2275,7 @@ static int run(mddev_t *mddev)
 
 	ITERATE_RDEV(mddev,rdev,tmp) {
 		raid_disk = rdev->raid_disk;
-		if (raid_disk >= mddev->raid_disks
+		if (raid_disk >= conf->raid_disks
 		    || raid_disk < 0)
 			continue;
 		disk = conf->disks + raid_disk;
@@ -1863,7 +2291,6 @@ static int run(mddev_t *mddev)
 		}
 	}
 
-	conf->raid_disks = mddev->raid_disks;
 	/*
 	 * 0 for a fully functional array, 1 for a degraded array.
 	 */
@@ -1873,6 +2300,7 @@ static int run(mddev_t *mddev)
 	conf->level = mddev->level;
 	conf->algorithm = mddev->layout;
 	conf->max_nr_stripes = NR_STRIPES;
+	conf->expand_progress = mddev->reshape_position;
 
 	/* device size must be a multiple of chunk size */
 	mddev->size &= ~(mddev->chunk_size/1024 -1);
@@ -1945,6 +2373,21 @@ static int run(mddev_t *mddev)
 
 	print_raid5_conf(conf);
 
+	if (conf->expand_progress != MaxSector) {
+		printk("...ok start reshape thread\n");
+		conf->expand_lo = conf->expand_progress;
+		atomic_set(&conf->reshape_stripes, 0);
+		clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
+		clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
+		set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
+		set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
+		mddev->sync_thread = md_register_thread(md_do_sync, mddev,
+							"%s_reshape");
+		/* FIXME if md_register_thread fails?? */
+		md_wakeup_thread(mddev->sync_thread);
+
+	}
+
 	/* read-ahead size must cover two whole stripes, which is
 	 * 2 * (n-1) * chunksize where 'n' is the number of raid devices
 	 */
@@ -1960,12 +2403,13 @@ static int run(mddev_t *mddev)
 
 	mddev->queue->unplug_fn = raid5_unplug_device;
 	mddev->queue->issue_flush_fn = raid5_issue_flush;
+	mddev->array_size = mddev->size * (conf->previous_raid_disks - 1);
 
-	mddev->array_size = mddev->size * (mddev->raid_disks - 1);
 	return 0;
 abort:
 	if (conf) {
 		print_raid5_conf(conf);
+		kfree(conf->disks);
 		kfree(conf->stripe_hashtbl);
 		kfree(conf);
 	}
@@ -1986,6 +2430,7 @@ static int stop(mddev_t *mddev)
 	kfree(conf->stripe_hashtbl);
 	blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/
 	sysfs_remove_group(&mddev->kobj, &raid5_attrs_group);
+	kfree(conf->disks);
 	kfree(conf);
 	mddev->private = NULL;
 	return 0;
@@ -2001,7 +2446,7 @@ static void print_sh (struct stripe_head *sh)
 	printk("sh %llu, count %d.\n",
 		(unsigned long long)sh->sector, atomic_read(&sh->count));
 	printk("sh %llu, ", (unsigned long long)sh->sector);
-	for (i = 0; i < sh->raid_conf->raid_disks; i++) {
+	for (i = 0; i < sh->disks; i++) {
 		printk("(cache%d: %p %ld) ",
 			i, sh->dev[i].page, sh->dev[i].flags);
 	}
@@ -2132,7 +2577,7 @@ static int raid5_add_disk(mddev_t *mddev, mdk_rdev_t *rdev)
 	/*
 	 * find the disk ...
 	 */
-	for (disk=0; disk < mddev->raid_disks; disk++)
+	for (disk=0; disk < conf->raid_disks; disk++)
 		if ((p=conf->disks + disk)->rdev == NULL) {
 			clear_bit(In_sync, &rdev->flags);
 			rdev->raid_disk = disk;
@@ -2168,11 +2613,146 @@ static int raid5_resize(mddev_t *mddev, sector_t sectors)
 	return 0;
 }
 
+#ifdef CONFIG_MD_RAID5_RESHAPE
+static int raid5_check_reshape(mddev_t *mddev)
+{
+	raid5_conf_t *conf = mddev_to_conf(mddev);
+	int err;
+
+	if (mddev->delta_disks < 0 ||
+	    mddev->new_level != mddev->level)
+		return -EINVAL; /* Cannot shrink array or change level yet */
+	if (mddev->delta_disks == 0)
+		return 0; /* nothing to do */
+
+	/* Can only proceed if there are plenty of stripe_heads.
+	 * We need a minimum of one full stripe, and for sensible progress
+	 * it is best to have about 4 times that.
+	 * If we require 4 times, then the default 256 4K stripe_heads will
+	 * allow for chunk sizes up to 256K, which is probably OK.
+	 * If the chunk size is greater, user-space should request more
+	 * stripe_heads first.
+	 */
+	if ((mddev->chunk_size / STRIPE_SIZE) * 4 > conf->max_nr_stripes ||
+	    (mddev->new_chunk / STRIPE_SIZE) * 4 > conf->max_nr_stripes) {
+		printk(KERN_WARNING "raid5: reshape: not enough stripes.  Needed %lu\n",
+		       (mddev->chunk_size / STRIPE_SIZE)*4);
+		return -ENOSPC;
+	}
+
+	err = resize_stripes(conf, conf->raid_disks + mddev->delta_disks);
+	if (err)
+		return err;
+
+	/* looks like we might be able to manage this */
+	return 0;
+}
+
+static int raid5_start_reshape(mddev_t *mddev)
+{
+	raid5_conf_t *conf = mddev_to_conf(mddev);
+	mdk_rdev_t *rdev;
+	struct list_head *rtmp;
+	int spares = 0;
+	int added_devices = 0;
+
+	if (mddev->degraded ||
+	    test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
+		return -EBUSY;
+
+	ITERATE_RDEV(mddev, rdev, rtmp)
+		if (rdev->raid_disk < 0 &&
+		    !test_bit(Faulty, &rdev->flags))
+			spares++;
+
+	if (spares < mddev->delta_disks-1)
+		/* Not enough devices even to make a degraded array
+		 * of that size
+		 */
+		return -EINVAL;
+
+	atomic_set(&conf->reshape_stripes, 0);
+	spin_lock_irq(&conf->device_lock);
+	conf->previous_raid_disks = conf->raid_disks;
+	conf->raid_disks += mddev->delta_disks;
+	conf->expand_progress = 0;
+	conf->expand_lo = 0;
+	spin_unlock_irq(&conf->device_lock);
+
+	/* Add some new drives, as many as will fit.
+	 * We know there are enough to make the newly sized array work.
+	 */
+	ITERATE_RDEV(mddev, rdev, rtmp)
+		if (rdev->raid_disk < 0 &&
+		    !test_bit(Faulty, &rdev->flags)) {
+			if (raid5_add_disk(mddev, rdev)) {
+				char nm[20];
+				set_bit(In_sync, &rdev->flags);
+				conf->working_disks++;
+				added_devices++;
+				sprintf(nm, "rd%d", rdev->raid_disk);
+				sysfs_create_link(&mddev->kobj, &rdev->kobj, nm);
+			} else
+				break;
+		}
+
+	mddev->degraded = (conf->raid_disks - conf->previous_raid_disks) - added_devices;
+	mddev->raid_disks = conf->raid_disks;
+	mddev->reshape_position = 0;
+	mddev->sb_dirty = 1;
+
+	clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
+	clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
+	set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
+	set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
+	mddev->sync_thread = md_register_thread(md_do_sync, mddev,
+						"%s_reshape");
+	if (!mddev->sync_thread) {
+		mddev->recovery = 0;
+		spin_lock_irq(&conf->device_lock);
+		mddev->raid_disks = conf->raid_disks = conf->previous_raid_disks;
+		conf->expand_progress = MaxSector;
+		spin_unlock_irq(&conf->device_lock);
+		return -EAGAIN;
+	}
+	md_wakeup_thread(mddev->sync_thread);
+	md_new_event(mddev);
+	return 0;
+}
+#endif
+
+static void end_reshape(raid5_conf_t *conf)
+{
+	struct block_device *bdev;
+
+	if (!test_bit(MD_RECOVERY_INTR, &conf->mddev->recovery)) {
+		conf->mddev->array_size = conf->mddev->size * (conf->raid_disks-1);
+		set_capacity(conf->mddev->gendisk, conf->mddev->array_size << 1);
+		conf->mddev->changed = 1;
+
+		bdev = bdget_disk(conf->mddev->gendisk, 0);
+		if (bdev) {
+			mutex_lock(&bdev->bd_inode->i_mutex);
+			i_size_write(bdev->bd_inode, conf->mddev->array_size << 10);
+			mutex_unlock(&bdev->bd_inode->i_mutex);
+			bdput(bdev);
+		}
+		spin_lock_irq(&conf->device_lock);
+		conf->expand_progress = MaxSector;
+		spin_unlock_irq(&conf->device_lock);
+		conf->mddev->reshape_position = MaxSector;
+	}
+}
+
 static void raid5_quiesce(mddev_t *mddev, int state)
 {
 	raid5_conf_t *conf = mddev_to_conf(mddev);
 
 	switch(state) {
+	case 2: /* resume for a suspend */
+		wake_up(&conf->wait_for_overlap);
+		break;
+
 	case 1: /* stop all writes */
 		spin_lock_irq(&conf->device_lock);
 		conf->quiesce = 1;
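The stripe-cache check in raid5_check_reshape() above is pure arithmetic: a reshape touches chunk_size/STRIPE_SIZE stripe_heads per chunk, and the code insists on four times that for reasonable progress, which is exactly why the comment says the default 256 stripe_heads cover chunks up to 256K. The same arithmetic as a standalone table (4K STRIPE_SIZE and the default NR_STRIPES assumed):

#include <stdio.h>

int main(void)
{
	int stripe_size = 4096;		/* STRIPE_SIZE */
	int max_nr_stripes = 256;	/* default NR_STRIPES */
	int chunk;

	for (chunk = 64 * 1024; chunk <= 1024 * 1024; chunk *= 2) {
		int needed = (chunk / stripe_size) * 4;
		printf("chunk %4dK: need %4d stripe_heads -> %s\n",
		       chunk / 1024, needed,
		       needed > max_nr_stripes ? "grow cache first" : "ok");
	}
	return 0;
}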
@@ -2186,6 +2766,7 @@ static void raid5_quiesce(mddev_t *mddev, int state)
 		spin_lock_irq(&conf->device_lock);
 		conf->quiesce = 0;
 		wake_up(&conf->wait_for_stripe);
+		wake_up(&conf->wait_for_overlap);
 		spin_unlock_irq(&conf->device_lock);
 		break;
 	}
@@ -2206,6 +2787,10 @@ static struct mdk_personality raid5_personality =
 	.spare_active	= raid5_spare_active,
 	.sync_request	= sync_request,
 	.resize		= raid5_resize,
+#ifdef CONFIG_MD_RAID5_RESHAPE
+	.check_reshape	= raid5_check_reshape,
+	.start_reshape	= raid5_start_reshape,
+#endif
 	.quiesce	= raid5_quiesce,
 };
 
diff --git a/drivers/md/raid6main.c b/drivers/md/raid6main.c
index cd477ebf2ee4..6df4930fddec 100644
--- a/drivers/md/raid6main.c
+++ b/drivers/md/raid6main.c
@@ -331,9 +331,9 @@ static int grow_stripes(raid6_conf_t *conf, int num)
 	kmem_cache_t *sc;
 	int devs = conf->raid_disks;
 
-	sprintf(conf->cache_name, "raid6/%s", mdname(conf->mddev));
+	sprintf(conf->cache_name[0], "raid6/%s", mdname(conf->mddev));
 
-	sc = kmem_cache_create(conf->cache_name,
+	sc = kmem_cache_create(conf->cache_name[0],
 		   sizeof(struct stripe_head)+(devs-1)*sizeof(struct r5dev),
 		   0, 0, NULL, NULL);
 	if (!sc)
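The new [0] index is fallout from a raid5.h change elsewhere in this merge: cache_name appears to become a two-slot array so that resize_stripes() can create a replacement kmem cache under a fresh name while stripes are still migrating out of the old one; raid6 shares the structure but does not resize, so it only ever uses slot 0. A rough user-space sketch of that double-buffered naming, with invented names and field layout:

#include <stdio.h>

struct conf_sketch {
	char cache_name[2][20];	/* two names so old and new caches coexist */
	int active_name;	/* slot the live cache was created from */
};

int main(void)
{
	struct conf_sketch conf = { .active_name = 0 };

	/* at array start: the cache grow_stripes() creates */
	snprintf(conf.cache_name[0], sizeof(conf.cache_name[0]),
		 "raid6/%s", "md1");
	/* during a resize: build the replacement in the idle slot */
	snprintf(conf.cache_name[1], sizeof(conf.cache_name[1]),
		 "raid6/%s-alt", "md1");
	conf.active_name = 1;	/* switch once the new cache is populated */
	printf("active cache: %s\n", conf.cache_name[conf.active_name]);
	return 0;
}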
@@ -2006,11 +2006,14 @@ static int run(mddev_t *mddev)
 		return -EIO;
 	}
 
-	mddev->private = kzalloc(sizeof (raid6_conf_t)
-				 + mddev->raid_disks * sizeof(struct disk_info),
-				 GFP_KERNEL);
+	mddev->private = kzalloc(sizeof (raid6_conf_t), GFP_KERNEL);
 	if ((conf = mddev->private) == NULL)
 		goto abort;
+	conf->disks = kzalloc(mddev->raid_disks * sizeof(struct disk_info),
+			      GFP_KERNEL);
+	if (!conf->disks)
+		goto abort;
+
 	conf->mddev = mddev;
 
 	if ((conf->stripe_hashtbl = kzalloc(PAGE_SIZE, GFP_KERNEL)) == NULL)
@@ -2158,6 +2161,7 @@ abort:
 		print_raid6_conf(conf);
 		safe_put_page(conf->spare_page);
 		kfree(conf->stripe_hashtbl);
+		kfree(conf->disks);
 		kfree(conf);
 	}
 	mddev->private = NULL;