aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorNeilBrown <neilb@cse.unsw.edu.au>2005-09-09 19:23:45 -0400
committerLinus Torvalds <torvalds@g5.osdl.org>2005-09-09 19:39:10 -0400
commit8ddf9efe6708f3674f0ddfeb6425fd27bea109a2 (patch)
tree90862c8fa9f04cf98423b3da1b2c5d1f01a7310d
parent36fa30636fb84b209210299684e1be66d9e58217 (diff)
[PATCH] md: support write-mostly device in raid1
This allows a device in a raid1 array to be marked as "write-mostly". Read requests will only be sent to such a device if there is no other option. Signed-off-by: Neil Brown <neilb@cse.unsw.edu.au> Signed-off-by: Andrew Morton <akpm@osdl.org> Signed-off-by: Linus Torvalds <torvalds@osdl.org>
-rw-r--r--drivers/md/md.c18
-rw-r--r--drivers/md/raid1.c76
-rw-r--r--include/linux/raid/md_k.h3
-rw-r--r--include/linux/raid/md_p.h11
4 files changed, 82 insertions, 26 deletions
diff --git a/drivers/md/md.c b/drivers/md/md.c
index ae654466dc23..f1ac356e656d 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -670,6 +670,7 @@ static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev)
670 670
671 if (mddev->level != LEVEL_MULTIPATH) { 671 if (mddev->level != LEVEL_MULTIPATH) {
672 rdev->faulty = 0; 672 rdev->faulty = 0;
673 rdev->flags = 0;
673 desc = sb->disks + rdev->desc_nr; 674 desc = sb->disks + rdev->desc_nr;
674 675
675 if (desc->state & (1<<MD_DISK_FAULTY)) 676 if (desc->state & (1<<MD_DISK_FAULTY))
@@ -679,6 +680,8 @@ static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev)
679 rdev->in_sync = 1; 680 rdev->in_sync = 1;
680 rdev->raid_disk = desc->raid_disk; 681 rdev->raid_disk = desc->raid_disk;
681 } 682 }
683 if (desc->state & (1<<MD_DISK_WRITEMOSTLY))
684 set_bit(WriteMostly, &rdev->flags);
682 } else /* MULTIPATH are always insync */ 685 } else /* MULTIPATH are always insync */
683 rdev->in_sync = 1; 686 rdev->in_sync = 1;
684 return 0; 687 return 0;
@@ -777,6 +780,8 @@ static void super_90_sync(mddev_t *mddev, mdk_rdev_t *rdev)
777 spare++; 780 spare++;
778 working++; 781 working++;
779 } 782 }
783 if (test_bit(WriteMostly, &rdev2->flags))
784 d->state |= (1<<MD_DISK_WRITEMOSTLY);
780 } 785 }
781 786
782 /* now set the "removed" and "faulty" bits on any missing devices */ 787 /* now set the "removed" and "faulty" bits on any missing devices */
@@ -990,6 +995,9 @@ static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev)
990 rdev->raid_disk = role; 995 rdev->raid_disk = role;
991 break; 996 break;
992 } 997 }
998 rdev->flags = 0;
999 if (sb->devflags & WriteMostly1)
1000 set_bit(WriteMostly, &rdev->flags);
993 } else /* MULTIPATH are always insync */ 1001 } else /* MULTIPATH are always insync */
994 rdev->in_sync = 1; 1002 rdev->in_sync = 1;
995 1003
@@ -2152,6 +2160,8 @@ static int get_disk_info(mddev_t * mddev, void __user * arg)
2152 info.state |= (1<<MD_DISK_ACTIVE); 2160 info.state |= (1<<MD_DISK_ACTIVE);
2153 info.state |= (1<<MD_DISK_SYNC); 2161 info.state |= (1<<MD_DISK_SYNC);
2154 } 2162 }
2163 if (test_bit(WriteMostly, &rdev->flags))
2164 info.state |= (1<<MD_DISK_WRITEMOSTLY);
2155 } else { 2165 } else {
2156 info.major = info.minor = 0; 2166 info.major = info.minor = 0;
2157 info.raid_disk = -1; 2167 info.raid_disk = -1;
@@ -2237,6 +2247,9 @@ static int add_new_disk(mddev_t * mddev, mdu_disk_info_t *info)
2237 rdev->saved_raid_disk = rdev->raid_disk; 2247 rdev->saved_raid_disk = rdev->raid_disk;
2238 2248
2239 rdev->in_sync = 0; /* just to be sure */ 2249 rdev->in_sync = 0; /* just to be sure */
2250 if (info->state & (1<<MD_DISK_WRITEMOSTLY))
2251 set_bit(WriteMostly, &rdev->flags);
2252
2240 rdev->raid_disk = -1; 2253 rdev->raid_disk = -1;
2241 err = bind_rdev_to_array(rdev, mddev); 2254 err = bind_rdev_to_array(rdev, mddev);
2242 if (err) 2255 if (err)
@@ -2277,6 +2290,9 @@ static int add_new_disk(mddev_t * mddev, mdu_disk_info_t *info)
2277 else 2290 else
2278 rdev->in_sync = 0; 2291 rdev->in_sync = 0;
2279 2292
2293 if (info->state & (1<<MD_DISK_WRITEMOSTLY))
2294 set_bit(WriteMostly, &rdev->flags);
2295
2280 err = bind_rdev_to_array(rdev, mddev); 2296 err = bind_rdev_to_array(rdev, mddev);
2281 if (err) { 2297 if (err) {
2282 export_rdev(rdev); 2298 export_rdev(rdev);
@@ -3329,6 +3345,8 @@ static int md_seq_show(struct seq_file *seq, void *v)
3329 char b[BDEVNAME_SIZE]; 3345 char b[BDEVNAME_SIZE];
3330 seq_printf(seq, " %s[%d]", 3346 seq_printf(seq, " %s[%d]",
3331 bdevname(rdev->bdev,b), rdev->desc_nr); 3347 bdevname(rdev->bdev,b), rdev->desc_nr);
3348 if (test_bit(WriteMostly, &rdev->flags))
3349 seq_printf(seq, "(W)");
3332 if (rdev->faulty) { 3350 if (rdev->faulty) {
3333 seq_printf(seq, "(F)"); 3351 seq_printf(seq, "(F)");
3334 continue; 3352 continue;
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index ba643e4bfac9..28839a8193f2 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -360,13 +360,14 @@ static int read_balance(conf_t *conf, r1bio_t *r1_bio)
360{ 360{
361 const unsigned long this_sector = r1_bio->sector; 361 const unsigned long this_sector = r1_bio->sector;
362 int new_disk = conf->last_used, disk = new_disk; 362 int new_disk = conf->last_used, disk = new_disk;
363 int wonly_disk = -1;
363 const int sectors = r1_bio->sectors; 364 const int sectors = r1_bio->sectors;
364 sector_t new_distance, current_distance; 365 sector_t new_distance, current_distance;
365 mdk_rdev_t *new_rdev, *rdev; 366 mdk_rdev_t *rdev;
366 367
367 rcu_read_lock(); 368 rcu_read_lock();
368 /* 369 /*
369 * Check if it if we can balance. We can balance on the whole 370 * Check if we can balance. We can balance on the whole
370 * device if no resync is going on, or below the resync window. 371 * device if no resync is going on, or below the resync window.
371 * We take the first readable disk when above the resync window. 372 * We take the first readable disk when above the resync window.
372 */ 373 */
@@ -376,11 +377,16 @@ static int read_balance(conf_t *conf, r1bio_t *r1_bio)
376 /* Choose the first operation device, for consistency */ 377 /* Choose the first operation device, for consistency */
377 new_disk = 0; 378 new_disk = 0;
378 379
379 while ((new_rdev=conf->mirrors[new_disk].rdev) == NULL || 380 for (rdev = conf->mirrors[new_disk].rdev;
380 !new_rdev->in_sync) { 381 !rdev || !rdev->in_sync
381 new_disk++; 382 || test_bit(WriteMostly, &rdev->flags);
382 if (new_disk == conf->raid_disks) { 383 rdev = conf->mirrors[++new_disk].rdev) {
383 new_disk = -1; 384
385 if (rdev && rdev->in_sync)
386 wonly_disk = new_disk;
387
388 if (new_disk == conf->raid_disks - 1) {
389 new_disk = wonly_disk;
384 break; 390 break;
385 } 391 }
386 } 392 }
@@ -389,16 +395,26 @@ static int read_balance(conf_t *conf, r1bio_t *r1_bio)
389 395
390 396
391 /* make sure the disk is operational */ 397 /* make sure the disk is operational */
392 while ((new_rdev=conf->mirrors[new_disk].rdev) == NULL || 398 for (rdev = conf->mirrors[new_disk].rdev;
393 !new_rdev->in_sync) { 399 !rdev || !rdev->in_sync ||
400 test_bit(WriteMostly, &rdev->flags);
401 rdev = conf->mirrors[new_disk].rdev) {
402
403 if (rdev && rdev->in_sync)
404 wonly_disk = new_disk;
405
394 if (new_disk <= 0) 406 if (new_disk <= 0)
395 new_disk = conf->raid_disks; 407 new_disk = conf->raid_disks;
396 new_disk--; 408 new_disk--;
397 if (new_disk == disk) { 409 if (new_disk == disk) {
398 new_disk = -1; 410 new_disk = wonly_disk;
399 goto rb_out; 411 break;
400 } 412 }
401 } 413 }
414
415 if (new_disk < 0)
416 goto rb_out;
417
402 disk = new_disk; 418 disk = new_disk;
403 /* now disk == new_disk == starting point for search */ 419 /* now disk == new_disk == starting point for search */
404 420
@@ -419,37 +435,41 @@ static int read_balance(conf_t *conf, r1bio_t *r1_bio)
419 disk = conf->raid_disks; 435 disk = conf->raid_disks;
420 disk--; 436 disk--;
421 437
422 if ((rdev=conf->mirrors[disk].rdev) == NULL || 438 rdev = conf->mirrors[disk].rdev;
423 !rdev->in_sync) 439
440 if (!rdev ||
441 !rdev->in_sync ||
442 test_bit(WriteMostly, &rdev->flags))
424 continue; 443 continue;
425 444
426 if (!atomic_read(&rdev->nr_pending)) { 445 if (!atomic_read(&rdev->nr_pending)) {
427 new_disk = disk; 446 new_disk = disk;
428 new_rdev = rdev;
429 break; 447 break;
430 } 448 }
431 new_distance = abs(this_sector - conf->mirrors[disk].head_position); 449 new_distance = abs(this_sector - conf->mirrors[disk].head_position);
432 if (new_distance < current_distance) { 450 if (new_distance < current_distance) {
433 current_distance = new_distance; 451 current_distance = new_distance;
434 new_disk = disk; 452 new_disk = disk;
435 new_rdev = rdev;
436 } 453 }
437 } while (disk != conf->last_used); 454 } while (disk != conf->last_used);
438 455
439rb_out: 456 rb_out:
440 457
441 458
442 if (new_disk >= 0) { 459 if (new_disk >= 0) {
443 conf->next_seq_sect = this_sector + sectors; 460 rdev = conf->mirrors[new_disk].rdev;
444 conf->last_used = new_disk; 461 if (!rdev)
445 atomic_inc(&new_rdev->nr_pending); 462 goto retry;
446 if (!new_rdev->in_sync) { 463 atomic_inc(&rdev->nr_pending);
464 if (!rdev->in_sync) {
447 /* cannot risk returning a device that failed 465 /* cannot risk returning a device that failed
448 * before we inc'ed nr_pending 466 * before we inc'ed nr_pending
449 */ 467 */
450 atomic_dec(&new_rdev->nr_pending); 468 atomic_dec(&rdev->nr_pending);
451 goto retry; 469 goto retry;
452 } 470 }
471 conf->next_seq_sect = this_sector + sectors;
472 conf->last_used = new_disk;
453 } 473 }
454 rcu_read_unlock(); 474 rcu_read_unlock();
455 475
@@ -1109,6 +1129,7 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
1109 sector_t max_sector, nr_sectors; 1129 sector_t max_sector, nr_sectors;
1110 int disk; 1130 int disk;
1111 int i; 1131 int i;
1132 int wonly;
1112 int write_targets = 0; 1133 int write_targets = 0;
1113 int sync_blocks; 1134 int sync_blocks;
1114 int still_degraded = 0; 1135 int still_degraded = 0;
@@ -1164,14 +1185,21 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
1164 */ 1185 */
1165 disk = conf->last_used; 1186 disk = conf->last_used;
1166 /* make sure disk is operational */ 1187 /* make sure disk is operational */
1167 1188 wonly = disk;
1168 while (conf->mirrors[disk].rdev == NULL || 1189 while (conf->mirrors[disk].rdev == NULL ||
1169 !conf->mirrors[disk].rdev->in_sync) { 1190 !conf->mirrors[disk].rdev->in_sync ||
1191 test_bit(WriteMostly, &conf->mirrors[disk].rdev->flags)
1192 ) {
1193 if (conf->mirrors[disk].rdev &&
1194 conf->mirrors[disk].rdev->in_sync)
1195 wonly = disk;
1170 if (disk <= 0) 1196 if (disk <= 0)
1171 disk = conf->raid_disks; 1197 disk = conf->raid_disks;
1172 disk--; 1198 disk--;
1173 if (disk == conf->last_used) 1199 if (disk == conf->last_used) {
1200 disk = wonly;
1174 break; 1201 break;
1202 }
1175 } 1203 }
1176 conf->last_used = disk; 1204 conf->last_used = disk;
1177 atomic_inc(&conf->mirrors[disk].rdev->nr_pending); 1205 atomic_inc(&conf->mirrors[disk].rdev->nr_pending);
diff --git a/include/linux/raid/md_k.h b/include/linux/raid/md_k.h
index 817062bf7352..7ef78e15ce04 100644
--- a/include/linux/raid/md_k.h
+++ b/include/linux/raid/md_k.h
@@ -181,6 +181,9 @@ struct mdk_rdev_s
181 int faulty; /* if faulty do not issue IO requests */ 181 int faulty; /* if faulty do not issue IO requests */
182 int in_sync; /* device is a full member of the array */ 182 int in_sync; /* device is a full member of the array */
183 183
184 unsigned long flags; /* Should include faulty and in_sync here. */
185#define WriteMostly 4 /* Avoid reading if at all possible */
186
184 int desc_nr; /* descriptor index in the superblock */ 187 int desc_nr; /* descriptor index in the superblock */
185 int raid_disk; /* role of device in array */ 188 int raid_disk; /* role of device in array */
186 int saved_raid_disk; /* role that device used to have in the 189 int saved_raid_disk; /* role that device used to have in the
diff --git a/include/linux/raid/md_p.h b/include/linux/raid/md_p.h
index dc65cd435494..4f047f84fb1f 100644
--- a/include/linux/raid/md_p.h
+++ b/include/linux/raid/md_p.h
@@ -79,6 +79,11 @@
79#define MD_DISK_SYNC 2 /* disk is in sync with the raid set */ 79#define MD_DISK_SYNC 2 /* disk is in sync with the raid set */
80#define MD_DISK_REMOVED 3 /* disk is in sync with the raid set */ 80#define MD_DISK_REMOVED 3 /* disk is in sync with the raid set */
81 81
82#define MD_DISK_WRITEMOSTLY 9 /* disk is "write-mostly" in RAID1 config.
83 * read requests will only be sent here in
84 * dire need
85 */
86
82typedef struct mdp_device_descriptor_s { 87typedef struct mdp_device_descriptor_s {
83 __u32 number; /* 0 Device number in the entire set */ 88 __u32 number; /* 0 Device number in the entire set */
84 __u32 major; /* 1 Device major number */ 89 __u32 major; /* 1 Device major number */
@@ -193,7 +198,7 @@ struct mdp_superblock_1 {
193 198
194 __u64 ctime; /* lo 40 bits are seconds, top 24 are microseconds or 0*/ 199 __u64 ctime; /* lo 40 bits are seconds, top 24 are microseconds or 0*/
195 __u32 level; /* -4 (multipath), -1 (linear), 0,1,4,5 */ 200 __u32 level; /* -4 (multipath), -1 (linear), 0,1,4,5 */
196 __u32 layout; /* only for raid5 currently */ 201 __u32 layout; /* only for raid5 and raid10 currently */
197 __u64 size; /* used size of component devices, in 512byte sectors */ 202 __u64 size; /* used size of component devices, in 512byte sectors */
198 203
199 __u32 chunksize; /* in 512byte sectors */ 204 __u32 chunksize; /* in 512byte sectors */
@@ -212,7 +217,9 @@ struct mdp_superblock_1 {
212 __u32 dev_number; /* permanent identifier of this device - not role in raid */ 217 __u32 dev_number; /* permanent identifier of this device - not role in raid */
213 __u32 cnt_corrected_read; /* number of read errors that were corrected by re-writing */ 218 __u32 cnt_corrected_read; /* number of read errors that were corrected by re-writing */
214 __u8 device_uuid[16]; /* user-space setable, ignored by kernel */ 219 __u8 device_uuid[16]; /* user-space setable, ignored by kernel */
215 __u8 pad2[64-56]; /* set to 0 when writing */ 220 __u8 devflags; /* per-device flags. Only one defined...*/
221#define WriteMostly1 1 /* mask for writemostly flag in above */
222 __u8 pad2[64-57]; /* set to 0 when writing */
216 223
217 /* array state information - 64 bytes */ 224 /* array state information - 64 bytes */
218 __u64 utime; /* 40 bits seconds, 24 bits microseconds */ 225 __u64 utime; /* 40 bits seconds, 24 bits microseconds */