diff options
Diffstat (limited to 'drivers/md/raid6main.c')
-rw-r--r-- | drivers/md/raid6main.c | 348 |
1 files changed, 213 insertions, 135 deletions
diff --git a/drivers/md/raid6main.c b/drivers/md/raid6main.c index 0000d162d19..8c823d686a6 100644 --- a/drivers/md/raid6main.c +++ b/drivers/md/raid6main.c | |||
@@ -40,12 +40,10 @@ | |||
40 | #define STRIPE_SHIFT (PAGE_SHIFT - 9) | 40 | #define STRIPE_SHIFT (PAGE_SHIFT - 9) |
41 | #define STRIPE_SECTORS (STRIPE_SIZE>>9) | 41 | #define STRIPE_SECTORS (STRIPE_SIZE>>9) |
42 | #define IO_THRESHOLD 1 | 42 | #define IO_THRESHOLD 1 |
43 | #define HASH_PAGES 1 | 43 | #define NR_HASH (PAGE_SIZE / sizeof(struct hlist_head)) |
44 | #define HASH_PAGES_ORDER 0 | ||
45 | #define NR_HASH (HASH_PAGES * PAGE_SIZE / sizeof(struct stripe_head *)) | ||
46 | #define HASH_MASK (NR_HASH - 1) | 44 | #define HASH_MASK (NR_HASH - 1) |
47 | 45 | ||
48 | #define stripe_hash(conf, sect) ((conf)->stripe_hashtbl[((sect) >> STRIPE_SHIFT) & HASH_MASK]) | 46 | #define stripe_hash(conf, sect) (&((conf)->stripe_hashtbl[((sect) >> STRIPE_SHIFT) & HASH_MASK])) |
49 | 47 | ||
50 | /* bio's attached to a stripe+device for I/O are linked together in bi_sector | 48 | /* bio's attached to a stripe+device for I/O are linked together in bi_sector |
51 | * order without overlap. There may be several bio's per stripe+device, and | 49 | * order without overlap. There may be several bio's per stripe+device, and |
@@ -132,29 +130,21 @@ static void release_stripe(struct stripe_head *sh) | |||
132 | spin_unlock_irqrestore(&conf->device_lock, flags); | 130 | spin_unlock_irqrestore(&conf->device_lock, flags); |
133 | } | 131 | } |
134 | 132 | ||
135 | static void remove_hash(struct stripe_head *sh) | 133 | static inline void remove_hash(struct stripe_head *sh) |
136 | { | 134 | { |
137 | PRINTK("remove_hash(), stripe %llu\n", (unsigned long long)sh->sector); | 135 | PRINTK("remove_hash(), stripe %llu\n", (unsigned long long)sh->sector); |
138 | 136 | ||
139 | if (sh->hash_pprev) { | 137 | hlist_del_init(&sh->hash); |
140 | if (sh->hash_next) | ||
141 | sh->hash_next->hash_pprev = sh->hash_pprev; | ||
142 | *sh->hash_pprev = sh->hash_next; | ||
143 | sh->hash_pprev = NULL; | ||
144 | } | ||
145 | } | 138 | } |
146 | 139 | ||
147 | static __inline__ void insert_hash(raid6_conf_t *conf, struct stripe_head *sh) | 140 | static inline void insert_hash(raid6_conf_t *conf, struct stripe_head *sh) |
148 | { | 141 | { |
149 | struct stripe_head **shp = &stripe_hash(conf, sh->sector); | 142 | struct hlist_head *hp = stripe_hash(conf, sh->sector); |
150 | 143 | ||
151 | PRINTK("insert_hash(), stripe %llu\n", (unsigned long long)sh->sector); | 144 | PRINTK("insert_hash(), stripe %llu\n", (unsigned long long)sh->sector); |
152 | 145 | ||
153 | CHECK_DEVLOCK(); | 146 | CHECK_DEVLOCK(); |
154 | if ((sh->hash_next = *shp) != NULL) | 147 | hlist_add_head(&sh->hash, hp); |
155 | (*shp)->hash_pprev = &sh->hash_next; | ||
156 | *shp = sh; | ||
157 | sh->hash_pprev = shp; | ||
158 | } | 148 | } |
159 | 149 | ||
160 | 150 | ||
@@ -186,7 +176,7 @@ static void shrink_buffers(struct stripe_head *sh, int num) | |||
186 | if (!p) | 176 | if (!p) |
187 | continue; | 177 | continue; |
188 | sh->dev[i].page = NULL; | 178 | sh->dev[i].page = NULL; |
189 | page_cache_release(p); | 179 | put_page(p); |
190 | } | 180 | } |
191 | } | 181 | } |
192 | 182 | ||
@@ -247,10 +237,11 @@ static inline void init_stripe(struct stripe_head *sh, sector_t sector, int pd_i | |||
247 | static struct stripe_head *__find_stripe(raid6_conf_t *conf, sector_t sector) | 237 | static struct stripe_head *__find_stripe(raid6_conf_t *conf, sector_t sector) |
248 | { | 238 | { |
249 | struct stripe_head *sh; | 239 | struct stripe_head *sh; |
240 | struct hlist_node *hn; | ||
250 | 241 | ||
251 | CHECK_DEVLOCK(); | 242 | CHECK_DEVLOCK(); |
252 | PRINTK("__find_stripe, sector %llu\n", (unsigned long long)sector); | 243 | PRINTK("__find_stripe, sector %llu\n", (unsigned long long)sector); |
253 | for (sh = stripe_hash(conf, sector); sh; sh = sh->hash_next) | 244 | hlist_for_each_entry (sh, hn, stripe_hash(conf, sector), hash) |
254 | if (sh->sector == sector) | 245 | if (sh->sector == sector) |
255 | return sh; | 246 | return sh; |
256 | PRINTK("__stripe %llu not in cache\n", (unsigned long long)sector); | 247 | PRINTK("__stripe %llu not in cache\n", (unsigned long long)sector); |
@@ -367,8 +358,8 @@ static void shrink_stripes(raid6_conf_t *conf) | |||
367 | conf->slab_cache = NULL; | 358 | conf->slab_cache = NULL; |
368 | } | 359 | } |
369 | 360 | ||
370 | static int raid6_end_read_request (struct bio * bi, unsigned int bytes_done, | 361 | static int raid6_end_read_request(struct bio * bi, unsigned int bytes_done, |
371 | int error) | 362 | int error) |
372 | { | 363 | { |
373 | struct stripe_head *sh = bi->bi_private; | 364 | struct stripe_head *sh = bi->bi_private; |
374 | raid6_conf_t *conf = sh->raid_conf; | 365 | raid6_conf_t *conf = sh->raid_conf; |
@@ -420,9 +411,35 @@ static int raid6_end_read_request (struct bio * bi, unsigned int bytes_done, | |||
420 | #else | 411 | #else |
421 | set_bit(R5_UPTODATE, &sh->dev[i].flags); | 412 | set_bit(R5_UPTODATE, &sh->dev[i].flags); |
422 | #endif | 413 | #endif |
414 | if (test_bit(R5_ReadError, &sh->dev[i].flags)) { | ||
415 | printk(KERN_INFO "raid6: read error corrected!!\n"); | ||
416 | clear_bit(R5_ReadError, &sh->dev[i].flags); | ||
417 | clear_bit(R5_ReWrite, &sh->dev[i].flags); | ||
418 | } | ||
419 | if (atomic_read(&conf->disks[i].rdev->read_errors)) | ||
420 | atomic_set(&conf->disks[i].rdev->read_errors, 0); | ||
423 | } else { | 421 | } else { |
424 | md_error(conf->mddev, conf->disks[i].rdev); | 422 | int retry = 0; |
425 | clear_bit(R5_UPTODATE, &sh->dev[i].flags); | 423 | clear_bit(R5_UPTODATE, &sh->dev[i].flags); |
424 | atomic_inc(&conf->disks[i].rdev->read_errors); | ||
425 | if (conf->mddev->degraded) | ||
426 | printk(KERN_WARNING "raid6: read error not correctable.\n"); | ||
427 | else if (test_bit(R5_ReWrite, &sh->dev[i].flags)) | ||
428 | /* Oh, no!!! */ | ||
429 | printk(KERN_WARNING "raid6: read error NOT corrected!!\n"); | ||
430 | else if (atomic_read(&conf->disks[i].rdev->read_errors) | ||
431 | > conf->max_nr_stripes) | ||
432 | printk(KERN_WARNING | ||
433 | "raid6: Too many read errors, failing device.\n"); | ||
434 | else | ||
435 | retry = 1; | ||
436 | if (retry) | ||
437 | set_bit(R5_ReadError, &sh->dev[i].flags); | ||
438 | else { | ||
439 | clear_bit(R5_ReadError, &sh->dev[i].flags); | ||
440 | clear_bit(R5_ReWrite, &sh->dev[i].flags); | ||
441 | md_error(conf->mddev, conf->disks[i].rdev); | ||
442 | } | ||
426 | } | 443 | } |
427 | rdev_dec_pending(conf->disks[i].rdev, conf->mddev); | 444 | rdev_dec_pending(conf->disks[i].rdev, conf->mddev); |
428 | #if 0 | 445 | #if 0 |
@@ -805,7 +822,7 @@ static void compute_parity(struct stripe_head *sh, int method) | |||
805 | } | 822 | } |
806 | 823 | ||
807 | /* Compute one missing block */ | 824 | /* Compute one missing block */ |
808 | static void compute_block_1(struct stripe_head *sh, int dd_idx) | 825 | static void compute_block_1(struct stripe_head *sh, int dd_idx, int nozero) |
809 | { | 826 | { |
810 | raid6_conf_t *conf = sh->raid_conf; | 827 | raid6_conf_t *conf = sh->raid_conf; |
811 | int i, count, disks = conf->raid_disks; | 828 | int i, count, disks = conf->raid_disks; |
@@ -821,7 +838,7 @@ static void compute_block_1(struct stripe_head *sh, int dd_idx) | |||
821 | compute_parity(sh, UPDATE_PARITY); | 838 | compute_parity(sh, UPDATE_PARITY); |
822 | } else { | 839 | } else { |
823 | ptr[0] = page_address(sh->dev[dd_idx].page); | 840 | ptr[0] = page_address(sh->dev[dd_idx].page); |
824 | memset(ptr[0], 0, STRIPE_SIZE); | 841 | if (!nozero) memset(ptr[0], 0, STRIPE_SIZE); |
825 | count = 1; | 842 | count = 1; |
826 | for (i = disks ; i--; ) { | 843 | for (i = disks ; i--; ) { |
827 | if (i == dd_idx || i == qd_idx) | 844 | if (i == dd_idx || i == qd_idx) |
@@ -838,7 +855,8 @@ static void compute_block_1(struct stripe_head *sh, int dd_idx) | |||
838 | } | 855 | } |
839 | if (count != 1) | 856 | if (count != 1) |
840 | xor_block(count, STRIPE_SIZE, ptr); | 857 | xor_block(count, STRIPE_SIZE, ptr); |
841 | set_bit(R5_UPTODATE, &sh->dev[dd_idx].flags); | 858 | if (!nozero) set_bit(R5_UPTODATE, &sh->dev[dd_idx].flags); |
859 | else clear_bit(R5_UPTODATE, &sh->dev[dd_idx].flags); | ||
842 | } | 860 | } |
843 | } | 861 | } |
844 | 862 | ||
@@ -871,7 +889,7 @@ static void compute_block_2(struct stripe_head *sh, int dd_idx1, int dd_idx2) | |||
871 | return; | 889 | return; |
872 | } else { | 890 | } else { |
873 | /* We're missing D+Q; recompute D from P */ | 891 | /* We're missing D+Q; recompute D from P */ |
874 | compute_block_1(sh, (dd_idx1 == qd_idx) ? dd_idx2 : dd_idx1); | 892 | compute_block_1(sh, (dd_idx1 == qd_idx) ? dd_idx2 : dd_idx1, 0); |
875 | compute_parity(sh, UPDATE_PARITY); /* Is this necessary? */ | 893 | compute_parity(sh, UPDATE_PARITY); /* Is this necessary? */ |
876 | return; | 894 | return; |
877 | } | 895 | } |
@@ -982,6 +1000,12 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, in | |||
982 | } | 1000 | } |
983 | 1001 | ||
984 | 1002 | ||
1003 | static int page_is_zero(struct page *p) | ||
1004 | { | ||
1005 | char *a = page_address(p); | ||
1006 | return ((*(u32*)a) == 0 && | ||
1007 | memcmp(a, a+4, STRIPE_SIZE-4)==0); | ||
1008 | } | ||
985 | /* | 1009 | /* |
986 | * handle_stripe - do things to a stripe. | 1010 | * handle_stripe - do things to a stripe. |
987 | * | 1011 | * |
@@ -1000,7 +1024,7 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, in | |||
1000 | * | 1024 | * |
1001 | */ | 1025 | */ |
1002 | 1026 | ||
1003 | static void handle_stripe(struct stripe_head *sh) | 1027 | static void handle_stripe(struct stripe_head *sh, struct page *tmp_page) |
1004 | { | 1028 | { |
1005 | raid6_conf_t *conf = sh->raid_conf; | 1029 | raid6_conf_t *conf = sh->raid_conf; |
1006 | int disks = conf->raid_disks; | 1030 | int disks = conf->raid_disks; |
@@ -1027,11 +1051,11 @@ static void handle_stripe(struct stripe_head *sh) | |||
1027 | syncing = test_bit(STRIPE_SYNCING, &sh->state); | 1051 | syncing = test_bit(STRIPE_SYNCING, &sh->state); |
1028 | /* Now to look around and see what can be done */ | 1052 | /* Now to look around and see what can be done */ |
1029 | 1053 | ||
1054 | rcu_read_lock(); | ||
1030 | for (i=disks; i--; ) { | 1055 | for (i=disks; i--; ) { |
1031 | mdk_rdev_t *rdev; | 1056 | mdk_rdev_t *rdev; |
1032 | dev = &sh->dev[i]; | 1057 | dev = &sh->dev[i]; |
1033 | clear_bit(R5_Insync, &dev->flags); | 1058 | clear_bit(R5_Insync, &dev->flags); |
1034 | clear_bit(R5_Syncio, &dev->flags); | ||
1035 | 1059 | ||
1036 | PRINTK("check %d: state 0x%lx read %p write %p written %p\n", | 1060 | PRINTK("check %d: state 0x%lx read %p write %p written %p\n", |
1037 | i, dev->flags, dev->toread, dev->towrite, dev->written); | 1061 | i, dev->flags, dev->toread, dev->towrite, dev->written); |
@@ -1070,14 +1094,21 @@ static void handle_stripe(struct stripe_head *sh) | |||
1070 | non_overwrite++; | 1094 | non_overwrite++; |
1071 | } | 1095 | } |
1072 | if (dev->written) written++; | 1096 | if (dev->written) written++; |
1073 | rdev = conf->disks[i].rdev; /* FIXME, should I be looking rdev */ | 1097 | rdev = rcu_dereference(conf->disks[i].rdev); |
1074 | if (!rdev || !test_bit(In_sync, &rdev->flags)) { | 1098 | if (!rdev || !test_bit(In_sync, &rdev->flags)) { |
1099 | /* The ReadError flag will just be confusing now */ | ||
1100 | clear_bit(R5_ReadError, &dev->flags); | ||
1101 | clear_bit(R5_ReWrite, &dev->flags); | ||
1102 | } | ||
1103 | if (!rdev || !test_bit(In_sync, &rdev->flags) | ||
1104 | || test_bit(R5_ReadError, &dev->flags)) { | ||
1075 | if ( failed < 2 ) | 1105 | if ( failed < 2 ) |
1076 | failed_num[failed] = i; | 1106 | failed_num[failed] = i; |
1077 | failed++; | 1107 | failed++; |
1078 | } else | 1108 | } else |
1079 | set_bit(R5_Insync, &dev->flags); | 1109 | set_bit(R5_Insync, &dev->flags); |
1080 | } | 1110 | } |
1111 | rcu_read_unlock(); | ||
1081 | PRINTK("locked=%d uptodate=%d to_read=%d" | 1112 | PRINTK("locked=%d uptodate=%d to_read=%d" |
1082 | " to_write=%d failed=%d failed_num=%d,%d\n", | 1113 | " to_write=%d failed=%d failed_num=%d,%d\n", |
1083 | locked, uptodate, to_read, to_write, failed, | 1114 | locked, uptodate, to_read, to_write, failed, |
@@ -1088,6 +1119,17 @@ static void handle_stripe(struct stripe_head *sh) | |||
1088 | if (failed > 2 && to_read+to_write+written) { | 1119 | if (failed > 2 && to_read+to_write+written) { |
1089 | for (i=disks; i--; ) { | 1120 | for (i=disks; i--; ) { |
1090 | int bitmap_end = 0; | 1121 | int bitmap_end = 0; |
1122 | |||
1123 | if (test_bit(R5_ReadError, &sh->dev[i].flags)) { | ||
1124 | mdk_rdev_t *rdev; | ||
1125 | rcu_read_lock(); | ||
1126 | rdev = rcu_dereference(conf->disks[i].rdev); | ||
1127 | if (rdev && test_bit(In_sync, &rdev->flags)) | ||
1128 | /* multiple read failures in one stripe */ | ||
1129 | md_error(conf->mddev, rdev); | ||
1130 | rcu_read_unlock(); | ||
1131 | } | ||
1132 | |||
1091 | spin_lock_irq(&conf->device_lock); | 1133 | spin_lock_irq(&conf->device_lock); |
1092 | /* fail all writes first */ | 1134 | /* fail all writes first */ |
1093 | bi = sh->dev[i].towrite; | 1135 | bi = sh->dev[i].towrite; |
@@ -1123,7 +1165,8 @@ static void handle_stripe(struct stripe_head *sh) | |||
1123 | } | 1165 | } |
1124 | 1166 | ||
1125 | /* fail any reads if this device is non-operational */ | 1167 | /* fail any reads if this device is non-operational */ |
1126 | if (!test_bit(R5_Insync, &sh->dev[i].flags)) { | 1168 | if (!test_bit(R5_Insync, &sh->dev[i].flags) || |
1169 | test_bit(R5_ReadError, &sh->dev[i].flags)) { | ||
1127 | bi = sh->dev[i].toread; | 1170 | bi = sh->dev[i].toread; |
1128 | sh->dev[i].toread = NULL; | 1171 | sh->dev[i].toread = NULL; |
1129 | if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags)) | 1172 | if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags)) |
@@ -1228,7 +1271,7 @@ static void handle_stripe(struct stripe_head *sh) | |||
1228 | if (uptodate == disks-1) { | 1271 | if (uptodate == disks-1) { |
1229 | PRINTK("Computing stripe %llu block %d\n", | 1272 | PRINTK("Computing stripe %llu block %d\n", |
1230 | (unsigned long long)sh->sector, i); | 1273 | (unsigned long long)sh->sector, i); |
1231 | compute_block_1(sh, i); | 1274 | compute_block_1(sh, i, 0); |
1232 | uptodate++; | 1275 | uptodate++; |
1233 | } else if ( uptodate == disks-2 && failed >= 2 ) { | 1276 | } else if ( uptodate == disks-2 && failed >= 2 ) { |
1234 | /* Computing 2-failure is *very* expensive; only do it if failed >= 2 */ | 1277 | /* Computing 2-failure is *very* expensive; only do it if failed >= 2 */ |
@@ -1259,9 +1302,6 @@ static void handle_stripe(struct stripe_head *sh) | |||
1259 | locked++; | 1302 | locked++; |
1260 | PRINTK("Reading block %d (sync=%d)\n", | 1303 | PRINTK("Reading block %d (sync=%d)\n", |
1261 | i, syncing); | 1304 | i, syncing); |
1262 | if (syncing) | ||
1263 | md_sync_acct(conf->disks[i].rdev->bdev, | ||
1264 | STRIPE_SECTORS); | ||
1265 | } | 1305 | } |
1266 | } | 1306 | } |
1267 | } | 1307 | } |
@@ -1323,7 +1363,7 @@ static void handle_stripe(struct stripe_head *sh) | |||
1323 | /* We have failed blocks and need to compute them */ | 1363 | /* We have failed blocks and need to compute them */ |
1324 | switch ( failed ) { | 1364 | switch ( failed ) { |
1325 | case 0: BUG(); | 1365 | case 0: BUG(); |
1326 | case 1: compute_block_1(sh, failed_num[0]); break; | 1366 | case 1: compute_block_1(sh, failed_num[0], 0); break; |
1327 | case 2: compute_block_2(sh, failed_num[0], failed_num[1]); break; | 1367 | case 2: compute_block_2(sh, failed_num[0], failed_num[1]); break; |
1328 | default: BUG(); /* This request should have been failed? */ | 1368 | default: BUG(); /* This request should have been failed? */ |
1329 | } | 1369 | } |
@@ -1338,12 +1378,10 @@ static void handle_stripe(struct stripe_head *sh) | |||
1338 | (unsigned long long)sh->sector, i); | 1378 | (unsigned long long)sh->sector, i); |
1339 | locked++; | 1379 | locked++; |
1340 | set_bit(R5_Wantwrite, &sh->dev[i].flags); | 1380 | set_bit(R5_Wantwrite, &sh->dev[i].flags); |
1341 | #if 0 /**** FIX: I don't understand the logic here... ****/ | ||
1342 | if (!test_bit(R5_Insync, &sh->dev[i].flags) | ||
1343 | || ((i==pd_idx || i==qd_idx) && failed == 0)) /* FIX? */ | ||
1344 | set_bit(STRIPE_INSYNC, &sh->state); | ||
1345 | #endif | ||
1346 | } | 1381 | } |
1382 | /* after a RECONSTRUCT_WRITE, the stripe MUST be in-sync */ | ||
1383 | set_bit(STRIPE_INSYNC, &sh->state); | ||
1384 | |||
1347 | if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) { | 1385 | if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) { |
1348 | atomic_dec(&conf->preread_active_stripes); | 1386 | atomic_dec(&conf->preread_active_stripes); |
1349 | if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD) | 1387 | if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD) |
@@ -1356,84 +1394,119 @@ static void handle_stripe(struct stripe_head *sh) | |||
1356 | * Any reads will already have been scheduled, so we just see if enough data | 1394 | * Any reads will already have been scheduled, so we just see if enough data |
1357 | * is available | 1395 | * is available |
1358 | */ | 1396 | */ |
1359 | if (syncing && locked == 0 && | 1397 | if (syncing && locked == 0 && !test_bit(STRIPE_INSYNC, &sh->state)) { |
1360 | !test_bit(STRIPE_INSYNC, &sh->state) && failed <= 2) { | 1398 | int update_p = 0, update_q = 0; |
1361 | set_bit(STRIPE_HANDLE, &sh->state); | 1399 | struct r5dev *dev; |
1362 | #if 0 /* RAID-6: Don't support CHECK PARITY yet */ | ||
1363 | if (failed == 0) { | ||
1364 | char *pagea; | ||
1365 | if (uptodate != disks) | ||
1366 | BUG(); | ||
1367 | compute_parity(sh, CHECK_PARITY); | ||
1368 | uptodate--; | ||
1369 | pagea = page_address(sh->dev[pd_idx].page); | ||
1370 | if ((*(u32*)pagea) == 0 && | ||
1371 | !memcmp(pagea, pagea+4, STRIPE_SIZE-4)) { | ||
1372 | /* parity is correct (on disc, not in buffer any more) */ | ||
1373 | set_bit(STRIPE_INSYNC, &sh->state); | ||
1374 | } | ||
1375 | } | ||
1376 | #endif | ||
1377 | if (!test_bit(STRIPE_INSYNC, &sh->state)) { | ||
1378 | int failed_needupdate[2]; | ||
1379 | struct r5dev *adev, *bdev; | ||
1380 | |||
1381 | if ( failed < 1 ) | ||
1382 | failed_num[0] = pd_idx; | ||
1383 | if ( failed < 2 ) | ||
1384 | failed_num[1] = (failed_num[0] == qd_idx) ? pd_idx : qd_idx; | ||
1385 | 1400 | ||
1386 | failed_needupdate[0] = !test_bit(R5_UPTODATE, &sh->dev[failed_num[0]].flags); | 1401 | set_bit(STRIPE_HANDLE, &sh->state); |
1387 | failed_needupdate[1] = !test_bit(R5_UPTODATE, &sh->dev[failed_num[1]].flags); | ||
1388 | 1402 | ||
1389 | PRINTK("sync: failed=%d num=%d,%d fnu=%u%u\n", | 1403 | BUG_ON(failed>2); |
1390 | failed, failed_num[0], failed_num[1], failed_needupdate[0], failed_needupdate[1]); | 1404 | BUG_ON(uptodate < disks); |
1405 | /* Want to check and possibly repair P and Q. | ||
1406 | * However there could be one 'failed' device, in which | ||
1407 | * case we can only check one of them, possibly using the | ||
1408 | * other to generate missing data | ||
1409 | */ | ||
1391 | 1410 | ||
1392 | #if 0 /* RAID-6: This code seems to require that CHECK_PARITY destroys the uptodateness of the parity */ | 1411 | /* If !tmp_page, we cannot do the calculations, |
1393 | /* should be able to compute the missing block(s) and write to spare */ | 1412 | * but as we have set STRIPE_HANDLE, we will soon be called |
1394 | if ( failed_needupdate[0] ^ failed_needupdate[1] ) { | 1413 | * by stripe_handle with a tmp_page - just wait until then. |
1395 | if (uptodate+1 != disks) | 1414 | */ |
1396 | BUG(); | 1415 | if (tmp_page) { |
1397 | compute_block_1(sh, failed_needupdate[0] ? failed_num[0] : failed_num[1]); | 1416 | if (failed == q_failed) { |
1398 | uptodate++; | 1417 | /* The only possible failed device holds 'Q', so it makes |
1399 | } else if ( failed_needupdate[0] & failed_needupdate[1] ) { | 1418 | * sense to check P (If anything else were failed, we would |
1400 | if (uptodate+2 != disks) | 1419 | * have used P to recreate it). |
1401 | BUG(); | 1420 | */ |
1402 | compute_block_2(sh, failed_num[0], failed_num[1]); | 1421 | compute_block_1(sh, pd_idx, 1); |
1403 | uptodate += 2; | 1422 | if (!page_is_zero(sh->dev[pd_idx].page)) { |
1423 | compute_block_1(sh,pd_idx,0); | ||
1424 | update_p = 1; | ||
1425 | } | ||
1426 | } | ||
1427 | if (!q_failed && failed < 2) { | ||
1428 | /* q is not failed, and we didn't use it to generate | ||
1429 | * anything, so it makes sense to check it | ||
1430 | */ | ||
1431 | memcpy(page_address(tmp_page), | ||
1432 | page_address(sh->dev[qd_idx].page), | ||
1433 | STRIPE_SIZE); | ||
1434 | compute_parity(sh, UPDATE_PARITY); | ||
1435 | if (memcmp(page_address(tmp_page), | ||
1436 | page_address(sh->dev[qd_idx].page), | ||
1437 | STRIPE_SIZE)!= 0) { | ||
1438 | clear_bit(STRIPE_INSYNC, &sh->state); | ||
1439 | update_q = 1; | ||
1440 | } | ||
1441 | } | ||
1442 | if (update_p || update_q) { | ||
1443 | conf->mddev->resync_mismatches += STRIPE_SECTORS; | ||
1444 | if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery)) | ||
1445 | /* don't try to repair!! */ | ||
1446 | update_p = update_q = 0; | ||
1404 | } | 1447 | } |
1405 | #else | ||
1406 | compute_block_2(sh, failed_num[0], failed_num[1]); | ||
1407 | uptodate += failed_needupdate[0] + failed_needupdate[1]; | ||
1408 | #endif | ||
1409 | 1448 | ||
1410 | if (uptodate != disks) | 1449 | /* now write out any block on a failed drive, |
1411 | BUG(); | 1450 | * or P or Q if they need it |
1451 | */ | ||
1412 | 1452 | ||
1413 | PRINTK("Marking for sync stripe %llu blocks %d,%d\n", | 1453 | if (failed == 2) { |
1414 | (unsigned long long)sh->sector, failed_num[0], failed_num[1]); | 1454 | dev = &sh->dev[failed_num[1]]; |
1455 | locked++; | ||
1456 | set_bit(R5_LOCKED, &dev->flags); | ||
1457 | set_bit(R5_Wantwrite, &dev->flags); | ||
1458 | } | ||
1459 | if (failed >= 1) { | ||
1460 | dev = &sh->dev[failed_num[0]]; | ||
1461 | locked++; | ||
1462 | set_bit(R5_LOCKED, &dev->flags); | ||
1463 | set_bit(R5_Wantwrite, &dev->flags); | ||
1464 | } | ||
1415 | 1465 | ||
1416 | /**** FIX: Should we really do both of these unconditionally? ****/ | 1466 | if (update_p) { |
1417 | adev = &sh->dev[failed_num[0]]; | 1467 | dev = &sh->dev[pd_idx]; |
1418 | locked += !test_bit(R5_LOCKED, &adev->flags); | 1468 | locked ++; |
1419 | set_bit(R5_LOCKED, &adev->flags); | 1469 | set_bit(R5_LOCKED, &dev->flags); |
1420 | set_bit(R5_Wantwrite, &adev->flags); | 1470 | set_bit(R5_Wantwrite, &dev->flags); |
1421 | bdev = &sh->dev[failed_num[1]]; | 1471 | } |
1422 | locked += !test_bit(R5_LOCKED, &bdev->flags); | 1472 | if (update_q) { |
1423 | set_bit(R5_LOCKED, &bdev->flags); | 1473 | dev = &sh->dev[qd_idx]; |
1474 | locked++; | ||
1475 | set_bit(R5_LOCKED, &dev->flags); | ||
1476 | set_bit(R5_Wantwrite, &dev->flags); | ||
1477 | } | ||
1424 | clear_bit(STRIPE_DEGRADED, &sh->state); | 1478 | clear_bit(STRIPE_DEGRADED, &sh->state); |
1425 | set_bit(R5_Wantwrite, &bdev->flags); | ||
1426 | 1479 | ||
1427 | set_bit(STRIPE_INSYNC, &sh->state); | 1480 | set_bit(STRIPE_INSYNC, &sh->state); |
1428 | set_bit(R5_Syncio, &adev->flags); | ||
1429 | set_bit(R5_Syncio, &bdev->flags); | ||
1430 | } | 1481 | } |
1431 | } | 1482 | } |
1483 | |||
1432 | if (syncing && locked == 0 && test_bit(STRIPE_INSYNC, &sh->state)) { | 1484 | if (syncing && locked == 0 && test_bit(STRIPE_INSYNC, &sh->state)) { |
1433 | md_done_sync(conf->mddev, STRIPE_SECTORS,1); | 1485 | md_done_sync(conf->mddev, STRIPE_SECTORS,1); |
1434 | clear_bit(STRIPE_SYNCING, &sh->state); | 1486 | clear_bit(STRIPE_SYNCING, &sh->state); |
1435 | } | 1487 | } |
1436 | 1488 | ||
1489 | /* If the failed drives are just a ReadError, then we might need | ||
1490 | * to progress the repair/check process | ||
1491 | */ | ||
1492 | if (failed <= 2 && ! conf->mddev->ro) | ||
1493 | for (i=0; i<failed;i++) { | ||
1494 | dev = &sh->dev[failed_num[i]]; | ||
1495 | if (test_bit(R5_ReadError, &dev->flags) | ||
1496 | && !test_bit(R5_LOCKED, &dev->flags) | ||
1497 | && test_bit(R5_UPTODATE, &dev->flags) | ||
1498 | ) { | ||
1499 | if (!test_bit(R5_ReWrite, &dev->flags)) { | ||
1500 | set_bit(R5_Wantwrite, &dev->flags); | ||
1501 | set_bit(R5_ReWrite, &dev->flags); | ||
1502 | set_bit(R5_LOCKED, &dev->flags); | ||
1503 | } else { | ||
1504 | /* let's read it back */ | ||
1505 | set_bit(R5_Wantread, &dev->flags); | ||
1506 | set_bit(R5_LOCKED, &dev->flags); | ||
1507 | } | ||
1508 | } | ||
1509 | } | ||
1437 | spin_unlock(&sh->lock); | 1510 | spin_unlock(&sh->lock); |
1438 | 1511 | ||
1439 | while ((bi=return_bi)) { | 1512 | while ((bi=return_bi)) { |
@@ -1472,7 +1545,7 @@ static void handle_stripe(struct stripe_head *sh) | |||
1472 | rcu_read_unlock(); | 1545 | rcu_read_unlock(); |
1473 | 1546 | ||
1474 | if (rdev) { | 1547 | if (rdev) { |
1475 | if (test_bit(R5_Syncio, &sh->dev[i].flags)) | 1548 | if (syncing) |
1476 | md_sync_acct(rdev->bdev, STRIPE_SECTORS); | 1549 | md_sync_acct(rdev->bdev, STRIPE_SECTORS); |
1477 | 1550 | ||
1478 | bi->bi_bdev = rdev->bdev; | 1551 | bi->bi_bdev = rdev->bdev; |
@@ -1489,6 +1562,9 @@ static void handle_stripe(struct stripe_head *sh) | |||
1489 | bi->bi_io_vec[0].bv_offset = 0; | 1562 | bi->bi_io_vec[0].bv_offset = 0; |
1490 | bi->bi_size = STRIPE_SIZE; | 1563 | bi->bi_size = STRIPE_SIZE; |
1491 | bi->bi_next = NULL; | 1564 | bi->bi_next = NULL; |
1565 | if (rw == WRITE && | ||
1566 | test_bit(R5_ReWrite, &sh->dev[i].flags)) | ||
1567 | atomic_add(STRIPE_SECTORS, &rdev->corrected_errors); | ||
1492 | generic_make_request(bi); | 1568 | generic_make_request(bi); |
1493 | } else { | 1569 | } else { |
1494 | if (rw == 1) | 1570 | if (rw == 1) |
@@ -1664,7 +1740,7 @@ static int make_request (request_queue_t *q, struct bio * bi) | |||
1664 | } | 1740 | } |
1665 | finish_wait(&conf->wait_for_overlap, &w); | 1741 | finish_wait(&conf->wait_for_overlap, &w); |
1666 | raid6_plug_device(conf); | 1742 | raid6_plug_device(conf); |
1667 | handle_stripe(sh); | 1743 | handle_stripe(sh, NULL); |
1668 | release_stripe(sh); | 1744 | release_stripe(sh); |
1669 | } else { | 1745 | } else { |
1670 | /* cannot get stripe for read-ahead, just give-up */ | 1746 | /* cannot get stripe for read-ahead, just give-up */ |
@@ -1728,6 +1804,7 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i | |||
1728 | return rv; | 1804 | return rv; |
1729 | } | 1805 | } |
1730 | if (!bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, 1) && | 1806 | if (!bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, 1) && |
1807 | !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery) && | ||
1731 | !conf->fullsync && sync_blocks >= STRIPE_SECTORS) { | 1808 | !conf->fullsync && sync_blocks >= STRIPE_SECTORS) { |
1732 | /* we can skip this block, and probably more */ | 1809 | /* we can skip this block, and probably more */ |
1733 | sync_blocks /= STRIPE_SECTORS; | 1810 | sync_blocks /= STRIPE_SECTORS; |
@@ -1765,7 +1842,7 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i | |||
1765 | clear_bit(STRIPE_INSYNC, &sh->state); | 1842 | clear_bit(STRIPE_INSYNC, &sh->state); |
1766 | spin_unlock(&sh->lock); | 1843 | spin_unlock(&sh->lock); |
1767 | 1844 | ||
1768 | handle_stripe(sh); | 1845 | handle_stripe(sh, NULL); |
1769 | release_stripe(sh); | 1846 | release_stripe(sh); |
1770 | 1847 | ||
1771 | return STRIPE_SECTORS; | 1848 | return STRIPE_SECTORS; |
@@ -1821,7 +1898,7 @@ static void raid6d (mddev_t *mddev) | |||
1821 | spin_unlock_irq(&conf->device_lock); | 1898 | spin_unlock_irq(&conf->device_lock); |
1822 | 1899 | ||
1823 | handled++; | 1900 | handled++; |
1824 | handle_stripe(sh); | 1901 | handle_stripe(sh, conf->spare_page); |
1825 | release_stripe(sh); | 1902 | release_stripe(sh); |
1826 | 1903 | ||
1827 | spin_lock_irq(&conf->device_lock); | 1904 | spin_lock_irq(&conf->device_lock); |
@@ -1848,17 +1925,19 @@ static int run(mddev_t *mddev) | |||
1848 | return -EIO; | 1925 | return -EIO; |
1849 | } | 1926 | } |
1850 | 1927 | ||
1851 | mddev->private = kmalloc (sizeof (raid6_conf_t) | 1928 | mddev->private = kzalloc(sizeof (raid6_conf_t) |
1852 | + mddev->raid_disks * sizeof(struct disk_info), | 1929 | + mddev->raid_disks * sizeof(struct disk_info), |
1853 | GFP_KERNEL); | 1930 | GFP_KERNEL); |
1854 | if ((conf = mddev->private) == NULL) | 1931 | if ((conf = mddev->private) == NULL) |
1855 | goto abort; | 1932 | goto abort; |
1856 | memset (conf, 0, sizeof (*conf) + mddev->raid_disks * sizeof(struct disk_info) ); | ||
1857 | conf->mddev = mddev; | 1933 | conf->mddev = mddev; |
1858 | 1934 | ||
1859 | if ((conf->stripe_hashtbl = (struct stripe_head **) __get_free_pages(GFP_ATOMIC, HASH_PAGES_ORDER)) == NULL) | 1935 | if ((conf->stripe_hashtbl = kzalloc(PAGE_SIZE, GFP_KERNEL)) == NULL) |
1936 | goto abort; | ||
1937 | |||
1938 | conf->spare_page = alloc_page(GFP_KERNEL); | ||
1939 | if (!conf->spare_page) | ||
1860 | goto abort; | 1940 | goto abort; |
1861 | memset(conf->stripe_hashtbl, 0, HASH_PAGES * PAGE_SIZE); | ||
1862 | 1941 | ||
1863 | spin_lock_init(&conf->device_lock); | 1942 | spin_lock_init(&conf->device_lock); |
1864 | init_waitqueue_head(&conf->wait_for_stripe); | 1943 | init_waitqueue_head(&conf->wait_for_stripe); |
@@ -1929,13 +2008,18 @@ static int run(mddev_t *mddev) | |||
1929 | goto abort; | 2008 | goto abort; |
1930 | } | 2009 | } |
1931 | 2010 | ||
1932 | #if 0 /* FIX: For now */ | ||
1933 | if (mddev->degraded > 0 && | 2011 | if (mddev->degraded > 0 && |
1934 | mddev->recovery_cp != MaxSector) { | 2012 | mddev->recovery_cp != MaxSector) { |
1935 | printk(KERN_ERR "raid6: cannot start dirty degraded array for %s\n", mdname(mddev)); | 2013 | if (mddev->ok_start_degraded) |
1936 | goto abort; | 2014 | printk(KERN_WARNING "raid6: starting dirty degraded array:%s" |
2015 | "- data corruption possible.\n", | ||
2016 | mdname(mddev)); | ||
2017 | else { | ||
2018 | printk(KERN_ERR "raid6: cannot start dirty degraded array" | ||
2019 | " for %s\n", mdname(mddev)); | ||
2020 | goto abort; | ||
2021 | } | ||
1937 | } | 2022 | } |
1938 | #endif | ||
1939 | 2023 | ||
1940 | { | 2024 | { |
1941 | mddev->thread = md_register_thread(raid6d, mddev, "%s_raid6"); | 2025 | mddev->thread = md_register_thread(raid6d, mddev, "%s_raid6"); |
@@ -1977,7 +2061,7 @@ static int run(mddev_t *mddev) | |||
1977 | */ | 2061 | */ |
1978 | { | 2062 | { |
1979 | int stripe = (mddev->raid_disks-2) * mddev->chunk_size | 2063 | int stripe = (mddev->raid_disks-2) * mddev->chunk_size |
1980 | / PAGE_CACHE_SIZE; | 2064 | / PAGE_SIZE; |
1981 | if (mddev->queue->backing_dev_info.ra_pages < 2 * stripe) | 2065 | if (mddev->queue->backing_dev_info.ra_pages < 2 * stripe) |
1982 | mddev->queue->backing_dev_info.ra_pages = 2 * stripe; | 2066 | mddev->queue->backing_dev_info.ra_pages = 2 * stripe; |
1983 | } | 2067 | } |
@@ -1985,18 +2069,14 @@ static int run(mddev_t *mddev) | |||
1985 | /* Ok, everything is just fine now */ | 2069 | /* Ok, everything is just fine now */ |
1986 | mddev->array_size = mddev->size * (mddev->raid_disks - 2); | 2070 | mddev->array_size = mddev->size * (mddev->raid_disks - 2); |
1987 | 2071 | ||
1988 | if (mddev->bitmap) | ||
1989 | mddev->thread->timeout = mddev->bitmap->daemon_sleep * HZ; | ||
1990 | |||
1991 | mddev->queue->unplug_fn = raid6_unplug_device; | 2072 | mddev->queue->unplug_fn = raid6_unplug_device; |
1992 | mddev->queue->issue_flush_fn = raid6_issue_flush; | 2073 | mddev->queue->issue_flush_fn = raid6_issue_flush; |
1993 | return 0; | 2074 | return 0; |
1994 | abort: | 2075 | abort: |
1995 | if (conf) { | 2076 | if (conf) { |
1996 | print_raid6_conf(conf); | 2077 | print_raid6_conf(conf); |
1997 | if (conf->stripe_hashtbl) | 2078 | safe_put_page(conf->spare_page); |
1998 | free_pages((unsigned long) conf->stripe_hashtbl, | 2079 | kfree(conf->stripe_hashtbl); |
1999 | HASH_PAGES_ORDER); | ||
2000 | kfree(conf); | 2080 | kfree(conf); |
2001 | } | 2081 | } |
2002 | mddev->private = NULL; | 2082 | mddev->private = NULL; |
@@ -2013,7 +2093,7 @@ static int stop (mddev_t *mddev) | |||
2013 | md_unregister_thread(mddev->thread); | 2093 | md_unregister_thread(mddev->thread); |
2014 | mddev->thread = NULL; | 2094 | mddev->thread = NULL; |
2015 | shrink_stripes(conf); | 2095 | shrink_stripes(conf); |
2016 | free_pages((unsigned long) conf->stripe_hashtbl, HASH_PAGES_ORDER); | 2096 | kfree(conf->stripe_hashtbl); |
2017 | blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/ | 2097 | blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/ |
2018 | kfree(conf); | 2098 | kfree(conf); |
2019 | mddev->private = NULL; | 2099 | mddev->private = NULL; |
@@ -2040,12 +2120,13 @@ static void print_sh (struct seq_file *seq, struct stripe_head *sh) | |||
2040 | static void printall (struct seq_file *seq, raid6_conf_t *conf) | 2120 | static void printall (struct seq_file *seq, raid6_conf_t *conf) |
2041 | { | 2121 | { |
2042 | struct stripe_head *sh; | 2122 | struct stripe_head *sh; |
2123 | struct hlist_node *hn; | ||
2043 | int i; | 2124 | int i; |
2044 | 2125 | ||
2045 | spin_lock_irq(&conf->device_lock); | 2126 | spin_lock_irq(&conf->device_lock); |
2046 | for (i = 0; i < NR_HASH; i++) { | 2127 | for (i = 0; i < NR_HASH; i++) { |
2047 | sh = conf->stripe_hashtbl[i]; | 2128 | sh = conf->stripe_hashtbl[i]; |
2048 | for (; sh; sh = sh->hash_next) { | 2129 | hlist_for_each_entry(sh, hn, &conf->stripe_hashtbl[i], hash) { |
2049 | if (sh->raid_conf != conf) | 2130 | if (sh->raid_conf != conf) |
2050 | continue; | 2131 | continue; |
2051 | print_sh(seq, sh); | 2132 | print_sh(seq, sh); |
@@ -2223,17 +2304,12 @@ static void raid6_quiesce(mddev_t *mddev, int state) | |||
2223 | spin_unlock_irq(&conf->device_lock); | 2304 | spin_unlock_irq(&conf->device_lock); |
2224 | break; | 2305 | break; |
2225 | } | 2306 | } |
2226 | if (mddev->thread) { | ||
2227 | if (mddev->bitmap) | ||
2228 | mddev->thread->timeout = mddev->bitmap->daemon_sleep * HZ; | ||
2229 | else | ||
2230 | mddev->thread->timeout = MAX_SCHEDULE_TIMEOUT; | ||
2231 | md_wakeup_thread(mddev->thread); | ||
2232 | } | ||
2233 | } | 2307 | } |
2234 | static mdk_personality_t raid6_personality= | 2308 | |
2309 | static struct mdk_personality raid6_personality = | ||
2235 | { | 2310 | { |
2236 | .name = "raid6", | 2311 | .name = "raid6", |
2312 | .level = 6, | ||
2237 | .owner = THIS_MODULE, | 2313 | .owner = THIS_MODULE, |
2238 | .make_request = make_request, | 2314 | .make_request = make_request, |
2239 | .run = run, | 2315 | .run = run, |
@@ -2248,7 +2324,7 @@ static mdk_personality_t raid6_personality= | |||
2248 | .quiesce = raid6_quiesce, | 2324 | .quiesce = raid6_quiesce, |
2249 | }; | 2325 | }; |
2250 | 2326 | ||
2251 | static int __init raid6_init (void) | 2327 | static int __init raid6_init(void) |
2252 | { | 2328 | { |
2253 | int e; | 2329 | int e; |
2254 | 2330 | ||
@@ -2256,15 +2332,17 @@ static int __init raid6_init (void) | |||
2256 | if ( e ) | 2332 | if ( e ) |
2257 | return e; | 2333 | return e; |
2258 | 2334 | ||
2259 | return register_md_personality (RAID6, &raid6_personality); | 2335 | return register_md_personality(&raid6_personality); |
2260 | } | 2336 | } |
2261 | 2337 | ||
2262 | static void raid6_exit (void) | 2338 | static void raid6_exit (void) |
2263 | { | 2339 | { |
2264 | unregister_md_personality (RAID6); | 2340 | unregister_md_personality(&raid6_personality); |
2265 | } | 2341 | } |
2266 | 2342 | ||
2267 | module_init(raid6_init); | 2343 | module_init(raid6_init); |
2268 | module_exit(raid6_exit); | 2344 | module_exit(raid6_exit); |
2269 | MODULE_LICENSE("GPL"); | 2345 | MODULE_LICENSE("GPL"); |
2270 | MODULE_ALIAS("md-personality-8"); /* RAID6 */ | 2346 | MODULE_ALIAS("md-personality-8"); /* RAID6 */ |
2347 | MODULE_ALIAS("md-raid6"); | ||
2348 | MODULE_ALIAS("md-level-6"); | ||