diff options
author | NeilBrown <neilb@cse.unsw.edu.au> | 2005-09-09 19:23:47 -0400 |
---|---|---|
committer | Linus Torvalds <torvalds@g5.osdl.org> | 2005-09-09 19:39:10 -0400 |
commit | 15945fee6f09bff1f86b1a735b5888dc59cf38e3 (patch) | |
tree | ed2f66ceccfa30867035e7ba7be46159e97e4e4d | |
parent | 4b6d287f627b5fb6a49f78f9e81649ff98c62bb7 (diff) |
[PATCH] md: support md/linear array with components greater than 2 terabytes.
linear currently uses division by the size of the smallest component device
to find which device a request goes to. If that smallest device is larger
than 2 terabytes, then the division will not work on some systems.
So we introduce a pre-shift, and take care not to make the hash table too
large, much like the code in raid0.
Also get rid of conf->nr_zones, which is not needed.
Signed-off-by: Neil Brown <neilb@cse.unsw.edu.au>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
-rw-r--r-- | drivers/md/linear.c | 95 | ||||
-rw-r--r-- | include/linux/raid/linear.h | 4 |
2 files changed, 68 insertions, 31 deletions
diff --git a/drivers/md/linear.c b/drivers/md/linear.c index 4991ba543368..bb279fad2fd2 100644 --- a/drivers/md/linear.c +++ b/drivers/md/linear.c | |||
@@ -38,7 +38,8 @@ static inline dev_info_t *which_dev(mddev_t *mddev, sector_t sector) | |||
38 | /* | 38 | /* |
39 | * sector_div(a,b) returns the remainder and sets a to a/b | 39 | * sector_div(a,b) returns the remainder and sets a to a/b |
40 | */ | 40 | */ |
41 | (void)sector_div(block, conf->smallest->size); | 41 | block >>= conf->preshift; |
42 | (void)sector_div(block, conf->hash_spacing); | ||
42 | hash = conf->hash_table[block]; | 43 | hash = conf->hash_table[block]; |
43 | 44 | ||
44 | while ((sector>>1) >= (hash->size + hash->offset)) | 45 | while ((sector>>1) >= (hash->size + hash->offset)) |
@@ -47,7 +48,7 @@ static inline dev_info_t *which_dev(mddev_t *mddev, sector_t sector) | |||
47 | } | 48 | } |
48 | 49 | ||
49 | /** | 50 | /** |
50 | * linear_mergeable_bvec -- tell bio layer if a two requests can be merged | 51 | * linear_mergeable_bvec -- tell bio layer if two requests can be merged |
51 | * @q: request queue | 52 | * @q: request queue |
52 | * @bio: the buffer head that's been built up so far | 53 | * @bio: the buffer head that's been built up so far |
53 | * @biovec: the request that could be merged to it. | 54 | * @biovec: the request that could be merged to it. |
@@ -116,7 +117,7 @@ static int linear_run (mddev_t *mddev) | |||
116 | dev_info_t **table; | 117 | dev_info_t **table; |
117 | mdk_rdev_t *rdev; | 118 | mdk_rdev_t *rdev; |
118 | int i, nb_zone, cnt; | 119 | int i, nb_zone, cnt; |
119 | sector_t start; | 120 | sector_t min_spacing; |
120 | sector_t curr_offset; | 121 | sector_t curr_offset; |
121 | struct list_head *tmp; | 122 | struct list_head *tmp; |
122 | 123 | ||
@@ -127,11 +128,6 @@ static int linear_run (mddev_t *mddev) | |||
127 | memset(conf, 0, sizeof(*conf) + mddev->raid_disks*sizeof(dev_info_t)); | 128 | memset(conf, 0, sizeof(*conf) + mddev->raid_disks*sizeof(dev_info_t)); |
128 | mddev->private = conf; | 129 | mddev->private = conf; |
129 | 130 | ||
130 | /* | ||
131 | * Find the smallest device. | ||
132 | */ | ||
133 | |||
134 | conf->smallest = NULL; | ||
135 | cnt = 0; | 131 | cnt = 0; |
136 | mddev->array_size = 0; | 132 | mddev->array_size = 0; |
137 | 133 | ||
@@ -159,8 +155,6 @@ static int linear_run (mddev_t *mddev) | |||
159 | disk->size = rdev->size; | 155 | disk->size = rdev->size; |
160 | mddev->array_size += rdev->size; | 156 | mddev->array_size += rdev->size; |
161 | 157 | ||
162 | if (!conf->smallest || (disk->size < conf->smallest->size)) | ||
163 | conf->smallest = disk; | ||
164 | cnt++; | 158 | cnt++; |
165 | } | 159 | } |
166 | if (cnt != mddev->raid_disks) { | 160 | if (cnt != mddev->raid_disks) { |
@@ -168,6 +162,36 @@ static int linear_run (mddev_t *mddev) | |||
168 | goto out; | 162 | goto out; |
169 | } | 163 | } |
170 | 164 | ||
165 | min_spacing = mddev->array_size; | ||
166 | sector_div(min_spacing, PAGE_SIZE/sizeof(struct dev_info *)); | ||
167 | |||
168 | /* min_spacing is the minimum spacing that will fit the hash | ||
169 | * table in one PAGE. This may be much smaller than needed. | ||
170 | * We find the smallest non-terminal set of consecutive devices | ||
171 | * that is larger than min_spacing and use the size of that as
172 | * the actual spacing | ||
173 | */ | ||
174 | conf->hash_spacing = mddev->array_size; | ||
175 | for (i=0; i < cnt-1 ; i++) { | ||
176 | sector_t sz = 0; | ||
177 | int j; | ||
178 | for (j=i; i<cnt-1 && sz < min_spacing ; j++) | ||
179 | sz += conf->disks[j].size; | ||
180 | if (sz >= min_spacing && sz < conf->hash_spacing) | ||
181 | conf->hash_spacing = sz; | ||
182 | } | ||
183 | |||
184 | /* hash_spacing may be too large for sector_div to work with, | ||
185 | * so we might need to pre-shift | ||
186 | */ | ||
187 | conf->preshift = 0; | ||
188 | if (sizeof(sector_t) > sizeof(u32)) { | ||
189 | sector_t space = conf->hash_spacing; | ||
190 | while (space > (sector_t)(~(u32)0)) { | ||
191 | space >>= 1; | ||
192 | conf->preshift++; | ||
193 | } | ||
194 | } | ||
171 | /* | 195 | /* |
172 | * This code was restructured to work around a gcc-2.95.3 internal | 196 | * This code was restructured to work around a gcc-2.95.3 internal |
173 | * compiler error. Alter it with care. | 197 | * compiler error. Alter it with care. |
@@ -177,39 +201,52 @@ static int linear_run (mddev_t *mddev) | |||
177 | unsigned round; | 201 | unsigned round; |
178 | unsigned long base; | 202 | unsigned long base; |
179 | 203 | ||
180 | sz = mddev->array_size; | 204 | sz = mddev->array_size >> conf->preshift; |
181 | base = conf->smallest->size; | 205 | sz += 1; /* force round-up */ |
206 | base = conf->hash_spacing >> conf->preshift; | ||
182 | round = sector_div(sz, base); | 207 | round = sector_div(sz, base); |
183 | nb_zone = conf->nr_zones = sz + (round ? 1 : 0); | 208 | nb_zone = sz + (round ? 1 : 0); |
184 | } | 209 | } |
185 | 210 | BUG_ON(nb_zone > PAGE_SIZE / sizeof(struct dev_info *)); | |
186 | conf->hash_table = kmalloc (sizeof (dev_info_t*) * nb_zone, | 211 | |
212 | conf->hash_table = kmalloc (sizeof (struct dev_info *) * nb_zone, | ||
187 | GFP_KERNEL); | 213 | GFP_KERNEL); |
188 | if (!conf->hash_table) | 214 | if (!conf->hash_table) |
189 | goto out; | 215 | goto out; |
190 | 216 | ||
191 | /* | 217 | /* |
192 | * Here we generate the linear hash table | 218 | * Here we generate the linear hash table |
219 | * First calculate the device offsets. | ||
193 | */ | 220 | */ |
221 | conf->disks[0].offset = 0; | ||
222 | for (i=1; i<mddev->raid_disks; i++) | ||
223 | conf->disks[i].offset = | ||
224 | conf->disks[i-1].offset + | ||
225 | conf->disks[i-1].size; | ||
226 | |||
194 | table = conf->hash_table; | 227 | table = conf->hash_table; |
195 | start = 0; | ||
196 | curr_offset = 0; | 228 | curr_offset = 0; |
197 | for (i = 0; i < cnt; i++) { | 229 | i = 0; |
198 | dev_info_t *disk = conf->disks + i; | 230 | for (curr_offset = 0; |
231 | curr_offset < mddev->array_size; | ||
232 | curr_offset += conf->hash_spacing) { | ||
199 | 233 | ||
200 | disk->offset = curr_offset; | 234 | while (i < mddev->raid_disks-1 && |
201 | curr_offset += disk->size; | 235 | curr_offset >= conf->disks[i+1].offset) |
236 | i++; | ||
202 | 237 | ||
203 | /* 'curr_offset' is the end of this disk | 238 | *table ++ = conf->disks + i; |
204 | * 'start' is the start of table | 239 | } |
240 | |||
241 | if (conf->preshift) { | ||
242 | conf->hash_spacing >>= conf->preshift; | ||
243 | /* round hash_spacing up so that when we divide by it, | ||
244 | * we err on the side of "too-low", which is safest. | ||
205 | */ | 245 | */ |
206 | while (start < curr_offset) { | 246 | conf->hash_spacing++; |
207 | *table++ = disk; | ||
208 | start += conf->smallest->size; | ||
209 | } | ||
210 | } | 247 | } |
211 | if (table-conf->hash_table != nb_zone) | 248 | |
212 | BUG(); | 249 | BUG_ON(table - conf->hash_table > nb_zone); |
213 | 250 | ||
214 | blk_queue_merge_bvec(mddev->queue, linear_mergeable_bvec); | 251 | blk_queue_merge_bvec(mddev->queue, linear_mergeable_bvec); |
215 | mddev->queue->unplug_fn = linear_unplug; | 252 | mddev->queue->unplug_fn = linear_unplug; |
@@ -299,7 +336,7 @@ static void linear_status (struct seq_file *seq, mddev_t *mddev) | |||
299 | sector_t s = 0; | 336 | sector_t s = 0; |
300 | 337 | ||
301 | seq_printf(seq, " "); | 338 | seq_printf(seq, " "); |
302 | for (j = 0; j < conf->nr_zones; j++) | 339 | for (j = 0; j < mddev->raid_disks; j++) |
303 | { | 340 | { |
304 | char b[BDEVNAME_SIZE]; | 341 | char b[BDEVNAME_SIZE]; |
305 | s += conf->smallest_size; | 342 | s += conf->smallest_size; |
diff --git a/include/linux/raid/linear.h b/include/linux/raid/linear.h index e04c4fe45b53..7eaf290e10e7 100644 --- a/include/linux/raid/linear.h +++ b/include/linux/raid/linear.h | |||
@@ -14,8 +14,8 @@ typedef struct dev_info dev_info_t; | |||
14 | struct linear_private_data | 14 | struct linear_private_data |
15 | { | 15 | { |
16 | dev_info_t **hash_table; | 16 | dev_info_t **hash_table; |
17 | dev_info_t *smallest; | 17 | sector_t hash_spacing; |
18 | int nr_zones; | 18 | int preshift; /* shift before dividing by hash_spacing */ |
19 | dev_info_t disks[0]; | 19 | dev_info_t disks[0]; |
20 | }; | 20 | }; |
21 | 21 | ||