aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorPhilipp Reisner <philipp.reisner@linbit.com>2007-01-11 04:58:10 -0500
committerMark Fasheh <mark.fasheh@oracle.com>2007-02-07 15:15:58 -0500
commitb559292e066f6d570cd5aa5dbd41de61dd04bdce (patch)
treed8221b5f54ad9b8cef694de687013614dccf5966
parent925037bcba7691db2403684141a276930ad184f3 (diff)
[PATCH] ocfs2 heartbeat: clean up bio submission code
As was already pointed out Mathieu Avila on Thu, 07 Sep 2006 03:15:25 -0700 that OCFS2 is expecting bio_add_page() to add pages to BIOs in an easily predictable manner. That is not true, especially for devices with own merge_bvec_fn(). Therefore OCFS2's heartbeat code is very likely to fail on such devices. Move the bio_put() call into the bio's bi_end_io() function. This makes the whole idea of trying to predict the behaviour of bio_add_page() unnecessary. Removed compute_max_sectors() and o2hb_compute_request_limits(). Signed-off-by: Philipp Reisner <philipp.reisner@linbit.com> Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>
-rw-r--r--fs/ocfs2/cluster/heartbeat.c158
1 files changed, 31 insertions, 127 deletions
diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c
index 277ca67a2ad6..5a9779bb9236 100644
--- a/fs/ocfs2/cluster/heartbeat.c
+++ b/fs/ocfs2/cluster/heartbeat.c
@@ -184,10 +184,9 @@ static void o2hb_disarm_write_timeout(struct o2hb_region *reg)
184 flush_scheduled_work(); 184 flush_scheduled_work();
185} 185}
186 186
187static inline void o2hb_bio_wait_init(struct o2hb_bio_wait_ctxt *wc, 187static inline void o2hb_bio_wait_init(struct o2hb_bio_wait_ctxt *wc)
188 unsigned int num_ios)
189{ 188{
190 atomic_set(&wc->wc_num_reqs, num_ios); 189 atomic_set(&wc->wc_num_reqs, 1);
191 init_completion(&wc->wc_io_complete); 190 init_completion(&wc->wc_io_complete);
192 wc->wc_error = 0; 191 wc->wc_error = 0;
193} 192}
@@ -212,6 +211,7 @@ static void o2hb_wait_on_io(struct o2hb_region *reg,
212 struct address_space *mapping = reg->hr_bdev->bd_inode->i_mapping; 211 struct address_space *mapping = reg->hr_bdev->bd_inode->i_mapping;
213 212
214 blk_run_address_space(mapping); 213 blk_run_address_space(mapping);
214 o2hb_bio_wait_dec(wc, 1);
215 215
216 wait_for_completion(&wc->wc_io_complete); 216 wait_for_completion(&wc->wc_io_complete);
217} 217}
@@ -231,6 +231,7 @@ static int o2hb_bio_end_io(struct bio *bio,
231 return 1; 231 return 1;
232 232
233 o2hb_bio_wait_dec(wc, 1); 233 o2hb_bio_wait_dec(wc, 1);
234 bio_put(bio);
234 return 0; 235 return 0;
235} 236}
236 237
@@ -238,23 +239,22 @@ static int o2hb_bio_end_io(struct bio *bio,
238 * start_slot. */ 239 * start_slot. */
239static struct bio *o2hb_setup_one_bio(struct o2hb_region *reg, 240static struct bio *o2hb_setup_one_bio(struct o2hb_region *reg,
240 struct o2hb_bio_wait_ctxt *wc, 241 struct o2hb_bio_wait_ctxt *wc,
241 unsigned int start_slot, 242 unsigned int *current_slot,
242 unsigned int num_slots) 243 unsigned int max_slots)
243{ 244{
244 int i, nr_vecs, len, first_page, last_page; 245 int len, current_page;
245 unsigned int vec_len, vec_start; 246 unsigned int vec_len, vec_start;
246 unsigned int bits = reg->hr_block_bits; 247 unsigned int bits = reg->hr_block_bits;
247 unsigned int spp = reg->hr_slots_per_page; 248 unsigned int spp = reg->hr_slots_per_page;
249 unsigned int cs = *current_slot;
248 struct bio *bio; 250 struct bio *bio;
249 struct page *page; 251 struct page *page;
250 252
251 nr_vecs = (num_slots + spp - 1) / spp;
252
253 /* Testing has shown this allocation to take long enough under 253 /* Testing has shown this allocation to take long enough under
254 * GFP_KERNEL that the local node can get fenced. It would be 254 * GFP_KERNEL that the local node can get fenced. It would be
255 * nicest if we could pre-allocate these bios and avoid this 255 * nicest if we could pre-allocate these bios and avoid this
256 * all together. */ 256 * all together. */
257 bio = bio_alloc(GFP_ATOMIC, nr_vecs); 257 bio = bio_alloc(GFP_ATOMIC, 16);
258 if (!bio) { 258 if (!bio) {
259 mlog(ML_ERROR, "Could not alloc slots BIO!\n"); 259 mlog(ML_ERROR, "Could not alloc slots BIO!\n");
260 bio = ERR_PTR(-ENOMEM); 260 bio = ERR_PTR(-ENOMEM);
@@ -262,137 +262,53 @@ static struct bio *o2hb_setup_one_bio(struct o2hb_region *reg,
262 } 262 }
263 263
264 /* Must put everything in 512 byte sectors for the bio... */ 264 /* Must put everything in 512 byte sectors for the bio... */
265 bio->bi_sector = (reg->hr_start_block + start_slot) << (bits - 9); 265 bio->bi_sector = (reg->hr_start_block + cs) << (bits - 9);
266 bio->bi_bdev = reg->hr_bdev; 266 bio->bi_bdev = reg->hr_bdev;
267 bio->bi_private = wc; 267 bio->bi_private = wc;
268 bio->bi_end_io = o2hb_bio_end_io; 268 bio->bi_end_io = o2hb_bio_end_io;
269 269
270 first_page = start_slot / spp; 270 vec_start = (cs << bits) % PAGE_CACHE_SIZE;
271 last_page = first_page + nr_vecs; 271 while(cs < max_slots) {
272 vec_start = (start_slot << bits) % PAGE_CACHE_SIZE; 272 current_page = cs / spp;
273 for(i = first_page; i < last_page; i++) { 273 page = reg->hr_slot_data[current_page];
274 page = reg->hr_slot_data[i];
275 274
276 vec_len = PAGE_CACHE_SIZE; 275 vec_len = min(PAGE_CACHE_SIZE,
277 /* last page might be short */ 276 (max_slots-cs) * (PAGE_CACHE_SIZE/spp) );
278 if (((i + 1) * spp) > (start_slot + num_slots))
279 vec_len = ((num_slots + start_slot) % spp) << bits;
280 vec_len -= vec_start;
281 277
282 mlog(ML_HB_BIO, "page %d, vec_len = %u, vec_start = %u\n", 278 mlog(ML_HB_BIO, "page %d, vec_len = %u, vec_start = %u\n",
283 i, vec_len, vec_start); 279 current_page, vec_len, vec_start);
284 280
285 len = bio_add_page(bio, page, vec_len, vec_start); 281 len = bio_add_page(bio, page, vec_len, vec_start);
286 if (len != vec_len) { 282 if (len != vec_len) break;
287 bio_put(bio);
288 bio = ERR_PTR(-EIO);
289
290 mlog(ML_ERROR, "Error adding page to bio i = %d, "
291 "vec_len = %u, len = %d\n, start = %u\n",
292 i, vec_len, len, vec_start);
293 goto bail;
294 }
295 283
284 cs += vec_len / (PAGE_CACHE_SIZE/spp);
296 vec_start = 0; 285 vec_start = 0;
297 } 286 }
298 287
299bail: 288bail:
289 *current_slot = cs;
300 return bio; 290 return bio;
301} 291}
302 292
303/*
304 * Compute the maximum number of sectors the bdev can handle in one bio,
305 * as a power of two.
306 *
307 * Stolen from oracleasm, thanks Joel!
308 */
309static int compute_max_sectors(struct block_device *bdev)
310{
311 int max_pages, max_sectors, pow_two_sectors;
312
313 struct request_queue *q;
314
315 q = bdev_get_queue(bdev);
316 max_pages = q->max_sectors >> (PAGE_SHIFT - 9);
317 if (max_pages > BIO_MAX_PAGES)
318 max_pages = BIO_MAX_PAGES;
319 if (max_pages > q->max_phys_segments)
320 max_pages = q->max_phys_segments;
321 if (max_pages > q->max_hw_segments)
322 max_pages = q->max_hw_segments;
323 max_pages--; /* Handle I/Os that straddle a page */
324
325 if (max_pages) {
326 max_sectors = max_pages << (PAGE_SHIFT - 9);
327 } else {
328 /* If BIO contains 1 or less than 1 page. */
329 max_sectors = q->max_sectors;
330 }
331 /* Why is fls() 1-based???? */
332 pow_two_sectors = 1 << (fls(max_sectors) - 1);
333
334 return pow_two_sectors;
335}
336
337static inline void o2hb_compute_request_limits(struct o2hb_region *reg,
338 unsigned int num_slots,
339 unsigned int *num_bios,
340 unsigned int *slots_per_bio)
341{
342 unsigned int max_sectors, io_sectors;
343
344 max_sectors = compute_max_sectors(reg->hr_bdev);
345
346 io_sectors = num_slots << (reg->hr_block_bits - 9);
347
348 *num_bios = (io_sectors + max_sectors - 1) / max_sectors;
349 *slots_per_bio = max_sectors >> (reg->hr_block_bits - 9);
350
351 mlog(ML_HB_BIO, "My io size is %u sectors for %u slots. This "
352 "device can handle %u sectors of I/O\n", io_sectors, num_slots,
353 max_sectors);
354 mlog(ML_HB_BIO, "Will need %u bios holding %u slots each\n",
355 *num_bios, *slots_per_bio);
356}
357
358static int o2hb_read_slots(struct o2hb_region *reg, 293static int o2hb_read_slots(struct o2hb_region *reg,
359 unsigned int max_slots) 294 unsigned int max_slots)
360{ 295{
361 unsigned int num_bios, slots_per_bio, start_slot, num_slots; 296 unsigned int current_slot=0;
362 int i, status; 297 int status;
363 struct o2hb_bio_wait_ctxt wc; 298 struct o2hb_bio_wait_ctxt wc;
364 struct bio **bios;
365 struct bio *bio; 299 struct bio *bio;
366 300
367 o2hb_compute_request_limits(reg, max_slots, &num_bios, &slots_per_bio); 301 o2hb_bio_wait_init(&wc);
368 302
369 bios = kcalloc(num_bios, sizeof(struct bio *), GFP_KERNEL); 303 while(current_slot < max_slots) {
370 if (!bios) { 304 bio = o2hb_setup_one_bio(reg, &wc, &current_slot, max_slots);
371 status = -ENOMEM;
372 mlog_errno(status);
373 return status;
374 }
375
376 o2hb_bio_wait_init(&wc, num_bios);
377
378 num_slots = slots_per_bio;
379 for(i = 0; i < num_bios; i++) {
380 start_slot = i * slots_per_bio;
381
382 /* adjust num_slots at last bio */
383 if (max_slots < (start_slot + num_slots))
384 num_slots = max_slots - start_slot;
385
386 bio = o2hb_setup_one_bio(reg, &wc, start_slot, num_slots);
387 if (IS_ERR(bio)) { 305 if (IS_ERR(bio)) {
388 o2hb_bio_wait_dec(&wc, num_bios - i);
389
390 status = PTR_ERR(bio); 306 status = PTR_ERR(bio);
391 mlog_errno(status); 307 mlog_errno(status);
392 goto bail_and_wait; 308 goto bail_and_wait;
393 } 309 }
394 bios[i] = bio;
395 310
311 atomic_inc(&wc.wc_num_reqs);
396 submit_bio(READ, bio); 312 submit_bio(READ, bio);
397 } 313 }
398 314
@@ -403,38 +319,30 @@ bail_and_wait:
403 if (wc.wc_error && !status) 319 if (wc.wc_error && !status)
404 status = wc.wc_error; 320 status = wc.wc_error;
405 321
406 if (bios) {
407 for(i = 0; i < num_bios; i++)
408 if (bios[i])
409 bio_put(bios[i]);
410 kfree(bios);
411 }
412
413 return status; 322 return status;
414} 323}
415 324
416static int o2hb_issue_node_write(struct o2hb_region *reg, 325static int o2hb_issue_node_write(struct o2hb_region *reg,
417 struct bio **write_bio,
418 struct o2hb_bio_wait_ctxt *write_wc) 326 struct o2hb_bio_wait_ctxt *write_wc)
419{ 327{
420 int status; 328 int status;
421 unsigned int slot; 329 unsigned int slot;
422 struct bio *bio; 330 struct bio *bio;
423 331
424 o2hb_bio_wait_init(write_wc, 1); 332 o2hb_bio_wait_init(write_wc);
425 333
426 slot = o2nm_this_node(); 334 slot = o2nm_this_node();
427 335
428 bio = o2hb_setup_one_bio(reg, write_wc, slot, 1); 336 bio = o2hb_setup_one_bio(reg, write_wc, &slot, slot+1);
429 if (IS_ERR(bio)) { 337 if (IS_ERR(bio)) {
430 status = PTR_ERR(bio); 338 status = PTR_ERR(bio);
431 mlog_errno(status); 339 mlog_errno(status);
432 goto bail; 340 goto bail;
433 } 341 }
434 342
343 atomic_inc(&write_wc->wc_num_reqs);
435 submit_bio(WRITE, bio); 344 submit_bio(WRITE, bio);
436 345
437 *write_bio = bio;
438 status = 0; 346 status = 0;
439bail: 347bail:
440 return status; 348 return status;
@@ -826,7 +734,6 @@ static int o2hb_do_disk_heartbeat(struct o2hb_region *reg)
826{ 734{
827 int i, ret, highest_node, change = 0; 735 int i, ret, highest_node, change = 0;
828 unsigned long configured_nodes[BITS_TO_LONGS(O2NM_MAX_NODES)]; 736 unsigned long configured_nodes[BITS_TO_LONGS(O2NM_MAX_NODES)];
829 struct bio *write_bio;
830 struct o2hb_bio_wait_ctxt write_wc; 737 struct o2hb_bio_wait_ctxt write_wc;
831 738
832 ret = o2nm_configured_node_map(configured_nodes, 739 ret = o2nm_configured_node_map(configured_nodes,
@@ -864,7 +771,7 @@ static int o2hb_do_disk_heartbeat(struct o2hb_region *reg)
864 771
865 /* And fire off the write. Note that we don't wait on this I/O 772 /* And fire off the write. Note that we don't wait on this I/O
866 * until later. */ 773 * until later. */
867 ret = o2hb_issue_node_write(reg, &write_bio, &write_wc); 774 ret = o2hb_issue_node_write(reg, &write_wc);
868 if (ret < 0) { 775 if (ret < 0) {
869 mlog_errno(ret); 776 mlog_errno(ret);
870 return ret; 777 return ret;
@@ -882,7 +789,6 @@ static int o2hb_do_disk_heartbeat(struct o2hb_region *reg)
882 * people we find in our steady state have seen us. 789 * people we find in our steady state have seen us.
883 */ 790 */
884 o2hb_wait_on_io(reg, &write_wc); 791 o2hb_wait_on_io(reg, &write_wc);
885 bio_put(write_bio);
886 if (write_wc.wc_error) { 792 if (write_wc.wc_error) {
887 /* Do not re-arm the write timeout on I/O error - we 793 /* Do not re-arm the write timeout on I/O error - we
888 * can't be sure that the new block ever made it to 794 * can't be sure that the new block ever made it to
@@ -943,7 +849,6 @@ static int o2hb_thread(void *data)
943{ 849{
944 int i, ret; 850 int i, ret;
945 struct o2hb_region *reg = data; 851 struct o2hb_region *reg = data;
946 struct bio *write_bio;
947 struct o2hb_bio_wait_ctxt write_wc; 852 struct o2hb_bio_wait_ctxt write_wc;
948 struct timeval before_hb, after_hb; 853 struct timeval before_hb, after_hb;
949 unsigned int elapsed_msec; 854 unsigned int elapsed_msec;
@@ -993,10 +898,9 @@ static int o2hb_thread(void *data)
993 * 898 *
994 * XXX: Should we skip this on unclean_stop? */ 899 * XXX: Should we skip this on unclean_stop? */
995 o2hb_prepare_block(reg, 0); 900 o2hb_prepare_block(reg, 0);
996 ret = o2hb_issue_node_write(reg, &write_bio, &write_wc); 901 ret = o2hb_issue_node_write(reg, &write_wc);
997 if (ret == 0) { 902 if (ret == 0) {
998 o2hb_wait_on_io(reg, &write_wc); 903 o2hb_wait_on_io(reg, &write_wc);
999 bio_put(write_bio);
1000 } else { 904 } else {
1001 mlog_errno(ret); 905 mlog_errno(ret);
1002 } 906 }