diff options
| author | Philipp Reisner <philipp.reisner@linbit.com> | 2007-01-11 04:58:10 -0500 |
|---|---|---|
| committer | Mark Fasheh <mark.fasheh@oracle.com> | 2007-02-07 15:15:58 -0500 |
| commit | b559292e066f6d570cd5aa5dbd41de61dd04bdce (patch) | |
| tree | d8221b5f54ad9b8cef694de687013614dccf5966 /fs/ocfs2 | |
| parent | 925037bcba7691db2403684141a276930ad184f3 (diff) | |
[PATCH] ocfs2 heartbeat: clean up bio submission code
As was already pointed out Mathieu Avila on Thu, 07 Sep 2006 03:15:25 -0700
that OCFS2 is expecting bio_add_page() to add pages to BIOs in an easily
predictable manner.
That is not true, especially for devices with own merge_bvec_fn().
Therefore OCFS2's heartbeat code is very likely to fail on such devices.
Move the bio_put() call into the bio's bi_end_io() function. This makes the
whole idea of trying to predict the behaviour of bio_add_page() unnecessary.
Removed compute_max_sectors() and o2hb_compute_request_limits().
Signed-off-by: Philipp Reisner <philipp.reisner@linbit.com>
Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>
Diffstat (limited to 'fs/ocfs2')
| -rw-r--r-- | fs/ocfs2/cluster/heartbeat.c | 158 |
1 files changed, 31 insertions, 127 deletions
diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c index 277ca67a2ad6..5a9779bb9236 100644 --- a/fs/ocfs2/cluster/heartbeat.c +++ b/fs/ocfs2/cluster/heartbeat.c | |||
| @@ -184,10 +184,9 @@ static void o2hb_disarm_write_timeout(struct o2hb_region *reg) | |||
| 184 | flush_scheduled_work(); | 184 | flush_scheduled_work(); |
| 185 | } | 185 | } |
| 186 | 186 | ||
| 187 | static inline void o2hb_bio_wait_init(struct o2hb_bio_wait_ctxt *wc, | 187 | static inline void o2hb_bio_wait_init(struct o2hb_bio_wait_ctxt *wc) |
| 188 | unsigned int num_ios) | ||
| 189 | { | 188 | { |
| 190 | atomic_set(&wc->wc_num_reqs, num_ios); | 189 | atomic_set(&wc->wc_num_reqs, 1); |
| 191 | init_completion(&wc->wc_io_complete); | 190 | init_completion(&wc->wc_io_complete); |
| 192 | wc->wc_error = 0; | 191 | wc->wc_error = 0; |
| 193 | } | 192 | } |
| @@ -212,6 +211,7 @@ static void o2hb_wait_on_io(struct o2hb_region *reg, | |||
| 212 | struct address_space *mapping = reg->hr_bdev->bd_inode->i_mapping; | 211 | struct address_space *mapping = reg->hr_bdev->bd_inode->i_mapping; |
| 213 | 212 | ||
| 214 | blk_run_address_space(mapping); | 213 | blk_run_address_space(mapping); |
| 214 | o2hb_bio_wait_dec(wc, 1); | ||
| 215 | 215 | ||
| 216 | wait_for_completion(&wc->wc_io_complete); | 216 | wait_for_completion(&wc->wc_io_complete); |
| 217 | } | 217 | } |
| @@ -231,6 +231,7 @@ static int o2hb_bio_end_io(struct bio *bio, | |||
| 231 | return 1; | 231 | return 1; |
| 232 | 232 | ||
| 233 | o2hb_bio_wait_dec(wc, 1); | 233 | o2hb_bio_wait_dec(wc, 1); |
| 234 | bio_put(bio); | ||
| 234 | return 0; | 235 | return 0; |
| 235 | } | 236 | } |
| 236 | 237 | ||
| @@ -238,23 +239,22 @@ static int o2hb_bio_end_io(struct bio *bio, | |||
| 238 | * start_slot. */ | 239 | * start_slot. */ |
| 239 | static struct bio *o2hb_setup_one_bio(struct o2hb_region *reg, | 240 | static struct bio *o2hb_setup_one_bio(struct o2hb_region *reg, |
| 240 | struct o2hb_bio_wait_ctxt *wc, | 241 | struct o2hb_bio_wait_ctxt *wc, |
| 241 | unsigned int start_slot, | 242 | unsigned int *current_slot, |
| 242 | unsigned int num_slots) | 243 | unsigned int max_slots) |
| 243 | { | 244 | { |
| 244 | int i, nr_vecs, len, first_page, last_page; | 245 | int len, current_page; |
| 245 | unsigned int vec_len, vec_start; | 246 | unsigned int vec_len, vec_start; |
| 246 | unsigned int bits = reg->hr_block_bits; | 247 | unsigned int bits = reg->hr_block_bits; |
| 247 | unsigned int spp = reg->hr_slots_per_page; | 248 | unsigned int spp = reg->hr_slots_per_page; |
| 249 | unsigned int cs = *current_slot; | ||
| 248 | struct bio *bio; | 250 | struct bio *bio; |
| 249 | struct page *page; | 251 | struct page *page; |
| 250 | 252 | ||
| 251 | nr_vecs = (num_slots + spp - 1) / spp; | ||
| 252 | |||
| 253 | /* Testing has shown this allocation to take long enough under | 253 | /* Testing has shown this allocation to take long enough under |
| 254 | * GFP_KERNEL that the local node can get fenced. It would be | 254 | * GFP_KERNEL that the local node can get fenced. It would be |
| 255 | * nicest if we could pre-allocate these bios and avoid this | 255 | * nicest if we could pre-allocate these bios and avoid this |
| 256 | * all together. */ | 256 | * all together. */ |
| 257 | bio = bio_alloc(GFP_ATOMIC, nr_vecs); | 257 | bio = bio_alloc(GFP_ATOMIC, 16); |
| 258 | if (!bio) { | 258 | if (!bio) { |
| 259 | mlog(ML_ERROR, "Could not alloc slots BIO!\n"); | 259 | mlog(ML_ERROR, "Could not alloc slots BIO!\n"); |
| 260 | bio = ERR_PTR(-ENOMEM); | 260 | bio = ERR_PTR(-ENOMEM); |
| @@ -262,137 +262,53 @@ static struct bio *o2hb_setup_one_bio(struct o2hb_region *reg, | |||
| 262 | } | 262 | } |
| 263 | 263 | ||
| 264 | /* Must put everything in 512 byte sectors for the bio... */ | 264 | /* Must put everything in 512 byte sectors for the bio... */ |
| 265 | bio->bi_sector = (reg->hr_start_block + start_slot) << (bits - 9); | 265 | bio->bi_sector = (reg->hr_start_block + cs) << (bits - 9); |
| 266 | bio->bi_bdev = reg->hr_bdev; | 266 | bio->bi_bdev = reg->hr_bdev; |
| 267 | bio->bi_private = wc; | 267 | bio->bi_private = wc; |
| 268 | bio->bi_end_io = o2hb_bio_end_io; | 268 | bio->bi_end_io = o2hb_bio_end_io; |
| 269 | 269 | ||
| 270 | first_page = start_slot / spp; | 270 | vec_start = (cs << bits) % PAGE_CACHE_SIZE; |
| 271 | last_page = first_page + nr_vecs; | 271 | while(cs < max_slots) { |
| 272 | vec_start = (start_slot << bits) % PAGE_CACHE_SIZE; | 272 | current_page = cs / spp; |
| 273 | for(i = first_page; i < last_page; i++) { | 273 | page = reg->hr_slot_data[current_page]; |
| 274 | page = reg->hr_slot_data[i]; | ||
| 275 | 274 | ||
| 276 | vec_len = PAGE_CACHE_SIZE; | 275 | vec_len = min(PAGE_CACHE_SIZE, |
| 277 | /* last page might be short */ | 276 | (max_slots-cs) * (PAGE_CACHE_SIZE/spp) ); |
| 278 | if (((i + 1) * spp) > (start_slot + num_slots)) | ||
| 279 | vec_len = ((num_slots + start_slot) % spp) << bits; | ||
| 280 | vec_len -= vec_start; | ||
| 281 | 277 | ||
| 282 | mlog(ML_HB_BIO, "page %d, vec_len = %u, vec_start = %u\n", | 278 | mlog(ML_HB_BIO, "page %d, vec_len = %u, vec_start = %u\n", |
| 283 | i, vec_len, vec_start); | 279 | current_page, vec_len, vec_start); |
| 284 | 280 | ||
| 285 | len = bio_add_page(bio, page, vec_len, vec_start); | 281 | len = bio_add_page(bio, page, vec_len, vec_start); |
| 286 | if (len != vec_len) { | 282 | if (len != vec_len) break; |
| 287 | bio_put(bio); | ||
| 288 | bio = ERR_PTR(-EIO); | ||
| 289 | |||
| 290 | mlog(ML_ERROR, "Error adding page to bio i = %d, " | ||
| 291 | "vec_len = %u, len = %d\n, start = %u\n", | ||
| 292 | i, vec_len, len, vec_start); | ||
| 293 | goto bail; | ||
| 294 | } | ||
| 295 | 283 | ||
| 284 | cs += vec_len / (PAGE_CACHE_SIZE/spp); | ||
| 296 | vec_start = 0; | 285 | vec_start = 0; |
| 297 | } | 286 | } |
| 298 | 287 | ||
| 299 | bail: | 288 | bail: |
| 289 | *current_slot = cs; | ||
| 300 | return bio; | 290 | return bio; |
| 301 | } | 291 | } |
| 302 | 292 | ||
| 303 | /* | ||
| 304 | * Compute the maximum number of sectors the bdev can handle in one bio, | ||
| 305 | * as a power of two. | ||
| 306 | * | ||
| 307 | * Stolen from oracleasm, thanks Joel! | ||
| 308 | */ | ||
| 309 | static int compute_max_sectors(struct block_device *bdev) | ||
| 310 | { | ||
| 311 | int max_pages, max_sectors, pow_two_sectors; | ||
| 312 | |||
| 313 | struct request_queue *q; | ||
| 314 | |||
| 315 | q = bdev_get_queue(bdev); | ||
| 316 | max_pages = q->max_sectors >> (PAGE_SHIFT - 9); | ||
| 317 | if (max_pages > BIO_MAX_PAGES) | ||
| 318 | max_pages = BIO_MAX_PAGES; | ||
| 319 | if (max_pages > q->max_phys_segments) | ||
| 320 | max_pages = q->max_phys_segments; | ||
| 321 | if (max_pages > q->max_hw_segments) | ||
| 322 | max_pages = q->max_hw_segments; | ||
| 323 | max_pages--; /* Handle I/Os that straddle a page */ | ||
| 324 | |||
| 325 | if (max_pages) { | ||
| 326 | max_sectors = max_pages << (PAGE_SHIFT - 9); | ||
| 327 | } else { | ||
| 328 | /* If BIO contains 1 or less than 1 page. */ | ||
| 329 | max_sectors = q->max_sectors; | ||
| 330 | } | ||
| 331 | /* Why is fls() 1-based???? */ | ||
| 332 | pow_two_sectors = 1 << (fls(max_sectors) - 1); | ||
| 333 | |||
| 334 | return pow_two_sectors; | ||
| 335 | } | ||
| 336 | |||
| 337 | static inline void o2hb_compute_request_limits(struct o2hb_region *reg, | ||
| 338 | unsigned int num_slots, | ||
| 339 | unsigned int *num_bios, | ||
| 340 | unsigned int *slots_per_bio) | ||
| 341 | { | ||
| 342 | unsigned int max_sectors, io_sectors; | ||
| 343 | |||
| 344 | max_sectors = compute_max_sectors(reg->hr_bdev); | ||
| 345 | |||
| 346 | io_sectors = num_slots << (reg->hr_block_bits - 9); | ||
| 347 | |||
| 348 | *num_bios = (io_sectors + max_sectors - 1) / max_sectors; | ||
| 349 | *slots_per_bio = max_sectors >> (reg->hr_block_bits - 9); | ||
| 350 | |||
| 351 | mlog(ML_HB_BIO, "My io size is %u sectors for %u slots. This " | ||
| 352 | "device can handle %u sectors of I/O\n", io_sectors, num_slots, | ||
| 353 | max_sectors); | ||
| 354 | mlog(ML_HB_BIO, "Will need %u bios holding %u slots each\n", | ||
| 355 | *num_bios, *slots_per_bio); | ||
| 356 | } | ||
| 357 | |||
| 358 | static int o2hb_read_slots(struct o2hb_region *reg, | 293 | static int o2hb_read_slots(struct o2hb_region *reg, |
| 359 | unsigned int max_slots) | 294 | unsigned int max_slots) |
| 360 | { | 295 | { |
| 361 | unsigned int num_bios, slots_per_bio, start_slot, num_slots; | 296 | unsigned int current_slot=0; |
| 362 | int i, status; | 297 | int status; |
| 363 | struct o2hb_bio_wait_ctxt wc; | 298 | struct o2hb_bio_wait_ctxt wc; |
| 364 | struct bio **bios; | ||
| 365 | struct bio *bio; | 299 | struct bio *bio; |
| 366 | 300 | ||
| 367 | o2hb_compute_request_limits(reg, max_slots, &num_bios, &slots_per_bio); | 301 | o2hb_bio_wait_init(&wc); |
| 368 | 302 | ||
| 369 | bios = kcalloc(num_bios, sizeof(struct bio *), GFP_KERNEL); | 303 | while(current_slot < max_slots) { |
| 370 | if (!bios) { | 304 | bio = o2hb_setup_one_bio(reg, &wc, ¤t_slot, max_slots); |
| 371 | status = -ENOMEM; | ||
| 372 | mlog_errno(status); | ||
| 373 | return status; | ||
| 374 | } | ||
| 375 | |||
| 376 | o2hb_bio_wait_init(&wc, num_bios); | ||
| 377 | |||
| 378 | num_slots = slots_per_bio; | ||
| 379 | for(i = 0; i < num_bios; i++) { | ||
| 380 | start_slot = i * slots_per_bio; | ||
| 381 | |||
| 382 | /* adjust num_slots at last bio */ | ||
| 383 | if (max_slots < (start_slot + num_slots)) | ||
| 384 | num_slots = max_slots - start_slot; | ||
| 385 | |||
| 386 | bio = o2hb_setup_one_bio(reg, &wc, start_slot, num_slots); | ||
| 387 | if (IS_ERR(bio)) { | 305 | if (IS_ERR(bio)) { |
| 388 | o2hb_bio_wait_dec(&wc, num_bios - i); | ||
| 389 | |||
| 390 | status = PTR_ERR(bio); | 306 | status = PTR_ERR(bio); |
| 391 | mlog_errno(status); | 307 | mlog_errno(status); |
| 392 | goto bail_and_wait; | 308 | goto bail_and_wait; |
| 393 | } | 309 | } |
| 394 | bios[i] = bio; | ||
| 395 | 310 | ||
| 311 | atomic_inc(&wc.wc_num_reqs); | ||
| 396 | submit_bio(READ, bio); | 312 | submit_bio(READ, bio); |
| 397 | } | 313 | } |
| 398 | 314 | ||
| @@ -403,38 +319,30 @@ bail_and_wait: | |||
| 403 | if (wc.wc_error && !status) | 319 | if (wc.wc_error && !status) |
| 404 | status = wc.wc_error; | 320 | status = wc.wc_error; |
| 405 | 321 | ||
| 406 | if (bios) { | ||
| 407 | for(i = 0; i < num_bios; i++) | ||
| 408 | if (bios[i]) | ||
| 409 | bio_put(bios[i]); | ||
| 410 | kfree(bios); | ||
| 411 | } | ||
| 412 | |||
| 413 | return status; | 322 | return status; |
| 414 | } | 323 | } |
| 415 | 324 | ||
| 416 | static int o2hb_issue_node_write(struct o2hb_region *reg, | 325 | static int o2hb_issue_node_write(struct o2hb_region *reg, |
| 417 | struct bio **write_bio, | ||
| 418 | struct o2hb_bio_wait_ctxt *write_wc) | 326 | struct o2hb_bio_wait_ctxt *write_wc) |
| 419 | { | 327 | { |
| 420 | int status; | 328 | int status; |
| 421 | unsigned int slot; | 329 | unsigned int slot; |
| 422 | struct bio *bio; | 330 | struct bio *bio; |
| 423 | 331 | ||
| 424 | o2hb_bio_wait_init(write_wc, 1); | 332 | o2hb_bio_wait_init(write_wc); |
| 425 | 333 | ||
| 426 | slot = o2nm_this_node(); | 334 | slot = o2nm_this_node(); |
| 427 | 335 | ||
| 428 | bio = o2hb_setup_one_bio(reg, write_wc, slot, 1); | 336 | bio = o2hb_setup_one_bio(reg, write_wc, &slot, slot+1); |
| 429 | if (IS_ERR(bio)) { | 337 | if (IS_ERR(bio)) { |
| 430 | status = PTR_ERR(bio); | 338 | status = PTR_ERR(bio); |
| 431 | mlog_errno(status); | 339 | mlog_errno(status); |
| 432 | goto bail; | 340 | goto bail; |
| 433 | } | 341 | } |
| 434 | 342 | ||
| 343 | atomic_inc(&write_wc->wc_num_reqs); | ||
| 435 | submit_bio(WRITE, bio); | 344 | submit_bio(WRITE, bio); |
| 436 | 345 | ||
| 437 | *write_bio = bio; | ||
| 438 | status = 0; | 346 | status = 0; |
| 439 | bail: | 347 | bail: |
| 440 | return status; | 348 | return status; |
| @@ -826,7 +734,6 @@ static int o2hb_do_disk_heartbeat(struct o2hb_region *reg) | |||
| 826 | { | 734 | { |
| 827 | int i, ret, highest_node, change = 0; | 735 | int i, ret, highest_node, change = 0; |
| 828 | unsigned long configured_nodes[BITS_TO_LONGS(O2NM_MAX_NODES)]; | 736 | unsigned long configured_nodes[BITS_TO_LONGS(O2NM_MAX_NODES)]; |
| 829 | struct bio *write_bio; | ||
| 830 | struct o2hb_bio_wait_ctxt write_wc; | 737 | struct o2hb_bio_wait_ctxt write_wc; |
| 831 | 738 | ||
| 832 | ret = o2nm_configured_node_map(configured_nodes, | 739 | ret = o2nm_configured_node_map(configured_nodes, |
| @@ -864,7 +771,7 @@ static int o2hb_do_disk_heartbeat(struct o2hb_region *reg) | |||
| 864 | 771 | ||
| 865 | /* And fire off the write. Note that we don't wait on this I/O | 772 | /* And fire off the write. Note that we don't wait on this I/O |
| 866 | * until later. */ | 773 | * until later. */ |
| 867 | ret = o2hb_issue_node_write(reg, &write_bio, &write_wc); | 774 | ret = o2hb_issue_node_write(reg, &write_wc); |
| 868 | if (ret < 0) { | 775 | if (ret < 0) { |
| 869 | mlog_errno(ret); | 776 | mlog_errno(ret); |
| 870 | return ret; | 777 | return ret; |
| @@ -882,7 +789,6 @@ static int o2hb_do_disk_heartbeat(struct o2hb_region *reg) | |||
| 882 | * people we find in our steady state have seen us. | 789 | * people we find in our steady state have seen us. |
| 883 | */ | 790 | */ |
| 884 | o2hb_wait_on_io(reg, &write_wc); | 791 | o2hb_wait_on_io(reg, &write_wc); |
| 885 | bio_put(write_bio); | ||
| 886 | if (write_wc.wc_error) { | 792 | if (write_wc.wc_error) { |
| 887 | /* Do not re-arm the write timeout on I/O error - we | 793 | /* Do not re-arm the write timeout on I/O error - we |
| 888 | * can't be sure that the new block ever made it to | 794 | * can't be sure that the new block ever made it to |
| @@ -943,7 +849,6 @@ static int o2hb_thread(void *data) | |||
| 943 | { | 849 | { |
| 944 | int i, ret; | 850 | int i, ret; |
| 945 | struct o2hb_region *reg = data; | 851 | struct o2hb_region *reg = data; |
| 946 | struct bio *write_bio; | ||
| 947 | struct o2hb_bio_wait_ctxt write_wc; | 852 | struct o2hb_bio_wait_ctxt write_wc; |
| 948 | struct timeval before_hb, after_hb; | 853 | struct timeval before_hb, after_hb; |
| 949 | unsigned int elapsed_msec; | 854 | unsigned int elapsed_msec; |
| @@ -993,10 +898,9 @@ static int o2hb_thread(void *data) | |||
| 993 | * | 898 | * |
| 994 | * XXX: Should we skip this on unclean_stop? */ | 899 | * XXX: Should we skip this on unclean_stop? */ |
| 995 | o2hb_prepare_block(reg, 0); | 900 | o2hb_prepare_block(reg, 0); |
| 996 | ret = o2hb_issue_node_write(reg, &write_bio, &write_wc); | 901 | ret = o2hb_issue_node_write(reg, &write_wc); |
| 997 | if (ret == 0) { | 902 | if (ret == 0) { |
| 998 | o2hb_wait_on_io(reg, &write_wc); | 903 | o2hb_wait_on_io(reg, &write_wc); |
| 999 | bio_put(write_bio); | ||
| 1000 | } else { | 904 | } else { |
| 1001 | mlog_errno(ret); | 905 | mlog_errno(ret); |
| 1002 | } | 906 | } |
