diff options
Diffstat (limited to 'drivers/block/drbd/drbd_actlog.c')
-rw-r--r-- | drivers/block/drbd/drbd_actlog.c | 246 |
1 files changed, 188 insertions, 58 deletions
diff --git a/drivers/block/drbd/drbd_actlog.c b/drivers/block/drbd/drbd_actlog.c index 92510f8ad013..6608076dc39e 100644 --- a/drivers/block/drbd/drbd_actlog.c +++ b/drivers/block/drbd/drbd_actlog.c | |||
@@ -104,7 +104,6 @@ struct update_al_work { | |||
104 | int err; | 104 | int err; |
105 | }; | 105 | }; |
106 | 106 | ||
107 | static int al_write_transaction(struct drbd_conf *mdev); | ||
108 | 107 | ||
109 | void *drbd_md_get_buffer(struct drbd_conf *mdev) | 108 | void *drbd_md_get_buffer(struct drbd_conf *mdev) |
110 | { | 109 | { |
@@ -168,7 +167,11 @@ static int _drbd_md_sync_page_io(struct drbd_conf *mdev, | |||
168 | bio->bi_end_io = drbd_md_io_complete; | 167 | bio->bi_end_io = drbd_md_io_complete; |
169 | bio->bi_rw = rw; | 168 | bio->bi_rw = rw; |
170 | 169 | ||
171 | if (!get_ldev_if_state(mdev, D_ATTACHING)) { /* Corresponding put_ldev in drbd_md_io_complete() */ | 170 | if (!(rw & WRITE) && mdev->state.disk == D_DISKLESS && mdev->ldev == NULL) |
171 | /* special case, drbd_md_read() during drbd_adm_attach(): no get_ldev */ | ||
172 | ; | ||
173 | else if (!get_ldev_if_state(mdev, D_ATTACHING)) { | ||
174 | /* Corresponding put_ldev in drbd_md_io_complete() */ | ||
172 | dev_err(DEV, "ASSERT FAILED: get_ldev_if_state() == 1 in _drbd_md_sync_page_io()\n"); | 175 | dev_err(DEV, "ASSERT FAILED: get_ldev_if_state() == 1 in _drbd_md_sync_page_io()\n"); |
173 | err = -ENODEV; | 176 | err = -ENODEV; |
174 | goto out; | 177 | goto out; |
@@ -199,9 +202,10 @@ int drbd_md_sync_page_io(struct drbd_conf *mdev, struct drbd_backing_dev *bdev, | |||
199 | 202 | ||
200 | BUG_ON(!bdev->md_bdev); | 203 | BUG_ON(!bdev->md_bdev); |
201 | 204 | ||
202 | dev_dbg(DEV, "meta_data io: %s [%d]:%s(,%llus,%s)\n", | 205 | dev_dbg(DEV, "meta_data io: %s [%d]:%s(,%llus,%s) %pS\n", |
203 | current->comm, current->pid, __func__, | 206 | current->comm, current->pid, __func__, |
204 | (unsigned long long)sector, (rw & WRITE) ? "WRITE" : "READ"); | 207 | (unsigned long long)sector, (rw & WRITE) ? "WRITE" : "READ", |
208 | (void*)_RET_IP_ ); | ||
205 | 209 | ||
206 | if (sector < drbd_md_first_sector(bdev) || | 210 | if (sector < drbd_md_first_sector(bdev) || |
207 | sector + 7 > drbd_md_last_sector(bdev)) | 211 | sector + 7 > drbd_md_last_sector(bdev)) |
@@ -209,7 +213,8 @@ int drbd_md_sync_page_io(struct drbd_conf *mdev, struct drbd_backing_dev *bdev, | |||
209 | current->comm, current->pid, __func__, | 213 | current->comm, current->pid, __func__, |
210 | (unsigned long long)sector, (rw & WRITE) ? "WRITE" : "READ"); | 214 | (unsigned long long)sector, (rw & WRITE) ? "WRITE" : "READ"); |
211 | 215 | ||
212 | err = _drbd_md_sync_page_io(mdev, bdev, iop, sector, rw, MD_BLOCK_SIZE); | 216 | /* we do all our meta data IO in aligned 4k blocks. */ |
217 | err = _drbd_md_sync_page_io(mdev, bdev, iop, sector, rw, 4096); | ||
213 | if (err) { | 218 | if (err) { |
214 | dev_err(DEV, "drbd_md_sync_page_io(,%llus,%s) failed with error %d\n", | 219 | dev_err(DEV, "drbd_md_sync_page_io(,%llus,%s) failed with error %d\n", |
215 | (unsigned long long)sector, (rw & WRITE) ? "WRITE" : "READ", err); | 220 | (unsigned long long)sector, (rw & WRITE) ? "WRITE" : "READ", err); |
@@ -217,44 +222,99 @@ int drbd_md_sync_page_io(struct drbd_conf *mdev, struct drbd_backing_dev *bdev, | |||
217 | return err; | 222 | return err; |
218 | } | 223 | } |
219 | 224 | ||
220 | static struct lc_element *_al_get(struct drbd_conf *mdev, unsigned int enr) | 225 | static struct bm_extent *find_active_resync_extent(struct drbd_conf *mdev, unsigned int enr) |
221 | { | 226 | { |
222 | struct lc_element *al_ext; | ||
223 | struct lc_element *tmp; | 227 | struct lc_element *tmp; |
224 | int wake; | ||
225 | |||
226 | spin_lock_irq(&mdev->al_lock); | ||
227 | tmp = lc_find(mdev->resync, enr/AL_EXT_PER_BM_SECT); | 228 | tmp = lc_find(mdev->resync, enr/AL_EXT_PER_BM_SECT); |
228 | if (unlikely(tmp != NULL)) { | 229 | if (unlikely(tmp != NULL)) { |
229 | struct bm_extent *bm_ext = lc_entry(tmp, struct bm_extent, lce); | 230 | struct bm_extent *bm_ext = lc_entry(tmp, struct bm_extent, lce); |
230 | if (test_bit(BME_NO_WRITES, &bm_ext->flags)) { | 231 | if (test_bit(BME_NO_WRITES, &bm_ext->flags)) |
231 | wake = !test_and_set_bit(BME_PRIORITY, &bm_ext->flags); | 232 | return bm_ext; |
232 | spin_unlock_irq(&mdev->al_lock); | 233 | } |
233 | if (wake) | 234 | return NULL; |
234 | wake_up(&mdev->al_wait); | 235 | } |
235 | return NULL; | 236 | |
236 | } | 237 | static struct lc_element *_al_get(struct drbd_conf *mdev, unsigned int enr, bool nonblock) |
238 | { | ||
239 | struct lc_element *al_ext; | ||
240 | struct bm_extent *bm_ext; | ||
241 | int wake; | ||
242 | |||
243 | spin_lock_irq(&mdev->al_lock); | ||
244 | bm_ext = find_active_resync_extent(mdev, enr); | ||
245 | if (bm_ext) { | ||
246 | wake = !test_and_set_bit(BME_PRIORITY, &bm_ext->flags); | ||
247 | spin_unlock_irq(&mdev->al_lock); | ||
248 | if (wake) | ||
249 | wake_up(&mdev->al_wait); | ||
250 | return NULL; | ||
237 | } | 251 | } |
238 | al_ext = lc_get(mdev->act_log, enr); | 252 | if (nonblock) |
253 | al_ext = lc_try_get(mdev->act_log, enr); | ||
254 | else | ||
255 | al_ext = lc_get(mdev->act_log, enr); | ||
239 | spin_unlock_irq(&mdev->al_lock); | 256 | spin_unlock_irq(&mdev->al_lock); |
240 | return al_ext; | 257 | return al_ext; |
241 | } | 258 | } |
242 | 259 | ||
243 | void drbd_al_begin_io(struct drbd_conf *mdev, struct drbd_interval *i) | 260 | bool drbd_al_begin_io_fastpath(struct drbd_conf *mdev, struct drbd_interval *i) |
244 | { | 261 | { |
245 | /* for bios crossing activity log extent boundaries, | 262 | /* for bios crossing activity log extent boundaries, |
246 | * we may need to activate two extents in one go */ | 263 | * we may need to activate two extents in one go */ |
247 | unsigned first = i->sector >> (AL_EXTENT_SHIFT-9); | 264 | unsigned first = i->sector >> (AL_EXTENT_SHIFT-9); |
248 | unsigned last = i->size == 0 ? first : (i->sector + (i->size >> 9) - 1) >> (AL_EXTENT_SHIFT-9); | 265 | unsigned last = i->size == 0 ? first : (i->sector + (i->size >> 9) - 1) >> (AL_EXTENT_SHIFT-9); |
249 | unsigned enr; | ||
250 | bool locked = false; | ||
251 | 266 | ||
267 | D_ASSERT((unsigned)(last - first) <= 1); | ||
268 | D_ASSERT(atomic_read(&mdev->local_cnt) > 0); | ||
269 | |||
270 | /* FIXME figure out a fast path for bios crossing AL extent boundaries */ | ||
271 | if (first != last) | ||
272 | return false; | ||
273 | |||
274 | return _al_get(mdev, first, true); | ||
275 | } | ||
276 | |||
277 | bool drbd_al_begin_io_prepare(struct drbd_conf *mdev, struct drbd_interval *i) | ||
278 | { | ||
279 | /* for bios crossing activity log extent boundaries, | ||
280 | * we may need to activate two extents in one go */ | ||
281 | unsigned first = i->sector >> (AL_EXTENT_SHIFT-9); | ||
282 | unsigned last = i->size == 0 ? first : (i->sector + (i->size >> 9) - 1) >> (AL_EXTENT_SHIFT-9); | ||
283 | unsigned enr; | ||
284 | bool need_transaction = false; | ||
252 | 285 | ||
253 | D_ASSERT(first <= last); | 286 | D_ASSERT(first <= last); |
254 | D_ASSERT(atomic_read(&mdev->local_cnt) > 0); | 287 | D_ASSERT(atomic_read(&mdev->local_cnt) > 0); |
255 | 288 | ||
256 | for (enr = first; enr <= last; enr++) | 289 | for (enr = first; enr <= last; enr++) { |
257 | wait_event(mdev->al_wait, _al_get(mdev, enr) != NULL); | 290 | struct lc_element *al_ext; |
291 | wait_event(mdev->al_wait, | ||
292 | (al_ext = _al_get(mdev, enr, false)) != NULL); | ||
293 | if (al_ext->lc_number != enr) | ||
294 | need_transaction = true; | ||
295 | } | ||
296 | return need_transaction; | ||
297 | } | ||
298 | |||
299 | static int al_write_transaction(struct drbd_conf *mdev, bool delegate); | ||
300 | |||
301 | /* When called through generic_make_request(), we must delegate | ||
302 | * activity log I/O to the worker thread: a further request | ||
303 | * submitted via generic_make_request() within the same task | ||
304 | * would be queued on current->bio_list, and would only start | ||
305 | * after this function returns (see generic_make_request()). | ||
306 | * | ||
307 | * However, if we *are* the worker, we must not delegate to ourselves. | ||
308 | */ | ||
309 | |||
310 | /* | ||
311 | * @delegate: delegate activity log I/O to the worker thread | ||
312 | */ | ||
313 | void drbd_al_begin_io_commit(struct drbd_conf *mdev, bool delegate) | ||
314 | { | ||
315 | bool locked = false; | ||
316 | |||
317 | BUG_ON(delegate && current == mdev->tconn->worker.task); | ||
258 | 318 | ||
259 | /* Serialize multiple transactions. | 319 | /* Serialize multiple transactions. |
260 | * This uses test_and_set_bit, memory barrier is implicit. | 320 | * This uses test_and_set_bit, memory barrier is implicit. |
@@ -264,13 +324,6 @@ void drbd_al_begin_io(struct drbd_conf *mdev, struct drbd_interval *i) | |||
264 | (locked = lc_try_lock_for_transaction(mdev->act_log))); | 324 | (locked = lc_try_lock_for_transaction(mdev->act_log))); |
265 | 325 | ||
266 | if (locked) { | 326 | if (locked) { |
267 | /* drbd_al_write_transaction(mdev,al_ext,enr); | ||
268 | * recurses into generic_make_request(), which | ||
269 | * disallows recursion, bios being serialized on the | ||
270 | * current->bio_tail list now. | ||
271 | * we have to delegate updates to the activity log | ||
272 | * to the worker thread. */ | ||
273 | |||
274 | /* Double check: it may have been committed by someone else, | 327 | /* Double check: it may have been committed by someone else, |
275 | * while we have been waiting for the lock. */ | 328 | * while we have been waiting for the lock. */ |
276 | if (mdev->act_log->pending_changes) { | 329 | if (mdev->act_log->pending_changes) { |
@@ -280,11 +333,8 @@ void drbd_al_begin_io(struct drbd_conf *mdev, struct drbd_interval *i) | |||
280 | write_al_updates = rcu_dereference(mdev->ldev->disk_conf)->al_updates; | 333 | write_al_updates = rcu_dereference(mdev->ldev->disk_conf)->al_updates; |
281 | rcu_read_unlock(); | 334 | rcu_read_unlock(); |
282 | 335 | ||
283 | if (write_al_updates) { | 336 | if (write_al_updates) |
284 | al_write_transaction(mdev); | 337 | al_write_transaction(mdev, delegate); |
285 | mdev->al_writ_cnt++; | ||
286 | } | ||
287 | |||
288 | spin_lock_irq(&mdev->al_lock); | 338 | spin_lock_irq(&mdev->al_lock); |
289 | /* FIXME | 339 | /* FIXME |
290 | if (err) | 340 | if (err) |
@@ -298,6 +348,66 @@ void drbd_al_begin_io(struct drbd_conf *mdev, struct drbd_interval *i) | |||
298 | } | 348 | } |
299 | } | 349 | } |
300 | 350 | ||
351 | /* | ||
352 | * @delegate: delegate activity log I/O to the worker thread | ||
353 | */ | ||
354 | void drbd_al_begin_io(struct drbd_conf *mdev, struct drbd_interval *i, bool delegate) | ||
355 | { | ||
356 | BUG_ON(delegate && current == mdev->tconn->worker.task); | ||
357 | |||
358 | if (drbd_al_begin_io_prepare(mdev, i)) | ||
359 | drbd_al_begin_io_commit(mdev, delegate); | ||
360 | } | ||
361 | |||
362 | int drbd_al_begin_io_nonblock(struct drbd_conf *mdev, struct drbd_interval *i) | ||
363 | { | ||
364 | struct lru_cache *al = mdev->act_log; | ||
365 | /* for bios crossing activity log extent boundaries, | ||
366 | * we may need to activate two extents in one go */ | ||
367 | unsigned first = i->sector >> (AL_EXTENT_SHIFT-9); | ||
368 | unsigned last = i->size == 0 ? first : (i->sector + (i->size >> 9) - 1) >> (AL_EXTENT_SHIFT-9); | ||
369 | unsigned nr_al_extents; | ||
370 | unsigned available_update_slots; | ||
371 | unsigned enr; | ||
372 | |||
373 | D_ASSERT(first <= last); | ||
374 | |||
375 | nr_al_extents = 1 + last - first; /* worst case: all touched extents are cold. */ | ||
376 | available_update_slots = min(al->nr_elements - al->used, | ||
377 | al->max_pending_changes - al->pending_changes); | ||
378 | |||
379 | /* We want all necessary updates for a given request within the same transaction. | ||
380 | * We could first check how many updates are *actually* needed, | ||
381 | * and use that instead of the worst-case nr_al_extents */ | ||
382 | if (available_update_slots < nr_al_extents) | ||
383 | return -EWOULDBLOCK; | ||
384 | |||
385 | /* Is resync active in this area? */ | ||
386 | for (enr = first; enr <= last; enr++) { | ||
387 | struct lc_element *tmp; | ||
388 | tmp = lc_find(mdev->resync, enr/AL_EXT_PER_BM_SECT); | ||
389 | if (unlikely(tmp != NULL)) { | ||
390 | struct bm_extent *bm_ext = lc_entry(tmp, struct bm_extent, lce); | ||
391 | if (test_bit(BME_NO_WRITES, &bm_ext->flags)) { | ||
392 | if (!test_and_set_bit(BME_PRIORITY, &bm_ext->flags)) | ||
393 | return -EBUSY; | ||
394 | return -EWOULDBLOCK; | ||
395 | } | ||
396 | } | ||
397 | } | ||
398 | |||
399 | /* Checkout the refcounts. | ||
400 | * Given that we checked for available elements and update slots above, | ||
401 | * this has to be successful. */ | ||
402 | for (enr = first; enr <= last; enr++) { | ||
403 | struct lc_element *al_ext; | ||
404 | al_ext = lc_get_cumulative(mdev->act_log, enr); | ||
405 | if (!al_ext) | ||
406 | dev_info(DEV, "LOGIC BUG for enr=%u\n", enr); | ||
407 | } | ||
408 | return 0; | ||
409 | } | ||
410 | |||
301 | void drbd_al_complete_io(struct drbd_conf *mdev, struct drbd_interval *i) | 411 | void drbd_al_complete_io(struct drbd_conf *mdev, struct drbd_interval *i) |
302 | { | 412 | { |
303 | /* for bios crossing activity log extent boundaries, | 413 | /* for bios crossing activity log extent boundaries, |
@@ -350,6 +460,24 @@ static unsigned int rs_extent_to_bm_page(unsigned int rs_enr) | |||
350 | (BM_EXT_SHIFT - BM_BLOCK_SHIFT)); | 460 | (BM_EXT_SHIFT - BM_BLOCK_SHIFT)); |
351 | } | 461 | } |
352 | 462 | ||
463 | static sector_t al_tr_number_to_on_disk_sector(struct drbd_conf *mdev) | ||
464 | { | ||
465 | const unsigned int stripes = mdev->ldev->md.al_stripes; | ||
466 | const unsigned int stripe_size_4kB = mdev->ldev->md.al_stripe_size_4k; | ||
467 | |||
468 | /* transaction number, modulo on-disk ring buffer wrap around */ | ||
469 | unsigned int t = mdev->al_tr_number % (mdev->ldev->md.al_size_4k); | ||
470 | |||
471 | /* ... to aligned 4k on disk block */ | ||
472 | t = ((t % stripes) * stripe_size_4kB) + t/stripes; | ||
473 | |||
474 | /* ... to 512 byte sector in activity log */ | ||
475 | t *= 8; | ||
476 | |||
477 | /* ... plus offset to the on disk position */ | ||
478 | return mdev->ldev->md.md_offset + mdev->ldev->md.al_offset + t; | ||
479 | } | ||
480 | |||
353 | static int | 481 | static int |
354 | _al_write_transaction(struct drbd_conf *mdev) | 482 | _al_write_transaction(struct drbd_conf *mdev) |
355 | { | 483 | { |
@@ -432,23 +560,27 @@ _al_write_transaction(struct drbd_conf *mdev) | |||
432 | if (mdev->al_tr_cycle >= mdev->act_log->nr_elements) | 560 | if (mdev->al_tr_cycle >= mdev->act_log->nr_elements) |
433 | mdev->al_tr_cycle = 0; | 561 | mdev->al_tr_cycle = 0; |
434 | 562 | ||
435 | sector = mdev->ldev->md.md_offset | 563 | sector = al_tr_number_to_on_disk_sector(mdev); |
436 | + mdev->ldev->md.al_offset | ||
437 | + mdev->al_tr_pos * (MD_BLOCK_SIZE>>9); | ||
438 | 564 | ||
439 | crc = crc32c(0, buffer, 4096); | 565 | crc = crc32c(0, buffer, 4096); |
440 | buffer->crc32c = cpu_to_be32(crc); | 566 | buffer->crc32c = cpu_to_be32(crc); |
441 | 567 | ||
442 | if (drbd_bm_write_hinted(mdev)) | 568 | if (drbd_bm_write_hinted(mdev)) |
443 | err = -EIO; | 569 | err = -EIO; |
444 | /* drbd_chk_io_error done already */ | 570 | else { |
445 | else if (drbd_md_sync_page_io(mdev, mdev->ldev, sector, WRITE)) { | 571 | bool write_al_updates; |
446 | err = -EIO; | 572 | rcu_read_lock(); |
447 | drbd_chk_io_error(mdev, 1, DRBD_META_IO_ERROR); | 573 | write_al_updates = rcu_dereference(mdev->ldev->disk_conf)->al_updates; |
448 | } else { | 574 | rcu_read_unlock(); |
449 | /* advance ringbuffer position and transaction counter */ | 575 | if (write_al_updates) { |
450 | mdev->al_tr_pos = (mdev->al_tr_pos + 1) % (MD_AL_SECTORS*512/MD_BLOCK_SIZE); | 576 | if (drbd_md_sync_page_io(mdev, mdev->ldev, sector, WRITE)) { |
451 | mdev->al_tr_number++; | 577 | err = -EIO; |
578 | drbd_chk_io_error(mdev, 1, DRBD_META_IO_ERROR); | ||
579 | } else { | ||
580 | mdev->al_tr_number++; | ||
581 | mdev->al_writ_cnt++; | ||
582 | } | ||
583 | } | ||
452 | } | 584 | } |
453 | 585 | ||
454 | drbd_md_put_buffer(mdev); | 586 | drbd_md_put_buffer(mdev); |
@@ -474,20 +606,18 @@ static int w_al_write_transaction(struct drbd_work *w, int unused) | |||
474 | /* Calls from worker context (see w_restart_disk_io()) need to write the | 606 | /* Calls from worker context (see w_restart_disk_io()) need to write the |
475 | transaction directly. Others came through generic_make_request(), | 607 | transaction directly. Others came through generic_make_request(), |
476 | those need to delegate it to the worker. */ | 608 | those need to delegate it to the worker. */ |
477 | static int al_write_transaction(struct drbd_conf *mdev) | 609 | static int al_write_transaction(struct drbd_conf *mdev, bool delegate) |
478 | { | 610 | { |
479 | struct update_al_work al_work; | 611 | if (delegate) { |
480 | 612 | struct update_al_work al_work; | |
481 | if (current == mdev->tconn->worker.task) | 613 | init_completion(&al_work.event); |
614 | al_work.w.cb = w_al_write_transaction; | ||
615 | al_work.w.mdev = mdev; | ||
616 | drbd_queue_work_front(&mdev->tconn->sender_work, &al_work.w); | ||
617 | wait_for_completion(&al_work.event); | ||
618 | return al_work.err; | ||
619 | } else | ||
482 | return _al_write_transaction(mdev); | 620 | return _al_write_transaction(mdev); |
483 | |||
484 | init_completion(&al_work.event); | ||
485 | al_work.w.cb = w_al_write_transaction; | ||
486 | al_work.w.mdev = mdev; | ||
487 | drbd_queue_work_front(&mdev->tconn->sender_work, &al_work.w); | ||
488 | wait_for_completion(&al_work.event); | ||
489 | |||
490 | return al_work.err; | ||
491 | } | 621 | } |
492 | 622 | ||
493 | static int _try_lc_del(struct drbd_conf *mdev, struct lc_element *al_ext) | 623 | static int _try_lc_del(struct drbd_conf *mdev, struct lc_element *al_ext) |