Diffstat (limited to 'drivers/block/drbd/drbd_actlog.c')
-rw-r--r--	drivers/block/drbd/drbd_actlog.c	246
1 file changed, 188 insertions(+), 58 deletions(-)
diff --git a/drivers/block/drbd/drbd_actlog.c b/drivers/block/drbd/drbd_actlog.c
index 92510f8ad013..6608076dc39e 100644
--- a/drivers/block/drbd/drbd_actlog.c
+++ b/drivers/block/drbd/drbd_actlog.c
@@ -104,7 +104,6 @@ struct update_al_work {
 	int err;
 };
 
-static int al_write_transaction(struct drbd_conf *mdev);
 
 void *drbd_md_get_buffer(struct drbd_conf *mdev)
 {
@@ -168,7 +167,11 @@ static int _drbd_md_sync_page_io(struct drbd_conf *mdev,
 	bio->bi_end_io = drbd_md_io_complete;
 	bio->bi_rw = rw;
 
-	if (!get_ldev_if_state(mdev, D_ATTACHING)) { /* Corresponding put_ldev in drbd_md_io_complete() */
+	if (!(rw & WRITE) && mdev->state.disk == D_DISKLESS && mdev->ldev == NULL)
+		/* special case, drbd_md_read() during drbd_adm_attach(): no get_ldev */
+		;
+	else if (!get_ldev_if_state(mdev, D_ATTACHING)) {
+		/* Corresponding put_ldev in drbd_md_io_complete() */
 		dev_err(DEV, "ASSERT FAILED: get_ldev_if_state() == 1 in _drbd_md_sync_page_io()\n");
 		err = -ENODEV;
 		goto out;
@@ -199,9 +202,10 @@ int drbd_md_sync_page_io(struct drbd_conf *mdev, struct drbd_backing_dev *bdev,
 
 	BUG_ON(!bdev->md_bdev);
 
-	dev_dbg(DEV, "meta_data io: %s [%d]:%s(,%llus,%s)\n",
+	dev_dbg(DEV, "meta_data io: %s [%d]:%s(,%llus,%s) %pS\n",
 	     current->comm, current->pid, __func__,
-	     (unsigned long long)sector, (rw & WRITE) ? "WRITE" : "READ");
+	     (unsigned long long)sector, (rw & WRITE) ? "WRITE" : "READ",
+	     (void*)_RET_IP_ );
 
 	if (sector < drbd_md_first_sector(bdev) ||
 	    sector + 7 > drbd_md_last_sector(bdev))
@@ -209,7 +213,8 @@ int drbd_md_sync_page_io(struct drbd_conf *mdev, struct drbd_backing_dev *bdev,
 		     current->comm, current->pid, __func__,
 		     (unsigned long long)sector, (rw & WRITE) ? "WRITE" : "READ");
 
-	err = _drbd_md_sync_page_io(mdev, bdev, iop, sector, rw, MD_BLOCK_SIZE);
+	/* we do all our meta data IO in aligned 4k blocks. */
+	err = _drbd_md_sync_page_io(mdev, bdev, iop, sector, rw, 4096);
 	if (err) {
 		dev_err(DEV, "drbd_md_sync_page_io(,%llus,%s) failed with error %d\n",
 		    (unsigned long long)sector, (rw & WRITE) ? "WRITE" : "READ", err);
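The 4k block size also explains the range check above: with 512-byte sectors, one aligned 4k meta-data block spans the 8 sectors `sector` through `sector + 7`, which is what gets compared against drbd_md_last_sector(). A minimal standalone sketch of that relation; the helper name is invented for illustration and is not part of the patch:

/* Hypothetical helper restating the bound check: an aligned 4k
 * meta-data block starting at `sector` covers 8 512-byte sectors,
 * so its last sector is sector + 7. */
#include <stdbool.h>
#include <stdint.h>

typedef uint64_t sector_t;

static bool md_block_in_range(sector_t sector, sector_t first, sector_t last)
{
	const unsigned int sectors_per_4k_block = 4096 / 512;	/* = 8 */

	return sector >= first &&
	       sector + (sectors_per_4k_block - 1) <= last;
}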
@@ -217,44 +222,99 @@ int drbd_md_sync_page_io(struct drbd_conf *mdev, struct drbd_backing_dev *bdev,
 	return err;
 }
 
-static struct lc_element *_al_get(struct drbd_conf *mdev, unsigned int enr)
+static struct bm_extent *find_active_resync_extent(struct drbd_conf *mdev, unsigned int enr)
 {
-	struct lc_element *al_ext;
 	struct lc_element *tmp;
-	int wake;
-
-	spin_lock_irq(&mdev->al_lock);
 	tmp = lc_find(mdev->resync, enr/AL_EXT_PER_BM_SECT);
 	if (unlikely(tmp != NULL)) {
 		struct bm_extent *bm_ext = lc_entry(tmp, struct bm_extent, lce);
-		if (test_bit(BME_NO_WRITES, &bm_ext->flags)) {
-			wake = !test_and_set_bit(BME_PRIORITY, &bm_ext->flags);
-			spin_unlock_irq(&mdev->al_lock);
-			if (wake)
-				wake_up(&mdev->al_wait);
-			return NULL;
-		}
+		if (test_bit(BME_NO_WRITES, &bm_ext->flags))
+			return bm_ext;
+	}
+	return NULL;
+}
+
+static struct lc_element *_al_get(struct drbd_conf *mdev, unsigned int enr, bool nonblock)
+{
+	struct lc_element *al_ext;
+	struct bm_extent *bm_ext;
+	int wake;
+
+	spin_lock_irq(&mdev->al_lock);
+	bm_ext = find_active_resync_extent(mdev, enr);
+	if (bm_ext) {
+		wake = !test_and_set_bit(BME_PRIORITY, &bm_ext->flags);
+		spin_unlock_irq(&mdev->al_lock);
+		if (wake)
+			wake_up(&mdev->al_wait);
+		return NULL;
 	}
-	al_ext = lc_get(mdev->act_log, enr);
+	if (nonblock)
+		al_ext = lc_try_get(mdev->act_log, enr);
+	else
+		al_ext = lc_get(mdev->act_log, enr);
 	spin_unlock_irq(&mdev->al_lock);
 	return al_ext;
 }
 
-void drbd_al_begin_io(struct drbd_conf *mdev, struct drbd_interval *i)
+bool drbd_al_begin_io_fastpath(struct drbd_conf *mdev, struct drbd_interval *i)
 {
 	/* for bios crossing activity log extent boundaries,
 	 * we may need to activate two extents in one go */
 	unsigned first = i->sector >> (AL_EXTENT_SHIFT-9);
 	unsigned last = i->size == 0 ? first : (i->sector + (i->size >> 9) - 1) >> (AL_EXTENT_SHIFT-9);
-	unsigned enr;
-	bool locked = false;
 
+	D_ASSERT((unsigned)(last - first) <= 1);
+	D_ASSERT(atomic_read(&mdev->local_cnt) > 0);
+
+	/* FIXME figure out a fast path for bios crossing AL extent boundaries */
+	if (first != last)
+		return false;
+
+	return _al_get(mdev, first, true);
+}
+
+bool drbd_al_begin_io_prepare(struct drbd_conf *mdev, struct drbd_interval *i)
+{
+	/* for bios crossing activity log extent boundaries,
+	 * we may need to activate two extents in one go */
+	unsigned first = i->sector >> (AL_EXTENT_SHIFT-9);
+	unsigned last = i->size == 0 ? first : (i->sector + (i->size >> 9) - 1) >> (AL_EXTENT_SHIFT-9);
+	unsigned enr;
+	bool need_transaction = false;
 
 	D_ASSERT(first <= last);
 	D_ASSERT(atomic_read(&mdev->local_cnt) > 0);
 
-	for (enr = first; enr <= last; enr++)
-		wait_event(mdev->al_wait, _al_get(mdev, enr) != NULL);
+	for (enr = first; enr <= last; enr++) {
+		struct lc_element *al_ext;
+		wait_event(mdev->al_wait,
+				(al_ext = _al_get(mdev, enr, false)) != NULL);
+		if (al_ext->lc_number != enr)
+			need_transaction = true;
+	}
+	return need_transaction;
+}
+
+static int al_write_transaction(struct drbd_conf *mdev, bool delegate);
+
+/* When called through generic_make_request(), we must delegate
+ * activity log I/O to the worker thread: a further request
+ * submitted via generic_make_request() within the same task
+ * would be queued on current->bio_list, and would only start
+ * after this function returns (see generic_make_request()).
+ *
+ * However, if we *are* the worker, we must not delegate to ourselves.
+ */
+
+/*
+ * @delegate:	delegate activity log I/O to the worker thread
+ */
+void drbd_al_begin_io_commit(struct drbd_conf *mdev, bool delegate)
+{
+	bool locked = false;
+
+	BUG_ON(delegate && current == mdev->tconn->worker.task);
 
 	/* Serialize multiple transactions.
 	 * This uses test_and_set_bit, memory barrier is implicit.
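The three entry points above are meant to compose: try the non-blocking fast path first, and fall back to the blocking prepare/commit pair (which may write an AL transaction, possibly delegated to the worker) only when needed. A hedged sketch of a possible submit-side caller; the function itself and its placement are hypothetical, not part of this patch:

/* Hypothetical caller illustrating the intended sequence of
 * drbd_al_begin_io_fastpath() / _prepare() / _commit().
 * Assumes submission context, so AL writes must be delegated
 * to the worker (delegate = true). */
static void submit_one_request(struct drbd_conf *mdev, struct drbd_interval *i)
{
	/* Fast path: extent already active; no blocking, no transaction. */
	if (drbd_al_begin_io_fastpath(mdev, i))
		goto submit;

	/* Slow path: block until the extents are ours; a changed slot
	 * means the on-disk activity log must be updated first. */
	if (drbd_al_begin_io_prepare(mdev, i))
		drbd_al_begin_io_commit(mdev, true /* delegate */);

submit:
	/* ... issue the actual data bio here ... */
	;
}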
@@ -264,13 +324,6 @@ void drbd_al_begin_io(struct drbd_conf *mdev, struct drbd_interval *i)
 		      (locked = lc_try_lock_for_transaction(mdev->act_log)));
 
 	if (locked) {
-		/* drbd_al_write_transaction(mdev,al_ext,enr);
-		 * recurses into generic_make_request(), which
-		 * disallows recursion, bios being serialized on the
-		 * current->bio_tail list now.
-		 * we have to delegate updates to the activity log
-		 * to the worker thread. */
-
 		/* Double check: it may have been committed by someone else,
 		 * while we have been waiting for the lock. */
 		if (mdev->act_log->pending_changes) {
@@ -280,11 +333,8 @@ void drbd_al_begin_io(struct drbd_conf *mdev, struct drbd_interval *i)
 			write_al_updates = rcu_dereference(mdev->ldev->disk_conf)->al_updates;
 			rcu_read_unlock();
 
-			if (write_al_updates) {
-				al_write_transaction(mdev);
-				mdev->al_writ_cnt++;
-			}
-
+			if (write_al_updates)
+				al_write_transaction(mdev, delegate);
 			spin_lock_irq(&mdev->al_lock);
 			/* FIXME
 			if (err)
@@ -298,6 +348,66 @@ void drbd_al_begin_io(struct drbd_conf *mdev, struct drbd_interval *i)
 	}
 }
 
+/*
+ * @delegate:	delegate activity log I/O to the worker thread
+ */
+void drbd_al_begin_io(struct drbd_conf *mdev, struct drbd_interval *i, bool delegate)
+{
+	BUG_ON(delegate && current == mdev->tconn->worker.task);
+
+	if (drbd_al_begin_io_prepare(mdev, i))
+		drbd_al_begin_io_commit(mdev, delegate);
+}
+
+int drbd_al_begin_io_nonblock(struct drbd_conf *mdev, struct drbd_interval *i)
+{
+	struct lru_cache *al = mdev->act_log;
+	/* for bios crossing activity log extent boundaries,
+	 * we may need to activate two extents in one go */
+	unsigned first = i->sector >> (AL_EXTENT_SHIFT-9);
+	unsigned last = i->size == 0 ? first : (i->sector + (i->size >> 9) - 1) >> (AL_EXTENT_SHIFT-9);
+	unsigned nr_al_extents;
+	unsigned available_update_slots;
+	unsigned enr;
+
+	D_ASSERT(first <= last);
+
+	nr_al_extents = 1 + last - first; /* worst case: all touched extents are cold. */
+	available_update_slots = min(al->nr_elements - al->used,
+				al->max_pending_changes - al->pending_changes);
+
+	/* We want all necessary updates for a given request within the same transaction.
+	 * We could first check how many updates are *actually* needed,
+	 * and use that instead of the worst-case nr_al_extents. */
+	if (available_update_slots < nr_al_extents)
+		return -EWOULDBLOCK;
+
+	/* Is resync active in this area? */
+	for (enr = first; enr <= last; enr++) {
+		struct lc_element *tmp;
+		tmp = lc_find(mdev->resync, enr/AL_EXT_PER_BM_SECT);
+		if (unlikely(tmp != NULL)) {
+			struct bm_extent *bm_ext = lc_entry(tmp, struct bm_extent, lce);
+			if (test_bit(BME_NO_WRITES, &bm_ext->flags)) {
+				if (!test_and_set_bit(BME_PRIORITY, &bm_ext->flags))
+					return -EBUSY;
+				return -EWOULDBLOCK;
+			}
+		}
+	}
+
+	/* Checkout the refcounts.
+	 * Given that we checked for available elements and update slots above,
+	 * this has to be successful. */
+	for (enr = first; enr <= last; enr++) {
+		struct lc_element *al_ext;
+		al_ext = lc_get_cumulative(mdev->act_log, enr);
+		if (!al_ext)
+			dev_info(DEV, "LOGIC BUG for enr=%u\n", enr);
+	}
+	return 0;
+}
+
 void drbd_al_complete_io(struct drbd_conf *mdev, struct drbd_interval *i)
 {
 	/* for bios crossing activity log extent boundaries,
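drbd_al_begin_io_nonblock() reports failure with two distinct codes: -EWOULDBLOCK when there are simply not enough free extents or update slots right now, and -EBUSY when a resync extent blocks the area and this call was the one to raise BME_PRIORITY. A sketch of how a non-blocking submit path might dispatch on the result; both retry helpers are invented stand-ins, not part of the patch:

/* Hypothetical dispatch on the drbd_al_begin_io_nonblock() result.
 * defer_request() and requeue_request() are made-up stand-ins for
 * whatever retry mechanism the submit path provides. */
static void handle_al_nonblock(struct drbd_conf *mdev,
			       struct drbd_interval *i, int err)
{
	switch (err) {
	case 0:
		/* refcounts taken; safe to submit the bio now */
		break;
	case -EWOULDBLOCK:
		/* no free extent or update slot at the moment */
		defer_request(mdev, i);
		break;
	case -EBUSY:
		/* resync extent in the way; BME_PRIORITY was just raised */
		requeue_request(mdev, i);
		break;
	}
}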
@@ -350,6 +460,24 @@ static unsigned int rs_extent_to_bm_page(unsigned int rs_enr)
 	       (BM_EXT_SHIFT - BM_BLOCK_SHIFT));
 }
 
+static sector_t al_tr_number_to_on_disk_sector(struct drbd_conf *mdev)
+{
+	const unsigned int stripes = mdev->ldev->md.al_stripes;
+	const unsigned int stripe_size_4kB = mdev->ldev->md.al_stripe_size_4k;
+
+	/* transaction number, modulo on-disk ring buffer wrap around */
+	unsigned int t = mdev->al_tr_number % (mdev->ldev->md.al_size_4k);
+
+	/* ... to aligned 4k on disk block */
+	t = ((t % stripes) * stripe_size_4kB) + t/stripes;
+
+	/* ... to 512 byte sector in activity log */
+	t *= 8;
+
+	/* ... plus offset to the on disk position */
+	return mdev->ldev->md.md_offset + mdev->ldev->md.al_offset + t;
+}
+
 static int
 _al_write_transaction(struct drbd_conf *mdev)
 {
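The striping arithmetic is easiest to check with concrete numbers. Assuming, purely for illustration, al_stripes = 4, al_stripe_size_4k = 2 and al_size_4k = 8: transaction number 5 wraps to t = 5 % 8 = 5, lands in stripe 5 % 4 = 1 at intra-stripe 4k offset 5 / 4 = 1, giving 4k block (1 * 2) + 1 = 3 and hence 512-byte sector offset 3 * 8 = 24 into the AL area. A standalone restatement of the same computation, with the md fields passed in explicitly so it can be checked outside the kernel (function and parameter names invented):

#include <stdint.h>

/* Same arithmetic as al_tr_number_to_on_disk_sector(), minus the
 * mdev plumbing; returns the 512-byte sector offset inside the
 * activity log area. */
static uint64_t al_tr_sector_offset(unsigned int tr_number,
				    unsigned int stripes,
				    unsigned int stripe_size_4k,
				    unsigned int al_size_4k)
{
	unsigned int t = tr_number % al_size_4k;	/* ring buffer wrap */

	t = (t % stripes) * stripe_size_4k + t / stripes; /* 4k block */

	return (uint64_t)t * 8;	/* 8 sectors per 4k block */
}
/* e.g. al_tr_sector_offset(5, 4, 2, 8) == 24 */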
@@ -432,23 +560,27 @@ _al_write_transaction(struct drbd_conf *mdev)
 	if (mdev->al_tr_cycle >= mdev->act_log->nr_elements)
 		mdev->al_tr_cycle = 0;
 
-	sector = mdev->ldev->md.md_offset
-		+ mdev->ldev->md.al_offset
-		+ mdev->al_tr_pos * (MD_BLOCK_SIZE>>9);
+	sector = al_tr_number_to_on_disk_sector(mdev);
 
 	crc = crc32c(0, buffer, 4096);
 	buffer->crc32c = cpu_to_be32(crc);
 
 	if (drbd_bm_write_hinted(mdev))
 		err = -EIO;
-		/* drbd_chk_io_error done already */
-	else if (drbd_md_sync_page_io(mdev, mdev->ldev, sector, WRITE)) {
-		err = -EIO;
-		drbd_chk_io_error(mdev, 1, DRBD_META_IO_ERROR);
-	} else {
-		/* advance ringbuffer position and transaction counter */
-		mdev->al_tr_pos = (mdev->al_tr_pos + 1) % (MD_AL_SECTORS*512/MD_BLOCK_SIZE);
-		mdev->al_tr_number++;
+	else {
+		bool write_al_updates;
+		rcu_read_lock();
+		write_al_updates = rcu_dereference(mdev->ldev->disk_conf)->al_updates;
+		rcu_read_unlock();
+		if (write_al_updates) {
+			if (drbd_md_sync_page_io(mdev, mdev->ldev, sector, WRITE)) {
+				err = -EIO;
+				drbd_chk_io_error(mdev, 1, DRBD_META_IO_ERROR);
+			} else {
+				mdev->al_tr_number++;
+				mdev->al_writ_cnt++;
+			}
+		}
 	}
 
 	drbd_md_put_buffer(mdev);
@@ -474,20 +606,18 @@ static int w_al_write_transaction(struct drbd_work *w, int unused)
 /* Calls from worker context (see w_restart_disk_io()) need to write the
    transaction directly. Others came through generic_make_request(),
    those need to delegate it to the worker. */
-static int al_write_transaction(struct drbd_conf *mdev)
+static int al_write_transaction(struct drbd_conf *mdev, bool delegate)
 {
-	struct update_al_work al_work;
-
-	if (current == mdev->tconn->worker.task)
+	if (delegate) {
+		struct update_al_work al_work;
+		init_completion(&al_work.event);
+		al_work.w.cb = w_al_write_transaction;
+		al_work.w.mdev = mdev;
+		drbd_queue_work_front(&mdev->tconn->sender_work, &al_work.w);
+		wait_for_completion(&al_work.event);
+		return al_work.err;
+	} else
 		return _al_write_transaction(mdev);
-
-	init_completion(&al_work.event);
-	al_work.w.cb = w_al_write_transaction;
-	al_work.w.mdev = mdev;
-	drbd_queue_work_front(&mdev->tconn->sender_work, &al_work.w);
-	wait_for_completion(&al_work.event);
-
-	return al_work.err;
 }
 
 static int _try_lc_del(struct drbd_conf *mdev, struct lc_element *al_ext)
493static int _try_lc_del(struct drbd_conf *mdev, struct lc_element *al_ext) 623static int _try_lc_del(struct drbd_conf *mdev, struct lc_element *al_ext)