author	Lars Ellenberg <lars.ellenberg@linbit.com>	2010-12-15 02:59:11 -0500
committer	Philipp Reisner <philipp.reisner@linbit.com>	2011-03-10 05:43:19 -0500
commit	19f843aa08e2d8f87a09b4c2edc43b00638423a8 (patch)
tree	49919bd17ba7e03eb7cb76175910714d55704997 /drivers/block/drbd
parent	95a0f10cddbf93ce89c175ac1c53dad2d20ad309 (diff)
drbd: bitmap keep track of changes vs on-disk bitmap
When we set or clear bits in a bitmap page, also set a flag in the
page->private pointer. This allows us to skip writes of unchanged pages.

Signed-off-by: Philipp Reisner <philipp.reisner@linbit.com>
Signed-off-by: Lars Ellenberg <lars.ellenberg@linbit.com>
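The idea is small enough to sketch outside the kernel. What follows is a
minimal, self-contained userspace analogue of the change-tracking scheme;
it is an illustration, not the DRBD implementation. A per-page flag word
stands in for page->private, setting a bit in a page also raises a
"need writeout" flag, and the writeout pass skips pages whose flag was
never raised. All names here (bm_page, bm_set_bit, bm_writeout) are made
up for the sketch.

/*
 * Userspace sketch of "track changes vs on-disk bitmap" (illustrative):
 * a private flag word per page records whether the page differs from
 * what is on disk; unchanged pages are skipped at writeout time.
 */
#include <stdbool.h>
#include <stdio.h>

#define PAGE_NEED_WRITEOUT (1UL << 29)	/* same bit position the patch uses */

struct bm_page {
	unsigned long priv;		/* stands in for page->private */
	unsigned char data[4096];
};

static void bm_set_bit(struct bm_page *p, unsigned int bitnr)
{
	p->data[bitnr / 8] |= 1u << (bitnr % 8);
	p->priv |= PAGE_NEED_WRITEOUT;	/* page now differs from disk */
}

static bool bm_page_unchanged(const struct bm_page *p)
{
	return (p->priv & PAGE_NEED_WRITEOUT) == 0;
}

static void bm_writeout(struct bm_page *pages, unsigned int n)
{
	for (unsigned int i = 0; i < n; i++) {
		if (bm_page_unchanged(&pages[i]))
			continue;	/* skip IO for untouched pages */
		pages[i].priv &= ~PAGE_NEED_WRITEOUT; /* clear before submit */
		printf("writing page %u\n", i);	/* submit_bio() in the kernel */
	}
}

int main(void)
{
	static struct bm_page pages[4];	/* zero-initialized */

	bm_set_bit(&pages[2], 100);
	bm_writeout(pages, 4);		/* prints: writing page 2 */
	return 0;
}

The one subtlety carried over from the patch: the flag is cleared before
the write is submitted, so a page redirtied while its IO is in flight is
simply picked up again by the next writeout pass. In the kernel the flag
manipulation must additionally be atomic (set_bit/clear_bit on
&page_private(page)), since bits may be set from several contexts at once.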
Diffstat (limited to 'drivers/block/drbd')
-rw-r--r--	drivers/block/drbd/drbd_actlog.c	132
-rw-r--r--	drivers/block/drbd/drbd_bitmap.c	409
-rw-r--r--	drivers/block/drbd/drbd_int.h	7
-rw-r--r--	drivers/block/drbd/drbd_main.c	27
-rw-r--r--	drivers/block/drbd/drbd_nl.c	11
-rw-r--r--	drivers/block/drbd/drbd_worker.c	7
6 files changed, 373 insertions(+), 220 deletions(-)
diff --git a/drivers/block/drbd/drbd_actlog.c b/drivers/block/drbd/drbd_actlog.c
index e3f0f4d31d75..090fc2ce0df4 100644
--- a/drivers/block/drbd/drbd_actlog.c
+++ b/drivers/block/drbd/drbd_actlog.c
@@ -262,6 +262,33 @@ void drbd_al_complete_io(struct drbd_conf *mdev, sector_t sector)
 	spin_unlock_irqrestore(&mdev->al_lock, flags);
 }
 
+#if (PAGE_SHIFT + 3) < (AL_EXTENT_SHIFT - BM_BLOCK_SHIFT)
+/* Currently BM_BLOCK_SHIFT, BM_EXT_SHIFT and AL_EXTENT_SHIFT
+ * are still coupled, or assume too much about their relation.
+ * Code below will not work if this is violated.
+ * Will be cleaned up with some followup patch.
+ */
+# error FIXME
+#endif
+
+static unsigned int al_extent_to_bm_page(unsigned int al_enr)
+{
+	return al_enr >>
+		/* bit to page */
+		((PAGE_SHIFT + 3) -
+		 /* al extent number to bit */
+		 (AL_EXTENT_SHIFT - BM_BLOCK_SHIFT));
+}
+
+static unsigned int rs_extent_to_bm_page(unsigned int rs_enr)
+{
+	return rs_enr >>
+		/* bit to page */
+		((PAGE_SHIFT + 3) -
+		 /* resync extent number to bit */
+		 (BM_EXT_SHIFT - BM_BLOCK_SHIFT));
+}
+
 int
 w_al_write_transaction(struct drbd_conf *mdev, struct drbd_work *w, int unused)
 {
@@ -289,7 +316,7 @@ w_al_write_transaction(struct drbd_conf *mdev, struct drbd_work *w, int unused)
 	 * For now, we must not write the transaction,
 	 * if we cannot write out the bitmap of the evicted extent. */
 	if (mdev->state.conn < C_CONNECTED && evicted != LC_FREE)
-		drbd_bm_write_sect(mdev, evicted/AL_EXT_PER_BM_SECT);
+		drbd_bm_write_page(mdev, al_extent_to_bm_page(evicted));
 
 	/* The bitmap write may have failed, causing a state change. */
 	if (mdev->state.disk < D_INCONSISTENT) {
@@ -636,105 +663,6 @@ out_bio_put:
 }
 
 /**
- * drbd_al_to_on_disk_bm() - Writes bitmap parts covered by active AL extents
- * @mdev:	DRBD device.
- *
- * Called when we detach (unconfigure) local storage,
- * or when we go from R_PRIMARY to R_SECONDARY role.
- */
-void drbd_al_to_on_disk_bm(struct drbd_conf *mdev)
-{
-	int i, nr_elements;
-	unsigned int enr;
-	struct bio **bios;
-	struct drbd_atodb_wait wc;
-
-	ERR_IF (!get_ldev_if_state(mdev, D_ATTACHING))
-		return; /* sorry, I don't have any act_log etc... */
-
-	wait_event(mdev->al_wait, lc_try_lock(mdev->act_log));
-
-	nr_elements = mdev->act_log->nr_elements;
-
-	/* GFP_KERNEL, we are not in anyone's write-out path */
-	bios = kzalloc(sizeof(struct bio *) * nr_elements, GFP_KERNEL);
-	if (!bios)
-		goto submit_one_by_one;
-
-	atomic_set(&wc.count, 0);
-	init_completion(&wc.io_done);
-	wc.mdev = mdev;
-	wc.error = 0;
-
-	for (i = 0; i < nr_elements; i++) {
-		enr = lc_element_by_index(mdev->act_log, i)->lc_number;
-		if (enr == LC_FREE)
-			continue;
-		/* next statement also does atomic_inc wc.count and local_cnt */
-		if (atodb_prepare_unless_covered(mdev, bios,
-						enr/AL_EXT_PER_BM_SECT,
-						&wc))
-			goto free_bios_submit_one_by_one;
-	}
-
-	/* unnecessary optimization? */
-	lc_unlock(mdev->act_log);
-	wake_up(&mdev->al_wait);
-
-	/* all prepared, submit them */
-	for (i = 0; i < nr_elements; i++) {
-		if (bios[i] == NULL)
-			break;
-		if (drbd_insert_fault(mdev, DRBD_FAULT_MD_WR)) {
-			bios[i]->bi_rw = WRITE;
-			bio_endio(bios[i], -EIO);
-		} else {
-			submit_bio(WRITE, bios[i]);
-		}
-	}
-
-	/* always (try to) flush bitmap to stable storage */
-	drbd_md_flush(mdev);
-
-	/* In case we did not submit a single IO do not wait for
-	 * them to complete. ( Because we would wait forever here. )
-	 *
-	 * In case we had IOs and they are already complete, there
-	 * is no point in waiting anyways.
-	 * Therefore this if () ... */
-	if (atomic_read(&wc.count))
-		wait_for_completion(&wc.io_done);
-
-	put_ldev(mdev);
-
-	kfree(bios);
-	return;
-
- free_bios_submit_one_by_one:
-	/* free everything by calling the endio callback directly. */
-	for (i = 0; i < nr_elements && bios[i]; i++)
-		bio_endio(bios[i], 0);
-
-	kfree(bios);
-
- submit_one_by_one:
-	dev_warn(DEV, "Using the slow drbd_al_to_on_disk_bm()\n");
-
-	for (i = 0; i < mdev->act_log->nr_elements; i++) {
-		enr = lc_element_by_index(mdev->act_log, i)->lc_number;
-		if (enr == LC_FREE)
-			continue;
-		/* Really slow: if we have al-extents 16..19 active,
-		 * sector 4 will be written four times! Synchronous! */
-		drbd_bm_write_sect(mdev, enr/AL_EXT_PER_BM_SECT);
-	}
-
-	lc_unlock(mdev->act_log);
-	wake_up(&mdev->al_wait);
-	put_ldev(mdev);
-}
-
-/**
  * drbd_al_apply_to_bm() - Sets the bitmap dirty (1) where covered by active AL extents
  * @mdev:	DRBD device.
  */
@@ -813,7 +741,7 @@ static int w_update_odbm(struct drbd_conf *mdev, struct drbd_work *w, int unused
 		return 1;
 	}
 
-	drbd_bm_write_sect(mdev, udw->enr);
+	drbd_bm_write_page(mdev, rs_extent_to_bm_page(udw->enr));
 	put_ldev(mdev);
 
 	kfree(udw);
@@ -893,7 +821,6 @@ static void drbd_try_clear_on_disk_bm(struct drbd_conf *mdev, sector_t sector,
 			dev_warn(DEV, "Kicking resync_lru element enr=%u "
 			     "out with rs_failed=%d\n",
 			     ext->lce.lc_number, ext->rs_failed);
-			set_bit(WRITE_BM_AFTER_RESYNC, &mdev->flags);
 		}
 		ext->rs_left = rs_left;
 		ext->rs_failed = success ? 0 : count;
@@ -912,7 +839,6 @@ static void drbd_try_clear_on_disk_bm(struct drbd_conf *mdev, sector_t sector,
 				drbd_queue_work_front(&mdev->data.work, &udw->w);
 			} else {
 				dev_warn(DEV, "Could not kmalloc an udw\n");
-				set_bit(WRITE_BM_AFTER_RESYNC, &mdev->flags);
 			}
 		}
 	} else {
diff --git a/drivers/block/drbd/drbd_bitmap.c b/drivers/block/drbd/drbd_bitmap.c
index 8d959ed6c2cc..72cd41a96ef9 100644
--- a/drivers/block/drbd/drbd_bitmap.c
+++ b/drivers/block/drbd/drbd_bitmap.c
@@ -70,8 +70,7 @@ struct drbd_bitmap {
 	sector_t bm_dev_capacity;
 	struct mutex bm_change; /* serializes resize operations */
 
-	atomic_t bm_async_io;
-	wait_queue_head_t bm_io_wait;
+	wait_queue_head_t bm_io_wait; /* used to serialize IO of single pages */
 
 	unsigned long bm_flags;
 
@@ -82,7 +81,7 @@ struct drbd_bitmap {
 
 /* definition of bits in bm_flags */
 #define BM_LOCKED       0
-#define BM_MD_IO_ERROR  1
+// #define BM_MD_IO_ERROR  1 unused now.
 #define BM_P_VMALLOCED  2
 
 static int __bm_change_bits_to(struct drbd_conf *mdev, const unsigned long s,
@@ -155,26 +154,117 @@ void drbd_bm_unlock(struct drbd_conf *mdev)
 	mutex_unlock(&b->bm_change);
 }
 
-static unsigned int bm_bit_to_page_idx(struct drbd_bitmap *b, u64 bitnr)
+/* we store some "meta" info about our pages in page->private */
+/* at a granularity of 4k storage per bitmap bit:
+ * one peta byte storage: 1<<50 byte, 1<<38 * 4k storage blocks
+ * 1<<38 bits,
+ * 1<<23 4k bitmap pages.
+ * Use 24 bits as page index, covers 2 peta byte storage
+ * at a granularity of 4k per bit.
+ * Used to report the failed page idx on io error from the endio handlers.
+ */
+#define BM_PAGE_IDX_MASK	((1UL<<24)-1)
+/* this page is currently read in, or written back */
+#define BM_PAGE_IO_LOCK		31
+/* if there has been an IO error for this page */
+#define BM_PAGE_IO_ERROR	30
+/* this is to be able to intelligently skip disk IO,
+ * set if bits have been set since last IO. */
+#define BM_PAGE_NEED_WRITEOUT	29
+/* to mark for lazy writeout once syncer cleared all clearable bits,
+ * set if bits have been cleared since last IO. */
+#define BM_PAGE_LAZY_WRITEOUT	28
+
+/* store_page_idx uses non-atomic assignment. It is only used directly after
+ * allocating the page.  All other bm_set_page_* and bm_clear_page_* need to
+ * use atomic bit manipulation, as set_out_of_sync (and therefore bitmap
+ * changes) may happen from various contexts, and wait_on_bit/wake_up_bit
+ * requires it all to be atomic as well. */
+static void bm_store_page_idx(struct page *page, unsigned long idx)
 {
-	/* page_nr = (bitnr/8) >> PAGE_SHIFT; */
-	unsigned int page_nr = bitnr >> (PAGE_SHIFT + 3);
-	BUG_ON(page_nr >= b->bm_number_of_pages);
-	return page_nr;
+	BUG_ON(0 != (idx & ~BM_PAGE_IDX_MASK));
+	page_private(page) |= idx;
 }
 
-/* word offset to long pointer */
-static unsigned long *__bm_map_paddr(struct drbd_bitmap *b, unsigned long offset, const enum km_type km)
+static unsigned long bm_page_to_idx(struct page *page)
 {
-	struct page *page;
-	unsigned long page_nr;
+	return page_private(page) & BM_PAGE_IDX_MASK;
+}
+
+/* As it is very unlikely that the same page is under IO from more than one
+ * context, we can get away with a bit per page and one wait queue per bitmap.
+ */
+static void bm_page_lock_io(struct drbd_conf *mdev, int page_nr)
+{
+	struct drbd_bitmap *b = mdev->bitmap;
+	void *addr = &page_private(b->bm_pages[page_nr]);
+	wait_event(b->bm_io_wait, !test_and_set_bit(BM_PAGE_IO_LOCK, addr));
+}
+
+static void bm_page_unlock_io(struct drbd_conf *mdev, int page_nr)
+{
+	struct drbd_bitmap *b = mdev->bitmap;
+	void *addr = &page_private(b->bm_pages[page_nr]);
+	clear_bit(BM_PAGE_IO_LOCK, addr);
+	smp_mb__after_clear_bit();
+	wake_up(&mdev->bitmap->bm_io_wait);
+}
+
+/* set _before_ submit_io, so it may be reset due to being changed
+ * while this page is in flight... will get submitted later again */
+static void bm_set_page_unchanged(struct page *page)
+{
+	/* use cmpxchg? */
+	clear_bit(BM_PAGE_NEED_WRITEOUT, &page_private(page));
+	clear_bit(BM_PAGE_LAZY_WRITEOUT, &page_private(page));
+}
+
+static void bm_set_page_need_writeout(struct page *page)
+{
+	set_bit(BM_PAGE_NEED_WRITEOUT, &page_private(page));
+}
+
+static int bm_test_page_unchanged(struct page *page)
+{
+	volatile const unsigned long *addr = &page_private(page);
+	return (*addr & ((1UL<<BM_PAGE_NEED_WRITEOUT)|(1UL<<BM_PAGE_LAZY_WRITEOUT))) == 0;
+}
 
+static void bm_set_page_io_err(struct page *page)
+{
+	set_bit(BM_PAGE_IO_ERROR, &page_private(page));
+}
+
+static void bm_clear_page_io_err(struct page *page)
+{
+	clear_bit(BM_PAGE_IO_ERROR, &page_private(page));
+}
+
+static void bm_set_page_lazy_writeout(struct page *page)
+{
+	set_bit(BM_PAGE_LAZY_WRITEOUT, &page_private(page));
+}
+
+static int bm_test_page_lazy_writeout(struct page *page)
+{
+	return test_bit(BM_PAGE_LAZY_WRITEOUT, &page_private(page));
+}
+
+/* on a 32bit box, this would allow for exactly (2<<38) bits. */
+static unsigned int bm_word_to_page_idx(struct drbd_bitmap *b, unsigned long long_nr)
+{
 	/* page_nr = (word*sizeof(long)) >> PAGE_SHIFT; */
-	page_nr = offset >> (PAGE_SHIFT - LN2_BPL + 3);
+	unsigned int page_nr = long_nr >> (PAGE_SHIFT - LN2_BPL + 3);
 	BUG_ON(page_nr >= b->bm_number_of_pages);
-	page = b->bm_pages[page_nr];
+	return page_nr;
+}
 
-	return (unsigned long *) kmap_atomic(page, km);
+static unsigned int bm_bit_to_page_idx(struct drbd_bitmap *b, u64 bitnr)
+{
+	/* page_nr = (bitnr/8) >> PAGE_SHIFT; */
+	unsigned int page_nr = bitnr >> (PAGE_SHIFT + 3);
+	BUG_ON(page_nr >= b->bm_number_of_pages);
+	return page_nr;
 }
 
 static unsigned long *__bm_map_pidx(struct drbd_bitmap *b, unsigned int idx, const enum km_type km)
@@ -188,11 +278,6 @@ static unsigned long *bm_map_pidx(struct drbd_bitmap *b, unsigned int idx)
 	return __bm_map_pidx(b, idx, KM_IRQ1);
 }
 
-static unsigned long * bm_map_paddr(struct drbd_bitmap *b, unsigned long offset)
-{
-	return __bm_map_paddr(b, offset, KM_IRQ1);
-}
-
 static void __bm_unmap(unsigned long *p_addr, const enum km_type km)
 {
 	kunmap_atomic(p_addr, km);
@@ -222,6 +307,7 @@ static void bm_unmap(unsigned long *p_addr)
  * to be able to report device specific.
  */
 
+
 static void bm_free_pages(struct page **pages, unsigned long number)
 {
 	unsigned long i;
@@ -289,6 +375,9 @@ static struct page **bm_realloc_pages(struct drbd_bitmap *b, unsigned long want)
 				bm_vk_free(new_pages, vmalloced);
 				return NULL;
 			}
+			/* we want to know which page it is
+			 * from the endio handlers */
+			bm_store_page_idx(page, i);
 			new_pages[i] = page;
 		}
 	} else {
@@ -443,7 +532,7 @@ static unsigned long bm_count_bits(struct drbd_bitmap *b)
 
 	while (offset < words) {
 		i = do_now = min_t(size_t, words-offset, LWPP);
-		p_addr = __bm_map_paddr(b, offset, KM_USER0);
+		p_addr = __bm_map_pidx(b, bm_word_to_page_idx(b, offset), KM_USER0);
 		bm = p_addr + MLPP(offset);
 		while (i--) {
 			bits += hweight_long(*bm++);
@@ -472,6 +561,7 @@ static unsigned long bm_count_bits(struct drbd_bitmap *b)
 static void bm_memset(struct drbd_bitmap *b, size_t offset, int c, size_t len)
 {
 	unsigned long *p_addr, *bm;
+	unsigned int idx;
 	size_t do_now, end;
 
 #define BM_SECTORS_PER_BIT (BM_BLOCK_SIZE/512)
@@ -485,7 +575,8 @@ static void bm_memset(struct drbd_bitmap *b, size_t offset, int c, size_t len)
 
 	while (offset < end) {
 		do_now = min_t(size_t, ALIGN(offset + 1, LWPP), end) - offset;
-		p_addr = bm_map_paddr(b, offset);
+		idx = bm_word_to_page_idx(b, offset);
+		p_addr = bm_map_pidx(b, idx);
 		bm = p_addr + MLPP(offset);
 		if (bm+do_now > p_addr + LWPP) {
 			printk(KERN_ALERT "drbd: BUG BUG BUG! p_addr:%p bm:%p do_now:%d\n",
@@ -494,6 +585,7 @@ static void bm_memset(struct drbd_bitmap *b, size_t offset, int c, size_t len)
 		}
 		memset(bm, c, do_now * sizeof(long));
 		bm_unmap(p_addr);
+		bm_set_page_need_writeout(b->bm_pages[idx]);
 		offset += do_now;
 	}
 }
@@ -604,7 +696,7 @@ int drbd_bm_resize(struct drbd_conf *mdev, sector_t capacity, int set_new_bits)
 			bm_free_pages(opages + want, have - want);
 	}
 
-	p_addr = bm_map_paddr(b, words);
+	p_addr = bm_map_pidx(b, bm_word_to_page_idx(b, words));
 	bm = p_addr + MLPP(words);
 	*bm = DRBD_MAGIC;
 	bm_unmap(p_addr);
@@ -616,7 +708,7 @@ int drbd_bm_resize(struct drbd_conf *mdev, sector_t capacity, int set_new_bits)
 		bm_vk_free(opages, opages_vmalloced);
 	if (!growing)
 		b->bm_set = bm_count_bits(b);
-	dev_info(DEV, "resync bitmap: bits=%lu words=%lu\n", bits, words);
+	dev_info(DEV, "resync bitmap: bits=%lu words=%lu pages=%lu\n", bits, words, want);
 
  out:
 	drbd_bm_unlock(mdev);
@@ -686,6 +778,7 @@ void drbd_bm_merge_lel(struct drbd_conf *mdev, size_t offset, size_t number,
 	struct drbd_bitmap *b = mdev->bitmap;
 	unsigned long *p_addr, *bm;
 	unsigned long word, bits;
+	unsigned int idx;
 	size_t end, do_now;
 
 	end = offset + number;
@@ -700,7 +793,8 @@ void drbd_bm_merge_lel(struct drbd_conf *mdev, size_t offset, size_t number,
 	spin_lock_irq(&b->bm_lock);
 	while (offset < end) {
 		do_now = min_t(size_t, ALIGN(offset+1, LWPP), end) - offset;
-		p_addr = bm_map_paddr(b, offset);
+		idx = bm_word_to_page_idx(b, offset);
+		p_addr = bm_map_pidx(b, idx);
 		bm = p_addr + MLPP(offset);
 		offset += do_now;
 		while (do_now--) {
@@ -710,6 +804,7 @@ void drbd_bm_merge_lel(struct drbd_conf *mdev, size_t offset, size_t number,
 			b->bm_set += hweight_long(word) - bits;
 		}
 		bm_unmap(p_addr);
+		bm_set_page_need_writeout(b->bm_pages[idx]);
 	}
 	/* with 32bit <-> 64bit cross-platform connect
 	 * this is only correct for current usage,
@@ -748,7 +843,7 @@ void drbd_bm_get_lel(struct drbd_conf *mdev, size_t offset, size_t number,
 	else {
 		while (offset < end) {
 			do_now = min_t(size_t, ALIGN(offset+1, LWPP), end) - offset;
-			p_addr = bm_map_paddr(b, offset);
+			p_addr = bm_map_pidx(b, bm_word_to_page_idx(b, offset));
 			bm = p_addr + MLPP(offset);
 			offset += do_now;
 			while (do_now--)
@@ -786,9 +881,22 @@ void drbd_bm_clear_all(struct drbd_conf *mdev)
 	spin_unlock_irq(&b->bm_lock);
 }
 
+struct bm_aio_ctx {
+	struct drbd_conf *mdev;
+	atomic_t in_flight;
+	wait_queue_head_t io_wait;
+	unsigned flags;
+#define BM_AIO_COPY_PAGES	1
+	int error;
+};
+
+/* bv_page may be a copy, or may be the original */
 static void bm_async_io_complete(struct bio *bio, int error)
 {
-	struct drbd_bitmap *b = bio->bi_private;
+	struct bm_aio_ctx *ctx = bio->bi_private;
+	struct drbd_conf *mdev = ctx->mdev;
+	struct drbd_bitmap *b = mdev->bitmap;
+	unsigned int idx = bm_page_to_idx(bio->bi_io_vec[0].bv_page);
 	int uptodate = bio_flagged(bio, BIO_UPTODATE);
 
 
@@ -799,35 +907,79 @@ static void bm_async_io_complete(struct bio *bio, int error)
 	if (!error && !uptodate)
 		error = -EIO;
 
+	if (!bm_test_page_unchanged(b->bm_pages[idx]))
+		dev_info(DEV, "bitmap page idx %u changed during IO!\n", idx);
+
 	if (error) {
-		/* doh. what now?
-		 * for now, set all bits, and flag MD_IO_ERROR */
-		__set_bit(BM_MD_IO_ERROR, &b->bm_flags);
+		/* ctx error will hold the completed-last non-zero error code,
+		 * in case error codes differ. */
+		ctx->error = error;
+		bm_set_page_io_err(b->bm_pages[idx]);
+		/* Not identical to on disk version of it.
+		 * Is BM_PAGE_IO_ERROR enough? */
+		if (__ratelimit(&drbd_ratelimit_state))
+			dev_err(DEV, "IO ERROR %d on bitmap page idx %u\n",
+					error, idx);
+	} else {
+		bm_clear_page_io_err(b->bm_pages[idx]);
+		dynamic_dev_dbg(DEV, "bitmap page idx %u completed\n", idx);
 	}
-	if (atomic_dec_and_test(&b->bm_async_io))
-		wake_up(&b->bm_io_wait);
+
+	bm_page_unlock_io(mdev, idx);
+
+	/* FIXME give back to page pool */
+	if (ctx->flags & BM_AIO_COPY_PAGES)
+		put_page(bio->bi_io_vec[0].bv_page);
 
 	bio_put(bio);
+
+	if (atomic_dec_and_test(&ctx->in_flight))
+		wake_up(&ctx->io_wait);
 }
 
-static void bm_page_io_async(struct drbd_conf *mdev, struct drbd_bitmap *b, int page_nr, int rw) __must_hold(local)
+static void bm_page_io_async(struct bm_aio_ctx *ctx, int page_nr, int rw) __must_hold(local)
 {
 	/* we are process context. we always get a bio */
 	struct bio *bio = bio_alloc(GFP_KERNEL, 1);
+	struct drbd_conf *mdev = ctx->mdev;
+	struct drbd_bitmap *b = mdev->bitmap;
+	struct page *page;
 	unsigned int len;
+
 	sector_t on_disk_sector =
 		mdev->ldev->md.md_offset + mdev->ldev->md.bm_offset;
 	on_disk_sector += ((sector_t)page_nr) << (PAGE_SHIFT-9);
 
 	/* this might happen with very small
-	 * flexible external meta data device */
+	 * flexible external meta data device,
+	 * or with PAGE_SIZE > 4k */
 	len = min_t(unsigned int, PAGE_SIZE,
 		(drbd_md_last_sector(mdev->ldev) - on_disk_sector + 1)<<9);
 
+	/* serialize IO on this page */
+	bm_page_lock_io(mdev, page_nr);
+	/* before memcpy and submit,
+	 * so it can be redirtied any time */
+	bm_set_page_unchanged(b->bm_pages[page_nr]);
+
+	if (ctx->flags & BM_AIO_COPY_PAGES) {
+		/* FIXME alloc_page is good enough for now, but actually needs
+		 * to use pre-allocated page pool */
+		void *src, *dest;
+		page = alloc_page(__GFP_HIGHMEM|__GFP_WAIT);
+		dest = kmap_atomic(page, KM_USER0);
+		src = kmap_atomic(b->bm_pages[page_nr], KM_USER1);
+		memcpy(dest, src, PAGE_SIZE);
+		kunmap_atomic(src, KM_USER1);
+		kunmap_atomic(dest, KM_USER0);
+		bm_store_page_idx(page, page_nr);
+	} else
+		page = b->bm_pages[page_nr];
+
 	bio->bi_bdev = mdev->ldev->md_bdev;
 	bio->bi_sector = on_disk_sector;
-	bio_add_page(bio, b->bm_pages[page_nr], len, 0);
-	bio->bi_private = b;
+	bio_add_page(bio, page, len, 0);
+	bio->bi_private = ctx;
 	bio->bi_end_io = bm_async_io_complete;
 
 	if (drbd_insert_fault(mdev, (rw & WRITE) ? DRBD_FAULT_MD_WR : DRBD_FAULT_MD_RD)) {
@@ -841,36 +993,72 @@ static void bm_page_io_async(struct drbd_conf *mdev, struct drbd_bitmap *b, int
 /*
  * bm_rw: read/write the whole bitmap from/to its on disk location.
  */
-static int bm_rw(struct drbd_conf *mdev, int rw) __must_hold(local)
+static int bm_rw(struct drbd_conf *mdev, int rw, unsigned lazy_writeout_upper_idx) __must_hold(local)
 {
+	struct bm_aio_ctx ctx =
+		{ .flags = lazy_writeout_upper_idx ? BM_AIO_COPY_PAGES : 0 };
 	struct drbd_bitmap *b = mdev->bitmap;
-	/* sector_t sector; */
-	int bm_words, num_pages, i;
+	int last_page, i, count = 0;
 	unsigned long now;
 	char ppb[10];
 	int err = 0;
 
-	WARN_ON(!bm_is_locked(b));
-
-	/* no spinlock here, the drbd_bm_lock should be enough! */
+	/*
+	 * We are protected against bitmap disappearing/resizing by holding an
+	 * ldev reference (caller must have called get_ldev()).
+	 * For read/write, we are protected against changes to the bitmap by
+	 * the bitmap lock (see drbd_bitmap_io).
+	 * For lazy writeout, we don't care for ongoing changes to the bitmap,
+	 * as we submit copies of pages anyways.
+	 */
+	if (!ctx.flags)
+		WARN_ON(!bm_is_locked(b));
 
-	bm_words  = drbd_bm_words(mdev);
-	num_pages = (bm_words*sizeof(long) + PAGE_SIZE-1) >> PAGE_SHIFT;
+	/* because of the "extra long to catch oob access" we allocate in
+	 * drbd_bm_resize, bm_number_of_pages -1 is not necessarily the page
+	 * containing the last _relevant_ bitmap word */
+	last_page = bm_word_to_page_idx(b, b->bm_words - 1);
 
 	now = jiffies;
-	atomic_set(&b->bm_async_io, num_pages);
-	__clear_bit(BM_MD_IO_ERROR, &b->bm_flags);
+	ctx.mdev = mdev;
+	atomic_set(&ctx.in_flight, 1); /* one extra ref */
+	init_waitqueue_head(&ctx.io_wait);
+	ctx.error = 0;
 
 	/* let the layers below us try to merge these bios... */
-	for (i = 0; i < num_pages; i++)
-		bm_page_io_async(mdev, b, i, rw);
+	for (i = 0; i <= last_page; i++) {
+		/* ignore completely unchanged pages */
+		if (lazy_writeout_upper_idx && i == lazy_writeout_upper_idx)
+			break;
+		if (rw & WRITE) {
+			if (bm_test_page_unchanged(b->bm_pages[i])) {
+				dynamic_dev_dbg(DEV, "skipped bm write for idx %u\n", i);
+				continue;
+			}
+			/* during lazy writeout,
+			 * ignore those pages not marked for lazy writeout. */
+			if (lazy_writeout_upper_idx &&
+			    !bm_test_page_lazy_writeout(b->bm_pages[i])) {
+				dynamic_dev_dbg(DEV, "skipped bm lazy write for idx %u\n", i);
+				continue;
+			}
+		}
+		atomic_inc(&ctx.in_flight);
+		bm_page_io_async(&ctx, i, rw);
+		++count;
+		cond_resched();
+	}
 
-	wait_event(b->bm_io_wait, atomic_read(&b->bm_async_io) == 0);
+	atomic_dec(&ctx.in_flight); /* drop the extra ref */
+	wait_event(ctx.io_wait, atomic_read(&ctx.in_flight) == 0);
+	dev_info(DEV, "bitmap %s of %u pages took %lu jiffies\n",
+			rw == WRITE ? "WRITE" : "READ",
+			count, jiffies - now);
 
-	if (test_bit(BM_MD_IO_ERROR, &b->bm_flags)) {
+	if (ctx.error) {
 		dev_alert(DEV, "we had at least one MD IO ERROR during bitmap IO\n");
 		drbd_chk_io_error(mdev, 1, true);
-		err = -EIO;
+		err = -EIO; /* ctx.error ? */
 	}
 
 	now = jiffies;
@@ -895,55 +1083,63 @@ static int bm_rw(struct drbd_conf *mdev, int rw) __must_hold(local)
  */
 int drbd_bm_read(struct drbd_conf *mdev) __must_hold(local)
 {
-	return bm_rw(mdev, READ);
+	return bm_rw(mdev, READ, 0);
 }
 
 /**
  * drbd_bm_write() - Write the whole bitmap to its on disk location.
  * @mdev:	DRBD device.
+ *
+ * Will only write pages that have changed since last IO.
  */
 int drbd_bm_write(struct drbd_conf *mdev) __must_hold(local)
 {
-	return bm_rw(mdev, WRITE);
+	return bm_rw(mdev, WRITE, 0);
 }
 
 /**
- * drbd_bm_write_sect: Writes a 512 (MD_SECTOR_SIZE) byte piece of the bitmap
+ * drbd_bm_write_lazy() - Write bitmap pages 0 to @upper_idx-1, if they have changed.
  * @mdev:	DRBD device.
- * @enr:	Extent number in the resync lru (happens to be sector offset)
+ * @upper_idx:	0: write all changed pages; +ve: page index to stop scanning for changed pages
+ */
+int drbd_bm_write_lazy(struct drbd_conf *mdev, unsigned upper_idx) __must_hold(local)
+{
+	return bm_rw(mdev, WRITE, upper_idx);
+}
+
+
+/**
+ * drbd_bm_write_page() - Writes a PAGE_SIZE aligned piece of bitmap
+ * @mdev:	DRBD device.
+ * @idx:	bitmap page index
  *
- * The BM_EXT_SIZE is on purpose exactly the amount of the bitmap covered
- * by a single sector write. Therefore enr == sector offset from the
- * start of the bitmap.
+ * We don't want to special case on logical_block_size of the underlying
+ * device, so we submit PAGE_SIZE aligned pieces containing the requested enr.
+ * Note that on "most" systems, PAGE_SIZE is 4k.
  */
-int drbd_bm_write_sect(struct drbd_conf *mdev, unsigned long enr) __must_hold(local)
+int drbd_bm_write_page(struct drbd_conf *mdev, unsigned int idx) __must_hold(local)
 {
-	sector_t on_disk_sector = enr + mdev->ldev->md.md_offset
-				      + mdev->ldev->md.bm_offset;
-	int bm_words, num_words, offset;
-	int err = 0;
+	struct bm_aio_ctx ctx = { .flags = BM_AIO_COPY_PAGES, };
 
-	mutex_lock(&mdev->md_io_mutex);
-	bm_words  = drbd_bm_words(mdev);
-	offset    = S2W(enr);	/* word offset into bitmap */
-	num_words = min(S2W(1), bm_words - offset);
-	if (num_words < S2W(1))
-		memset(page_address(mdev->md_io_page), 0, MD_SECTOR_SIZE);
-	drbd_bm_get_lel(mdev, offset, num_words,
-			page_address(mdev->md_io_page));
-	if (!drbd_md_sync_page_io(mdev, mdev->ldev, on_disk_sector, WRITE)) {
-		int i;
-		err = -EIO;
-		dev_err(DEV, "IO ERROR writing bitmap sector %lu "
-		    "(meta-disk sector %llus)\n",
-		    enr, (unsigned long long)on_disk_sector);
-		drbd_chk_io_error(mdev, 1, true);
-		for (i = 0; i < AL_EXT_PER_BM_SECT; i++)
-			drbd_bm_ALe_set_all(mdev, enr*AL_EXT_PER_BM_SECT+i);
+	if (bm_test_page_unchanged(mdev->bitmap->bm_pages[idx])) {
+		dev_info(DEV, "skipped bm page write for idx %u\n", idx);
+		return 0;
 	}
+
+	ctx.mdev = mdev;
+	atomic_set(&ctx.in_flight, 1);
+	init_waitqueue_head(&ctx.io_wait);
+
+	bm_page_io_async(&ctx, idx, WRITE_SYNC);
+	wait_event(ctx.io_wait, atomic_read(&ctx.in_flight) == 0);
+
+	if (ctx.error)
+		drbd_chk_io_error(mdev, 1, true);
+	/* that should force detach, so the in memory bitmap will be
+	 * gone in a moment as well. */
+
 	mdev->bm_writ_cnt++;
-	mutex_unlock(&mdev->md_io_mutex);
-	return err;
+	return ctx.error;
 }
 
 /* NOTE
@@ -965,10 +1161,9 @@ static unsigned long __bm_find_next(struct drbd_conf *mdev, unsigned long bm_fo,
 			dev_err(DEV, "bm_fo=%lu bm_bits=%lu\n", bm_fo, b->bm_bits);
 	} else {
 		while (bm_fo < b->bm_bits) {
-			unsigned long offset;
-			bit_offset = bm_fo & ~BPP_MASK; /* bit offset of the page */
-			offset = bit_offset >> LN2_BPL; /* word offset of the page */
-			p_addr = __bm_map_paddr(b, offset, km);
+			/* bit offset of the first bit in the page */
+			bit_offset = bm_fo & ~BPP_MASK;
+			p_addr = __bm_map_pidx(b, bm_bit_to_page_idx(b, bm_fo), km);
 
 			if (find_zero_bit)
 				i = generic_find_next_zero_le_bit(p_addr, PAGE_SIZE*8, bm_fo & BPP_MASK);
@@ -1048,8 +1243,9 @@ static int __bm_change_bits_to(struct drbd_conf *mdev, const unsigned long s,
 	struct drbd_bitmap *b = mdev->bitmap;
 	unsigned long *p_addr = NULL;
 	unsigned long bitnr;
-	unsigned long last_page_nr = -1UL;
+	unsigned int last_page_nr = -1U;
 	int c = 0;
+	int changed_total = 0;
 
 	if (e >= b->bm_bits) {
 		dev_err(DEV, "ASSERT FAILED: bit_s=%lu bit_e=%lu bm_bits=%lu\n",
@@ -1057,12 +1253,17 @@ static int __bm_change_bits_to(struct drbd_conf *mdev, const unsigned long s,
 		e = b->bm_bits ? b->bm_bits -1 : 0;
 	}
 	for (bitnr = s; bitnr <= e; bitnr++) {
-		unsigned long offset = bitnr>>LN2_BPL;
-		unsigned long page_nr = offset >> (PAGE_SHIFT - LN2_BPL + 3);
+		unsigned int page_nr = bm_bit_to_page_idx(b, bitnr);
 		if (page_nr != last_page_nr) {
 			if (p_addr)
 				__bm_unmap(p_addr, km);
-			p_addr = __bm_map_paddr(b, offset, km);
+			if (c < 0)
+				bm_set_page_lazy_writeout(b->bm_pages[last_page_nr]);
+			else if (c > 0)
+				bm_set_page_need_writeout(b->bm_pages[last_page_nr]);
+			changed_total += c;
+			c = 0;
+			p_addr = __bm_map_pidx(b, page_nr, km);
 			last_page_nr = page_nr;
 		}
 		if (val)
@@ -1072,8 +1273,13 @@ static int __bm_change_bits_to(struct drbd_conf *mdev, const unsigned long s,
 	}
 	if (p_addr)
 		__bm_unmap(p_addr, km);
-	b->bm_set += c;
-	return c;
+	if (c < 0)
+		bm_set_page_lazy_writeout(b->bm_pages[last_page_nr]);
+	else if (c > 0)
+		bm_set_page_need_writeout(b->bm_pages[last_page_nr]);
+	changed_total += c;
+	b->bm_set += changed_total;
+	return changed_total;
 }
 
 /* returns number of bits actually changed.
@@ -1211,8 +1417,7 @@ int drbd_bm_test_bit(struct drbd_conf *mdev, const unsigned long bitnr)
 	if (bm_is_locked(b))
 		bm_print_lock_info(mdev);
 	if (bitnr < b->bm_bits) {
-		unsigned long offset = bitnr>>LN2_BPL;
-		p_addr = bm_map_paddr(b, offset);
+		p_addr = bm_map_pidx(b, bm_bit_to_page_idx(b, bitnr));
 		i = generic_test_le_bit(bitnr & BPP_MASK, p_addr) ? 1 : 0;
 		bm_unmap(p_addr);
 	} else if (bitnr == b->bm_bits) {
@@ -1231,10 +1436,10 @@ int drbd_bm_count_bits(struct drbd_conf *mdev, const unsigned long s, const unsi
 {
 	unsigned long flags;
 	struct drbd_bitmap *b = mdev->bitmap;
-	unsigned long *p_addr = NULL, page_nr = -1;
+	unsigned long *p_addr = NULL;
 	unsigned long bitnr;
+	unsigned int page_nr = -1U;
 	int c = 0;
-	size_t w;
 
 	/* If this is called without a bitmap, that is a bug. But just to be
 	 * robust in case we screwed up elsewhere, in that case pretend there
@@ -1247,12 +1452,12 @@ int drbd_bm_count_bits(struct drbd_conf *mdev, const unsigned long s, const unsi
 	if (bm_is_locked(b))
 		bm_print_lock_info(mdev);
 	for (bitnr = s; bitnr <= e; bitnr++) {
-		w = bitnr >> LN2_BPL;
-		if (page_nr != w >> (PAGE_SHIFT - LN2_BPL + 3)) {
-			page_nr = w >> (PAGE_SHIFT - LN2_BPL + 3);
+		unsigned int idx = bm_bit_to_page_idx(b, bitnr);
+		if (page_nr != idx) {
+			page_nr = idx;
 			if (p_addr)
 				bm_unmap(p_addr);
-			p_addr = bm_map_paddr(b, w);
+			p_addr = bm_map_pidx(b, idx);
 		}
 		ERR_IF (bitnr >= b->bm_bits) {
 			dev_err(DEV, "bitnr=%lu bm_bits=%lu\n", bitnr, b->bm_bits);
@@ -1300,7 +1505,7 @@ int drbd_bm_e_weight(struct drbd_conf *mdev, unsigned long enr)
 	count = 0;
 	if (s < b->bm_words) {
 		int n = e-s;
-		p_addr = bm_map_paddr(b, s);
+		p_addr = bm_map_pidx(b, bm_word_to_page_idx(b, s));
 		bm = p_addr + MLPP(s);
 		while (n--)
 			count += hweight_long(*bm++);
@@ -1335,7 +1540,7 @@ unsigned long drbd_bm_ALe_set_all(struct drbd_conf *mdev, unsigned long al_enr)
 	count = 0;
 	if (s < b->bm_words) {
 		i = do_now = e-s;
-		p_addr = bm_map_paddr(b, s);
+		p_addr = bm_map_pidx(b, bm_word_to_page_idx(b, s));
 		bm = p_addr + MLPP(s);
 		while (i--) {
 			count += hweight_long(*bm);
diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h
index f6da48bb8c70..74cc50a21822 100644
--- a/drivers/block/drbd/drbd_int.h
+++ b/drivers/block/drbd/drbd_int.h
@@ -833,7 +833,7 @@ enum {
 	CRASHED_PRIMARY,	/* This node was a crashed primary.
 				 * Gets cleared when the state.conn
 				 * goes into C_CONNECTED state. */
-	WRITE_BM_AFTER_RESYNC,	/* A kmalloc() during resync failed */
+	NO_BARRIER_SUPP,	/* underlying block device doesn't implement barriers */
 	CONSIDER_RESYNC,
 
 	MD_NO_FUA,		/* Users want us to not use FUA/FLUSH on meta data dev */
@@ -1428,7 +1428,7 @@ extern void _drbd_bm_set_bits(struct drbd_conf *mdev,
 		const unsigned long s, const unsigned long e);
 extern int  drbd_bm_test_bit(struct drbd_conf *mdev, unsigned long bitnr);
 extern int  drbd_bm_e_weight(struct drbd_conf *mdev, unsigned long enr);
-extern int  drbd_bm_write_sect(struct drbd_conf *mdev, unsigned long enr) __must_hold(local);
+extern int  drbd_bm_write_page(struct drbd_conf *mdev, unsigned int idx) __must_hold(local);
 extern int  drbd_bm_read(struct drbd_conf *mdev) __must_hold(local);
 extern int  drbd_bm_write(struct drbd_conf *mdev) __must_hold(local);
 extern unsigned long drbd_bm_ALe_set_all(struct drbd_conf *mdev,
@@ -1446,7 +1446,7 @@ extern int drbd_bm_rs_done(struct drbd_conf *mdev);
 /* for receive_bitmap */
 extern void drbd_bm_merge_lel(struct drbd_conf *mdev, size_t offset,
 		size_t number, unsigned long *buffer);
-/* for _drbd_send_bitmap and drbd_bm_write_sect */
+/* for _drbd_send_bitmap */
 extern void drbd_bm_get_lel(struct drbd_conf *mdev, size_t offset,
 		size_t number, unsigned long *buffer);
 
@@ -1641,7 +1641,6 @@ extern int __drbd_set_out_of_sync(struct drbd_conf *mdev, sector_t sector,
 #define drbd_set_out_of_sync(mdev, sector, size) \
 	__drbd_set_out_of_sync(mdev, sector, size, __FILE__, __LINE__)
 extern void drbd_al_apply_to_bm(struct drbd_conf *mdev);
-extern void drbd_al_to_on_disk_bm(struct drbd_conf *mdev);
 extern void drbd_al_shrink(struct drbd_conf *mdev);
 
 
diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c
index 67fffad213ec..57ed7181742d 100644
--- a/drivers/block/drbd/drbd_main.c
+++ b/drivers/block/drbd/drbd_main.c
@@ -1289,6 +1289,26 @@ static void abw_start_sync(struct drbd_conf *mdev, int rv)
 	}
 }
 
+int drbd_bitmap_io_from_worker(struct drbd_conf *mdev, int (*io_fn)(struct drbd_conf *), char *why)
+{
+	int rv;
+
+	D_ASSERT(current == mdev->worker.task);
+
+	/* open coded non-blocking drbd_suspend_io(mdev); */
+	set_bit(SUSPEND_IO, &mdev->flags);
+	if (!is_susp(mdev->state))
+		D_ASSERT(atomic_read(&mdev->ap_bio_cnt) == 0);
+
+	drbd_bm_lock(mdev, why);
+	rv = io_fn(mdev);
+	drbd_bm_unlock(mdev);
+
+	drbd_resume_io(mdev);
+
+	return rv;
+}
+
 /**
  * after_state_ch() - Perform after state change actions that may sleep
  * @mdev:	DRBD device.
@@ -1404,7 +1424,12 @@ static void after_state_ch(struct drbd_conf *mdev, union drbd_state os,
 
 		/* D_DISKLESS Peer becomes secondary */
 		if (os.peer == R_PRIMARY && ns.peer == R_SECONDARY)
-			drbd_al_to_on_disk_bm(mdev);
+			drbd_bitmap_io_from_worker(mdev, &drbd_bm_write, "demote diskless peer");
+		put_ldev(mdev);
+	}
+
+	if (os.role == R_PRIMARY && ns.role == R_SECONDARY && get_ldev(mdev)) {
+		drbd_bitmap_io_from_worker(mdev, &drbd_bm_write, "demote");
 		put_ldev(mdev);
 	}
 
diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c
index 2f0724982143..77dc022eaf6b 100644
--- a/drivers/block/drbd/drbd_nl.c
+++ b/drivers/block/drbd/drbd_nl.c
@@ -407,10 +407,8 @@ drbd_set_role(struct drbd_conf *mdev, enum drbd_role new_role, int force)
 		}
 	}
 
-	if ((new_role == R_SECONDARY) && get_ldev(mdev)) {
-		drbd_al_to_on_disk_bm(mdev);
-		put_ldev(mdev);
-	}
+	/* writeout of activity log covered areas of the bitmap
+	 * to stable storage done in after state change already */
 
 	if (mdev->state.conn >= C_WF_REPORT_PARAMS) {
 		/* if this was forced, we should consider sync */
@@ -1174,7 +1172,10 @@ static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp
 
 	if (cp_discovered) {
 		drbd_al_apply_to_bm(mdev);
-		drbd_al_to_on_disk_bm(mdev);
+		if (drbd_bitmap_io(mdev, &drbd_bm_write, "crashed primary apply AL")) {
+			retcode = ERR_IO_MD_DISK;
+			goto force_diskless_dec;
+		}
 	}
 
 	if (_drbd_bm_total_weight(mdev) == drbd_bm_bits(mdev))
diff --git a/drivers/block/drbd/drbd_worker.c b/drivers/block/drbd/drbd_worker.c
index 9fe3e890da0f..d17f2ed777ce 100644
--- a/drivers/block/drbd/drbd_worker.c
+++ b/drivers/block/drbd/drbd_worker.c
@@ -907,10 +907,8 @@ out:
 
 	drbd_md_sync(mdev);
 
-	if (test_and_clear_bit(WRITE_BM_AFTER_RESYNC, &mdev->flags)) {
-		dev_info(DEV, "Writing the whole bitmap\n");
-		drbd_queue_bitmap_io(mdev, &drbd_bm_write, NULL, "write from resync_finished");
-	}
+	dev_info(DEV, "Writing changed bitmap pages\n");
+	drbd_queue_bitmap_io(mdev, &drbd_bm_write, NULL, "write from resync_finished");
 
 	if (khelper_cmd)
 		drbd_khelper(mdev, khelper_cmd);
@@ -1127,7 +1125,6 @@ void drbd_ov_oos_found(struct drbd_conf *mdev, sector_t sector, int size)
 		mdev->ov_last_oos_size = size>>9;
 	}
 	drbd_set_out_of_sync(mdev, sector, size);
-	set_bit(WRITE_BM_AFTER_RESYNC, &mdev->flags);
 }
 
 int w_e_end_ov_reply(struct drbd_conf *mdev, struct drbd_work *w, int cancel)