author     Lars Ellenberg <lars.ellenberg@linbit.com>    2010-12-15 02:59:11 -0500
committer  Philipp Reisner <philipp.reisner@linbit.com>  2011-03-10 05:43:19 -0500
commit     19f843aa08e2d8f87a09b4c2edc43b00638423a8 (patch)
tree       49919bd17ba7e03eb7cb76175910714d55704997 /drivers/block/drbd
parent     95a0f10cddbf93ce89c175ac1c53dad2d20ad309 (diff)
drbd: bitmap keep track of changes vs on-disk bitmap
When we set or clear bits in a bitmap page,
also set a flag in the page->private pointer.
This allows us to skip writes of unchanged pages.
Signed-off-by: Philipp Reisner <philipp.reisner@linbit.com>
Signed-off-by: Lars Ellenberg <lars.ellenberg@linbit.com>
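
For orientation before the diff: the core of the change is to repurpose the page->private word of each in-memory bitmap page as a small per-page state word. The low bits hold the page's own index (so the endio handler can tell which page failed), the high bits are flags recording whether the page still matches its on-disk version. A condensed sketch of the scheme, using the names and values introduced in drbd_bitmap.c below (an illustration, not the complete implementation):

#include <linux/mm.h>		/* page_private() */
#include <linux/bitops.h>	/* set_bit() and friends */

/* low 24 bits: the page's own index, reported on IO error; high bits: flags */
#define BM_PAGE_IDX_MASK	((1UL << 24) - 1)
#define BM_PAGE_IO_LOCK		31	/* page has IO in flight */
#define BM_PAGE_IO_ERROR	30	/* last IO on this page failed */
#define BM_PAGE_NEED_WRITEOUT	29	/* bits were set since last writeout */
#define BM_PAGE_LAZY_WRITEOUT	28	/* bits were cleared since last writeout */

/* whenever a bit in a page changes, the page is marked dirty ... */
static void bm_set_page_need_writeout(struct page *page)
{
	set_bit(BM_PAGE_NEED_WRITEOUT, &page_private(page));
}

/* ... so writeout can cheaply skip pages that still match the disk */
static int bm_test_page_unchanged(struct page *page)
{
	volatile const unsigned long *addr = &page_private(page);
	return (*addr & ((1UL << BM_PAGE_NEED_WRITEOUT) |
			 (1UL << BM_PAGE_LAZY_WRITEOUT))) == 0;
}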
Diffstat (limited to 'drivers/block/drbd')
-rw-r--r--  drivers/block/drbd/drbd_actlog.c | 132
-rw-r--r--  drivers/block/drbd/drbd_bitmap.c | 409
-rw-r--r--  drivers/block/drbd/drbd_int.h    |   7
-rw-r--r--  drivers/block/drbd/drbd_main.c   |  27
-rw-r--r--  drivers/block/drbd/drbd_nl.c     |  11
-rw-r--r--  drivers/block/drbd/drbd_worker.c |   7
6 files changed, 373 insertions(+), 220 deletions(-)
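
The drbd_actlog.c portion replaces the old sector-granular drbd_bm_write_sect() calls with page-granular drbd_bm_write_page(), so it needs to map an extent number to the bitmap page covering it; that is the single combined shift in al_extent_to_bm_page() below. A hypothetical userspace check of that arithmetic, assuming the usual defaults of 4k pages (PAGE_SHIFT 12), 4k bitmap blocks (BM_BLOCK_SHIFT 12) and 4M AL extents (AL_EXTENT_SHIFT 22) — the constant values here are assumptions for illustration, not taken from this patch:

#include <stdio.h>

enum { PAGE_SHIFT = 12, AL_EXTENT_SHIFT = 22, BM_BLOCK_SHIFT = 12 };

static unsigned int al_extent_to_bm_page(unsigned int al_enr)
{
	/* extent -> first bit:  << (AL_EXTENT_SHIFT - BM_BLOCK_SHIFT)  (10)
	 * bit    -> page:       >> (PAGE_SHIFT + 3)                    (15)
	 * combined into one right shift by 15 - 10 = 5 */
	return al_enr >> ((PAGE_SHIFT + 3) - (AL_EXTENT_SHIFT - BM_BLOCK_SHIFT));
}

int main(void)
{
	/* one 4k page holds 32768 bits; each AL extent covers 1024 bits,
	 * so 32 extents share a page: extents 0..31 -> page 0, 32 -> page 1 */
	printf("%u %u %u\n", al_extent_to_bm_page(0),
	       al_extent_to_bm_page(31), al_extent_to_bm_page(32));
	return 0;
}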
diff --git a/drivers/block/drbd/drbd_actlog.c b/drivers/block/drbd/drbd_actlog.c
index e3f0f4d31d75..090fc2ce0df4 100644
--- a/drivers/block/drbd/drbd_actlog.c
+++ b/drivers/block/drbd/drbd_actlog.c
@@ -262,6 +262,33 @@ void drbd_al_complete_io(struct drbd_conf *mdev, sector_t sector)
 	spin_unlock_irqrestore(&mdev->al_lock, flags);
 }
 
+#if (PAGE_SHIFT + 3) < (AL_EXTENT_SHIFT - BM_BLOCK_SHIFT)
+/* Currently BM_BLOCK_SHIFT, BM_EXT_SHIFT and AL_EXTENT_SHIFT
+ * are still coupled, or assume too much about their relation.
+ * Code below will not work if this is violated.
+ * Will be cleaned up with some followup patch.
+ */
+# error FIXME
+#endif
+
+static unsigned int al_extent_to_bm_page(unsigned int al_enr)
+{
+	return al_enr >>
+		/* bit to page */
+		((PAGE_SHIFT + 3) -
+		/* al extent number to bit */
+		 (AL_EXTENT_SHIFT - BM_BLOCK_SHIFT));
+}
+
+static unsigned int rs_extent_to_bm_page(unsigned int rs_enr)
+{
+	return rs_enr >>
+		/* bit to page */
+		((PAGE_SHIFT + 3) -
+		/* rs extent number to bit */
+		 (BM_EXT_SHIFT - BM_BLOCK_SHIFT));
+}
+
 int
 w_al_write_transaction(struct drbd_conf *mdev, struct drbd_work *w, int unused)
 {
@@ -289,7 +316,7 @@ w_al_write_transaction(struct drbd_conf *mdev, struct drbd_work *w, int unused)
 	 * For now, we must not write the transaction,
 	 * if we cannot write out the bitmap of the evicted extent. */
 	if (mdev->state.conn < C_CONNECTED && evicted != LC_FREE)
-		drbd_bm_write_sect(mdev, evicted/AL_EXT_PER_BM_SECT);
+		drbd_bm_write_page(mdev, al_extent_to_bm_page(evicted));
 
 	/* The bitmap write may have failed, causing a state change. */
 	if (mdev->state.disk < D_INCONSISTENT) {
@@ -636,105 +663,6 @@ out_bio_put:
 }
 
 /**
- * drbd_al_to_on_disk_bm() - Writes bitmap parts covered by active AL extents
- * @mdev:	DRBD device.
- *
- * Called when we detach (unconfigure) local storage,
- * or when we go from R_PRIMARY to R_SECONDARY role.
- */
-void drbd_al_to_on_disk_bm(struct drbd_conf *mdev)
-{
-	int i, nr_elements;
-	unsigned int enr;
-	struct bio **bios;
-	struct drbd_atodb_wait wc;
-
-	ERR_IF (!get_ldev_if_state(mdev, D_ATTACHING))
-		return; /* sorry, I don't have any act_log etc... */
-
-	wait_event(mdev->al_wait, lc_try_lock(mdev->act_log));
-
-	nr_elements = mdev->act_log->nr_elements;
-
-	/* GFP_KERNEL, we are not in anyone's write-out path */
-	bios = kzalloc(sizeof(struct bio *) * nr_elements, GFP_KERNEL);
-	if (!bios)
-		goto submit_one_by_one;
-
-	atomic_set(&wc.count, 0);
-	init_completion(&wc.io_done);
-	wc.mdev = mdev;
-	wc.error = 0;
-
-	for (i = 0; i < nr_elements; i++) {
-		enr = lc_element_by_index(mdev->act_log, i)->lc_number;
-		if (enr == LC_FREE)
-			continue;
-		/* next statement also does atomic_inc wc.count and local_cnt */
-		if (atodb_prepare_unless_covered(mdev, bios,
-						enr/AL_EXT_PER_BM_SECT,
-						&wc))
-			goto free_bios_submit_one_by_one;
-	}
-
-	/* unnecessary optimization? */
-	lc_unlock(mdev->act_log);
-	wake_up(&mdev->al_wait);
-
-	/* all prepared, submit them */
-	for (i = 0; i < nr_elements; i++) {
-		if (bios[i] == NULL)
-			break;
-		if (drbd_insert_fault(mdev, DRBD_FAULT_MD_WR)) {
-			bios[i]->bi_rw = WRITE;
-			bio_endio(bios[i], -EIO);
-		} else {
-			submit_bio(WRITE, bios[i]);
-		}
-	}
-
-	/* always (try to) flush bitmap to stable storage */
-	drbd_md_flush(mdev);
-
-	/* In case we did not submit a single IO do not wait for
-	 * them to complete. ( Because we would wait forever here. )
-	 *
-	 * In case we had IOs and they are already complete, there
-	 * is no point in waiting anyways.
-	 * Therefore this if () ... */
-	if (atomic_read(&wc.count))
-		wait_for_completion(&wc.io_done);
-
-	put_ldev(mdev);
-
-	kfree(bios);
-	return;
-
-free_bios_submit_one_by_one:
-	/* free everything by calling the endio callback directly. */
-	for (i = 0; i < nr_elements && bios[i]; i++)
-		bio_endio(bios[i], 0);
-
-	kfree(bios);
-
-submit_one_by_one:
-	dev_warn(DEV, "Using the slow drbd_al_to_on_disk_bm()\n");
-
-	for (i = 0; i < mdev->act_log->nr_elements; i++) {
-		enr = lc_element_by_index(mdev->act_log, i)->lc_number;
-		if (enr == LC_FREE)
-			continue;
-		/* Really slow: if we have al-extents 16..19 active,
-		 * sector 4 will be written four times! Synchronous! */
-		drbd_bm_write_sect(mdev, enr/AL_EXT_PER_BM_SECT);
-	}
-
-	lc_unlock(mdev->act_log);
-	wake_up(&mdev->al_wait);
-	put_ldev(mdev);
-}
-
-/**
  * drbd_al_apply_to_bm() - Sets the bitmap to dirty(1) where covered by active AL extents
  * @mdev:	DRBD device.
  */
@@ -813,7 +741,7 @@ static int w_update_odbm(struct drbd_conf *mdev, struct drbd_work *w, int unused
 		return 1;
 	}
 
-	drbd_bm_write_sect(mdev, udw->enr);
+	drbd_bm_write_page(mdev, rs_extent_to_bm_page(udw->enr));
 	put_ldev(mdev);
 
 	kfree(udw);
@@ -893,7 +821,6 @@ static void drbd_try_clear_on_disk_bm(struct drbd_conf *mdev, sector_t sector,
 				dev_warn(DEV, "Kicking resync_lru element enr=%u "
 				     "out with rs_failed=%d\n",
 				     ext->lce.lc_number, ext->rs_failed);
-				set_bit(WRITE_BM_AFTER_RESYNC, &mdev->flags);
 			}
 			ext->rs_left = rs_left;
 			ext->rs_failed = success ? 0 : count;
@@ -912,7 +839,6 @@ static void drbd_try_clear_on_disk_bm(struct drbd_conf *mdev, sector_t sector,
 			drbd_queue_work_front(&mdev->data.work, &udw->w);
 		} else {
 			dev_warn(DEV, "Could not kmalloc an udw\n");
-			set_bit(WRITE_BM_AFTER_RESYNC, &mdev->flags);
 		}
 	}
 } else {
diff --git a/drivers/block/drbd/drbd_bitmap.c b/drivers/block/drbd/drbd_bitmap.c
index 8d959ed6c2cc..72cd41a96ef9 100644
--- a/drivers/block/drbd/drbd_bitmap.c
+++ b/drivers/block/drbd/drbd_bitmap.c
@@ -70,8 +70,7 @@ struct drbd_bitmap {
 	sector_t bm_dev_capacity;
 	struct mutex bm_change; /* serializes resize operations */
 
-	atomic_t bm_async_io;
-	wait_queue_head_t bm_io_wait;
+	wait_queue_head_t bm_io_wait; /* used to serialize IO of single pages */
 
 	unsigned long bm_flags;
 
@@ -82,7 +81,7 @@ struct drbd_bitmap {
 
 /* definition of bits in bm_flags */
 #define BM_LOCKED       0
-#define BM_MD_IO_ERROR  1
+// #define BM_MD_IO_ERROR  1 unused now.
 #define BM_P_VMALLOCED  2
 
 static int __bm_change_bits_to(struct drbd_conf *mdev, const unsigned long s,
@@ -155,26 +154,117 @@ void drbd_bm_unlock(struct drbd_conf *mdev)
 	mutex_unlock(&b->bm_change);
 }
 
-static unsigned int bm_bit_to_page_idx(struct drbd_bitmap *b, u64 bitnr)
+/* we store some "meta" info about our pages in page->private */
+/* at a granularity of 4k storage per bitmap bit:
+ * one peta byte storage: 1<<50 byte, 1<<38 * 4k storage blocks
+ * 1<<38 bits,
+ * 1<<23 4k bitmap pages.
+ * Use 24 bits as page index, covers 2 peta byte storage
+ * at a granularity of 4k per bit.
+ * Used to report the failed page idx on io error from the endio handlers.
+ */
+#define BM_PAGE_IDX_MASK	((1UL<<24)-1)
+/* this page is currently read in, or written back */
+#define BM_PAGE_IO_LOCK		31
+/* if there has been an IO error for this page */
+#define BM_PAGE_IO_ERROR	30
+/* this is to be able to intelligently skip disk IO,
+ * set if bits have been set since last IO. */
+#define BM_PAGE_NEED_WRITEOUT	29
+/* to mark for lazy writeout once syncer cleared all clearable bits,
+ * set if bits have been cleared since last IO. */
+#define BM_PAGE_LAZY_WRITEOUT	28
+
+/* store_page_idx uses non-atomic assignment. It is only used directly after
+ * allocating the page.  All other bm_set_page_* and bm_clear_page_* need to
+ * use atomic bit manipulation, as set_out_of_sync (and therefore bitmap
+ * changes) may happen from various contexts, and wait_on_bit/wake_up_bit
+ * requires it all to be atomic as well. */
+static void bm_store_page_idx(struct page *page, unsigned long idx)
 {
-	/* page_nr = (bitnr/8) >> PAGE_SHIFT; */
-	unsigned int page_nr = bitnr >> (PAGE_SHIFT + 3);
-	BUG_ON(page_nr >= b->bm_number_of_pages);
-	return page_nr;
+	BUG_ON(0 != (idx & ~BM_PAGE_IDX_MASK));
+	page_private(page) |= idx;
 }
 
-/* word offset to long pointer */
-static unsigned long *__bm_map_paddr(struct drbd_bitmap *b, unsigned long offset, const enum km_type km)
+static unsigned long bm_page_to_idx(struct page *page)
 {
-	struct page *page;
-	unsigned long page_nr;
+	return page_private(page) & BM_PAGE_IDX_MASK;
+}
+
+/* As it is very unlikely that the same page is under IO from more than one
+ * context, we can get away with a bit per page and one wait queue per bitmap.
+ */
+static void bm_page_lock_io(struct drbd_conf *mdev, int page_nr)
+{
+	struct drbd_bitmap *b = mdev->bitmap;
+	void *addr = &page_private(b->bm_pages[page_nr]);
+	wait_event(b->bm_io_wait, !test_and_set_bit(BM_PAGE_IO_LOCK, addr));
+}
+
+static void bm_page_unlock_io(struct drbd_conf *mdev, int page_nr)
+{
+	struct drbd_bitmap *b = mdev->bitmap;
+	void *addr = &page_private(b->bm_pages[page_nr]);
+	clear_bit(BM_PAGE_IO_LOCK, addr);
+	smp_mb__after_clear_bit();
+	wake_up(&mdev->bitmap->bm_io_wait);
+}
+
+/* set _before_ submit_io, so it may be reset due to being changed
+ * while this page is in flight... will get submitted later again */
+static void bm_set_page_unchanged(struct page *page)
+{
+	/* use cmpxchg? */
+	clear_bit(BM_PAGE_NEED_WRITEOUT, &page_private(page));
+	clear_bit(BM_PAGE_LAZY_WRITEOUT, &page_private(page));
+}
+
+static void bm_set_page_need_writeout(struct page *page)
+{
+	set_bit(BM_PAGE_NEED_WRITEOUT, &page_private(page));
+}
+
+static int bm_test_page_unchanged(struct page *page)
+{
+	volatile const unsigned long *addr = &page_private(page);
+	return (*addr & ((1UL<<BM_PAGE_NEED_WRITEOUT)|(1UL<<BM_PAGE_LAZY_WRITEOUT))) == 0;
+}
 
+static void bm_set_page_io_err(struct page *page)
+{
+	set_bit(BM_PAGE_IO_ERROR, &page_private(page));
+}
+
+static void bm_clear_page_io_err(struct page *page)
+{
+	clear_bit(BM_PAGE_IO_ERROR, &page_private(page));
+}
+
+static void bm_set_page_lazy_writeout(struct page *page)
+{
+	set_bit(BM_PAGE_LAZY_WRITEOUT, &page_private(page));
+}
+
+static int bm_test_page_lazy_writeout(struct page *page)
+{
+	return test_bit(BM_PAGE_LAZY_WRITEOUT, &page_private(page));
+}
+
+/* on a 32bit box, this would allow for exactly (2<<38) bits. */
+static unsigned int bm_word_to_page_idx(struct drbd_bitmap *b, unsigned long long_nr)
+{
 	/* page_nr = (word*sizeof(long)) >> PAGE_SHIFT; */
-	page_nr = offset >> (PAGE_SHIFT - LN2_BPL + 3);
+	unsigned int page_nr = long_nr >> (PAGE_SHIFT - LN2_BPL + 3);
 	BUG_ON(page_nr >= b->bm_number_of_pages);
-	page = b->bm_pages[page_nr];
+	return page_nr;
+}
 
-	return (unsigned long *) kmap_atomic(page, km);
+static unsigned int bm_bit_to_page_idx(struct drbd_bitmap *b, u64 bitnr)
+{
+	/* page_nr = (bitnr/8) >> PAGE_SHIFT; */
+	unsigned int page_nr = bitnr >> (PAGE_SHIFT + 3);
+	BUG_ON(page_nr >= b->bm_number_of_pages);
+	return page_nr;
 }
 
 static unsigned long *__bm_map_pidx(struct drbd_bitmap *b, unsigned int idx, const enum km_type km)
@@ -188,11 +278,6 @@ static unsigned long *bm_map_pidx(struct drbd_bitmap *b, unsigned int idx)
 	return __bm_map_pidx(b, idx, KM_IRQ1);
 }
 
-static unsigned long * bm_map_paddr(struct drbd_bitmap *b, unsigned long offset)
-{
-	return __bm_map_paddr(b, offset, KM_IRQ1);
-}
-
 static void __bm_unmap(unsigned long *p_addr, const enum km_type km)
 {
 	kunmap_atomic(p_addr, km);
@@ -222,6 +307,7 @@ static void bm_unmap(unsigned long *p_addr)
  * to be able to report device specific.
  */
 
+
 static void bm_free_pages(struct page **pages, unsigned long number)
 {
 	unsigned long i;
@@ -289,6 +375,9 @@ static struct page **bm_realloc_pages(struct drbd_bitmap *b, unsigned long want)
 				bm_vk_free(new_pages, vmalloced);
 				return NULL;
 			}
+			/* we want to know which page it is
+			 * from the endio handlers */
+			bm_store_page_idx(page, i);
 			new_pages[i] = page;
 		}
 	} else {
@@ -443,7 +532,7 @@ static unsigned long bm_count_bits(struct drbd_bitmap *b)
 
 	while (offset < words) {
 		i = do_now = min_t(size_t, words-offset, LWPP);
-		p_addr = __bm_map_paddr(b, offset, KM_USER0);
+		p_addr = __bm_map_pidx(b, bm_word_to_page_idx(b, offset), KM_USER0);
 		bm = p_addr + MLPP(offset);
 		while (i--) {
 			bits += hweight_long(*bm++);
@@ -472,6 +561,7 @@ static unsigned long bm_count_bits(struct drbd_bitmap *b)
 static void bm_memset(struct drbd_bitmap *b, size_t offset, int c, size_t len)
 {
 	unsigned long *p_addr, *bm;
+	unsigned int idx;
 	size_t do_now, end;
 
 #define BM_SECTORS_PER_BIT (BM_BLOCK_SIZE/512)
@@ -485,7 +575,8 @@ static void bm_memset(struct drbd_bitmap *b, size_t offset, int c, size_t len)
 
 	while (offset < end) {
 		do_now = min_t(size_t, ALIGN(offset + 1, LWPP), end) - offset;
-		p_addr = bm_map_paddr(b, offset);
+		idx = bm_word_to_page_idx(b, offset);
+		p_addr = bm_map_pidx(b, idx);
 		bm = p_addr + MLPP(offset);
 		if (bm+do_now > p_addr + LWPP) {
 			printk(KERN_ALERT "drbd: BUG BUG BUG! p_addr:%p bm:%p do_now:%d\n",
@@ -494,6 +585,7 @@ static void bm_memset(struct drbd_bitmap *b, size_t offset, int c, size_t len)
 		}
 		memset(bm, c, do_now * sizeof(long));
 		bm_unmap(p_addr);
+		bm_set_page_need_writeout(b->bm_pages[idx]);
 		offset += do_now;
 	}
 }
@@ -604,7 +696,7 @@ int drbd_bm_resize(struct drbd_conf *mdev, sector_t capacity, int set_new_bits)
 		bm_free_pages(opages + want, have - want);
 	}
 
-	p_addr = bm_map_paddr(b, words);
+	p_addr = bm_map_pidx(b, bm_word_to_page_idx(b, words));
 	bm = p_addr + MLPP(words);
 	*bm = DRBD_MAGIC;
 	bm_unmap(p_addr);
@@ -616,7 +708,7 @@ int drbd_bm_resize(struct drbd_conf *mdev, sector_t capacity, int set_new_bits)
 	bm_vk_free(opages, opages_vmalloced);
 	if (!growing)
 		b->bm_set = bm_count_bits(b);
-	dev_info(DEV, "resync bitmap: bits=%lu words=%lu\n", bits, words);
+	dev_info(DEV, "resync bitmap: bits=%lu words=%lu pages=%lu\n", bits, words, want);
 
  out:
 	drbd_bm_unlock(mdev);
@@ -686,6 +778,7 @@ void drbd_bm_merge_lel(struct drbd_conf *mdev, size_t offset, size_t number,
 	struct drbd_bitmap *b = mdev->bitmap;
 	unsigned long *p_addr, *bm;
 	unsigned long word, bits;
+	unsigned int idx;
 	size_t end, do_now;
 
 	end = offset + number;
@@ -700,7 +793,8 @@ void drbd_bm_merge_lel(struct drbd_conf *mdev, size_t offset, size_t number,
 	spin_lock_irq(&b->bm_lock);
 	while (offset < end) {
 		do_now = min_t(size_t, ALIGN(offset+1, LWPP), end) - offset;
-		p_addr = bm_map_paddr(b, offset);
+		idx = bm_word_to_page_idx(b, offset);
+		p_addr = bm_map_pidx(b, idx);
 		bm = p_addr + MLPP(offset);
 		offset += do_now;
 		while (do_now--) {
@@ -710,6 +804,7 @@ void drbd_bm_merge_lel(struct drbd_conf *mdev, size_t offset, size_t number,
 			b->bm_set += hweight_long(word) - bits;
 		}
 		bm_unmap(p_addr);
+		bm_set_page_need_writeout(b->bm_pages[idx]);
 	}
 	/* with 32bit <-> 64bit cross-platform connect
 	 * this is only correct for current usage,
@@ -748,7 +843,7 @@ void drbd_bm_get_lel(struct drbd_conf *mdev, size_t offset, size_t number,
 	else {
 		while (offset < end) {
 			do_now = min_t(size_t, ALIGN(offset+1, LWPP), end) - offset;
-			p_addr = bm_map_paddr(b, offset);
+			p_addr = bm_map_pidx(b, bm_word_to_page_idx(b, offset));
 			bm = p_addr + MLPP(offset);
 			offset += do_now;
 			while (do_now--)
@@ -786,9 +881,22 @@ void drbd_bm_clear_all(struct drbd_conf *mdev)
 	spin_unlock_irq(&b->bm_lock);
 }
 
+struct bm_aio_ctx {
+	struct drbd_conf *mdev;
+	atomic_t in_flight;
+	wait_queue_head_t io_wait;
+	unsigned flags;
+#define BM_AIO_COPY_PAGES	1
+	int error;
+};
+
+/* bv_page may be a copy, or may be the original */
 static void bm_async_io_complete(struct bio *bio, int error)
 {
-	struct drbd_bitmap *b = bio->bi_private;
+	struct bm_aio_ctx *ctx = bio->bi_private;
+	struct drbd_conf *mdev = ctx->mdev;
+	struct drbd_bitmap *b = mdev->bitmap;
+	unsigned int idx = bm_page_to_idx(bio->bi_io_vec[0].bv_page);
 	int uptodate = bio_flagged(bio, BIO_UPTODATE);
 
 
@@ -799,35 +907,79 @@ static void bm_async_io_complete(struct bio *bio, int error)
 	if (!error && !uptodate)
 		error = -EIO;
 
+	if (!bm_test_page_unchanged(b->bm_pages[idx]))
+		dev_info(DEV, "bitmap page idx %u changed during IO!\n", idx);
+
 	if (error) {
-		/* doh. what now?
-		 * for now, set all bits, and flag MD_IO_ERROR */
-		__set_bit(BM_MD_IO_ERROR, &b->bm_flags);
+		/* ctx error will hold the completed-last non-zero error code,
+		 * in case error codes differ. */
+		ctx->error = error;
+		bm_set_page_io_err(b->bm_pages[idx]);
+		/* Not identical to on disk version of it.
+		 * Is BM_PAGE_IO_ERROR enough? */
+		if (__ratelimit(&drbd_ratelimit_state))
+			dev_err(DEV, "IO ERROR %d on bitmap page idx %u\n",
+					error, idx);
+	} else {
+		bm_clear_page_io_err(b->bm_pages[idx]);
+		dynamic_dev_dbg(DEV, "bitmap page idx %u completed\n", idx);
 	}
-	if (atomic_dec_and_test(&b->bm_async_io))
-		wake_up(&b->bm_io_wait);
+
+	bm_page_unlock_io(mdev, idx);
+
+	/* FIXME give back to page pool */
+	if (ctx->flags & BM_AIO_COPY_PAGES)
+		put_page(bio->bi_io_vec[0].bv_page);
 
 	bio_put(bio);
+
+	if (atomic_dec_and_test(&ctx->in_flight))
+		wake_up(&ctx->io_wait);
 }
 
-static void bm_page_io_async(struct drbd_conf *mdev, struct drbd_bitmap *b, int page_nr, int rw) __must_hold(local)
+static void bm_page_io_async(struct bm_aio_ctx *ctx, int page_nr, int rw) __must_hold(local)
 {
 	/* we are process context. we always get a bio */
 	struct bio *bio = bio_alloc(GFP_KERNEL, 1);
+	struct drbd_conf *mdev = ctx->mdev;
+	struct drbd_bitmap *b = mdev->bitmap;
+	struct page *page;
 	unsigned int len;
+
 	sector_t on_disk_sector =
 		mdev->ldev->md.md_offset + mdev->ldev->md.bm_offset;
 	on_disk_sector += ((sector_t)page_nr) << (PAGE_SHIFT-9);
 
 	/* this might happen with very small
-	 * flexible external meta data device */
+	 * flexible external meta data device,
+	 * or with PAGE_SIZE > 4k */
 	len = min_t(unsigned int, PAGE_SIZE,
 		(drbd_md_last_sector(mdev->ldev) - on_disk_sector + 1)<<9);
 
+	/* serialize IO on this page */
+	bm_page_lock_io(mdev, page_nr);
+	/* before memcpy and submit,
+	 * so it can be redirtied any time */
+	bm_set_page_unchanged(b->bm_pages[page_nr]);
+
+	if (ctx->flags & BM_AIO_COPY_PAGES) {
+		/* FIXME alloc_page is good enough for now, but actually needs
+		 * to use pre-allocated page pool */
+		void *src, *dest;
+		page = alloc_page(__GFP_HIGHMEM|__GFP_WAIT);
+		dest = kmap_atomic(page, KM_USER0);
+		src = kmap_atomic(b->bm_pages[page_nr], KM_USER1);
+		memcpy(dest, src, PAGE_SIZE);
+		kunmap_atomic(src, KM_USER1);
+		kunmap_atomic(dest, KM_USER0);
+		bm_store_page_idx(page, page_nr);
+	} else
+		page = b->bm_pages[page_nr];
+
 	bio->bi_bdev = mdev->ldev->md_bdev;
 	bio->bi_sector = on_disk_sector;
-	bio_add_page(bio, b->bm_pages[page_nr], len, 0);
-	bio->bi_private = b;
+	bio_add_page(bio, page, len, 0);
+	bio->bi_private = ctx;
 	bio->bi_end_io = bm_async_io_complete;
 
 	if (drbd_insert_fault(mdev, (rw & WRITE) ? DRBD_FAULT_MD_WR : DRBD_FAULT_MD_RD)) {
@@ -841,36 +993,72 @@ static void bm_page_io_async(struct drbd_conf *mdev, struct drbd_bitmap *b, int
 /*
  * bm_rw: read/write the whole bitmap from/to its on disk location.
  */
-static int bm_rw(struct drbd_conf *mdev, int rw) __must_hold(local)
+static int bm_rw(struct drbd_conf *mdev, int rw, unsigned lazy_writeout_upper_idx) __must_hold(local)
 {
+	struct bm_aio_ctx ctx =
+		{ .flags = lazy_writeout_upper_idx ? BM_AIO_COPY_PAGES : 0 };
 	struct drbd_bitmap *b = mdev->bitmap;
-	/* sector_t sector; */
-	int bm_words, num_pages, i;
+	int last_page, i, count = 0;
 	unsigned long now;
 	char ppb[10];
 	int err = 0;
 
-	WARN_ON(!bm_is_locked(b));
-
-	/* no spinlock here, the drbd_bm_lock should be enough! */
+	/*
+	 * We are protected against bitmap disappearing/resizing by holding an
+	 * ldev reference (caller must have called get_ldev()).
+	 * For read/write, we are protected against changes to the bitmap by
+	 * the bitmap lock (see drbd_bitmap_io).
+	 * For lazy writeout, we don't care for ongoing changes to the bitmap,
+	 * as we submit copies of pages anyways.
+	 */
+	if (!ctx.flags)
+		WARN_ON(!bm_is_locked(b));
 
-	bm_words  = drbd_bm_words(mdev);
-	num_pages = (bm_words*sizeof(long) + PAGE_SIZE-1) >> PAGE_SHIFT;
+	/* because of the "extra long to catch oob access" we allocate in
+	 * drbd_bm_resize, bm_number_of_pages -1 is not necessarily the page
+	 * containing the last _relevant_ bitmap word */
+	last_page = bm_word_to_page_idx(b, b->bm_words - 1);
 
 	now = jiffies;
-	atomic_set(&b->bm_async_io, num_pages);
-	__clear_bit(BM_MD_IO_ERROR, &b->bm_flags);
+	ctx.mdev = mdev;
+	atomic_set(&ctx.in_flight, 1); /* one extra ref */
+	init_waitqueue_head(&ctx.io_wait);
+	ctx.error = 0;
 
 	/* let the layers below us try to merge these bios... */
-	for (i = 0; i < num_pages; i++)
-		bm_page_io_async(mdev, b, i, rw);
+	for (i = 0; i <= last_page; i++) {
+		/* ignore completely unchanged pages */
+		if (lazy_writeout_upper_idx && i == lazy_writeout_upper_idx)
+			break;
+		if (rw & WRITE) {
+			if (bm_test_page_unchanged(b->bm_pages[i])) {
+				dynamic_dev_dbg(DEV, "skipped bm write for idx %u\n", i);
+				continue;
+			}
+			/* during lazy writeout,
+			 * ignore those pages not marked for lazy writeout. */
+			if (lazy_writeout_upper_idx &&
+			    !bm_test_page_lazy_writeout(b->bm_pages[i])) {
+				dynamic_dev_dbg(DEV, "skipped bm lazy write for idx %u\n", i);
+				continue;
+			}
+		}
+		atomic_inc(&ctx.in_flight);
+		bm_page_io_async(&ctx, i, rw);
+		++count;
+		cond_resched();
+	}
 
-	wait_event(b->bm_io_wait, atomic_read(&b->bm_async_io) == 0);
+	atomic_dec(&ctx.in_flight); /* drop the extra ref */
+	wait_event(ctx.io_wait, atomic_read(&ctx.in_flight) == 0);
+	dev_info(DEV, "bitmap %s of %u pages took %lu jiffies\n",
+			rw == WRITE ? "WRITE" : "READ",
+			count, jiffies - now);
 
-	if (test_bit(BM_MD_IO_ERROR, &b->bm_flags)) {
+	if (ctx.error) {
 		dev_alert(DEV, "we had at least one MD IO ERROR during bitmap IO\n");
 		drbd_chk_io_error(mdev, 1, true);
-		err = -EIO;
+		err = -EIO; /* ctx.error ? */
 	}
 
 	now = jiffies;
@@ -895,55 +1083,63 @@ static int bm_rw(struct drbd_conf *mdev, int rw) __must_hold(local)
  */
 int drbd_bm_read(struct drbd_conf *mdev) __must_hold(local)
 {
-	return bm_rw(mdev, READ);
+	return bm_rw(mdev, READ, 0);
 }
 
 /**
  * drbd_bm_write() - Write the whole bitmap to its on disk location.
  * @mdev:	DRBD device.
+ *
+ * Will only write pages that have changed since last IO.
  */
 int drbd_bm_write(struct drbd_conf *mdev) __must_hold(local)
 {
-	return bm_rw(mdev, WRITE);
+	return bm_rw(mdev, WRITE, 0);
 }
 
 /**
- * drbd_bm_write_sect: Writes a 512 (MD_SECTOR_SIZE) byte piece of the bitmap
+ * drbd_bm_write_lazy() - Write bitmap pages 0 to @upper_idx-1, if they have changed.
  * @mdev:	DRBD device.
- * @enr:	Extent number in the resync lru (happens to be sector offset)
+ * @upper_idx:	0: write all changed pages; +ve: page index to stop scanning for changed pages
+ */
+int drbd_bm_write_lazy(struct drbd_conf *mdev, unsigned upper_idx) __must_hold(local)
+{
+	return bm_rw(mdev, WRITE, upper_idx);
+}
+
+
+/**
+ * drbd_bm_write_page: Writes a PAGE_SIZE aligned piece of bitmap
+ * @mdev:	DRBD device.
+ * @idx:	bitmap page index
  *
- * The BM_EXT_SIZE is on purpose exactly the amount of the bitmap covered
- * by a single sector write. Therefore enr == sector offset from the
- * start of the bitmap.
+ * We don't want to special case on logical_block_size of the underlying
+ * device, so we submit PAGE_SIZE aligned pieces containing the requested enr.
+ * Note that on "most" systems, PAGE_SIZE is 4k.
  */
-int drbd_bm_write_sect(struct drbd_conf *mdev, unsigned long enr) __must_hold(local)
+int drbd_bm_write_page(struct drbd_conf *mdev, unsigned int idx) __must_hold(local)
 {
-	sector_t on_disk_sector = enr + mdev->ldev->md.md_offset
-				  + mdev->ldev->md.bm_offset;
-	int bm_words, num_words, offset;
-	int err = 0;
+	struct bm_aio_ctx ctx = { .flags = BM_AIO_COPY_PAGES, };
 
-	mutex_lock(&mdev->md_io_mutex);
-	bm_words  = drbd_bm_words(mdev);
-	offset    = S2W(enr);	/* word offset into bitmap */
-	num_words = min(S2W(1), bm_words - offset);
-	if (num_words < S2W(1))
-		memset(page_address(mdev->md_io_page), 0, MD_SECTOR_SIZE);
-	drbd_bm_get_lel(mdev, offset, num_words,
-			page_address(mdev->md_io_page));
-	if (!drbd_md_sync_page_io(mdev, mdev->ldev, on_disk_sector, WRITE)) {
-		int i;
-		err = -EIO;
-		dev_err(DEV, "IO ERROR writing bitmap sector %lu "
-		    "(meta-disk sector %llus)\n",
-		    enr, (unsigned long long)on_disk_sector);
-		drbd_chk_io_error(mdev, 1, true);
-		for (i = 0; i < AL_EXT_PER_BM_SECT; i++)
-			drbd_bm_ALe_set_all(mdev, enr*AL_EXT_PER_BM_SECT+i);
+	if (bm_test_page_unchanged(mdev->bitmap->bm_pages[idx])) {
+		dev_info(DEV, "skipped bm page write for idx %u\n", idx);
+		return 0;
 	}
+
+	ctx.mdev = mdev;
+	atomic_set(&ctx.in_flight, 1);
+	init_waitqueue_head(&ctx.io_wait);
+
+	bm_page_io_async(&ctx, idx, WRITE_SYNC);
+	wait_event(ctx.io_wait, atomic_read(&ctx.in_flight) == 0);
+
+	if (ctx.error)
+		drbd_chk_io_error(mdev, 1, true);
+	/* that should force detach, so the in memory bitmap will be
+	 * gone in a moment as well. */
+
 	mdev->bm_writ_cnt++;
-	mutex_unlock(&mdev->md_io_mutex);
-	return err;
+	return ctx.error;
 }
 
 /* NOTE
@@ -965,10 +1161,9 @@ static unsigned long __bm_find_next(struct drbd_conf *mdev, unsigned long bm_fo,
 		dev_err(DEV, "bm_fo=%lu bm_bits=%lu\n", bm_fo, b->bm_bits);
 	} else {
 		while (bm_fo < b->bm_bits) {
-			unsigned long offset;
-			bit_offset = bm_fo & ~BPP_MASK; /* bit offset of the page */
-			offset = bit_offset >> LN2_BPL;	/* word offset of the page */
-			p_addr = __bm_map_paddr(b, offset, km);
+			/* bit offset of the first bit in the page */
+			bit_offset = bm_fo & ~BPP_MASK;
+			p_addr = __bm_map_pidx(b, bm_bit_to_page_idx(b, bm_fo), km);
 
 			if (find_zero_bit)
 				i = generic_find_next_zero_le_bit(p_addr, PAGE_SIZE*8, bm_fo & BPP_MASK);
@@ -1048,8 +1243,9 @@ static int __bm_change_bits_to(struct drbd_conf *mdev, const unsigned long s,
 	struct drbd_bitmap *b = mdev->bitmap;
 	unsigned long *p_addr = NULL;
 	unsigned long bitnr;
-	unsigned long last_page_nr = -1UL;
+	unsigned int last_page_nr = -1U;
 	int c = 0;
+	int changed_total = 0;
 
 	if (e >= b->bm_bits) {
 		dev_err(DEV, "ASSERT FAILED: bit_s=%lu bit_e=%lu bm_bits=%lu\n",
@@ -1057,12 +1253,17 @@ static int __bm_change_bits_to(struct drbd_conf *mdev, const unsigned long s,
 		e = b->bm_bits ? b->bm_bits -1 : 0;
 	}
 	for (bitnr = s; bitnr <= e; bitnr++) {
-		unsigned long offset = bitnr>>LN2_BPL;
-		unsigned long page_nr = offset >> (PAGE_SHIFT - LN2_BPL + 3);
+		unsigned int page_nr = bm_bit_to_page_idx(b, bitnr);
 		if (page_nr != last_page_nr) {
 			if (p_addr)
 				__bm_unmap(p_addr, km);
-			p_addr = __bm_map_paddr(b, offset, km);
+			if (c < 0)
+				bm_set_page_lazy_writeout(b->bm_pages[last_page_nr]);
+			else if (c > 0)
+				bm_set_page_need_writeout(b->bm_pages[last_page_nr]);
+			changed_total += c;
+			c = 0;
+			p_addr = __bm_map_pidx(b, page_nr, km);
 			last_page_nr = page_nr;
 		}
 		if (val)
@@ -1072,8 +1273,13 @@ static int __bm_change_bits_to(struct drbd_conf *mdev, const unsigned long s,
 	}
 	if (p_addr)
 		__bm_unmap(p_addr, km);
-	b->bm_set += c;
-	return c;
+	if (c < 0)
+		bm_set_page_lazy_writeout(b->bm_pages[last_page_nr]);
+	else if (c > 0)
+		bm_set_page_need_writeout(b->bm_pages[last_page_nr]);
+	changed_total += c;
+	b->bm_set += changed_total;
+	return changed_total;
 }
 
 /* returns number of bits actually changed.
@@ -1211,8 +1417,7 @@ int drbd_bm_test_bit(struct drbd_conf *mdev, const unsigned long bitnr)
 	if (bm_is_locked(b))
 		bm_print_lock_info(mdev);
 	if (bitnr < b->bm_bits) {
-		unsigned long offset = bitnr>>LN2_BPL;
-		p_addr = bm_map_paddr(b, offset);
+		p_addr = bm_map_pidx(b, bm_bit_to_page_idx(b, bitnr));
 		i = generic_test_le_bit(bitnr & BPP_MASK, p_addr) ? 1 : 0;
 		bm_unmap(p_addr);
 	} else if (bitnr == b->bm_bits) {
@@ -1231,10 +1436,10 @@ int drbd_bm_count_bits(struct drbd_conf *mdev, const unsigned long s, const unsi
 {
 	unsigned long flags;
 	struct drbd_bitmap *b = mdev->bitmap;
-	unsigned long *p_addr = NULL, page_nr = -1;
+	unsigned long *p_addr = NULL;
 	unsigned long bitnr;
+	unsigned int page_nr = -1U;
 	int c = 0;
-	size_t w;
 
 	/* If this is called without a bitmap, that is a bug. But just to be
 	 * robust in case we screwed up elsewhere, in that case pretend there
@@ -1247,12 +1452,12 @@ int drbd_bm_count_bits(struct drbd_conf *mdev, const unsigned long s, const unsi
 	if (bm_is_locked(b))
 		bm_print_lock_info(mdev);
 	for (bitnr = s; bitnr <= e; bitnr++) {
-		w = bitnr >> LN2_BPL;
-		if (page_nr != w >> (PAGE_SHIFT - LN2_BPL + 3)) {
-			page_nr = w >> (PAGE_SHIFT - LN2_BPL + 3);
+		unsigned int idx = bm_bit_to_page_idx(b, bitnr);
+		if (page_nr != idx) {
+			page_nr = idx;
 			if (p_addr)
 				bm_unmap(p_addr);
-			p_addr = bm_map_paddr(b, w);
+			p_addr = bm_map_pidx(b, idx);
 		}
 		ERR_IF (bitnr >= b->bm_bits) {
 			dev_err(DEV, "bitnr=%lu bm_bits=%lu\n", bitnr, b->bm_bits);
@@ -1300,7 +1505,7 @@ int drbd_bm_e_weight(struct drbd_conf *mdev, unsigned long enr)
 	count = 0;
 	if (s < b->bm_words) {
 		int n = e-s;
-		p_addr = bm_map_paddr(b, s);
+		p_addr = bm_map_pidx(b, bm_word_to_page_idx(b, s));
 		bm = p_addr + MLPP(s);
 		while (n--)
 			count += hweight_long(*bm++);
@@ -1335,7 +1540,7 @@ unsigned long drbd_bm_ALe_set_all(struct drbd_conf *mdev, unsigned long al_enr)
 	count = 0;
 	if (s < b->bm_words) {
 		i = do_now = e-s;
-		p_addr = bm_map_paddr(b, s);
+		p_addr = bm_map_pidx(b, bm_word_to_page_idx(b, s));
 		bm = p_addr + MLPP(s);
 		while (i--) {
 			count += hweight_long(*bm);
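
One pattern from the drbd_bitmap.c hunks above deserves a note: bm_rw() seeds ctx.in_flight with one extra reference before submitting, so the counter cannot reach zero (and wake the waiter) while the submit loop is still running; the submitter drops its reference only after the loop. Reduced to its bare shape (a sketch using the same kernel primitives, with the actual bio submission elided and a hypothetical struct name):

#include <linux/atomic.h>
#include <linux/wait.h>

struct bm_aio_ctx_sketch {
	atomic_t in_flight;
	wait_queue_head_t io_wait;
};

static void submit_and_wait(struct bm_aio_ctx_sketch *ctx, int npages)
{
	int i;

	atomic_set(&ctx->in_flight, 1);		/* extra ref held by submitter */
	init_waitqueue_head(&ctx->io_wait);

	for (i = 0; i < npages; i++) {
		atomic_inc(&ctx->in_flight);	/* one ref per submitted page */
		/* ... build and submit the bio for page i; its endio handler
		 * does atomic_dec_and_test() and wake_up() on completion ... */
	}

	atomic_dec(&ctx->in_flight);		/* drop the extra ref */
	wait_event(ctx->io_wait, atomic_read(&ctx->in_flight) == 0);
}

Without that extra reference, a fast completion of the first bio could drive in_flight to zero and wake the waiter before the remaining pages were even submitted.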
diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h
index f6da48bb8c70..74cc50a21822 100644
--- a/drivers/block/drbd/drbd_int.h
+++ b/drivers/block/drbd/drbd_int.h
@@ -833,7 +833,7 @@ enum {
 	CRASHED_PRIMARY,	/* This node was a crashed primary.
 				 * Gets cleared when the state.conn
 				 * goes into C_CONNECTED state. */
-	WRITE_BM_AFTER_RESYNC,	/* A kmalloc() during resync failed */
+	NO_BARRIER_SUPP,	/* underlying block device doesn't implement barriers */
 	CONSIDER_RESYNC,
 
 	MD_NO_FUA,		/* User wants us to not use FUA/FLUSH on meta data dev */
@@ -1428,7 +1428,7 @@ extern void _drbd_bm_set_bits(struct drbd_conf *mdev,
 		const unsigned long s, const unsigned long e);
 extern int  drbd_bm_test_bit(struct drbd_conf *mdev, unsigned long bitnr);
 extern int  drbd_bm_e_weight(struct drbd_conf *mdev, unsigned long enr);
-extern int  drbd_bm_write_sect(struct drbd_conf *mdev, unsigned long enr) __must_hold(local);
+extern int  drbd_bm_write_page(struct drbd_conf *mdev, unsigned int idx) __must_hold(local);
 extern int  drbd_bm_read(struct drbd_conf *mdev) __must_hold(local);
 extern int  drbd_bm_write(struct drbd_conf *mdev) __must_hold(local);
 extern unsigned long drbd_bm_ALe_set_all(struct drbd_conf *mdev,
@@ -1446,7 +1446,7 @@ extern int drbd_bm_rs_done(struct drbd_conf *mdev);
 /* for receive_bitmap */
 extern void drbd_bm_merge_lel(struct drbd_conf *mdev, size_t offset,
 		size_t number, unsigned long *buffer);
-/* for _drbd_send_bitmap and drbd_bm_write_sect */
+/* for _drbd_send_bitmap */
 extern void drbd_bm_get_lel(struct drbd_conf *mdev, size_t offset,
 		size_t number, unsigned long *buffer);
 
@@ -1641,7 +1641,6 @@ extern int __drbd_set_out_of_sync(struct drbd_conf *mdev, sector_t sector,
 #define drbd_set_out_of_sync(mdev, sector, size) \
 	__drbd_set_out_of_sync(mdev, sector, size, __FILE__, __LINE__)
 extern void drbd_al_apply_to_bm(struct drbd_conf *mdev);
-extern void drbd_al_to_on_disk_bm(struct drbd_conf *mdev);
 extern void drbd_al_shrink(struct drbd_conf *mdev);
 
 
diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c
index 67fffad213ec..57ed7181742d 100644
--- a/drivers/block/drbd/drbd_main.c
+++ b/drivers/block/drbd/drbd_main.c
@@ -1289,6 +1289,26 @@ static void abw_start_sync(struct drbd_conf *mdev, int rv)
 	}
 }
 
+int drbd_bitmap_io_from_worker(struct drbd_conf *mdev, int (*io_fn)(struct drbd_conf *), char *why)
+{
+	int rv;
+
+	D_ASSERT(current == mdev->worker.task);
+
+	/* open coded non-blocking drbd_suspend_io(mdev); */
+	set_bit(SUSPEND_IO, &mdev->flags);
+	if (!is_susp(mdev->state))
+		D_ASSERT(atomic_read(&mdev->ap_bio_cnt) == 0);
+
+	drbd_bm_lock(mdev, why);
+	rv = io_fn(mdev);
+	drbd_bm_unlock(mdev);
+
+	drbd_resume_io(mdev);
+
+	return rv;
+}
+
 /**
  * after_state_ch() - Perform after state change actions that may sleep
  * @mdev:	DRBD device.
@@ -1404,7 +1424,12 @@ static void after_state_ch(struct drbd_conf *mdev, union drbd_state os,
 
 		/* D_DISKLESS Peer becomes secondary */
 		if (os.peer == R_PRIMARY && ns.peer == R_SECONDARY)
-			drbd_al_to_on_disk_bm(mdev);
+			drbd_bitmap_io_from_worker(mdev, &drbd_bm_write, "demote diskless peer");
+		put_ldev(mdev);
+	}
+
+	if (os.role == R_PRIMARY && ns.role == R_SECONDARY && get_ldev(mdev)) {
+		drbd_bitmap_io_from_worker(mdev, &drbd_bm_write, "demote");
 		put_ldev(mdev);
 	}
 
diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c
index 2f0724982143..77dc022eaf6b 100644
--- a/drivers/block/drbd/drbd_nl.c
+++ b/drivers/block/drbd/drbd_nl.c
@@ -407,10 +407,8 @@ drbd_set_role(struct drbd_conf *mdev, enum drbd_role new_role, int force)
 		}
 	}
 
-	if ((new_role == R_SECONDARY) && get_ldev(mdev)) {
-		drbd_al_to_on_disk_bm(mdev);
-		put_ldev(mdev);
-	}
+	/* writeout of activity log covered areas of the bitmap
+	 * to stable storage done in after state change already */
 
 	if (mdev->state.conn >= C_WF_REPORT_PARAMS) {
 		/* if this was forced, we should consider sync */
@@ -1174,7 +1172,10 @@ static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp
 
 	if (cp_discovered) {
 		drbd_al_apply_to_bm(mdev);
-		drbd_al_to_on_disk_bm(mdev);
+		if (drbd_bitmap_io(mdev, &drbd_bm_write, "crashed primary apply AL")) {
+			retcode = ERR_IO_MD_DISK;
+			goto force_diskless_dec;
+		}
 	}
 
 	if (_drbd_bm_total_weight(mdev) == drbd_bm_bits(mdev))
diff --git a/drivers/block/drbd/drbd_worker.c b/drivers/block/drbd/drbd_worker.c
index 9fe3e890da0f..d17f2ed777ce 100644
--- a/drivers/block/drbd/drbd_worker.c
+++ b/drivers/block/drbd/drbd_worker.c
@@ -907,10 +907,8 @@ out:
 
 	drbd_md_sync(mdev);
 
-	if (test_and_clear_bit(WRITE_BM_AFTER_RESYNC, &mdev->flags)) {
-		dev_info(DEV, "Writing the whole bitmap\n");
-		drbd_queue_bitmap_io(mdev, &drbd_bm_write, NULL, "write from resync_finished");
-	}
+	dev_info(DEV, "Writing changed bitmap pages\n");
+	drbd_queue_bitmap_io(mdev, &drbd_bm_write, NULL, "write from resync_finished");
 
 	if (khelper_cmd)
 		drbd_khelper(mdev, khelper_cmd);
@@ -1127,7 +1125,6 @@ void drbd_ov_oos_found(struct drbd_conf *mdev, sector_t sector, int size)
 		mdev->ov_last_oos_size = size>>9;
 	}
 	drbd_set_out_of_sync(mdev, sector, size);
-	set_bit(WRITE_BM_AFTER_RESYNC, &mdev->flags);
 }
 
 int w_e_end_ov_reply(struct drbd_conf *mdev, struct drbd_work *w, int cancel)