author     Lars Ellenberg <lars.ellenberg@linbit.com>	2010-12-15 02:59:11 -0500
committer  Philipp Reisner <philipp.reisner@linbit.com>	2011-03-10 05:43:19 -0500
commit     19f843aa08e2d8f87a09b4c2edc43b00638423a8
tree       49919bd17ba7e03eb7cb76175910714d55704997 /drivers/block/drbd/drbd_bitmap.c
parent     95a0f10cddbf93ce89c175ac1c53dad2d20ad309

drbd: bitmap keep track of changes vs on-disk bitmap
When we set or clear bits in a bitmap page,
also set a flag in the page->private pointer.
This allows us to skip writes of unchanged pages.
Signed-off-by: Philipp Reisner <philipp.reisner@linbit.com>
Signed-off-by: Lars Ellenberg <lars.ellenberg@linbit.com>
Diffstat (limited to 'drivers/block/drbd/drbd_bitmap.c')
-rw-r--r--	drivers/block/drbd/drbd_bitmap.c	409
1 file changed, 307 insertions(+), 102 deletions(-)
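The core trick of this commit can be shown in miniature before diving into the diff: a per-page dirty flag, set on every modification and cleared just before submission, lets writeout skip pages whose on-disk copy is already current. The following is an illustrative userspace sketch only (all names hypothetical), not DRBD code; the kernel stores these flags in spare bits of page->private and manipulates them with atomic bitops.

/* skip-unchanged-pages in miniature; hypothetical userspace names,
 * NOT the DRBD code (which keeps these flags in page->private) */
#include <stdio.h>
#include <string.h>

#define BM_PAGE_SIZE	4096
#define NEED_WRITEOUT	0x1UL

struct bm_page {
	unsigned long flags;
	unsigned char data[BM_PAGE_SIZE];
};

static void bm_set_bit(struct bm_page *p, unsigned int bitnr)
{
	p->data[bitnr / 8] |= 1u << (bitnr % 8);
	p->flags |= NEED_WRITEOUT;	/* on-disk copy is now stale */
}

static void bm_writeout(struct bm_page *pages, unsigned int n)
{
	for (unsigned int i = 0; i < n; i++) {
		if (!(pages[i].flags & NEED_WRITEOUT)) {
			printf("page %u unchanged, skipped\n", i);
			continue;
		}
		/* clear _before_ writing: a modification racing with
		 * the write re-dirties the page for the next pass */
		pages[i].flags &= ~NEED_WRITEOUT;
		printf("page %u written\n", i);	/* submit IO here */
	}
}

int main(void)
{
	struct bm_page pages[4];

	memset(pages, 0, sizeof(pages));
	bm_set_bit(&pages[2], 123);
	bm_writeout(pages, 4);	/* only page 2 is written */
	return 0;
}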
diff --git a/drivers/block/drbd/drbd_bitmap.c b/drivers/block/drbd/drbd_bitmap.c
index 8d959ed6c2cc..72cd41a96ef9 100644
--- a/drivers/block/drbd/drbd_bitmap.c
+++ b/drivers/block/drbd/drbd_bitmap.c
@@ -70,8 +70,7 @@ struct drbd_bitmap {
 	sector_t bm_dev_capacity;
 	struct mutex bm_change; /* serializes resize operations */
 
-	atomic_t bm_async_io;
-	wait_queue_head_t bm_io_wait;
+	wait_queue_head_t bm_io_wait; /* used to serialize IO of single pages */
 
 	unsigned long bm_flags;
 
@@ -82,7 +81,7 @@ struct drbd_bitmap {
 
 /* definition of bits in bm_flags */
 #define BM_LOCKED       0
-#define BM_MD_IO_ERROR  1
+// #define BM_MD_IO_ERROR  1        unused now.
 #define BM_P_VMALLOCED  2
 
 static int __bm_change_bits_to(struct drbd_conf *mdev, const unsigned long s,
@@ -155,26 +154,117 @@ void drbd_bm_unlock(struct drbd_conf *mdev)
 	mutex_unlock(&b->bm_change);
 }
 
-static unsigned int bm_bit_to_page_idx(struct drbd_bitmap *b, u64 bitnr)
+/* we store some "meta" info about our pages in page->private */
+/* at a granularity of 4k storage per bitmap bit:
+ * one peta byte storage: 1<<50 byte, 1<<38 * 4k storage blocks
+ * 1<<38 bits,
+ * 1<<23 4k bitmap pages.
+ * Use 24 bits as page index, covers 2 peta byte storage
+ * at a granularity of 4k per bit.
+ * Used to report the failed page idx on io error from the endio handlers.
+ */
+#define BM_PAGE_IDX_MASK	((1UL<<24)-1)
+/* this page is currently read in, or written back */
+#define BM_PAGE_IO_LOCK		31
+/* if there has been an IO error for this page */
+#define BM_PAGE_IO_ERROR	30
+/* this is to be able to intelligently skip disk IO,
+ * set if bits have been set since last IO. */
+#define BM_PAGE_NEED_WRITEOUT	29
+/* to mark for lazy writeout once syncer cleared all clearable bits,
+ * set if bits have been cleared since last IO. */
+#define BM_PAGE_LAZY_WRITEOUT	28
+
+/* store_page_idx uses non-atomic assignment. It is only used directly after
+ * allocating the page.  All other bm_set_page_* and bm_clear_page_* need to
+ * use atomic bit manipulation, as set_out_of_sync (and therefore bitmap
+ * changes) may happen from various contexts, and wait_on_bit/wake_up_bit
+ * requires it all to be atomic as well. */
+static void bm_store_page_idx(struct page *page, unsigned long idx)
 {
-	/* page_nr = (bitnr/8) >> PAGE_SHIFT; */
-	unsigned int page_nr = bitnr >> (PAGE_SHIFT + 3);
-	BUG_ON(page_nr >= b->bm_number_of_pages);
-	return page_nr;
+	BUG_ON(0 != (idx & ~BM_PAGE_IDX_MASK));
+	page_private(page) |= idx;
 }
 
-/* word offset to long pointer */
-static unsigned long *__bm_map_paddr(struct drbd_bitmap *b, unsigned long offset, const enum km_type km)
+static unsigned long bm_page_to_idx(struct page *page)
 {
-	struct page *page;
-	unsigned long page_nr;
+	return page_private(page) & BM_PAGE_IDX_MASK;
+}
+
+/* As it is very unlikely that the same page is under IO from more than one
+ * context, we can get away with a bit per page and one wait queue per bitmap.
+ */
+static void bm_page_lock_io(struct drbd_conf *mdev, int page_nr)
+{
+	struct drbd_bitmap *b = mdev->bitmap;
+	void *addr = &page_private(b->bm_pages[page_nr]);
+	wait_event(b->bm_io_wait, !test_and_set_bit(BM_PAGE_IO_LOCK, addr));
+}
+
+static void bm_page_unlock_io(struct drbd_conf *mdev, int page_nr)
+{
+	struct drbd_bitmap *b = mdev->bitmap;
+	void *addr = &page_private(b->bm_pages[page_nr]);
+	clear_bit(BM_PAGE_IO_LOCK, addr);
+	smp_mb__after_clear_bit();
+	wake_up(&mdev->bitmap->bm_io_wait);
+}
+
+/* set _before_ submit_io, so it may be reset due to being changed
+ * while this page is in flight... will get submitted later again */
+static void bm_set_page_unchanged(struct page *page)
+{
+	/* use cmpxchg? */
+	clear_bit(BM_PAGE_NEED_WRITEOUT, &page_private(page));
+	clear_bit(BM_PAGE_LAZY_WRITEOUT, &page_private(page));
+}
+
+static void bm_set_page_need_writeout(struct page *page)
+{
+	set_bit(BM_PAGE_NEED_WRITEOUT, &page_private(page));
+}
+
+static int bm_test_page_unchanged(struct page *page)
+{
+	volatile const unsigned long *addr = &page_private(page);
+	return (*addr & ((1UL<<BM_PAGE_NEED_WRITEOUT)|(1UL<<BM_PAGE_LAZY_WRITEOUT))) == 0;
+}
 
+static void bm_set_page_io_err(struct page *page)
+{
+	set_bit(BM_PAGE_IO_ERROR, &page_private(page));
+}
+
+static void bm_clear_page_io_err(struct page *page)
+{
+	clear_bit(BM_PAGE_IO_ERROR, &page_private(page));
+}
+
+static void bm_set_page_lazy_writeout(struct page *page)
+{
+	set_bit(BM_PAGE_LAZY_WRITEOUT, &page_private(page));
+}
+
+static int bm_test_page_lazy_writeout(struct page *page)
+{
+	return test_bit(BM_PAGE_LAZY_WRITEOUT, &page_private(page));
+}
+
+/* on a 32bit box, this would allow for exactly (2<<38) bits. */
+static unsigned int bm_word_to_page_idx(struct drbd_bitmap *b, unsigned long long_nr)
+{
 	/* page_nr = (word*sizeof(long)) >> PAGE_SHIFT; */
-	page_nr = offset >> (PAGE_SHIFT - LN2_BPL + 3);
+	unsigned int page_nr = long_nr >> (PAGE_SHIFT - LN2_BPL + 3);
 	BUG_ON(page_nr >= b->bm_number_of_pages);
-	page = b->bm_pages[page_nr];
+	return page_nr;
+}
 
-	return (unsigned long *) kmap_atomic(page, km);
+static unsigned int bm_bit_to_page_idx(struct drbd_bitmap *b, u64 bitnr)
+{
+	/* page_nr = (bitnr/8) >> PAGE_SHIFT; */
+	unsigned int page_nr = bitnr >> (PAGE_SHIFT + 3);
+	BUG_ON(page_nr >= b->bm_number_of_pages);
+	return page_nr;
 }
 
 static unsigned long *__bm_map_pidx(struct drbd_bitmap *b, unsigned int idx, const enum km_type km)
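bm_page_lock_io()/bm_page_unlock_io() above use a classic kernel idiom: the lock is a single flag bit per page, while one wait queue is shared by the whole bitmap, which is cheap because two contexts rarely touch the same page at once. A rough userspace analogue of that idiom, sketched with pthreads (names hypothetical, not the kernel API):

/* userspace analogue of bm_page_lock_io()/bm_page_unlock_io():
 * one lock bit per page, one shared wait "queue" for the bitmap */
#include <pthread.h>

#define BM_PAGE_IO_LOCK	0x1UL

static pthread_mutex_t bm_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t bm_io_wait = PTHREAD_COND_INITIALIZER;

struct bm_page {
	unsigned long flags;
};

static void bm_page_lock_io(struct bm_page *p)
{
	pthread_mutex_lock(&bm_lock);
	/* the kernel's wait_event(..., !test_and_set_bit(...)) collapses
	 * this check-and-sleep loop into one line */
	while (p->flags & BM_PAGE_IO_LOCK)
		pthread_cond_wait(&bm_io_wait, &bm_lock);
	p->flags |= BM_PAGE_IO_LOCK;
	pthread_mutex_unlock(&bm_lock);
}

static void bm_page_unlock_io(struct bm_page *p)
{
	pthread_mutex_lock(&bm_lock);
	p->flags &= ~BM_PAGE_IO_LOCK;
	pthread_mutex_unlock(&bm_lock);
	/* wake all waiters; each re-checks its own page's bit, so
	 * sharing one condvar across all pages stays correct */
	pthread_cond_broadcast(&bm_io_wait);
}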
@@ -188,11 +278,6 @@ static unsigned long *bm_map_pidx(struct drbd_bitmap *b, unsigned int idx)
 	return __bm_map_pidx(b, idx, KM_IRQ1);
 }
 
-static unsigned long * bm_map_paddr(struct drbd_bitmap *b, unsigned long offset)
-{
-	return __bm_map_paddr(b, offset, KM_IRQ1);
-}
-
 static void __bm_unmap(unsigned long *p_addr, const enum km_type km)
 {
 	kunmap_atomic(p_addr, km);
@@ -222,6 +307,7 @@ static void bm_unmap(unsigned long *p_addr)
  * to be able to report device specific.
  */
 
+
 static void bm_free_pages(struct page **pages, unsigned long number)
 {
 	unsigned long i;
@@ -289,6 +375,9 @@ static struct page **bm_realloc_pages(struct drbd_bitmap *b, unsigned long want)
 				bm_vk_free(new_pages, vmalloced);
 				return NULL;
 			}
+			/* we want to know which page it is
+			 * from the endio handlers */
+			bm_store_page_idx(page, i);
 			new_pages[i] = page;
 		}
 	} else {
@@ -443,7 +532,7 @@ static unsigned long bm_count_bits(struct drbd_bitmap *b)
 
 	while (offset < words) {
 		i = do_now = min_t(size_t, words-offset, LWPP);
-		p_addr = __bm_map_paddr(b, offset, KM_USER0);
+		p_addr = __bm_map_pidx(b, bm_word_to_page_idx(b, offset), KM_USER0);
 		bm = p_addr + MLPP(offset);
 		while (i--) {
 			bits += hweight_long(*bm++);
@@ -472,6 +561,7 @@ static unsigned long bm_count_bits(struct drbd_bitmap *b)
 static void bm_memset(struct drbd_bitmap *b, size_t offset, int c, size_t len)
 {
 	unsigned long *p_addr, *bm;
+	unsigned int idx;
 	size_t do_now, end;
 
 #define BM_SECTORS_PER_BIT (BM_BLOCK_SIZE/512)
@@ -485,7 +575,8 @@ static void bm_memset(struct drbd_bitmap *b, size_t offset, int c, size_t len)
 
 	while (offset < end) {
 		do_now = min_t(size_t, ALIGN(offset + 1, LWPP), end) - offset;
-		p_addr = bm_map_paddr(b, offset);
+		idx = bm_word_to_page_idx(b, offset);
+		p_addr = bm_map_pidx(b, idx);
 		bm = p_addr + MLPP(offset);
 		if (bm+do_now > p_addr + LWPP) {
 			printk(KERN_ALERT "drbd: BUG BUG BUG! p_addr:%p bm:%p do_now:%d\n",
@@ -494,6 +585,7 @@ static void bm_memset(struct drbd_bitmap *b, size_t offset, int c, size_t len)
 		}
 		memset(bm, c, do_now * sizeof(long));
 		bm_unmap(p_addr);
+		bm_set_page_need_writeout(b->bm_pages[idx]);
 		offset += do_now;
 	}
 }
@@ -604,7 +696,7 @@ int drbd_bm_resize(struct drbd_conf *mdev, sector_t capacity, int set_new_bits)
 		bm_free_pages(opages + want, have - want);
 	}
 
-	p_addr = bm_map_paddr(b, words);
+	p_addr = bm_map_pidx(b, bm_word_to_page_idx(b, words));
 	bm = p_addr + MLPP(words);
 	*bm = DRBD_MAGIC;
 	bm_unmap(p_addr);
@@ -616,7 +708,7 @@ int drbd_bm_resize(struct drbd_conf *mdev, sector_t capacity, int set_new_bits)
 	bm_vk_free(opages, opages_vmalloced);
 	if (!growing)
 		b->bm_set = bm_count_bits(b);
-	dev_info(DEV, "resync bitmap: bits=%lu words=%lu\n", bits, words);
+	dev_info(DEV, "resync bitmap: bits=%lu words=%lu pages=%lu\n", bits, words, want);
 
  out:
 	drbd_bm_unlock(mdev);
@@ -686,6 +778,7 @@ void drbd_bm_merge_lel(struct drbd_conf *mdev, size_t offset, size_t number,
 	struct drbd_bitmap *b = mdev->bitmap;
 	unsigned long *p_addr, *bm;
 	unsigned long word, bits;
+	unsigned int idx;
 	size_t end, do_now;
 
 	end = offset + number;
@@ -700,7 +793,8 @@ void drbd_bm_merge_lel(struct drbd_conf *mdev, size_t offset, size_t number,
 	spin_lock_irq(&b->bm_lock);
 	while (offset < end) {
 		do_now = min_t(size_t, ALIGN(offset+1, LWPP), end) - offset;
-		p_addr = bm_map_paddr(b, offset);
+		idx = bm_word_to_page_idx(b, offset);
+		p_addr = bm_map_pidx(b, idx);
 		bm = p_addr + MLPP(offset);
 		offset += do_now;
 		while (do_now--) {
@@ -710,6 +804,7 @@ void drbd_bm_merge_lel(struct drbd_conf *mdev, size_t offset, size_t number,
 			b->bm_set += hweight_long(word) - bits;
 		}
 		bm_unmap(p_addr);
+		bm_set_page_need_writeout(b->bm_pages[idx]);
 	}
 	/* with 32bit <-> 64bit cross-platform connect
 	 * this is only correct for current usage,
@@ -748,7 +843,7 @@ void drbd_bm_get_lel(struct drbd_conf *mdev, size_t offset, size_t number,
 	else {
 		while (offset < end) {
 			do_now = min_t(size_t, ALIGN(offset+1, LWPP), end) - offset;
-			p_addr = bm_map_paddr(b, offset);
+			p_addr = bm_map_pidx(b, bm_word_to_page_idx(b, offset));
 			bm = p_addr + MLPP(offset);
 			offset += do_now;
 			while (do_now--)
@@ -786,9 +881,22 @@ void drbd_bm_clear_all(struct drbd_conf *mdev)
 	spin_unlock_irq(&b->bm_lock);
 }
 
+struct bm_aio_ctx {
+	struct drbd_conf *mdev;
+	atomic_t in_flight;
+	wait_queue_head_t io_wait;
+	unsigned flags;
+#define BM_AIO_COPY_PAGES	1
+	int error;
+};
+
+/* bv_page may be a copy, or may be the original */
 static void bm_async_io_complete(struct bio *bio, int error)
 {
-	struct drbd_bitmap *b = bio->bi_private;
+	struct bm_aio_ctx *ctx = bio->bi_private;
+	struct drbd_conf *mdev = ctx->mdev;
+	struct drbd_bitmap *b = mdev->bitmap;
+	unsigned int idx = bm_page_to_idx(bio->bi_io_vec[0].bv_page);
 	int uptodate = bio_flagged(bio, BIO_UPTODATE);
 
 
@@ -799,35 +907,79 @@ static void bm_async_io_complete(struct bio *bio, int error)
 	if (!error && !uptodate)
 		error = -EIO;
 
+	if (!bm_test_page_unchanged(b->bm_pages[idx]))
+		dev_info(DEV, "bitmap page idx %u changed during IO!\n", idx);
+
 	if (error) {
-		/* doh. what now?
-		 * for now, set all bits, and flag MD_IO_ERROR */
-		__set_bit(BM_MD_IO_ERROR, &b->bm_flags);
+		/* ctx->error will hold the error code of the bio that
+		 * completed last with a non-zero error, in case error codes differ. */
+		ctx->error = error;
+		bm_set_page_io_err(b->bm_pages[idx]);
+		/* Not identical to on disk version of it.
+		 * Is BM_PAGE_IO_ERROR enough? */
+		if (__ratelimit(&drbd_ratelimit_state))
+			dev_err(DEV, "IO ERROR %d on bitmap page idx %u\n",
+					error, idx);
+	} else {
+		bm_clear_page_io_err(b->bm_pages[idx]);
+		dynamic_dev_dbg(DEV, "bitmap page idx %u completed\n", idx);
 	}
-	if (atomic_dec_and_test(&b->bm_async_io))
-		wake_up(&b->bm_io_wait);
+
+	bm_page_unlock_io(mdev, idx);
+
+	/* FIXME give back to page pool */
+	if (ctx->flags & BM_AIO_COPY_PAGES)
+		put_page(bio->bi_io_vec[0].bv_page);
 
 	bio_put(bio);
+
+	if (atomic_dec_and_test(&ctx->in_flight))
+		wake_up(&ctx->io_wait);
 }
 
-static void bm_page_io_async(struct drbd_conf *mdev, struct drbd_bitmap *b, int page_nr, int rw) __must_hold(local)
+static void bm_page_io_async(struct bm_aio_ctx *ctx, int page_nr, int rw) __must_hold(local)
 {
 	/* we are process context. we always get a bio */
 	struct bio *bio = bio_alloc(GFP_KERNEL, 1);
+	struct drbd_conf *mdev = ctx->mdev;
+	struct drbd_bitmap *b = mdev->bitmap;
+	struct page *page;
 	unsigned int len;
+
 	sector_t on_disk_sector =
 		mdev->ldev->md.md_offset + mdev->ldev->md.bm_offset;
 	on_disk_sector += ((sector_t)page_nr) << (PAGE_SHIFT-9);
 
 	/* this might happen with very small
-	 * flexible external meta data device */
+	 * flexible external meta data device,
+	 * or with PAGE_SIZE > 4k */
 	len = min_t(unsigned int, PAGE_SIZE,
 		(drbd_md_last_sector(mdev->ldev) - on_disk_sector + 1)<<9);
 
+	/* serialize IO on this page */
+	bm_page_lock_io(mdev, page_nr);
+	/* before memcpy and submit,
+	 * so it can be redirtied any time */
+	bm_set_page_unchanged(b->bm_pages[page_nr]);
+
+	if (ctx->flags & BM_AIO_COPY_PAGES) {
+		/* FIXME alloc_page is good enough for now, but actually needs
+		 * to use pre-allocated page pool */
+		void *src, *dest;
+		page = alloc_page(__GFP_HIGHMEM|__GFP_WAIT);
+		dest = kmap_atomic(page, KM_USER0);
+		src = kmap_atomic(b->bm_pages[page_nr], KM_USER1);
+		memcpy(dest, src, PAGE_SIZE);
+		kunmap_atomic(src, KM_USER1);
+		kunmap_atomic(dest, KM_USER0);
+		bm_store_page_idx(page, page_nr);
+	} else
+		page = b->bm_pages[page_nr];
+
 	bio->bi_bdev = mdev->ldev->md_bdev;
 	bio->bi_sector = on_disk_sector;
-	bio_add_page(bio, b->bm_pages[page_nr], len, 0);
-	bio->bi_private = b;
+	bio_add_page(bio, page, len, 0);
+	bio->bi_private = ctx;
 	bio->bi_end_io = bm_async_io_complete;
 
 	if (drbd_insert_fault(mdev, (rw & WRITE) ? DRBD_FAULT_MD_WR : DRBD_FAULT_MD_RD)) {
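The BM_AIO_COPY_PAGES branch above is a snapshot-before-write: clearing the change flags first and then copying means a racing modification can only re-dirty the live page, so a late change is picked up by a later writeout instead of being lost. A hedged userspace sketch of that ordering (illustrative names, not the kernel API):

/* snapshot-before-write, as in the BM_AIO_COPY_PAGES branch */
#include <stdlib.h>
#include <string.h>

#define BM_PAGE_SIZE 4096

struct bm_page {
	unsigned long flags;	/* change-tracking bits */
	unsigned char data[BM_PAGE_SIZE];
};

/* caller frees the snapshot once the (simulated) IO has completed */
static unsigned char *bm_snapshot_for_write(struct bm_page *p)
{
	unsigned char *copy = malloc(BM_PAGE_SIZE);

	if (!copy)
		return NULL;
	/* order matters: mark the page clean _before_ copying, so a
	 * concurrent modifier re-dirties it and the late change is
	 * merely deferred to the next writeout, never lost */
	p->flags = 0;
	memcpy(copy, p->data, BM_PAGE_SIZE);
	return copy;
}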
@@ -841,36 +993,72 @@ static void bm_page_io_async(struct drbd_conf *mdev, struct drbd_bitmap *b, int page_nr, int rw) __must_hold(local)
 /*
  * bm_rw: read/write the whole bitmap from/to its on disk location.
  */
-static int bm_rw(struct drbd_conf *mdev, int rw) __must_hold(local)
+static int bm_rw(struct drbd_conf *mdev, int rw, unsigned lazy_writeout_upper_idx) __must_hold(local)
 {
+	struct bm_aio_ctx ctx =
+		{ .flags = lazy_writeout_upper_idx ? BM_AIO_COPY_PAGES : 0 };
 	struct drbd_bitmap *b = mdev->bitmap;
-	/* sector_t sector; */
-	int bm_words, num_pages, i;
+	int last_page, i, count = 0;
 	unsigned long now;
 	char ppb[10];
 	int err = 0;
 
-	WARN_ON(!bm_is_locked(b));
-
-	/* no spinlock here, the drbd_bm_lock should be enough! */
+	/*
+	 * We are protected against bitmap disappearing/resizing by holding an
+	 * ldev reference (caller must have called get_ldev()).
+	 * For read/write, we are protected against changes to the bitmap by
+	 * the bitmap lock (see drbd_bitmap_io).
+	 * For lazy writeout, we don't care for ongoing changes to the bitmap,
+	 * as we submit copies of pages anyways.
+	 */
+	if (!ctx.flags)
+		WARN_ON(!bm_is_locked(b));
 
-	bm_words  = drbd_bm_words(mdev);
-	num_pages = (bm_words*sizeof(long) + PAGE_SIZE-1) >> PAGE_SHIFT;
+	/* because of the "extra long to catch oob access" we allocate in
+	 * drbd_bm_resize, bm_number_of_pages -1 is not necessarily the page
+	 * containing the last _relevant_ bitmap word */
+	last_page = bm_word_to_page_idx(b, b->bm_words - 1);
 
 	now = jiffies;
-	atomic_set(&b->bm_async_io, num_pages);
-	__clear_bit(BM_MD_IO_ERROR, &b->bm_flags);
+	ctx.mdev = mdev;
+	atomic_set(&ctx.in_flight, 1); /* one extra ref */
+	init_waitqueue_head(&ctx.io_wait);
+	ctx.error = 0;
 
 	/* let the layers below us try to merge these bios... */
-	for (i = 0; i < num_pages; i++)
-		bm_page_io_async(mdev, b, i, rw);
+	for (i = 0; i <= last_page; i++) {
+		/* ignore completely unchanged pages */
+		if (lazy_writeout_upper_idx && i == lazy_writeout_upper_idx)
+			break;
+		if (rw & WRITE) {
+			if (bm_test_page_unchanged(b->bm_pages[i])) {
+				dynamic_dev_dbg(DEV, "skipped bm write for idx %u\n", i);
+				continue;
+			}
+			/* during lazy writeout,
+			 * ignore those pages not marked for lazy writeout. */
+			if (lazy_writeout_upper_idx &&
+			    !bm_test_page_lazy_writeout(b->bm_pages[i])) {
+				dynamic_dev_dbg(DEV, "skipped bm lazy write for idx %u\n", i);
+				continue;
+			}
+		}
+		atomic_inc(&ctx.in_flight);
+		bm_page_io_async(&ctx, i, rw);
+		++count;
+		cond_resched();
+	}
 
-	wait_event(b->bm_io_wait, atomic_read(&b->bm_async_io) == 0);
+	atomic_dec(&ctx.in_flight); /* drop the extra ref */
+	wait_event(ctx.io_wait, atomic_read(&ctx.in_flight) == 0);
+	dev_info(DEV, "bitmap %s of %u pages took %lu jiffies\n",
+			rw == WRITE ? "WRITE" : "READ",
+			count, jiffies - now);
 
-	if (test_bit(BM_MD_IO_ERROR, &b->bm_flags)) {
+	if (ctx.error) {
 		dev_alert(DEV, "we had at least one MD IO ERROR during bitmap IO\n");
 		drbd_chk_io_error(mdev, 1, true);
-		err = -EIO;
+		err = -EIO; /* ctx.error ? */
 	}
 
 	now = jiffies;
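bm_rw() counts in-flight bios with a deliberately biased counter: it starts at 1, so completions arriving while the submit loop is still running can never drop it to zero and wake the waiter early; the submitter drops its extra reference only after the loop. A compilable pthreads sketch of the same pattern (names hypothetical; the kernel variant uses atomic_t with wait_event()/wake_up()):

/* the "extra reference" completion-count pattern from bm_rw() */
#include <pthread.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t done = PTHREAD_COND_INITIALIZER;
static int in_flight = 1;	/* the submitter's extra reference */

static void io_complete(void)	/* completion callback, any thread */
{
	pthread_mutex_lock(&lock);
	if (--in_flight == 0)
		pthread_cond_signal(&done);
	pthread_mutex_unlock(&lock);
}

static void submit_and_wait(int npages, void (*submit_page)(int))
{
	for (int i = 0; i < npages; i++) {
		pthread_mutex_lock(&lock);
		in_flight++;	/* taken before the IO is in flight */
		pthread_mutex_unlock(&lock);
		submit_page(i);	/* io_complete() fires asynchronously */
	}
	pthread_mutex_lock(&lock);
	in_flight--;		/* drop the extra reference */
	while (in_flight != 0)	/* now 0 really means "all done" */
		pthread_cond_wait(&done, &lock);
	pthread_mutex_unlock(&lock);
}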
@@ -895,55 +1083,63 @@ static int bm_rw(struct drbd_conf *mdev, int rw) __must_hold(local)
  */
 int drbd_bm_read(struct drbd_conf *mdev) __must_hold(local)
 {
-	return bm_rw(mdev, READ);
+	return bm_rw(mdev, READ, 0);
 }
 
 /**
  * drbd_bm_write() - Write the whole bitmap to its on disk location.
  * @mdev:	DRBD device.
+ *
+ * Will only write pages that have changed since last IO.
  */
 int drbd_bm_write(struct drbd_conf *mdev) __must_hold(local)
 {
-	return bm_rw(mdev, WRITE);
+	return bm_rw(mdev, WRITE, 0);
 }
 
 /**
- * drbd_bm_write_sect: Writes a 512 (MD_SECTOR_SIZE) byte piece of the bitmap
+ * drbd_bm_write_lazy() - Write bitmap pages 0 to @upper_idx-1, if they have changed.
  * @mdev:	DRBD device.
- * @enr:	Extent number in the resync lru (happens to be sector offset)
+ * @upper_idx:	0: write all changed pages; +ve: page index to stop scanning for changed pages
+ */
+int drbd_bm_write_lazy(struct drbd_conf *mdev, unsigned upper_idx) __must_hold(local)
+{
+	return bm_rw(mdev, WRITE, upper_idx);
+}
+
+
+/**
+ * drbd_bm_write_page() - Writes a PAGE_SIZE aligned piece of the bitmap
+ * @mdev:	DRBD device.
+ * @idx:	bitmap page index
  *
- * The BM_EXT_SIZE is on purpose exactly the amount of the bitmap covered
- * by a single sector write. Therefore enr == sector offset from the
- * start of the bitmap.
+ * We don't want to special case on logical_block_size of the underlying
+ * device, so we submit PAGE_SIZE aligned pieces containing the requested page.
+ * Note that on "most" systems, PAGE_SIZE is 4k.
  */
-int drbd_bm_write_sect(struct drbd_conf *mdev, unsigned long enr) __must_hold(local)
+int drbd_bm_write_page(struct drbd_conf *mdev, unsigned int idx) __must_hold(local)
 {
-	sector_t on_disk_sector = enr + mdev->ldev->md.md_offset
-				      + mdev->ldev->md.bm_offset;
-	int bm_words, num_words, offset;
-	int err = 0;
+	struct bm_aio_ctx ctx = { .flags = BM_AIO_COPY_PAGES, };
 
-	mutex_lock(&mdev->md_io_mutex);
-	bm_words  = drbd_bm_words(mdev);
-	offset    = S2W(enr);	/* word offset into bitmap */
-	num_words = min(S2W(1), bm_words - offset);
-	if (num_words < S2W(1))
-		memset(page_address(mdev->md_io_page), 0, MD_SECTOR_SIZE);
-	drbd_bm_get_lel(mdev, offset, num_words,
-			page_address(mdev->md_io_page));
-	if (!drbd_md_sync_page_io(mdev, mdev->ldev, on_disk_sector, WRITE)) {
-		int i;
-		err = -EIO;
-		dev_err(DEV, "IO ERROR writing bitmap sector %lu "
-		    "(meta-disk sector %llus)\n",
-		    enr, (unsigned long long)on_disk_sector);
-		drbd_chk_io_error(mdev, 1, true);
-		for (i = 0; i < AL_EXT_PER_BM_SECT; i++)
-			drbd_bm_ALe_set_all(mdev, enr*AL_EXT_PER_BM_SECT+i);
+	if (bm_test_page_unchanged(mdev->bitmap->bm_pages[idx])) {
+		dev_info(DEV, "skipped bm page write for idx %u\n", idx);
+		return 0;
 	}
+
+	ctx.mdev = mdev;
+	atomic_set(&ctx.in_flight, 1);
+	init_waitqueue_head(&ctx.io_wait);
+
+	bm_page_io_async(&ctx, idx, WRITE_SYNC);
+	wait_event(ctx.io_wait, atomic_read(&ctx.in_flight) == 0);
+
+	if (ctx.error)
+		drbd_chk_io_error(mdev, 1, true);
+	/* that should force detach, so the in memory bitmap will be
+	 * gone in a moment as well. */
+
 	mdev->bm_writ_cnt++;
-	mutex_unlock(&mdev->md_io_mutex);
-	return err;
+	return ctx.error;
 }
@@ -965,10 +1161,9 @@ static unsigned long __bm_find_next(struct drbd_conf *mdev, unsigned long bm_fo,
 		dev_err(DEV, "bm_fo=%lu bm_bits=%lu\n", bm_fo, b->bm_bits);
 	} else {
 		while (bm_fo < b->bm_bits) {
-			unsigned long offset;
-			bit_offset = bm_fo & ~BPP_MASK; /* bit offset of the page */
-			offset = bit_offset >> LN2_BPL;	/* word offset of the page */
-			p_addr = __bm_map_paddr(b, offset, km);
+			/* bit offset of the first bit in the page */
+			bit_offset = bm_fo & ~BPP_MASK;
+			p_addr = __bm_map_pidx(b, bm_bit_to_page_idx(b, bm_fo), km);
 
 			if (find_zero_bit)
 				i = generic_find_next_zero_le_bit(p_addr, PAGE_SIZE*8, bm_fo & BPP_MASK);
@@ -1048,8 +1243,9 @@ static int __bm_change_bits_to(struct drbd_conf *mdev, const unsigned long s,
 	struct drbd_bitmap *b = mdev->bitmap;
 	unsigned long *p_addr = NULL;
 	unsigned long bitnr;
-	unsigned long last_page_nr = -1UL;
+	unsigned int last_page_nr = -1U;
 	int c = 0;
+	int changed_total = 0;
 
 	if (e >= b->bm_bits) {
 		dev_err(DEV, "ASSERT FAILED: bit_s=%lu bit_e=%lu bm_bits=%lu\n",
@@ -1057,12 +1253,17 @@ static int __bm_change_bits_to(struct drbd_conf *mdev, const unsigned long s,
 		e = b->bm_bits ? b->bm_bits -1 : 0;
 	}
 	for (bitnr = s; bitnr <= e; bitnr++) {
-		unsigned long offset = bitnr>>LN2_BPL;
-		unsigned long page_nr = offset >> (PAGE_SHIFT - LN2_BPL + 3);
+		unsigned int page_nr = bm_bit_to_page_idx(b, bitnr);
 		if (page_nr != last_page_nr) {
 			if (p_addr)
 				__bm_unmap(p_addr, km);
-			p_addr = __bm_map_paddr(b, offset, km);
+			if (c < 0)
+				bm_set_page_lazy_writeout(b->bm_pages[last_page_nr]);
+			else if (c > 0)
+				bm_set_page_need_writeout(b->bm_pages[last_page_nr]);
+			changed_total += c;
+			c = 0;
+			p_addr = __bm_map_pidx(b, page_nr, km);
 			last_page_nr = page_nr;
 		}
 		if (val)
@@ -1072,8 +1273,13 @@ static int __bm_change_bits_to(struct drbd_conf *mdev, const unsigned long s,
 	}
 	if (p_addr)
 		__bm_unmap(p_addr, km);
-	b->bm_set += c;
-	return c;
+	if (c < 0)
+		bm_set_page_lazy_writeout(b->bm_pages[last_page_nr]);
+	else if (c > 0)
+		bm_set_page_need_writeout(b->bm_pages[last_page_nr]);
+	changed_total += c;
+	b->bm_set += changed_total;
+	return changed_total;
 }
 
 /* returns number of bits actually changed.
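In __bm_change_bits_to() above, c accumulates +1 per newly set bit and -1 per newly cleared bit within the current page; since val is fixed for the whole call, c never mixes the two. On leaving a page, the sign of c picks the flag: set bits demand a prompt writeout, cleared bits only a lazy one. A minimal sketch of that classification step (illustrative names, not the kernel API):

/* sign-of-c classification from __bm_change_bits_to() */
#define NEED_WRITEOUT	0x1UL	/* bits were set: write out promptly */
#define LAZY_WRITEOUT	0x2UL	/* bits were cleared: write out lazily */

static void bm_account_page(unsigned long *page_flags, int c)
{
	if (c > 0)
		*page_flags |= NEED_WRITEOUT;
	else if (c < 0)
		*page_flags |= LAZY_WRITEOUT;
	/* c == 0 means no bit in this page changed; since val is
	 * fixed for the whole call, c cannot mix +1s and -1s */
}

Keeping the two flags separate is what makes drbd_bm_write_lazy() possible: pages that only lost bits (blocks that became in-sync) can be flushed at leisure, while pages that gained bits must not be skipped by the next full writeout.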
@@ -1211,8 +1417,7 @@ int drbd_bm_test_bit(struct drbd_conf *mdev, const unsigned long bitnr)
 	if (bm_is_locked(b))
 		bm_print_lock_info(mdev);
 	if (bitnr < b->bm_bits) {
-		unsigned long offset = bitnr>>LN2_BPL;
-		p_addr = bm_map_paddr(b, offset);
+		p_addr = bm_map_pidx(b, bm_bit_to_page_idx(b, bitnr));
 		i = generic_test_le_bit(bitnr & BPP_MASK, p_addr) ? 1 : 0;
 		bm_unmap(p_addr);
 	} else if (bitnr == b->bm_bits) {
@@ -1231,10 +1436,10 @@ int drbd_bm_count_bits(struct drbd_conf *mdev, const unsigned long s, const unsigned long e)
 {
 	unsigned long flags;
 	struct drbd_bitmap *b = mdev->bitmap;
-	unsigned long *p_addr = NULL, page_nr = -1;
+	unsigned long *p_addr = NULL;
 	unsigned long bitnr;
+	unsigned int page_nr = -1U;
 	int c = 0;
-	size_t w;
 
 	/* If this is called without a bitmap, that is a bug.  But just to be
 	 * robust in case we screwed up elsewhere, in that case pretend there
@@ -1247,12 +1452,12 @@ int drbd_bm_count_bits(struct drbd_conf *mdev, const unsigned long s, const unsigned long e)
 	if (bm_is_locked(b))
 		bm_print_lock_info(mdev);
 	for (bitnr = s; bitnr <= e; bitnr++) {
-		w = bitnr >> LN2_BPL;
-		if (page_nr != w >> (PAGE_SHIFT - LN2_BPL + 3)) {
-			page_nr = w >> (PAGE_SHIFT - LN2_BPL + 3);
+		unsigned int idx = bm_bit_to_page_idx(b, bitnr);
+		if (page_nr != idx) {
+			page_nr = idx;
 			if (p_addr)
 				bm_unmap(p_addr);
-			p_addr = bm_map_paddr(b, w);
+			p_addr = bm_map_pidx(b, idx);
 		}
 		ERR_IF (bitnr >= b->bm_bits) {
 			dev_err(DEV, "bitnr=%lu bm_bits=%lu\n", bitnr, b->bm_bits);
@@ -1300,7 +1505,7 @@ int drbd_bm_e_weight(struct drbd_conf *mdev, unsigned long enr)
 	count = 0;
 	if (s < b->bm_words) {
 		int n = e-s;
-		p_addr = bm_map_paddr(b, s);
+		p_addr = bm_map_pidx(b, bm_word_to_page_idx(b, s));
 		bm = p_addr + MLPP(s);
 		while (n--)
 			count += hweight_long(*bm++);
@@ -1335,7 +1540,7 @@ unsigned long drbd_bm_ALe_set_all(struct drbd_conf *mdev, unsigned long al_enr)
 	count = 0;
 	if (s < b->bm_words) {
 		i = do_now = e-s;
-		p_addr = bm_map_paddr(b, s);
+		p_addr = bm_map_pidx(b, bm_word_to_page_idx(b, s));
 		bm = p_addr + MLPP(s);
 		while (i--) {
 			count += hweight_long(*bm);