aboutsummaryrefslogtreecommitdiffstats
path: root/drivers/block
diff options
context:
space:
mode:
authorLars Ellenberg <lars.ellenberg@linbit.com>2014-01-27 09:58:22 -0500
committerPhilipp Reisner <philipp.reisner@linbit.com>2014-07-10 12:34:50 -0400
commit5ab7d2c005135849cf0bb1485d954c98f2cca57c (patch)
tree43340069d199c864871c04f9672639415a7bb8fa /drivers/block
parenta80ca1ae81fc52e304e753f6de4ef248df364f9e (diff)
drbd: fix resync finished detection
This fixes one recent regression, and one long existing bug. The bug: drbd_try_clear_on_disk_bm() assumed that all "count" bits have to be accounted in the resync extent corresponding to the start sector. Since we allow application requests to cross our "extent" boundaries, this assumption is no longer true, resulting in possible misaccounting, scary messages ("BAD! sector=12345s enr=6 rs_left=-7 rs_failed=0 count=58 cstate=..."), and potentially, if the last bit to be cleared during resync would reside in previously misaccounted resync extent, the resync would never be recognized as finished, but would be "stalled" forever, even though all blocks are in sync again and all bits have been cleared... The regression was introduced by drbd: get rid of atomic update on disk bitmap works For an "empty" resync (rs_total == 0), we must not "finish" the resync on the SyncSource before the SyncTarget knows all relevant information (sync uuid). We need to wait for the full round-trip, the SyncTarget will then explicitly notify us. Also for normal, non-empty resyncs (rs_total > 0), the resync-finished condition needs to be tested before the schedule() in wait_for_work, or it is likely to be missed. Signed-off-by: Philipp Reisner <philipp.reisner@linbit.com> Signed-off-by: Lars Ellenberg <lars.ellenberg@linbit.com>
Diffstat (limited to 'drivers/block')
-rw-r--r--drivers/block/drbd/drbd_actlog.c315
-rw-r--r--drivers/block/drbd/drbd_int.h42
-rw-r--r--drivers/block/drbd/drbd_state.c3
-rw-r--r--drivers/block/drbd/drbd_worker.c42
4 files changed, 197 insertions, 205 deletions
diff --git a/drivers/block/drbd/drbd_actlog.c b/drivers/block/drbd/drbd_actlog.c
index 9c42edf4871b..278c31f24639 100644
--- a/drivers/block/drbd/drbd_actlog.c
+++ b/drivers/block/drbd/drbd_actlog.c
@@ -667,36 +667,56 @@ int drbd_initialize_al(struct drbd_device *device, void *buffer)
667 return 0; 667 return 0;
668} 668}
669 669
670static const char *drbd_change_sync_fname[] = {
671 [RECORD_RS_FAILED] = "drbd_rs_failed_io",
672 [SET_IN_SYNC] = "drbd_set_in_sync",
673 [SET_OUT_OF_SYNC] = "drbd_set_out_of_sync"
674};
675
670/* ATTENTION. The AL's extents are 4MB each, while the extents in the 676/* ATTENTION. The AL's extents are 4MB each, while the extents in the
671 * resync LRU-cache are 16MB each. 677 * resync LRU-cache are 16MB each.
672 * The caller of this function has to hold an get_ldev() reference. 678 * The caller of this function has to hold an get_ldev() reference.
673 * 679 *
680 * Adjusts the caching members ->rs_left (success) or ->rs_failed (!success),
681 * potentially pulling in (and recounting the corresponding bits)
682 * this resync extent into the resync extent lru cache.
683 *
684 * Returns whether all bits have been cleared for this resync extent,
685 * precisely: (rs_left <= rs_failed)
686 *
674 * TODO will be obsoleted once we have a caching lru of the on disk bitmap 687 * TODO will be obsoleted once we have a caching lru of the on disk bitmap
675 */ 688 */
676static void drbd_try_clear_on_disk_bm(struct drbd_device *device, sector_t sector, 689static bool update_rs_extent(struct drbd_device *device,
677 int count, int success) 690 unsigned int enr, int count,
691 enum update_sync_bits_mode mode)
678{ 692{
679 struct lc_element *e; 693 struct lc_element *e;
680 unsigned int enr;
681 694
682 D_ASSERT(device, atomic_read(&device->local_cnt)); 695 D_ASSERT(device, atomic_read(&device->local_cnt));
683 696
684 /* I simply assume that a sector/size pair never crosses 697 /* When setting out-of-sync bits,
685 * a 16 MB extent border. (Currently this is true...) */ 698 * we don't need it cached (lc_find).
686 enr = BM_SECT_TO_EXT(sector); 699 * But if it is present in the cache,
687 700 * we should update the cached bit count.
688 e = lc_get(device->resync, enr); 701 * Otherwise, that extent should be in the resync extent lru cache
702 * already -- or we want to pull it in if necessary -- (lc_get),
703 * then update and check rs_left and rs_failed. */
704 if (mode == SET_OUT_OF_SYNC)
705 e = lc_find(device->resync, enr);
706 else
707 e = lc_get(device->resync, enr);
689 if (e) { 708 if (e) {
690 struct bm_extent *ext = lc_entry(e, struct bm_extent, lce); 709 struct bm_extent *ext = lc_entry(e, struct bm_extent, lce);
691 if (ext->lce.lc_number == enr) { 710 if (ext->lce.lc_number == enr) {
692 if (success) 711 if (mode == SET_IN_SYNC)
693 ext->rs_left -= count; 712 ext->rs_left -= count;
713 else if (mode == SET_OUT_OF_SYNC)
714 ext->rs_left += count;
694 else 715 else
695 ext->rs_failed += count; 716 ext->rs_failed += count;
696 if (ext->rs_left < ext->rs_failed) { 717 if (ext->rs_left < ext->rs_failed) {
697 drbd_warn(device, "BAD! sector=%llus enr=%u rs_left=%d " 718 drbd_warn(device, "BAD! enr=%u rs_left=%d "
698 "rs_failed=%d count=%d cstate=%s\n", 719 "rs_failed=%d count=%d cstate=%s\n",
699 (unsigned long long)sector,
700 ext->lce.lc_number, ext->rs_left, 720 ext->lce.lc_number, ext->rs_left,
701 ext->rs_failed, count, 721 ext->rs_failed, count,
702 drbd_conn_str(device->state.conn)); 722 drbd_conn_str(device->state.conn));
@@ -730,24 +750,27 @@ static void drbd_try_clear_on_disk_bm(struct drbd_device *device, sector_t secto
730 ext->lce.lc_number, ext->rs_failed); 750 ext->lce.lc_number, ext->rs_failed);
731 } 751 }
732 ext->rs_left = rs_left; 752 ext->rs_left = rs_left;
733 ext->rs_failed = success ? 0 : count; 753 ext->rs_failed = (mode == RECORD_RS_FAILED) ? count : 0;
734 /* we don't keep a persistent log of the resync lru, 754 /* we don't keep a persistent log of the resync lru,
735 * we can commit any change right away. */ 755 * we can commit any change right away. */
736 lc_committed(device->resync); 756 lc_committed(device->resync);
737 } 757 }
738 lc_put(device->resync, &ext->lce); 758 if (mode != SET_OUT_OF_SYNC)
759 lc_put(device->resync, &ext->lce);
739 /* no race, we are within the al_lock! */ 760 /* no race, we are within the al_lock! */
740 761
741 if (ext->rs_left == ext->rs_failed) { 762 if (ext->rs_left <= ext->rs_failed) {
742 ext->rs_failed = 0; 763 ext->rs_failed = 0;
743 wake_up(&first_peer_device(device)->connection->sender_work.q_wait); 764 return true;
744 } 765 }
745 } else { 766 } else if (mode != SET_OUT_OF_SYNC) {
767 /* be quiet if lc_find() did not find it. */
746 drbd_err(device, "lc_get() failed! locked=%d/%d flags=%lu\n", 768 drbd_err(device, "lc_get() failed! locked=%d/%d flags=%lu\n",
747 device->resync_locked, 769 device->resync_locked,
748 device->resync->nr_elements, 770 device->resync->nr_elements,
749 device->resync->flags); 771 device->resync->flags);
750 } 772 }
773 return false;
751} 774}
752 775
753void drbd_advance_rs_marks(struct drbd_device *device, unsigned long still_to_go) 776void drbd_advance_rs_marks(struct drbd_device *device, unsigned long still_to_go)
@@ -766,105 +789,112 @@ void drbd_advance_rs_marks(struct drbd_device *device, unsigned long still_to_go
766 } 789 }
767} 790}
768 791
769/* clear the bit corresponding to the piece of storage in question: 792/* It is called lazy update, so don't do write-out too often. */
770 * size byte of data starting from sector. Only clear a bits of the affected 793static bool lazy_bitmap_update_due(struct drbd_device *device)
771 * one ore more _aligned_ BM_BLOCK_SIZE blocks.
772 *
773 * called by worker on C_SYNC_TARGET and receiver on SyncSource.
774 *
775 */
776void __drbd_set_in_sync(struct drbd_device *device, sector_t sector, int size,
777 const char *file, const unsigned int line)
778{ 794{
779 /* Is called from worker and receiver context _only_ */ 795 return time_after(jiffies, device->rs_last_bcast + 2*HZ);
780 unsigned long sbnr, ebnr, lbnr; 796}
781 unsigned long count = 0;
782 sector_t esector, nr_sectors;
783 int wake_up = 0;
784 unsigned long flags;
785 797
786 if (size <= 0 || !IS_ALIGNED(size, 512) || size > DRBD_MAX_DISCARD_SIZE) { 798static void maybe_schedule_on_disk_bitmap_update(struct drbd_device *device, bool rs_done)
787 drbd_err(device, "drbd_set_in_sync: sector=%llus size=%d nonsense!\n", 799{
788 (unsigned long long)sector, size); 800 struct drbd_connection *connection;
801 if (rs_done)
802 set_bit(RS_DONE, &device->flags);
803 /* and also set RS_PROGRESS below */
804 else if (!lazy_bitmap_update_due(device))
789 return; 805 return;
790 }
791 806
792 if (!get_ldev(device)) 807 /* compare with test_and_clear_bit() calls in and above
793 return; /* no disk, no metadata, no bitmap to clear bits in */ 808 * try_update_all_on_disk_bitmaps() from the drbd_worker(). */
794 809 if (test_and_set_bit(RS_PROGRESS, &device->flags))
795 nr_sectors = drbd_get_capacity(device->this_bdev); 810 return;
796 esector = sector + (size >> 9) - 1; 811 connection = first_peer_device(device)->connection;
797 812 if (!test_and_set_bit(CONN_RS_PROGRESS, &connection->flags))
798 if (!expect(sector < nr_sectors)) 813 wake_up(&connection->sender_work.q_wait);
799 goto out; 814}
800 if (!expect(esector < nr_sectors))
801 esector = nr_sectors - 1;
802
803 lbnr = BM_SECT_TO_BIT(nr_sectors-1);
804
805 /* we clear it (in sync).
806 * round up start sector, round down end sector. we make sure we only
807 * clear full, aligned, BM_BLOCK_SIZE (4K) blocks */
808 if (unlikely(esector < BM_SECT_PER_BIT-1))
809 goto out;
810 if (unlikely(esector == (nr_sectors-1)))
811 ebnr = lbnr;
812 else
813 ebnr = BM_SECT_TO_BIT(esector - (BM_SECT_PER_BIT-1));
814 sbnr = BM_SECT_TO_BIT(sector + BM_SECT_PER_BIT-1);
815
816 if (sbnr > ebnr)
817 goto out;
818 815
816static int update_sync_bits(struct drbd_device *device,
817 unsigned long sbnr, unsigned long ebnr,
818 enum update_sync_bits_mode mode)
819{
819 /* 820 /*
820 * ok, (capacity & 7) != 0 sometimes, but who cares... 821 * We keep a count of set bits per resync-extent in the ->rs_left
821 * we count rs_{total,left} in bits, not sectors. 822 * caching member, so we need to loop and work within the resync extent
823 * alignment. Typically this loop will execute exactly once.
822 */ 824 */
823 count = drbd_bm_clear_bits(device, sbnr, ebnr); 825 unsigned long flags;
824 if (count) { 826 unsigned long count = 0;
825 drbd_advance_rs_marks(device, drbd_bm_total_weight(device)); 827 unsigned int cleared = 0;
826 spin_lock_irqsave(&device->al_lock, flags); 828 while (sbnr <= ebnr) {
827 drbd_try_clear_on_disk_bm(device, sector, count, true); 829 /* set temporary boundary bit number to last bit number within
828 spin_unlock_irqrestore(&device->al_lock, flags); 830 * the resync extent of the current start bit number,
829 831 * but cap at provided end bit number */
830 /* just wake_up unconditional now, various lc_chaged(), 832 unsigned long tbnr = min(ebnr, sbnr | BM_BLOCKS_PER_BM_EXT_MASK);
831 * lc_put() in drbd_try_clear_on_disk_bm(). */ 833 unsigned long c;
832 wake_up = 1; 834
835 if (mode == RECORD_RS_FAILED)
836 /* Only called from drbd_rs_failed_io(), bits
837 * supposedly still set. Recount, maybe some
838 * of the bits have been successfully cleared
839 * by application IO meanwhile.
840 */
841 c = drbd_bm_count_bits(device, sbnr, tbnr);
842 else if (mode == SET_IN_SYNC)
843 c = drbd_bm_clear_bits(device, sbnr, tbnr);
844 else /* if (mode == SET_OUT_OF_SYNC) */
845 c = drbd_bm_set_bits(device, sbnr, tbnr);
846
847 if (c) {
848 spin_lock_irqsave(&device->al_lock, flags);
849 cleared += update_rs_extent(device, BM_BIT_TO_EXT(sbnr), c, mode);
850 spin_unlock_irqrestore(&device->al_lock, flags);
851 count += c;
852 }
853 sbnr = tbnr + 1;
833 } 854 }
834out: 855 if (count) {
835 put_ldev(device); 856 if (mode == SET_IN_SYNC) {
836 if (wake_up) 857 unsigned long still_to_go = drbd_bm_total_weight(device);
858 bool rs_is_done = (still_to_go <= device->rs_failed);
859 drbd_advance_rs_marks(device, still_to_go);
860 if (cleared || rs_is_done)
861 maybe_schedule_on_disk_bitmap_update(device, rs_is_done);
862 } else if (mode == RECORD_RS_FAILED)
863 device->rs_failed += count;
837 wake_up(&device->al_wait); 864 wake_up(&device->al_wait);
865 }
866 return count;
838} 867}
839 868
840/* 869/* clear the bit corresponding to the piece of storage in question:
841 * this is intended to set one request worth of data out of sync. 870 * size byte of data starting from sector. Only clear a bits of the affected
842 * affects at least 1 bit, 871 * one ore more _aligned_ BM_BLOCK_SIZE blocks.
843 * and at most 1+DRBD_MAX_BIO_SIZE/BM_BLOCK_SIZE bits. 872 *
873 * called by worker on C_SYNC_TARGET and receiver on SyncSource.
844 * 874 *
845 * called by tl_clear and drbd_send_dblock (==drbd_make_request).
846 * so this can be _any_ process.
847 */ 875 */
848int __drbd_set_out_of_sync(struct drbd_device *device, sector_t sector, int size, 876int __drbd_change_sync(struct drbd_device *device, sector_t sector, int size,
849 const char *file, const unsigned int line) 877 enum update_sync_bits_mode mode,
878 const char *file, const unsigned int line)
850{ 879{
851 unsigned long sbnr, ebnr, flags; 880 /* Is called from worker and receiver context _only_ */
881 unsigned long sbnr, ebnr, lbnr;
882 unsigned long count = 0;
852 sector_t esector, nr_sectors; 883 sector_t esector, nr_sectors;
853 unsigned int enr, count = 0;
854 struct lc_element *e;
855 884
856 /* this should be an empty REQ_FLUSH */ 885 /* This would be an empty REQ_FLUSH, be silent. */
857 if (size == 0) 886 if ((mode == SET_OUT_OF_SYNC) && size == 0)
858 return 0; 887 return 0;
859 888
860 if (size < 0 || !IS_ALIGNED(size, 512) || size > DRBD_MAX_DISCARD_SIZE) { 889 if (size <= 0 || !IS_ALIGNED(size, 512) || size > DRBD_MAX_DISCARD_SIZE) {
861 drbd_err(device, "sector: %llus, size: %d\n", 890 drbd_err(device, "%s: sector=%llus size=%d nonsense!\n",
862 (unsigned long long)sector, size); 891 drbd_change_sync_fname[mode],
892 (unsigned long long)sector, size);
863 return 0; 893 return 0;
864 } 894 }
865 895
866 if (!get_ldev(device)) 896 if (!get_ldev(device))
867 return 0; /* no disk, no metadata, no bitmap to set bits in */ 897 return 0; /* no disk, no metadata, no bitmap to manipulate bits in */
868 898
869 nr_sectors = drbd_get_capacity(device->this_bdev); 899 nr_sectors = drbd_get_capacity(device->this_bdev);
870 esector = sector + (size >> 9) - 1; 900 esector = sector + (size >> 9) - 1;
@@ -874,25 +904,28 @@ int __drbd_set_out_of_sync(struct drbd_device *device, sector_t sector, int size
874 if (!expect(esector < nr_sectors)) 904 if (!expect(esector < nr_sectors))
875 esector = nr_sectors - 1; 905 esector = nr_sectors - 1;
876 906
877 /* we set it out of sync, 907 lbnr = BM_SECT_TO_BIT(nr_sectors-1);
878 * we do not need to round anything here */
879 sbnr = BM_SECT_TO_BIT(sector);
880 ebnr = BM_SECT_TO_BIT(esector);
881
882 /* ok, (capacity & 7) != 0 sometimes, but who cares...
883 * we count rs_{total,left} in bits, not sectors. */
884 spin_lock_irqsave(&device->al_lock, flags);
885 count = drbd_bm_set_bits(device, sbnr, ebnr);
886 908
887 enr = BM_SECT_TO_EXT(sector); 909 if (mode == SET_IN_SYNC) {
888 e = lc_find(device->resync, enr); 910 /* Round up start sector, round down end sector. We make sure
889 if (e) 911 * we only clear full, aligned, BM_BLOCK_SIZE blocks. */
890 lc_entry(e, struct bm_extent, lce)->rs_left += count; 912 if (unlikely(esector < BM_SECT_PER_BIT-1))
891 spin_unlock_irqrestore(&device->al_lock, flags); 913 goto out;
914 if (unlikely(esector == (nr_sectors-1)))
915 ebnr = lbnr;
916 else
917 ebnr = BM_SECT_TO_BIT(esector - (BM_SECT_PER_BIT-1));
918 sbnr = BM_SECT_TO_BIT(sector + BM_SECT_PER_BIT-1);
919 } else {
920 /* We set it out of sync, or record resync failure.
921 * Should not round anything here. */
922 sbnr = BM_SECT_TO_BIT(sector);
923 ebnr = BM_SECT_TO_BIT(esector);
924 }
892 925
926 count = update_sync_bits(device, sbnr, ebnr, mode);
893out: 927out:
894 put_ldev(device); 928 put_ldev(device);
895
896 return count; 929 return count;
897} 930}
898 931
@@ -1209,69 +1242,3 @@ int drbd_rs_del_all(struct drbd_device *device)
1209 1242
1210 return 0; 1243 return 0;
1211} 1244}
1212
1213/**
1214 * drbd_rs_failed_io() - Record information on a failure to resync the specified blocks
1215 * @device: DRBD device.
1216 * @sector: The sector number.
1217 * @size: Size of failed IO operation, in byte.
1218 */
1219void drbd_rs_failed_io(struct drbd_device *device, sector_t sector, int size)
1220{
1221 /* Is called from worker and receiver context _only_ */
1222 unsigned long sbnr, ebnr, lbnr;
1223 unsigned long count;
1224 sector_t esector, nr_sectors;
1225 int wake_up = 0;
1226
1227 if (size <= 0 || !IS_ALIGNED(size, 512) || size > DRBD_MAX_DISCARD_SIZE) {
1228 drbd_err(device, "drbd_rs_failed_io: sector=%llus size=%d nonsense!\n",
1229 (unsigned long long)sector, size);
1230 return;
1231 }
1232 nr_sectors = drbd_get_capacity(device->this_bdev);
1233 esector = sector + (size >> 9) - 1;
1234
1235 if (!expect(sector < nr_sectors))
1236 return;
1237 if (!expect(esector < nr_sectors))
1238 esector = nr_sectors - 1;
1239
1240 lbnr = BM_SECT_TO_BIT(nr_sectors-1);
1241
1242 /*
1243 * round up start sector, round down end sector. we make sure we only
1244 * handle full, aligned, BM_BLOCK_SIZE (4K) blocks */
1245 if (unlikely(esector < BM_SECT_PER_BIT-1))
1246 return;
1247 if (unlikely(esector == (nr_sectors-1)))
1248 ebnr = lbnr;
1249 else
1250 ebnr = BM_SECT_TO_BIT(esector - (BM_SECT_PER_BIT-1));
1251 sbnr = BM_SECT_TO_BIT(sector + BM_SECT_PER_BIT-1);
1252
1253 if (sbnr > ebnr)
1254 return;
1255
1256 /*
1257 * ok, (capacity & 7) != 0 sometimes, but who cares...
1258 * we count rs_{total,left} in bits, not sectors.
1259 */
1260 spin_lock_irq(&device->al_lock);
1261 count = drbd_bm_count_bits(device, sbnr, ebnr);
1262 if (count) {
1263 device->rs_failed += count;
1264
1265 if (get_ldev(device)) {
1266 drbd_try_clear_on_disk_bm(device, sector, count, false);
1267 put_ldev(device);
1268 }
1269
1270 /* just wake_up unconditional now, various lc_chaged(),
1271 * lc_put() in drbd_try_clear_on_disk_bm(). */
1272 wake_up = 1;
1273 }
1274 spin_unlock_irq(&device->al_lock);
1275 if (wake_up)
1276 wake_up(&device->al_wait);
1277}
diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h
index eb002a7656af..a16f9ae3c98a 100644
--- a/drivers/block/drbd/drbd_int.h
+++ b/drivers/block/drbd/drbd_int.h
@@ -432,7 +432,11 @@ enum {
432 * goes into C_CONNECTED state. */ 432 * goes into C_CONNECTED state. */
433 CONSIDER_RESYNC, 433 CONSIDER_RESYNC,
434 434
435 RS_PROGRESS, /* tell worker that resync made significant progress */
436 RS_DONE, /* tell worker that resync is done */
437
435 MD_NO_FUA, /* Users wants us to not use FUA/FLUSH on meta data dev */ 438 MD_NO_FUA, /* Users wants us to not use FUA/FLUSH on meta data dev */
439
436 SUSPEND_IO, /* suspend application io */ 440 SUSPEND_IO, /* suspend application io */
437 BITMAP_IO, /* suspend application io; 441 BITMAP_IO, /* suspend application io;
438 once no more io in flight, start bitmap io */ 442 once no more io in flight, start bitmap io */
@@ -577,6 +581,7 @@ enum {
577 * and potentially deadlock on, this drbd worker. 581 * and potentially deadlock on, this drbd worker.
578 */ 582 */
579 DISCONNECT_SENT, 583 DISCONNECT_SENT,
584 CONN_RS_PROGRESS, /* tell worker that resync made significant progress */
580}; 585};
581 586
582struct drbd_resource { 587struct drbd_resource {
@@ -1106,17 +1111,21 @@ struct bm_extent {
1106/* in which _bitmap_ extent (resp. sector) the bit for a certain 1111/* in which _bitmap_ extent (resp. sector) the bit for a certain
1107 * _storage_ sector is located in */ 1112 * _storage_ sector is located in */
1108#define BM_SECT_TO_EXT(x) ((x)>>(BM_EXT_SHIFT-9)) 1113#define BM_SECT_TO_EXT(x) ((x)>>(BM_EXT_SHIFT-9))
1114#define BM_BIT_TO_EXT(x) ((x) >> (BM_EXT_SHIFT - BM_BLOCK_SHIFT))
1109 1115
1110/* how much _storage_ sectors we have per bitmap sector */ 1116/* first storage sector a bitmap extent corresponds to */
1111#define BM_EXT_TO_SECT(x) ((sector_t)(x) << (BM_EXT_SHIFT-9)) 1117#define BM_EXT_TO_SECT(x) ((sector_t)(x) << (BM_EXT_SHIFT-9))
1118/* how much _storage_ sectors we have per bitmap extent */
1112#define BM_SECT_PER_EXT BM_EXT_TO_SECT(1) 1119#define BM_SECT_PER_EXT BM_EXT_TO_SECT(1)
1120/* how many bits are covered by one bitmap extent (resync extent) */
1121#define BM_BITS_PER_EXT (1UL << (BM_EXT_SHIFT - BM_BLOCK_SHIFT))
1122
1123#define BM_BLOCKS_PER_BM_EXT_MASK (BM_BITS_PER_EXT - 1)
1124
1113 1125
1114/* in one sector of the bitmap, we have this many activity_log extents. */ 1126/* in one sector of the bitmap, we have this many activity_log extents. */
1115#define AL_EXT_PER_BM_SECT (1 << (BM_EXT_SHIFT - AL_EXTENT_SHIFT)) 1127#define AL_EXT_PER_BM_SECT (1 << (BM_EXT_SHIFT - AL_EXTENT_SHIFT))
1116 1128
1117#define BM_BLOCKS_PER_BM_EXT_B (BM_EXT_SHIFT - BM_BLOCK_SHIFT)
1118#define BM_BLOCKS_PER_BM_EXT_MASK ((1<<BM_BLOCKS_PER_BM_EXT_B) - 1)
1119
1120/* the extent in "PER_EXTENT" below is an activity log extent 1129/* the extent in "PER_EXTENT" below is an activity log extent
1121 * we need that many (long words/bytes) to store the bitmap 1130 * we need that many (long words/bytes) to store the bitmap
1122 * of one AL_EXTENT_SIZE chunk of storage. 1131 * of one AL_EXTENT_SIZE chunk of storage.
@@ -1214,7 +1223,6 @@ extern unsigned long _drbd_bm_find_next(struct drbd_device *device, unsigned lon
1214extern unsigned long _drbd_bm_find_next_zero(struct drbd_device *device, unsigned long bm_fo); 1223extern unsigned long _drbd_bm_find_next_zero(struct drbd_device *device, unsigned long bm_fo);
1215extern unsigned long _drbd_bm_total_weight(struct drbd_device *device); 1224extern unsigned long _drbd_bm_total_weight(struct drbd_device *device);
1216extern unsigned long drbd_bm_total_weight(struct drbd_device *device); 1225extern unsigned long drbd_bm_total_weight(struct drbd_device *device);
1217extern int drbd_bm_rs_done(struct drbd_device *device);
1218/* for receive_bitmap */ 1226/* for receive_bitmap */
1219extern void drbd_bm_merge_lel(struct drbd_device *device, size_t offset, 1227extern void drbd_bm_merge_lel(struct drbd_device *device, size_t offset,
1220 size_t number, unsigned long *buffer); 1228 size_t number, unsigned long *buffer);
@@ -1503,14 +1511,17 @@ extern int drbd_rs_del_all(struct drbd_device *device);
1503extern void drbd_rs_failed_io(struct drbd_device *device, 1511extern void drbd_rs_failed_io(struct drbd_device *device,
1504 sector_t sector, int size); 1512 sector_t sector, int size);
1505extern void drbd_advance_rs_marks(struct drbd_device *device, unsigned long still_to_go); 1513extern void drbd_advance_rs_marks(struct drbd_device *device, unsigned long still_to_go);
1506extern void __drbd_set_in_sync(struct drbd_device *device, sector_t sector, 1514
1507 int size, const char *file, const unsigned int line); 1515enum update_sync_bits_mode { RECORD_RS_FAILED, SET_OUT_OF_SYNC, SET_IN_SYNC };
1516extern int __drbd_change_sync(struct drbd_device *device, sector_t sector, int size,
1517 enum update_sync_bits_mode mode,
1518 const char *file, const unsigned int line);
1508#define drbd_set_in_sync(device, sector, size) \ 1519#define drbd_set_in_sync(device, sector, size) \
1509 __drbd_set_in_sync(device, sector, size, __FILE__, __LINE__) 1520 __drbd_change_sync(device, sector, size, SET_IN_SYNC, __FILE__, __LINE__)
1510extern int __drbd_set_out_of_sync(struct drbd_device *device, sector_t sector,
1511 int size, const char *file, const unsigned int line);
1512#define drbd_set_out_of_sync(device, sector, size) \ 1521#define drbd_set_out_of_sync(device, sector, size) \
1513 __drbd_set_out_of_sync(device, sector, size, __FILE__, __LINE__) 1522 __drbd_change_sync(device, sector, size, SET_OUT_OF_SYNC, __FILE__, __LINE__)
1523#define drbd_rs_failed_io(device, sector, size) \
1524 __drbd_change_sync(device, sector, size, RECORD_RS_FAILED, __FILE__, __LINE__)
1514extern void drbd_al_shrink(struct drbd_device *device); 1525extern void drbd_al_shrink(struct drbd_device *device);
1515extern int drbd_initialize_al(struct drbd_device *, void *); 1526extern int drbd_initialize_al(struct drbd_device *, void *);
1516 1527
@@ -1915,6 +1926,15 @@ static inline void _sub_unacked(struct drbd_device *device, int n, const char *f
1915 ERR_IF_CNT_IS_NEGATIVE(unacked_cnt, func, line); 1926 ERR_IF_CNT_IS_NEGATIVE(unacked_cnt, func, line);
1916} 1927}
1917 1928
1929static inline bool is_sync_state(enum drbd_conns connection_state)
1930{
1931 return
1932 (connection_state == C_SYNC_SOURCE
1933 || connection_state == C_SYNC_TARGET
1934 || connection_state == C_PAUSED_SYNC_S
1935 || connection_state == C_PAUSED_SYNC_T);
1936}
1937
1918/** 1938/**
1919 * get_ldev() - Increase the ref count on device->ldev. Returns 0 if there is no ldev 1939 * get_ldev() - Increase the ref count on device->ldev. Returns 0 if there is no ldev
1920 * @M: DRBD device. 1940 * @M: DRBD device.
diff --git a/drivers/block/drbd/drbd_state.c b/drivers/block/drbd/drbd_state.c
index 19da7c7590cd..1bddd6cf8ac7 100644
--- a/drivers/block/drbd/drbd_state.c
+++ b/drivers/block/drbd/drbd_state.c
@@ -1011,6 +1011,9 @@ __drbd_set_state(struct drbd_device *device, union drbd_state ns,
1011 atomic_inc(&device->local_cnt); 1011 atomic_inc(&device->local_cnt);
1012 1012
1013 did_remote = drbd_should_do_remote(device->state); 1013 did_remote = drbd_should_do_remote(device->state);
1014 if (!is_sync_state(os.conn) && is_sync_state(ns.conn))
1015 clear_bit(RS_DONE, &device->flags);
1016
1014 device->state.i = ns.i; 1017 device->state.i = ns.i;
1015 should_do_remote = drbd_should_do_remote(device->state); 1018 should_do_remote = drbd_should_do_remote(device->state);
1016 device->resource->susp = ns.susp; 1019 device->resource->susp = ns.susp;
diff --git a/drivers/block/drbd/drbd_worker.c b/drivers/block/drbd/drbd_worker.c
index 47bc84017b5b..bafb62eb22c9 100644
--- a/drivers/block/drbd/drbd_worker.c
+++ b/drivers/block/drbd/drbd_worker.c
@@ -1740,11 +1740,20 @@ void drbd_start_resync(struct drbd_device *device, enum drbd_conns side)
1740 device->rs_mark_time[i] = now; 1740 device->rs_mark_time[i] = now;
1741 } 1741 }
1742 _drbd_pause_after(device); 1742 _drbd_pause_after(device);
1743 /* Forget potentially stale cached per resync extent bit-counts.
1744 * Open coded drbd_rs_cancel_all(device), we already have IRQs
1745 * disabled, and know the disk state is ok. */
1746 spin_lock(&device->al_lock);
1747 lc_reset(device->resync);
1748 device->resync_locked = 0;
1749 device->resync_wenr = LC_FREE;
1750 spin_unlock(&device->al_lock);
1743 } 1751 }
1744 write_unlock(&global_state_lock); 1752 write_unlock(&global_state_lock);
1745 spin_unlock_irq(&device->resource->req_lock); 1753 spin_unlock_irq(&device->resource->req_lock);
1746 1754
1747 if (r == SS_SUCCESS) { 1755 if (r == SS_SUCCESS) {
1756 wake_up(&device->al_wait); /* for lc_reset() above */
1748 /* reset rs_last_bcast when a resync or verify is started, 1757 /* reset rs_last_bcast when a resync or verify is started,
1749 * to deal with potential jiffies wrap. */ 1758 * to deal with potential jiffies wrap. */
1750 device->rs_last_bcast = jiffies - HZ; 1759 device->rs_last_bcast = jiffies - HZ;
@@ -1807,36 +1816,22 @@ void drbd_start_resync(struct drbd_device *device, enum drbd_conns side)
1807static void update_on_disk_bitmap(struct drbd_device *device) 1816static void update_on_disk_bitmap(struct drbd_device *device)
1808{ 1817{
1809 struct sib_info sib = { .sib_reason = SIB_SYNC_PROGRESS, }; 1818 struct sib_info sib = { .sib_reason = SIB_SYNC_PROGRESS, };
1819 bool resync_done = test_and_clear_bit(RS_DONE, &device->flags);
1810 device->rs_last_bcast = jiffies; 1820 device->rs_last_bcast = jiffies;
1811 1821
1812 if (!get_ldev(device)) 1822 if (!get_ldev(device))
1813 return; 1823 return;
1814 1824
1815 drbd_bm_write_lazy(device, 0); 1825 drbd_bm_write_lazy(device, 0);
1816 if (drbd_bm_total_weight(device) <= device->rs_failed) 1826 if (resync_done && is_sync_state(device->state.conn))
1817 drbd_resync_finished(device); 1827 drbd_resync_finished(device);
1828
1818 drbd_bcast_event(device, &sib); 1829 drbd_bcast_event(device, &sib);
1819 /* update timestamp, in case it took a while to write out stuff */ 1830 /* update timestamp, in case it took a while to write out stuff */
1820 device->rs_last_bcast = jiffies; 1831 device->rs_last_bcast = jiffies;
1821 put_ldev(device); 1832 put_ldev(device);
1822} 1833}
1823 1834
1824bool wants_lazy_bitmap_update(struct drbd_device *device)
1825{
1826 enum drbd_conns connection_state = device->state.conn;
1827 return
1828 /* only do a lazy writeout, if device is in some resync state */
1829 (connection_state == C_SYNC_SOURCE
1830 || connection_state == C_SYNC_TARGET
1831 || connection_state == C_PAUSED_SYNC_S
1832 || connection_state == C_PAUSED_SYNC_T) &&
1833 /* AND
1834 * either we just finished, or the last lazy update
1835 * was some time ago already. */
1836 (drbd_bm_total_weight(device) <= device->rs_failed
1837 || time_after(jiffies, device->rs_last_bcast + 2*HZ));
1838}
1839
1840static void try_update_all_on_disk_bitmaps(struct drbd_connection *connection) 1835static void try_update_all_on_disk_bitmaps(struct drbd_connection *connection)
1841{ 1836{
1842 struct drbd_peer_device *peer_device; 1837 struct drbd_peer_device *peer_device;
@@ -1845,8 +1840,9 @@ static void try_update_all_on_disk_bitmaps(struct drbd_connection *connection)
1845 rcu_read_lock(); 1840 rcu_read_lock();
1846 idr_for_each_entry(&connection->peer_devices, peer_device, vnr) { 1841 idr_for_each_entry(&connection->peer_devices, peer_device, vnr) {
1847 struct drbd_device *device = peer_device->device; 1842 struct drbd_device *device = peer_device->device;
1848 if (!wants_lazy_bitmap_update(device)) 1843 if (!test_and_clear_bit(RS_PROGRESS, &device->flags))
1849 continue; 1844 continue;
1845
1850 kref_get(&device->kref); 1846 kref_get(&device->kref);
1851 rcu_read_unlock(); 1847 rcu_read_unlock();
1852 update_on_disk_bitmap(device); 1848 update_on_disk_bitmap(device);
@@ -1930,15 +1926,18 @@ static void wait_for_work(struct drbd_connection *connection, struct list_head *
1930 if (send_barrier) 1926 if (send_barrier)
1931 maybe_send_barrier(connection, 1927 maybe_send_barrier(connection,
1932 connection->send.current_epoch_nr + 1); 1928 connection->send.current_epoch_nr + 1);
1929
1930 if (test_bit(CONN_RS_PROGRESS, &connection->flags))
1931 break;
1932
1933 /* drbd_send() may have called flush_signals() */ 1933 /* drbd_send() may have called flush_signals() */
1934 if (get_t_state(&connection->worker) != RUNNING) 1934 if (get_t_state(&connection->worker) != RUNNING)
1935 break; 1935 break;
1936
1936 schedule(); 1937 schedule();
1937 /* may be woken up for other things but new work, too, 1938 /* may be woken up for other things but new work, too,
1938 * e.g. if the current epoch got closed. 1939 * e.g. if the current epoch got closed.
1939 * In which case we send the barrier above. */ 1940 * In which case we send the barrier above. */
1940
1941 try_update_all_on_disk_bitmaps(connection);
1942 } 1941 }
1943 finish_wait(&connection->sender_work.q_wait, &wait); 1942 finish_wait(&connection->sender_work.q_wait, &wait);
1944 1943
@@ -1973,6 +1972,9 @@ int drbd_worker(struct drbd_thread *thi)
1973 if (list_empty(&work_list)) 1972 if (list_empty(&work_list))
1974 wait_for_work(connection, &work_list); 1973 wait_for_work(connection, &work_list);
1975 1974
1975 if (test_and_clear_bit(CONN_RS_PROGRESS, &connection->flags))
1976 try_update_all_on_disk_bitmaps(connection);
1977
1976 if (signal_pending(current)) { 1978 if (signal_pending(current)) {
1977 flush_signals(current); 1979 flush_signals(current);
1978 if (get_t_state(thi) == RUNNING) { 1980 if (get_t_state(thi) == RUNNING) {