diff options
author | Lars Ellenberg <lars.ellenberg@linbit.com> | 2014-01-27 09:58:22 -0500 |
---|---|---|
committer | Philipp Reisner <philipp.reisner@linbit.com> | 2014-07-10 12:34:50 -0400 |
commit | 5ab7d2c005135849cf0bb1485d954c98f2cca57c (patch) | |
tree | 43340069d199c864871c04f9672639415a7bb8fa /drivers/block | |
parent | a80ca1ae81fc52e304e753f6de4ef248df364f9e (diff) |
drbd: fix resync finished detection
This fixes one recent regression,
and one long existing bug.
The bug:
drbd_try_clear_on_disk_bm() assumed that all "count" bits have to be
accounted in the resync extent corresponding to the start sector.
Since we allow application requests to cross our "extent" boundaries,
this assumption is no longer true, resulting in possible misaccounting,
scary messages
("BAD! sector=12345s enr=6 rs_left=-7 rs_failed=0 count=58 cstate=..."),
and potentially, if the last bit to be cleared during resync would
reside in a previously misaccounted resync extent, the resync would never
be recognized as finished, but would be "stalled" forever, even though
all blocks are in sync again and all bits have been cleared...
The regression was introduced by
drbd: get rid of atomic update on disk bitmap works
For an "empty" resync (rs_total == 0), we must not "finish" the
resync on the SyncSource before the SyncTarget knows all relevant
information (sync uuid). We need to wait for the full round-trip,
the SyncTarget will then explicitly notify us.
Also for normal, non-empty resyncs (rs_total > 0), the resync-finished
condition needs to be tested before the schedule() in wait_for_work, or
it is likely to be missed.
Signed-off-by: Philipp Reisner <philipp.reisner@linbit.com>
Signed-off-by: Lars Ellenberg <lars.ellenberg@linbit.com>
Diffstat (limited to 'drivers/block')
-rw-r--r-- | drivers/block/drbd/drbd_actlog.c | 315 | ||||
-rw-r--r-- | drivers/block/drbd/drbd_int.h | 42 | ||||
-rw-r--r-- | drivers/block/drbd/drbd_state.c | 3 | ||||
-rw-r--r-- | drivers/block/drbd/drbd_worker.c | 42 |
4 files changed, 197 insertions, 205 deletions
diff --git a/drivers/block/drbd/drbd_actlog.c b/drivers/block/drbd/drbd_actlog.c index 9c42edf4871b..278c31f24639 100644 --- a/drivers/block/drbd/drbd_actlog.c +++ b/drivers/block/drbd/drbd_actlog.c | |||
@@ -667,36 +667,56 @@ int drbd_initialize_al(struct drbd_device *device, void *buffer) | |||
667 | return 0; | 667 | return 0; |
668 | } | 668 | } |
669 | 669 | ||
670 | static const char *drbd_change_sync_fname[] = { | ||
671 | [RECORD_RS_FAILED] = "drbd_rs_failed_io", | ||
672 | [SET_IN_SYNC] = "drbd_set_in_sync", | ||
673 | [SET_OUT_OF_SYNC] = "drbd_set_out_of_sync" | ||
674 | }; | ||
675 | |||
670 | /* ATTENTION. The AL's extents are 4MB each, while the extents in the | 676 | /* ATTENTION. The AL's extents are 4MB each, while the extents in the |
671 | * resync LRU-cache are 16MB each. | 677 | * resync LRU-cache are 16MB each. |
672 | * The caller of this function has to hold an get_ldev() reference. | 678 | * The caller of this function has to hold an get_ldev() reference. |
673 | * | 679 | * |
680 | * Adjusts the caching members ->rs_left (success) or ->rs_failed (!success), | ||
681 | * potentially pulling in (and recounting the corresponding bits) | ||
682 | * this resync extent into the resync extent lru cache. | ||
683 | * | ||
684 | * Returns whether all bits have been cleared for this resync extent, | ||
685 | * precisely: (rs_left <= rs_failed) | ||
686 | * | ||
674 | * TODO will be obsoleted once we have a caching lru of the on disk bitmap | 687 | * TODO will be obsoleted once we have a caching lru of the on disk bitmap |
675 | */ | 688 | */ |
676 | static void drbd_try_clear_on_disk_bm(struct drbd_device *device, sector_t sector, | 689 | static bool update_rs_extent(struct drbd_device *device, |
677 | int count, int success) | 690 | unsigned int enr, int count, |
691 | enum update_sync_bits_mode mode) | ||
678 | { | 692 | { |
679 | struct lc_element *e; | 693 | struct lc_element *e; |
680 | unsigned int enr; | ||
681 | 694 | ||
682 | D_ASSERT(device, atomic_read(&device->local_cnt)); | 695 | D_ASSERT(device, atomic_read(&device->local_cnt)); |
683 | 696 | ||
684 | /* I simply assume that a sector/size pair never crosses | 697 | /* When setting out-of-sync bits, |
685 | * a 16 MB extent border. (Currently this is true...) */ | 698 | * we don't need it cached (lc_find). |
686 | enr = BM_SECT_TO_EXT(sector); | 699 | * But if it is present in the cache, |
687 | 700 | * we should update the cached bit count. | |
688 | e = lc_get(device->resync, enr); | 701 | * Otherwise, that extent should be in the resync extent lru cache |
702 | * already -- or we want to pull it in if necessary -- (lc_get), | ||
703 | * then update and check rs_left and rs_failed. */ | ||
704 | if (mode == SET_OUT_OF_SYNC) | ||
705 | e = lc_find(device->resync, enr); | ||
706 | else | ||
707 | e = lc_get(device->resync, enr); | ||
689 | if (e) { | 708 | if (e) { |
690 | struct bm_extent *ext = lc_entry(e, struct bm_extent, lce); | 709 | struct bm_extent *ext = lc_entry(e, struct bm_extent, lce); |
691 | if (ext->lce.lc_number == enr) { | 710 | if (ext->lce.lc_number == enr) { |
692 | if (success) | 711 | if (mode == SET_IN_SYNC) |
693 | ext->rs_left -= count; | 712 | ext->rs_left -= count; |
713 | else if (mode == SET_OUT_OF_SYNC) | ||
714 | ext->rs_left += count; | ||
694 | else | 715 | else |
695 | ext->rs_failed += count; | 716 | ext->rs_failed += count; |
696 | if (ext->rs_left < ext->rs_failed) { | 717 | if (ext->rs_left < ext->rs_failed) { |
697 | drbd_warn(device, "BAD! sector=%llus enr=%u rs_left=%d " | 718 | drbd_warn(device, "BAD! enr=%u rs_left=%d " |
698 | "rs_failed=%d count=%d cstate=%s\n", | 719 | "rs_failed=%d count=%d cstate=%s\n", |
699 | (unsigned long long)sector, | ||
700 | ext->lce.lc_number, ext->rs_left, | 720 | ext->lce.lc_number, ext->rs_left, |
701 | ext->rs_failed, count, | 721 | ext->rs_failed, count, |
702 | drbd_conn_str(device->state.conn)); | 722 | drbd_conn_str(device->state.conn)); |
@@ -730,24 +750,27 @@ static void drbd_try_clear_on_disk_bm(struct drbd_device *device, sector_t secto | |||
730 | ext->lce.lc_number, ext->rs_failed); | 750 | ext->lce.lc_number, ext->rs_failed); |
731 | } | 751 | } |
732 | ext->rs_left = rs_left; | 752 | ext->rs_left = rs_left; |
733 | ext->rs_failed = success ? 0 : count; | 753 | ext->rs_failed = (mode == RECORD_RS_FAILED) ? count : 0; |
734 | /* we don't keep a persistent log of the resync lru, | 754 | /* we don't keep a persistent log of the resync lru, |
735 | * we can commit any change right away. */ | 755 | * we can commit any change right away. */ |
736 | lc_committed(device->resync); | 756 | lc_committed(device->resync); |
737 | } | 757 | } |
738 | lc_put(device->resync, &ext->lce); | 758 | if (mode != SET_OUT_OF_SYNC) |
759 | lc_put(device->resync, &ext->lce); | ||
739 | /* no race, we are within the al_lock! */ | 760 | /* no race, we are within the al_lock! */ |
740 | 761 | ||
741 | if (ext->rs_left == ext->rs_failed) { | 762 | if (ext->rs_left <= ext->rs_failed) { |
742 | ext->rs_failed = 0; | 763 | ext->rs_failed = 0; |
743 | wake_up(&first_peer_device(device)->connection->sender_work.q_wait); | 764 | return true; |
744 | } | 765 | } |
745 | } else { | 766 | } else if (mode != SET_OUT_OF_SYNC) { |
767 | /* be quiet if lc_find() did not find it. */ | ||
746 | drbd_err(device, "lc_get() failed! locked=%d/%d flags=%lu\n", | 768 | drbd_err(device, "lc_get() failed! locked=%d/%d flags=%lu\n", |
747 | device->resync_locked, | 769 | device->resync_locked, |
748 | device->resync->nr_elements, | 770 | device->resync->nr_elements, |
749 | device->resync->flags); | 771 | device->resync->flags); |
750 | } | 772 | } |
773 | return false; | ||
751 | } | 774 | } |
752 | 775 | ||
753 | void drbd_advance_rs_marks(struct drbd_device *device, unsigned long still_to_go) | 776 | void drbd_advance_rs_marks(struct drbd_device *device, unsigned long still_to_go) |
@@ -766,105 +789,112 @@ void drbd_advance_rs_marks(struct drbd_device *device, unsigned long still_to_go | |||
766 | } | 789 | } |
767 | } | 790 | } |
768 | 791 | ||
769 | /* clear the bit corresponding to the piece of storage in question: | 792 | /* It is called lazy update, so don't do write-out too often. */ |
770 | * size byte of data starting from sector. Only clear a bits of the affected | 793 | static bool lazy_bitmap_update_due(struct drbd_device *device) |
771 | * one ore more _aligned_ BM_BLOCK_SIZE blocks. | ||
772 | * | ||
773 | * called by worker on C_SYNC_TARGET and receiver on SyncSource. | ||
774 | * | ||
775 | */ | ||
776 | void __drbd_set_in_sync(struct drbd_device *device, sector_t sector, int size, | ||
777 | const char *file, const unsigned int line) | ||
778 | { | 794 | { |
779 | /* Is called from worker and receiver context _only_ */ | 795 | return time_after(jiffies, device->rs_last_bcast + 2*HZ); |
780 | unsigned long sbnr, ebnr, lbnr; | 796 | } |
781 | unsigned long count = 0; | ||
782 | sector_t esector, nr_sectors; | ||
783 | int wake_up = 0; | ||
784 | unsigned long flags; | ||
785 | 797 | ||
786 | if (size <= 0 || !IS_ALIGNED(size, 512) || size > DRBD_MAX_DISCARD_SIZE) { | 798 | static void maybe_schedule_on_disk_bitmap_update(struct drbd_device *device, bool rs_done) |
787 | drbd_err(device, "drbd_set_in_sync: sector=%llus size=%d nonsense!\n", | 799 | { |
788 | (unsigned long long)sector, size); | 800 | struct drbd_connection *connection; |
801 | if (rs_done) | ||
802 | set_bit(RS_DONE, &device->flags); | ||
803 | /* and also set RS_PROGRESS below */ | ||
804 | else if (!lazy_bitmap_update_due(device)) | ||
789 | return; | 805 | return; |
790 | } | ||
791 | 806 | ||
792 | if (!get_ldev(device)) | 807 | /* compare with test_and_clear_bit() calls in and above |
793 | return; /* no disk, no metadata, no bitmap to clear bits in */ | 808 | * try_update_all_on_disk_bitmaps() from the drbd_worker(). */ |
794 | 809 | if (test_and_set_bit(RS_PROGRESS, &device->flags)) | |
795 | nr_sectors = drbd_get_capacity(device->this_bdev); | 810 | return; |
796 | esector = sector + (size >> 9) - 1; | 811 | connection = first_peer_device(device)->connection; |
797 | 812 | if (!test_and_set_bit(CONN_RS_PROGRESS, &connection->flags)) | |
798 | if (!expect(sector < nr_sectors)) | 813 | wake_up(&connection->sender_work.q_wait); |
799 | goto out; | 814 | } |
800 | if (!expect(esector < nr_sectors)) | ||
801 | esector = nr_sectors - 1; | ||
802 | |||
803 | lbnr = BM_SECT_TO_BIT(nr_sectors-1); | ||
804 | |||
805 | /* we clear it (in sync). | ||
806 | * round up start sector, round down end sector. we make sure we only | ||
807 | * clear full, aligned, BM_BLOCK_SIZE (4K) blocks */ | ||
808 | if (unlikely(esector < BM_SECT_PER_BIT-1)) | ||
809 | goto out; | ||
810 | if (unlikely(esector == (nr_sectors-1))) | ||
811 | ebnr = lbnr; | ||
812 | else | ||
813 | ebnr = BM_SECT_TO_BIT(esector - (BM_SECT_PER_BIT-1)); | ||
814 | sbnr = BM_SECT_TO_BIT(sector + BM_SECT_PER_BIT-1); | ||
815 | |||
816 | if (sbnr > ebnr) | ||
817 | goto out; | ||
818 | 815 | ||
816 | static int update_sync_bits(struct drbd_device *device, | ||
817 | unsigned long sbnr, unsigned long ebnr, | ||
818 | enum update_sync_bits_mode mode) | ||
819 | { | ||
819 | /* | 820 | /* |
820 | * ok, (capacity & 7) != 0 sometimes, but who cares... | 821 | * We keep a count of set bits per resync-extent in the ->rs_left |
821 | * we count rs_{total,left} in bits, not sectors. | 822 | * caching member, so we need to loop and work within the resync extent |
823 | * alignment. Typically this loop will execute exactly once. | ||
822 | */ | 824 | */ |
823 | count = drbd_bm_clear_bits(device, sbnr, ebnr); | 825 | unsigned long flags; |
824 | if (count) { | 826 | unsigned long count = 0; |
825 | drbd_advance_rs_marks(device, drbd_bm_total_weight(device)); | 827 | unsigned int cleared = 0; |
826 | spin_lock_irqsave(&device->al_lock, flags); | 828 | while (sbnr <= ebnr) { |
827 | drbd_try_clear_on_disk_bm(device, sector, count, true); | 829 | /* set temporary boundary bit number to last bit number within |
828 | spin_unlock_irqrestore(&device->al_lock, flags); | 830 | * the resync extent of the current start bit number, |
829 | 831 | * but cap at provided end bit number */ | |
830 | /* just wake_up unconditional now, various lc_chaged(), | 832 | unsigned long tbnr = min(ebnr, sbnr | BM_BLOCKS_PER_BM_EXT_MASK); |
831 | * lc_put() in drbd_try_clear_on_disk_bm(). */ | 833 | unsigned long c; |
832 | wake_up = 1; | 834 | |
835 | if (mode == RECORD_RS_FAILED) | ||
836 | /* Only called from drbd_rs_failed_io(), bits | ||
837 | * supposedly still set. Recount, maybe some | ||
838 | * of the bits have been successfully cleared | ||
839 | * by application IO meanwhile. | ||
840 | */ | ||
841 | c = drbd_bm_count_bits(device, sbnr, tbnr); | ||
842 | else if (mode == SET_IN_SYNC) | ||
843 | c = drbd_bm_clear_bits(device, sbnr, tbnr); | ||
844 | else /* if (mode == SET_OUT_OF_SYNC) */ | ||
845 | c = drbd_bm_set_bits(device, sbnr, tbnr); | ||
846 | |||
847 | if (c) { | ||
848 | spin_lock_irqsave(&device->al_lock, flags); | ||
849 | cleared += update_rs_extent(device, BM_BIT_TO_EXT(sbnr), c, mode); | ||
850 | spin_unlock_irqrestore(&device->al_lock, flags); | ||
851 | count += c; | ||
852 | } | ||
853 | sbnr = tbnr + 1; | ||
833 | } | 854 | } |
834 | out: | 855 | if (count) { |
835 | put_ldev(device); | 856 | if (mode == SET_IN_SYNC) { |
836 | if (wake_up) | 857 | unsigned long still_to_go = drbd_bm_total_weight(device); |
858 | bool rs_is_done = (still_to_go <= device->rs_failed); | ||
859 | drbd_advance_rs_marks(device, still_to_go); | ||
860 | if (cleared || rs_is_done) | ||
861 | maybe_schedule_on_disk_bitmap_update(device, rs_is_done); | ||
862 | } else if (mode == RECORD_RS_FAILED) | ||
863 | device->rs_failed += count; | ||
837 | wake_up(&device->al_wait); | 864 | wake_up(&device->al_wait); |
865 | } | ||
866 | return count; | ||
838 | } | 867 | } |
839 | 868 | ||
840 | /* | 869 | /* clear the bit corresponding to the piece of storage in question: |
841 | * this is intended to set one request worth of data out of sync. | 870 | * size byte of data starting from sector. Only clear a bits of the affected |
842 | * affects at least 1 bit, | 871 | * one ore more _aligned_ BM_BLOCK_SIZE blocks. |
843 | * and at most 1+DRBD_MAX_BIO_SIZE/BM_BLOCK_SIZE bits. | 872 | * |
873 | * called by worker on C_SYNC_TARGET and receiver on SyncSource. | ||
844 | * | 874 | * |
845 | * called by tl_clear and drbd_send_dblock (==drbd_make_request). | ||
846 | * so this can be _any_ process. | ||
847 | */ | 875 | */ |
848 | int __drbd_set_out_of_sync(struct drbd_device *device, sector_t sector, int size, | 876 | int __drbd_change_sync(struct drbd_device *device, sector_t sector, int size, |
849 | const char *file, const unsigned int line) | 877 | enum update_sync_bits_mode mode, |
878 | const char *file, const unsigned int line) | ||
850 | { | 879 | { |
851 | unsigned long sbnr, ebnr, flags; | 880 | /* Is called from worker and receiver context _only_ */ |
881 | unsigned long sbnr, ebnr, lbnr; | ||
882 | unsigned long count = 0; | ||
852 | sector_t esector, nr_sectors; | 883 | sector_t esector, nr_sectors; |
853 | unsigned int enr, count = 0; | ||
854 | struct lc_element *e; | ||
855 | 884 | ||
856 | /* this should be an empty REQ_FLUSH */ | 885 | /* This would be an empty REQ_FLUSH, be silent. */ |
857 | if (size == 0) | 886 | if ((mode == SET_OUT_OF_SYNC) && size == 0) |
858 | return 0; | 887 | return 0; |
859 | 888 | ||
860 | if (size < 0 || !IS_ALIGNED(size, 512) || size > DRBD_MAX_DISCARD_SIZE) { | 889 | if (size <= 0 || !IS_ALIGNED(size, 512) || size > DRBD_MAX_DISCARD_SIZE) { |
861 | drbd_err(device, "sector: %llus, size: %d\n", | 890 | drbd_err(device, "%s: sector=%llus size=%d nonsense!\n", |
862 | (unsigned long long)sector, size); | 891 | drbd_change_sync_fname[mode], |
892 | (unsigned long long)sector, size); | ||
863 | return 0; | 893 | return 0; |
864 | } | 894 | } |
865 | 895 | ||
866 | if (!get_ldev(device)) | 896 | if (!get_ldev(device)) |
867 | return 0; /* no disk, no metadata, no bitmap to set bits in */ | 897 | return 0; /* no disk, no metadata, no bitmap to manipulate bits in */ |
868 | 898 | ||
869 | nr_sectors = drbd_get_capacity(device->this_bdev); | 899 | nr_sectors = drbd_get_capacity(device->this_bdev); |
870 | esector = sector + (size >> 9) - 1; | 900 | esector = sector + (size >> 9) - 1; |
@@ -874,25 +904,28 @@ int __drbd_set_out_of_sync(struct drbd_device *device, sector_t sector, int size | |||
874 | if (!expect(esector < nr_sectors)) | 904 | if (!expect(esector < nr_sectors)) |
875 | esector = nr_sectors - 1; | 905 | esector = nr_sectors - 1; |
876 | 906 | ||
877 | /* we set it out of sync, | 907 | lbnr = BM_SECT_TO_BIT(nr_sectors-1); |
878 | * we do not need to round anything here */ | ||
879 | sbnr = BM_SECT_TO_BIT(sector); | ||
880 | ebnr = BM_SECT_TO_BIT(esector); | ||
881 | |||
882 | /* ok, (capacity & 7) != 0 sometimes, but who cares... | ||
883 | * we count rs_{total,left} in bits, not sectors. */ | ||
884 | spin_lock_irqsave(&device->al_lock, flags); | ||
885 | count = drbd_bm_set_bits(device, sbnr, ebnr); | ||
886 | 908 | ||
887 | enr = BM_SECT_TO_EXT(sector); | 909 | if (mode == SET_IN_SYNC) { |
888 | e = lc_find(device->resync, enr); | 910 | /* Round up start sector, round down end sector. We make sure |
889 | if (e) | 911 | * we only clear full, aligned, BM_BLOCK_SIZE blocks. */ |
890 | lc_entry(e, struct bm_extent, lce)->rs_left += count; | 912 | if (unlikely(esector < BM_SECT_PER_BIT-1)) |
891 | spin_unlock_irqrestore(&device->al_lock, flags); | 913 | goto out; |
914 | if (unlikely(esector == (nr_sectors-1))) | ||
915 | ebnr = lbnr; | ||
916 | else | ||
917 | ebnr = BM_SECT_TO_BIT(esector - (BM_SECT_PER_BIT-1)); | ||
918 | sbnr = BM_SECT_TO_BIT(sector + BM_SECT_PER_BIT-1); | ||
919 | } else { | ||
920 | /* We set it out of sync, or record resync failure. | ||
921 | * Should not round anything here. */ | ||
922 | sbnr = BM_SECT_TO_BIT(sector); | ||
923 | ebnr = BM_SECT_TO_BIT(esector); | ||
924 | } | ||
892 | 925 | ||
926 | count = update_sync_bits(device, sbnr, ebnr, mode); | ||
893 | out: | 927 | out: |
894 | put_ldev(device); | 928 | put_ldev(device); |
895 | |||
896 | return count; | 929 | return count; |
897 | } | 930 | } |
898 | 931 | ||
@@ -1209,69 +1242,3 @@ int drbd_rs_del_all(struct drbd_device *device) | |||
1209 | 1242 | ||
1210 | return 0; | 1243 | return 0; |
1211 | } | 1244 | } |
1212 | |||
1213 | /** | ||
1214 | * drbd_rs_failed_io() - Record information on a failure to resync the specified blocks | ||
1215 | * @device: DRBD device. | ||
1216 | * @sector: The sector number. | ||
1217 | * @size: Size of failed IO operation, in byte. | ||
1218 | */ | ||
1219 | void drbd_rs_failed_io(struct drbd_device *device, sector_t sector, int size) | ||
1220 | { | ||
1221 | /* Is called from worker and receiver context _only_ */ | ||
1222 | unsigned long sbnr, ebnr, lbnr; | ||
1223 | unsigned long count; | ||
1224 | sector_t esector, nr_sectors; | ||
1225 | int wake_up = 0; | ||
1226 | |||
1227 | if (size <= 0 || !IS_ALIGNED(size, 512) || size > DRBD_MAX_DISCARD_SIZE) { | ||
1228 | drbd_err(device, "drbd_rs_failed_io: sector=%llus size=%d nonsense!\n", | ||
1229 | (unsigned long long)sector, size); | ||
1230 | return; | ||
1231 | } | ||
1232 | nr_sectors = drbd_get_capacity(device->this_bdev); | ||
1233 | esector = sector + (size >> 9) - 1; | ||
1234 | |||
1235 | if (!expect(sector < nr_sectors)) | ||
1236 | return; | ||
1237 | if (!expect(esector < nr_sectors)) | ||
1238 | esector = nr_sectors - 1; | ||
1239 | |||
1240 | lbnr = BM_SECT_TO_BIT(nr_sectors-1); | ||
1241 | |||
1242 | /* | ||
1243 | * round up start sector, round down end sector. we make sure we only | ||
1244 | * handle full, aligned, BM_BLOCK_SIZE (4K) blocks */ | ||
1245 | if (unlikely(esector < BM_SECT_PER_BIT-1)) | ||
1246 | return; | ||
1247 | if (unlikely(esector == (nr_sectors-1))) | ||
1248 | ebnr = lbnr; | ||
1249 | else | ||
1250 | ebnr = BM_SECT_TO_BIT(esector - (BM_SECT_PER_BIT-1)); | ||
1251 | sbnr = BM_SECT_TO_BIT(sector + BM_SECT_PER_BIT-1); | ||
1252 | |||
1253 | if (sbnr > ebnr) | ||
1254 | return; | ||
1255 | |||
1256 | /* | ||
1257 | * ok, (capacity & 7) != 0 sometimes, but who cares... | ||
1258 | * we count rs_{total,left} in bits, not sectors. | ||
1259 | */ | ||
1260 | spin_lock_irq(&device->al_lock); | ||
1261 | count = drbd_bm_count_bits(device, sbnr, ebnr); | ||
1262 | if (count) { | ||
1263 | device->rs_failed += count; | ||
1264 | |||
1265 | if (get_ldev(device)) { | ||
1266 | drbd_try_clear_on_disk_bm(device, sector, count, false); | ||
1267 | put_ldev(device); | ||
1268 | } | ||
1269 | |||
1270 | /* just wake_up unconditional now, various lc_chaged(), | ||
1271 | * lc_put() in drbd_try_clear_on_disk_bm(). */ | ||
1272 | wake_up = 1; | ||
1273 | } | ||
1274 | spin_unlock_irq(&device->al_lock); | ||
1275 | if (wake_up) | ||
1276 | wake_up(&device->al_wait); | ||
1277 | } | ||
diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h index eb002a7656af..a16f9ae3c98a 100644 --- a/drivers/block/drbd/drbd_int.h +++ b/drivers/block/drbd/drbd_int.h | |||
@@ -432,7 +432,11 @@ enum { | |||
432 | * goes into C_CONNECTED state. */ | 432 | * goes into C_CONNECTED state. */ |
433 | CONSIDER_RESYNC, | 433 | CONSIDER_RESYNC, |
434 | 434 | ||
435 | RS_PROGRESS, /* tell worker that resync made significant progress */ | ||
436 | RS_DONE, /* tell worker that resync is done */ | ||
437 | |||
435 | MD_NO_FUA, /* Users wants us to not use FUA/FLUSH on meta data dev */ | 438 | MD_NO_FUA, /* Users wants us to not use FUA/FLUSH on meta data dev */ |
439 | |||
436 | SUSPEND_IO, /* suspend application io */ | 440 | SUSPEND_IO, /* suspend application io */ |
437 | BITMAP_IO, /* suspend application io; | 441 | BITMAP_IO, /* suspend application io; |
438 | once no more io in flight, start bitmap io */ | 442 | once no more io in flight, start bitmap io */ |
@@ -577,6 +581,7 @@ enum { | |||
577 | * and potentially deadlock on, this drbd worker. | 581 | * and potentially deadlock on, this drbd worker. |
578 | */ | 582 | */ |
579 | DISCONNECT_SENT, | 583 | DISCONNECT_SENT, |
584 | CONN_RS_PROGRESS, /* tell worker that resync made significant progress */ | ||
580 | }; | 585 | }; |
581 | 586 | ||
582 | struct drbd_resource { | 587 | struct drbd_resource { |
@@ -1106,17 +1111,21 @@ struct bm_extent { | |||
1106 | /* in which _bitmap_ extent (resp. sector) the bit for a certain | 1111 | /* in which _bitmap_ extent (resp. sector) the bit for a certain |
1107 | * _storage_ sector is located in */ | 1112 | * _storage_ sector is located in */ |
1108 | #define BM_SECT_TO_EXT(x) ((x)>>(BM_EXT_SHIFT-9)) | 1113 | #define BM_SECT_TO_EXT(x) ((x)>>(BM_EXT_SHIFT-9)) |
1114 | #define BM_BIT_TO_EXT(x) ((x) >> (BM_EXT_SHIFT - BM_BLOCK_SHIFT)) | ||
1109 | 1115 | ||
1110 | /* how much _storage_ sectors we have per bitmap sector */ | 1116 | /* first storage sector a bitmap extent corresponds to */ |
1111 | #define BM_EXT_TO_SECT(x) ((sector_t)(x) << (BM_EXT_SHIFT-9)) | 1117 | #define BM_EXT_TO_SECT(x) ((sector_t)(x) << (BM_EXT_SHIFT-9)) |
1118 | /* how much _storage_ sectors we have per bitmap extent */ | ||
1112 | #define BM_SECT_PER_EXT BM_EXT_TO_SECT(1) | 1119 | #define BM_SECT_PER_EXT BM_EXT_TO_SECT(1) |
1120 | /* how many bits are covered by one bitmap extent (resync extent) */ | ||
1121 | #define BM_BITS_PER_EXT (1UL << (BM_EXT_SHIFT - BM_BLOCK_SHIFT)) | ||
1122 | |||
1123 | #define BM_BLOCKS_PER_BM_EXT_MASK (BM_BITS_PER_EXT - 1) | ||
1124 | |||
1113 | 1125 | ||
1114 | /* in one sector of the bitmap, we have this many activity_log extents. */ | 1126 | /* in one sector of the bitmap, we have this many activity_log extents. */ |
1115 | #define AL_EXT_PER_BM_SECT (1 << (BM_EXT_SHIFT - AL_EXTENT_SHIFT)) | 1127 | #define AL_EXT_PER_BM_SECT (1 << (BM_EXT_SHIFT - AL_EXTENT_SHIFT)) |
1116 | 1128 | ||
1117 | #define BM_BLOCKS_PER_BM_EXT_B (BM_EXT_SHIFT - BM_BLOCK_SHIFT) | ||
1118 | #define BM_BLOCKS_PER_BM_EXT_MASK ((1<<BM_BLOCKS_PER_BM_EXT_B) - 1) | ||
1119 | |||
1120 | /* the extent in "PER_EXTENT" below is an activity log extent | 1129 | /* the extent in "PER_EXTENT" below is an activity log extent |
1121 | * we need that many (long words/bytes) to store the bitmap | 1130 | * we need that many (long words/bytes) to store the bitmap |
1122 | * of one AL_EXTENT_SIZE chunk of storage. | 1131 | * of one AL_EXTENT_SIZE chunk of storage. |
@@ -1214,7 +1223,6 @@ extern unsigned long _drbd_bm_find_next(struct drbd_device *device, unsigned lon | |||
1214 | extern unsigned long _drbd_bm_find_next_zero(struct drbd_device *device, unsigned long bm_fo); | 1223 | extern unsigned long _drbd_bm_find_next_zero(struct drbd_device *device, unsigned long bm_fo); |
1215 | extern unsigned long _drbd_bm_total_weight(struct drbd_device *device); | 1224 | extern unsigned long _drbd_bm_total_weight(struct drbd_device *device); |
1216 | extern unsigned long drbd_bm_total_weight(struct drbd_device *device); | 1225 | extern unsigned long drbd_bm_total_weight(struct drbd_device *device); |
1217 | extern int drbd_bm_rs_done(struct drbd_device *device); | ||
1218 | /* for receive_bitmap */ | 1226 | /* for receive_bitmap */ |
1219 | extern void drbd_bm_merge_lel(struct drbd_device *device, size_t offset, | 1227 | extern void drbd_bm_merge_lel(struct drbd_device *device, size_t offset, |
1220 | size_t number, unsigned long *buffer); | 1228 | size_t number, unsigned long *buffer); |
@@ -1503,14 +1511,17 @@ extern int drbd_rs_del_all(struct drbd_device *device); | |||
1503 | extern void drbd_rs_failed_io(struct drbd_device *device, | 1511 | extern void drbd_rs_failed_io(struct drbd_device *device, |
1504 | sector_t sector, int size); | 1512 | sector_t sector, int size); |
1505 | extern void drbd_advance_rs_marks(struct drbd_device *device, unsigned long still_to_go); | 1513 | extern void drbd_advance_rs_marks(struct drbd_device *device, unsigned long still_to_go); |
1506 | extern void __drbd_set_in_sync(struct drbd_device *device, sector_t sector, | 1514 | |
1507 | int size, const char *file, const unsigned int line); | 1515 | enum update_sync_bits_mode { RECORD_RS_FAILED, SET_OUT_OF_SYNC, SET_IN_SYNC }; |
1516 | extern int __drbd_change_sync(struct drbd_device *device, sector_t sector, int size, | ||
1517 | enum update_sync_bits_mode mode, | ||
1518 | const char *file, const unsigned int line); | ||
1508 | #define drbd_set_in_sync(device, sector, size) \ | 1519 | #define drbd_set_in_sync(device, sector, size) \ |
1509 | __drbd_set_in_sync(device, sector, size, __FILE__, __LINE__) | 1520 | __drbd_change_sync(device, sector, size, SET_IN_SYNC, __FILE__, __LINE__) |
1510 | extern int __drbd_set_out_of_sync(struct drbd_device *device, sector_t sector, | ||
1511 | int size, const char *file, const unsigned int line); | ||
1512 | #define drbd_set_out_of_sync(device, sector, size) \ | 1521 | #define drbd_set_out_of_sync(device, sector, size) \ |
1513 | __drbd_set_out_of_sync(device, sector, size, __FILE__, __LINE__) | 1522 | __drbd_change_sync(device, sector, size, SET_OUT_OF_SYNC, __FILE__, __LINE__) |
1523 | #define drbd_rs_failed_io(device, sector, size) \ | ||
1524 | __drbd_change_sync(device, sector, size, RECORD_RS_FAILED, __FILE__, __LINE__) | ||
1514 | extern void drbd_al_shrink(struct drbd_device *device); | 1525 | extern void drbd_al_shrink(struct drbd_device *device); |
1515 | extern int drbd_initialize_al(struct drbd_device *, void *); | 1526 | extern int drbd_initialize_al(struct drbd_device *, void *); |
1516 | 1527 | ||
@@ -1915,6 +1926,15 @@ static inline void _sub_unacked(struct drbd_device *device, int n, const char *f | |||
1915 | ERR_IF_CNT_IS_NEGATIVE(unacked_cnt, func, line); | 1926 | ERR_IF_CNT_IS_NEGATIVE(unacked_cnt, func, line); |
1916 | } | 1927 | } |
1917 | 1928 | ||
1929 | static inline bool is_sync_state(enum drbd_conns connection_state) | ||
1930 | { | ||
1931 | return | ||
1932 | (connection_state == C_SYNC_SOURCE | ||
1933 | || connection_state == C_SYNC_TARGET | ||
1934 | || connection_state == C_PAUSED_SYNC_S | ||
1935 | || connection_state == C_PAUSED_SYNC_T); | ||
1936 | } | ||
1937 | |||
1918 | /** | 1938 | /** |
1919 | * get_ldev() - Increase the ref count on device->ldev. Returns 0 if there is no ldev | 1939 | * get_ldev() - Increase the ref count on device->ldev. Returns 0 if there is no ldev |
1920 | * @M: DRBD device. | 1940 | * @M: DRBD device. |
diff --git a/drivers/block/drbd/drbd_state.c b/drivers/block/drbd/drbd_state.c index 19da7c7590cd..1bddd6cf8ac7 100644 --- a/drivers/block/drbd/drbd_state.c +++ b/drivers/block/drbd/drbd_state.c | |||
@@ -1011,6 +1011,9 @@ __drbd_set_state(struct drbd_device *device, union drbd_state ns, | |||
1011 | atomic_inc(&device->local_cnt); | 1011 | atomic_inc(&device->local_cnt); |
1012 | 1012 | ||
1013 | did_remote = drbd_should_do_remote(device->state); | 1013 | did_remote = drbd_should_do_remote(device->state); |
1014 | if (!is_sync_state(os.conn) && is_sync_state(ns.conn)) | ||
1015 | clear_bit(RS_DONE, &device->flags); | ||
1016 | |||
1014 | device->state.i = ns.i; | 1017 | device->state.i = ns.i; |
1015 | should_do_remote = drbd_should_do_remote(device->state); | 1018 | should_do_remote = drbd_should_do_remote(device->state); |
1016 | device->resource->susp = ns.susp; | 1019 | device->resource->susp = ns.susp; |
diff --git a/drivers/block/drbd/drbd_worker.c b/drivers/block/drbd/drbd_worker.c index 47bc84017b5b..bafb62eb22c9 100644 --- a/drivers/block/drbd/drbd_worker.c +++ b/drivers/block/drbd/drbd_worker.c | |||
@@ -1740,11 +1740,20 @@ void drbd_start_resync(struct drbd_device *device, enum drbd_conns side) | |||
1740 | device->rs_mark_time[i] = now; | 1740 | device->rs_mark_time[i] = now; |
1741 | } | 1741 | } |
1742 | _drbd_pause_after(device); | 1742 | _drbd_pause_after(device); |
1743 | /* Forget potentially stale cached per resync extent bit-counts. | ||
1744 | * Open coded drbd_rs_cancel_all(device), we already have IRQs | ||
1745 | * disabled, and know the disk state is ok. */ | ||
1746 | spin_lock(&device->al_lock); | ||
1747 | lc_reset(device->resync); | ||
1748 | device->resync_locked = 0; | ||
1749 | device->resync_wenr = LC_FREE; | ||
1750 | spin_unlock(&device->al_lock); | ||
1743 | } | 1751 | } |
1744 | write_unlock(&global_state_lock); | 1752 | write_unlock(&global_state_lock); |
1745 | spin_unlock_irq(&device->resource->req_lock); | 1753 | spin_unlock_irq(&device->resource->req_lock); |
1746 | 1754 | ||
1747 | if (r == SS_SUCCESS) { | 1755 | if (r == SS_SUCCESS) { |
1756 | wake_up(&device->al_wait); /* for lc_reset() above */ | ||
1748 | /* reset rs_last_bcast when a resync or verify is started, | 1757 | /* reset rs_last_bcast when a resync or verify is started, |
1749 | * to deal with potential jiffies wrap. */ | 1758 | * to deal with potential jiffies wrap. */ |
1750 | device->rs_last_bcast = jiffies - HZ; | 1759 | device->rs_last_bcast = jiffies - HZ; |
@@ -1807,36 +1816,22 @@ void drbd_start_resync(struct drbd_device *device, enum drbd_conns side) | |||
1807 | static void update_on_disk_bitmap(struct drbd_device *device) | 1816 | static void update_on_disk_bitmap(struct drbd_device *device) |
1808 | { | 1817 | { |
1809 | struct sib_info sib = { .sib_reason = SIB_SYNC_PROGRESS, }; | 1818 | struct sib_info sib = { .sib_reason = SIB_SYNC_PROGRESS, }; |
1819 | bool resync_done = test_and_clear_bit(RS_DONE, &device->flags); | ||
1810 | device->rs_last_bcast = jiffies; | 1820 | device->rs_last_bcast = jiffies; |
1811 | 1821 | ||
1812 | if (!get_ldev(device)) | 1822 | if (!get_ldev(device)) |
1813 | return; | 1823 | return; |
1814 | 1824 | ||
1815 | drbd_bm_write_lazy(device, 0); | 1825 | drbd_bm_write_lazy(device, 0); |
1816 | if (drbd_bm_total_weight(device) <= device->rs_failed) | 1826 | if (resync_done && is_sync_state(device->state.conn)) |
1817 | drbd_resync_finished(device); | 1827 | drbd_resync_finished(device); |
1828 | |||
1818 | drbd_bcast_event(device, &sib); | 1829 | drbd_bcast_event(device, &sib); |
1819 | /* update timestamp, in case it took a while to write out stuff */ | 1830 | /* update timestamp, in case it took a while to write out stuff */ |
1820 | device->rs_last_bcast = jiffies; | 1831 | device->rs_last_bcast = jiffies; |
1821 | put_ldev(device); | 1832 | put_ldev(device); |
1822 | } | 1833 | } |
1823 | 1834 | ||
1824 | bool wants_lazy_bitmap_update(struct drbd_device *device) | ||
1825 | { | ||
1826 | enum drbd_conns connection_state = device->state.conn; | ||
1827 | return | ||
1828 | /* only do a lazy writeout, if device is in some resync state */ | ||
1829 | (connection_state == C_SYNC_SOURCE | ||
1830 | || connection_state == C_SYNC_TARGET | ||
1831 | || connection_state == C_PAUSED_SYNC_S | ||
1832 | || connection_state == C_PAUSED_SYNC_T) && | ||
1833 | /* AND | ||
1834 | * either we just finished, or the last lazy update | ||
1835 | * was some time ago already. */ | ||
1836 | (drbd_bm_total_weight(device) <= device->rs_failed | ||
1837 | || time_after(jiffies, device->rs_last_bcast + 2*HZ)); | ||
1838 | } | ||
1839 | |||
1840 | static void try_update_all_on_disk_bitmaps(struct drbd_connection *connection) | 1835 | static void try_update_all_on_disk_bitmaps(struct drbd_connection *connection) |
1841 | { | 1836 | { |
1842 | struct drbd_peer_device *peer_device; | 1837 | struct drbd_peer_device *peer_device; |
@@ -1845,8 +1840,9 @@ static void try_update_all_on_disk_bitmaps(struct drbd_connection *connection) | |||
1845 | rcu_read_lock(); | 1840 | rcu_read_lock(); |
1846 | idr_for_each_entry(&connection->peer_devices, peer_device, vnr) { | 1841 | idr_for_each_entry(&connection->peer_devices, peer_device, vnr) { |
1847 | struct drbd_device *device = peer_device->device; | 1842 | struct drbd_device *device = peer_device->device; |
1848 | if (!wants_lazy_bitmap_update(device)) | 1843 | if (!test_and_clear_bit(RS_PROGRESS, &device->flags)) |
1849 | continue; | 1844 | continue; |
1845 | |||
1850 | kref_get(&device->kref); | 1846 | kref_get(&device->kref); |
1851 | rcu_read_unlock(); | 1847 | rcu_read_unlock(); |
1852 | update_on_disk_bitmap(device); | 1848 | update_on_disk_bitmap(device); |
@@ -1930,15 +1926,18 @@ static void wait_for_work(struct drbd_connection *connection, struct list_head * | |||
1930 | if (send_barrier) | 1926 | if (send_barrier) |
1931 | maybe_send_barrier(connection, | 1927 | maybe_send_barrier(connection, |
1932 | connection->send.current_epoch_nr + 1); | 1928 | connection->send.current_epoch_nr + 1); |
1929 | |||
1930 | if (test_bit(CONN_RS_PROGRESS, &connection->flags)) | ||
1931 | break; | ||
1932 | |||
1933 | /* drbd_send() may have called flush_signals() */ | 1933 | /* drbd_send() may have called flush_signals() */ |
1934 | if (get_t_state(&connection->worker) != RUNNING) | 1934 | if (get_t_state(&connection->worker) != RUNNING) |
1935 | break; | 1935 | break; |
1936 | |||
1936 | schedule(); | 1937 | schedule(); |
1937 | /* may be woken up for other things but new work, too, | 1938 | /* may be woken up for other things but new work, too, |
1938 | * e.g. if the current epoch got closed. | 1939 | * e.g. if the current epoch got closed. |
1939 | * In which case we send the barrier above. */ | 1940 | * In which case we send the barrier above. */ |
1940 | |||
1941 | try_update_all_on_disk_bitmaps(connection); | ||
1942 | } | 1941 | } |
1943 | finish_wait(&connection->sender_work.q_wait, &wait); | 1942 | finish_wait(&connection->sender_work.q_wait, &wait); |
1944 | 1943 | ||
@@ -1973,6 +1972,9 @@ int drbd_worker(struct drbd_thread *thi) | |||
1973 | if (list_empty(&work_list)) | 1972 | if (list_empty(&work_list)) |
1974 | wait_for_work(connection, &work_list); | 1973 | wait_for_work(connection, &work_list); |
1975 | 1974 | ||
1975 | if (test_and_clear_bit(CONN_RS_PROGRESS, &connection->flags)) | ||
1976 | try_update_all_on_disk_bitmaps(connection); | ||
1977 | |||
1976 | if (signal_pending(current)) { | 1978 | if (signal_pending(current)) { |
1977 | flush_signals(current); | 1979 | flush_signals(current); |
1978 | if (get_t_state(thi) == RUNNING) { | 1980 | if (get_t_state(thi) == RUNNING) { |