author     Lars Ellenberg <lars.ellenberg@linbit.com>    2012-09-22 06:26:57 -0400
committer  Jens Axboe <axboe@kernel.dk>                  2012-10-30 03:39:18 -0400
commit     a2a3c74f243d5d1793f89ccdceaa6918851f7fce (patch)
tree       3d828d28daaabf6540e7a157454683a1a678524a /drivers
parent     06f10adbdb027b225fd51584a218fa8344169514 (diff)
drbd: always write bitmap on detach
If we detach due to local read-error (which sets a bit in the bitmap),
stay Primary, and then re-attach (which re-reads the bitmap from disk),
we potentially lost the "out-of-sync" (or, "bad block") information in
the bitmap.

Always (try to) write out the changed bitmap pages before going diskless.

That way, we don't lose the bit for the bad block, the next resync will
fetch it from the peer, and rewrite it locally, which may result in block
reallocation in some lower layer (or the hardware), and thereby "heal"
the bad blocks.

If the bitmap writeout errors out as well, we will (again: try to) mark
the "we need a full sync" bit in our super block, if it was a READ error;
writes are covered by the activity log already.

If that superblock does not make it to disk either, we are sorry.

Maybe we just lost an entire disk or controller (or iSCSI connection),
and there actually are no bad blocks at all, so we don't need to re-fetch
from the peer, there is no "auto-healing" necessary.

Signed-off-by: Philipp Reisner <philipp.reisner@linbit.com>
Signed-off-by: Lars Ellenberg <lars.ellenberg@linbit.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
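In condensed form, the behavior described above amounts to the sketch below. It only summarizes the hunks that follow; the helpers note_io_error() and go_diskless_sketch() are illustrative names invented here, while the flags and functions they call are the ones used in the patch. Error handling, locking, and the surrounding state machine are omitted.

/* Sketch only: condensed from the hunks below, not a drop-in replacement. */

/* On a local IO error, remember whether it was a READ error.  WRITE errors
 * are already covered by the activity log; READ errors may set bitmap bits
 * outside of it. */
static inline void note_io_error(struct drbd_conf *mdev,
                                 enum drbd_force_detach_flags df)
{
        drbd_set_flag(mdev, WAS_IO_ERROR);
        if (df == DRBD_READ_ERROR)
                drbd_set_flag(mdev, WAS_READ_ERROR);
        if (df == DRBD_FORCE_DETACH)
                drbd_set_flag(mdev, FORCE_DETACH);
}

/* Before finally going diskless, try to write out the changed bitmap pages.
 * If even that fails after a READ error, request a full sync via the
 * super block, so the affected blocks are re-fetched from the peer later. */
static void go_diskless_sketch(struct drbd_conf *mdev)
{
        if (mdev->bitmap &&
            drbd_bitmap_io_from_worker(mdev, drbd_bm_write,
                                       "detach", BM_LOCKED_MASK) &&
            drbd_test_flag(mdev, WAS_READ_ERROR)) {
                drbd_md_set_flag(mdev, MDF_FULL_SYNC);
                drbd_md_sync(mdev);
        }
        drbd_force_state(mdev, NS(disk, D_DISKLESS));
}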
Diffstat (limited to 'drivers')
-rw-r--r--   drivers/block/drbd/drbd_int.h    | 36
-rw-r--r--   drivers/block/drbd/drbd_main.c   | 41
-rw-r--r--   drivers/block/drbd/drbd_nl.c     |  2
-rw-r--r--   drivers/block/drbd/drbd_req.c    |  4
-rw-r--r--   drivers/block/drbd/drbd_worker.c |  4
5 files changed, 68 insertions(+), 19 deletions(-)
diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h
index 125fe1481ca2..277c69c9465b 100644
--- a/drivers/block/drbd/drbd_int.h
+++ b/drivers/block/drbd/drbd_int.h
@@ -831,7 +831,8 @@ enum drbd_flag {
                                    once no more io in flight, start bitmap io */
         BITMAP_IO_QUEUED,      /* Started bitmap IO */
         GO_DISKLESS,           /* Disk is being detached, on io-error or admin request. */
-        WAS_IO_ERROR,          /* Local disk failed returned IO error */
+        WAS_IO_ERROR,          /* Local disk failed, returned IO error */
+        WAS_READ_ERROR,        /* Local disk READ failed (set additionally to the above) */
         FORCE_DETACH,          /* Force-detach from local disk, aborting any pending local IO */
         RESYNC_AFTER_NEG,      /* Resync after online grow after the attach&negotiate finished. */
         NET_CONGESTED,         /* The data socket is congested */
@@ -1879,30 +1880,53 @@ static inline int drbd_request_state(struct drbd_conf *mdev,
 }
 
 enum drbd_force_detach_flags {
-        DRBD_IO_ERROR,
+        DRBD_READ_ERROR,
+        DRBD_WRITE_ERROR,
         DRBD_META_IO_ERROR,
         DRBD_FORCE_DETACH,
 };
 
 #define __drbd_chk_io_error(m,f) __drbd_chk_io_error_(m,f, __func__)
 static inline void __drbd_chk_io_error_(struct drbd_conf *mdev,
-        enum drbd_force_detach_flags forcedetach,
+        enum drbd_force_detach_flags df,
         const char *where)
 {
         switch (mdev->ldev->dc.on_io_error) {
         case EP_PASS_ON:
-                if (forcedetach == DRBD_IO_ERROR) {
+                if (df == DRBD_READ_ERROR || df == DRBD_WRITE_ERROR) {
                         if (__ratelimit(&drbd_ratelimit_state))
                                 dev_err(DEV, "Local IO failed in %s.\n", where);
                         if (mdev->state.disk > D_INCONSISTENT)
                                 _drbd_set_state(_NS(mdev, disk, D_INCONSISTENT), CS_HARD, NULL);
                         break;
                 }
-                /* NOTE fall through to detach case if forcedetach set */
+                /* NOTE fall through for DRBD_META_IO_ERROR or DRBD_FORCE_DETACH */
         case EP_DETACH:
         case EP_CALL_HELPER:
+                /* Remember whether we saw a READ or WRITE error.
+                 *
+                 * Recovery of the affected area for WRITE failure is covered
+                 * by the activity log.
+                 * READ errors may fall outside that area though. Certain READ
+                 * errors can be "healed" by writing good data to the affected
+                 * blocks, which triggers block re-allocation in lower layers.
+                 *
+                 * If we can not write the bitmap after a READ error,
+                 * we may need to trigger a full sync (see w_go_diskless()).
+                 *
+                 * Force-detach is not really an IO error, but rather a
+                 * desperate measure to try to deal with a completely
+                 * unresponsive lower level IO stack.
+                 * Still it should be treated as a WRITE error.
+                 *
+                 * Meta IO error is always WRITE error:
+                 * we read meta data only once during attach,
+                 * which will fail in case of errors.
+                 */
                 drbd_set_flag(mdev, WAS_IO_ERROR);
-                if (forcedetach == DRBD_FORCE_DETACH)
+                if (df == DRBD_READ_ERROR)
+                        drbd_set_flag(mdev, WAS_READ_ERROR);
+                if (df == DRBD_FORCE_DETACH)
                         drbd_set_flag(mdev, FORCE_DETACH);
                 if (mdev->state.disk > D_FAILED) {
                         _drbd_set_state(_NS(mdev, disk, D_FAILED), CS_HARD, NULL);
diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c
index d8ba5c42670f..9b833e0fb440 100644
--- a/drivers/block/drbd/drbd_main.c
+++ b/drivers/block/drbd/drbd_main.c
@@ -1617,17 +1617,20 @@ static void after_state_ch(struct drbd_conf *mdev, union drbd_state os,
         /* first half of local IO error, failure to attach,
          * or administrative detach */
         if (os.disk != D_FAILED && ns.disk == D_FAILED) {
-                enum drbd_io_error_p eh = EP_PASS_ON;
-                int was_io_error = 0;
                 /* corresponding get_ldev was in __drbd_set_state, to serialize
                  * our cleanup here with the transition to D_DISKLESS.
-                 * But is is still not save to dreference ldev here, since
-                 * we might come from an failed Attach before ldev was set. */
+                 * But it is still not safe to dreference ldev here, we may end
+                 * up here from a failed attach, before ldev was even set. */
                 if (mdev->ldev) {
-                        eh = mdev->ldev->dc.on_io_error;
-                        was_io_error = drbd_test_and_clear_flag(mdev, WAS_IO_ERROR);
-
-                        if (was_io_error && eh == EP_CALL_HELPER)
+                        enum drbd_io_error_p eh = mdev->ldev->dc.on_io_error;
+
+                        /* In some setups, this handler triggers a suicide,
+                         * basically mapping IO error to node failure, to
+                         * reduce the number of different failure scenarios.
+                         *
+                         * This handler intentionally runs before we abort IO,
+                         * notify the peer, or try to update our meta data. */
+                        if (eh == EP_CALL_HELPER && drbd_test_flag(mdev, WAS_IO_ERROR))
                                 drbd_khelper(mdev, "local-io-error");
 
                         /* Immediately allow completion of all application IO,
@@ -1643,7 +1646,7 @@ static void after_state_ch(struct drbd_conf *mdev, union drbd_state os,
                          * So aborting local requests may cause crashes,
                          * or even worse, silent data corruption.
                          */
-                        if (drbd_test_and_clear_flag(mdev, FORCE_DETACH))
+                        if (drbd_test_flag(mdev, FORCE_DETACH))
                                 tl_abort_disk_io(mdev);
 
                         /* current state still has to be D_FAILED,
@@ -4220,6 +4223,26 @@ static int w_go_diskless(struct drbd_conf *mdev, struct drbd_work *w, int unused
          * inc/dec it frequently. Once we are D_DISKLESS, no one will touch
          * the protected members anymore, though, so once put_ldev reaches zero
          * again, it will be safe to free them. */
+
+        /* Try to write changed bitmap pages, read errors may have just
+         * set some bits outside the area covered by the activity log.
+         *
+         * If we have an IO error during the bitmap writeout,
+         * we will want a full sync next time, just in case.
+         * (Do we want a specific meta data flag for this?)
+         *
+         * If that does not make it to stable storage either,
+         * we cannot do anything about that anymore. */
+        if (mdev->bitmap) {
+                if (drbd_bitmap_io_from_worker(mdev, drbd_bm_write,
+                                        "detach", BM_LOCKED_MASK)) {
+                        if (drbd_test_flag(mdev, WAS_READ_ERROR)) {
+                                drbd_md_set_flag(mdev, MDF_FULL_SYNC);
+                                drbd_md_sync(mdev);
+                        }
+                }
+        }
+
         drbd_force_state(mdev, NS(disk, D_DISKLESS));
         return 1;
 }
diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c
index 42d172877aea..c8dda4e8dfce 100644
--- a/drivers/block/drbd/drbd_nl.c
+++ b/drivers/block/drbd/drbd_nl.c
@@ -959,6 +959,8 @@ static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp
 
         /* make sure there is no leftover from previous force-detach attempts */
         drbd_clear_flag(mdev, FORCE_DETACH);
+        drbd_clear_flag(mdev, WAS_IO_ERROR);
+        drbd_clear_flag(mdev, WAS_READ_ERROR);
 
         /* and no leftover from previously aborted resync or verify, either */
         mdev->rs_total = 0;
diff --git a/drivers/block/drbd/drbd_req.c b/drivers/block/drbd/drbd_req.c
index 9220d9f9d6cd..d9e5962a9a8c 100644
--- a/drivers/block/drbd/drbd_req.c
+++ b/drivers/block/drbd/drbd_req.c
@@ -455,7 +455,7 @@ int __req_mod(struct drbd_request *req, enum drbd_req_event what,
                 req->rq_state |= RQ_LOCAL_COMPLETED;
                 req->rq_state &= ~RQ_LOCAL_PENDING;
 
-                __drbd_chk_io_error(mdev, DRBD_IO_ERROR);
+                __drbd_chk_io_error(mdev, DRBD_WRITE_ERROR);
                 _req_may_be_done_not_susp(req, m);
                 break;
 
@@ -477,7 +477,7 @@ int __req_mod(struct drbd_request *req, enum drbd_req_event what,
                         break;
                 }
 
-                __drbd_chk_io_error(mdev, DRBD_IO_ERROR);
+                __drbd_chk_io_error(mdev, DRBD_READ_ERROR);
 
         goto_queue_for_net_read:
 
diff --git a/drivers/block/drbd/drbd_worker.c b/drivers/block/drbd/drbd_worker.c
index acb614ac9fe1..7cd32e73b016 100644
--- a/drivers/block/drbd/drbd_worker.c
+++ b/drivers/block/drbd/drbd_worker.c
@@ -111,7 +111,7 @@ void drbd_endio_read_sec_final(struct drbd_epoch_entry *e) __releases(local)
         if (list_empty(&mdev->read_ee))
                 wake_up(&mdev->ee_wait);
         if (test_bit(__EE_WAS_ERROR, &e->flags))
-                __drbd_chk_io_error(mdev, DRBD_IO_ERROR);
+                __drbd_chk_io_error(mdev, DRBD_READ_ERROR);
         spin_unlock_irqrestore(&mdev->req_lock, flags);
 
         drbd_queue_work(&mdev->data.work, &e->w);
@@ -154,7 +154,7 @@ static void drbd_endio_write_sec_final(struct drbd_epoch_entry *e) __releases(lo
                 : list_empty(&mdev->active_ee);
 
         if (test_bit(__EE_WAS_ERROR, &e->flags))
-                __drbd_chk_io_error(mdev, DRBD_WRITE_ERROR);
+                __drbd_chk_io_error(mdev, DRBD_WRITE_ERROR);
         spin_unlock_irqrestore(&mdev->req_lock, flags);
 
         if (is_syncer_req)