diff options
author | Mark Fasheh <mark.fasheh@oracle.com> | 2006-03-24 17:20:17 -0500 |
---|---|---|
committer | Mark Fasheh <mark.fasheh@oracle.com> | 2006-04-07 21:03:09 -0400 |
commit | a9e2ae39170d01937725e1fff2e606baaa71346c (patch) | |
tree | 95fa7ddef8c4111a9d4871ed514582da0fdbcc7e | |
parent | 2cd9888590c52ac7592e3607d0a3174ccd57ef86 (diff) |
ocfs2: Better I/O error handling in heartbeat
Propagate errors received in o2hb_bio_end_io() back to the heartbeat thread
so it can skip re-arming the timer.
Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>
-rw-r--r-- | fs/ocfs2/cluster/heartbeat.c | 40 |
1 files changed, 32 insertions, 8 deletions
diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c index bff0f0d0686..21f38accd03 100644 --- a/fs/ocfs2/cluster/heartbeat.c +++ b/fs/ocfs2/cluster/heartbeat.c | |||
@@ -153,6 +153,7 @@ struct o2hb_region { | |||
153 | struct o2hb_bio_wait_ctxt { | 153 | struct o2hb_bio_wait_ctxt { |
154 | atomic_t wc_num_reqs; | 154 | atomic_t wc_num_reqs; |
155 | struct completion wc_io_complete; | 155 | struct completion wc_io_complete; |
156 | int wc_error; | ||
156 | }; | 157 | }; |
157 | 158 | ||
158 | static void o2hb_write_timeout(void *arg) | 159 | static void o2hb_write_timeout(void *arg) |
@@ -186,6 +187,7 @@ static inline void o2hb_bio_wait_init(struct o2hb_bio_wait_ctxt *wc, | |||
186 | { | 187 | { |
187 | atomic_set(&wc->wc_num_reqs, num_ios); | 188 | atomic_set(&wc->wc_num_reqs, num_ios); |
188 | init_completion(&wc->wc_io_complete); | 189 | init_completion(&wc->wc_io_complete); |
190 | wc->wc_error = 0; | ||
189 | } | 191 | } |
190 | 192 | ||
191 | /* Used in error paths too */ | 193 | /* Used in error paths too */ |
@@ -218,8 +220,10 @@ static int o2hb_bio_end_io(struct bio *bio, | |||
218 | { | 220 | { |
219 | struct o2hb_bio_wait_ctxt *wc = bio->bi_private; | 221 | struct o2hb_bio_wait_ctxt *wc = bio->bi_private; |
220 | 222 | ||
221 | if (error) | 223 | if (error) { |
222 | mlog(ML_ERROR, "IO Error %d\n", error); | 224 | mlog(ML_ERROR, "IO Error %d\n", error); |
225 | wc->wc_error = error; | ||
226 | } | ||
223 | 227 | ||
224 | if (bio->bi_size) | 228 | if (bio->bi_size) |
225 | return 1; | 229 | return 1; |
@@ -390,6 +394,8 @@ static int o2hb_read_slots(struct o2hb_region *reg, | |||
390 | 394 | ||
391 | bail_and_wait: | 395 | bail_and_wait: |
392 | o2hb_wait_on_io(reg, &wc); | 396 | o2hb_wait_on_io(reg, &wc); |
397 | if (wc.wc_error && !status) | ||
398 | status = wc.wc_error; | ||
393 | 399 | ||
394 | if (bios) { | 400 | if (bios) { |
395 | for(i = 0; i < num_bios; i++) | 401 | for(i = 0; i < num_bios; i++) |
@@ -790,20 +796,24 @@ static int o2hb_highest_node(unsigned long *nodes, | |||
790 | return highest; | 796 | return highest; |
791 | } | 797 | } |
792 | 798 | ||
793 | static void o2hb_do_disk_heartbeat(struct o2hb_region *reg) | 799 | static int o2hb_do_disk_heartbeat(struct o2hb_region *reg) |
794 | { | 800 | { |
795 | int i, ret, highest_node, change = 0; | 801 | int i, ret, highest_node, change = 0; |
796 | unsigned long configured_nodes[BITS_TO_LONGS(O2NM_MAX_NODES)]; | 802 | unsigned long configured_nodes[BITS_TO_LONGS(O2NM_MAX_NODES)]; |
797 | struct bio *write_bio; | 803 | struct bio *write_bio; |
798 | struct o2hb_bio_wait_ctxt write_wc; | 804 | struct o2hb_bio_wait_ctxt write_wc; |
799 | 805 | ||
800 | if (o2nm_configured_node_map(configured_nodes, sizeof(configured_nodes))) | 806 | ret = o2nm_configured_node_map(configured_nodes, |
801 | return; | 807 | sizeof(configured_nodes)); |
808 | if (ret) { | ||
809 | mlog_errno(ret); | ||
810 | return ret; | ||
811 | } | ||
802 | 812 | ||
803 | highest_node = o2hb_highest_node(configured_nodes, O2NM_MAX_NODES); | 813 | highest_node = o2hb_highest_node(configured_nodes, O2NM_MAX_NODES); |
804 | if (highest_node >= O2NM_MAX_NODES) { | 814 | if (highest_node >= O2NM_MAX_NODES) { |
805 | mlog(ML_NOTICE, "ocfs2_heartbeat: no configured nodes found!\n"); | 815 | mlog(ML_NOTICE, "ocfs2_heartbeat: no configured nodes found!\n"); |
806 | return; | 816 | return -EINVAL; |
807 | } | 817 | } |
808 | 818 | ||
809 | /* No sense in reading the slots of nodes that don't exist | 819 | /* No sense in reading the slots of nodes that don't exist |
@@ -813,7 +823,7 @@ static void o2hb_do_disk_heartbeat(struct o2hb_region *reg) | |||
813 | ret = o2hb_read_slots(reg, highest_node + 1); | 823 | ret = o2hb_read_slots(reg, highest_node + 1); |
814 | if (ret < 0) { | 824 | if (ret < 0) { |
815 | mlog_errno(ret); | 825 | mlog_errno(ret); |
816 | return; | 826 | return ret; |
817 | } | 827 | } |
818 | 828 | ||
819 | /* With an up to date view of the slots, we can check that no | 829 | /* With an up to date view of the slots, we can check that no |
@@ -831,7 +841,7 @@ static void o2hb_do_disk_heartbeat(struct o2hb_region *reg) | |||
831 | ret = o2hb_issue_node_write(reg, &write_bio, &write_wc); | 841 | ret = o2hb_issue_node_write(reg, &write_bio, &write_wc); |
832 | if (ret < 0) { | 842 | if (ret < 0) { |
833 | mlog_errno(ret); | 843 | mlog_errno(ret); |
834 | return; | 844 | return ret; |
835 | } | 845 | } |
836 | 846 | ||
837 | i = -1; | 847 | i = -1; |
@@ -847,6 +857,15 @@ static void o2hb_do_disk_heartbeat(struct o2hb_region *reg) | |||
847 | */ | 857 | */ |
848 | o2hb_wait_on_io(reg, &write_wc); | 858 | o2hb_wait_on_io(reg, &write_wc); |
849 | bio_put(write_bio); | 859 | bio_put(write_bio); |
860 | if (write_wc.wc_error) { | ||
861 | /* Do not re-arm the write timeout on I/O error - we | ||
862 | * can't be sure that the new block ever made it to | ||
863 | * disk */ | ||
864 | mlog(ML_ERROR, "Write error %d on device \"%s\"\n", | ||
865 | write_wc.wc_error, reg->hr_dev_name); | ||
866 | return write_wc.wc_error; | ||
867 | } | ||
868 | |||
850 | o2hb_arm_write_timeout(reg); | 869 | o2hb_arm_write_timeout(reg); |
851 | 870 | ||
852 | /* let the person who launched us know when things are steady */ | 871 | /* let the person who launched us know when things are steady */ |
@@ -854,6 +873,8 @@ static void o2hb_do_disk_heartbeat(struct o2hb_region *reg) | |||
854 | if (atomic_dec_and_test(&reg->hr_steady_iterations)) | 873 | if (atomic_dec_and_test(&reg->hr_steady_iterations)) |
855 | wake_up(&o2hb_steady_queue); | 874 | wake_up(&o2hb_steady_queue); |
856 | } | 875 | } |
876 | |||
877 | return 0; | ||
857 | } | 878 | } |
858 | 879 | ||
859 | /* Subtract b from a, storing the result in a. a *must* have a larger | 880 | /* Subtract b from a, storing the result in a. a *must* have a larger |
@@ -913,7 +934,10 @@ static int o2hb_thread(void *data) | |||
913 | * likely to time itself out. */ | 934 | * likely to time itself out. */ |
914 | do_gettimeofday(&before_hb); | 935 | do_gettimeofday(&before_hb); |
915 | 936 | ||
916 | o2hb_do_disk_heartbeat(reg); | 937 | i = 0; |
938 | do { | ||
939 | ret = o2hb_do_disk_heartbeat(reg); | ||
940 | } while (ret && ++i < 2); | ||
917 | 941 | ||
918 | do_gettimeofday(&after_hb); | 942 | do_gettimeofday(&after_hb); |
919 | elapsed_msec = o2hb_elapsed_msecs(&before_hb, &after_hb); | 943 | elapsed_msec = o2hb_elapsed_msecs(&before_hb, &after_hb); |