aboutsummaryrefslogtreecommitdiffstats
path: root/drivers/block/drbd/drbd_receiver.c
diff options
context:
space:
mode:
Diffstat (limited to 'drivers/block/drbd/drbd_receiver.c')
-rw-r--r--drivers/block/drbd/drbd_receiver.c218
1 files changed, 33 insertions, 185 deletions
diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c
index efd6169acf2f..89d8a7cc4054 100644
--- a/drivers/block/drbd/drbd_receiver.c
+++ b/drivers/block/drbd/drbd_receiver.c
@@ -36,7 +36,6 @@
36#include <linux/memcontrol.h> 36#include <linux/memcontrol.h>
37#include <linux/mm_inline.h> 37#include <linux/mm_inline.h>
38#include <linux/slab.h> 38#include <linux/slab.h>
39#include <linux/smp_lock.h>
40#include <linux/pkt_sched.h> 39#include <linux/pkt_sched.h>
41#define __KERNEL_SYSCALLS__ 40#define __KERNEL_SYSCALLS__
42#include <linux/unistd.h> 41#include <linux/unistd.h>
@@ -49,11 +48,6 @@
49 48
50#include "drbd_vli.h" 49#include "drbd_vli.h"
51 50
52struct flush_work {
53 struct drbd_work w;
54 struct drbd_epoch *epoch;
55};
56
57enum finish_epoch { 51enum finish_epoch {
58 FE_STILL_LIVE, 52 FE_STILL_LIVE,
59 FE_DESTROYED, 53 FE_DESTROYED,
@@ -66,16 +60,6 @@ static int drbd_do_auth(struct drbd_conf *mdev);
66static enum finish_epoch drbd_may_finish_epoch(struct drbd_conf *, struct drbd_epoch *, enum epoch_event); 60static enum finish_epoch drbd_may_finish_epoch(struct drbd_conf *, struct drbd_epoch *, enum epoch_event);
67static int e_end_block(struct drbd_conf *, struct drbd_work *, int); 61static int e_end_block(struct drbd_conf *, struct drbd_work *, int);
68 62
69static struct drbd_epoch *previous_epoch(struct drbd_conf *mdev, struct drbd_epoch *epoch)
70{
71 struct drbd_epoch *prev;
72 spin_lock(&mdev->epoch_lock);
73 prev = list_entry(epoch->list.prev, struct drbd_epoch, list);
74 if (prev == epoch || prev == mdev->current_epoch)
75 prev = NULL;
76 spin_unlock(&mdev->epoch_lock);
77 return prev;
78}
79 63
80#define GFP_TRY (__GFP_HIGHMEM | __GFP_NOWARN) 64#define GFP_TRY (__GFP_HIGHMEM | __GFP_NOWARN)
81 65
@@ -981,7 +965,7 @@ static int drbd_recv_header(struct drbd_conf *mdev, enum drbd_packets *cmd, unsi
981 return TRUE; 965 return TRUE;
982} 966}
983 967
984static enum finish_epoch drbd_flush_after_epoch(struct drbd_conf *mdev, struct drbd_epoch *epoch) 968static void drbd_flush(struct drbd_conf *mdev)
985{ 969{
986 int rv; 970 int rv;
987 971
@@ -997,24 +981,6 @@ static enum finish_epoch drbd_flush_after_epoch(struct drbd_conf *mdev, struct d
997 } 981 }
998 put_ldev(mdev); 982 put_ldev(mdev);
999 } 983 }
1000
1001 return drbd_may_finish_epoch(mdev, epoch, EV_BARRIER_DONE);
1002}
1003
1004static int w_flush(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
1005{
1006 struct flush_work *fw = (struct flush_work *)w;
1007 struct drbd_epoch *epoch = fw->epoch;
1008
1009 kfree(w);
1010
1011 if (!test_and_set_bit(DE_BARRIER_IN_NEXT_EPOCH_ISSUED, &epoch->flags))
1012 drbd_flush_after_epoch(mdev, epoch);
1013
1014 drbd_may_finish_epoch(mdev, epoch, EV_PUT |
1015 (mdev->state.conn < C_CONNECTED ? EV_CLEANUP : 0));
1016
1017 return 1;
1018} 984}
1019 985
1020/** 986/**
@@ -1027,15 +993,13 @@ static enum finish_epoch drbd_may_finish_epoch(struct drbd_conf *mdev,
1027 struct drbd_epoch *epoch, 993 struct drbd_epoch *epoch,
1028 enum epoch_event ev) 994 enum epoch_event ev)
1029{ 995{
1030 int finish, epoch_size; 996 int epoch_size;
1031 struct drbd_epoch *next_epoch; 997 struct drbd_epoch *next_epoch;
1032 int schedule_flush = 0;
1033 enum finish_epoch rv = FE_STILL_LIVE; 998 enum finish_epoch rv = FE_STILL_LIVE;
1034 999
1035 spin_lock(&mdev->epoch_lock); 1000 spin_lock(&mdev->epoch_lock);
1036 do { 1001 do {
1037 next_epoch = NULL; 1002 next_epoch = NULL;
1038 finish = 0;
1039 1003
1040 epoch_size = atomic_read(&epoch->epoch_size); 1004 epoch_size = atomic_read(&epoch->epoch_size);
1041 1005
@@ -1045,16 +1009,6 @@ static enum finish_epoch drbd_may_finish_epoch(struct drbd_conf *mdev,
1045 break; 1009 break;
1046 case EV_GOT_BARRIER_NR: 1010 case EV_GOT_BARRIER_NR:
1047 set_bit(DE_HAVE_BARRIER_NUMBER, &epoch->flags); 1011 set_bit(DE_HAVE_BARRIER_NUMBER, &epoch->flags);
1048
1049 /* Special case: If we just switched from WO_bio_barrier to
1050 WO_bdev_flush we should not finish the current epoch */
1051 if (test_bit(DE_CONTAINS_A_BARRIER, &epoch->flags) && epoch_size == 1 &&
1052 mdev->write_ordering != WO_bio_barrier &&
1053 epoch == mdev->current_epoch)
1054 clear_bit(DE_CONTAINS_A_BARRIER, &epoch->flags);
1055 break;
1056 case EV_BARRIER_DONE:
1057 set_bit(DE_BARRIER_IN_NEXT_EPOCH_DONE, &epoch->flags);
1058 break; 1012 break;
1059 case EV_BECAME_LAST: 1013 case EV_BECAME_LAST:
1060 /* nothing to do*/ 1014 /* nothing to do*/
@@ -1063,23 +1017,7 @@ static enum finish_epoch drbd_may_finish_epoch(struct drbd_conf *mdev,
1063 1017
1064 if (epoch_size != 0 && 1018 if (epoch_size != 0 &&
1065 atomic_read(&epoch->active) == 0 && 1019 atomic_read(&epoch->active) == 0 &&
1066 test_bit(DE_HAVE_BARRIER_NUMBER, &epoch->flags) && 1020 test_bit(DE_HAVE_BARRIER_NUMBER, &epoch->flags)) {
1067 epoch->list.prev == &mdev->current_epoch->list &&
1068 !test_bit(DE_IS_FINISHING, &epoch->flags)) {
1069 /* Nearly all conditions are met to finish that epoch... */
1070 if (test_bit(DE_BARRIER_IN_NEXT_EPOCH_DONE, &epoch->flags) ||
1071 mdev->write_ordering == WO_none ||
1072 (epoch_size == 1 && test_bit(DE_CONTAINS_A_BARRIER, &epoch->flags)) ||
1073 ev & EV_CLEANUP) {
1074 finish = 1;
1075 set_bit(DE_IS_FINISHING, &epoch->flags);
1076 } else if (!test_bit(DE_BARRIER_IN_NEXT_EPOCH_ISSUED, &epoch->flags) &&
1077 mdev->write_ordering == WO_bio_barrier) {
1078 atomic_inc(&epoch->active);
1079 schedule_flush = 1;
1080 }
1081 }
1082 if (finish) {
1083 if (!(ev & EV_CLEANUP)) { 1021 if (!(ev & EV_CLEANUP)) {
1084 spin_unlock(&mdev->epoch_lock); 1022 spin_unlock(&mdev->epoch_lock);
1085 drbd_send_b_ack(mdev, epoch->barrier_nr, epoch_size); 1023 drbd_send_b_ack(mdev, epoch->barrier_nr, epoch_size);
@@ -1102,6 +1040,7 @@ static enum finish_epoch drbd_may_finish_epoch(struct drbd_conf *mdev,
1102 /* atomic_set(&epoch->active, 0); is already zero */ 1040 /* atomic_set(&epoch->active, 0); is already zero */
1103 if (rv == FE_STILL_LIVE) 1041 if (rv == FE_STILL_LIVE)
1104 rv = FE_RECYCLED; 1042 rv = FE_RECYCLED;
1043 wake_up(&mdev->ee_wait);
1105 } 1044 }
1106 } 1045 }
1107 1046
@@ -1113,22 +1052,6 @@ static enum finish_epoch drbd_may_finish_epoch(struct drbd_conf *mdev,
1113 1052
1114 spin_unlock(&mdev->epoch_lock); 1053 spin_unlock(&mdev->epoch_lock);
1115 1054
1116 if (schedule_flush) {
1117 struct flush_work *fw;
1118 fw = kmalloc(sizeof(*fw), GFP_ATOMIC);
1119 if (fw) {
1120 fw->w.cb = w_flush;
1121 fw->epoch = epoch;
1122 drbd_queue_work(&mdev->data.work, &fw->w);
1123 } else {
1124 dev_warn(DEV, "Could not kmalloc a flush_work obj\n");
1125 set_bit(DE_BARRIER_IN_NEXT_EPOCH_ISSUED, &epoch->flags);
1126 /* That is not a recursion, only one level */
1127 drbd_may_finish_epoch(mdev, epoch, EV_BARRIER_DONE);
1128 drbd_may_finish_epoch(mdev, epoch, EV_PUT);
1129 }
1130 }
1131
1132 return rv; 1055 return rv;
1133} 1056}
1134 1057
@@ -1144,19 +1067,16 @@ void drbd_bump_write_ordering(struct drbd_conf *mdev, enum write_ordering_e wo)
1144 [WO_none] = "none", 1067 [WO_none] = "none",
1145 [WO_drain_io] = "drain", 1068 [WO_drain_io] = "drain",
1146 [WO_bdev_flush] = "flush", 1069 [WO_bdev_flush] = "flush",
1147 [WO_bio_barrier] = "barrier",
1148 }; 1070 };
1149 1071
1150 pwo = mdev->write_ordering; 1072 pwo = mdev->write_ordering;
1151 wo = min(pwo, wo); 1073 wo = min(pwo, wo);
1152 if (wo == WO_bio_barrier && mdev->ldev->dc.no_disk_barrier)
1153 wo = WO_bdev_flush;
1154 if (wo == WO_bdev_flush && mdev->ldev->dc.no_disk_flush) 1074 if (wo == WO_bdev_flush && mdev->ldev->dc.no_disk_flush)
1155 wo = WO_drain_io; 1075 wo = WO_drain_io;
1156 if (wo == WO_drain_io && mdev->ldev->dc.no_disk_drain) 1076 if (wo == WO_drain_io && mdev->ldev->dc.no_disk_drain)
1157 wo = WO_none; 1077 wo = WO_none;
1158 mdev->write_ordering = wo; 1078 mdev->write_ordering = wo;
1159 if (pwo != mdev->write_ordering || wo == WO_bio_barrier) 1079 if (pwo != mdev->write_ordering || wo == WO_bdev_flush)
1160 dev_info(DEV, "Method to ensure write ordering: %s\n", write_ordering_str[mdev->write_ordering]); 1080 dev_info(DEV, "Method to ensure write ordering: %s\n", write_ordering_str[mdev->write_ordering]);
1161} 1081}
1162 1082
@@ -1192,7 +1112,7 @@ next_bio:
1192 bio->bi_sector = sector; 1112 bio->bi_sector = sector;
1193 bio->bi_bdev = mdev->ldev->backing_bdev; 1113 bio->bi_bdev = mdev->ldev->backing_bdev;
1194 /* we special case some flags in the multi-bio case, see below 1114 /* we special case some flags in the multi-bio case, see below
1195 * (REQ_UNPLUG, REQ_HARDBARRIER) */ 1115 * (REQ_UNPLUG) */
1196 bio->bi_rw = rw; 1116 bio->bi_rw = rw;
1197 bio->bi_private = e; 1117 bio->bi_private = e;
1198 bio->bi_end_io = drbd_endio_sec; 1118 bio->bi_end_io = drbd_endio_sec;
@@ -1226,11 +1146,6 @@ next_bio:
1226 bio->bi_rw &= ~REQ_UNPLUG; 1146 bio->bi_rw &= ~REQ_UNPLUG;
1227 1147
1228 drbd_generic_make_request(mdev, fault_type, bio); 1148 drbd_generic_make_request(mdev, fault_type, bio);
1229
1230 /* strip off REQ_HARDBARRIER,
1231 * unless it is the first or last bio */
1232 if (bios && bios->bi_next)
1233 bios->bi_rw &= ~REQ_HARDBARRIER;
1234 } while (bios); 1149 } while (bios);
1235 maybe_kick_lo(mdev); 1150 maybe_kick_lo(mdev);
1236 return 0; 1151 return 0;
@@ -1244,45 +1159,9 @@ fail:
1244 return -ENOMEM; 1159 return -ENOMEM;
1245} 1160}
1246 1161
1247/**
1248 * w_e_reissue() - Worker callback; Resubmit a bio, without REQ_HARDBARRIER set
1249 * @mdev: DRBD device.
1250 * @w: work object.
1251 * @cancel: The connection will be closed anyways (unused in this callback)
1252 */
1253int w_e_reissue(struct drbd_conf *mdev, struct drbd_work *w, int cancel) __releases(local)
1254{
1255 struct drbd_epoch_entry *e = (struct drbd_epoch_entry *)w;
1256 /* We leave DE_CONTAINS_A_BARRIER and EE_IS_BARRIER in place,
1257 (and DE_BARRIER_IN_NEXT_EPOCH_ISSUED in the previous Epoch)
1258 so that we can finish that epoch in drbd_may_finish_epoch().
1259 That is necessary if we already have a long chain of Epochs, before
1260 we realize that REQ_HARDBARRIER is actually not supported */
1261
1262 /* As long as the -ENOTSUPP on the barrier is reported immediately
1263 that will never trigger. If it is reported late, we will just
1264 print that warning and continue correctly for all future requests
1265 with WO_bdev_flush */
1266 if (previous_epoch(mdev, e->epoch))
1267 dev_warn(DEV, "Write ordering was not enforced (one time event)\n");
1268
1269 /* we still have a local reference,
1270 * get_ldev was done in receive_Data. */
1271
1272 e->w.cb = e_end_block;
1273 if (drbd_submit_ee(mdev, e, WRITE, DRBD_FAULT_DT_WR) != 0) {
1274 /* drbd_submit_ee fails for one reason only:
1275 * if was not able to allocate sufficient bios.
1276 * requeue, try again later. */
1277 e->w.cb = w_e_reissue;
1278 drbd_queue_work(&mdev->data.work, &e->w);
1279 }
1280 return 1;
1281}
1282
1283static int receive_Barrier(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size) 1162static int receive_Barrier(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size)
1284{ 1163{
1285 int rv, issue_flush; 1164 int rv;
1286 struct p_barrier *p = &mdev->data.rbuf.barrier; 1165 struct p_barrier *p = &mdev->data.rbuf.barrier;
1287 struct drbd_epoch *epoch; 1166 struct drbd_epoch *epoch;
1288 1167
@@ -1300,44 +1179,40 @@ static int receive_Barrier(struct drbd_conf *mdev, enum drbd_packets cmd, unsign
1300 * Therefore we must send the barrier_ack after the barrier request was 1179 * Therefore we must send the barrier_ack after the barrier request was
1301 * completed. */ 1180 * completed. */
1302 switch (mdev->write_ordering) { 1181 switch (mdev->write_ordering) {
1303 case WO_bio_barrier:
1304 case WO_none: 1182 case WO_none:
1305 if (rv == FE_RECYCLED) 1183 if (rv == FE_RECYCLED)
1306 return TRUE; 1184 return TRUE;
1307 break; 1185
1186 /* receiver context, in the writeout path of the other node.
1187 * avoid potential distributed deadlock */
1188 epoch = kmalloc(sizeof(struct drbd_epoch), GFP_NOIO);
1189 if (epoch)
1190 break;
1191 else
1192 dev_warn(DEV, "Allocation of an epoch failed, slowing down\n");
1193 /* Fall through */
1308 1194
1309 case WO_bdev_flush: 1195 case WO_bdev_flush:
1310 case WO_drain_io: 1196 case WO_drain_io:
1311 if (rv == FE_STILL_LIVE) {
1312 set_bit(DE_BARRIER_IN_NEXT_EPOCH_ISSUED, &mdev->current_epoch->flags);
1313 drbd_wait_ee_list_empty(mdev, &mdev->active_ee);
1314 rv = drbd_flush_after_epoch(mdev, mdev->current_epoch);
1315 }
1316 if (rv == FE_RECYCLED)
1317 return TRUE;
1318
1319 /* The asender will send all the ACKs and barrier ACKs out, since
1320 all EEs moved from the active_ee to the done_ee. We need to
1321 provide a new epoch object for the EEs that come in soon */
1322 break;
1323 }
1324
1325 /* receiver context, in the writeout path of the other node.
1326 * avoid potential distributed deadlock */
1327 epoch = kmalloc(sizeof(struct drbd_epoch), GFP_NOIO);
1328 if (!epoch) {
1329 dev_warn(DEV, "Allocation of an epoch failed, slowing down\n");
1330 issue_flush = !test_and_set_bit(DE_BARRIER_IN_NEXT_EPOCH_ISSUED, &mdev->current_epoch->flags);
1331 drbd_wait_ee_list_empty(mdev, &mdev->active_ee); 1197 drbd_wait_ee_list_empty(mdev, &mdev->active_ee);
1332 if (issue_flush) { 1198 drbd_flush(mdev);
1333 rv = drbd_flush_after_epoch(mdev, mdev->current_epoch); 1199
1334 if (rv == FE_RECYCLED) 1200 if (atomic_read(&mdev->current_epoch->epoch_size)) {
1335 return TRUE; 1201 epoch = kmalloc(sizeof(struct drbd_epoch), GFP_NOIO);
1202 if (epoch)
1203 break;
1336 } 1204 }
1337 1205
1338 drbd_wait_ee_list_empty(mdev, &mdev->done_ee); 1206 epoch = mdev->current_epoch;
1207 wait_event(mdev->ee_wait, atomic_read(&epoch->epoch_size) == 0);
1208
1209 D_ASSERT(atomic_read(&epoch->active) == 0);
1210 D_ASSERT(epoch->flags == 0);
1339 1211
1340 return TRUE; 1212 return TRUE;
1213 default:
1214 dev_err(DEV, "Strangeness in mdev->write_ordering %d\n", mdev->write_ordering);
1215 return FALSE;
1341 } 1216 }
1342 1217
1343 epoch->flags = 0; 1218 epoch->flags = 0;
@@ -1652,15 +1527,8 @@ static int e_end_block(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
1652{ 1527{
1653 struct drbd_epoch_entry *e = (struct drbd_epoch_entry *)w; 1528 struct drbd_epoch_entry *e = (struct drbd_epoch_entry *)w;
1654 sector_t sector = e->sector; 1529 sector_t sector = e->sector;
1655 struct drbd_epoch *epoch;
1656 int ok = 1, pcmd; 1530 int ok = 1, pcmd;
1657 1531
1658 if (e->flags & EE_IS_BARRIER) {
1659 epoch = previous_epoch(mdev, e->epoch);
1660 if (epoch)
1661 drbd_may_finish_epoch(mdev, epoch, EV_BARRIER_DONE + (cancel ? EV_CLEANUP : 0));
1662 }
1663
1664 if (mdev->net_conf->wire_protocol == DRBD_PROT_C) { 1532 if (mdev->net_conf->wire_protocol == DRBD_PROT_C) {
1665 if (likely((e->flags & EE_WAS_ERROR) == 0)) { 1533 if (likely((e->flags & EE_WAS_ERROR) == 0)) {
1666 pcmd = (mdev->state.conn >= C_SYNC_SOURCE && 1534 pcmd = (mdev->state.conn >= C_SYNC_SOURCE &&
@@ -1817,27 +1685,6 @@ static int receive_Data(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned
1817 e->epoch = mdev->current_epoch; 1685 e->epoch = mdev->current_epoch;
1818 atomic_inc(&e->epoch->epoch_size); 1686 atomic_inc(&e->epoch->epoch_size);
1819 atomic_inc(&e->epoch->active); 1687 atomic_inc(&e->epoch->active);
1820
1821 if (mdev->write_ordering == WO_bio_barrier && atomic_read(&e->epoch->epoch_size) == 1) {
1822 struct drbd_epoch *epoch;
1823 /* Issue a barrier if we start a new epoch, and the previous epoch
1824 was not a epoch containing a single request which already was
1825 a Barrier. */
1826 epoch = list_entry(e->epoch->list.prev, struct drbd_epoch, list);
1827 if (epoch == e->epoch) {
1828 set_bit(DE_CONTAINS_A_BARRIER, &e->epoch->flags);
1829 rw |= REQ_HARDBARRIER;
1830 e->flags |= EE_IS_BARRIER;
1831 } else {
1832 if (atomic_read(&epoch->epoch_size) > 1 ||
1833 !test_bit(DE_CONTAINS_A_BARRIER, &epoch->flags)) {
1834 set_bit(DE_BARRIER_IN_NEXT_EPOCH_ISSUED, &epoch->flags);
1835 set_bit(DE_CONTAINS_A_BARRIER, &e->epoch->flags);
1836 rw |= REQ_HARDBARRIER;
1837 e->flags |= EE_IS_BARRIER;
1838 }
1839 }
1840 }
1841 spin_unlock(&mdev->epoch_lock); 1688 spin_unlock(&mdev->epoch_lock);
1842 1689
1843 dp_flags = be32_to_cpu(p->dp_flags); 1690 dp_flags = be32_to_cpu(p->dp_flags);
@@ -1995,10 +1842,11 @@ static int receive_Data(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned
1995 break; 1842 break;
1996 } 1843 }
1997 1844
1998 if (mdev->state.pdsk == D_DISKLESS) { 1845 if (mdev->state.pdsk < D_INCONSISTENT) {
1999 /* In case we have the only disk of the cluster, */ 1846 /* In case we have the only disk of the cluster, */
2000 drbd_set_out_of_sync(mdev, e->sector, e->size); 1847 drbd_set_out_of_sync(mdev, e->sector, e->size);
2001 e->flags |= EE_CALL_AL_COMPLETE_IO; 1848 e->flags |= EE_CALL_AL_COMPLETE_IO;
1849 e->flags &= ~EE_MAY_SET_IN_SYNC;
2002 drbd_al_begin_io(mdev, e->sector); 1850 drbd_al_begin_io(mdev, e->sector);
2003 } 1851 }
2004 1852
@@ -3362,7 +3210,7 @@ static int receive_state(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned
3362 if (ns.conn == C_MASK) { 3210 if (ns.conn == C_MASK) {
3363 ns.conn = C_CONNECTED; 3211 ns.conn = C_CONNECTED;
3364 if (mdev->state.disk == D_NEGOTIATING) { 3212 if (mdev->state.disk == D_NEGOTIATING) {
3365 drbd_force_state(mdev, NS(disk, D_DISKLESS)); 3213 drbd_force_state(mdev, NS(disk, D_FAILED));
3366 } else if (peer_state.disk == D_NEGOTIATING) { 3214 } else if (peer_state.disk == D_NEGOTIATING) {
3367 dev_err(DEV, "Disk attach process on the peer node was aborted.\n"); 3215 dev_err(DEV, "Disk attach process on the peer node was aborted.\n");
3368 peer_state.disk = D_DISKLESS; 3216 peer_state.disk = D_DISKLESS;