path: root/drivers/infiniband
author    Jack Morgenstein <jackm@dev.mellanox.co.il>    2007-09-19 12:52:25 -0400
committer Roland Dreier <rolandd@cisco.com>              2007-09-23 16:03:22 -0400
commit    6e694ea33e7a7fad908d188c46f441f04fb633d4 (patch)
tree      82325240b222299766b5cfa53e59306cb607878a /drivers/infiniband
parent    40ffbfad6bb79a99cc7627bdaca0ee22dec526f6 (diff)
IB/mlx4: Fix data corruption triggered by wrong headroom marking order
This is an addendum to commit 0e6e7416 ("IB/mlx4: Handle new FW requirement for send request prefetching"). We also need to handle prefetch marking properly for S/G segments, or else the HCA may end up processing S/G segments that are not fully written and end up sending the wrong data. This can actually cause data corruption in practice, especially on systems with relatively slow CPUs (where the HCA is more likely to prefetch while the CPU is in the middle of writing a work request into memory).

We write S/G segments in reverse order into the WQE, in order to guarantee that the first dword of all cachelines containing S/G segments is written last (overwriting the headroom invalidation pattern). The entire cacheline will thus contain valid data when the invalidation pattern is overwritten.

Signed-off-by: Jack Morgenstein <jackm@dev.mellanox.co.il>
Signed-off-by: Roland Dreier <rolandd@cisco.com>
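To make the reverse-order idea concrete, here is a minimal standalone sketch, not the driver code: the names sketch_sg_seg and write_seg, the four-segment chunk, and all values are invented for illustration. It stamps a 64-byte chunk with the 0xffffffff invalidation pattern, then fills it with 16-byte segments from the last one back to the first, so the dword carrying the stamp is overwritten only once the rest of the line already holds valid data.

/*
 * Standalone illustration of the reverse-write idea (not driver code).
 * Assumptions: a 64-byte chunk whose first dword carries the "invalid"
 * pattern 0xffffffff, filled with four hypothetical 16-byte
 * scatter/gather segments whose first dword is the byte count.
 */
#include <stdint.h>
#include <string.h>
#include <stdio.h>

#define STAMP           0xffffffffu
#define SEGS_PER_LINE   4

struct sketch_sg_seg {          /* hypothetical 16-byte segment layout */
        uint32_t byte_count;    /* first dword of the segment */
        uint32_t lkey;
        uint64_t addr;
};

/* Pretend this chunk sits in the send queue "headroom". */
static struct sketch_sg_seg cacheline[SEGS_PER_LINE];

static void write_seg(struct sketch_sg_seg *seg, uint32_t len,
                      uint32_t lkey, uint64_t addr)
{
        seg->lkey = lkey;
        seg->addr = addr;
        /*
         * In the real driver a wmb() goes here so the byte count only
         * becomes visible after the rest of the segment; plain stores
         * stand in for that in this single-threaded sketch.
         */
        seg->byte_count = len;
}

int main(void)
{
        int i;

        /*
         * Mark the chunk as "not yet valid".  The driver only needs the
         * first dword of each 64-byte chunk to carry the pattern;
         * stamping everything keeps the sketch short.
         */
        memset(cacheline, 0xff, sizeof cacheline);

        /*
         * Write segments from the highest address down.  The segment at
         * index 0 holds the first dword of the chunk, so the stamp is
         * overwritten only after every other byte of the line is valid.
         */
        for (i = SEGS_PER_LINE - 1; i >= 0; --i)
                write_seg(&cacheline[i], 16, 0x1000 + i, 0x100000ull + 16 * i);

        printf("first dword now 0x%x (was 0x%x)\n",
               cacheline[0].byte_count, STAMP);
        return 0;
}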
Diffstat (limited to 'drivers/infiniband')
-rw-r--r--   drivers/infiniband/hw/mlx4/qp.c   62
1 file changed, 49 insertions(+), 13 deletions(-)
diff --git a/drivers/infiniband/hw/mlx4/qp.c b/drivers/infiniband/hw/mlx4/qp.c
index ba0428d872aa..85c51bdc36f1 100644
--- a/drivers/infiniband/hw/mlx4/qp.c
+++ b/drivers/infiniband/hw/mlx4/qp.c
@@ -1211,12 +1211,42 @@ static void set_datagram_seg(struct mlx4_wqe_datagram_seg *dseg,
 	dseg->qkey = cpu_to_be32(wr->wr.ud.remote_qkey);
 }
 
-static void set_data_seg(struct mlx4_wqe_data_seg *dseg,
-			 struct ib_sge *sg)
+static void set_mlx_icrc_seg(void *dseg)
+{
+	u32 *t = dseg;
+	struct mlx4_wqe_inline_seg *iseg = dseg;
+
+	t[1] = 0;
+
+	/*
+	 * Need a barrier here before writing the byte_count field to
+	 * make sure that all the data is visible before the
+	 * byte_count field is set.  Otherwise, if the segment begins
+	 * a new cacheline, the HCA prefetcher could grab the 64-byte
+	 * chunk and get a valid (!= 0xffffffff) byte count but
+	 * stale data, and end up sending the wrong data.
+	 */
+	wmb();
+
+	iseg->byte_count = cpu_to_be32((1 << 31) | 4);
+}
+
+static void set_data_seg(struct mlx4_wqe_data_seg *dseg, struct ib_sge *sg)
 {
-	dseg->byte_count = cpu_to_be32(sg->length);
 	dseg->lkey = cpu_to_be32(sg->lkey);
 	dseg->addr = cpu_to_be64(sg->addr);
+
+	/*
+	 * Need a barrier here before writing the byte_count field to
+	 * make sure that all the data is visible before the
+	 * byte_count field is set.  Otherwise, if the segment begins
+	 * a new cacheline, the HCA prefetcher could grab the 64-byte
+	 * chunk and get a valid (!= 0xffffffff) byte count but
+	 * stale data, and end up sending the wrong data.
+	 */
+	wmb();
+
+	dseg->byte_count = cpu_to_be32(sg->length);
 }
 
 int mlx4_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr,
@@ -1225,6 +1255,7 @@ int mlx4_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr,
 	struct mlx4_ib_qp *qp = to_mqp(ibqp);
 	void *wqe;
 	struct mlx4_wqe_ctrl_seg *ctrl;
+	struct mlx4_wqe_data_seg *dseg;
 	unsigned long flags;
 	int nreq;
 	int err = 0;
@@ -1324,22 +1355,27 @@ int mlx4_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr,
 			break;
 		}
 
-		for (i = 0; i < wr->num_sge; ++i) {
-			set_data_seg(wqe, wr->sg_list + i);
+		/*
+		 * Write data segments in reverse order, so as to
+		 * overwrite cacheline stamp last within each
+		 * cacheline.  This avoids issues with WQE
+		 * prefetching.
+		 */
 
-			wqe  += sizeof (struct mlx4_wqe_data_seg);
-			size += sizeof (struct mlx4_wqe_data_seg) / 16;
-		}
+		dseg = wqe;
+		dseg += wr->num_sge - 1;
+		size += wr->num_sge * (sizeof (struct mlx4_wqe_data_seg) / 16);
 
 		/* Add one more inline data segment for ICRC for MLX sends */
-		if (qp->ibqp.qp_type == IB_QPT_SMI || qp->ibqp.qp_type == IB_QPT_GSI) {
-			((struct mlx4_wqe_inline_seg *) wqe)->byte_count =
-				cpu_to_be32((1 << 31) | 4);
-			((u32 *) wqe)[1] = 0;
-			wqe  += sizeof (struct mlx4_wqe_data_seg);
+		if (unlikely(qp->ibqp.qp_type == IB_QPT_SMI ||
+			     qp->ibqp.qp_type == IB_QPT_GSI)) {
+			set_mlx_icrc_seg(dseg + 1);
 			size += sizeof (struct mlx4_wqe_data_seg) / 16;
 		}
 
+		for (i = wr->num_sge - 1; i >= 0; --i, --dseg)
+			set_data_seg(dseg, wr->sg_list + i);
+
 		ctrl->fence_size = (wr->send_flags & IB_SEND_FENCE ?
 				    MLX4_WQE_CTRL_FENCE : 0) | size;
 
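The comment repeated in set_mlx_icrc_seg() and set_data_seg() is about per-segment ordering rather than the reverse loop, and the same rule can be modelled in user space. The sketch below is an analogy only, with invented names and values: C11 release/acquire atomics stand in for the kernel's wmb(), a second thread stands in for the HCA's prefetcher, and byte_count plays the role of the dword that turns 0xffffffff into a valid length only after the rest of the segment is visible.

/*
 * User-space analogy (C11 atomics, not the kernel's wmb()) for the
 * ordering rule in set_data_seg(): publish the payload before the
 * dword that makes it look valid to the other agent.
 * Build with: cc -std=c11 -pthread sketch.c
 */
#include <stdatomic.h>
#include <stdint.h>
#include <pthread.h>
#include <stdio.h>

#define INVALID 0xffffffffu

static uint64_t payload;                        /* stands in for lkey/addr */
static _Atomic uint32_t byte_count = INVALID;   /* stands in for the stamped dword */

static void *producer(void *arg)
{
        (void)arg;
        payload = 0xdeadbeefcafef00dull;
        /* Release ordering plays the role of wmb(): payload first, count last. */
        atomic_store_explicit(&byte_count, 16, memory_order_release);
        return NULL;
}

static void *consumer(void *arg)
{
        (void)arg;
        /* The "prefetcher": only trusts the data once the count looks valid. */
        while (atomic_load_explicit(&byte_count, memory_order_acquire) == INVALID)
                ;
        printf("count=%u payload=%#llx\n",
               atomic_load_explicit(&byte_count, memory_order_relaxed),
               (unsigned long long)payload);
        return NULL;
}

int main(void)
{
        pthread_t p, c;

        pthread_create(&c, NULL, consumer, NULL);
        pthread_create(&p, NULL, producer, NULL);
        pthread_join(p, NULL);
        pthread_join(c, NULL);
        return 0;
}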