summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorSantosh Shilimkar <santosh.shilimkar@oracle.com>2018-10-13 10:13:23 -0400
committerSantosh Shilimkar <santosh.shilimkar@oracle.com>2019-02-04 17:59:13 -0500
commitfd261ce6a30e01ad67c416e2c67e263024b3a6f9 (patch)
treeb54d7fe0fe3d096c3cf8c9bdab695aa563cab3bb
parent56dc8bce9f761cf61258e25d96dec4072273d8db (diff)
rds: rdma: update rdma transport for tos
For RDMA transports, RDS TOS is an extension of IB QoS (Annex A13) to provide clients the ability to segregate traffic flows for different types of data. RDMA CM abstracts it for ULPs using rdma_set_service_type(). Internally, each traffic flow is represented by a connection with all of its independent resources, like that of a normal connection, and is differentiated by service type. In other words, there can be multiple QP connections between an IP pair, and each supports a unique service type. The feature has been added from RDSv4.1 onwards and supports rolling upgrades. RDMA connection metadata also carries the tos information to set up the SL in an end-to-end context. The original code was developed by Bang Nguyen in a downstream kernel back in the 2.6.32 kernel days and it has evolved over a period of time. Reviewed-by: Sowmini Varadhan <sowmini.varadhan@oracle.com> Signed-off-by: Santosh Shilimkar <santosh.shilimkar@oracle.com> [yanjun.zhu@oracle.com: Adapted original patch with ipv6 changes] Signed-off-by: Zhu Yanjun <yanjun.zhu@oracle.com>
-rw-r--r--net/rds/ib.h4
-rw-r--r--net/rds/ib_cm.c32
-rw-r--r--net/rds/ib_recv.c4
-rw-r--r--net/rds/ib_send.c5
-rw-r--r--net/rds/rdma_transport.c1
-rw-r--r--net/rds/send.c5
6 files changed, 29 insertions, 22 deletions
diff --git a/net/rds/ib.h b/net/rds/ib.h
index 71ff356ee702..752f92235a38 100644
--- a/net/rds/ib.h
+++ b/net/rds/ib.h
@@ -67,7 +67,9 @@ struct rds_ib_conn_priv_cmn {
67 u8 ricpc_protocol_major; 67 u8 ricpc_protocol_major;
68 u8 ricpc_protocol_minor; 68 u8 ricpc_protocol_minor;
69 __be16 ricpc_protocol_minor_mask; /* bitmask */ 69 __be16 ricpc_protocol_minor_mask; /* bitmask */
70 __be32 ricpc_reserved1; 70 u8 ricpc_dp_toss;
71 u8 ripc_reserved1;
72 __be16 ripc_reserved2;
71 __be64 ricpc_ack_seq; 73 __be64 ricpc_ack_seq;
72 __be32 ricpc_credit; /* non-zero enables flow ctl */ 74 __be32 ricpc_credit; /* non-zero enables flow ctl */
73}; 75};
diff --git a/net/rds/ib_cm.c b/net/rds/ib_cm.c
index 70518e329a9e..66c6eb56072b 100644
--- a/net/rds/ib_cm.c
+++ b/net/rds/ib_cm.c
@@ -144,9 +144,9 @@ void rds_ib_cm_connect_complete(struct rds_connection *conn, struct rdma_cm_even
144 } 144 }
145 } 145 }
146 146
147 pr_notice("RDS/IB: %s conn connected <%pI6c,%pI6c> version %u.%u%s\n", 147 pr_notice("RDS/IB: %s conn connected <%pI6c,%pI6c,%d> version %u.%u%s\n",
148 ic->i_active_side ? "Active" : "Passive", 148 ic->i_active_side ? "Active" : "Passive",
149 &conn->c_laddr, &conn->c_faddr, 149 &conn->c_laddr, &conn->c_faddr, conn->c_tos,
150 RDS_PROTOCOL_MAJOR(conn->c_version), 150 RDS_PROTOCOL_MAJOR(conn->c_version),
151 RDS_PROTOCOL_MINOR(conn->c_version), 151 RDS_PROTOCOL_MINOR(conn->c_version),
152 ic->i_flowctl ? ", flow control" : ""); 152 ic->i_flowctl ? ", flow control" : "");
@@ -222,6 +222,7 @@ static void rds_ib_cm_fill_conn_param(struct rds_connection *conn,
222 cpu_to_be16(RDS_IB_SUPPORTED_PROTOCOLS); 222 cpu_to_be16(RDS_IB_SUPPORTED_PROTOCOLS);
223 dp->ricp_v6.dp_ack_seq = 223 dp->ricp_v6.dp_ack_seq =
224 cpu_to_be64(rds_ib_piggyb_ack(ic)); 224 cpu_to_be64(rds_ib_piggyb_ack(ic));
225 dp->ricp_v6.dp_cmn.ricpc_dp_toss = conn->c_tos;
225 226
226 conn_param->private_data = &dp->ricp_v6; 227 conn_param->private_data = &dp->ricp_v6;
227 conn_param->private_data_len = sizeof(dp->ricp_v6); 228 conn_param->private_data_len = sizeof(dp->ricp_v6);
@@ -236,6 +237,7 @@ static void rds_ib_cm_fill_conn_param(struct rds_connection *conn,
236 cpu_to_be16(RDS_IB_SUPPORTED_PROTOCOLS); 237 cpu_to_be16(RDS_IB_SUPPORTED_PROTOCOLS);
237 dp->ricp_v4.dp_ack_seq = 238 dp->ricp_v4.dp_ack_seq =
238 cpu_to_be64(rds_ib_piggyb_ack(ic)); 239 cpu_to_be64(rds_ib_piggyb_ack(ic));
240 dp->ricp_v4.dp_cmn.ricpc_dp_toss = conn->c_tos;
239 241
240 conn_param->private_data = &dp->ricp_v4; 242 conn_param->private_data = &dp->ricp_v4;
241 conn_param->private_data_len = sizeof(dp->ricp_v4); 243 conn_param->private_data_len = sizeof(dp->ricp_v4);
@@ -391,10 +393,9 @@ static void rds_ib_qp_event_handler(struct ib_event *event, void *data)
391 rdma_notify(ic->i_cm_id, IB_EVENT_COMM_EST); 393 rdma_notify(ic->i_cm_id, IB_EVENT_COMM_EST);
392 break; 394 break;
393 default: 395 default:
394 rdsdebug("Fatal QP Event %u (%s) " 396 rdsdebug("Fatal QP Event %u (%s) - connection %pI6c->%pI6c, reconnecting\n",
395 "- connection %pI6c->%pI6c, reconnecting\n", 397 event->event, ib_event_msg(event->event),
396 event->event, ib_event_msg(event->event), 398 &conn->c_laddr, &conn->c_faddr);
397 &conn->c_laddr, &conn->c_faddr);
398 rds_conn_drop(conn); 399 rds_conn_drop(conn);
399 break; 400 break;
400 } 401 }
@@ -662,11 +663,11 @@ static u32 rds_ib_protocol_compatible(struct rdma_cm_event *event, bool isv6)
662 663
663 /* Even if len is crap *now* I still want to check it. -ASG */ 664 /* Even if len is crap *now* I still want to check it. -ASG */
664 if (event->param.conn.private_data_len < data_len || major == 0) 665 if (event->param.conn.private_data_len < data_len || major == 0)
665 return RDS_PROTOCOL_3_0; 666 return RDS_PROTOCOL_4_0;
666 667
667 common = be16_to_cpu(mask) & RDS_IB_SUPPORTED_PROTOCOLS; 668 common = be16_to_cpu(mask) & RDS_IB_SUPPORTED_PROTOCOLS;
668 if (major == 3 && common) { 669 if (major == 4 && common) {
669 version = RDS_PROTOCOL_3_0; 670 version = RDS_PROTOCOL_4_0;
670 while ((common >>= 1) != 0) 671 while ((common >>= 1) != 0)
671 version++; 672 version++;
672 } else if (RDS_PROTOCOL_COMPAT_VERSION == 673 } else if (RDS_PROTOCOL_COMPAT_VERSION ==
@@ -778,15 +779,16 @@ int rds_ib_cm_handle_connect(struct rdma_cm_id *cm_id,
778 daddr6 = &d_mapped_addr; 779 daddr6 = &d_mapped_addr;
779 } 780 }
780 781
781 rdsdebug("saddr %pI6c daddr %pI6c RDSv%u.%u lguid 0x%llx fguid " 782 rdsdebug("saddr %pI6c daddr %pI6c RDSv%u.%u lguid 0x%llx fguid 0x%llx, tos:%d\n",
782 "0x%llx\n", saddr6, daddr6, 783 saddr6, daddr6, RDS_PROTOCOL_MAJOR(version),
783 RDS_PROTOCOL_MAJOR(version), RDS_PROTOCOL_MINOR(version), 784 RDS_PROTOCOL_MINOR(version),
784 (unsigned long long)be64_to_cpu(lguid), 785 (unsigned long long)be64_to_cpu(lguid),
785 (unsigned long long)be64_to_cpu(fguid)); 786 (unsigned long long)be64_to_cpu(fguid), dp_cmn->ricpc_dp_toss);
786 787
787 /* RDS/IB is not currently netns aware, thus init_net */ 788 /* RDS/IB is not currently netns aware, thus init_net */
788 conn = rds_conn_create(&init_net, daddr6, saddr6, 789 conn = rds_conn_create(&init_net, daddr6, saddr6,
789 &rds_ib_transport, 0, GFP_KERNEL, ifindex); 790 &rds_ib_transport, dp_cmn->ricpc_dp_toss,
791 GFP_KERNEL, ifindex);
790 if (IS_ERR(conn)) { 792 if (IS_ERR(conn)) {
791 rdsdebug("rds_conn_create failed (%ld)\n", PTR_ERR(conn)); 793 rdsdebug("rds_conn_create failed (%ld)\n", PTR_ERR(conn));
792 conn = NULL; 794 conn = NULL;
@@ -868,7 +870,7 @@ int rds_ib_cm_initiate_connect(struct rdma_cm_id *cm_id, bool isv6)
868 870
869 /* If the peer doesn't do protocol negotiation, we must 871 /* If the peer doesn't do protocol negotiation, we must
870 * default to RDSv3.0 */ 872 * default to RDSv3.0 */
871 rds_ib_set_protocol(conn, RDS_PROTOCOL_VERSION); 873 rds_ib_set_protocol(conn, RDS_PROTOCOL_4_1);
872 ic->i_flowctl = rds_ib_sysctl_flow_control; /* advertise flow control */ 874 ic->i_flowctl = rds_ib_sysctl_flow_control; /* advertise flow control */
873 875
874 ret = rds_ib_setup_qp(conn); 876 ret = rds_ib_setup_qp(conn);
diff --git a/net/rds/ib_recv.c b/net/rds/ib_recv.c
index 2f16146e4ec9..d395eec98959 100644
--- a/net/rds/ib_recv.c
+++ b/net/rds/ib_recv.c
@@ -986,9 +986,9 @@ void rds_ib_recv_cqe_handler(struct rds_ib_connection *ic,
986 } else { 986 } else {
987 /* We expect errors as the qp is drained during shutdown */ 987 /* We expect errors as the qp is drained during shutdown */
988 if (rds_conn_up(conn) || rds_conn_connecting(conn)) 988 if (rds_conn_up(conn) || rds_conn_connecting(conn))
989 rds_ib_conn_error(conn, "recv completion on <%pI6c,%pI6c> had status %u (%s), disconnecting and reconnecting\n", 989 rds_ib_conn_error(conn, "recv completion on <%pI6c,%pI6c, %d> had status %u (%s), disconnecting and reconnecting\n",
990 &conn->c_laddr, &conn->c_faddr, 990 &conn->c_laddr, &conn->c_faddr,
991 wc->status, 991 conn->c_tos, wc->status,
992 ib_wc_status_msg(wc->status)); 992 ib_wc_status_msg(wc->status));
993 } 993 }
994 994
diff --git a/net/rds/ib_send.c b/net/rds/ib_send.c
index 4e0c36acf866..09c46f2e97fa 100644
--- a/net/rds/ib_send.c
+++ b/net/rds/ib_send.c
@@ -305,8 +305,9 @@ void rds_ib_send_cqe_handler(struct rds_ib_connection *ic, struct ib_wc *wc)
305 305
306 /* We expect errors as the qp is drained during shutdown */ 306 /* We expect errors as the qp is drained during shutdown */
307 if (wc->status != IB_WC_SUCCESS && rds_conn_up(conn)) { 307 if (wc->status != IB_WC_SUCCESS && rds_conn_up(conn)) {
308 rds_ib_conn_error(conn, "send completion on <%pI6c,%pI6c> had status %u (%s), disconnecting and reconnecting\n", 308 rds_ib_conn_error(conn, "send completion on <%pI6c,%pI6c,%d> had status %u (%s), disconnecting and reconnecting\n",
309 &conn->c_laddr, &conn->c_faddr, wc->status, 309 &conn->c_laddr, &conn->c_faddr,
310 conn->c_tos, wc->status,
310 ib_wc_status_msg(wc->status)); 311 ib_wc_status_msg(wc->status));
311 } 312 }
312} 313}
diff --git a/net/rds/rdma_transport.c b/net/rds/rdma_transport.c
index e37f91537d29..46bce8389066 100644
--- a/net/rds/rdma_transport.c
+++ b/net/rds/rdma_transport.c
@@ -83,6 +83,7 @@ static int rds_rdma_cm_event_handler_cmn(struct rdma_cm_id *cm_id,
83 break; 83 break;
84 84
85 case RDMA_CM_EVENT_ADDR_RESOLVED: 85 case RDMA_CM_EVENT_ADDR_RESOLVED:
86 rdma_set_service_type(cm_id, conn->c_tos);
86 /* XXX do we need to clean up if this fails? */ 87 /* XXX do we need to clean up if this fails? */
87 ret = rdma_resolve_route(cm_id, 88 ret = rdma_resolve_route(cm_id,
88 RDS_RDMA_RESOLVE_TIMEOUT_MS); 89 RDS_RDMA_RESOLVE_TIMEOUT_MS);
diff --git a/net/rds/send.c b/net/rds/send.c
index c555e121b908..166dd578c1cc 100644
--- a/net/rds/send.c
+++ b/net/rds/send.c
@@ -1277,12 +1277,13 @@ int rds_sendmsg(struct socket *sock, struct msghdr *msg, size_t payload_len)
1277 1277
1278 /* rds_conn_create has a spinlock that runs with IRQ off. 1278 /* rds_conn_create has a spinlock that runs with IRQ off.
1279 * Caching the conn in the socket helps a lot. */ 1279 * Caching the conn in the socket helps a lot. */
1280 if (rs->rs_conn && ipv6_addr_equal(&rs->rs_conn->c_faddr, &daddr)) { 1280 if (rs->rs_conn && ipv6_addr_equal(&rs->rs_conn->c_faddr, &daddr) &&
1281 rs->rs_tos == rs->rs_conn->c_tos) {
1281 conn = rs->rs_conn; 1282 conn = rs->rs_conn;
1282 } else { 1283 } else {
1283 conn = rds_conn_create_outgoing(sock_net(sock->sk), 1284 conn = rds_conn_create_outgoing(sock_net(sock->sk),
1284 &rs->rs_bound_addr, &daddr, 1285 &rs->rs_bound_addr, &daddr,
1285 rs->rs_transport, 0, 1286 rs->rs_transport, rs->rs_tos,
1286 sock->sk->sk_allocation, 1287 sock->sk->sk_allocation,
1287 scope_id); 1288 scope_id);
1288 if (IS_ERR(conn)) { 1289 if (IS_ERR(conn)) {