aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author: Hefty, Sean <sean.hefty@intel.com> 2011-05-10 01:06:10 -0400
committer: Roland Dreier <roland@purestorage.com> 2011-05-10 01:06:10 -0400
commit: a9bb79128aa659f97b774b97c9bb1bdc74444595 (patch)
tree: 20208e66195581568e5c68f838897199d224fa90
parent: 43b752daae9445a3b2b075a236840d801fce1593 (diff)
RDMA/cma: Add an ID_REUSEADDR option
Lustre requires that clients bind to a privileged port number before connecting to a remote server. On larger clusters (typically more than about 1000 nodes), the number of privileged ports is exhausted, resulting in Lustre being unusable. To handle this, we add support for reusable addresses to the rdma_cm. This mimics the behavior of the socket option SO_REUSEADDR. A user may set an rdma_cm_id to reuse an address before calling rdma_bind_addr() (explicitly or implicitly). If set, other rdma_cm_id's may be bound to the same address, provided that they all have reuse enabled, and there are no active listens. If rdma_listen() is called on an rdma_cm_id that has reuse enabled, it will only succeed if there are no other id's bound to that same address. The reuse option is exported to user space. The behavior of the kernel reuse implementation was verified against that given by sockets. This patch is derived from a patch by Ira Weiny <weiny2@llnl.gov>. Signed-off-by: Sean Hefty <sean.hefty@intel.com> Signed-off-by: Roland Dreier <roland@purestorage.com>
-rw-r--r-- drivers/infiniband/core/cma.c 190
-rw-r--r-- drivers/infiniband/core/ucma.c 7
-rw-r--r-- include/rdma/rdma_cm.h 10
-rw-r--r-- include/rdma/rdma_user_cm.h 5
4 files changed, 143 insertions, 69 deletions
diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c
index eff5e46f005c..99dde874fbbd 100644
--- a/drivers/infiniband/core/cma.c
+++ b/drivers/infiniband/core/cma.c
@@ -148,6 +148,7 @@ struct rdma_id_private {
148 u32 qp_num; 148 u32 qp_num;
149 u8 srq; 149 u8 srq;
150 u8 tos; 150 u8 tos;
151 u8 reuseaddr;
151}; 152};
152 153
153struct cma_multicast { 154struct cma_multicast {
@@ -1579,50 +1580,6 @@ static void cma_listen_on_all(struct rdma_id_private *id_priv)
1579 mutex_unlock(&lock); 1580 mutex_unlock(&lock);
1580} 1581}
1581 1582
1582int rdma_listen(struct rdma_cm_id *id, int backlog)
1583{
1584 struct rdma_id_private *id_priv;
1585 int ret;
1586
1587 id_priv = container_of(id, struct rdma_id_private, id);
1588 if (id_priv->state == CMA_IDLE) {
1589 ((struct sockaddr *) &id->route.addr.src_addr)->sa_family = AF_INET;
1590 ret = rdma_bind_addr(id, (struct sockaddr *) &id->route.addr.src_addr);
1591 if (ret)
1592 return ret;
1593 }
1594
1595 if (!cma_comp_exch(id_priv, CMA_ADDR_BOUND, CMA_LISTEN))
1596 return -EINVAL;
1597
1598 id_priv->backlog = backlog;
1599 if (id->device) {
1600 switch (rdma_node_get_transport(id->device->node_type)) {
1601 case RDMA_TRANSPORT_IB:
1602 ret = cma_ib_listen(id_priv);
1603 if (ret)
1604 goto err;
1605 break;
1606 case RDMA_TRANSPORT_IWARP:
1607 ret = cma_iw_listen(id_priv, backlog);
1608 if (ret)
1609 goto err;
1610 break;
1611 default:
1612 ret = -ENOSYS;
1613 goto err;
1614 }
1615 } else
1616 cma_listen_on_all(id_priv);
1617
1618 return 0;
1619err:
1620 id_priv->backlog = 0;
1621 cma_comp_exch(id_priv, CMA_LISTEN, CMA_ADDR_BOUND);
1622 return ret;
1623}
1624EXPORT_SYMBOL(rdma_listen);
1625
1626void rdma_set_service_type(struct rdma_cm_id *id, int tos) 1583void rdma_set_service_type(struct rdma_cm_id *id, int tos)
1627{ 1584{
1628 struct rdma_id_private *id_priv; 1585 struct rdma_id_private *id_priv;
@@ -2105,6 +2062,25 @@ err:
2105} 2062}
2106EXPORT_SYMBOL(rdma_resolve_addr); 2063EXPORT_SYMBOL(rdma_resolve_addr);
2107 2064
2065int rdma_set_reuseaddr(struct rdma_cm_id *id, int reuse)
2066{
2067 struct rdma_id_private *id_priv;
2068 unsigned long flags;
2069 int ret;
2070
2071 id_priv = container_of(id, struct rdma_id_private, id);
2072 spin_lock_irqsave(&id_priv->lock, flags);
2073 if (id_priv->state == CMA_IDLE) {
2074 id_priv->reuseaddr = reuse;
2075 ret = 0;
2076 } else {
2077 ret = -EINVAL;
2078 }
2079 spin_unlock_irqrestore(&id_priv->lock, flags);
2080 return ret;
2081}
2082EXPORT_SYMBOL(rdma_set_reuseaddr);
2083
2108static void cma_bind_port(struct rdma_bind_list *bind_list, 2084static void cma_bind_port(struct rdma_bind_list *bind_list,
2109 struct rdma_id_private *id_priv) 2085 struct rdma_id_private *id_priv)
2110{ 2086{
@@ -2180,43 +2156,73 @@ retry:
2180 return -EADDRNOTAVAIL; 2156 return -EADDRNOTAVAIL;
2181} 2157}
2182 2158
2183static int cma_use_port(struct idr *ps, struct rdma_id_private *id_priv) 2159/*
2160 * Check that the requested port is available. This is called when trying to
2161 * bind to a specific port, or when trying to listen on a bound port. In
2162 * the latter case, the provided id_priv may already be on the bind_list, but
2163 * we still need to check that it's okay to start listening.
2164 */
2165static int cma_check_port(struct rdma_bind_list *bind_list,
2166 struct rdma_id_private *id_priv, uint8_t reuseaddr)
2184{ 2167{
2185 struct rdma_id_private *cur_id; 2168 struct rdma_id_private *cur_id;
2186 struct sockaddr *addr, *cur_addr; 2169 struct sockaddr *addr, *cur_addr;
2187 struct rdma_bind_list *bind_list;
2188 struct hlist_node *node; 2170 struct hlist_node *node;
2189 unsigned short snum;
2190 2171
2191 addr = (struct sockaddr *) &id_priv->id.route.addr.src_addr; 2172 addr = (struct sockaddr *) &id_priv->id.route.addr.src_addr;
2192 snum = ntohs(cma_port(addr)); 2173 if (cma_any_addr(addr) && !reuseaddr)
2193 if (snum < PROT_SOCK && !capable(CAP_NET_BIND_SERVICE))
2194 return -EACCES;
2195
2196 bind_list = idr_find(ps, snum);
2197 if (!bind_list)
2198 return cma_alloc_port(ps, id_priv, snum);
2199
2200 /*
2201 * We don't support binding to any address if anyone is bound to
2202 * a specific address on the same port.
2203 */
2204 if (cma_any_addr(addr))
2205 return -EADDRNOTAVAIL; 2174 return -EADDRNOTAVAIL;
2206 2175
2207 hlist_for_each_entry(cur_id, node, &bind_list->owners, node) { 2176 hlist_for_each_entry(cur_id, node, &bind_list->owners, node) {
2208 cur_addr = (struct sockaddr *) &cur_id->id.route.addr.src_addr; 2177 if (id_priv == cur_id)
2209 if (cma_any_addr(cur_addr)) 2178 continue;
2210 return -EADDRNOTAVAIL;
2211 2179
2212 if (!cma_addr_cmp(addr, cur_addr)) 2180 if ((cur_id->state == CMA_LISTEN) ||
2213 return -EADDRINUSE; 2181 !reuseaddr || !cur_id->reuseaddr) {
2214 } 2182 cur_addr = (struct sockaddr *) &cur_id->id.route.addr.src_addr;
2183 if (cma_any_addr(cur_addr))
2184 return -EADDRNOTAVAIL;
2215 2185
2216 cma_bind_port(bind_list, id_priv); 2186 if (!cma_addr_cmp(addr, cur_addr))
2187 return -EADDRINUSE;
2188 }
2189 }
2217 return 0; 2190 return 0;
2218} 2191}
2219 2192
2193static int cma_use_port(struct idr *ps, struct rdma_id_private *id_priv)
2194{
2195 struct rdma_bind_list *bind_list;
2196 unsigned short snum;
2197 int ret;
2198
2199 snum = ntohs(cma_port((struct sockaddr *) &id_priv->id.route.addr.src_addr));
2200 if (snum < PROT_SOCK && !capable(CAP_NET_BIND_SERVICE))
2201 return -EACCES;
2202
2203 bind_list = idr_find(ps, snum);
2204 if (!bind_list) {
2205 ret = cma_alloc_port(ps, id_priv, snum);
2206 } else {
2207 ret = cma_check_port(bind_list, id_priv, id_priv->reuseaddr);
2208 if (!ret)
2209 cma_bind_port(bind_list, id_priv);
2210 }
2211 return ret;
2212}
2213
2214static int cma_bind_listen(struct rdma_id_private *id_priv)
2215{
2216 struct rdma_bind_list *bind_list = id_priv->bind_list;
2217 int ret = 0;
2218
2219 mutex_lock(&lock);
2220 if (bind_list->owners.first->next)
2221 ret = cma_check_port(bind_list, id_priv, 0);
2222 mutex_unlock(&lock);
2223 return ret;
2224}
2225
2220static int cma_get_port(struct rdma_id_private *id_priv) 2226static int cma_get_port(struct rdma_id_private *id_priv)
2221{ 2227{
2222 struct idr *ps; 2228 struct idr *ps;
@@ -2268,6 +2274,56 @@ static int cma_check_linklocal(struct rdma_dev_addr *dev_addr,
2268 return 0; 2274 return 0;
2269} 2275}
2270 2276
2277int rdma_listen(struct rdma_cm_id *id, int backlog)
2278{
2279 struct rdma_id_private *id_priv;
2280 int ret;
2281
2282 id_priv = container_of(id, struct rdma_id_private, id);
2283 if (id_priv->state == CMA_IDLE) {
2284 ((struct sockaddr *) &id->route.addr.src_addr)->sa_family = AF_INET;
2285 ret = rdma_bind_addr(id, (struct sockaddr *) &id->route.addr.src_addr);
2286 if (ret)
2287 return ret;
2288 }
2289
2290 if (!cma_comp_exch(id_priv, CMA_ADDR_BOUND, CMA_LISTEN))
2291 return -EINVAL;
2292
2293 if (id_priv->reuseaddr) {
2294 ret = cma_bind_listen(id_priv);
2295 if (ret)
2296 goto err;
2297 }
2298
2299 id_priv->backlog = backlog;
2300 if (id->device) {
2301 switch (rdma_node_get_transport(id->device->node_type)) {
2302 case RDMA_TRANSPORT_IB:
2303 ret = cma_ib_listen(id_priv);
2304 if (ret)
2305 goto err;
2306 break;
2307 case RDMA_TRANSPORT_IWARP:
2308 ret = cma_iw_listen(id_priv, backlog);
2309 if (ret)
2310 goto err;
2311 break;
2312 default:
2313 ret = -ENOSYS;
2314 goto err;
2315 }
2316 } else
2317 cma_listen_on_all(id_priv);
2318
2319 return 0;
2320err:
2321 id_priv->backlog = 0;
2322 cma_comp_exch(id_priv, CMA_LISTEN, CMA_ADDR_BOUND);
2323 return ret;
2324}
2325EXPORT_SYMBOL(rdma_listen);
2326
2271int rdma_bind_addr(struct rdma_cm_id *id, struct sockaddr *addr) 2327int rdma_bind_addr(struct rdma_cm_id *id, struct sockaddr *addr)
2272{ 2328{
2273 struct rdma_id_private *id_priv; 2329 struct rdma_id_private *id_priv;
diff --git a/drivers/infiniband/core/ucma.c b/drivers/infiniband/core/ucma.c
index ec1e9da1488b..b3fa798525b2 100644
--- a/drivers/infiniband/core/ucma.c
+++ b/drivers/infiniband/core/ucma.c
@@ -883,6 +883,13 @@ static int ucma_set_option_id(struct ucma_context *ctx, int optname,
883 } 883 }
884 rdma_set_service_type(ctx->cm_id, *((u8 *) optval)); 884 rdma_set_service_type(ctx->cm_id, *((u8 *) optval));
885 break; 885 break;
886 case RDMA_OPTION_ID_REUSEADDR:
887 if (optlen != sizeof(int)) {
888 ret = -EINVAL;
889 break;
890 }
891 ret = rdma_set_reuseaddr(ctx->cm_id, *((int *) optval) ? 1 : 0);
892 break;
886 default: 893 default:
887 ret = -ENOSYS; 894 ret = -ENOSYS;
888 } 895 }
diff --git a/include/rdma/rdma_cm.h b/include/rdma/rdma_cm.h
index 4fae90304648..169f7a53fb0c 100644
--- a/include/rdma/rdma_cm.h
+++ b/include/rdma/rdma_cm.h
@@ -329,4 +329,14 @@ void rdma_leave_multicast(struct rdma_cm_id *id, struct sockaddr *addr);
329 */ 329 */
330void rdma_set_service_type(struct rdma_cm_id *id, int tos); 330void rdma_set_service_type(struct rdma_cm_id *id, int tos);
331 331
332/**
333 * rdma_set_reuseaddr - Allow the reuse of local addresses when binding
334 * the rdma_cm_id.
335 * @id: Communication identifier to configure.
336 * @reuse: Value indicating if the bound address is reusable.
337 *
338 * Reuse must be set before an address is bound to the id.
339 */
340int rdma_set_reuseaddr(struct rdma_cm_id *id, int reuse);
341
332#endif /* RDMA_CM_H */ 342#endif /* RDMA_CM_H */
diff --git a/include/rdma/rdma_user_cm.h b/include/rdma/rdma_user_cm.h
index 1d165022c02d..fc82c1896f75 100644
--- a/include/rdma/rdma_user_cm.h
+++ b/include/rdma/rdma_user_cm.h
@@ -221,8 +221,9 @@ enum {
221 221
222/* Option details */ 222/* Option details */
223enum { 223enum {
224 RDMA_OPTION_ID_TOS = 0, 224 RDMA_OPTION_ID_TOS = 0,
225 RDMA_OPTION_IB_PATH = 1 225 RDMA_OPTION_ID_REUSEADDR = 1,
226 RDMA_OPTION_IB_PATH = 1
226}; 227};
227 228
228struct rdma_ucm_set_option { 229struct rdma_ucm_set_option {