aboutsummaryrefslogtreecommitdiffstats
path: root/net/rds/ib.c
diff options
context:
space:
mode:
Diffstat (limited to 'net/rds/ib.c')
-rw-r--r--net/rds/ib.c129
1 files changed, 95 insertions, 34 deletions
diff --git a/net/rds/ib.c b/net/rds/ib.c
index 7d289d7985fe..1732f8effb59 100644
--- a/net/rds/ib.c
+++ b/net/rds/ib.c
@@ -59,6 +59,38 @@ struct list_head rds_ib_devices;
59DEFINE_SPINLOCK(ib_nodev_conns_lock); 59DEFINE_SPINLOCK(ib_nodev_conns_lock);
60LIST_HEAD(ib_nodev_conns); 60LIST_HEAD(ib_nodev_conns);
61 61
62/*
63 * rds_ib_destroy_mr_pool() blocks on a few things and mrs drop references
64 * from interrupt context so we push freeing off into a work struct in krdsd.
65 */
66static void rds_ib_dev_free(struct work_struct *work)
67{
68 struct rds_ib_ipaddr *i_ipaddr, *i_next;
69 struct rds_ib_device *rds_ibdev = container_of(work,
70 struct rds_ib_device, free_work);
71
72 if (rds_ibdev->mr_pool)
73 rds_ib_destroy_mr_pool(rds_ibdev->mr_pool);
74 if (rds_ibdev->mr)
75 ib_dereg_mr(rds_ibdev->mr);
76 if (rds_ibdev->pd)
77 ib_dealloc_pd(rds_ibdev->pd);
78
79 list_for_each_entry_safe(i_ipaddr, i_next, &rds_ibdev->ipaddr_list, list) {
80 list_del(&i_ipaddr->list);
81 kfree(i_ipaddr);
82 }
83
84 kfree(rds_ibdev);
85}
86
87void rds_ib_dev_put(struct rds_ib_device *rds_ibdev)
88{
89 BUG_ON(atomic_read(&rds_ibdev->refcount) <= 0);
90 if (atomic_dec_and_test(&rds_ibdev->refcount))
91 queue_work(rds_wq, &rds_ibdev->free_work);
92}
93
62void rds_ib_add_one(struct ib_device *device) 94void rds_ib_add_one(struct ib_device *device)
63{ 95{
64 struct rds_ib_device *rds_ibdev; 96 struct rds_ib_device *rds_ibdev;
@@ -77,11 +109,14 @@ void rds_ib_add_one(struct ib_device *device)
77 goto free_attr; 109 goto free_attr;
78 } 110 }
79 111
80 rds_ibdev = kmalloc_node(sizeof *rds_ibdev, GFP_KERNEL, ibdev_to_node(device)); 112 rds_ibdev = kzalloc_node(sizeof(struct rds_ib_device), GFP_KERNEL,
113 ibdev_to_node(device));
81 if (!rds_ibdev) 114 if (!rds_ibdev)
82 goto free_attr; 115 goto free_attr;
83 116
84 spin_lock_init(&rds_ibdev->spinlock); 117 spin_lock_init(&rds_ibdev->spinlock);
118 atomic_set(&rds_ibdev->refcount, 1);
119 INIT_WORK(&rds_ibdev->free_work, rds_ib_dev_free);
85 120
86 rds_ibdev->max_wrs = dev_attr->max_qp_wr; 121 rds_ibdev->max_wrs = dev_attr->max_qp_wr;
87 rds_ibdev->max_sge = min(dev_attr->max_sge, RDS_IB_MAX_SGE); 122 rds_ibdev->max_sge = min(dev_attr->max_sge, RDS_IB_MAX_SGE);
@@ -96,67 +131,93 @@ void rds_ib_add_one(struct ib_device *device)
96 131
97 rds_ibdev->dev = device; 132 rds_ibdev->dev = device;
98 rds_ibdev->pd = ib_alloc_pd(device); 133 rds_ibdev->pd = ib_alloc_pd(device);
99 if (IS_ERR(rds_ibdev->pd)) 134 if (IS_ERR(rds_ibdev->pd)) {
100 goto free_dev; 135 rds_ibdev->pd = NULL;
136 goto put_dev;
137 }
101 138
102 rds_ibdev->mr = ib_get_dma_mr(rds_ibdev->pd, 139 rds_ibdev->mr = ib_get_dma_mr(rds_ibdev->pd, IB_ACCESS_LOCAL_WRITE);
103 IB_ACCESS_LOCAL_WRITE); 140 if (IS_ERR(rds_ibdev->mr)) {
104 if (IS_ERR(rds_ibdev->mr)) 141 rds_ibdev->mr = NULL;
105 goto err_pd; 142 goto put_dev;
143 }
106 144
107 rds_ibdev->mr_pool = rds_ib_create_mr_pool(rds_ibdev); 145 rds_ibdev->mr_pool = rds_ib_create_mr_pool(rds_ibdev);
108 if (IS_ERR(rds_ibdev->mr_pool)) { 146 if (IS_ERR(rds_ibdev->mr_pool)) {
109 rds_ibdev->mr_pool = NULL; 147 rds_ibdev->mr_pool = NULL;
110 goto err_mr; 148 goto put_dev;
111 } 149 }
112 150
113 INIT_LIST_HEAD(&rds_ibdev->ipaddr_list); 151 INIT_LIST_HEAD(&rds_ibdev->ipaddr_list);
114 INIT_LIST_HEAD(&rds_ibdev->conn_list); 152 INIT_LIST_HEAD(&rds_ibdev->conn_list);
115 list_add_tail(&rds_ibdev->list, &rds_ib_devices); 153 list_add_tail(&rds_ibdev->list, &rds_ib_devices);
154 atomic_inc(&rds_ibdev->refcount);
116 155
117 ib_set_client_data(device, &rds_ib_client, rds_ibdev); 156 ib_set_client_data(device, &rds_ib_client, rds_ibdev);
157 atomic_inc(&rds_ibdev->refcount);
118 158
119 goto free_attr; 159put_dev:
120 160 rds_ib_dev_put(rds_ibdev);
121err_mr:
122 ib_dereg_mr(rds_ibdev->mr);
123err_pd:
124 ib_dealloc_pd(rds_ibdev->pd);
125free_dev:
126 kfree(rds_ibdev);
127free_attr: 161free_attr:
128 kfree(dev_attr); 162 kfree(dev_attr);
129} 163}
130 164
165/*
166 * New connections use this to find the device to associate with the
167 * connection. It's not in the fast path so we're not concerned about the
168 * performance of the IB call. (As of this writing, it uses an interrupt
169 * blocking spinlock to serialize walking a per-device list of all registered
170 * clients.)
171 *
172 * RCU is used to handle incoming connections racing with device teardown.
173 * Rather than use a lock to serialize removal from the client_data and
174 * getting a new reference, we use an RCU grace period. The destruction
175 * path removes the device from client_data and then waits for all RCU
176 * readers to finish.
177 *
178 * A new connection can get NULL from this if its arriving on a
179 * device that is in the process of being removed.
180 */
181struct rds_ib_device *rds_ib_get_client_data(struct ib_device *device)
182{
183 struct rds_ib_device *rds_ibdev;
184
185 rcu_read_lock();
186 rds_ibdev = ib_get_client_data(device, &rds_ib_client);
187 if (rds_ibdev)
188 atomic_inc(&rds_ibdev->refcount);
189 rcu_read_unlock();
190 return rds_ibdev;
191}
192
193/*
194 * The IB stack is letting us know that a device is going away. This can
195 * happen if the underlying HCA driver is removed or if PCI hotplug is removing
196 * the pci function, for example.
197 *
198 * This can be called at any time and can be racing with any other RDS path.
199 */
131void rds_ib_remove_one(struct ib_device *device) 200void rds_ib_remove_one(struct ib_device *device)
132{ 201{
133 struct rds_ib_device *rds_ibdev; 202 struct rds_ib_device *rds_ibdev;
134 struct rds_ib_ipaddr *i_ipaddr, *i_next;
135 203
136 rds_ibdev = ib_get_client_data(device, &rds_ib_client); 204 rds_ibdev = ib_get_client_data(device, &rds_ib_client);
137 if (!rds_ibdev) 205 if (!rds_ibdev)
138 return; 206 return;
139 207
140 synchronize_rcu();
141 list_for_each_entry_safe(i_ipaddr, i_next, &rds_ibdev->ipaddr_list, list) {
142 list_del(&i_ipaddr->list);
143 kfree(i_ipaddr);
144 }
145
146 rds_ib_destroy_conns(rds_ibdev); 208 rds_ib_destroy_conns(rds_ibdev);
147 209
148 if (rds_ibdev->mr_pool) 210 /*
149 rds_ib_destroy_mr_pool(rds_ibdev->mr_pool); 211 * prevent future connection attempts from getting a reference to this
150 212 * device and wait for currently racing connection attempts to finish
151 ib_dereg_mr(rds_ibdev->mr); 213 * getting their reference
152 214 */
153 while (ib_dealloc_pd(rds_ibdev->pd)) { 215 ib_set_client_data(device, &rds_ib_client, NULL);
154 rdsdebug("Failed to dealloc pd %p\n", rds_ibdev->pd); 216 synchronize_rcu();
155 msleep(1); 217 rds_ib_dev_put(rds_ibdev);
156 }
157 218
158 list_del(&rds_ibdev->list); 219 list_del(&rds_ibdev->list);
159 kfree(rds_ibdev); 220 rds_ib_dev_put(rds_ibdev);
160} 221}
161 222
162struct ib_client rds_ib_client = { 223struct ib_client rds_ib_client = {
@@ -190,7 +251,7 @@ static int rds_ib_conn_info_visitor(struct rds_connection *conn,
190 rdma_addr_get_sgid(dev_addr, (union ib_gid *) &iinfo->src_gid); 251 rdma_addr_get_sgid(dev_addr, (union ib_gid *) &iinfo->src_gid);
191 rdma_addr_get_dgid(dev_addr, (union ib_gid *) &iinfo->dst_gid); 252 rdma_addr_get_dgid(dev_addr, (union ib_gid *) &iinfo->dst_gid);
192 253
193 rds_ibdev = ib_get_client_data(ic->i_cm_id->device, &rds_ib_client); 254 rds_ibdev = ic->rds_ibdev;
194 iinfo->max_send_wr = ic->i_send_ring.w_nr; 255 iinfo->max_send_wr = ic->i_send_ring.w_nr;
195 iinfo->max_recv_wr = ic->i_recv_ring.w_nr; 256 iinfo->max_recv_wr = ic->i_recv_ring.w_nr;
196 iinfo->max_send_sge = rds_ibdev->max_sge; 257 iinfo->max_send_sge = rds_ibdev->max_sge;