aboutsummaryrefslogtreecommitdiffstats
path: root/net/rds/ib.c
diff options
context:
space:
mode:
authorZach Brown <zach.brown@oracle.com>2010-05-18 18:48:51 -0400
committerAndy Grover <andy.grover@oracle.com>2010-09-08 21:15:17 -0400
commit3e0249f9c05cb77b66f7f09644ca9ca208d991a9 (patch)
tree2f8aaddb51590d36a54b8f7044224073107d77d9 /net/rds/ib.c
parent89bf9d4158b5a1b6bd00960eb2e47601ec8cc138 (diff)
RDS/IB: add refcount tracking to struct rds_ib_device
The RDS IB client .remove callback used to free the rds_ibdev for the given device unconditionally. This could race with other users of the struct. This patch adds refcounting so that we only free the rds_ibdev once all of its users are done.

Many rds_ibdev users are tied to connections. We give the connection a reference and change these users to reference the device in the connection instead of looking it up in the IB client data.

The only user of the IB client data remaining is the first lookup of the device as connections are built up. Incrementing the reference count of a device found in the IB client data could race with final freeing, so we use an RCU grace period to make sure that freeing won't happen until those lookups are done.

MRs need the rds_ibdev to get at the pool that they're freed into. They exist outside a connection and many MRs can reference different devices from one socket, so it was natural to have each MR hold a reference.

MR refs can be dropped from interrupt handlers and final device teardown can block, so we push it off to a work struct. Pool teardown had to be fixed to cancel its pending work instead of deadlocking waiting for all queued work, including itself, to finish.

MRs get their reference from the global device list, which gets a reference. The list is left unprotected by locks and remains racy. A simple global lock would be a significant bottleneck. More scalable (complicated) locking should be done carefully in a later patch.

Signed-off-by: Zach Brown <zach.brown@oracle.com>
Diffstat (limited to 'net/rds/ib.c')
-rw-r--r--net/rds/ib.c129
1 files changed, 95 insertions, 34 deletions
diff --git a/net/rds/ib.c b/net/rds/ib.c
index 7d289d7985fe..1732f8effb59 100644
--- a/net/rds/ib.c
+++ b/net/rds/ib.c
@@ -59,6 +59,38 @@ struct list_head rds_ib_devices;
59DEFINE_SPINLOCK(ib_nodev_conns_lock); 59DEFINE_SPINLOCK(ib_nodev_conns_lock);
60LIST_HEAD(ib_nodev_conns); 60LIST_HEAD(ib_nodev_conns);
61 61
62/*
63 * rds_ib_destroy_mr_pool() blocks on a few things and mrs drop references
64 * from interrupt context so we push freeing off into a work struct in krdsd.
65 */
66static void rds_ib_dev_free(struct work_struct *work)
67{
68 struct rds_ib_ipaddr *i_ipaddr, *i_next;
 /* Recover the device that embeds this work item as its free_work member. */
69 struct rds_ib_device *rds_ibdev = container_of(work,
70 struct rds_ib_device, free_work);
71
 /*
  * Each IB resource is released only when its pointer is non-NULL, so a
  * device whose setup stopped part-way (pointer left NULL) can still be
  * torn down through this one path.
  */
72 if (rds_ibdev->mr_pool)
73 rds_ib_destroy_mr_pool(rds_ibdev->mr_pool);
74 if (rds_ibdev->mr)
75 ib_dereg_mr(rds_ibdev->mr);
76 if (rds_ibdev->pd)
77 ib_dealloc_pd(rds_ibdev->pd);
78
 /*
  * Unlink and free every address entry still on the device's list.
  * NOTE(review): this walk needs ipaddr_list to be an initialized list
  * head. In the rds_ib_add_one() hunk of this diff the error paths jump
  * to put_dev before INIT_LIST_HEAD(&rds_ibdev->ipaddr_list) has run on
  * the kzalloc'd struct -- confirm that case cannot reach this loop.
  */
79 list_for_each_entry_safe(i_ipaddr, i_next, &rds_ibdev->ipaddr_list, list) {
80 list_del(&i_ipaddr->list);
81 kfree(i_ipaddr);
82 }
83
 /* Finally release the device structure itself. */
84 kfree(rds_ibdev);
85}
86
/*
 * Drop one reference on rds_ibdev. When the count reaches zero the
 * device's free_work item is queued on rds_wq, so the teardown runs from
 * the work item rather than in the caller's context.
 */
87void rds_ib_dev_put(struct rds_ib_device *rds_ibdev)
88{
 /* Trap a put on a device whose count is already zero or negative. */
89 BUG_ON(atomic_read(&rds_ibdev->refcount) <= 0);
 /* Only the caller that drops the final reference queues the free. */
90 if (atomic_dec_and_test(&rds_ibdev->refcount))
91 queue_work(rds_wq, &rds_ibdev->free_work);
92}
93
62void rds_ib_add_one(struct ib_device *device) 94void rds_ib_add_one(struct ib_device *device)
63{ 95{
64 struct rds_ib_device *rds_ibdev; 96 struct rds_ib_device *rds_ibdev;
@@ -77,11 +109,14 @@ void rds_ib_add_one(struct ib_device *device)
77 goto free_attr; 109 goto free_attr;
78 } 110 }
79 111
80 rds_ibdev = kmalloc_node(sizeof *rds_ibdev, GFP_KERNEL, ibdev_to_node(device)); 112 rds_ibdev = kzalloc_node(sizeof(struct rds_ib_device), GFP_KERNEL,
113 ibdev_to_node(device));
81 if (!rds_ibdev) 114 if (!rds_ibdev)
82 goto free_attr; 115 goto free_attr;
83 116
84 spin_lock_init(&rds_ibdev->spinlock); 117 spin_lock_init(&rds_ibdev->spinlock);
118 atomic_set(&rds_ibdev->refcount, 1);
119 INIT_WORK(&rds_ibdev->free_work, rds_ib_dev_free);
85 120
86 rds_ibdev->max_wrs = dev_attr->max_qp_wr; 121 rds_ibdev->max_wrs = dev_attr->max_qp_wr;
87 rds_ibdev->max_sge = min(dev_attr->max_sge, RDS_IB_MAX_SGE); 122 rds_ibdev->max_sge = min(dev_attr->max_sge, RDS_IB_MAX_SGE);
@@ -96,67 +131,93 @@ void rds_ib_add_one(struct ib_device *device)
96 131
97 rds_ibdev->dev = device; 132 rds_ibdev->dev = device;
98 rds_ibdev->pd = ib_alloc_pd(device); 133 rds_ibdev->pd = ib_alloc_pd(device);
99 if (IS_ERR(rds_ibdev->pd)) 134 if (IS_ERR(rds_ibdev->pd)) {
100 goto free_dev; 135 rds_ibdev->pd = NULL;
136 goto put_dev;
137 }
101 138
102 rds_ibdev->mr = ib_get_dma_mr(rds_ibdev->pd, 139 rds_ibdev->mr = ib_get_dma_mr(rds_ibdev->pd, IB_ACCESS_LOCAL_WRITE);
103 IB_ACCESS_LOCAL_WRITE); 140 if (IS_ERR(rds_ibdev->mr)) {
104 if (IS_ERR(rds_ibdev->mr)) 141 rds_ibdev->mr = NULL;
105 goto err_pd; 142 goto put_dev;
143 }
106 144
107 rds_ibdev->mr_pool = rds_ib_create_mr_pool(rds_ibdev); 145 rds_ibdev->mr_pool = rds_ib_create_mr_pool(rds_ibdev);
108 if (IS_ERR(rds_ibdev->mr_pool)) { 146 if (IS_ERR(rds_ibdev->mr_pool)) {
109 rds_ibdev->mr_pool = NULL; 147 rds_ibdev->mr_pool = NULL;
110 goto err_mr; 148 goto put_dev;
111 } 149 }
112 150
113 INIT_LIST_HEAD(&rds_ibdev->ipaddr_list); 151 INIT_LIST_HEAD(&rds_ibdev->ipaddr_list);
114 INIT_LIST_HEAD(&rds_ibdev->conn_list); 152 INIT_LIST_HEAD(&rds_ibdev->conn_list);
115 list_add_tail(&rds_ibdev->list, &rds_ib_devices); 153 list_add_tail(&rds_ibdev->list, &rds_ib_devices);
154 atomic_inc(&rds_ibdev->refcount);
116 155
117 ib_set_client_data(device, &rds_ib_client, rds_ibdev); 156 ib_set_client_data(device, &rds_ib_client, rds_ibdev);
157 atomic_inc(&rds_ibdev->refcount);
118 158
119 goto free_attr; 159put_dev:
120 160 rds_ib_dev_put(rds_ibdev);
121err_mr:
122 ib_dereg_mr(rds_ibdev->mr);
123err_pd:
124 ib_dealloc_pd(rds_ibdev->pd);
125free_dev:
126 kfree(rds_ibdev);
127free_attr: 161free_attr:
128 kfree(dev_attr); 162 kfree(dev_attr);
129} 163}
130 164
165/*
166 * New connections use this to find the device to associate with the
167 * connection. It's not in the fast path so we're not concerned about the
168 * performance of the IB call. (As of this writing, it uses an interrupt
169 * blocking spinlock to serialize walking a per-device list of all registered
170 * clients.)
171 *
172 * RCU is used to handle incoming connections racing with device teardown.
173 * Rather than use a lock to serialize removal from the client_data and
174 * getting a new reference, we use an RCU grace period. The destruction
175 * path removes the device from client_data and then waits for all RCU
176 * readers to finish.
177 *
178 * A new connection can get NULL from this if it's arriving on a
179 * device that is in the process of being removed.
180 */
181struct rds_ib_device *rds_ib_get_client_data(struct ib_device *device)
182{
183 struct rds_ib_device *rds_ibdev;
184
 /*
  * The lookup and the refcount increment sit inside one RCU read-side
  * section. rds_ib_remove_one() clears the client data and then calls
  * synchronize_rcu() before it drops its own reference.
  */
185 rcu_read_lock();
186 rds_ibdev = ib_get_client_data(device, &rds_ib_client);
 /* Take a reference only if client data is still set for this device. */
187 if (rds_ibdev)
188 atomic_inc(&rds_ibdev->refcount);
189 rcu_read_unlock();
 /*
  * NULL when no client data was found; otherwise the returned device
  * carries the reference taken above (presumably released by the caller
  * with rds_ib_dev_put() -- verify against callers).
  */
190 return rds_ibdev;
191}
192
193/*
194 * The IB stack is letting us know that a device is going away. This can
195 * happen if the underlying HCA driver is removed or if PCI hotplug is removing
196 * the pci function, for example.
197 *
198 * This can be called at any time and can be racing with any other RDS path.
199 */
131void rds_ib_remove_one(struct ib_device *device) 200void rds_ib_remove_one(struct ib_device *device)
132{ 201{
133 struct rds_ib_device *rds_ibdev; 202 struct rds_ib_device *rds_ibdev;
134 struct rds_ib_ipaddr *i_ipaddr, *i_next;
135 203
136 rds_ibdev = ib_get_client_data(device, &rds_ib_client); 204 rds_ibdev = ib_get_client_data(device, &rds_ib_client);
137 if (!rds_ibdev) 205 if (!rds_ibdev)
138 return; 206 return;
139 207
140 synchronize_rcu();
141 list_for_each_entry_safe(i_ipaddr, i_next, &rds_ibdev->ipaddr_list, list) {
142 list_del(&i_ipaddr->list);
143 kfree(i_ipaddr);
144 }
145
146 rds_ib_destroy_conns(rds_ibdev); 208 rds_ib_destroy_conns(rds_ibdev);
147 209
148 if (rds_ibdev->mr_pool) 210 /*
149 rds_ib_destroy_mr_pool(rds_ibdev->mr_pool); 211 * prevent future connection attempts from getting a reference to this
150 212 * device and wait for currently racing connection attempts to finish
151 ib_dereg_mr(rds_ibdev->mr); 213 * getting their reference
152 214 */
153 while (ib_dealloc_pd(rds_ibdev->pd)) { 215 ib_set_client_data(device, &rds_ib_client, NULL);
154 rdsdebug("Failed to dealloc pd %p\n", rds_ibdev->pd); 216 synchronize_rcu();
155 msleep(1); 217 rds_ib_dev_put(rds_ibdev);
156 }
157 218
158 list_del(&rds_ibdev->list); 219 list_del(&rds_ibdev->list);
159 kfree(rds_ibdev); 220 rds_ib_dev_put(rds_ibdev);
160} 221}
161 222
162struct ib_client rds_ib_client = { 223struct ib_client rds_ib_client = {
@@ -190,7 +251,7 @@ static int rds_ib_conn_info_visitor(struct rds_connection *conn,
190 rdma_addr_get_sgid(dev_addr, (union ib_gid *) &iinfo->src_gid); 251 rdma_addr_get_sgid(dev_addr, (union ib_gid *) &iinfo->src_gid);
191 rdma_addr_get_dgid(dev_addr, (union ib_gid *) &iinfo->dst_gid); 252 rdma_addr_get_dgid(dev_addr, (union ib_gid *) &iinfo->dst_gid);
192 253
193 rds_ibdev = ib_get_client_data(ic->i_cm_id->device, &rds_ib_client); 254 rds_ibdev = ic->rds_ibdev;
194 iinfo->max_send_wr = ic->i_send_ring.w_nr; 255 iinfo->max_send_wr = ic->i_send_ring.w_nr;
195 iinfo->max_recv_wr = ic->i_recv_ring.w_nr; 256 iinfo->max_recv_wr = ic->i_recv_ring.w_nr;
196 iinfo->max_send_sge = rds_ibdev->max_sge; 257 iinfo->max_send_sge = rds_ibdev->max_sge;