aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorGerd Rausch <gerd.rausch@oracle.com>2019-07-16 18:28:57 -0400
committerDavid S. Miller <davem@davemloft.net>2019-07-17 15:06:51 -0400
commitc9467447fc50ec3715d8ec98f4da874fce539235 (patch)
tree51d23641565aed0c1bda2b866a56212f6669ace6
parent2c7da8e6b041a8df2661def81ac90c9c0c719909 (diff)
net/rds: Get rid of "wait_clean_list_grace" and add locking
Waiting for activity on the "clean_list" to quiesce is no substitute
for proper locking.

We can have multiple threads competing for "llist_del_first"
via "rds_ib_reuse_mr", and a single thread competing
for "llist_del_all" and "llist_del_first" via "rds_ib_flush_mr_pool".

Since "llist_del_first" depends on "list->first->next" not to change
in the midst of the operation, simply waiting for all current calls
to "rds_ib_reuse_mr" to quiesce across all CPUs is woefully inadequate:

By the time "wait_clean_list_grace" is done iterating over all CPUs
to see that there is no concurrent caller to "rds_ib_reuse_mr",
a new caller may have just shown up on the first CPU.

Furthermore, <linux/llist.h> explicitly calls out the need for locking:

 * Cases where locking is needed:
 * If we have multiple consumers with llist_del_first used in one consumer,
 * and llist_del_first or llist_del_all used in other consumers,
 * then a lock is needed.

Also, while at it, drop the unused "pool" parameter
from "list_to_llist_nodes".

Signed-off-by: Gerd Rausch <gerd.rausch@oracle.com>
Acked-by: Santosh Shilimkar <santosh.shilimkar@oracle.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
-rw-r--r--net/rds/ib_mr.h1
-rw-r--r--net/rds/ib_rdma.c56
2 files changed, 19 insertions, 38 deletions
diff --git a/net/rds/ib_mr.h b/net/rds/ib_mr.h
index 42daccb7b5eb..ab26c20ed66f 100644
--- a/net/rds/ib_mr.h
+++ b/net/rds/ib_mr.h
@@ -98,6 +98,7 @@ struct rds_ib_mr_pool {
 	struct llist_head	free_list;	/* unused MRs */
 	struct llist_head	clean_list;	/* unused & unmapped MRs */
 	wait_queue_head_t	flush_wait;
+	spinlock_t		clean_lock;	/* "clean_list" concurrency */
 
 	atomic_t		free_pinned;	/* memory pinned by free MRs */
 	unsigned long		max_items;
diff --git a/net/rds/ib_rdma.c b/net/rds/ib_rdma.c
index 0b347f46b2f4..6b047e63a769 100644
--- a/net/rds/ib_rdma.c
+++ b/net/rds/ib_rdma.c
@@ -40,9 +40,6 @@
 
 struct workqueue_struct *rds_ib_mr_wq;
 
-static DEFINE_PER_CPU(unsigned long, clean_list_grace);
-#define CLEAN_LIST_BUSY_BIT 0
-
 static struct rds_ib_device *rds_ib_get_device(__be32 ipaddr)
 {
 	struct rds_ib_device *rds_ibdev;
@@ -195,12 +192,11 @@ struct rds_ib_mr *rds_ib_reuse_mr(struct rds_ib_mr_pool *pool)
 {
 	struct rds_ib_mr *ibmr = NULL;
 	struct llist_node *ret;
-	unsigned long *flag;
+	unsigned long flags;
 
-	preempt_disable();
-	flag = this_cpu_ptr(&clean_list_grace);
-	set_bit(CLEAN_LIST_BUSY_BIT, flag);
+	spin_lock_irqsave(&pool->clean_lock, flags);
 	ret = llist_del_first(&pool->clean_list);
+	spin_unlock_irqrestore(&pool->clean_lock, flags);
 	if (ret) {
 		ibmr = llist_entry(ret, struct rds_ib_mr, llnode);
 		if (pool->pool_type == RDS_IB_MR_8K_POOL)
@@ -209,23 +205,9 @@ struct rds_ib_mr *rds_ib_reuse_mr(struct rds_ib_mr_pool *pool)
 			rds_ib_stats_inc(s_ib_rdma_mr_1m_reused);
 	}
 
-	clear_bit(CLEAN_LIST_BUSY_BIT, flag);
-	preempt_enable();
 	return ibmr;
 }
 
-static inline void wait_clean_list_grace(void)
-{
-	int cpu;
-	unsigned long *flag;
-
-	for_each_online_cpu(cpu) {
-		flag = &per_cpu(clean_list_grace, cpu);
-		while (test_bit(CLEAN_LIST_BUSY_BIT, flag))
-			cpu_relax();
-	}
-}
-
 void rds_ib_sync_mr(void *trans_private, int direction)
 {
 	struct rds_ib_mr *ibmr = trans_private;
@@ -324,8 +306,7 @@ static unsigned int llist_append_to_list(struct llist_head *llist,
  * of clusters. Each cluster has linked llist nodes of
  * MR_CLUSTER_SIZE mrs that are ready for reuse.
  */
-static void list_to_llist_nodes(struct rds_ib_mr_pool *pool,
-				struct list_head *list,
+static void list_to_llist_nodes(struct list_head *list,
 				struct llist_node **nodes_head,
 				struct llist_node **nodes_tail)
 {
@@ -402,8 +383,13 @@ int rds_ib_flush_mr_pool(struct rds_ib_mr_pool *pool,
 	 */
 	dirty_to_clean = llist_append_to_list(&pool->drop_list, &unmap_list);
 	dirty_to_clean += llist_append_to_list(&pool->free_list, &unmap_list);
-	if (free_all)
+	if (free_all) {
+		unsigned long flags;
+
+		spin_lock_irqsave(&pool->clean_lock, flags);
 		llist_append_to_list(&pool->clean_list, &unmap_list);
+		spin_unlock_irqrestore(&pool->clean_lock, flags);
+	}
 
 	free_goal = rds_ib_flush_goal(pool, free_all);
 
@@ -416,27 +402,20 @@ int rds_ib_flush_mr_pool(struct rds_ib_mr_pool *pool,
 		rds_ib_unreg_fmr(&unmap_list, &nfreed, &unpinned, free_goal);
 
 	if (!list_empty(&unmap_list)) {
-		/* we have to make sure that none of the things we're about
-		 * to put on the clean list would race with other cpus trying
-		 * to pull items off. The llist would explode if we managed to
-		 * remove something from the clean list and then add it back again
-		 * while another CPU was spinning on that same item in llist_del_first.
-		 *
-		 * This is pretty unlikely, but just in case wait for an llist grace period
-		 * here before adding anything back into the clean list.
-		 */
-		wait_clean_list_grace();
-
-		list_to_llist_nodes(pool, &unmap_list, &clean_nodes, &clean_tail);
+		unsigned long flags;
+
+		list_to_llist_nodes(&unmap_list, &clean_nodes, &clean_tail);
 		if (ibmr_ret) {
 			*ibmr_ret = llist_entry(clean_nodes, struct rds_ib_mr, llnode);
 			clean_nodes = clean_nodes->next;
 		}
 		/* more than one entry in llist nodes */
-		if (clean_nodes)
+		if (clean_nodes) {
+			spin_lock_irqsave(&pool->clean_lock, flags);
 			llist_add_batch(clean_nodes, clean_tail,
 					&pool->clean_list);
-
+			spin_unlock_irqrestore(&pool->clean_lock, flags);
+		}
 	}
 
 	atomic_sub(unpinned, &pool->free_pinned);
@@ -610,6 +589,7 @@ struct rds_ib_mr_pool *rds_ib_create_mr_pool(struct rds_ib_device *rds_ibdev,
 	init_llist_head(&pool->free_list);
 	init_llist_head(&pool->drop_list);
 	init_llist_head(&pool->clean_list);
+	spin_lock_init(&pool->clean_lock);
 	mutex_init(&pool->flush_lock);
 	init_waitqueue_head(&pool->flush_wait);
 	INIT_DELAYED_WORK(&pool->flush_worker, rds_ib_mr_pool_flush_worker);