diff options
author | Gerd Rausch <gerd.rausch@oracle.com> | 2019-07-16 18:28:57 -0400
committer | David S. Miller <davem@davemloft.net> | 2019-07-17 15:06:51 -0400
commit | c9467447fc50ec3715d8ec98f4da874fce539235 (patch) | |
tree | 51d23641565aed0c1bda2b866a56212f6669ace6 | |
parent | 2c7da8e6b041a8df2661def81ac90c9c0c719909 (diff) |
net/rds: Get rid of "wait_clean_list_grace" and add locking
Waiting for activity on the "clean_list" to quiesce is no substitute
for proper locking.
We can have multiple threads competing for "llist_del_first"
via "rds_ib_reuse_mr", and a single thread competing
for "llist_del_all" and "llist_del_first" via "rds_ib_flush_mr_pool".
Since "llist_del_first" depends on "list->first->next" not to change
in the midst of the operation, simply waiting for all current calls
to "rds_ib_reuse_mr" to quiesce across all CPUs is woefully inadequate:
By the time "wait_clean_list_grace" is done iterating over all CPUs to see
that there is no concurrent caller to "rds_ib_reuse_mr", a new caller may
have just shown up on the first CPU.
Furthermore, <linux/llist.h> explicitly calls out the need for locking:
* Cases where locking is needed:
* If we have multiple consumers with llist_del_first used in one consumer,
* and llist_del_first or llist_del_all used in other consumers,
* then a lock is needed.
While at it, also drop the unused "pool" parameter
from "list_to_llist_nodes".
Signed-off-by: Gerd Rausch <gerd.rausch@oracle.com>
Acked-by: Santosh Shilimkar <santosh.shilimkar@oracle.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
-rw-r--r-- | net/rds/ib_mr.h | 1 | ||||
-rw-r--r-- | net/rds/ib_rdma.c | 56 |
2 files changed, 19 insertions, 38 deletions
diff --git a/net/rds/ib_mr.h b/net/rds/ib_mr.h index 42daccb7b5eb..ab26c20ed66f 100644 --- a/net/rds/ib_mr.h +++ b/net/rds/ib_mr.h | |||
@@ -98,6 +98,7 @@ struct rds_ib_mr_pool { | |||
98 | struct llist_head free_list; /* unused MRs */ | 98 | struct llist_head free_list; /* unused MRs */ |
99 | struct llist_head clean_list; /* unused & unmapped MRs */ | 99 | struct llist_head clean_list; /* unused & unmapped MRs */ |
100 | wait_queue_head_t flush_wait; | 100 | wait_queue_head_t flush_wait; |
101 | spinlock_t clean_lock; /* "clean_list" concurrency */ | ||
101 | 102 | ||
102 | atomic_t free_pinned; /* memory pinned by free MRs */ | 103 | atomic_t free_pinned; /* memory pinned by free MRs */ |
103 | unsigned long max_items; | 104 | unsigned long max_items; |
diff --git a/net/rds/ib_rdma.c b/net/rds/ib_rdma.c index 0b347f46b2f4..6b047e63a769 100644 --- a/net/rds/ib_rdma.c +++ b/net/rds/ib_rdma.c | |||
@@ -40,9 +40,6 @@ | |||
40 | 40 | ||
41 | struct workqueue_struct *rds_ib_mr_wq; | 41 | struct workqueue_struct *rds_ib_mr_wq; |
42 | 42 | ||
43 | static DEFINE_PER_CPU(unsigned long, clean_list_grace); | ||
44 | #define CLEAN_LIST_BUSY_BIT 0 | ||
45 | |||
46 | static struct rds_ib_device *rds_ib_get_device(__be32 ipaddr) | 43 | static struct rds_ib_device *rds_ib_get_device(__be32 ipaddr) |
47 | { | 44 | { |
48 | struct rds_ib_device *rds_ibdev; | 45 | struct rds_ib_device *rds_ibdev; |
@@ -195,12 +192,11 @@ struct rds_ib_mr *rds_ib_reuse_mr(struct rds_ib_mr_pool *pool) | |||
195 | { | 192 | { |
196 | struct rds_ib_mr *ibmr = NULL; | 193 | struct rds_ib_mr *ibmr = NULL; |
197 | struct llist_node *ret; | 194 | struct llist_node *ret; |
198 | unsigned long *flag; | 195 | unsigned long flags; |
199 | 196 | ||
200 | preempt_disable(); | 197 | spin_lock_irqsave(&pool->clean_lock, flags); |
201 | flag = this_cpu_ptr(&clean_list_grace); | ||
202 | set_bit(CLEAN_LIST_BUSY_BIT, flag); | ||
203 | ret = llist_del_first(&pool->clean_list); | 198 | ret = llist_del_first(&pool->clean_list); |
199 | spin_unlock_irqrestore(&pool->clean_lock, flags); | ||
204 | if (ret) { | 200 | if (ret) { |
205 | ibmr = llist_entry(ret, struct rds_ib_mr, llnode); | 201 | ibmr = llist_entry(ret, struct rds_ib_mr, llnode); |
206 | if (pool->pool_type == RDS_IB_MR_8K_POOL) | 202 | if (pool->pool_type == RDS_IB_MR_8K_POOL) |
@@ -209,23 +205,9 @@ struct rds_ib_mr *rds_ib_reuse_mr(struct rds_ib_mr_pool *pool) | |||
209 | rds_ib_stats_inc(s_ib_rdma_mr_1m_reused); | 205 | rds_ib_stats_inc(s_ib_rdma_mr_1m_reused); |
210 | } | 206 | } |
211 | 207 | ||
212 | clear_bit(CLEAN_LIST_BUSY_BIT, flag); | ||
213 | preempt_enable(); | ||
214 | return ibmr; | 208 | return ibmr; |
215 | } | 209 | } |
216 | 210 | ||
217 | static inline void wait_clean_list_grace(void) | ||
218 | { | ||
219 | int cpu; | ||
220 | unsigned long *flag; | ||
221 | |||
222 | for_each_online_cpu(cpu) { | ||
223 | flag = &per_cpu(clean_list_grace, cpu); | ||
224 | while (test_bit(CLEAN_LIST_BUSY_BIT, flag)) | ||
225 | cpu_relax(); | ||
226 | } | ||
227 | } | ||
228 | |||
229 | void rds_ib_sync_mr(void *trans_private, int direction) | 211 | void rds_ib_sync_mr(void *trans_private, int direction) |
230 | { | 212 | { |
231 | struct rds_ib_mr *ibmr = trans_private; | 213 | struct rds_ib_mr *ibmr = trans_private; |
@@ -324,8 +306,7 @@ static unsigned int llist_append_to_list(struct llist_head *llist, | |||
324 | * of clusters. Each cluster has linked llist nodes of | 306 | * of clusters. Each cluster has linked llist nodes of |
325 | * MR_CLUSTER_SIZE mrs that are ready for reuse. | 307 | * MR_CLUSTER_SIZE mrs that are ready for reuse. |
326 | */ | 308 | */ |
327 | static void list_to_llist_nodes(struct rds_ib_mr_pool *pool, | 309 | static void list_to_llist_nodes(struct list_head *list, |
328 | struct list_head *list, | ||
329 | struct llist_node **nodes_head, | 310 | struct llist_node **nodes_head, |
330 | struct llist_node **nodes_tail) | 311 | struct llist_node **nodes_tail) |
331 | { | 312 | { |
@@ -402,8 +383,13 @@ int rds_ib_flush_mr_pool(struct rds_ib_mr_pool *pool, | |||
402 | */ | 383 | */ |
403 | dirty_to_clean = llist_append_to_list(&pool->drop_list, &unmap_list); | 384 | dirty_to_clean = llist_append_to_list(&pool->drop_list, &unmap_list); |
404 | dirty_to_clean += llist_append_to_list(&pool->free_list, &unmap_list); | 385 | dirty_to_clean += llist_append_to_list(&pool->free_list, &unmap_list); |
405 | if (free_all) | 386 | if (free_all) { |
387 | unsigned long flags; | ||
388 | |||
389 | spin_lock_irqsave(&pool->clean_lock, flags); | ||
406 | llist_append_to_list(&pool->clean_list, &unmap_list); | 390 | llist_append_to_list(&pool->clean_list, &unmap_list); |
391 | spin_unlock_irqrestore(&pool->clean_lock, flags); | ||
392 | } | ||
407 | 393 | ||
408 | free_goal = rds_ib_flush_goal(pool, free_all); | 394 | free_goal = rds_ib_flush_goal(pool, free_all); |
409 | 395 | ||
@@ -416,27 +402,20 @@ int rds_ib_flush_mr_pool(struct rds_ib_mr_pool *pool, | |||
416 | rds_ib_unreg_fmr(&unmap_list, &nfreed, &unpinned, free_goal); | 402 | rds_ib_unreg_fmr(&unmap_list, &nfreed, &unpinned, free_goal); |
417 | 403 | ||
418 | if (!list_empty(&unmap_list)) { | 404 | if (!list_empty(&unmap_list)) { |
419 | /* we have to make sure that none of the things we're about | 405 | unsigned long flags; |
420 | * to put on the clean list would race with other cpus trying | 406 | |
421 | * to pull items off. The llist would explode if we managed to | 407 | list_to_llist_nodes(&unmap_list, &clean_nodes, &clean_tail); |
422 | * remove something from the clean list and then add it back again | ||
423 | * while another CPU was spinning on that same item in llist_del_first. | ||
424 | * | ||
425 | * This is pretty unlikely, but just in case wait for an llist grace period | ||
426 | * here before adding anything back into the clean list. | ||
427 | */ | ||
428 | wait_clean_list_grace(); | ||
429 | |||
430 | list_to_llist_nodes(pool, &unmap_list, &clean_nodes, &clean_tail); | ||
431 | if (ibmr_ret) { | 408 | if (ibmr_ret) { |
432 | *ibmr_ret = llist_entry(clean_nodes, struct rds_ib_mr, llnode); | 409 | *ibmr_ret = llist_entry(clean_nodes, struct rds_ib_mr, llnode); |
433 | clean_nodes = clean_nodes->next; | 410 | clean_nodes = clean_nodes->next; |
434 | } | 411 | } |
435 | /* more than one entry in llist nodes */ | 412 | /* more than one entry in llist nodes */ |
436 | if (clean_nodes) | 413 | if (clean_nodes) { |
414 | spin_lock_irqsave(&pool->clean_lock, flags); | ||
437 | llist_add_batch(clean_nodes, clean_tail, | 415 | llist_add_batch(clean_nodes, clean_tail, |
438 | &pool->clean_list); | 416 | &pool->clean_list); |
439 | 417 | spin_unlock_irqrestore(&pool->clean_lock, flags); | |
418 | } | ||
440 | } | 419 | } |
441 | 420 | ||
442 | atomic_sub(unpinned, &pool->free_pinned); | 421 | atomic_sub(unpinned, &pool->free_pinned); |
@@ -610,6 +589,7 @@ struct rds_ib_mr_pool *rds_ib_create_mr_pool(struct rds_ib_device *rds_ibdev, | |||
610 | init_llist_head(&pool->free_list); | 589 | init_llist_head(&pool->free_list); |
611 | init_llist_head(&pool->drop_list); | 590 | init_llist_head(&pool->drop_list); |
612 | init_llist_head(&pool->clean_list); | 591 | init_llist_head(&pool->clean_list); |
592 | spin_lock_init(&pool->clean_lock); | ||
613 | mutex_init(&pool->flush_lock); | 593 | mutex_init(&pool->flush_lock); |
614 | init_waitqueue_head(&pool->flush_wait); | 594 | init_waitqueue_head(&pool->flush_wait); |
615 | INIT_DELAYED_WORK(&pool->flush_worker, rds_ib_mr_pool_flush_worker); | 595 | INIT_DELAYED_WORK(&pool->flush_worker, rds_ib_mr_pool_flush_worker); |