aboutsummaryrefslogtreecommitdiffstats
path: root/drivers
diff options
context:
space:
mode:
authorMichael S. Tsirkin <mst@redhat.com>2014-02-13 04:42:05 -0500
committerDavid S. Miller <davem@davemloft.net>2014-02-13 18:47:30 -0500
commit0ad8b480d6ee916aa84324f69acf690142aecd0e (patch)
tree0822d50ade14f1ab57ec2cd4fa878611f3ba752f /drivers
parent208ece14e23b167b43d906c74e3bdcbee8d6e4aa (diff)
vhost: fix ref cnt checking deadlock
vhost checked the counter within the refcnt before decrementing. It really wanted to know that it is the one that has the last reference, as a way to batch freeing resources a bit more efficiently. Note: we only let refcount go to 0 on device release. This works well but we now access the ref counter twice so there's a race: all users might see a high count and decide to defer freeing resources. In the end no one initiates freeing resources until the last reference is gone (which is on VM shotdown so might happen after a looooong time). Let's do what we probably should have done straight away: switch from kref to plain atomic, documenting the semantics, return the refcount value atomically after decrement, then use that to avoid the deadlock. Reported-by: Qin Chuanyu <qinchuanyu@huawei.com> Signed-off-by: Michael S. Tsirkin <mst@redhat.com> Acked-by: Jason Wang <jasowang@redhat.com> Signed-off-by: David S. Miller <davem@davemloft.net>
Diffstat (limited to 'drivers')
-rw-r--r--drivers/vhost/net.c41
1 files changed, 20 insertions, 21 deletions
diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index 9a68409580d5..41be4de37e81 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -70,7 +70,12 @@ enum {
70}; 70};
71 71
72struct vhost_net_ubuf_ref { 72struct vhost_net_ubuf_ref {
73 struct kref kref; 73 /* refcount follows semantics similar to kref:
74 * 0: object is released
75 * 1: no outstanding ubufs
76 * >1: outstanding ubufs
77 */
78 atomic_t refcount;
74 wait_queue_head_t wait; 79 wait_queue_head_t wait;
75 struct vhost_virtqueue *vq; 80 struct vhost_virtqueue *vq;
76}; 81};
@@ -116,14 +121,6 @@ static void vhost_net_enable_zcopy(int vq)
116 vhost_net_zcopy_mask |= 0x1 << vq; 121 vhost_net_zcopy_mask |= 0x1 << vq;
117} 122}
118 123
119static void vhost_net_zerocopy_done_signal(struct kref *kref)
120{
121 struct vhost_net_ubuf_ref *ubufs;
122
123 ubufs = container_of(kref, struct vhost_net_ubuf_ref, kref);
124 wake_up(&ubufs->wait);
125}
126
127static struct vhost_net_ubuf_ref * 124static struct vhost_net_ubuf_ref *
128vhost_net_ubuf_alloc(struct vhost_virtqueue *vq, bool zcopy) 125vhost_net_ubuf_alloc(struct vhost_virtqueue *vq, bool zcopy)
129{ 126{
@@ -134,21 +131,24 @@ vhost_net_ubuf_alloc(struct vhost_virtqueue *vq, bool zcopy)
134 ubufs = kmalloc(sizeof(*ubufs), GFP_KERNEL); 131 ubufs = kmalloc(sizeof(*ubufs), GFP_KERNEL);
135 if (!ubufs) 132 if (!ubufs)
136 return ERR_PTR(-ENOMEM); 133 return ERR_PTR(-ENOMEM);
137 kref_init(&ubufs->kref); 134 atomic_set(&ubufs->refcount, 1);
138 init_waitqueue_head(&ubufs->wait); 135 init_waitqueue_head(&ubufs->wait);
139 ubufs->vq = vq; 136 ubufs->vq = vq;
140 return ubufs; 137 return ubufs;
141} 138}
142 139
143static void vhost_net_ubuf_put(struct vhost_net_ubuf_ref *ubufs) 140static int vhost_net_ubuf_put(struct vhost_net_ubuf_ref *ubufs)
144{ 141{
145 kref_put(&ubufs->kref, vhost_net_zerocopy_done_signal); 142 int r = atomic_sub_return(1, &ubufs->refcount);
143 if (unlikely(!r))
144 wake_up(&ubufs->wait);
145 return r;
146} 146}
147 147
148static void vhost_net_ubuf_put_and_wait(struct vhost_net_ubuf_ref *ubufs) 148static void vhost_net_ubuf_put_and_wait(struct vhost_net_ubuf_ref *ubufs)
149{ 149{
150 kref_put(&ubufs->kref, vhost_net_zerocopy_done_signal); 150 vhost_net_ubuf_put(ubufs);
151 wait_event(ubufs->wait, !atomic_read(&ubufs->kref.refcount)); 151 wait_event(ubufs->wait, !atomic_read(&ubufs->refcount));
152} 152}
153 153
154static void vhost_net_ubuf_put_wait_and_free(struct vhost_net_ubuf_ref *ubufs) 154static void vhost_net_ubuf_put_wait_and_free(struct vhost_net_ubuf_ref *ubufs)
@@ -306,22 +306,21 @@ static void vhost_zerocopy_callback(struct ubuf_info *ubuf, bool success)
306{ 306{
307 struct vhost_net_ubuf_ref *ubufs = ubuf->ctx; 307 struct vhost_net_ubuf_ref *ubufs = ubuf->ctx;
308 struct vhost_virtqueue *vq = ubufs->vq; 308 struct vhost_virtqueue *vq = ubufs->vq;
309 int cnt = atomic_read(&ubufs->kref.refcount); 309 int cnt;
310 310
311 /* set len to mark this desc buffers done DMA */ 311 /* set len to mark this desc buffers done DMA */
312 vq->heads[ubuf->desc].len = success ? 312 vq->heads[ubuf->desc].len = success ?
313 VHOST_DMA_DONE_LEN : VHOST_DMA_FAILED_LEN; 313 VHOST_DMA_DONE_LEN : VHOST_DMA_FAILED_LEN;
314 vhost_net_ubuf_put(ubufs); 314 cnt = vhost_net_ubuf_put(ubufs);
315 315
316 /* 316 /*
317 * Trigger polling thread if guest stopped submitting new buffers: 317 * Trigger polling thread if guest stopped submitting new buffers:
318 * in this case, the refcount after decrement will eventually reach 1 318 * in this case, the refcount after decrement will eventually reach 1.
319 * so here it is 2.
320 * We also trigger polling periodically after each 16 packets 319 * We also trigger polling periodically after each 16 packets
321 * (the value 16 here is more or less arbitrary, it's tuned to trigger 320 * (the value 16 here is more or less arbitrary, it's tuned to trigger
322 * less than 10% of times). 321 * less than 10% of times).
323 */ 322 */
324 if (cnt <= 2 || !(cnt % 16)) 323 if (cnt <= 1 || !(cnt % 16))
325 vhost_poll_queue(&vq->poll); 324 vhost_poll_queue(&vq->poll);
326} 325}
327 326
@@ -420,7 +419,7 @@ static void handle_tx(struct vhost_net *net)
420 msg.msg_control = ubuf; 419 msg.msg_control = ubuf;
421 msg.msg_controllen = sizeof(ubuf); 420 msg.msg_controllen = sizeof(ubuf);
422 ubufs = nvq->ubufs; 421 ubufs = nvq->ubufs;
423 kref_get(&ubufs->kref); 422 atomic_inc(&ubufs->refcount);
424 nvq->upend_idx = (nvq->upend_idx + 1) % UIO_MAXIOV; 423 nvq->upend_idx = (nvq->upend_idx + 1) % UIO_MAXIOV;
425 } else { 424 } else {
426 msg.msg_control = NULL; 425 msg.msg_control = NULL;
@@ -780,7 +779,7 @@ static void vhost_net_flush(struct vhost_net *n)
780 vhost_net_ubuf_put_and_wait(n->vqs[VHOST_NET_VQ_TX].ubufs); 779 vhost_net_ubuf_put_and_wait(n->vqs[VHOST_NET_VQ_TX].ubufs);
781 mutex_lock(&n->vqs[VHOST_NET_VQ_TX].vq.mutex); 780 mutex_lock(&n->vqs[VHOST_NET_VQ_TX].vq.mutex);
782 n->tx_flush = false; 781 n->tx_flush = false;
783 kref_init(&n->vqs[VHOST_NET_VQ_TX].ubufs->kref); 782 atomic_set(&n->vqs[VHOST_NET_VQ_TX].ubufs->refcount, 1);
784 mutex_unlock(&n->vqs[VHOST_NET_VQ_TX].vq.mutex); 783 mutex_unlock(&n->vqs[VHOST_NET_VQ_TX].vq.mutex);
785 } 784 }
786} 785}