aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author: Shirley Ma <mashirle@us.ibm.com> 2010-01-28 22:20:04 -0500
committer: David S. Miller <davem@davemloft.net> 2010-02-02 18:55:42 -0500
commit: 9ab86bbcf8be755256f0a5e994e0b38af6b4d399 (patch)
tree: 6de61af79e9892d192acb5ba215b9331d88c8212
parent: f9bfbebf34eab707b065116cdc9699d25ba4252a (diff)
virtio_net: Defer skb allocation in receive path Date: Wed, 13 Jan 2010 12:53:38 -0800
virtio_net receives packets from its pre-allocated vring buffers, then delivers these packets to upper layer protocols as skb buffs. So it is not necessary to pre-allocate an skb for each mergeable buffer and then free the extra skbs when buffers are merged into a large packet. This patch defers skb allocation in the receive path for both big packets and mergeable buffers, to reduce skb pre-allocations and skb frees. It frees unused buffers by calling detach_unused_buf in vring, so the recv skb queue is no longer needed. Signed-off-by: Shirley Ma <xma@us.ibm.com> Signed-off-by: Rusty Russell <rusty@rustcorp.com.au> Signed-off-by: David S. Miller <davem@davemloft.net>
-rw-r--r-- drivers/net/virtio_net.c 427
1 files changed, 248 insertions, 179 deletions
diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index 6b92e383c653..9d8984a3741c 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -56,8 +56,7 @@ struct virtnet_info
56 /* Host will merge rx buffers for big packets (shake it! shake it!) */ 56 /* Host will merge rx buffers for big packets (shake it! shake it!) */
57 bool mergeable_rx_bufs; 57 bool mergeable_rx_bufs;
58 58
59 /* Receive & send queues. */ 59 /* Send queue. */
60 struct sk_buff_head recv;
61 struct sk_buff_head send; 60 struct sk_buff_head send;
62 61
63 /* Work struct for refilling if we run low on memory. */ 62 /* Work struct for refilling if we run low on memory. */
@@ -75,34 +74,44 @@ struct skb_vnet_hdr {
75 unsigned int num_sg; 74 unsigned int num_sg;
76}; 75};
77 76
77struct padded_vnet_hdr {
78 struct virtio_net_hdr hdr;
79 /*
80 * virtio_net_hdr should be in a separated sg buffer because of a
81 * QEMU bug, and data sg buffer shares same page with this header sg.
82 * This padding makes next sg 16 byte aligned after virtio_net_hdr.
83 */
84 char padding[6];
85};
86
78static inline struct skb_vnet_hdr *skb_vnet_hdr(struct sk_buff *skb) 87static inline struct skb_vnet_hdr *skb_vnet_hdr(struct sk_buff *skb)
79{ 88{
80 return (struct skb_vnet_hdr *)skb->cb; 89 return (struct skb_vnet_hdr *)skb->cb;
81} 90}
82 91
83static void give_a_page(struct virtnet_info *vi, struct page *page) 92/*
84{ 93 * private is used to chain pages for big packets, put the whole
85 page->private = (unsigned long)vi->pages; 94 * most recent used list in the beginning for reuse
86 vi->pages = page; 95 */
87} 96static void give_pages(struct virtnet_info *vi, struct page *page)
88
89static void trim_pages(struct virtnet_info *vi, struct sk_buff *skb)
90{ 97{
91 unsigned int i; 98 struct page *end;
92 99
93 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) 100 /* Find end of list, sew whole thing into vi->pages. */
94 give_a_page(vi, skb_shinfo(skb)->frags[i].page); 101 for (end = page; end->private; end = (struct page *)end->private);
95 skb_shinfo(skb)->nr_frags = 0; 102 end->private = (unsigned long)vi->pages;
96 skb->data_len = 0; 103 vi->pages = page;
97} 104}
98 105
99static struct page *get_a_page(struct virtnet_info *vi, gfp_t gfp_mask) 106static struct page *get_a_page(struct virtnet_info *vi, gfp_t gfp_mask)
100{ 107{
101 struct page *p = vi->pages; 108 struct page *p = vi->pages;
102 109
103 if (p) 110 if (p) {
104 vi->pages = (struct page *)p->private; 111 vi->pages = (struct page *)p->private;
105 else 112 /* clear private here, it is used to chain pages */
113 p->private = 0;
114 } else
106 p = alloc_page(gfp_mask); 115 p = alloc_page(gfp_mask);
107 return p; 116 return p;
108} 117}
@@ -118,99 +127,142 @@ static void skb_xmit_done(struct virtqueue *svq)
118 netif_wake_queue(vi->dev); 127 netif_wake_queue(vi->dev);
119} 128}
120 129
121static void receive_skb(struct net_device *dev, struct sk_buff *skb, 130static void set_skb_frag(struct sk_buff *skb, struct page *page,
122 unsigned len) 131 unsigned int offset, unsigned int *len)
123{ 132{
124 struct virtnet_info *vi = netdev_priv(dev); 133 int i = skb_shinfo(skb)->nr_frags;
125 struct skb_vnet_hdr *hdr = skb_vnet_hdr(skb); 134 skb_frag_t *f;
126 int err; 135
127 int i; 136 f = &skb_shinfo(skb)->frags[i];
128 137 f->size = min((unsigned)PAGE_SIZE - offset, *len);
129 if (unlikely(len < sizeof(struct virtio_net_hdr) + ETH_HLEN)) { 138 f->page_offset = offset;
130 pr_debug("%s: short packet %i\n", dev->name, len); 139 f->page = page;
131 dev->stats.rx_length_errors++; 140
132 goto drop; 141 skb->data_len += f->size;
133 } 142 skb->len += f->size;
143 skb_shinfo(skb)->nr_frags++;
144 *len -= f->size;
145}
134 146
135 if (vi->mergeable_rx_bufs) { 147static struct sk_buff *page_to_skb(struct virtnet_info *vi,
136 unsigned int copy; 148 struct page *page, unsigned int len)
137 char *p = page_address(skb_shinfo(skb)->frags[0].page); 149{
150 struct sk_buff *skb;
151 struct skb_vnet_hdr *hdr;
152 unsigned int copy, hdr_len, offset;
153 char *p;
138 154
139 if (len > PAGE_SIZE) 155 p = page_address(page);
140 len = PAGE_SIZE;
141 len -= sizeof(struct virtio_net_hdr_mrg_rxbuf);
142 156
143 memcpy(&hdr->mhdr, p, sizeof(hdr->mhdr)); 157 /* copy small packet so we can reuse these pages for small data */
144 p += sizeof(hdr->mhdr); 158 skb = netdev_alloc_skb_ip_align(vi->dev, GOOD_COPY_LEN);
159 if (unlikely(!skb))
160 return NULL;
145 161
146 copy = len; 162 hdr = skb_vnet_hdr(skb);
147 if (copy > skb_tailroom(skb))
148 copy = skb_tailroom(skb);
149 163
150 memcpy(skb_put(skb, copy), p, copy); 164 if (vi->mergeable_rx_bufs) {
165 hdr_len = sizeof hdr->mhdr;
166 offset = hdr_len;
167 } else {
168 hdr_len = sizeof hdr->hdr;
169 offset = sizeof(struct padded_vnet_hdr);
170 }
151 171
152 len -= copy; 172 memcpy(hdr, p, hdr_len);
153 173
154 if (!len) { 174 len -= hdr_len;
155 give_a_page(vi, skb_shinfo(skb)->frags[0].page); 175 p += offset;
156 skb_shinfo(skb)->nr_frags--;
157 } else {
158 skb_shinfo(skb)->frags[0].page_offset +=
159 sizeof(hdr->mhdr) + copy;
160 skb_shinfo(skb)->frags[0].size = len;
161 skb->data_len += len;
162 skb->len += len;
163 }
164 176
165 while (--hdr->mhdr.num_buffers) { 177 copy = len;
166 struct sk_buff *nskb; 178 if (copy > skb_tailroom(skb))
179 copy = skb_tailroom(skb);
180 memcpy(skb_put(skb, copy), p, copy);
167 181
168 i = skb_shinfo(skb)->nr_frags; 182 len -= copy;
169 if (i >= MAX_SKB_FRAGS) { 183 offset += copy;
170 pr_debug("%s: packet too long %d\n", dev->name,
171 len);
172 dev->stats.rx_length_errors++;
173 goto drop;
174 }
175 184
176 nskb = vi->rvq->vq_ops->get_buf(vi->rvq, &len); 185 while (len) {
177 if (!nskb) { 186 set_skb_frag(skb, page, offset, &len);
178 pr_debug("%s: rx error: %d buffers missing\n", 187 page = (struct page *)page->private;
179 dev->name, hdr->mhdr.num_buffers); 188 offset = 0;
180 dev->stats.rx_length_errors++; 189 }
181 goto drop;
182 }
183 190
184 __skb_unlink(nskb, &vi->recv); 191 if (page)
185 vi->num--; 192 give_pages(vi, page);
186 193
187 skb_shinfo(skb)->frags[i] = skb_shinfo(nskb)->frags[0]; 194 return skb;
188 skb_shinfo(nskb)->nr_frags = 0; 195}
189 kfree_skb(nskb);
190 196
191 if (len > PAGE_SIZE) 197static int receive_mergeable(struct virtnet_info *vi, struct sk_buff *skb)
192 len = PAGE_SIZE; 198{
199 struct skb_vnet_hdr *hdr = skb_vnet_hdr(skb);
200 struct page *page;
201 int num_buf, i, len;
202
203 num_buf = hdr->mhdr.num_buffers;
204 while (--num_buf) {
205 i = skb_shinfo(skb)->nr_frags;
206 if (i >= MAX_SKB_FRAGS) {
207 pr_debug("%s: packet too long\n", skb->dev->name);
208 skb->dev->stats.rx_length_errors++;
209 return -EINVAL;
210 }
193 211
194 skb_shinfo(skb)->frags[i].size = len; 212 page = vi->rvq->vq_ops->get_buf(vi->rvq, &len);
195 skb_shinfo(skb)->nr_frags++; 213 if (!page) {
196 skb->data_len += len; 214 pr_debug("%s: rx error: %d buffers missing\n",
197 skb->len += len; 215 skb->dev->name, hdr->mhdr.num_buffers);
216 skb->dev->stats.rx_length_errors++;
217 return -EINVAL;
198 } 218 }
199 } else { 219 if (len > PAGE_SIZE)
200 len -= sizeof(hdr->hdr); 220 len = PAGE_SIZE;
221
222 set_skb_frag(skb, page, 0, &len);
223
224 --vi->num;
225 }
226 return 0;
227}
228
229static void receive_buf(struct net_device *dev, void *buf, unsigned int len)
230{
231 struct virtnet_info *vi = netdev_priv(dev);
232 struct sk_buff *skb;
233 struct page *page;
234 struct skb_vnet_hdr *hdr;
201 235
202 if (len <= MAX_PACKET_LEN) 236 if (unlikely(len < sizeof(struct virtio_net_hdr) + ETH_HLEN)) {
203 trim_pages(vi, skb); 237 pr_debug("%s: short packet %i\n", dev->name, len);
238 dev->stats.rx_length_errors++;
239 if (vi->mergeable_rx_bufs || vi->big_packets)
240 give_pages(vi, buf);
241 else
242 dev_kfree_skb(buf);
243 return;
244 }
204 245
205 err = pskb_trim(skb, len); 246 if (!vi->mergeable_rx_bufs && !vi->big_packets) {
206 if (err) { 247 skb = buf;
207 pr_debug("%s: pskb_trim failed %i %d\n", dev->name, 248 len -= sizeof(struct virtio_net_hdr);
208 len, err); 249 skb_trim(skb, len);
250 } else {
251 page = buf;
252 skb = page_to_skb(vi, page, len);
253 if (unlikely(!skb)) {
209 dev->stats.rx_dropped++; 254 dev->stats.rx_dropped++;
210 goto drop; 255 give_pages(vi, page);
256 return;
211 } 257 }
258 if (vi->mergeable_rx_bufs)
259 if (receive_mergeable(vi, skb)) {
260 dev_kfree_skb(skb);
261 return;
262 }
212 } 263 }
213 264
265 hdr = skb_vnet_hdr(skb);
214 skb->truesize += skb->data_len; 266 skb->truesize += skb->data_len;
215 dev->stats.rx_bytes += skb->len; 267 dev->stats.rx_bytes += skb->len;
216 dev->stats.rx_packets++; 268 dev->stats.rx_packets++;
@@ -267,110 +319,119 @@ static void receive_skb(struct net_device *dev, struct sk_buff *skb,
267 319
268frame_err: 320frame_err:
269 dev->stats.rx_frame_errors++; 321 dev->stats.rx_frame_errors++;
270drop:
271 dev_kfree_skb(skb); 322 dev_kfree_skb(skb);
272} 323}
273 324
274static bool try_fill_recv_maxbufs(struct virtnet_info *vi, gfp_t gfp) 325static int add_recvbuf_small(struct virtnet_info *vi, gfp_t gfp)
275{ 326{
276 struct sk_buff *skb; 327 struct sk_buff *skb;
277 struct scatterlist sg[2+MAX_SKB_FRAGS]; 328 struct skb_vnet_hdr *hdr;
278 int num, err, i; 329 struct scatterlist sg[2];
279 bool oom = false; 330 int err;
280
281 sg_init_table(sg, 2+MAX_SKB_FRAGS);
282 do {
283 struct skb_vnet_hdr *hdr;
284 331
285 skb = netdev_alloc_skb_ip_align(vi->dev, MAX_PACKET_LEN); 332 skb = netdev_alloc_skb_ip_align(vi->dev, MAX_PACKET_LEN);
286 if (unlikely(!skb)) { 333 if (unlikely(!skb))
287 oom = true; 334 return -ENOMEM;
288 break;
289 }
290 335
291 skb_put(skb, MAX_PACKET_LEN); 336 skb_put(skb, MAX_PACKET_LEN);
292 337
293 hdr = skb_vnet_hdr(skb); 338 hdr = skb_vnet_hdr(skb);
294 sg_set_buf(sg, &hdr->hdr, sizeof(hdr->hdr)); 339 sg_set_buf(sg, &hdr->hdr, sizeof hdr->hdr);
295 340
296 if (vi->big_packets) { 341 skb_to_sgvec(skb, sg + 1, 0, skb->len);
297 for (i = 0; i < MAX_SKB_FRAGS; i++) {
298 skb_frag_t *f = &skb_shinfo(skb)->frags[i];
299 f->page = get_a_page(vi, gfp);
300 if (!f->page)
301 break;
302 342
303 f->page_offset = 0; 343 err = vi->rvq->vq_ops->add_buf(vi->rvq, sg, 0, 2, skb);
304 f->size = PAGE_SIZE; 344 if (err < 0)
345 dev_kfree_skb(skb);
305 346
306 skb->data_len += PAGE_SIZE; 347 return err;
307 skb->len += PAGE_SIZE; 348}
308 349
309 skb_shinfo(skb)->nr_frags++; 350static int add_recvbuf_big(struct virtnet_info *vi, gfp_t gfp)
310 } 351{
352 struct scatterlist sg[MAX_SKB_FRAGS + 2];
353 struct page *first, *list = NULL;
354 char *p;
355 int i, err, offset;
356
357 /* page in sg[MAX_SKB_FRAGS + 1] is list tail */
358 for (i = MAX_SKB_FRAGS + 1; i > 1; --i) {
359 first = get_a_page(vi, gfp);
360 if (!first) {
361 if (list)
362 give_pages(vi, list);
363 return -ENOMEM;
311 } 364 }
365 sg_set_buf(&sg[i], page_address(first), PAGE_SIZE);
312 366
313 num = skb_to_sgvec(skb, sg+1, 0, skb->len) + 1; 367 /* chain new page in list head to match sg */
314 skb_queue_head(&vi->recv, skb); 368 first->private = (unsigned long)list;
369 list = first;
370 }
315 371
316 err = vi->rvq->vq_ops->add_buf(vi->rvq, sg, 0, num, skb); 372 first = get_a_page(vi, gfp);
317 if (err < 0) { 373 if (!first) {
318 skb_unlink(skb, &vi->recv); 374 give_pages(vi, list);
319 trim_pages(vi, skb); 375 return -ENOMEM;
320 kfree_skb(skb); 376 }
321 break; 377 p = page_address(first);
322 } 378
323 vi->num++; 379 /* sg[0], sg[1] share the same page */
324 } while (err >= num); 380 /* a separated sg[0] for virtio_net_hdr only during to QEMU bug*/
325 if (unlikely(vi->num > vi->max)) 381 sg_set_buf(&sg[0], p, sizeof(struct virtio_net_hdr));
326 vi->max = vi->num; 382
327 vi->rvq->vq_ops->kick(vi->rvq); 383 /* sg[1] for data packet, from offset */
328 return !oom; 384 offset = sizeof(struct padded_vnet_hdr);
385 sg_set_buf(&sg[1], p + offset, PAGE_SIZE - offset);
386
387 /* chain first in list head */
388 first->private = (unsigned long)list;
389 err = vi->rvq->vq_ops->add_buf(vi->rvq, sg, 0, MAX_SKB_FRAGS + 2,
390 first);
391 if (err < 0)
392 give_pages(vi, first);
393
394 return err;
329} 395}
330 396
331/* Returns false if we couldn't fill entirely (OOM). */ 397static int add_recvbuf_mergeable(struct virtnet_info *vi, gfp_t gfp)
332static bool try_fill_recv(struct virtnet_info *vi, gfp_t gfp)
333{ 398{
334 struct sk_buff *skb; 399 struct page *page;
335 struct scatterlist sg[1]; 400 struct scatterlist sg;
336 int err; 401 int err;
337 bool oom = false;
338
339 if (!vi->mergeable_rx_bufs)
340 return try_fill_recv_maxbufs(vi, gfp);
341 402
342 do { 403 page = get_a_page(vi, gfp);
343 skb_frag_t *f; 404 if (!page)
405 return -ENOMEM;
344 406
345 skb = netdev_alloc_skb_ip_align(vi->dev, GOOD_COPY_LEN); 407 sg_init_one(&sg, page_address(page), PAGE_SIZE);
346 if (unlikely(!skb)) {
347 oom = true;
348 break;
349 }
350 408
351 f = &skb_shinfo(skb)->frags[0]; 409 err = vi->rvq->vq_ops->add_buf(vi->rvq, &sg, 0, 1, page);
352 f->page = get_a_page(vi, gfp); 410 if (err < 0)
353 if (!f->page) { 411 give_pages(vi, page);
354 oom = true;
355 kfree_skb(skb);
356 break;
357 }
358 412
359 f->page_offset = 0; 413 return err;
360 f->size = PAGE_SIZE; 414}
361 415
362 skb_shinfo(skb)->nr_frags++; 416/* Returns false if we couldn't fill entirely (OOM). */
417static bool try_fill_recv(struct virtnet_info *vi, gfp_t gfp)
418{
419 int err;
420 bool oom = false;
363 421
364 sg_init_one(sg, page_address(f->page), PAGE_SIZE); 422 do {
365 skb_queue_head(&vi->recv, skb); 423 if (vi->mergeable_rx_bufs)
424 err = add_recvbuf_mergeable(vi, gfp);
425 else if (vi->big_packets)
426 err = add_recvbuf_big(vi, gfp);
427 else
428 err = add_recvbuf_small(vi, gfp);
366 429
367 err = vi->rvq->vq_ops->add_buf(vi->rvq, sg, 0, 1, skb);
368 if (err < 0) { 430 if (err < 0) {
369 skb_unlink(skb, &vi->recv); 431 oom = true;
370 kfree_skb(skb);
371 break; 432 break;
372 } 433 }
373 vi->num++; 434 ++vi->num;
374 } while (err > 0); 435 } while (err > 0);
375 if (unlikely(vi->num > vi->max)) 436 if (unlikely(vi->num > vi->max))
376 vi->max = vi->num; 437 vi->max = vi->num;
@@ -407,15 +468,14 @@ static void refill_work(struct work_struct *work)
407static int virtnet_poll(struct napi_struct *napi, int budget) 468static int virtnet_poll(struct napi_struct *napi, int budget)
408{ 469{
409 struct virtnet_info *vi = container_of(napi, struct virtnet_info, napi); 470 struct virtnet_info *vi = container_of(napi, struct virtnet_info, napi);
410 struct sk_buff *skb = NULL; 471 void *buf;
411 unsigned int len, received = 0; 472 unsigned int len, received = 0;
412 473
413again: 474again:
414 while (received < budget && 475 while (received < budget &&
415 (skb = vi->rvq->vq_ops->get_buf(vi->rvq, &len)) != NULL) { 476 (buf = vi->rvq->vq_ops->get_buf(vi->rvq, &len)) != NULL) {
416 __skb_unlink(skb, &vi->recv); 477 receive_buf(vi->dev, buf, len);
417 receive_skb(vi->dev, skb, len); 478 --vi->num;
418 vi->num--;
419 received++; 479 received++;
420 } 480 }
421 481
@@ -495,9 +555,9 @@ static int xmit_skb(struct virtnet_info *vi, struct sk_buff *skb)
495 555
496 /* Encode metadata header at front. */ 556 /* Encode metadata header at front. */
497 if (vi->mergeable_rx_bufs) 557 if (vi->mergeable_rx_bufs)
498 sg_set_buf(sg, &hdr->mhdr, sizeof(hdr->mhdr)); 558 sg_set_buf(sg, &hdr->mhdr, sizeof hdr->mhdr);
499 else 559 else
500 sg_set_buf(sg, &hdr->hdr, sizeof(hdr->hdr)); 560 sg_set_buf(sg, &hdr->hdr, sizeof hdr->hdr);
501 561
502 hdr->num_sg = skb_to_sgvec(skb, sg+1, 0, skb->len) + 1; 562 hdr->num_sg = skb_to_sgvec(skb, sg+1, 0, skb->len) + 1;
503 return vi->svq->vq_ops->add_buf(vi->svq, sg, hdr->num_sg, 0, skb); 563 return vi->svq->vq_ops->add_buf(vi->svq, sg, hdr->num_sg, 0, skb);
@@ -917,8 +977,7 @@ static int virtnet_probe(struct virtio_device *vdev)
917 dev->features |= NETIF_F_HW_VLAN_FILTER; 977 dev->features |= NETIF_F_HW_VLAN_FILTER;
918 } 978 }
919 979
920 /* Initialize our empty receive and send queues. */ 980 /* Initialize our empty send queue. */
921 skb_queue_head_init(&vi->recv);
922 skb_queue_head_init(&vi->send); 981 skb_queue_head_init(&vi->send);
923 982
924 err = register_netdev(dev); 983 err = register_netdev(dev);
@@ -953,25 +1012,35 @@ free:
953 return err; 1012 return err;
954} 1013}
955 1014
1015static void free_unused_bufs(struct virtnet_info *vi)
1016{
1017 void *buf;
1018 while (1) {
1019 buf = vi->rvq->vq_ops->detach_unused_buf(vi->rvq);
1020 if (!buf)
1021 break;
1022 if (vi->mergeable_rx_bufs || vi->big_packets)
1023 give_pages(vi, buf);
1024 else
1025 dev_kfree_skb(buf);
1026 --vi->num;
1027 }
1028 BUG_ON(vi->num != 0);
1029}
1030
956static void __devexit virtnet_remove(struct virtio_device *vdev) 1031static void __devexit virtnet_remove(struct virtio_device *vdev)
957{ 1032{
958 struct virtnet_info *vi = vdev->priv; 1033 struct virtnet_info *vi = vdev->priv;
959 struct sk_buff *skb;
960 1034
961 /* Stop all the virtqueues. */ 1035 /* Stop all the virtqueues. */
962 vdev->config->reset(vdev); 1036 vdev->config->reset(vdev);
963 1037
964 /* Free our skbs in send and recv queues, if any. */ 1038 /* Free our skbs in send queue, if any. */
965 while ((skb = __skb_dequeue(&vi->recv)) != NULL) {
966 kfree_skb(skb);
967 vi->num--;
968 }
969 __skb_queue_purge(&vi->send); 1039 __skb_queue_purge(&vi->send);
970 1040
971 BUG_ON(vi->num != 0);
972
973 unregister_netdev(vi->dev); 1041 unregister_netdev(vi->dev);
974 cancel_delayed_work_sync(&vi->refill); 1042 cancel_delayed_work_sync(&vi->refill);
1043 free_unused_bufs(vi);
975 1044
976 vdev->config->del_vqs(vi->vdev); 1045 vdev->config->del_vqs(vi->vdev);
977 1046