author    Dmitry Torokhov <dmitry.torokhov@gmail.com>   2014-06-08 02:24:07 -0400
committer Dmitry Torokhov <dmitry.torokhov@gmail.com>   2014-06-08 02:24:07 -0400
commit    a292241cccb7e20e8b997a9a44177e7c98141859 (patch)
tree      a0b0bb95e7dce3233a2d8b203f9e326cdec7a00e /drivers/net/xen-netback
parent    d49cb7aeebb974713f9f7ab2991352d3050b095b (diff)
parent    68807a0c2015cb40df4869e16651f0ce5cc14d52 (diff)
Merge branch 'next' into for-linus
Prepare input updates for 3.16.
Diffstat (limited to 'drivers/net/xen-netback')
-rw-r--r--  drivers/net/xen-netback/common.h     113
-rw-r--r--  drivers/net/xen-netback/interface.c  144
-rw-r--r--  drivers/net/xen-netback/netback.c    910
3 files changed, 825 insertions, 342 deletions
diff --git a/drivers/net/xen-netback/common.h b/drivers/net/xen-netback/common.h
index ae413a2cbee7..630a3fcf65bc 100644
--- a/drivers/net/xen-netback/common.h
+++ b/drivers/net/xen-netback/common.h
@@ -48,37 +48,19 @@
48typedef unsigned int pending_ring_idx_t; 48typedef unsigned int pending_ring_idx_t;
49#define INVALID_PENDING_RING_IDX (~0U) 49#define INVALID_PENDING_RING_IDX (~0U)
50 50
51/* For the head field in pending_tx_info: it is used to indicate
52 * whether this tx info is the head of one or more coalesced requests.
53 *
54 * When head != INVALID_PENDING_RING_IDX, it means the start of a new
55 * tx requests queue and the end of previous queue.
56 *
57 * An example sequence of head fields (I = INVALID_PENDING_RING_IDX):
58 *
59 * ...|0 I I I|5 I|9 I I I|...
60 * -->|<-INUSE----------------
61 *
62 * After consuming the first slot(s) we have:
63 *
64 * ...|V V V V|5 I|9 I I I|...
65 * -----FREE->|<-INUSE--------
66 *
67 * where V stands for "valid pending ring index". Any number other
68 * than INVALID_PENDING_RING_IDX is OK. These entries are considered
69 * free and can contain any number other than
70 * INVALID_PENDING_RING_IDX. In practice we use 0.
71 *
72 * The in use non-INVALID_PENDING_RING_IDX (say 0, 5 and 9 in the
73 * above example) number is the index into pending_tx_info and
74 * mmap_pages arrays.
75 */
76struct pending_tx_info { 51struct pending_tx_info {
77 struct xen_netif_tx_request req; /* coalesced tx request */ 52 struct xen_netif_tx_request req; /* tx request */
78 pending_ring_idx_t head; /* head != INVALID_PENDING_RING_IDX 53 /* Callback data for released SKBs. The callback is always
79 * if it is head of one or more tx 54 * xenvif_zerocopy_callback, desc contains the pending_idx, which is
80 * reqs 55 * also an index in pending_tx_info array. It is initialized in
81 */ 56 * xenvif_alloc and it never changes.
57 * skb_shinfo(skb)->destructor_arg points to the first mapped slot's
58 * callback_struct in this array of struct pending_tx_info's, then ctx
59 * to the next, or NULL if there is no more slot for this skb.
60 * ubuf_to_vif is a helper which finds the struct xenvif from a pointer
61 * to this field.
62 */
63 struct ubuf_info callback_struct;
82}; 64};
83 65
84#define XEN_NETIF_TX_RING_SIZE __CONST_RING_SIZE(xen_netif_tx, PAGE_SIZE) 66#define XEN_NETIF_TX_RING_SIZE __CONST_RING_SIZE(xen_netif_tx, PAGE_SIZE)
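
The callback_struct/ctx chaining that replaces the old head-based coalescing works as follows: skb_shinfo(skb)->destructor_arg points at the callback_struct of the skb's first mapped slot, each ctx pointer links to the next slot's callback_struct, a NULL ctx terminates the chain, and desc holds the slot's pending_idx throughout. A minimal sketch of walking such a chain (illustration only; for_each_mapped_slot and visit are made-up names, the ubuf_info layout is as described in the comment above):

#include <linux/skbuff.h>

/* Visit the pending_idx of every slot mapped for this skb by following
 * the destructor_arg/ctx chain described above. Sketch, not patch code.
 */
static void for_each_mapped_slot(struct sk_buff *skb,
				 void (*visit)(u16 pending_idx))
{
	struct ubuf_info *ubuf = skb_shinfo(skb)->destructor_arg;

	while (ubuf) {
		visit((u16)ubuf->desc);		/* desc is the pending_idx */
		ubuf = (struct ubuf_info *)ubuf->ctx;
	}
}

xenvif_gop_skb() in the netback.c hunk below walks the chain in exactly this way to collect the foreign grant references of a zerocopy skb.
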
@@ -99,7 +81,7 @@ struct xenvif_rx_meta {
99 81
100#define MAX_BUFFER_OFFSET PAGE_SIZE 82#define MAX_BUFFER_OFFSET PAGE_SIZE
101 83
102#define MAX_PENDING_REQS 256 84#define MAX_PENDING_REQS XEN_NETIF_TX_RING_SIZE
103 85
104/* It's possible for an skb to have a maximal number of frags 86/* It's possible for an skb to have a maximal number of frags
105 * but still be less than MAX_BUFFER_OFFSET in size. Thus the 87 * but still be less than MAX_BUFFER_OFFSET in size. Thus the
@@ -108,11 +90,25 @@ struct xenvif_rx_meta {
108 */ 90 */
109#define MAX_GRANT_COPY_OPS (MAX_SKB_FRAGS * XEN_NETIF_RX_RING_SIZE) 91#define MAX_GRANT_COPY_OPS (MAX_SKB_FRAGS * XEN_NETIF_RX_RING_SIZE)
110 92
93#define NETBACK_INVALID_HANDLE -1
94
95/* To avoid confusion, we define XEN_NETBK_LEGACY_SLOTS_MAX indicating
96 * the maximum slots a valid packet can use. Now this value is defined
97 * to be XEN_NETIF_NR_SLOTS_MIN, which is supposed to be supported by
98 * all backend.
99 */
100#define XEN_NETBK_LEGACY_SLOTS_MAX XEN_NETIF_NR_SLOTS_MIN
101
111struct xenvif { 102struct xenvif {
112 /* Unique identifier for this interface. */ 103 /* Unique identifier for this interface. */
113 domid_t domid; 104 domid_t domid;
114 unsigned int handle; 105 unsigned int handle;
115 106
107 /* Is this interface disabled? True when backend discovers
108 * frontend is rogue.
109 */
110 bool disabled;
111
116 /* Use NAPI for guest TX */ 112 /* Use NAPI for guest TX */
117 struct napi_struct napi; 113 struct napi_struct napi;
118 /* When feature-split-event-channels = 0, tx_irq = rx_irq. */ 114 /* When feature-split-event-channels = 0, tx_irq = rx_irq. */
@@ -126,13 +122,27 @@ struct xenvif {
126 pending_ring_idx_t pending_cons; 122 pending_ring_idx_t pending_cons;
127 u16 pending_ring[MAX_PENDING_REQS]; 123 u16 pending_ring[MAX_PENDING_REQS];
128 struct pending_tx_info pending_tx_info[MAX_PENDING_REQS]; 124 struct pending_tx_info pending_tx_info[MAX_PENDING_REQS];
129 125 grant_handle_t grant_tx_handle[MAX_PENDING_REQS];
130 /* Coalescing tx requests before copying makes number of grant 126
131 * copy ops greater or equal to number of slots required. In 127 struct gnttab_copy tx_copy_ops[MAX_PENDING_REQS];
132 * worst case a tx request consumes 2 gnttab_copy. 128 struct gnttab_map_grant_ref tx_map_ops[MAX_PENDING_REQS];
129 struct gnttab_unmap_grant_ref tx_unmap_ops[MAX_PENDING_REQS];
130 /* passed to gnttab_[un]map_refs with pages under (un)mapping */
131 struct page *pages_to_map[MAX_PENDING_REQS];
132 struct page *pages_to_unmap[MAX_PENDING_REQS];
133
134 /* This prevents zerocopy callbacks to race over dealloc_ring */
135 spinlock_t callback_lock;
136 /* This prevents dealloc thread and NAPI instance to race over response
137 * creation and pending_ring in xenvif_idx_release. In xenvif_tx_err
138 * it only protect response creation
133 */ 139 */
134 struct gnttab_copy tx_copy_ops[2*MAX_PENDING_REQS]; 140 spinlock_t response_lock;
135 141 pending_ring_idx_t dealloc_prod;
142 pending_ring_idx_t dealloc_cons;
143 u16 dealloc_ring[MAX_PENDING_REQS];
144 struct task_struct *dealloc_task;
145 wait_queue_head_t dealloc_wq;
136 146
137 /* Use kthread for guest RX */ 147 /* Use kthread for guest RX */
138 struct task_struct *task; 148 struct task_struct *task;
@@ -144,6 +154,9 @@ struct xenvif {
144 struct xen_netif_rx_back_ring rx; 154 struct xen_netif_rx_back_ring rx;
145 struct sk_buff_head rx_queue; 155 struct sk_buff_head rx_queue;
146 RING_IDX rx_last_skb_slots; 156 RING_IDX rx_last_skb_slots;
157 bool rx_queue_purge;
158
159 struct timer_list wake_queue;
147 160
148 /* This array is allocated seperately as it is large */ 161 /* This array is allocated seperately as it is large */
149 struct gnttab_copy *grant_copy_op; 162 struct gnttab_copy *grant_copy_op;
@@ -175,6 +188,10 @@ struct xenvif {
175 188
176 /* Statistics */ 189 /* Statistics */
177 unsigned long rx_gso_checksum_fixup; 190 unsigned long rx_gso_checksum_fixup;
191 unsigned long tx_zerocopy_sent;
192 unsigned long tx_zerocopy_success;
193 unsigned long tx_zerocopy_fail;
194 unsigned long tx_frag_overflow;
178 195
179 /* Miscellaneous private stuff. */ 196 /* Miscellaneous private stuff. */
180 struct net_device *dev; 197 struct net_device *dev;
@@ -216,9 +233,11 @@ void xenvif_carrier_off(struct xenvif *vif);
216 233
217int xenvif_tx_action(struct xenvif *vif, int budget); 234int xenvif_tx_action(struct xenvif *vif, int budget);
218 235
219int xenvif_kthread(void *data); 236int xenvif_kthread_guest_rx(void *data);
220void xenvif_kick_thread(struct xenvif *vif); 237void xenvif_kick_thread(struct xenvif *vif);
221 238
239int xenvif_dealloc_kthread(void *data);
240
222/* Determine whether the needed number of slots (req) are available, 241/* Determine whether the needed number of slots (req) are available,
223 * and set req_event if not. 242 * and set req_event if not.
224 */ 243 */
@@ -226,6 +245,24 @@ bool xenvif_rx_ring_slots_available(struct xenvif *vif, int needed);
226 245
227void xenvif_stop_queue(struct xenvif *vif); 246void xenvif_stop_queue(struct xenvif *vif);
228 247
248/* Callback from stack when TX packet can be released */
249void xenvif_zerocopy_callback(struct ubuf_info *ubuf, bool zerocopy_success);
250
251/* Unmap a pending page and release it back to the guest */
252void xenvif_idx_unmap(struct xenvif *vif, u16 pending_idx);
253
254static inline pending_ring_idx_t nr_pending_reqs(struct xenvif *vif)
255{
256 return MAX_PENDING_REQS -
257 vif->pending_prod + vif->pending_cons;
258}
259
260/* Callback from stack when TX packet can be released */
261void xenvif_zerocopy_callback(struct ubuf_info *ubuf, bool zerocopy_success);
262
229extern bool separate_tx_rx_irq; 263extern bool separate_tx_rx_irq;
230 264
265extern unsigned int rx_drain_timeout_msecs;
266extern unsigned int rx_drain_timeout_jiffies;
267
231#endif /* __XEN_NETBACK__COMMON_H__ */ 268#endif /* __XEN_NETBACK__COMMON_H__ */
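
The nr_pending_reqs() helper added above is the usual producer/consumer occupancy formula for the pending ring: pending_prod - pending_cons is the number of free entries in pending_ring, so MAX_PENDING_REQS minus that difference is the number of requests currently in flight, with wrap-around handled implicitly by the unsigned pending_ring_idx_t arithmetic. As a purely illustrative example, with MAX_PENDING_REQS = 256, pending_prod = 300 and pending_cons = 290, there are 300 - 290 = 10 free entries and 256 - 300 + 290 = 246 requests still pending.
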
diff --git a/drivers/net/xen-netback/interface.c b/drivers/net/xen-netback/interface.c
index 7669d49a67e2..ef05c5c49d41 100644
--- a/drivers/net/xen-netback/interface.c
+++ b/drivers/net/xen-netback/interface.c
@@ -38,6 +38,7 @@
38 38
39#include <xen/events.h> 39#include <xen/events.h>
40#include <asm/xen/hypercall.h> 40#include <asm/xen/hypercall.h>
41#include <xen/balloon.h>
41 42
42#define XENVIF_QUEUE_LENGTH 32 43#define XENVIF_QUEUE_LENGTH 32
43#define XENVIF_NAPI_WEIGHT 64 44#define XENVIF_NAPI_WEIGHT 64
@@ -62,6 +63,15 @@ static int xenvif_poll(struct napi_struct *napi, int budget)
62 struct xenvif *vif = container_of(napi, struct xenvif, napi); 63 struct xenvif *vif = container_of(napi, struct xenvif, napi);
63 int work_done; 64 int work_done;
64 65
66 /* This vif is rogue, we pretend we've there is nothing to do
67 * for this vif to deschedule it from NAPI. But this interface
68 * will be turned off in thread context later.
69 */
70 if (unlikely(vif->disabled)) {
71 napi_complete(napi);
72 return 0;
73 }
74
65 work_done = xenvif_tx_action(vif, budget); 75 work_done = xenvif_tx_action(vif, budget);
66 76
67 if (work_done < budget) { 77 if (work_done < budget) {
@@ -113,6 +123,18 @@ static irqreturn_t xenvif_interrupt(int irq, void *dev_id)
113 return IRQ_HANDLED; 123 return IRQ_HANDLED;
114} 124}
115 125
126static void xenvif_wake_queue(unsigned long data)
127{
128 struct xenvif *vif = (struct xenvif *)data;
129
130 if (netif_queue_stopped(vif->dev)) {
131 netdev_err(vif->dev, "draining TX queue\n");
132 vif->rx_queue_purge = true;
133 xenvif_kick_thread(vif);
134 netif_wake_queue(vif->dev);
135 }
136}
137
116static int xenvif_start_xmit(struct sk_buff *skb, struct net_device *dev) 138static int xenvif_start_xmit(struct sk_buff *skb, struct net_device *dev)
117{ 139{
118 struct xenvif *vif = netdev_priv(dev); 140 struct xenvif *vif = netdev_priv(dev);
@@ -121,7 +143,9 @@ static int xenvif_start_xmit(struct sk_buff *skb, struct net_device *dev)
121 BUG_ON(skb->dev != dev); 143 BUG_ON(skb->dev != dev);
122 144
123 /* Drop the packet if vif is not ready */ 145 /* Drop the packet if vif is not ready */
124 if (vif->task == NULL || !xenvif_schedulable(vif)) 146 if (vif->task == NULL ||
147 vif->dealloc_task == NULL ||
148 !xenvif_schedulable(vif))
125 goto drop; 149 goto drop;
126 150
127 /* At best we'll need one slot for the header and one for each 151 /* At best we'll need one slot for the header and one for each
@@ -132,16 +156,20 @@ static int xenvif_start_xmit(struct sk_buff *skb, struct net_device *dev)
132 /* If the skb is GSO then we'll also need an extra slot for the 156 /* If the skb is GSO then we'll also need an extra slot for the
133 * metadata. 157 * metadata.
134 */ 158 */
135 if (skb_shinfo(skb)->gso_type & SKB_GSO_TCPV4 || 159 if (skb_is_gso(skb))
136 skb_shinfo(skb)->gso_type & SKB_GSO_TCPV6)
137 min_slots_needed++; 160 min_slots_needed++;
138 161
139 /* If the skb can't possibly fit in the remaining slots 162 /* If the skb can't possibly fit in the remaining slots
140 * then turn off the queue to give the ring a chance to 163 * then turn off the queue to give the ring a chance to
141 * drain. 164 * drain.
142 */ 165 */
143 if (!xenvif_rx_ring_slots_available(vif, min_slots_needed)) 166 if (!xenvif_rx_ring_slots_available(vif, min_slots_needed)) {
167 vif->wake_queue.function = xenvif_wake_queue;
168 vif->wake_queue.data = (unsigned long)vif;
144 xenvif_stop_queue(vif); 169 xenvif_stop_queue(vif);
170 mod_timer(&vif->wake_queue,
171 jiffies + rx_drain_timeout_jiffies);
172 }
145 173
146 skb_queue_tail(&vif->rx_queue, skb); 174 skb_queue_tail(&vif->rx_queue, skb);
147 xenvif_kick_thread(vif); 175 xenvif_kick_thread(vif);
@@ -234,6 +262,28 @@ static const struct xenvif_stat {
234 "rx_gso_checksum_fixup", 262 "rx_gso_checksum_fixup",
235 offsetof(struct xenvif, rx_gso_checksum_fixup) 263 offsetof(struct xenvif, rx_gso_checksum_fixup)
236 }, 264 },
265 /* If (sent != success + fail), there are probably packets never
266 * freed up properly!
267 */
268 {
269 "tx_zerocopy_sent",
270 offsetof(struct xenvif, tx_zerocopy_sent),
271 },
272 {
273 "tx_zerocopy_success",
274 offsetof(struct xenvif, tx_zerocopy_success),
275 },
276 {
277 "tx_zerocopy_fail",
278 offsetof(struct xenvif, tx_zerocopy_fail)
279 },
280 /* Number of packets exceeding MAX_SKB_FRAG slots. You should use
281 * a guest with the same MAX_SKB_FRAG
282 */
283 {
284 "tx_frag_overflow",
285 offsetof(struct xenvif, tx_frag_overflow)
286 },
237}; 287};
238 288
239static int xenvif_get_sset_count(struct net_device *dev, int string_set) 289static int xenvif_get_sset_count(struct net_device *dev, int string_set)
@@ -322,11 +372,15 @@ struct xenvif *xenvif_alloc(struct device *parent, domid_t domid,
322 vif->ip_csum = 1; 372 vif->ip_csum = 1;
323 vif->dev = dev; 373 vif->dev = dev;
324 374
375 vif->disabled = false;
376
325 vif->credit_bytes = vif->remaining_credit = ~0UL; 377 vif->credit_bytes = vif->remaining_credit = ~0UL;
326 vif->credit_usec = 0UL; 378 vif->credit_usec = 0UL;
327 init_timer(&vif->credit_timeout); 379 init_timer(&vif->credit_timeout);
328 vif->credit_window_start = get_jiffies_64(); 380 vif->credit_window_start = get_jiffies_64();
329 381
382 init_timer(&vif->wake_queue);
383
330 dev->netdev_ops = &xenvif_netdev_ops; 384 dev->netdev_ops = &xenvif_netdev_ops;
331 dev->hw_features = NETIF_F_SG | 385 dev->hw_features = NETIF_F_SG |
332 NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM | 386 NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM |
@@ -343,8 +397,26 @@ struct xenvif *xenvif_alloc(struct device *parent, domid_t domid,
343 vif->pending_prod = MAX_PENDING_REQS; 397 vif->pending_prod = MAX_PENDING_REQS;
344 for (i = 0; i < MAX_PENDING_REQS; i++) 398 for (i = 0; i < MAX_PENDING_REQS; i++)
345 vif->pending_ring[i] = i; 399 vif->pending_ring[i] = i;
346 for (i = 0; i < MAX_PENDING_REQS; i++) 400 spin_lock_init(&vif->callback_lock);
347 vif->mmap_pages[i] = NULL; 401 spin_lock_init(&vif->response_lock);
402 /* If ballooning is disabled, this will consume real memory, so you
403 * better enable it. The long term solution would be to use just a
404 * bunch of valid page descriptors, without dependency on ballooning
405 */
406 err = alloc_xenballooned_pages(MAX_PENDING_REQS,
407 vif->mmap_pages,
408 false);
409 if (err) {
410 netdev_err(dev, "Could not reserve mmap_pages\n");
411 return ERR_PTR(-ENOMEM);
412 }
413 for (i = 0; i < MAX_PENDING_REQS; i++) {
414 vif->pending_tx_info[i].callback_struct = (struct ubuf_info)
415 { .callback = xenvif_zerocopy_callback,
416 .ctx = NULL,
417 .desc = i };
418 vif->grant_tx_handle[i] = NETBACK_INVALID_HANDLE;
419 }
348 420
349 /* 421 /*
350 * Initialise a dummy MAC address. We choose the numerically 422 * Initialise a dummy MAC address. We choose the numerically
@@ -382,12 +454,14 @@ int xenvif_connect(struct xenvif *vif, unsigned long tx_ring_ref,
382 454
383 BUG_ON(vif->tx_irq); 455 BUG_ON(vif->tx_irq);
384 BUG_ON(vif->task); 456 BUG_ON(vif->task);
457 BUG_ON(vif->dealloc_task);
385 458
386 err = xenvif_map_frontend_rings(vif, tx_ring_ref, rx_ring_ref); 459 err = xenvif_map_frontend_rings(vif, tx_ring_ref, rx_ring_ref);
387 if (err < 0) 460 if (err < 0)
388 goto err; 461 goto err;
389 462
390 init_waitqueue_head(&vif->wq); 463 init_waitqueue_head(&vif->wq);
464 init_waitqueue_head(&vif->dealloc_wq);
391 465
392 if (tx_evtchn == rx_evtchn) { 466 if (tx_evtchn == rx_evtchn) {
393 /* feature-split-event-channels == 0 */ 467 /* feature-split-event-channels == 0 */
@@ -421,8 +495,8 @@ int xenvif_connect(struct xenvif *vif, unsigned long tx_ring_ref,
421 disable_irq(vif->rx_irq); 495 disable_irq(vif->rx_irq);
422 } 496 }
423 497
424 task = kthread_create(xenvif_kthread, 498 task = kthread_create(xenvif_kthread_guest_rx,
425 (void *)vif, "%s", vif->dev->name); 499 (void *)vif, "%s-guest-rx", vif->dev->name);
426 if (IS_ERR(task)) { 500 if (IS_ERR(task)) {
427 pr_warn("Could not allocate kthread for %s\n", vif->dev->name); 501 pr_warn("Could not allocate kthread for %s\n", vif->dev->name);
428 err = PTR_ERR(task); 502 err = PTR_ERR(task);
@@ -431,6 +505,16 @@ int xenvif_connect(struct xenvif *vif, unsigned long tx_ring_ref,
431 505
432 vif->task = task; 506 vif->task = task;
433 507
508 task = kthread_create(xenvif_dealloc_kthread,
509 (void *)vif, "%s-dealloc", vif->dev->name);
510 if (IS_ERR(task)) {
511 pr_warn("Could not allocate kthread for %s\n", vif->dev->name);
512 err = PTR_ERR(task);
513 goto err_rx_unbind;
514 }
515
516 vif->dealloc_task = task;
517
434 rtnl_lock(); 518 rtnl_lock();
435 if (!vif->can_sg && vif->dev->mtu > ETH_DATA_LEN) 519 if (!vif->can_sg && vif->dev->mtu > ETH_DATA_LEN)
436 dev_set_mtu(vif->dev, ETH_DATA_LEN); 520 dev_set_mtu(vif->dev, ETH_DATA_LEN);
@@ -441,6 +525,7 @@ int xenvif_connect(struct xenvif *vif, unsigned long tx_ring_ref,
441 rtnl_unlock(); 525 rtnl_unlock();
442 526
443 wake_up_process(vif->task); 527 wake_up_process(vif->task);
528 wake_up_process(vif->dealloc_task);
444 529
445 return 0; 530 return 0;
446 531
@@ -474,10 +559,16 @@ void xenvif_disconnect(struct xenvif *vif)
474 xenvif_carrier_off(vif); 559 xenvif_carrier_off(vif);
475 560
476 if (vif->task) { 561 if (vif->task) {
562 del_timer_sync(&vif->wake_queue);
477 kthread_stop(vif->task); 563 kthread_stop(vif->task);
478 vif->task = NULL; 564 vif->task = NULL;
479 } 565 }
480 566
567 if (vif->dealloc_task) {
568 kthread_stop(vif->dealloc_task);
569 vif->dealloc_task = NULL;
570 }
571
481 if (vif->tx_irq) { 572 if (vif->tx_irq) {
482 if (vif->tx_irq == vif->rx_irq) 573 if (vif->tx_irq == vif->rx_irq)
483 unbind_from_irqhandler(vif->tx_irq, vif); 574 unbind_from_irqhandler(vif->tx_irq, vif);
@@ -493,6 +584,43 @@ void xenvif_disconnect(struct xenvif *vif)
493 584
494void xenvif_free(struct xenvif *vif) 585void xenvif_free(struct xenvif *vif)
495{ 586{
587 int i, unmap_timeout = 0;
588 /* Here we want to avoid timeout messages if an skb can be legitimately
589 * stuck somewhere else. Realistically this could be an another vif's
590 * internal or QDisc queue. That another vif also has this
591 * rx_drain_timeout_msecs timeout, but the timer only ditches the
592 * internal queue. After that, the QDisc queue can put in worst case
593 * XEN_NETIF_RX_RING_SIZE / MAX_SKB_FRAGS skbs into that another vif's
594 * internal queue, so we need several rounds of such timeouts until we
595 * can be sure that no another vif should have skb's from us. We are
596 * not sending more skb's, so newly stuck packets are not interesting
597 * for us here.
598 */
599 unsigned int worst_case_skb_lifetime = (rx_drain_timeout_msecs/1000) *
600 DIV_ROUND_UP(XENVIF_QUEUE_LENGTH, (XEN_NETIF_RX_RING_SIZE / MAX_SKB_FRAGS));
601
602 for (i = 0; i < MAX_PENDING_REQS; ++i) {
603 if (vif->grant_tx_handle[i] != NETBACK_INVALID_HANDLE) {
604 unmap_timeout++;
605 schedule_timeout(msecs_to_jiffies(1000));
606 if (unmap_timeout > worst_case_skb_lifetime &&
607 net_ratelimit())
608 netdev_err(vif->dev,
609 "Page still granted! Index: %x\n",
610 i);
611 /* If there are still unmapped pages, reset the loop to
612 * start checking again. We shouldn't exit here until
613 * dealloc thread and NAPI instance release all the
614 * pages. If a kernel bug causes the skbs to stall
615 * somewhere, the interface cannot be brought down
616 * properly.
617 */
618 i = -1;
619 }
620 }
621
622 free_xenballooned_pages(MAX_PENDING_REQS, vif->mmap_pages);
623
496 netif_napi_del(&vif->napi); 624 netif_napi_del(&vif->napi);
497 625
498 unregister_netdev(vif->dev); 626 unregister_netdev(vif->dev);
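
To make the timeout math in xenvif_free() concrete (illustrative values; assuming 4 KiB pages, so that XEN_NETIF_RX_RING_SIZE is 256 and MAX_SKB_FRAGS is 17): with the default rx_drain_timeout_msecs of 10000 defined in netback.c below, worst_case_skb_lifetime = (10000/1000) * DIV_ROUND_UP(32, 256/17) = 10 * DIV_ROUND_UP(32, 15) = 10 * 3 = 30, so the "Page still granted" warning only starts after roughly 30 iterations of the one-second schedule_timeout() in the loop.
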
diff --git a/drivers/net/xen-netback/netback.c b/drivers/net/xen-netback/netback.c
index e5284bca2d90..76665405c5aa 100644
--- a/drivers/net/xen-netback/netback.c
+++ b/drivers/net/xen-netback/netback.c
@@ -37,6 +37,7 @@
37#include <linux/kthread.h> 37#include <linux/kthread.h>
38#include <linux/if_vlan.h> 38#include <linux/if_vlan.h>
39#include <linux/udp.h> 39#include <linux/udp.h>
40#include <linux/highmem.h>
40 41
41#include <net/tcp.h> 42#include <net/tcp.h>
42 43
@@ -54,6 +55,13 @@
54bool separate_tx_rx_irq = 1; 55bool separate_tx_rx_irq = 1;
55module_param(separate_tx_rx_irq, bool, 0644); 56module_param(separate_tx_rx_irq, bool, 0644);
56 57
58/* When guest ring is filled up, qdisc queues the packets for us, but we have
59 * to timeout them, otherwise other guests' packets can get stuck there
60 */
61unsigned int rx_drain_timeout_msecs = 10000;
62module_param(rx_drain_timeout_msecs, uint, 0444);
63unsigned int rx_drain_timeout_jiffies;
64
57/* 65/*
58 * This is the maximum slots a skb can have. If a guest sends a skb 66 * This is the maximum slots a skb can have. If a guest sends a skb
59 * which exceeds this limit it is considered malicious. 67 * which exceeds this limit it is considered malicious.
@@ -62,24 +70,6 @@ module_param(separate_tx_rx_irq, bool, 0644);
62static unsigned int fatal_skb_slots = FATAL_SKB_SLOTS_DEFAULT; 70static unsigned int fatal_skb_slots = FATAL_SKB_SLOTS_DEFAULT;
63module_param(fatal_skb_slots, uint, 0444); 71module_param(fatal_skb_slots, uint, 0444);
64 72
65/*
66 * To avoid confusion, we define XEN_NETBK_LEGACY_SLOTS_MAX indicating
67 * the maximum slots a valid packet can use. Now this value is defined
68 * to be XEN_NETIF_NR_SLOTS_MIN, which is supposed to be supported by
69 * all backend.
70 */
71#define XEN_NETBK_LEGACY_SLOTS_MAX XEN_NETIF_NR_SLOTS_MIN
72
73/*
74 * If head != INVALID_PENDING_RING_IDX, it means this tx request is head of
75 * one or more merged tx requests, otherwise it is the continuation of
76 * previous tx request.
77 */
78static inline int pending_tx_is_head(struct xenvif *vif, RING_IDX idx)
79{
80 return vif->pending_tx_info[idx].head != INVALID_PENDING_RING_IDX;
81}
82
83static void xenvif_idx_release(struct xenvif *vif, u16 pending_idx, 73static void xenvif_idx_release(struct xenvif *vif, u16 pending_idx,
84 u8 status); 74 u8 status);
85 75
@@ -109,6 +99,21 @@ static inline unsigned long idx_to_kaddr(struct xenvif *vif,
109 return (unsigned long)pfn_to_kaddr(idx_to_pfn(vif, idx)); 99 return (unsigned long)pfn_to_kaddr(idx_to_pfn(vif, idx));
110} 100}
111 101
102#define callback_param(vif, pending_idx) \
103 (vif->pending_tx_info[pending_idx].callback_struct)
104
105/* Find the containing VIF's structure from a pointer in pending_tx_info array
106 */
107static inline struct xenvif* ubuf_to_vif(struct ubuf_info *ubuf)
108{
109 u16 pending_idx = ubuf->desc;
110 struct pending_tx_info *temp =
111 container_of(ubuf, struct pending_tx_info, callback_struct);
112 return container_of(temp - pending_idx,
113 struct xenvif,
114 pending_tx_info[0]);
115}
116
112/* This is a miniumum size for the linear area to avoid lots of 117/* This is a miniumum size for the linear area to avoid lots of
113 * calls to __pskb_pull_tail() as we set up checksum offsets. The 118 * calls to __pskb_pull_tail() as we set up checksum offsets. The
114 * value 128 was chosen as it covers all IPv4 and most likely 119 * value 128 was chosen as it covers all IPv4 and most likely
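
The pointer arithmetic in ubuf_to_vif() deserves a remark: container_of() first recovers the pending_tx_info element that embeds the ubuf_info, and because desc holds that element's index, subtracting pending_idx from the element pointer lands on pending_tx_info[0], from which a second container_of() yields the enclosing struct xenvif. A self-contained userspace sketch of the same recover-the-container pattern (toy types and made-up names, not part of the patch):

#include <stddef.h>
#include <stdio.h>

struct inner { int payload; };
struct outer { int id; struct inner arr[4]; };

/* Given a pointer to arr[idx] inside some struct outer, step back idx
 * elements to arr[0], then subtract the offset of arr to reach the
 * outer structure -- the same trick ubuf_to_vif() uses via container_of().
 */
static struct outer *outer_from_inner(struct inner *p, size_t idx)
{
	struct inner *first = p - idx;
	return (struct outer *)((char *)first - offsetof(struct outer, arr));
}

int main(void)
{
	struct outer o = { .id = 42 };
	printf("%d\n", outer_from_inner(&o.arr[2], 2)->id); /* prints 42 */
	return 0;
}
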
@@ -131,12 +136,6 @@ static inline pending_ring_idx_t pending_index(unsigned i)
131 return i & (MAX_PENDING_REQS-1); 136 return i & (MAX_PENDING_REQS-1);
132} 137}
133 138
134static inline pending_ring_idx_t nr_pending_reqs(struct xenvif *vif)
135{
136 return MAX_PENDING_REQS -
137 vif->pending_prod + vif->pending_cons;
138}
139
140bool xenvif_rx_ring_slots_available(struct xenvif *vif, int needed) 139bool xenvif_rx_ring_slots_available(struct xenvif *vif, int needed)
141{ 140{
142 RING_IDX prod, cons; 141 RING_IDX prod, cons;
@@ -192,8 +191,8 @@ static bool start_new_rx_buffer(int offset, unsigned long size, int head)
192 * into multiple copies tend to give large frags their 191 * into multiple copies tend to give large frags their
193 * own buffers as before. 192 * own buffers as before.
194 */ 193 */
195 if ((offset + size > MAX_BUFFER_OFFSET) && 194 BUG_ON(size > MAX_BUFFER_OFFSET);
196 (size <= MAX_BUFFER_OFFSET) && offset && !head) 195 if ((offset + size > MAX_BUFFER_OFFSET) && offset && !head)
197 return true; 196 return true;
198 197
199 return false; 198 return false;
@@ -235,12 +234,14 @@ static struct xenvif_rx_meta *get_next_rx_buffer(struct xenvif *vif,
235static void xenvif_gop_frag_copy(struct xenvif *vif, struct sk_buff *skb, 234static void xenvif_gop_frag_copy(struct xenvif *vif, struct sk_buff *skb,
236 struct netrx_pending_operations *npo, 235 struct netrx_pending_operations *npo,
237 struct page *page, unsigned long size, 236 struct page *page, unsigned long size,
238 unsigned long offset, int *head) 237 unsigned long offset, int *head,
238 struct xenvif *foreign_vif,
239 grant_ref_t foreign_gref)
239{ 240{
240 struct gnttab_copy *copy_gop; 241 struct gnttab_copy *copy_gop;
241 struct xenvif_rx_meta *meta; 242 struct xenvif_rx_meta *meta;
242 unsigned long bytes; 243 unsigned long bytes;
243 int gso_type; 244 int gso_type = XEN_NETIF_GSO_TYPE_NONE;
244 245
245 /* Data must not cross a page boundary. */ 246 /* Data must not cross a page boundary. */
246 BUG_ON(size + offset > PAGE_SIZE<<compound_order(page)); 247 BUG_ON(size + offset > PAGE_SIZE<<compound_order(page));
@@ -277,8 +278,15 @@ static void xenvif_gop_frag_copy(struct xenvif *vif, struct sk_buff *skb,
277 copy_gop->flags = GNTCOPY_dest_gref; 278 copy_gop->flags = GNTCOPY_dest_gref;
278 copy_gop->len = bytes; 279 copy_gop->len = bytes;
279 280
280 copy_gop->source.domid = DOMID_SELF; 281 if (foreign_vif) {
281 copy_gop->source.u.gmfn = virt_to_mfn(page_address(page)); 282 copy_gop->source.domid = foreign_vif->domid;
283 copy_gop->source.u.ref = foreign_gref;
284 copy_gop->flags |= GNTCOPY_source_gref;
285 } else {
286 copy_gop->source.domid = DOMID_SELF;
287 copy_gop->source.u.gmfn =
288 virt_to_mfn(page_address(page));
289 }
282 copy_gop->source.offset = offset; 290 copy_gop->source.offset = offset;
283 291
284 copy_gop->dest.domid = vif->domid; 292 copy_gop->dest.domid = vif->domid;
@@ -299,12 +307,12 @@ static void xenvif_gop_frag_copy(struct xenvif *vif, struct sk_buff *skb,
299 } 307 }
300 308
301 /* Leave a gap for the GSO descriptor. */ 309 /* Leave a gap for the GSO descriptor. */
302 if (skb_shinfo(skb)->gso_type & SKB_GSO_TCPV4) 310 if (skb_is_gso(skb)) {
303 gso_type = XEN_NETIF_GSO_TYPE_TCPV4; 311 if (skb_shinfo(skb)->gso_type & SKB_GSO_TCPV4)
304 else if (skb_shinfo(skb)->gso_type & SKB_GSO_TCPV6) 312 gso_type = XEN_NETIF_GSO_TYPE_TCPV4;
305 gso_type = XEN_NETIF_GSO_TYPE_TCPV6; 313 else if (skb_shinfo(skb)->gso_type & SKB_GSO_TCPV6)
306 else 314 gso_type = XEN_NETIF_GSO_TYPE_TCPV6;
307 gso_type = XEN_NETIF_GSO_TYPE_NONE; 315 }
308 316
309 if (*head && ((1 << gso_type) & vif->gso_mask)) 317 if (*head && ((1 << gso_type) & vif->gso_mask))
310 vif->rx.req_cons++; 318 vif->rx.req_cons++;
@@ -338,19 +346,18 @@ static int xenvif_gop_skb(struct sk_buff *skb,
338 int head = 1; 346 int head = 1;
339 int old_meta_prod; 347 int old_meta_prod;
340 int gso_type; 348 int gso_type;
341 int gso_size; 349 struct ubuf_info *ubuf = skb_shinfo(skb)->destructor_arg;
350 grant_ref_t foreign_grefs[MAX_SKB_FRAGS];
351 struct xenvif *foreign_vif = NULL;
342 352
343 old_meta_prod = npo->meta_prod; 353 old_meta_prod = npo->meta_prod;
344 354
345 if (skb_shinfo(skb)->gso_type & SKB_GSO_TCPV4) { 355 gso_type = XEN_NETIF_GSO_TYPE_NONE;
346 gso_type = XEN_NETIF_GSO_TYPE_TCPV4; 356 if (skb_is_gso(skb)) {
347 gso_size = skb_shinfo(skb)->gso_size; 357 if (skb_shinfo(skb)->gso_type & SKB_GSO_TCPV4)
348 } else if (skb_shinfo(skb)->gso_type & SKB_GSO_TCPV6) { 358 gso_type = XEN_NETIF_GSO_TYPE_TCPV4;
349 gso_type = XEN_NETIF_GSO_TYPE_TCPV6; 359 else if (skb_shinfo(skb)->gso_type & SKB_GSO_TCPV6)
350 gso_size = skb_shinfo(skb)->gso_size; 360 gso_type = XEN_NETIF_GSO_TYPE_TCPV6;
351 } else {
352 gso_type = XEN_NETIF_GSO_TYPE_NONE;
353 gso_size = 0;
354 } 361 }
355 362
356 /* Set up a GSO prefix descriptor, if necessary */ 363 /* Set up a GSO prefix descriptor, if necessary */
@@ -358,7 +365,7 @@ static int xenvif_gop_skb(struct sk_buff *skb,
358 req = RING_GET_REQUEST(&vif->rx, vif->rx.req_cons++); 365 req = RING_GET_REQUEST(&vif->rx, vif->rx.req_cons++);
359 meta = npo->meta + npo->meta_prod++; 366 meta = npo->meta + npo->meta_prod++;
360 meta->gso_type = gso_type; 367 meta->gso_type = gso_type;
361 meta->gso_size = gso_size; 368 meta->gso_size = skb_shinfo(skb)->gso_size;
362 meta->size = 0; 369 meta->size = 0;
363 meta->id = req->id; 370 meta->id = req->id;
364 } 371 }
@@ -368,7 +375,7 @@ static int xenvif_gop_skb(struct sk_buff *skb,
368 375
369 if ((1 << gso_type) & vif->gso_mask) { 376 if ((1 << gso_type) & vif->gso_mask) {
370 meta->gso_type = gso_type; 377 meta->gso_type = gso_type;
371 meta->gso_size = gso_size; 378 meta->gso_size = skb_shinfo(skb)->gso_size;
372 } else { 379 } else {
373 meta->gso_type = XEN_NETIF_GSO_TYPE_NONE; 380 meta->gso_type = XEN_NETIF_GSO_TYPE_NONE;
374 meta->gso_size = 0; 381 meta->gso_size = 0;
@@ -379,6 +386,19 @@ static int xenvif_gop_skb(struct sk_buff *skb,
379 npo->copy_off = 0; 386 npo->copy_off = 0;
380 npo->copy_gref = req->gref; 387 npo->copy_gref = req->gref;
381 388
389 if ((skb_shinfo(skb)->tx_flags & SKBTX_DEV_ZEROCOPY) &&
390 (ubuf->callback == &xenvif_zerocopy_callback)) {
391 int i = 0;
392 foreign_vif = ubuf_to_vif(ubuf);
393
394 do {
395 u16 pending_idx = ubuf->desc;
396 foreign_grefs[i++] =
397 foreign_vif->pending_tx_info[pending_idx].req.gref;
398 ubuf = (struct ubuf_info *) ubuf->ctx;
399 } while (ubuf);
400 }
401
382 data = skb->data; 402 data = skb->data;
383 while (data < skb_tail_pointer(skb)) { 403 while (data < skb_tail_pointer(skb)) {
384 unsigned int offset = offset_in_page(data); 404 unsigned int offset = offset_in_page(data);
@@ -388,7 +408,9 @@ static int xenvif_gop_skb(struct sk_buff *skb,
388 len = skb_tail_pointer(skb) - data; 408 len = skb_tail_pointer(skb) - data;
389 409
390 xenvif_gop_frag_copy(vif, skb, npo, 410 xenvif_gop_frag_copy(vif, skb, npo,
391 virt_to_page(data), len, offset, &head); 411 virt_to_page(data), len, offset, &head,
412 NULL,
413 0);
392 data += len; 414 data += len;
393 } 415 }
394 416
@@ -397,7 +419,9 @@ static int xenvif_gop_skb(struct sk_buff *skb,
397 skb_frag_page(&skb_shinfo(skb)->frags[i]), 419 skb_frag_page(&skb_shinfo(skb)->frags[i]),
398 skb_frag_size(&skb_shinfo(skb)->frags[i]), 420 skb_frag_size(&skb_shinfo(skb)->frags[i]),
399 skb_shinfo(skb)->frags[i].page_offset, 421 skb_shinfo(skb)->frags[i].page_offset,
400 &head); 422 &head,
423 foreign_vif,
424 foreign_grefs[i]);
401 } 425 }
402 426
403 return npo->meta_prod - old_meta_prod; 427 return npo->meta_prod - old_meta_prod;
@@ -455,10 +479,12 @@ static void xenvif_add_frag_responses(struct xenvif *vif, int status,
455 } 479 }
456} 480}
457 481
458struct skb_cb_overlay { 482struct xenvif_rx_cb {
459 int meta_slots_used; 483 int meta_slots_used;
460}; 484};
461 485
486#define XENVIF_RX_CB(skb) ((struct xenvif_rx_cb *)(skb)->cb)
487
462void xenvif_kick_thread(struct xenvif *vif) 488void xenvif_kick_thread(struct xenvif *vif)
463{ 489{
464 wake_up(&vif->wq); 490 wake_up(&vif->wq);
@@ -474,7 +500,6 @@ static void xenvif_rx_action(struct xenvif *vif)
474 LIST_HEAD(notify); 500 LIST_HEAD(notify);
475 int ret; 501 int ret;
476 unsigned long offset; 502 unsigned long offset;
477 struct skb_cb_overlay *sco;
478 bool need_to_notify = false; 503 bool need_to_notify = false;
479 504
480 struct netrx_pending_operations npo = { 505 struct netrx_pending_operations npo = {
@@ -486,6 +511,8 @@ static void xenvif_rx_action(struct xenvif *vif)
486 511
487 while ((skb = skb_dequeue(&vif->rx_queue)) != NULL) { 512 while ((skb = skb_dequeue(&vif->rx_queue)) != NULL) {
488 RING_IDX max_slots_needed; 513 RING_IDX max_slots_needed;
514 RING_IDX old_req_cons;
515 RING_IDX ring_slots_used;
489 int i; 516 int i;
490 517
491 /* We need a cheap worse case estimate for the number of 518 /* We need a cheap worse case estimate for the number of
@@ -497,11 +524,31 @@ static void xenvif_rx_action(struct xenvif *vif)
497 PAGE_SIZE); 524 PAGE_SIZE);
498 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { 525 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
499 unsigned int size; 526 unsigned int size;
527 unsigned int offset;
528
500 size = skb_frag_size(&skb_shinfo(skb)->frags[i]); 529 size = skb_frag_size(&skb_shinfo(skb)->frags[i]);
501 max_slots_needed += DIV_ROUND_UP(size, PAGE_SIZE); 530 offset = skb_shinfo(skb)->frags[i].page_offset;
531
532 /* For a worse-case estimate we need to factor in
533 * the fragment page offset as this will affect the
534 * number of times xenvif_gop_frag_copy() will
535 * call start_new_rx_buffer().
536 */
537 max_slots_needed += DIV_ROUND_UP(offset + size,
538 PAGE_SIZE);
502 } 539 }
503 if (skb_shinfo(skb)->gso_type & SKB_GSO_TCPV4 || 540
504 skb_shinfo(skb)->gso_type & SKB_GSO_TCPV6) 541 /* To avoid the estimate becoming too pessimal for some
542 * frontends that limit posted rx requests, cap the estimate
543 * at MAX_SKB_FRAGS.
544 */
545 if (max_slots_needed > MAX_SKB_FRAGS)
546 max_slots_needed = MAX_SKB_FRAGS;
547
548 /* We may need one more slot for GSO metadata */
549 if (skb_is_gso(skb) &&
550 (skb_shinfo(skb)->gso_type & SKB_GSO_TCPV4 ||
551 skb_shinfo(skb)->gso_type & SKB_GSO_TCPV6))
505 max_slots_needed++; 552 max_slots_needed++;
506 553
507 /* If the skb may not fit then bail out now */ 554 /* If the skb may not fit then bail out now */
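
A worked example of why the estimate now factors in the fragment offset (numbers illustrative, 4 KiB pages assumed): a 3000-byte fragment starting at page_offset 3000 crosses a page boundary, so xenvif_gop_frag_copy() may emit copies into two rx buffers; DIV_ROUND_UP(offset + size, PAGE_SIZE) = DIV_ROUND_UP(6000, 4096) = 2 accounts for that, whereas the old DIV_ROUND_UP(size, PAGE_SIZE) = 1 could undercount and trip the BUG_ON(ring_slots_used > max_slots_needed) check below.
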
@@ -513,9 +560,11 @@ static void xenvif_rx_action(struct xenvif *vif)
513 } else 560 } else
514 vif->rx_last_skb_slots = 0; 561 vif->rx_last_skb_slots = 0;
515 562
516 sco = (struct skb_cb_overlay *)skb->cb; 563 old_req_cons = vif->rx.req_cons;
517 sco->meta_slots_used = xenvif_gop_skb(skb, &npo); 564 XENVIF_RX_CB(skb)->meta_slots_used = xenvif_gop_skb(skb, &npo);
518 BUG_ON(sco->meta_slots_used > max_slots_needed); 565 ring_slots_used = vif->rx.req_cons - old_req_cons;
566
567 BUG_ON(ring_slots_used > max_slots_needed);
519 568
520 __skb_queue_tail(&rxq, skb); 569 __skb_queue_tail(&rxq, skb);
521 } 570 }
@@ -529,7 +578,6 @@ static void xenvif_rx_action(struct xenvif *vif)
529 gnttab_batch_copy(vif->grant_copy_op, npo.copy_prod); 578 gnttab_batch_copy(vif->grant_copy_op, npo.copy_prod);
530 579
531 while ((skb = __skb_dequeue(&rxq)) != NULL) { 580 while ((skb = __skb_dequeue(&rxq)) != NULL) {
532 sco = (struct skb_cb_overlay *)skb->cb;
533 581
534 if ((1 << vif->meta[npo.meta_cons].gso_type) & 582 if ((1 << vif->meta[npo.meta_cons].gso_type) &
535 vif->gso_prefix_mask) { 583 vif->gso_prefix_mask) {
@@ -540,19 +588,21 @@ static void xenvif_rx_action(struct xenvif *vif)
540 588
541 resp->offset = vif->meta[npo.meta_cons].gso_size; 589 resp->offset = vif->meta[npo.meta_cons].gso_size;
542 resp->id = vif->meta[npo.meta_cons].id; 590 resp->id = vif->meta[npo.meta_cons].id;
543 resp->status = sco->meta_slots_used; 591 resp->status = XENVIF_RX_CB(skb)->meta_slots_used;
544 592
545 npo.meta_cons++; 593 npo.meta_cons++;
546 sco->meta_slots_used--; 594 XENVIF_RX_CB(skb)->meta_slots_used--;
547 } 595 }
548 596
549 597
550 vif->dev->stats.tx_bytes += skb->len; 598 vif->dev->stats.tx_bytes += skb->len;
551 vif->dev->stats.tx_packets++; 599 vif->dev->stats.tx_packets++;
552 600
553 status = xenvif_check_gop(vif, sco->meta_slots_used, &npo); 601 status = xenvif_check_gop(vif,
602 XENVIF_RX_CB(skb)->meta_slots_used,
603 &npo);
554 604
555 if (sco->meta_slots_used == 1) 605 if (XENVIF_RX_CB(skb)->meta_slots_used == 1)
556 flags = 0; 606 flags = 0;
557 else 607 else
558 flags = XEN_NETRXF_more_data; 608 flags = XEN_NETRXF_more_data;
@@ -589,13 +639,13 @@ static void xenvif_rx_action(struct xenvif *vif)
589 639
590 xenvif_add_frag_responses(vif, status, 640 xenvif_add_frag_responses(vif, status,
591 vif->meta + npo.meta_cons + 1, 641 vif->meta + npo.meta_cons + 1,
592 sco->meta_slots_used); 642 XENVIF_RX_CB(skb)->meta_slots_used);
593 643
594 RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&vif->rx, ret); 644 RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&vif->rx, ret);
595 645
596 need_to_notify |= !!ret; 646 need_to_notify |= !!ret;
597 647
598 npo.meta_cons += sco->meta_slots_used; 648 npo.meta_cons += XENVIF_RX_CB(skb)->meta_slots_used;
599 dev_kfree_skb(skb); 649 dev_kfree_skb(skb);
600 } 650 }
601 651
@@ -645,9 +695,12 @@ static void xenvif_tx_err(struct xenvif *vif,
645 struct xen_netif_tx_request *txp, RING_IDX end) 695 struct xen_netif_tx_request *txp, RING_IDX end)
646{ 696{
647 RING_IDX cons = vif->tx.req_cons; 697 RING_IDX cons = vif->tx.req_cons;
698 unsigned long flags;
648 699
649 do { 700 do {
701 spin_lock_irqsave(&vif->response_lock, flags);
650 make_tx_response(vif, txp, XEN_NETIF_RSP_ERROR); 702 make_tx_response(vif, txp, XEN_NETIF_RSP_ERROR);
703 spin_unlock_irqrestore(&vif->response_lock, flags);
651 if (cons == end) 704 if (cons == end)
652 break; 705 break;
653 txp = RING_GET_REQUEST(&vif->tx, cons++); 706 txp = RING_GET_REQUEST(&vif->tx, cons++);
@@ -658,7 +711,8 @@ static void xenvif_tx_err(struct xenvif *vif,
658static void xenvif_fatal_tx_err(struct xenvif *vif) 711static void xenvif_fatal_tx_err(struct xenvif *vif)
659{ 712{
660 netdev_err(vif->dev, "fatal error; disabling device\n"); 713 netdev_err(vif->dev, "fatal error; disabling device\n");
661 xenvif_carrier_off(vif); 714 vif->disabled = true;
715 xenvif_kick_thread(vif);
662} 716}
663 717
664static int xenvif_count_requests(struct xenvif *vif, 718static int xenvif_count_requests(struct xenvif *vif,
@@ -759,204 +813,220 @@ static int xenvif_count_requests(struct xenvif *vif,
759 return slots; 813 return slots;
760} 814}
761 815
762static struct page *xenvif_alloc_page(struct xenvif *vif, 816
763 u16 pending_idx) 817struct xenvif_tx_cb {
818 u16 pending_idx;
819};
820
821#define XENVIF_TX_CB(skb) ((struct xenvif_tx_cb *)(skb)->cb)
822
823static inline void xenvif_tx_create_map_op(struct xenvif *vif,
824 u16 pending_idx,
825 struct xen_netif_tx_request *txp,
826 struct gnttab_map_grant_ref *mop)
764{ 827{
765 struct page *page; 828 vif->pages_to_map[mop-vif->tx_map_ops] = vif->mmap_pages[pending_idx];
829 gnttab_set_map_op(mop, idx_to_kaddr(vif, pending_idx),
830 GNTMAP_host_map | GNTMAP_readonly,
831 txp->gref, vif->domid);
832
833 memcpy(&vif->pending_tx_info[pending_idx].req, txp,
834 sizeof(*txp));
835}
766 836
767 page = alloc_page(GFP_ATOMIC|__GFP_COLD); 837static inline struct sk_buff *xenvif_alloc_skb(unsigned int size)
768 if (!page) 838{
839 struct sk_buff *skb =
840 alloc_skb(size + NET_SKB_PAD + NET_IP_ALIGN,
841 GFP_ATOMIC | __GFP_NOWARN);
842 if (unlikely(skb == NULL))
769 return NULL; 843 return NULL;
770 vif->mmap_pages[pending_idx] = page;
771 844
772 return page; 845 /* Packets passed to netif_rx() must have some headroom. */
846 skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN);
847
848 /* Initialize it here to avoid later surprises */
849 skb_shinfo(skb)->destructor_arg = NULL;
850
851 return skb;
773} 852}
774 853
775static struct gnttab_copy *xenvif_get_requests(struct xenvif *vif, 854static struct gnttab_map_grant_ref *xenvif_get_requests(struct xenvif *vif,
776 struct sk_buff *skb, 855 struct sk_buff *skb,
777 struct xen_netif_tx_request *txp, 856 struct xen_netif_tx_request *txp,
778 struct gnttab_copy *gop) 857 struct gnttab_map_grant_ref *gop)
779{ 858{
780 struct skb_shared_info *shinfo = skb_shinfo(skb); 859 struct skb_shared_info *shinfo = skb_shinfo(skb);
781 skb_frag_t *frags = shinfo->frags; 860 skb_frag_t *frags = shinfo->frags;
782 u16 pending_idx = *((u16 *)skb->data); 861 u16 pending_idx = XENVIF_TX_CB(skb)->pending_idx;
783 u16 head_idx = 0; 862 int start;
784 int slot, start; 863 pending_ring_idx_t index;
785 struct page *page; 864 unsigned int nr_slots, frag_overflow = 0;
786 pending_ring_idx_t index, start_idx = 0;
787 uint16_t dst_offset;
788 unsigned int nr_slots;
789 struct pending_tx_info *first = NULL;
790 865
791 /* At this point shinfo->nr_frags is in fact the number of 866 /* At this point shinfo->nr_frags is in fact the number of
792 * slots, which can be as large as XEN_NETBK_LEGACY_SLOTS_MAX. 867 * slots, which can be as large as XEN_NETBK_LEGACY_SLOTS_MAX.
793 */ 868 */
869 if (shinfo->nr_frags > MAX_SKB_FRAGS) {
870 frag_overflow = shinfo->nr_frags - MAX_SKB_FRAGS;
871 BUG_ON(frag_overflow > MAX_SKB_FRAGS);
872 shinfo->nr_frags = MAX_SKB_FRAGS;
873 }
794 nr_slots = shinfo->nr_frags; 874 nr_slots = shinfo->nr_frags;
795 875
796 /* Skip first skb fragment if it is on same page as header fragment. */ 876 /* Skip first skb fragment if it is on same page as header fragment. */
797 start = (frag_get_pending_idx(&shinfo->frags[0]) == pending_idx); 877 start = (frag_get_pending_idx(&shinfo->frags[0]) == pending_idx);
798 878
799 /* Coalesce tx requests, at this point the packet passed in 879 for (shinfo->nr_frags = start; shinfo->nr_frags < nr_slots;
800 * should be <= 64K. Any packets larger than 64K have been 880 shinfo->nr_frags++, txp++, gop++) {
801 * handled in xenvif_count_requests(). 881 index = pending_index(vif->pending_cons++);
802 */ 882 pending_idx = vif->pending_ring[index];
803 for (shinfo->nr_frags = slot = start; slot < nr_slots; 883 xenvif_tx_create_map_op(vif, pending_idx, txp, gop);
804 shinfo->nr_frags++) { 884 frag_set_pending_idx(&frags[shinfo->nr_frags], pending_idx);
805 struct pending_tx_info *pending_tx_info = 885 }
806 vif->pending_tx_info;
807 886
808 page = alloc_page(GFP_ATOMIC|__GFP_COLD); 887 if (frag_overflow) {
809 if (!page) 888 struct sk_buff *nskb = xenvif_alloc_skb(0);
810 goto err; 889 if (unlikely(nskb == NULL)) {
811 890 if (net_ratelimit())
812 dst_offset = 0; 891 netdev_err(vif->dev,
813 first = NULL; 892 "Can't allocate the frag_list skb.\n");
814 while (dst_offset < PAGE_SIZE && slot < nr_slots) { 893 return NULL;
815 gop->flags = GNTCOPY_source_gref; 894 }
816
817 gop->source.u.ref = txp->gref;
818 gop->source.domid = vif->domid;
819 gop->source.offset = txp->offset;
820
821 gop->dest.domid = DOMID_SELF;
822
823 gop->dest.offset = dst_offset;
824 gop->dest.u.gmfn = virt_to_mfn(page_address(page));
825
826 if (dst_offset + txp->size > PAGE_SIZE) {
827 /* This page can only merge a portion
828 * of tx request. Do not increment any
829 * pointer / counter here. The txp
830 * will be dealt with in future
831 * rounds, eventually hitting the
832 * `else` branch.
833 */
834 gop->len = PAGE_SIZE - dst_offset;
835 txp->offset += gop->len;
836 txp->size -= gop->len;
837 dst_offset += gop->len; /* quit loop */
838 } else {
839 /* This tx request can be merged in the page */
840 gop->len = txp->size;
841 dst_offset += gop->len;
842
843 index = pending_index(vif->pending_cons++);
844
845 pending_idx = vif->pending_ring[index];
846
847 memcpy(&pending_tx_info[pending_idx].req, txp,
848 sizeof(*txp));
849
850 /* Poison these fields, corresponding
851 * fields for head tx req will be set
852 * to correct values after the loop.
853 */
854 vif->mmap_pages[pending_idx] = (void *)(~0UL);
855 pending_tx_info[pending_idx].head =
856 INVALID_PENDING_RING_IDX;
857
858 if (!first) {
859 first = &pending_tx_info[pending_idx];
860 start_idx = index;
861 head_idx = pending_idx;
862 }
863
864 txp++;
865 slot++;
866 }
867 895
868 gop++; 896 shinfo = skb_shinfo(nskb);
897 frags = shinfo->frags;
898
899 for (shinfo->nr_frags = 0; shinfo->nr_frags < frag_overflow;
900 shinfo->nr_frags++, txp++, gop++) {
901 index = pending_index(vif->pending_cons++);
902 pending_idx = vif->pending_ring[index];
903 xenvif_tx_create_map_op(vif, pending_idx, txp, gop);
904 frag_set_pending_idx(&frags[shinfo->nr_frags],
905 pending_idx);
869 } 906 }
870 907
871 first->req.offset = 0; 908 skb_shinfo(skb)->frag_list = nskb;
872 first->req.size = dst_offset;
873 first->head = start_idx;
874 vif->mmap_pages[head_idx] = page;
875 frag_set_pending_idx(&frags[shinfo->nr_frags], head_idx);
876 } 909 }
877 910
878 BUG_ON(shinfo->nr_frags > MAX_SKB_FRAGS);
879
880 return gop; 911 return gop;
881err: 912}
882 /* Unwind, freeing all pages and sending error responses. */ 913
883 while (shinfo->nr_frags-- > start) { 914static inline void xenvif_grant_handle_set(struct xenvif *vif,
884 xenvif_idx_release(vif, 915 u16 pending_idx,
885 frag_get_pending_idx(&frags[shinfo->nr_frags]), 916 grant_handle_t handle)
886 XEN_NETIF_RSP_ERROR); 917{
918 if (unlikely(vif->grant_tx_handle[pending_idx] !=
919 NETBACK_INVALID_HANDLE)) {
920 netdev_err(vif->dev,
921 "Trying to overwrite active handle! pending_idx: %x\n",
922 pending_idx);
923 BUG();
887 } 924 }
888 /* The head too, if necessary. */ 925 vif->grant_tx_handle[pending_idx] = handle;
889 if (start) 926}
890 xenvif_idx_release(vif, pending_idx, XEN_NETIF_RSP_ERROR);
891 927
892 return NULL; 928static inline void xenvif_grant_handle_reset(struct xenvif *vif,
929 u16 pending_idx)
930{
931 if (unlikely(vif->grant_tx_handle[pending_idx] ==
932 NETBACK_INVALID_HANDLE)) {
933 netdev_err(vif->dev,
934 "Trying to unmap invalid handle! pending_idx: %x\n",
935 pending_idx);
936 BUG();
937 }
938 vif->grant_tx_handle[pending_idx] = NETBACK_INVALID_HANDLE;
893} 939}
894 940
895static int xenvif_tx_check_gop(struct xenvif *vif, 941static int xenvif_tx_check_gop(struct xenvif *vif,
896 struct sk_buff *skb, 942 struct sk_buff *skb,
897 struct gnttab_copy **gopp) 943 struct gnttab_map_grant_ref **gopp_map,
944 struct gnttab_copy **gopp_copy)
898{ 945{
899 struct gnttab_copy *gop = *gopp; 946 struct gnttab_map_grant_ref *gop_map = *gopp_map;
900 u16 pending_idx = *((u16 *)skb->data); 947 u16 pending_idx = XENVIF_TX_CB(skb)->pending_idx;
901 struct skb_shared_info *shinfo = skb_shinfo(skb); 948 struct skb_shared_info *shinfo = skb_shinfo(skb);
902 struct pending_tx_info *tx_info;
903 int nr_frags = shinfo->nr_frags; 949 int nr_frags = shinfo->nr_frags;
904 int i, err, start; 950 int i, err;
905 u16 peek; /* peek into next tx request */ 951 struct sk_buff *first_skb = NULL;
906 952
907 /* Check status of header. */ 953 /* Check status of header. */
908 err = gop->status; 954 err = (*gopp_copy)->status;
909 if (unlikely(err)) 955 (*gopp_copy)++;
956 if (unlikely(err)) {
957 if (net_ratelimit())
958 netdev_dbg(vif->dev,
959 "Grant copy of header failed! status: %d pending_idx: %u ref: %u\n",
960 (*gopp_copy)->status,
961 pending_idx,
962 (*gopp_copy)->source.u.ref);
910 xenvif_idx_release(vif, pending_idx, XEN_NETIF_RSP_ERROR); 963 xenvif_idx_release(vif, pending_idx, XEN_NETIF_RSP_ERROR);
964 }
911 965
912 /* Skip first skb fragment if it is on same page as header fragment. */ 966check_frags:
913 start = (frag_get_pending_idx(&shinfo->frags[0]) == pending_idx); 967 for (i = 0; i < nr_frags; i++, gop_map++) {
914
915 for (i = start; i < nr_frags; i++) {
916 int j, newerr; 968 int j, newerr;
917 pending_ring_idx_t head;
918 969
919 pending_idx = frag_get_pending_idx(&shinfo->frags[i]); 970 pending_idx = frag_get_pending_idx(&shinfo->frags[i]);
920 tx_info = &vif->pending_tx_info[pending_idx];
921 head = tx_info->head;
922 971
923 /* Check error status: if okay then remember grant handle. */ 972 /* Check error status: if okay then remember grant handle. */
924 do { 973 newerr = gop_map->status;
925 newerr = (++gop)->status;
926 if (newerr)
927 break;
928 peek = vif->pending_ring[pending_index(++head)];
929 } while (!pending_tx_is_head(vif, peek));
930 974
931 if (likely(!newerr)) { 975 if (likely(!newerr)) {
976 xenvif_grant_handle_set(vif,
977 pending_idx,
978 gop_map->handle);
932 /* Had a previous error? Invalidate this fragment. */ 979 /* Had a previous error? Invalidate this fragment. */
933 if (unlikely(err)) 980 if (unlikely(err))
934 xenvif_idx_release(vif, pending_idx, 981 xenvif_idx_unmap(vif, pending_idx);
935 XEN_NETIF_RSP_OKAY);
936 continue; 982 continue;
937 } 983 }
938 984
939 /* Error on this fragment: respond to client with an error. */ 985 /* Error on this fragment: respond to client with an error. */
986 if (net_ratelimit())
987 netdev_dbg(vif->dev,
988 "Grant map of %d. frag failed! status: %d pending_idx: %u ref: %u\n",
989 i,
990 gop_map->status,
991 pending_idx,
992 gop_map->ref);
940 xenvif_idx_release(vif, pending_idx, XEN_NETIF_RSP_ERROR); 993 xenvif_idx_release(vif, pending_idx, XEN_NETIF_RSP_ERROR);
941 994
942 /* Not the first error? Preceding frags already invalidated. */ 995 /* Not the first error? Preceding frags already invalidated. */
943 if (err) 996 if (err)
944 continue; 997 continue;
945 998 /* First error: invalidate preceding fragments. */
946 /* First error: invalidate header and preceding fragments. */ 999 for (j = 0; j < i; j++) {
947 pending_idx = *((u16 *)skb->data);
948 xenvif_idx_release(vif, pending_idx, XEN_NETIF_RSP_OKAY);
949 for (j = start; j < i; j++) {
950 pending_idx = frag_get_pending_idx(&shinfo->frags[j]); 1000 pending_idx = frag_get_pending_idx(&shinfo->frags[j]);
951 xenvif_idx_release(vif, pending_idx, 1001 xenvif_idx_unmap(vif, pending_idx);
952 XEN_NETIF_RSP_OKAY);
953 } 1002 }
954 1003
955 /* Remember the error: invalidate all subsequent fragments. */ 1004 /* Remember the error: invalidate all subsequent fragments. */
956 err = newerr; 1005 err = newerr;
957 } 1006 }
958 1007
959 *gopp = gop + 1; 1008 if (skb_has_frag_list(skb)) {
1009 first_skb = skb;
1010 skb = shinfo->frag_list;
1011 shinfo = skb_shinfo(skb);
1012 nr_frags = shinfo->nr_frags;
1013
1014 goto check_frags;
1015 }
1016
1017 /* There was a mapping error in the frag_list skb. We have to unmap
1018 * the first skb's frags
1019 */
1020 if (first_skb && err) {
1021 int j;
1022 shinfo = skb_shinfo(first_skb);
1023 for (j = 0; j < shinfo->nr_frags; j++) {
1024 pending_idx = frag_get_pending_idx(&shinfo->frags[j]);
1025 xenvif_idx_unmap(vif, pending_idx);
1026 }
1027 }
1028
1029 *gopp_map = gop_map;
960 return err; 1030 return err;
961} 1031}
962 1032
@@ -965,6 +1035,7 @@ static void xenvif_fill_frags(struct xenvif *vif, struct sk_buff *skb)
965 struct skb_shared_info *shinfo = skb_shinfo(skb); 1035 struct skb_shared_info *shinfo = skb_shinfo(skb);
966 int nr_frags = shinfo->nr_frags; 1036 int nr_frags = shinfo->nr_frags;
967 int i; 1037 int i;
1038 u16 prev_pending_idx = INVALID_PENDING_IDX;
968 1039
969 for (i = 0; i < nr_frags; i++) { 1040 for (i = 0; i < nr_frags; i++) {
970 skb_frag_t *frag = shinfo->frags + i; 1041 skb_frag_t *frag = shinfo->frags + i;
@@ -974,6 +1045,17 @@ static void xenvif_fill_frags(struct xenvif *vif, struct sk_buff *skb)
974 1045
975 pending_idx = frag_get_pending_idx(frag); 1046 pending_idx = frag_get_pending_idx(frag);
976 1047
1048 /* If this is not the first frag, chain it to the previous*/
1049 if (prev_pending_idx == INVALID_PENDING_IDX)
1050 skb_shinfo(skb)->destructor_arg =
1051 &callback_param(vif, pending_idx);
1052 else
1053 callback_param(vif, prev_pending_idx).ctx =
1054 &callback_param(vif, pending_idx);
1055
1056 callback_param(vif, pending_idx).ctx = NULL;
1057 prev_pending_idx = pending_idx;
1058
977 txp = &vif->pending_tx_info[pending_idx].req; 1059 txp = &vif->pending_tx_info[pending_idx].req;
978 page = virt_to_page(idx_to_kaddr(vif, pending_idx)); 1060 page = virt_to_page(idx_to_kaddr(vif, pending_idx));
979 __skb_fill_page_desc(skb, i, page, txp->offset, txp->size); 1061 __skb_fill_page_desc(skb, i, page, txp->offset, txp->size);
@@ -981,10 +1063,15 @@ static void xenvif_fill_frags(struct xenvif *vif, struct sk_buff *skb)
981 skb->data_len += txp->size; 1063 skb->data_len += txp->size;
982 skb->truesize += txp->size; 1064 skb->truesize += txp->size;
983 1065
984 /* Take an extra reference to offset xenvif_idx_release */ 1066 /* Take an extra reference to offset network stack's put_page */
985 get_page(vif->mmap_pages[pending_idx]); 1067 get_page(vif->mmap_pages[pending_idx]);
986 xenvif_idx_release(vif, pending_idx, XEN_NETIF_RSP_OKAY);
987 } 1068 }
1069 /* FIXME: __skb_fill_page_desc set this to true because page->pfmemalloc
1070 * overlaps with "index", and "mapping" is not set. I think mapping
1071 * should be set. If delivered to local stack, it would drop this
1072 * skb in sk_filter unless the socket has the right to use it.
1073 */
1074 skb->pfmemalloc = false;
988} 1075}
989 1076
990static int xenvif_get_extras(struct xenvif *vif, 1077static int xenvif_get_extras(struct xenvif *vif,
@@ -1102,18 +1189,18 @@ static bool tx_credit_exceeded(struct xenvif *vif, unsigned size)
1102 return false; 1189 return false;
1103} 1190}
1104 1191
1105static unsigned xenvif_tx_build_gops(struct xenvif *vif, int budget) 1192static void xenvif_tx_build_gops(struct xenvif *vif,
1193 int budget,
1194 unsigned *copy_ops,
1195 unsigned *map_ops)
1106{ 1196{
1107 struct gnttab_copy *gop = vif->tx_copy_ops, *request_gop; 1197 struct gnttab_map_grant_ref *gop = vif->tx_map_ops, *request_gop;
1108 struct sk_buff *skb; 1198 struct sk_buff *skb;
1109 int ret; 1199 int ret;
1110 1200
1111 while ((nr_pending_reqs(vif) + XEN_NETBK_LEGACY_SLOTS_MAX 1201 while (skb_queue_len(&vif->tx_queue) < budget) {
1112 < MAX_PENDING_REQS) &&
1113 (skb_queue_len(&vif->tx_queue) < budget)) {
1114 struct xen_netif_tx_request txreq; 1202 struct xen_netif_tx_request txreq;
1115 struct xen_netif_tx_request txfrags[XEN_NETBK_LEGACY_SLOTS_MAX]; 1203 struct xen_netif_tx_request txfrags[XEN_NETBK_LEGACY_SLOTS_MAX];
1116 struct page *page;
1117 struct xen_netif_extra_info extras[XEN_NETIF_EXTRA_TYPE_MAX-1]; 1204 struct xen_netif_extra_info extras[XEN_NETIF_EXTRA_TYPE_MAX-1];
1118 u16 pending_idx; 1205 u16 pending_idx;
1119 RING_IDX idx; 1206 RING_IDX idx;
@@ -1129,7 +1216,7 @@ static unsigned xenvif_tx_build_gops(struct xenvif *vif, int budget)
1129 vif->tx.sring->req_prod, vif->tx.req_cons, 1216 vif->tx.sring->req_prod, vif->tx.req_cons,
1130 XEN_NETIF_TX_RING_SIZE); 1217 XEN_NETIF_TX_RING_SIZE);
1131 xenvif_fatal_tx_err(vif); 1218 xenvif_fatal_tx_err(vif);
1132 continue; 1219 break;
1133 } 1220 }
1134 1221
1135 work_to_do = RING_HAS_UNCONSUMED_REQUESTS(&vif->tx); 1222 work_to_do = RING_HAS_UNCONSUMED_REQUESTS(&vif->tx);
@@ -1189,8 +1276,7 @@ static unsigned xenvif_tx_build_gops(struct xenvif *vif, int budget)
1189 ret < XEN_NETBK_LEGACY_SLOTS_MAX) ? 1276 ret < XEN_NETBK_LEGACY_SLOTS_MAX) ?
1190 PKT_PROT_LEN : txreq.size; 1277 PKT_PROT_LEN : txreq.size;
1191 1278
1192 skb = alloc_skb(data_len + NET_SKB_PAD + NET_IP_ALIGN, 1279 skb = xenvif_alloc_skb(data_len);
1193 GFP_ATOMIC | __GFP_NOWARN);
1194 if (unlikely(skb == NULL)) { 1280 if (unlikely(skb == NULL)) {
1195 netdev_dbg(vif->dev, 1281 netdev_dbg(vif->dev,
1196 "Can't allocate a skb in start_xmit.\n"); 1282 "Can't allocate a skb in start_xmit.\n");
@@ -1198,9 +1284,6 @@ static unsigned xenvif_tx_build_gops(struct xenvif *vif, int budget)
1198 break; 1284 break;
1199 } 1285 }
1200 1286
1201 /* Packets passed to netif_rx() must have some headroom. */
1202 skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN);
1203
1204 if (extras[XEN_NETIF_EXTRA_TYPE_GSO - 1].type) { 1287 if (extras[XEN_NETIF_EXTRA_TYPE_GSO - 1].type) {
1205 struct xen_netif_extra_info *gso; 1288 struct xen_netif_extra_info *gso;
1206 gso = &extras[XEN_NETIF_EXTRA_TYPE_GSO - 1]; 1289 gso = &extras[XEN_NETIF_EXTRA_TYPE_GSO - 1];
@@ -1212,42 +1295,36 @@ static unsigned xenvif_tx_build_gops(struct xenvif *vif, int budget)
1212 } 1295 }
1213 } 1296 }
1214 1297
1215 /* XXX could copy straight to head */ 1298 XENVIF_TX_CB(skb)->pending_idx = pending_idx;
1216 page = xenvif_alloc_page(vif, pending_idx);
1217 if (!page) {
1218 kfree_skb(skb);
1219 xenvif_tx_err(vif, &txreq, idx);
1220 break;
1221 }
1222
1223 gop->source.u.ref = txreq.gref;
1224 gop->source.domid = vif->domid;
1225 gop->source.offset = txreq.offset;
1226 1299
1227 gop->dest.u.gmfn = virt_to_mfn(page_address(page)); 1300 __skb_put(skb, data_len);
1228 gop->dest.domid = DOMID_SELF; 1301 vif->tx_copy_ops[*copy_ops].source.u.ref = txreq.gref;
1229 gop->dest.offset = txreq.offset; 1302 vif->tx_copy_ops[*copy_ops].source.domid = vif->domid;
1303 vif->tx_copy_ops[*copy_ops].source.offset = txreq.offset;
1230 1304
1231 gop->len = txreq.size; 1305 vif->tx_copy_ops[*copy_ops].dest.u.gmfn =
1232 gop->flags = GNTCOPY_source_gref; 1306 virt_to_mfn(skb->data);
1307 vif->tx_copy_ops[*copy_ops].dest.domid = DOMID_SELF;
1308 vif->tx_copy_ops[*copy_ops].dest.offset =
1309 offset_in_page(skb->data);
1233 1310
1234 gop++; 1311 vif->tx_copy_ops[*copy_ops].len = data_len;
1312 vif->tx_copy_ops[*copy_ops].flags = GNTCOPY_source_gref;
1235 1313
1236 memcpy(&vif->pending_tx_info[pending_idx].req, 1314 (*copy_ops)++;
1237 &txreq, sizeof(txreq));
1238 vif->pending_tx_info[pending_idx].head = index;
1239 *((u16 *)skb->data) = pending_idx;
1240
1241 __skb_put(skb, data_len);
1242 1315
1243 skb_shinfo(skb)->nr_frags = ret; 1316 skb_shinfo(skb)->nr_frags = ret;
1244 if (data_len < txreq.size) { 1317 if (data_len < txreq.size) {
1245 skb_shinfo(skb)->nr_frags++; 1318 skb_shinfo(skb)->nr_frags++;
1246 frag_set_pending_idx(&skb_shinfo(skb)->frags[0], 1319 frag_set_pending_idx(&skb_shinfo(skb)->frags[0],
1247 pending_idx); 1320 pending_idx);
1321 xenvif_tx_create_map_op(vif, pending_idx, &txreq, gop);
1322 gop++;
1248 } else { 1323 } else {
1249 frag_set_pending_idx(&skb_shinfo(skb)->frags[0], 1324 frag_set_pending_idx(&skb_shinfo(skb)->frags[0],
1250 INVALID_PENDING_IDX); 1325 INVALID_PENDING_IDX);
1326 memcpy(&vif->pending_tx_info[pending_idx].req, &txreq,
1327 sizeof(txreq));
1251 } 1328 }
1252 1329
1253 vif->pending_cons++; 1330 vif->pending_cons++;
@@ -1264,17 +1341,85 @@ static unsigned xenvif_tx_build_gops(struct xenvif *vif, int budget)
1264 1341
1265 vif->tx.req_cons = idx; 1342 vif->tx.req_cons = idx;
1266 1343
1267 if ((gop-vif->tx_copy_ops) >= ARRAY_SIZE(vif->tx_copy_ops)) 1344 if (((gop-vif->tx_map_ops) >= ARRAY_SIZE(vif->tx_map_ops)) ||
1345 (*copy_ops >= ARRAY_SIZE(vif->tx_copy_ops)))
1268 break; 1346 break;
1269 } 1347 }
1270 1348
1271 return gop - vif->tx_copy_ops; 1349 (*map_ops) = gop - vif->tx_map_ops;
1350 return;
1272} 1351}
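The rewritten builder no longer bounces whole packets through locally allocated pages: the first data_len bytes of a request are grant-copied straight into skb->data, and only when data_len < txreq.size does the slot also get a grant-map op so the rest of the payload can travel as a zero-copy frag. A minimal userspace sketch of that first-slot decision; PKT_PROT_LEN is an assumed value, the printf calls stand in for filling tx_copy_ops/tx_map_ops, and the per-frag map ops built elsewhere are not shown:

/* Illustrative userspace sketch, not driver code: models only the
 * header-vs-remainder decision made for the first slot of a packet. */
#include <stdio.h>

#define PKT_PROT_LEN 128	/* assumed header budget for the sketch */

static void plan_first_slot(unsigned req_size, int below_slot_limit)
{
	/* data_len bytes are grant-copied straight into skb->data ... */
	unsigned data_len = (req_size > PKT_PROT_LEN && below_slot_limit) ?
			    PKT_PROT_LEN : req_size;

	printf("copy op: %u bytes into the linear area\n", data_len);

	/* ... and whatever is left stays in the guest page, grant-mapped
	 * so it can be attached later as a zero-copy frag. */
	if (data_len < req_size)
		printf("map op: %u bytes kept as a frag\n",
		       req_size - data_len);
}

int main(void)
{
	plan_first_slot(64, 1);		/* small packet: copy only */
	plan_first_slot(1500, 1);	/* large packet: copy header, map rest */
	return 0;
}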
1273 1352
1353/* Consolidate skb with a frag_list into a brand new one with local pages on
1354 * frags. Returns 0 on success, or -ENOMEM if it can't allocate new pages.
1355 */
1356static int xenvif_handle_frag_list(struct xenvif *vif, struct sk_buff *skb)
1357{
1358 unsigned int offset = skb_headlen(skb);
1359 skb_frag_t frags[MAX_SKB_FRAGS];
1360 int i;
1361 struct ubuf_info *uarg;
1362 struct sk_buff *nskb = skb_shinfo(skb)->frag_list;
1363
1364 vif->tx_zerocopy_sent += 2;
1365 vif->tx_frag_overflow++;
1366
1367 xenvif_fill_frags(vif, nskb);
1368 /* Subtract the frags' size; we will correct it later */
1369 skb->truesize -= skb->data_len;
1370 skb->len += nskb->len;
1371 skb->data_len += nskb->len;
1372
1373 /* create a brand new frags array and coalesce there */
1374 for (i = 0; offset < skb->len; i++) {
1375 struct page *page;
1376 unsigned int len;
1377
1378 BUG_ON(i >= MAX_SKB_FRAGS);
1379 page = alloc_page(GFP_ATOMIC|__GFP_COLD);
1380 if (!page) {
1381 int j;
1382 skb->truesize += skb->data_len;
1383 for (j = 0; j < i; j++)
1384 put_page(frags[j].page.p);
1385 return -ENOMEM;
1386 }
1387
1388 if (offset + PAGE_SIZE < skb->len)
1389 len = PAGE_SIZE;
1390 else
1391 len = skb->len - offset;
1392 if (skb_copy_bits(skb, offset, page_address(page), len))
1393 BUG();
1394
1395 offset += len;
1396 frags[i].page.p = page;
1397 frags[i].page_offset = 0;
1398 skb_frag_size_set(&frags[i], len);
1399 }
1400 /* swap out with old one */
1401 memcpy(skb_shinfo(skb)->frags,
1402 frags,
1403 i * sizeof(skb_frag_t));
1404 skb_shinfo(skb)->nr_frags = i;
1405 skb->truesize += i * PAGE_SIZE;
1406
1407 /* remove traces of mapped pages and frag_list */
1408 skb_frag_list_init(skb);
1409 uarg = skb_shinfo(skb)->destructor_arg;
1410 uarg->callback(uarg, true);
1411 skb_shinfo(skb)->destructor_arg = NULL;
1412
1413 skb_shinfo(nskb)->tx_flags |= SKBTX_DEV_ZEROCOPY;
1414 kfree_skb(nskb);
1415
1416 return 0;
1417}
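The loop above is, at heart, a plain page-by-page copy. A self-contained userspace sketch of the same coalescing logic, with malloc() standing in for alloc_page() and a flat buffer standing in for the skb (PAGE_SZ, MAX_FRAGS and the sizes in main() are assumptions of the sketch):

/* Userspace sketch, not driver code. */
#include <stdlib.h>
#include <string.h>

#define PAGE_SZ   4096
#define MAX_FRAGS 17		/* stand-in for MAX_SKB_FRAGS */

struct frag { void *page; size_t len; };

/* Copy everything past the linear header into fresh page-sized chunks.
 * Returns the number of frags built, or -1 (after undoing the partial
 * allocation, as the driver does) when memory runs out. */
static int coalesce(const char *data, size_t total, size_t headlen,
		    struct frag frags[MAX_FRAGS])
{
	size_t offset = headlen;
	int i;

	for (i = 0; offset < total && i < MAX_FRAGS; i++) {
		size_t len = total - offset < PAGE_SZ ? total - offset : PAGE_SZ;

		frags[i].page = malloc(PAGE_SZ);
		if (!frags[i].page) {
			while (i--)
				free(frags[i].page);
			return -1;
		}
		memcpy(frags[i].page, data + offset, len);
		frags[i].len = len;
		offset += len;
	}
	return i;
}

int main(void)
{
	char pkt[10000] = { 0 };
	struct frag frags[MAX_FRAGS];

	return coalesce(pkt, sizeof(pkt), 256, frags) < 0;
}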
1274 1418
1275static int xenvif_tx_submit(struct xenvif *vif) 1419static int xenvif_tx_submit(struct xenvif *vif)
1276{ 1420{
1277 struct gnttab_copy *gop = vif->tx_copy_ops; 1421 struct gnttab_map_grant_ref *gop_map = vif->tx_map_ops;
1422 struct gnttab_copy *gop_copy = vif->tx_copy_ops;
1278 struct sk_buff *skb; 1423 struct sk_buff *skb;
1279 int work_done = 0; 1424 int work_done = 0;
1280 1425
@@ -1283,21 +1428,18 @@ static int xenvif_tx_submit(struct xenvif *vif)
1283 u16 pending_idx; 1428 u16 pending_idx;
1284 unsigned data_len; 1429 unsigned data_len;
1285 1430
1286 pending_idx = *((u16 *)skb->data); 1431 pending_idx = XENVIF_TX_CB(skb)->pending_idx;
1287 txp = &vif->pending_tx_info[pending_idx].req; 1432 txp = &vif->pending_tx_info[pending_idx].req;
1288 1433
1289 /* Check the remap error code. */ 1434 /* Check the remap error code. */
1290 if (unlikely(xenvif_tx_check_gop(vif, skb, &gop))) { 1435 if (unlikely(xenvif_tx_check_gop(vif, skb, &gop_map, &gop_copy))) {
1291 netdev_dbg(vif->dev, "netback grant failed.\n");
1292 skb_shinfo(skb)->nr_frags = 0; 1436 skb_shinfo(skb)->nr_frags = 0;
1293 kfree_skb(skb); 1437 kfree_skb(skb);
1294 continue; 1438 continue;
1295 } 1439 }
1296 1440
1297 data_len = skb->len; 1441 data_len = skb->len;
1298 memcpy(skb->data, 1442 callback_param(vif, pending_idx).ctx = NULL;
1299 (void *)(idx_to_kaddr(vif, pending_idx)|txp->offset),
1300 data_len);
1301 if (data_len < txp->size) { 1443 if (data_len < txp->size) {
1302 /* Append the packet payload as a fragment. */ 1444 /* Append the packet payload as a fragment. */
1303 txp->offset += data_len; 1445 txp->offset += data_len;
@@ -1315,6 +1457,17 @@ static int xenvif_tx_submit(struct xenvif *vif)
1315 1457
1316 xenvif_fill_frags(vif, skb); 1458 xenvif_fill_frags(vif, skb);
1317 1459
1460 if (unlikely(skb_has_frag_list(skb))) {
1461 if (xenvif_handle_frag_list(vif, skb)) {
1462 if (net_ratelimit())
1463 netdev_err(vif->dev,
1464 "Not enough memory to consolidate frag_list!\n");
1465 skb_shinfo(skb)->tx_flags |= SKBTX_DEV_ZEROCOPY;
1466 kfree_skb(skb);
1467 continue;
1468 }
1469 }
1470
1318 if (skb_is_nonlinear(skb) && skb_headlen(skb) < PKT_PROT_LEN) { 1471 if (skb_is_nonlinear(skb) && skb_headlen(skb) < PKT_PROT_LEN) {
1319 int target = min_t(int, skb->len, PKT_PROT_LEN); 1472 int target = min_t(int, skb->len, PKT_PROT_LEN);
1320 __pskb_pull_tail(skb, target - skb_headlen(skb)); 1473 __pskb_pull_tail(skb, target - skb_headlen(skb));
@@ -1327,6 +1480,9 @@ static int xenvif_tx_submit(struct xenvif *vif)
1327 if (checksum_setup(vif, skb)) { 1480 if (checksum_setup(vif, skb)) {
1328 netdev_dbg(vif->dev, 1481 netdev_dbg(vif->dev,
1329 "Can't setup checksum in net_tx_action\n"); 1482 "Can't setup checksum in net_tx_action\n");
1483 /* We have to set this flag to trigger the callback */
1484 if (skb_shinfo(skb)->destructor_arg)
1485 skb_shinfo(skb)->tx_flags |= SKBTX_DEV_ZEROCOPY;
1330 kfree_skb(skb); 1486 kfree_skb(skb);
1331 continue; 1487 continue;
1332 } 1488 }
@@ -1352,27 +1508,143 @@ static int xenvif_tx_submit(struct xenvif *vif)
1352 1508
1353 work_done++; 1509 work_done++;
1354 1510
1511 /* Set this flag right before netif_receive_skb, otherwise
1512 * someone might think this packet already left netback, and
1513 * do a skb_copy_ubufs while we are still in control of the
1514 * skb. E.g. the __pskb_pull_tail earlier can do such thing.
1515 */
1516 if (skb_shinfo(skb)->destructor_arg) {
1517 skb_shinfo(skb)->tx_flags |= SKBTX_DEV_ZEROCOPY;
1518 vif->tx_zerocopy_sent++;
1519 }
1520
1355 netif_receive_skb(skb); 1521 netif_receive_skb(skb);
1356 } 1522 }
1357 1523
1358 return work_done; 1524 return work_done;
1359} 1525}
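Both the store in xenvif_tx_build_gops() and the load at the top of xenvif_tx_submit() go through XENVIF_TX_CB(), a control block overlaid on skb->cb. Its definition sits outside these hunks, but the usage implies a shape along these lines (the real struct may carry more fields):

/* Kernel-context sketch implied by the usage in these hunks. */
struct xenvif_tx_cb {
	u16 pending_idx;
};

#define XENVIF_TX_CB(skb) ((struct xenvif_tx_cb *)(skb)->cb)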
1360 1526
1527void xenvif_zerocopy_callback(struct ubuf_info *ubuf, bool zerocopy_success)
1528{
1529 unsigned long flags;
1530 pending_ring_idx_t index;
1531 struct xenvif *vif = ubuf_to_vif(ubuf);
1532
1533 /* This is the only place where we grab this lock, to protect callbacks
1534 * from each other.
1535 */
1536 spin_lock_irqsave(&vif->callback_lock, flags);
1537 do {
1538 u16 pending_idx = ubuf->desc;
1539 ubuf = (struct ubuf_info *) ubuf->ctx;
1540 BUG_ON(vif->dealloc_prod - vif->dealloc_cons >=
1541 MAX_PENDING_REQS);
1542 index = pending_index(vif->dealloc_prod);
1543 vif->dealloc_ring[index] = pending_idx;
1544 /* Sync with xenvif_tx_dealloc_action:
1545 * insert idx then incr producer.
1546 */
1547 smp_wmb();
1548 vif->dealloc_prod++;
1549 } while (ubuf);
1550 wake_up(&vif->dealloc_wq);
1551 spin_unlock_irqrestore(&vif->callback_lock, flags);
1552
1553 if (likely(zerocopy_success))
1554 vif->tx_zerocopy_success++;
1555 else
1556 vif->tx_zerocopy_fail++;
1557}
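The "insert idx then incr producer" ordering above pairs with the smp_rmb() in xenvif_tx_dealloc_action() below: the consumer must observe the ring entry before it observes the bumped producer index. A userspace model of that publish protocol, using C11 release/acquire in place of the kernel barriers (single producer here; the driver serialises multiple producers with callback_lock instead):

/* Userspace model, not driver code. */
#include <stdatomic.h>
#include <stdint.h>

#define RING_SIZE 256				/* like MAX_PENDING_REQS */

static uint16_t dealloc_ring[RING_SIZE];
static atomic_uint dealloc_prod, dealloc_cons;	/* free-running indices */

static void push_dealloc(uint16_t pending_idx)
{
	unsigned p = atomic_load_explicit(&dealloc_prod, memory_order_relaxed);

	dealloc_ring[p % RING_SIZE] = pending_idx;	/* insert idx ... */
	atomic_store_explicit(&dealloc_prod, p + 1,	/* ... then incr producer */
			      memory_order_release);
}

static int pop_dealloc(uint16_t *pending_idx)
{
	unsigned c = atomic_load_explicit(&dealloc_cons, memory_order_relaxed);
	unsigned p = atomic_load_explicit(&dealloc_prod, memory_order_acquire);

	if (c == p)
		return 0;				/* nothing to unmap */
	*pending_idx = dealloc_ring[c % RING_SIZE];
	atomic_store_explicit(&dealloc_cons, c + 1, memory_order_relaxed);
	return 1;
}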
1558
1559static inline void xenvif_tx_dealloc_action(struct xenvif *vif)
1560{
1561 struct gnttab_unmap_grant_ref *gop;
1562 pending_ring_idx_t dc, dp;
1563 u16 pending_idx, pending_idx_release[MAX_PENDING_REQS];
1564 unsigned int i = 0;
1565
1566 dc = vif->dealloc_cons;
1567 gop = vif->tx_unmap_ops;
1568
1569 /* Free up any grants we have finished using */
1570 do {
1571 dp = vif->dealloc_prod;
1572
1573 /* Ensure we see all indices enqueued by all
1574 * xenvif_zerocopy_callback().
1575 */
1576 smp_rmb();
1577
1578 while (dc != dp) {
1579 BUG_ON(gop - vif->tx_unmap_ops > MAX_PENDING_REQS);
1580 pending_idx =
1581 vif->dealloc_ring[pending_index(dc++)];
1582
1583 pending_idx_release[gop-vif->tx_unmap_ops] =
1584 pending_idx;
1585 vif->pages_to_unmap[gop-vif->tx_unmap_ops] =
1586 vif->mmap_pages[pending_idx];
1587 gnttab_set_unmap_op(gop,
1588 idx_to_kaddr(vif, pending_idx),
1589 GNTMAP_host_map,
1590 vif->grant_tx_handle[pending_idx]);
1591 xenvif_grant_handle_reset(vif, pending_idx);
1592 ++gop;
1593 }
1594
1595 } while (dp != vif->dealloc_prod);
1596
1597 vif->dealloc_cons = dc;
1598
1599 if (gop - vif->tx_unmap_ops > 0) {
1600 int ret;
1601 ret = gnttab_unmap_refs(vif->tx_unmap_ops,
1602 NULL,
1603 vif->pages_to_unmap,
1604 gop - vif->tx_unmap_ops);
1605 if (ret) {
1606 netdev_err(vif->dev, "Unmap fail: nr_ops %tx ret %d\n",
1607 gop - vif->tx_unmap_ops, ret);
1608 for (i = 0; i < gop - vif->tx_unmap_ops; ++i) {
1609 if (gop[i].status != GNTST_okay)
1610 netdev_err(vif->dev,
1611 " host_addr: %llx handle: %x status: %d\n",
1612 gop[i].host_addr,
1613 gop[i].handle,
1614 gop[i].status);
1615 }
1616 BUG();
1617 }
1618 }
1619
1620 for (i = 0; i < gop - vif->tx_unmap_ops; ++i)
1621 xenvif_idx_release(vif, pending_idx_release[i],
1622 XEN_NETIF_RSP_OKAY);
1623}
1624
1625
1361/* Called after netfront has transmitted */ 1626/* Called after netfront has transmitted */
1362int xenvif_tx_action(struct xenvif *vif, int budget) 1627int xenvif_tx_action(struct xenvif *vif, int budget)
1363{ 1628{
1364 unsigned nr_gops; 1629 unsigned nr_mops, nr_cops = 0;
1365 int work_done; 1630 int work_done, ret;
1366 1631
1367 if (unlikely(!tx_work_todo(vif))) 1632 if (unlikely(!tx_work_todo(vif)))
1368 return 0; 1633 return 0;
1369 1634
1370 nr_gops = xenvif_tx_build_gops(vif, budget); 1635 xenvif_tx_build_gops(vif, budget, &nr_cops, &nr_mops);
1371 1636
1372 if (nr_gops == 0) 1637 if (nr_cops == 0)
1373 return 0; 1638 return 0;
1374 1639
1375 gnttab_batch_copy(vif->tx_copy_ops, nr_gops); 1640 gnttab_batch_copy(vif->tx_copy_ops, nr_cops);
1641 if (nr_mops != 0) {
1642 ret = gnttab_map_refs(vif->tx_map_ops,
1643 NULL,
1644 vif->pages_to_map,
1645 nr_mops);
1646 BUG_ON(ret);
1647 }
1376 1648
1377 work_done = xenvif_tx_submit(vif); 1649 work_done = xenvif_tx_submit(vif);
1378 1650
@@ -1383,45 +1655,18 @@ static void xenvif_idx_release(struct xenvif *vif, u16 pending_idx,
1383 u8 status) 1655 u8 status)
1384{ 1656{
1385 struct pending_tx_info *pending_tx_info; 1657 struct pending_tx_info *pending_tx_info;
1386 pending_ring_idx_t head; 1658 pending_ring_idx_t index;
1387 u16 peek; /* peek into next tx request */ 1659 unsigned long flags;
1388
1389 BUG_ON(vif->mmap_pages[pending_idx] == (void *)(~0UL));
1390
1391 /* Already complete? */
1392 if (vif->mmap_pages[pending_idx] == NULL)
1393 return;
1394 1660
1395 pending_tx_info = &vif->pending_tx_info[pending_idx]; 1661 pending_tx_info = &vif->pending_tx_info[pending_idx];
1396 1662 spin_lock_irqsave(&vif->response_lock, flags);
1397 head = pending_tx_info->head; 1663 make_tx_response(vif, &pending_tx_info->req, status);
1398 1664 index = pending_index(vif->pending_prod);
1399 BUG_ON(!pending_tx_is_head(vif, head)); 1665 vif->pending_ring[index] = pending_idx;
1400 BUG_ON(vif->pending_ring[pending_index(head)] != pending_idx); 1666 /* TX shouldn't use the index before we give it back here */
1401 1667 mb();
1402 do { 1668 vif->pending_prod++;
1403 pending_ring_idx_t index; 1669 spin_unlock_irqrestore(&vif->response_lock, flags);
1404 pending_ring_idx_t idx = pending_index(head);
1405 u16 info_idx = vif->pending_ring[idx];
1406
1407 pending_tx_info = &vif->pending_tx_info[info_idx];
1408 make_tx_response(vif, &pending_tx_info->req, status);
1409
1410 /* Setting any number other than
1411 * INVALID_PENDING_RING_IDX indicates this slot is
1412 * starting a new packet / ending a previous packet.
1413 */
1414 pending_tx_info->head = 0;
1415
1416 index = pending_index(vif->pending_prod++);
1417 vif->pending_ring[index] = vif->pending_ring[info_idx];
1418
1419 peek = vif->pending_ring[pending_index(++head)];
1420
1421 } while (!pending_tx_is_head(vif, peek));
1422
1423 put_page(vif->mmap_pages[pending_idx]);
1424 vif->mmap_pages[pending_idx] = NULL;
1425} 1670}
1426 1671
1427 1672
@@ -1469,23 +1714,54 @@ static struct xen_netif_rx_response *make_rx_response(struct xenvif *vif,
1469 return resp; 1714 return resp;
1470} 1715}
1471 1716
1717void xenvif_idx_unmap(struct xenvif *vif, u16 pending_idx)
1718{
1719 int ret;
1720 struct gnttab_unmap_grant_ref tx_unmap_op;
1721
1722 gnttab_set_unmap_op(&tx_unmap_op,
1723 idx_to_kaddr(vif, pending_idx),
1724 GNTMAP_host_map,
1725 vif->grant_tx_handle[pending_idx]);
1726 xenvif_grant_handle_reset(vif, pending_idx);
1727
1728 ret = gnttab_unmap_refs(&tx_unmap_op, NULL,
1729 &vif->mmap_pages[pending_idx], 1);
1730 if (ret) {
1731 netdev_err(vif->dev,
1732 "Unmap fail: ret: %d pending_idx: %d host_addr: %llx handle: %x status: %d\n",
1733 ret,
1734 pending_idx,
1735 tx_unmap_op.host_addr,
1736 tx_unmap_op.handle,
1737 tx_unmap_op.status);
1738 BUG();
1739 }
1740
1741 xenvif_idx_release(vif, pending_idx, XEN_NETIF_RSP_OKAY);
1742}
1743
1472static inline int rx_work_todo(struct xenvif *vif) 1744static inline int rx_work_todo(struct xenvif *vif)
1473{ 1745{
1474 return !skb_queue_empty(&vif->rx_queue) && 1746 return (!skb_queue_empty(&vif->rx_queue) &&
1475 xenvif_rx_ring_slots_available(vif, vif->rx_last_skb_slots); 1747 xenvif_rx_ring_slots_available(vif, vif->rx_last_skb_slots)) ||
1748 vif->rx_queue_purge;
1476} 1749}
1477 1750
1478static inline int tx_work_todo(struct xenvif *vif) 1751static inline int tx_work_todo(struct xenvif *vif)
1479{ 1752{
1480 1753
1481 if (likely(RING_HAS_UNCONSUMED_REQUESTS(&vif->tx)) && 1754 if (likely(RING_HAS_UNCONSUMED_REQUESTS(&vif->tx)))
1482 (nr_pending_reqs(vif) + XEN_NETBK_LEGACY_SLOTS_MAX
1483 < MAX_PENDING_REQS))
1484 return 1; 1755 return 1;
1485 1756
1486 return 0; 1757 return 0;
1487} 1758}
1488 1759
1760static inline bool tx_dealloc_work_todo(struct xenvif *vif)
1761{
1762 return vif->dealloc_cons != vif->dealloc_prod;
1763}
1764
1489void xenvif_unmap_frontend_rings(struct xenvif *vif) 1765void xenvif_unmap_frontend_rings(struct xenvif *vif)
1490{ 1766{
1491 if (vif->tx.sring) 1767 if (vif->tx.sring)
@@ -1543,7 +1819,7 @@ static void xenvif_start_queue(struct xenvif *vif)
1543 netif_wake_queue(vif->dev); 1819 netif_wake_queue(vif->dev);
1544} 1820}
1545 1821
1546int xenvif_kthread(void *data) 1822int xenvif_kthread_guest_rx(void *data)
1547{ 1823{
1548 struct xenvif *vif = data; 1824 struct xenvif *vif = data;
1549 struct sk_buff *skb; 1825 struct sk_buff *skb;
@@ -1551,16 +1827,34 @@ int xenvif_kthread(void *data)
1551 while (!kthread_should_stop()) { 1827 while (!kthread_should_stop()) {
1552 wait_event_interruptible(vif->wq, 1828 wait_event_interruptible(vif->wq,
1553 rx_work_todo(vif) || 1829 rx_work_todo(vif) ||
1830 vif->disabled ||
1554 kthread_should_stop()); 1831 kthread_should_stop());
1832
1833 /* This frontend is found to be rogue, disable it in
1834 * kthread context. Currently this is only set when
1835 * netback finds out frontend sends malformed packet,
1836 * but we cannot disable the interface in softirq
1837 * context so we defer it here.
1838 */
1839 if (unlikely(vif->disabled && netif_carrier_ok(vif->dev)))
1840 xenvif_carrier_off(vif);
1841
1555 if (kthread_should_stop()) 1842 if (kthread_should_stop())
1556 break; 1843 break;
1557 1844
1845 if (vif->rx_queue_purge) {
1846 skb_queue_purge(&vif->rx_queue);
1847 vif->rx_queue_purge = false;
1848 }
1849
1558 if (!skb_queue_empty(&vif->rx_queue)) 1850 if (!skb_queue_empty(&vif->rx_queue))
1559 xenvif_rx_action(vif); 1851 xenvif_rx_action(vif);
1560 1852
1561 if (skb_queue_empty(&vif->rx_queue) && 1853 if (skb_queue_empty(&vif->rx_queue) &&
1562 netif_queue_stopped(vif->dev)) 1854 netif_queue_stopped(vif->dev)) {
1855 del_timer_sync(&vif->wake_queue);
1563 xenvif_start_queue(vif); 1856 xenvif_start_queue(vif);
1857 }
1564 1858
1565 cond_resched(); 1859 cond_resched();
1566 } 1860 }
@@ -1572,6 +1866,28 @@ int xenvif_kthread(void *data)
1572 return 0; 1866 return 0;
1573} 1867}
1574 1868
1869int xenvif_dealloc_kthread(void *data)
1870{
1871 struct xenvif *vif = data;
1872
1873 while (!kthread_should_stop()) {
1874 wait_event_interruptible(vif->dealloc_wq,
1875 tx_dealloc_work_todo(vif) ||
1876 kthread_should_stop());
1877 if (kthread_should_stop())
1878 break;
1879
1880 xenvif_tx_dealloc_action(vif);
1881 cond_resched();
1882 }
1883
1884 /* Unmap anything remaining */
1885 if (tx_dealloc_work_todo(vif))
1886 xenvif_tx_dealloc_action(vif);
1887
1888 return 0;
1889}
1890
1575static int __init netback_init(void) 1891static int __init netback_init(void)
1576{ 1892{
1577 int rc = 0; 1893 int rc = 0;
@@ -1589,6 +1905,8 @@ static int __init netback_init(void)
1589 if (rc) 1905 if (rc)
1590 goto failed_init; 1906 goto failed_init;
1591 1907
1908 rx_drain_timeout_jiffies = msecs_to_jiffies(rx_drain_timeout_msecs);
1909
1592 return 0; 1910 return 0;
1593 1911
1594failed_init: 1912failed_init: