aboutsummaryrefslogtreecommitdiffstats
path: root/net/ipv4/inet_lro.c
diff options
context:
space:
mode:
authorJan-Bernd Themann <themann@de.ibm.com>2007-08-09 01:38:05 -0400
committerDavid S. Miller <davem@sunset.davemloft.net>2007-10-10 19:47:46 -0400
commit71c87e0cedca843162206c698cfa02e5fea9e2e3 (patch)
tree8f0138754aaee3c15d1b00c4d2774b5e12da2c78 /net/ipv4/inet_lro.c
parente314dbdc1c0dc6a548ecf0afce28ecfd538ff568 (diff)
[NET]: Generic Large Receive Offload for TCP traffic
This patch provides generic Large Receive Offload (LRO) functionality for IPv4/TCP traffic. LRO combines received TCP packets into a single larger TCP packet and then passes it to the network stack in order to increase performance (throughput). The interface supports two modes: drivers can pass either SKBs or fragment lists to the LRO engine. Signed-off-by: Jan-Bernd Themann <themann@de.ibm.com> Signed-off-by: David S. Miller <davem@davemloft.net>
Diffstat (limited to 'net/ipv4/inet_lro.c')
-rw-r--r--net/ipv4/inet_lro.c600
1 files changed, 600 insertions, 0 deletions
diff --git a/net/ipv4/inet_lro.c b/net/ipv4/inet_lro.c
new file mode 100644
index 000000000000..20bc593bb963
--- /dev/null
+++ b/net/ipv4/inet_lro.c
@@ -0,0 +1,600 @@
1/*
2 * linux/net/ipv4/inet_lro.c
3 *
4 * Large Receive Offload (ipv4 / tcp)
5 *
6 * (C) Copyright IBM Corp. 2007
7 *
8 * Authors:
9 * Jan-Bernd Themann <themann@de.ibm.com>
10 * Christoph Raisch <raisch@de.ibm.com>
11 *
12 *
13 * This program is free software; you can redistribute it and/or modify
14 * it under the terms of the GNU General Public License as published by
15 * the Free Software Foundation; either version 2, or (at your option)
16 * any later version.
17 *
18 * This program is distributed in the hope that it will be useful,
19 * but WITHOUT ANY WARRANTY; without even the implied warranty of
20 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21 * GNU General Public License for more details.
22 *
23 * You should have received a copy of the GNU General Public License
24 * along with this program; if not, write to the Free Software
25 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
26 */
27
28
29#include <linux/module.h>
30#include <linux/if_vlan.h>
31#include <linux/inet_lro.h>
32
33MODULE_LICENSE("GPL");
34MODULE_AUTHOR("Jan-Bernd Themann <themann@de.ibm.com>");
35MODULE_DESCRIPTION("Large Receive Offload (ipv4 / tcp)");
36
/*
 * Header lengths in bytes.  The doff/ihl header fields are shifted left by
 * two, i.e. multiplied by four.  Arguments are parenthesized so that
 * expressions such as "tcph + 1" expand correctly.
 */
#define TCP_HDR_LEN(tcph) ((tcph)->doff << 2)
#define IP_HDR_LEN(iph) ((iph)->ihl << 2)
#define TCP_PAYLOAD_LENGTH(iph, tcph) \
	(ntohs((iph)->tot_len) - IP_HDR_LEN(iph) - TCP_HDR_LEN(tcph))

/* Header lengths (in the units of the ihl/doff fields) accepted for LRO */
#define IPH_LEN_WO_OPTIONS 5
#define TCPH_LEN_WO_OPTIONS 5
#define TCPH_LEN_W_TIMESTAMP 8

/* Default header length used when building an skb from a fragment list */
#define LRO_MAX_PG_HLEN 64

/*
 * Bump one counter in lro_mgr->stats.  Wrapped in do { } while (0) so the
 * macro behaves like a single statement (safe in unbraced if/else).
 */
#define LRO_INC_STATS(lro_mgr, attr) \
	do { (lro_mgr)->stats.attr++; } while (0)
49
50/*
51 * Basic tcp checks whether packet is suitable for LRO
52 */
53
54static int lro_tcp_ip_check(struct iphdr *iph, struct tcphdr *tcph,
55 int len, struct net_lro_desc *lro_desc)
56{
57 /* check ip header: don't aggregate padded frames */
58 if (ntohs(iph->tot_len) != len)
59 return -1;
60
61 if (TCP_PAYLOAD_LENGTH(iph, tcph) == 0)
62 return -1;
63
64 if (iph->ihl != IPH_LEN_WO_OPTIONS)
65 return -1;
66
67 if (tcph->cwr || tcph->ece || tcph->urg || !tcph->ack
68 || tcph->rst || tcph->syn || tcph->fin)
69 return -1;
70
71 if (INET_ECN_is_ce(ipv4_get_dsfield(iph)))
72 return -1;
73
74 if (tcph->doff != TCPH_LEN_WO_OPTIONS
75 && tcph->doff != TCPH_LEN_W_TIMESTAMP)
76 return -1;
77
78 /* check tcp options (only timestamp allowed) */
79 if (tcph->doff == TCPH_LEN_W_TIMESTAMP) {
80 u32 *topt = (u32 *)(tcph + 1);
81
82 if (*topt != htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16)
83 | (TCPOPT_TIMESTAMP << 8)
84 | TCPOLEN_TIMESTAMP))
85 return -1;
86
87 /* timestamp should be in right order */
88 topt++;
89 if (lro_desc && after(ntohl(lro_desc->tcp_rcv_tsval),
90 ntohl(*topt)))
91 return -1;
92
93 /* timestamp reply should not be zero */
94 topt++;
95 if (*topt == 0)
96 return -1;
97 }
98
99 return 0;
100}
101
102static void lro_update_tcp_ip_header(struct net_lro_desc *lro_desc)
103{
104 struct iphdr *iph = lro_desc->iph;
105 struct tcphdr *tcph = lro_desc->tcph;
106 u32 *p;
107 __wsum tcp_hdr_csum;
108
109 tcph->ack_seq = lro_desc->tcp_ack;
110 tcph->window = lro_desc->tcp_window;
111
112 if (lro_desc->tcp_saw_tstamp) {
113 p = (u32 *)(tcph + 1);
114 *(p+2) = lro_desc->tcp_rcv_tsecr;
115 }
116
117 iph->tot_len = htons(lro_desc->ip_tot_len);
118
119 iph->check = 0;
120 iph->check = ip_fast_csum((u8 *)lro_desc->iph, iph->ihl);
121
122 tcph->check = 0;
123 tcp_hdr_csum = csum_partial((u8 *)tcph, TCP_HDR_LEN(tcph), 0);
124 lro_desc->data_csum = csum_add(lro_desc->data_csum, tcp_hdr_csum);
125 tcph->check = csum_tcpudp_magic(iph->saddr, iph->daddr,
126 lro_desc->ip_tot_len -
127 IP_HDR_LEN(iph), IPPROTO_TCP,
128 lro_desc->data_csum);
129}
130
131static __wsum lro_tcp_data_csum(struct iphdr *iph, struct tcphdr *tcph, int len)
132{
133 __wsum tcp_csum;
134 __wsum tcp_hdr_csum;
135 __wsum tcp_ps_hdr_csum;
136
137 tcp_csum = ~csum_unfold(tcph->check);
138 tcp_hdr_csum = csum_partial((u8 *)tcph, TCP_HDR_LEN(tcph), tcp_csum);
139
140 tcp_ps_hdr_csum = csum_tcpudp_nofold(iph->saddr, iph->daddr,
141 len + TCP_HDR_LEN(tcph),
142 IPPROTO_TCP, 0);
143
144 return csum_sub(csum_sub(tcp_csum, tcp_hdr_csum),
145 tcp_ps_hdr_csum);
146}
147
148static void lro_init_desc(struct net_lro_desc *lro_desc, struct sk_buff *skb,
149 struct iphdr *iph, struct tcphdr *tcph,
150 u16 vlan_tag, struct vlan_group *vgrp)
151{
152 int nr_frags;
153 u32 *ptr;
154 u32 tcp_data_len = TCP_PAYLOAD_LENGTH(iph, tcph);
155
156 nr_frags = skb_shinfo(skb)->nr_frags;
157 lro_desc->parent = skb;
158 lro_desc->next_frag = &(skb_shinfo(skb)->frags[nr_frags]);
159 lro_desc->iph = iph;
160 lro_desc->tcph = tcph;
161 lro_desc->tcp_next_seq = ntohl(tcph->seq) + tcp_data_len;
162 lro_desc->tcp_ack = ntohl(tcph->ack_seq);
163 lro_desc->tcp_window = tcph->window;
164
165 lro_desc->pkt_aggr_cnt = 1;
166 lro_desc->ip_tot_len = ntohs(iph->tot_len);
167
168 if (tcph->doff == 8) {
169 ptr = (u32 *)(tcph+1);
170 lro_desc->tcp_saw_tstamp = 1;
171 lro_desc->tcp_rcv_tsval = *(ptr+1);
172 lro_desc->tcp_rcv_tsecr = *(ptr+2);
173 }
174
175 lro_desc->mss = tcp_data_len;
176 lro_desc->vgrp = vgrp;
177 lro_desc->vlan_tag = vlan_tag;
178 lro_desc->active = 1;
179
180 lro_desc->data_csum = lro_tcp_data_csum(iph, tcph,
181 tcp_data_len);
182}
183
184static inline void lro_clear_desc(struct net_lro_desc *lro_desc)
185{
186 memset(lro_desc, 0, sizeof(struct net_lro_desc));
187}
188
189static void lro_add_common(struct net_lro_desc *lro_desc, struct iphdr *iph,
190 struct tcphdr *tcph, int tcp_data_len)
191{
192 struct sk_buff *parent = lro_desc->parent;
193 u32 *topt;
194
195 lro_desc->pkt_aggr_cnt++;
196 lro_desc->ip_tot_len += tcp_data_len;
197 lro_desc->tcp_next_seq += tcp_data_len;
198 lro_desc->tcp_window = tcph->window;
199 lro_desc->tcp_ack = tcph->ack_seq;
200
201 /* don't update tcp_rcv_tsval, would not work with PAWS */
202 if (lro_desc->tcp_saw_tstamp) {
203 topt = (u32 *) (tcph + 1);
204 lro_desc->tcp_rcv_tsecr = *(topt + 2);
205 }
206
207 lro_desc->data_csum = csum_block_add(lro_desc->data_csum,
208 lro_tcp_data_csum(iph, tcph,
209 tcp_data_len),
210 parent->len);
211
212 parent->len += tcp_data_len;
213 parent->data_len += tcp_data_len;
214 if (tcp_data_len > lro_desc->mss)
215 lro_desc->mss = tcp_data_len;
216}
217
218static void lro_add_packet(struct net_lro_desc *lro_desc, struct sk_buff *skb,
219 struct iphdr *iph, struct tcphdr *tcph)
220{
221 struct sk_buff *parent = lro_desc->parent;
222 int tcp_data_len = TCP_PAYLOAD_LENGTH(iph, tcph);
223
224 lro_add_common(lro_desc, iph, tcph, tcp_data_len);
225
226 skb_pull(skb, (skb->len - tcp_data_len));
227 parent->truesize += skb->truesize;
228
229 if (lro_desc->last_skb)
230 lro_desc->last_skb->next = skb;
231 else
232 skb_shinfo(parent)->frag_list = skb;
233
234 lro_desc->last_skb = skb;
235}
236
237static void lro_add_frags(struct net_lro_desc *lro_desc,
238 int len, int hlen, int truesize,
239 struct skb_frag_struct *skb_frags,
240 struct iphdr *iph, struct tcphdr *tcph)
241{
242 struct sk_buff *skb = lro_desc->parent;
243 int tcp_data_len = TCP_PAYLOAD_LENGTH(iph, tcph);
244
245 lro_add_common(lro_desc, iph, tcph, tcp_data_len);
246
247 skb->truesize += truesize;
248
249 skb_frags[0].page_offset += hlen;
250 skb_frags[0].size -= hlen;
251
252 while (tcp_data_len > 0) {
253 *(lro_desc->next_frag) = *skb_frags;
254 tcp_data_len -= skb_frags->size;
255 lro_desc->next_frag++;
256 skb_frags++;
257 skb_shinfo(skb)->nr_frags++;
258 }
259}
260
261static int lro_check_tcp_conn(struct net_lro_desc *lro_desc,
262 struct iphdr *iph,
263 struct tcphdr *tcph)
264{
265 if ((lro_desc->iph->saddr != iph->saddr)
266 || (lro_desc->iph->daddr != iph->daddr)
267 || (lro_desc->tcph->source != tcph->source)
268 || (lro_desc->tcph->dest != tcph->dest))
269 return -1;
270 return 0;
271}
272
273static struct net_lro_desc *lro_get_desc(struct net_lro_mgr *lro_mgr,
274 struct net_lro_desc *lro_arr,
275 struct iphdr *iph,
276 struct tcphdr *tcph)
277{
278 struct net_lro_desc *lro_desc = NULL;
279 struct net_lro_desc *tmp;
280 int max_desc = lro_mgr->max_desc;
281 int i;
282
283 for (i = 0; i < max_desc; i++) {
284 tmp = &lro_arr[i];
285 if (tmp->active)
286 if (!lro_check_tcp_conn(tmp, iph, tcph)) {
287 lro_desc = tmp;
288 goto out;
289 }
290 }
291
292 for (i = 0; i < max_desc; i++) {
293 if (!lro_arr[i].active) {
294 lro_desc = &lro_arr[i];
295 goto out;
296 }
297 }
298
299 LRO_INC_STATS(lro_mgr, no_desc);
300out:
301 return lro_desc;
302}
303
304static void lro_flush(struct net_lro_mgr *lro_mgr,
305 struct net_lro_desc *lro_desc)
306{
307 if (lro_desc->pkt_aggr_cnt > 1)
308 lro_update_tcp_ip_header(lro_desc);
309
310 skb_shinfo(lro_desc->parent)->gso_size = lro_desc->mss;
311
312 if (lro_desc->vgrp) {
313 if (test_bit(LRO_F_NAPI, &lro_mgr->features))
314 vlan_hwaccel_receive_skb(lro_desc->parent,
315 lro_desc->vgrp,
316 lro_desc->vlan_tag);
317 else
318 vlan_hwaccel_rx(lro_desc->parent,
319 lro_desc->vgrp,
320 lro_desc->vlan_tag);
321
322 } else {
323 if (test_bit(LRO_F_NAPI, &lro_mgr->features))
324 netif_receive_skb(lro_desc->parent);
325 else
326 netif_rx(lro_desc->parent);
327 }
328
329 LRO_INC_STATS(lro_mgr, flushed);
330 lro_clear_desc(lro_desc);
331}
332
333static int __lro_proc_skb(struct net_lro_mgr *lro_mgr, struct sk_buff *skb,
334 struct vlan_group *vgrp, u16 vlan_tag, void *priv)
335{
336 struct net_lro_desc *lro_desc;
337 struct iphdr *iph;
338 struct tcphdr *tcph;
339 u64 flags;
340 int vlan_hdr_len = 0;
341
342 if (!lro_mgr->get_skb_header
343 || lro_mgr->get_skb_header(skb, (void *)&iph, (void *)&tcph,
344 &flags, priv))
345 goto out;
346
347 if (!(flags & LRO_IPV4) || !(flags & LRO_TCP))
348 goto out;
349
350 lro_desc = lro_get_desc(lro_mgr, lro_mgr->lro_arr, iph, tcph);
351 if (!lro_desc)
352 goto out;
353
354 if ((skb->protocol == htons(ETH_P_8021Q))
355 && !test_bit(LRO_F_EXTRACT_VLAN_ID, &lro_mgr->features))
356 vlan_hdr_len = VLAN_HLEN;
357
358 if (!lro_desc->active) { /* start new lro session */
359 if (lro_tcp_ip_check(iph, tcph, skb->len - vlan_hdr_len, NULL))
360 goto out;
361
362 skb->ip_summed = lro_mgr->ip_summed_aggr;
363 lro_init_desc(lro_desc, skb, iph, tcph, vlan_tag, vgrp);
364 LRO_INC_STATS(lro_mgr, aggregated);
365 return 0;
366 }
367
368 if (lro_desc->tcp_next_seq != ntohl(tcph->seq))
369 goto out2;
370
371 if (lro_tcp_ip_check(iph, tcph, skb->len, lro_desc))
372 goto out2;
373
374 lro_add_packet(lro_desc, skb, iph, tcph);
375 LRO_INC_STATS(lro_mgr, aggregated);
376
377 if ((lro_desc->pkt_aggr_cnt >= lro_mgr->max_aggr) ||
378 lro_desc->parent->len > (0xFFFF - lro_mgr->dev->mtu))
379 lro_flush(lro_mgr, lro_desc);
380
381 return 0;
382
383out2: /* send aggregated SKBs to stack */
384 lro_flush(lro_mgr, lro_desc);
385
386out: /* Original SKB has to be posted to stack */
387 skb->ip_summed = lro_mgr->ip_summed;
388 return 1;
389}
390
391
392static struct sk_buff *lro_gen_skb(struct net_lro_mgr *lro_mgr,
393 struct skb_frag_struct *frags,
394 int len, int true_size,
395 void *mac_hdr,
396 int hlen, __wsum sum,
397 u32 ip_summed)
398{
399 struct sk_buff *skb;
400 struct skb_frag_struct *skb_frags;
401 int data_len = len;
402 int hdr_len = min(len, hlen);
403
404 skb = netdev_alloc_skb(lro_mgr->dev, hlen);
405 if (!skb)
406 return NULL;
407
408 skb->len = len;
409 skb->data_len = len - hdr_len;
410 skb->truesize += true_size;
411 skb->tail += hdr_len;
412
413 memcpy(skb->data, mac_hdr, hdr_len);
414
415 skb_frags = skb_shinfo(skb)->frags;
416 while (data_len > 0) {
417 *skb_frags = *frags;
418 data_len -= frags->size;
419 skb_frags++;
420 frags++;
421 skb_shinfo(skb)->nr_frags++;
422 }
423
424 skb_shinfo(skb)->frags[0].page_offset += hdr_len;
425 skb_shinfo(skb)->frags[0].size -= hdr_len;
426
427 skb->ip_summed = ip_summed;
428 skb->csum = sum;
429 skb->protocol = eth_type_trans(skb, lro_mgr->dev);
430 return skb;
431}
432
433static struct sk_buff *__lro_proc_segment(struct net_lro_mgr *lro_mgr,
434 struct skb_frag_struct *frags,
435 int len, int true_size,
436 struct vlan_group *vgrp,
437 u16 vlan_tag, void *priv, __wsum sum)
438{
439 struct net_lro_desc *lro_desc;
440 struct iphdr *iph;
441 struct tcphdr *tcph;
442 struct sk_buff *skb;
443 u64 flags;
444 void *mac_hdr;
445 int mac_hdr_len;
446 int hdr_len = LRO_MAX_PG_HLEN;
447 int vlan_hdr_len = 0;
448
449 if (!lro_mgr->get_frag_header
450 || lro_mgr->get_frag_header(frags, (void *)&mac_hdr, (void *)&iph,
451 (void *)&tcph, &flags, priv)) {
452 mac_hdr = page_address(frags->page) + frags->page_offset;
453 goto out1;
454 }
455
456 if (!(flags & LRO_IPV4) || !(flags & LRO_TCP))
457 goto out1;
458
459 hdr_len = (int)((void *)(tcph) + TCP_HDR_LEN(tcph) - mac_hdr);
460 mac_hdr_len = (int)((void *)(iph) - mac_hdr);
461
462 lro_desc = lro_get_desc(lro_mgr, lro_mgr->lro_arr, iph, tcph);
463 if (!lro_desc)
464 goto out1;
465
466 if (!lro_desc->active) { /* start new lro session */
467 if (lro_tcp_ip_check(iph, tcph, len - mac_hdr_len, NULL))
468 goto out1;
469
470 skb = lro_gen_skb(lro_mgr, frags, len, true_size, mac_hdr,
471 hdr_len, 0, lro_mgr->ip_summed_aggr);
472 if (!skb)
473 goto out;
474
475 if ((skb->protocol == htons(ETH_P_8021Q))
476 && !test_bit(LRO_F_EXTRACT_VLAN_ID, &lro_mgr->features))
477 vlan_hdr_len = VLAN_HLEN;
478
479 iph = (void *)(skb->data + vlan_hdr_len);
480 tcph = (void *)((u8 *)skb->data + vlan_hdr_len
481 + IP_HDR_LEN(iph));
482
483 lro_init_desc(lro_desc, skb, iph, tcph, 0, NULL);
484 LRO_INC_STATS(lro_mgr, aggregated);
485 return 0;
486 }
487
488 if (lro_desc->tcp_next_seq != ntohl(tcph->seq))
489 goto out2;
490
491 if (lro_tcp_ip_check(iph, tcph, len - mac_hdr_len, lro_desc))
492 goto out2;
493
494 lro_add_frags(lro_desc, len, hdr_len, true_size, frags, iph, tcph);
495 LRO_INC_STATS(lro_mgr, aggregated);
496
497 if ((skb_shinfo(lro_desc->parent)->nr_frags >= lro_mgr->max_aggr) ||
498 lro_desc->parent->len > (0xFFFF - lro_mgr->dev->mtu))
499 lro_flush(lro_mgr, lro_desc);
500
501 return NULL;
502
503out2: /* send aggregated packets to the stack */
504 lro_flush(lro_mgr, lro_desc);
505
506out1: /* Original packet has to be posted to the stack */
507 skb = lro_gen_skb(lro_mgr, frags, len, true_size, mac_hdr,
508 hdr_len, sum, lro_mgr->ip_summed);
509out:
510 return skb;
511}
512
513void lro_receive_skb(struct net_lro_mgr *lro_mgr,
514 struct sk_buff *skb,
515 void *priv)
516{
517 if (__lro_proc_skb(lro_mgr, skb, NULL, 0, priv)) {
518 if (test_bit(LRO_F_NAPI, &lro_mgr->features))
519 netif_receive_skb(skb);
520 else
521 netif_rx(skb);
522 }
523}
524EXPORT_SYMBOL(lro_receive_skb);
525
526void lro_vlan_hwaccel_receive_skb(struct net_lro_mgr *lro_mgr,
527 struct sk_buff *skb,
528 struct vlan_group *vgrp,
529 u16 vlan_tag,
530 void *priv)
531{
532 if (__lro_proc_skb(lro_mgr, skb, vgrp, vlan_tag, priv)) {
533 if (test_bit(LRO_F_NAPI, &lro_mgr->features))
534 vlan_hwaccel_receive_skb(skb, vgrp, vlan_tag);
535 else
536 vlan_hwaccel_rx(skb, vgrp, vlan_tag);
537 }
538}
539EXPORT_SYMBOL(lro_vlan_hwaccel_receive_skb);
540
541void lro_receive_frags(struct net_lro_mgr *lro_mgr,
542 struct skb_frag_struct *frags,
543 int len, int true_size, void *priv, __wsum sum)
544{
545 struct sk_buff *skb;
546
547 skb = __lro_proc_segment(lro_mgr, frags, len, true_size, NULL, 0,
548 priv, sum);
549 if (!skb)
550 return;
551
552 if (test_bit(LRO_F_NAPI, &lro_mgr->features))
553 netif_receive_skb(skb);
554 else
555 netif_rx(skb);
556}
557EXPORT_SYMBOL(lro_receive_frags);
558
559void lro_vlan_hwaccel_receive_frags(struct net_lro_mgr *lro_mgr,
560 struct skb_frag_struct *frags,
561 int len, int true_size,
562 struct vlan_group *vgrp,
563 u16 vlan_tag, void *priv, __wsum sum)
564{
565 struct sk_buff *skb;
566
567 skb = __lro_proc_segment(lro_mgr, frags, len, true_size, vgrp,
568 vlan_tag, priv, sum);
569 if (!skb)
570 return;
571
572 if (test_bit(LRO_F_NAPI, &lro_mgr->features))
573 vlan_hwaccel_receive_skb(skb, vgrp, vlan_tag);
574 else
575 vlan_hwaccel_rx(skb, vgrp, vlan_tag);
576}
577EXPORT_SYMBOL(lro_vlan_hwaccel_receive_frags);
578
579void lro_flush_all(struct net_lro_mgr *lro_mgr)
580{
581 int i;
582 struct net_lro_desc *lro_desc = lro_mgr->lro_arr;
583
584 for (i = 0; i < lro_mgr->max_desc; i++) {
585 if (lro_desc[i].active)
586 lro_flush(lro_mgr, &lro_desc[i]);
587 }
588}
589EXPORT_SYMBOL(lro_flush_all);
590
591void lro_flush_pkt(struct net_lro_mgr *lro_mgr,
592 struct iphdr *iph, struct tcphdr *tcph)
593{
594 struct net_lro_desc *lro_desc;
595
596 lro_desc = lro_get_desc(lro_mgr, lro_mgr->lro_arr, iph, tcph);
597 if (lro_desc->active)
598 lro_flush(lro_mgr, lro_desc);
599}
600EXPORT_SYMBOL(lro_flush_pkt);