author	Peter Oskolkov <posk@google.com>	2018-08-02 19:34:39 -0400
committer	David S. Miller <davem@davemloft.net>	2018-08-05 20:16:46 -0400
commit	fa0f527358bd900ef92f925878ed6bfbd51305cc (patch)
tree	4ecb0bb64c8baa58162b3f03046c0643a140ece0
parent	385114dec8a49b5e5945e77ba7de6356106713f4 (diff)
ip: use rb trees for IP frag queue.
Similar to TCP OOO RX queue, it makes sense to use rb trees to store
IP fragments, so that OOO fragments are inserted faster.

Tested:

- a follow-up patch contains a rather comprehensive ip defrag
  self-test (functional)
- ran neper `udp_stream -c -H <host> -F 100 -l 300 -T 20`:
    netstat --statistics
    Ip:
        282078937 total packets received
        0 forwarded
        0 incoming packets discarded
        946760 incoming packets delivered
        18743456 requests sent out
        101 fragments dropped after timeout
        282077129 reassemblies required
        944952 packets reassembled ok
        262734239 packet reassembles failed
  (The numbers/stats above are somewhat better re: reassemblies
  vs a kernel without this patchset. More comprehensive performance
  testing TBD).

Reported-by: Jann Horn <jannh@google.com>
Reported-by: Juha-Matti Tilli <juha-matti.tilli@iki.fi>
Suggested-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Peter Oskolkov <posk@google.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Florian Westphal <fw@strlen.de>
Signed-off-by: David S. Miller <davem@davemloft.net>
-rw-r--r--	include/linux/skbuff.h	9
-rw-r--r--	include/net/inet_frag.h	3
-rw-r--r--	net/ipv4/inet_fragment.c	16
-rw-r--r--	net/ipv4/ip_fragment.c	182
-rw-r--r--	net/ipv6/netfilter/nf_conntrack_reasm.c	1
-rw-r--r--	net/ipv6/reassembly.c	1
6 files changed, 121 insertions, 91 deletions
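
The core idea of the patch, modeled below as a standalone userspace sketch: fragments are keyed by their byte offset, an insert that overlaps an existing fragment is rejected (per RFC 5722 the whole datagram is then discarded), and an out-of-order insert costs O(log n) instead of a linear list walk. This is only an illustration of the technique, not the kernel code: a plain unbalanced BST stands in for the kernel's rb-tree, and struct frag, frag_insert and frag_new are names invented for the sketch.

	/* Userspace sketch: offset-keyed fragment tree with overlap rejection. */
	#include <stdio.h>
	#include <stdlib.h>

	struct frag {
		int offset;                  /* first byte covered by this fragment */
		int len;                     /* number of bytes it covers */
		struct frag *left, *right;   /* toy BST links (kernel uses rb_node) */
	};

	/* Returns 0 on success, -1 if the new fragment overlaps an existing
	 * one, in which case the caller would discard the whole queue. */
	static int frag_insert(struct frag **root, struct frag *f)
	{
		int end = f->offset + f->len;

		while (*root) {
			struct frag *cur = *root;

			if (end <= cur->offset)
				root = &cur->left;            /* entirely before cur */
			else if (f->offset >= cur->offset + cur->len)
				root = &cur->right;           /* entirely after cur */
			else
				return -1;                    /* overlap: discard */
		}
		*root = f;
		return 0;
	}

	static struct frag *frag_new(int offset, int len)
	{
		struct frag *f = calloc(1, sizeof(*f)); /* no NULL check: sketch only */

		f->offset = offset;
		f->len = len;
		return f;
	}

	int main(void)
	{
		struct frag *root = NULL;

		printf("%d\n", frag_insert(&root, frag_new(0, 1480)));    /* 0 */
		printf("%d\n", frag_insert(&root, frag_new(2960, 1480))); /* 0: OOO ok */
		printf("%d\n", frag_insert(&root, frag_new(1480, 1480))); /* 0: fills gap */
		printf("%d\n", frag_insert(&root, frag_new(1000, 600)));  /* -1: overlap */
		return 0;
	}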
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 47848367c816..7ebdf158a795 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -676,13 +676,16 @@ struct sk_buff {
 				 * UDP receive path is one user.
 				 */
 				unsigned long		dev_scratch;
-				int			ip_defrag_offset;
 			};
 		};
-		struct rb_node	rbnode; /* used in netem & tcp stack */
+		struct rb_node		rbnode; /* used in netem, ip4 defrag, and tcp stack */
 		struct list_head	list;
 	};
-	struct sock		*sk;
+
+	union {
+		struct sock		*sk;
+		int			ip_defrag_offset;
+	};
 
 	union {
 		ktime_t		tstamp;
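
The skbuff.h change moves ip_defrag_offset into a union with sk, and rbnode likewise shares storage with fields such as dev. A minimal userspace illustration of what that aliasing means (fake_skb is an invented stand-in, not the real sk_buff layout): writing one union member clobbers the other, so a fragment may use ip_defrag_offset only while it is queued and owned by no socket, and ip_expire() below must pull the head out of the tree before it touches head->dev.

	/* Userspace sketch of union aliasing, not kernel code. */
	#include <stdio.h>

	struct fake_skb {
		union {
			void *sk;               /* owner socket, when attached */
			int   ip_defrag_offset; /* frag offset, while queued */
		};
	};

	int main(void)
	{
		struct fake_skb skb = { .sk = &skb };

		skb.ip_defrag_offset = 1480;  /* clobbers the bytes holding sk */
		printf("offset now %d; the sk bytes were overwritten\n",
		       skb.ip_defrag_offset);
		return 0;
	}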
diff --git a/include/net/inet_frag.h b/include/net/inet_frag.h
index f4272a29dc44..b86d14528188 100644
--- a/include/net/inet_frag.h
+++ b/include/net/inet_frag.h
@@ -75,7 +75,8 @@ struct inet_frag_queue {
 	struct timer_list	timer;
 	spinlock_t		lock;
 	refcount_t		refcnt;
-	struct sk_buff		*fragments;
+	struct sk_buff		*fragments;  /* Used in IPv6. */
+	struct rb_root		rb_fragments; /* Used in IPv4. */
 	struct sk_buff		*fragments_tail;
 	ktime_t			stamp;
 	int			len;
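
rb_fragments is an rb_root embedded in the queue, while the tree nodes live inside each sk_buff; helpers like rb_to_skb() recover the skb from its embedded node with container_of() pointer arithmetic. A userspace sketch of that pattern (struct node, struct packet and node_to_packet are invented for the sketch):

	/* Userspace sketch of the embedded-node / container_of pattern. */
	#include <stddef.h>
	#include <stdio.h>

	#define container_of(ptr, type, member) \
		((type *)((char *)(ptr) - offsetof(type, member)))

	struct node { struct node *left, *right; }; /* stand-in for rb_node */

	struct packet {
		int offset;
		struct node rbnode;  /* embedded tree linkage, like sk_buff */
	};

	#define node_to_packet(n) container_of(n, struct packet, rbnode)

	int main(void)
	{
		struct packet p = { .offset = 1480 };
		struct node *n = &p.rbnode;

		printf("offset via node: %d\n", node_to_packet(n)->offset);
		return 0;
	}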
diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c
index ccd140e4082d..6d258a5669e7 100644
--- a/net/ipv4/inet_fragment.c
+++ b/net/ipv4/inet_fragment.c
@@ -137,12 +137,16 @@ void inet_frag_destroy(struct inet_frag_queue *q)
 	fp = q->fragments;
 	nf = q->net;
 	f = nf->f;
-	while (fp) {
-		struct sk_buff *xp = fp->next;
-
-		sum_truesize += fp->truesize;
-		kfree_skb(fp);
-		fp = xp;
+	if (fp) {
+		do {
+			struct sk_buff *xp = fp->next;
+
+			sum_truesize += fp->truesize;
+			kfree_skb(fp);
+			fp = xp;
+		} while (fp);
+	} else {
+		sum_truesize = skb_rbtree_purge(&q->rb_fragments);
 	}
 	sum = sum_truesize + f->qsize;
 
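
skb_rbtree_purge(), which the parent commit (385114dec8a4) changed to return the summed truesize of everything it frees, lets inet_frag_destroy() release the whole tree and settle the memory accounting in a single call. A userspace model of that contract over a toy BST (frag_node and purge are invented names):

	/* Userspace sketch: free a whole tree, report the bytes released. */
	#include <stdio.h>
	#include <stdlib.h>

	struct frag_node {
		unsigned int truesize;           /* bytes charged for this frag */
		struct frag_node *left, *right;
	};

	/* Free every node and return the total bytes accounted to the tree,
	 * so the caller can undo the memory accounting in one step. */
	static unsigned int purge(struct frag_node **root)
	{
		struct frag_node *n = *root;
		unsigned int sum;

		if (!n)
			return 0;
		sum = purge(&n->left) + purge(&n->right) + n->truesize;
		free(n);
		*root = NULL;
		return sum;
	}

	int main(void)
	{
		struct frag_node *root, *a = calloc(1, sizeof(*a));
		struct frag_node *b = calloc(1, sizeof(*b));

		a->truesize = 2048;
		b->truesize = 1024;
		a->left = b;
		root = a;
		printf("released %u bytes\n", purge(&root)); /* 3072 */
		return 0;
	}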
diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c
index 960bf5eab59f..0e8f8de77e71 100644
--- a/net/ipv4/ip_fragment.c
+++ b/net/ipv4/ip_fragment.c
@@ -136,7 +136,7 @@ static void ip_expire(struct timer_list *t)
 {
 	struct inet_frag_queue *frag = from_timer(frag, t, timer);
 	const struct iphdr *iph;
-	struct sk_buff *head;
+	struct sk_buff *head = NULL;
 	struct net *net;
 	struct ipq *qp;
 	int err;
@@ -152,14 +152,31 @@ static void ip_expire(struct timer_list *t)
 
 	ipq_kill(qp);
 	__IP_INC_STATS(net, IPSTATS_MIB_REASMFAILS);
-
-	head = qp->q.fragments;
-
 	__IP_INC_STATS(net, IPSTATS_MIB_REASMTIMEOUT);
 
-	if (!(qp->q.flags & INET_FRAG_FIRST_IN) || !head)
+	if (!qp->q.flags & INET_FRAG_FIRST_IN)
 		goto out;
 
+	/* sk_buff::dev and sk_buff::rbnode are unionized. So we
+	 * pull the head out of the tree in order to be able to
+	 * deal with head->dev.
+	 */
+	if (qp->q.fragments) {
+		head = qp->q.fragments;
+		qp->q.fragments = head->next;
+	} else {
+		head = skb_rb_first(&qp->q.rb_fragments);
+		if (!head)
+			goto out;
+		rb_erase(&head->rbnode, &qp->q.rb_fragments);
+		memset(&head->rbnode, 0, sizeof(head->rbnode));
+		barrier();
+	}
+	if (head == qp->q.fragments_tail)
+		qp->q.fragments_tail = NULL;
+
+	sub_frag_mem_limit(qp->q.net, head->truesize);
+
 	head->dev = dev_get_by_index_rcu(net, qp->iif);
 	if (!head->dev)
 		goto out;
@@ -179,16 +196,16 @@ static void ip_expire(struct timer_list *t)
 	    (skb_rtable(head)->rt_type != RTN_LOCAL))
 		goto out;
 
-	skb_get(head);
 	spin_unlock(&qp->q.lock);
 	icmp_send(head, ICMP_TIME_EXCEEDED, ICMP_EXC_FRAGTIME, 0);
-	kfree_skb(head);
 	goto out_rcu_unlock;
 
 out:
 	spin_unlock(&qp->q.lock);
 out_rcu_unlock:
 	rcu_read_unlock();
+	if (head)
+		kfree_skb(head);
 	ipq_put(qp);
 }
 
@@ -231,7 +248,7 @@ static int ip_frag_too_far(struct ipq *qp)
 	end = atomic_inc_return(&peer->rid);
 	qp->rid = end;
 
-	rc = qp->q.fragments && (end - start) > max;
+	rc = qp->q.fragments_tail && (end - start) > max;
 
 	if (rc) {
 		struct net *net;
@@ -245,7 +262,6 @@ static int ip_frag_too_far(struct ipq *qp)
 
 static int ip_frag_reinit(struct ipq *qp)
 {
-	struct sk_buff *fp;
 	unsigned int sum_truesize = 0;
 
 	if (!mod_timer(&qp->q.timer, jiffies + qp->q.net->timeout)) {
@@ -253,20 +269,14 @@ static int ip_frag_reinit(struct ipq *qp)
 		return -ETIMEDOUT;
 	}
 
-	fp = qp->q.fragments;
-	do {
-		struct sk_buff *xp = fp->next;
-
-		sum_truesize += fp->truesize;
-		kfree_skb(fp);
-		fp = xp;
-	} while (fp);
+	sum_truesize = skb_rbtree_purge(&qp->q.rb_fragments);
 	sub_frag_mem_limit(qp->q.net, sum_truesize);
 
 	qp->q.flags = 0;
 	qp->q.len = 0;
 	qp->q.meat = 0;
 	qp->q.fragments = NULL;
+	qp->q.rb_fragments = RB_ROOT;
 	qp->q.fragments_tail = NULL;
 	qp->iif = 0;
 	qp->ecn = 0;
@@ -278,7 +288,8 @@ static int ip_frag_reinit(struct ipq *qp)
 static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb)
 {
 	struct net *net = container_of(qp->q.net, struct net, ipv4.frags);
-	struct sk_buff *prev, *next;
+	struct rb_node **rbn, *parent;
+	struct sk_buff *skb1;
 	struct net_device *dev;
 	unsigned int fragsize;
 	int flags, offset;
@@ -341,58 +352,58 @@ static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb)
 	if (err)
 		goto err;
 
-	/* Find out which fragments are in front and at the back of us
-	 * in the chain of fragments so far.  We must know where to put
-	 * this fragment, right?
-	 */
-	prev = qp->q.fragments_tail;
-	if (!prev || prev->ip_defrag_offset < offset) {
-		next = NULL;
-		goto found;
-	}
-	prev = NULL;
-	for (next = qp->q.fragments; next != NULL; next = next->next) {
-		if (next->ip_defrag_offset >= offset)
-			break;	/* bingo! */
-		prev = next;
-	}
+	/* Note : skb->rbnode and skb->dev share the same location. */
+	dev = skb->dev;
+	/* Makes sure compiler wont do silly aliasing games */
+	barrier();
 
-found:
 	/* RFC5722, Section 4, amended by Errata ID : 3089
 	 *                          When reassembling an IPv6 datagram, if
 	 *   one or more its constituent fragments is determined to be an
 	 *   overlapping fragment, the entire datagram (and any constituent
 	 *   fragments) MUST be silently discarded.
 	 *
-	 * We do the same here for IPv4.
+	 * We do the same here for IPv4 (and increment an snmp counter).
 	 */
 
-	/* Is there an overlap with the previous fragment? */
-	if (prev &&
-	    (prev->ip_defrag_offset + prev->len) > offset)
-		goto discard_qp;
-
-	/* Is there an overlap with the next fragment? */
-	if (next && next->ip_defrag_offset < end)
-		goto discard_qp;
+	/* Find out where to put this fragment. */
+	skb1 = qp->q.fragments_tail;
+	if (!skb1) {
+		/* This is the first fragment we've received. */
+		rb_link_node(&skb->rbnode, NULL, &qp->q.rb_fragments.rb_node);
+		qp->q.fragments_tail = skb;
+	} else if ((skb1->ip_defrag_offset + skb1->len) < end) {
+		/* This is the common/special case: skb goes to the end. */
+		/* Detect and discard overlaps. */
+		if (offset < (skb1->ip_defrag_offset + skb1->len))
+			goto discard_qp;
+		/* Insert after skb1. */
+		rb_link_node(&skb->rbnode, &skb1->rbnode, &skb1->rbnode.rb_right);
+		qp->q.fragments_tail = skb;
+	} else {
+		/* Binary search. Note that skb can become the first fragment, but
+		 * not the last (covered above). */
+		rbn = &qp->q.rb_fragments.rb_node;
+		do {
+			parent = *rbn;
+			skb1 = rb_to_skb(parent);
+			if (end <= skb1->ip_defrag_offset)
+				rbn = &parent->rb_left;
+			else if (offset >= skb1->ip_defrag_offset + skb1->len)
+				rbn = &parent->rb_right;
+			else /* Found an overlap with skb1. */
+				goto discard_qp;
+		} while (*rbn);
+		/* Here we have parent properly set, and rbn pointing to
+		 * one of its NULL left/right children. Insert skb. */
+		rb_link_node(&skb->rbnode, parent, rbn);
+	}
+	rb_insert_color(&skb->rbnode, &qp->q.rb_fragments);
 
-	/* Note : skb->ip_defrag_offset and skb->dev share the same location */
-	dev = skb->dev;
 	if (dev)
 		qp->iif = dev->ifindex;
-	/* Makes sure compiler wont do silly aliasing games */
-	barrier();
 	skb->ip_defrag_offset = offset;
 
-	/* Insert this fragment in the chain of fragments. */
-	skb->next = next;
-	if (!next)
-		qp->q.fragments_tail = skb;
-	if (prev)
-		prev->next = skb;
-	else
-		qp->q.fragments = skb;
-
 	qp->q.stamp = skb->tstamp;
 	qp->q.meat += skb->len;
 	qp->ecn |= ecn;
@@ -414,7 +425,7 @@ found:
 		unsigned long orefdst = skb->_skb_refdst;
 
 		skb->_skb_refdst = 0UL;
-		err = ip_frag_reasm(qp, prev, dev);
+		err = ip_frag_reasm(qp, skb, dev);
 		skb->_skb_refdst = orefdst;
 		return err;
 	}
@@ -431,15 +442,15 @@ err:
 	return err;
 }
 
-
 /* Build a new IP datagram from all its fragments. */
-
-static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev,
+static int ip_frag_reasm(struct ipq *qp, struct sk_buff *skb,
 			 struct net_device *dev)
 {
 	struct net *net = container_of(qp->q.net, struct net, ipv4.frags);
 	struct iphdr *iph;
-	struct sk_buff *fp, *head = qp->q.fragments;
+	struct sk_buff *fp, *head = skb_rb_first(&qp->q.rb_fragments);
+	struct sk_buff **nextp; /* To build frag_list. */
+	struct rb_node *rbn;
 	int len;
 	int ihlen;
 	int err;
@@ -453,25 +464,20 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev,
 		goto out_fail;
 	}
 	/* Make the one we just received the head. */
-	if (prev) {
-		head = prev->next;
-		fp = skb_clone(head, GFP_ATOMIC);
+	if (head != skb) {
+		fp = skb_clone(skb, GFP_ATOMIC);
 		if (!fp)
 			goto out_nomem;
-
-		fp->next = head->next;
-		if (!fp->next)
+		rb_replace_node(&skb->rbnode, &fp->rbnode, &qp->q.rb_fragments);
+		if (qp->q.fragments_tail == skb)
 			qp->q.fragments_tail = fp;
-		prev->next = fp;
-
-		skb_morph(head, qp->q.fragments);
-		head->next = qp->q.fragments->next;
-
-		consume_skb(qp->q.fragments);
-		qp->q.fragments = head;
+		skb_morph(skb, head);
+		rb_replace_node(&head->rbnode, &skb->rbnode,
+				&qp->q.rb_fragments);
+		consume_skb(head);
+		head = skb;
 	}
 
-	WARN_ON(!head);
 	WARN_ON(head->ip_defrag_offset != 0);
 
 	/* Allocate a new buffer for the datagram. */
@@ -496,24 +502,35 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev,
 		clone = alloc_skb(0, GFP_ATOMIC);
 		if (!clone)
 			goto out_nomem;
-		clone->next = head->next;
-		head->next = clone;
 		skb_shinfo(clone)->frag_list = skb_shinfo(head)->frag_list;
 		skb_frag_list_init(head);
 		for (i = 0; i < skb_shinfo(head)->nr_frags; i++)
 			plen += skb_frag_size(&skb_shinfo(head)->frags[i]);
 		clone->len = clone->data_len = head->data_len - plen;
-		head->data_len -= clone->len;
-		head->len -= clone->len;
+		skb->truesize += clone->truesize;
 		clone->csum = 0;
 		clone->ip_summed = head->ip_summed;
 		add_frag_mem_limit(qp->q.net, clone->truesize);
+		skb_shinfo(head)->frag_list = clone;
+		nextp = &clone->next;
+	} else {
+		nextp = &skb_shinfo(head)->frag_list;
 	}
 
-	skb_shinfo(head)->frag_list = head->next;
 	skb_push(head, head->data - skb_network_header(head));
 
-	for (fp=head->next; fp; fp = fp->next) {
+	/* Traverse the tree in order, to build frag_list. */
+	rbn = rb_next(&head->rbnode);
+	rb_erase(&head->rbnode, &qp->q.rb_fragments);
+	while (rbn) {
+		struct rb_node *rbnext = rb_next(rbn);
+		fp = rb_to_skb(rbn);
+		rb_erase(rbn, &qp->q.rb_fragments);
+		rbn = rbnext;
+		*nextp = fp;
+		nextp = &fp->next;
+		fp->prev = NULL;
+		memset(&fp->rbnode, 0, sizeof(fp->rbnode));
 		head->data_len += fp->len;
 		head->len += fp->len;
 		if (head->ip_summed != fp->ip_summed)
@@ -524,7 +541,9 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev,
 	}
 	sub_frag_mem_limit(qp->q.net, head->truesize);
 
+	*nextp = NULL;
 	head->next = NULL;
+	head->prev = NULL;
 	head->dev = dev;
 	head->tstamp = qp->q.stamp;
 	IPCB(head)->frag_max_size = max(qp->max_df_size, qp->q.max_size);
@@ -552,6 +571,7 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev,
 
 	__IP_INC_STATS(net, IPSTATS_MIB_REASMOKS);
 	qp->q.fragments = NULL;
+	qp->q.rb_fragments = RB_ROOT;
 	qp->q.fragments_tail = NULL;
 	return 0;
 
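
The reassembly path above flattens the tree into the head's frag_list by walking it in order and unlinking each node as it goes, so the fragments come out sorted by offset, which is exactly the order frag_list needs. A userspace model of that flatten step over a toy BST (tfrag and flatten are names invented for the sketch; the final assignment through the returned tail pointer mirrors the patch's *nextp = NULL):

	/* Userspace sketch: in-order flatten of a fragment tree to a list. */
	#include <stdio.h>

	struct tfrag {
		int offset;
		struct tfrag *left, *right;  /* tree linkage while queued */
		struct tfrag *next;          /* list linkage after reassembly */
	};

	/* Detach nodes in order, appending each at **nextp; returns the new
	 * tail pointer so the caller can terminate the list. */
	static struct tfrag **flatten(struct tfrag *n, struct tfrag **nextp)
	{
		struct tfrag *right;

		if (!n)
			return nextp;
		nextp = flatten(n->left, nextp);
		right = n->right;
		n->left = n->right = n->next = NULL; /* node leaves the tree */
		*nextp = n;
		nextp = &n->next;
		return flatten(right, nextp);
	}

	int main(void)
	{
		struct tfrag a = { .offset = 1480 };
		struct tfrag b = { .offset = 0 }, c = { .offset = 2960 };
		struct tfrag *list = NULL;

		a.left = &b;                 /* a is the root; b, c attached OOO */
		a.right = &c;
		*flatten(&a, &list) = NULL;  /* terminate, like *nextp = NULL */

		for (struct tfrag *f = list; f; f = f->next)
			printf("offset %d\n", f->offset); /* 0, 1480, 2960 */
		return 0;
	}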
diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c b/net/ipv6/netfilter/nf_conntrack_reasm.c
index 0610bdab721c..38d69ef516d5 100644
--- a/net/ipv6/netfilter/nf_conntrack_reasm.c
+++ b/net/ipv6/netfilter/nf_conntrack_reasm.c
@@ -463,6 +463,7 @@ nf_ct_frag6_reasm(struct frag_queue *fq, struct sk_buff *prev, struct net_devic
 					  head->csum);
 
 	fq->q.fragments = NULL;
+	fq->q.rb_fragments = RB_ROOT;
 	fq->q.fragments_tail = NULL;
 
 	return true;
diff --git a/net/ipv6/reassembly.c b/net/ipv6/reassembly.c
index 6edd2ac8ae4b..b4e558ab39fa 100644
--- a/net/ipv6/reassembly.c
+++ b/net/ipv6/reassembly.c
@@ -405,6 +405,7 @@ static int ip6_frag_reasm(struct frag_queue *fq, struct sk_buff *prev,
 	__IP6_INC_STATS(net, __in6_dev_get(dev), IPSTATS_MIB_REASMOKS);
 	rcu_read_unlock();
 	fq->q.fragments = NULL;
+	fq->q.rb_fragments = RB_ROOT;
 	fq->q.fragments_tail = NULL;
 	return 1;
 